Check in LLVM r95781.
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
new file mode 100644
index 0000000..b08f942
--- /dev/null
+++ b/lib/Target/ARM/ARM.h
@@ -0,0 +1,121 @@
+//===-- ARM.h - Top-level interface for ARM representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// ARM back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARM_H
+#define TARGET_ARM_H
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+namespace llvm {
+
+class ARMBaseTargetMachine;
+class FunctionPass;
+class JITCodeEmitter;
+class formatted_raw_ostream;
+
+// Enums corresponding to ARM condition codes
+namespace ARMCC {
+  // The CondCodes constants map directly to the 4-bit encoding of the
+  // condition field for predicated instructions.
+  enum CondCodes {
+    EQ,
+    NE,
+    HS,
+    LO,
+    MI,
+    PL,
+    VS,
+    VC,
+    HI,
+    LS,
+    GE,
+    LT,
+    GT,
+    LE,
+    AL
+  };
+
+  inline static CondCodes getOppositeCondition(CondCodes CC){
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case EQ: return NE;
+    case NE: return EQ;
+    case HS: return LO;
+    case LO: return HS;
+    case MI: return PL;
+    case PL: return MI;
+    case VS: return VC;
+    case VC: return VS;
+    case HI: return LS;
+    case LS: return HI;
+    case GE: return LT;
+    case LT: return GE;
+    case GT: return LE;
+    case LE: return GT;
+    }
+  }
+}
+
+inline static const char *ARMCondCodeToString(ARMCC::CondCodes CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code");
+  case ARMCC::EQ:  return "eq";
+  case ARMCC::NE:  return "ne";
+  case ARMCC::HS:  return "hs";
+  case ARMCC::LO:  return "lo";
+  case ARMCC::MI:  return "mi";
+  case ARMCC::PL:  return "pl";
+  case ARMCC::VS:  return "vs";
+  case ARMCC::VC:  return "vc";
+  case ARMCC::HI:  return "hi";
+  case ARMCC::LS:  return "ls";
+  case ARMCC::GE:  return "ge";
+  case ARMCC::LT:  return "lt";
+  case ARMCC::GT:  return "gt";
+  case ARMCC::LE:  return "le";
+  case ARMCC::AL:  return "al";
+  }
+}
+
+FunctionPass *createARMISelDag(ARMBaseTargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+
+FunctionPass *createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+                                          JITCodeEmitter &JCE);
+
+FunctionPass *createARMLoadStoreOptimizationPass(bool PreAlloc = false);
+FunctionPass *createARMExpandPseudoPass();
+FunctionPass *createARMConstantIslandPass();
+FunctionPass *createNEONPreAllocPass();
+FunctionPass *createNEONMoveFixPass();
+FunctionPass *createThumb2ITBlockPass();
+FunctionPass *createThumb2SizeReductionPass();
+
+extern Target TheARMTarget, TheThumbTarget;
+
+} // end namespace llvm;
+
+// Defines symbolic names for ARM registers.  This defines a mapping from
+// register name to register number.
+//
+#include "ARMGenRegisterNames.inc"
+
+// Defines symbolic names for the ARM instructions.
+//
+#include "ARMGenInstrNames.inc"
+
+
+#endif
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
new file mode 100644
index 0000000..7033861
--- /dev/null
+++ b/lib/Target/ARM/ARM.td
@@ -0,0 +1,149 @@
+//===- ARM.td - Describe the ARM Target Machine -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// ARM Subtarget features.
+//
+
+def ArchV4T     : SubtargetFeature<"v4t", "ARMArchVersion", "V4T",
+                                   "ARM v4T">;
+def ArchV5T     : SubtargetFeature<"v5t", "ARMArchVersion", "V5T",
+                                   "ARM v5T">;
+def ArchV5TE    : SubtargetFeature<"v5te", "ARMArchVersion", "V5TE",
+                                   "ARM v5TE, v5TEj, v5TExp">;
+def ArchV6      : SubtargetFeature<"v6", "ARMArchVersion", "V6",
+                                   "ARM v6">;
+def ArchV6T2    : SubtargetFeature<"v6t2", "ARMArchVersion", "V6T2",
+                                   "ARM v6t2">;
+def ArchV7A     : SubtargetFeature<"v7a", "ARMArchVersion", "V7A",
+                                   "ARM v7A">;
+def FeatureVFP2 : SubtargetFeature<"vfp2", "ARMFPUType", "VFPv2",
+                                   "Enable VFP2 instructions">;
+def FeatureVFP3 : SubtargetFeature<"vfp3", "ARMFPUType", "VFPv3",
+                                   "Enable VFP3 instructions">;
+def FeatureNEON : SubtargetFeature<"neon", "ARMFPUType", "NEON",
+                                   "Enable NEON instructions">;
+def FeatureThumb2 : SubtargetFeature<"thumb2", "ThumbMode", "Thumb2",
+                                     "Enable Thumb2 instructions">;
+
+//===----------------------------------------------------------------------===//
+// ARM Processors supported.
+//
+
+include "ARMSchedule.td"
+
+class ProcNoItin<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, GenericItineraries, Features>;
+
+// V4 Processors.
+def : ProcNoItin<"generic",         []>;
+def : ProcNoItin<"arm8",            []>;
+def : ProcNoItin<"arm810",          []>;
+def : ProcNoItin<"strongarm",       []>;
+def : ProcNoItin<"strongarm110",    []>;
+def : ProcNoItin<"strongarm1100",   []>;
+def : ProcNoItin<"strongarm1110",   []>;
+
+// V4T Processors.
+def : ProcNoItin<"arm7tdmi",        [ArchV4T]>;
+def : ProcNoItin<"arm7tdmi-s",      [ArchV4T]>;
+def : ProcNoItin<"arm710t",         [ArchV4T]>;
+def : ProcNoItin<"arm720t",         [ArchV4T]>;
+def : ProcNoItin<"arm9",            [ArchV4T]>;
+def : ProcNoItin<"arm9tdmi",        [ArchV4T]>;
+def : ProcNoItin<"arm920",          [ArchV4T]>;
+def : ProcNoItin<"arm920t",         [ArchV4T]>;
+def : ProcNoItin<"arm922t",         [ArchV4T]>;
+def : ProcNoItin<"arm940t",         [ArchV4T]>;
+def : ProcNoItin<"ep9312",          [ArchV4T]>;
+
+// V5T Processors.
+def : ProcNoItin<"arm10tdmi",       [ArchV5T]>;
+def : ProcNoItin<"arm1020t",        [ArchV5T]>;
+
+// V5TE Processors.
+def : ProcNoItin<"arm9e",           [ArchV5TE]>;
+def : ProcNoItin<"arm926ej-s",      [ArchV5TE]>;
+def : ProcNoItin<"arm946e-s",       [ArchV5TE]>;
+def : ProcNoItin<"arm966e-s",       [ArchV5TE]>;
+def : ProcNoItin<"arm968e-s",       [ArchV5TE]>;
+def : ProcNoItin<"arm10e",          [ArchV5TE]>;
+def : ProcNoItin<"arm1020e",        [ArchV5TE]>;
+def : ProcNoItin<"arm1022e",        [ArchV5TE]>;
+def : ProcNoItin<"xscale",          [ArchV5TE]>;
+def : ProcNoItin<"iwmmxt",          [ArchV5TE]>;
+
+// V6 Processors.
+def : Processor<"arm1136j-s",       ARMV6Itineraries, [ArchV6]>;
+def : Processor<"arm1136jf-s",      ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
+def : Processor<"arm1176jz-s",      ARMV6Itineraries, [ArchV6]>;
+def : Processor<"arm1176jzf-s",     ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
+def : Processor<"mpcorenovfp",      ARMV6Itineraries, [ArchV6]>;
+def : Processor<"mpcore",           ARMV6Itineraries, [ArchV6, FeatureVFP2]>;
+
+// V6T2 Processors.
+def : Processor<"arm1156t2-s",     ARMV6Itineraries,
+                 [ArchV6T2, FeatureThumb2]>;
+def : Processor<"arm1156t2f-s",    ARMV6Itineraries,
+                 [ArchV6T2, FeatureThumb2, FeatureVFP2]>;
+
+// V7 Processors.
+def : Processor<"cortex-a8",        CortexA8Itineraries,
+                [ArchV7A, FeatureThumb2, FeatureNEON]>;
+def : ProcNoItin<"cortex-a9",       [ArchV7A, FeatureThumb2, FeatureNEON]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "ARMRegisterInfo.td"
+
+include "ARMCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrInfo.td"
+
+def ARMInstrInfo : InstrInfo {
+  // Define how we want to layout our target-specific information field.
+  let TSFlagsFields = ["AddrModeBits",
+                       "SizeFlag",
+                       "IndexModeBits",
+                       "Form",
+                       "isUnaryDataProc",
+                       "canXformTo16Bit",
+                       "Dom"];
+  let TSFlagsShifts = [0,
+                       4,
+                       7,
+                       9,
+                       15,
+                       16,
+                       17];
+}
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def ARM : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = ARMInstrInfo;
+}
diff --git a/lib/Target/ARM/ARMAddressingModes.h b/lib/Target/ARM/ARMAddressingModes.h
new file mode 100644
index 0000000..ddeb1b9
--- /dev/null
+++ b/lib/Target/ARM/ARMAddressingModes.h
@@ -0,0 +1,566 @@
+//===- ARMAddressingModes.h - ARM Addressing Modes --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM addressing mode implementation stuff.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+#define LLVM_TARGET_ARM_ARMADDRESSINGMODES_H
+
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+
+namespace llvm {
+
+/// ARM_AM - ARM Addressing Mode Stuff
+namespace ARM_AM {
+  enum ShiftOpc {
+    no_shift = 0,
+    asr,
+    lsl,
+    lsr,
+    ror,
+    rrx
+  };
+
+  enum AddrOpc {
+    add = '+', sub = '-'
+  };
+
+  static inline const char *getShiftOpcStr(ShiftOpc Op) {
+    switch (Op) {
+    default: assert(0 && "Unknown shift opc!");
+    case ARM_AM::asr: return "asr";
+    case ARM_AM::lsl: return "lsl";
+    case ARM_AM::lsr: return "lsr";
+    case ARM_AM::ror: return "ror";
+    case ARM_AM::rrx: return "rrx";
+    }
+  }
+
+  static inline ShiftOpc getShiftOpcForNode(SDValue N) {
+    switch (N.getOpcode()) {
+    default:          return ARM_AM::no_shift;
+    case ISD::SHL:    return ARM_AM::lsl;
+    case ISD::SRL:    return ARM_AM::lsr;
+    case ISD::SRA:    return ARM_AM::asr;
+    case ISD::ROTR:   return ARM_AM::ror;
+    //case ISD::ROTL:  // Only if imm -> turn into ROTR.
+    // Can't handle RRX here, because it would require folding a flag into
+    // the addressing mode.  :(  This causes us to miss certain things.
+    //case ARMISD::RRX: return ARM_AM::rrx;
+    }
+  }
+
+  enum AMSubMode {
+    bad_am_submode = 0,
+    ia,
+    ib,
+    da,
+    db
+  };
+
+  static inline const char *getAMSubModeStr(AMSubMode Mode) {
+    switch (Mode) {
+    default: assert(0 && "Unknown addressing sub-mode!");
+    case ARM_AM::ia: return "ia";
+    case ARM_AM::ib: return "ib";
+    case ARM_AM::da: return "da";
+    case ARM_AM::db: return "db";
+    }
+  }
+
+  static inline const char *getAMSubModeAltStr(AMSubMode Mode, bool isLD) {
+    switch (Mode) {
+    default: assert(0 && "Unknown addressing sub-mode!");
+    case ARM_AM::ia: return isLD ? "fd" : "ea";
+    case ARM_AM::ib: return isLD ? "ed" : "fa";
+    case ARM_AM::da: return isLD ? "fa" : "ed";
+    case ARM_AM::db: return isLD ? "ea" : "fd";
+    }
+  }
+
+  /// rotr32 - Rotate a 32-bit unsigned value right by a specified # bits.
+  ///
+  static inline unsigned rotr32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val >> Amt) | (Val << ((32-Amt)&31));
+  }
+
+  /// rotl32 - Rotate a 32-bit unsigned value left by a specified # bits.
+  ///
+  static inline unsigned rotl32(unsigned Val, unsigned Amt) {
+    assert(Amt < 32 && "Invalid rotate amount");
+    return (Val << Amt) | (Val >> ((32-Amt)&31));
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #1: shift_operand with registers
+  //===--------------------------------------------------------------------===//
+  //
+  // This 'addressing mode' is used for arithmetic instructions.  It can
+  // represent things like:
+  //   reg
+  //   reg [asr|lsl|lsr|ror|rrx] reg
+  //   reg [asr|lsl|lsr|ror|rrx] imm
+  //
+  // This is stored as three operands [rega, regb, opc].  The first is the base
+  // reg, the second is the shift amount (or reg0 if not present or imm).  The
+  // third operand encodes the shift opcode and the imm if a reg isn't present.
+  //
+  static inline unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm) {
+    return ShOp | (Imm << 3);
+  }
+  static inline unsigned getSORegOffset(unsigned Op) {
+    return Op >> 3;
+  }
+  static inline ShiftOpc getSORegShOp(unsigned Op) {
+    return (ShiftOpc)(Op & 7);
+  }
+
+  /// getSOImmValImm - Given an encoded imm field for the reg/imm form, return
+  /// the 8-bit imm value.
+  static inline unsigned getSOImmValImm(unsigned Imm) {
+    return Imm & 0xFF;
+  }
+  /// getSOImmValRot - Given an encoded imm field for the reg/imm form, return
+  /// the rotate amount.
+  static inline unsigned getSOImmValRot(unsigned Imm) {
+    return (Imm >> 8) * 2;
+  }
+
+  /// getSOImmValRotate - Try to handle Imm with an immediate shifter operand,
+  /// computing the rotate amount to use.  If this immediate value cannot be
+  /// handled with a single shifter-op, determine a good rotate amount that will
+  /// take a maximal chunk of bits out of the immediate.
+  static inline unsigned getSOImmValRotate(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the rotate amount.
+    unsigned TZ = CountTrailingZeros_32(Imm);
+
+    // Rotate amount must be even.  Something like 0x200 must be rotated 8 bits,
+    // not 9.
+    unsigned RotAmt = TZ & ~1;
+
+    // If we can handle this spread, return it.
+    if ((rotr32(Imm, RotAmt) & ~255U) == 0)
+      return (32-RotAmt)&31;  // HW rotates right, not left.
+
+    // For values like 0xF000000F, we should skip the first run of ones, then
+    // retry the hunt.
+    if (Imm & 1) {
+      unsigned TrailingOnes = CountTrailingZeros_32(~Imm);
+      if (TrailingOnes != 32) {  // Avoid overflow on 0xFFFFFFFF
+        // Restart the search for a high-order bit after the initial sequence
+        // of ones.
+        unsigned TZ2 = CountTrailingZeros_32(Imm & ~((1 << TrailingOnes)-1));
+
+        // Rotate amount must be even.
+        unsigned RotAmt2 = TZ2 & ~1;
+
+        // If this fits, use it.
+        if (RotAmt2 != 32 && (rotr32(Imm, RotAmt2) & ~255U) == 0)
+          return (32-RotAmt2)&31;  // HW rotates right, not left.
+      }
+    }
+
+    // Otherwise, we have no way to cover this span of bits with a single
+    // shifter_op immediate.  Return a chunk of bits that will be useful to
+    // handle.
+    return (32-RotAmt)&31;  // HW rotates right, not left.
+  }
+
+  /// getSOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into an shifter_operand immediate operand, return the 12-bit encoding for
+  /// it.  If not, return -1.
+  static inline int getSOImmVal(unsigned Arg) {
+    // 8-bit (or less) immediates are trivially shifter_operands with a rotate
+    // of zero.
+    if ((Arg & ~255U) == 0) return Arg;
+
+    unsigned RotAmt = getSOImmValRotate(Arg);
+
+    // If this cannot be handled with a single shifter_op, bail out.
+    if (rotr32(~255U, RotAmt) & Arg)
+      return -1;
+
+    // Encode this correctly.
+    return rotl32(Arg, RotAmt) | ((RotAmt>>1) << 8);
+  }
+
+  /// isSOImmTwoPartVal - Return true if the specified value can be obtained by
+  /// or'ing together two SOImmVal's.
+  static inline bool isSOImmTwoPartVal(unsigned V) {
+    // If this can be handled with a single shifter_op, bail out.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled with two shifter_op's, accept.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+    return V == 0;
+  }
+
+  /// getSOImmTwoPartFirst - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the first chunk of it.
+  static inline unsigned getSOImmTwoPartFirst(unsigned V) {
+    return rotr32(255U, getSOImmValRotate(V)) & V;
+  }
+
+  /// getSOImmTwoPartSecond - If V is a value that satisfies isSOImmTwoPartVal,
+  /// return the second chunk of it.
+  static inline unsigned getSOImmTwoPartSecond(unsigned V) {
+    // Mask out the first hunk.
+    V = rotr32(~255U, getSOImmValRotate(V)) & V;
+
+    // Take what's left.
+    assert(V == (rotr32(255U, getSOImmValRotate(V)) & V));
+    return V;
+  }
+
+  /// getThumbImmValShift - Try to handle Imm with an 8-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImmValShift(unsigned Imm) {
+    // 8-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~255U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return CountTrailingZeros_32(Imm);
+  }
+
+  /// isThumbImmShiftedVal - Return true if the specified value can be obtained
+  /// by left shifting an 8-bit immediate.
+  static inline bool isThumbImmShiftedVal(unsigned V) {
+    // If V can be formed by left-shifting an 8-bit immediate, this masks to 0.
+    V = (~255U << getThumbImmValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImm16ValShift - Try to handle Imm with a 16-bit immediate followed
+  /// by a left shift. Returns the shift amount to use.
+  static inline unsigned getThumbImm16ValShift(unsigned Imm) {
+    // 16-bit (or less) immediates are trivially immediate operand with a shift
+    // of zero.
+    if ((Imm & ~65535U) == 0) return 0;
+
+    // Use CTZ to compute the shift amount.
+    return CountTrailingZeros_32(Imm);
+  }
+
+  /// isThumbImm16ShiftedVal - Return true if the specified value can be
+  /// obtained by left shifting a 16-bit immediate.
+  static inline bool isThumbImm16ShiftedVal(unsigned V) {
+    // If V can be formed by left-shifting a 16-bit immediate, this masks to 0.
+    V = (~65535U << getThumbImm16ValShift(V)) & V;
+    return V == 0;
+  }
+
+  /// getThumbImmNonShiftedVal - If V is a value that satisfies
+  /// isThumbImmShiftedVal, return the non-shifted value.
+  static inline unsigned getThumbImmNonShiftedVal(unsigned V) {
+    return V >> getThumbImmValShift(V);
+  }
+
+
+  /// getT2SOImmValSplat - Return the 12-bit encoded representation
+  /// if the specified value can be obtained by splatting the low 8 bits
+  /// into every other byte or every byte of a 32-bit value. i.e.,
+  ///     00000000 00000000 00000000 abcdefgh    control = 0
+  ///     00000000 abcdefgh 00000000 abcdefgh    control = 1
+  ///     abcdefgh 00000000 abcdefgh 00000000    control = 2
+  ///     abcdefgh abcdefgh abcdefgh abcdefgh    control = 3
+  /// Return -1 if none of the above apply.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValSplatVal(unsigned V) {
+    unsigned u, Vs, Imm;
+    // control = 0
+    if ((V & 0xffffff00) == 0)
+      return V;
+
+    // If the value is zeroes in the first byte, just shift those off
+    Vs = ((V & 0xff) == 0) ? V >> 8 : V;
+    // Any passing value only has 8 bits of payload, splatted across the word
+    Imm = Vs & 0xff;
+    // Likewise, any passing values have the payload splatted into the 3rd byte
+    u = Imm | (Imm << 16);
+
+    // control = 1 or 2
+    if (Vs == u)
+      return (((Vs == V) ? 1 : 2) << 8) | Imm;
+
+    // control = 3
+    if (Vs == (u | (u << 8)))
+      return (3 << 8) | Imm;
+
+    return -1;
+  }
+
+  /// getT2SOImmValRotateVal - Return the 12-bit encoded representation if the
+  /// specified value is a rotated 8-bit value. Return -1 if no rotation
+  /// encoding is possible.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmValRotateVal(unsigned V) {
+    unsigned RotAmt = CountLeadingZeros_32(V);
+    if (RotAmt >= 24)
+      return -1;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    if ((rotr32(0xff000000U, RotAmt) & V) == V)
+      return (rotr32(V, 24 - RotAmt) & 0x7f) | ((RotAmt + 8) << 7);
+
+    return -1;
+  }
+
+  /// getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit
+  /// into a Thumb-2 shifter_operand immediate operand, return the 12-bit
+  /// encoding for it.  If not, return -1.
+  /// See ARM Reference Manual A6.3.2.
+  static inline int getT2SOImmVal(unsigned Arg) {
+    // If 'Arg' is an 8-bit splat, then get the encoded value.
+    int Splat = getT2SOImmValSplatVal(Arg);
+    if (Splat != -1)
+      return Splat;
+
+    // If 'Arg' can be handled with a single shifter_op return the value.
+    int Rot = getT2SOImmValRotateVal(Arg);
+    if (Rot != -1)
+      return Rot;
+
+    return -1;
+  }
+
+  static inline unsigned getT2SOImmValRotate(unsigned V) {
+    if ((V & ~255U) == 0) return 0;
+    // Use CTZ to compute the rotate amount.
+    unsigned RotAmt = CountTrailingZeros_32(V);
+    return (32 - RotAmt) & 31;
+  }
+
+  static inline bool isT2SOImmTwoPartVal (unsigned Imm) {
+    unsigned V = Imm;
+    // Passing values can be any combination of splat values and shifter
+    // values. If this can be handled with a single shifter or splat, bail
+    // out. Those should be handled directly, not with a two-part val.
+    if (getT2SOImmValSplatVal(V) != -1)
+      return false;
+    V = rotr32 (~255U, getT2SOImmValRotate(V)) & V;
+    if (V == 0)
+      return false;
+
+    // If this can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Likewise, try masking out a splat value first.
+    V = Imm;
+    if (getT2SOImmValSplatVal(V & 0xff00ff00U) != -1)
+      V &= ~0xff00ff00U;
+    else if (getT2SOImmValSplatVal(V & 0x00ff00ffU) != -1)
+      V &= ~0x00ff00ffU;
+    // If what's left can be handled as an immediate, accept.
+    if (getT2SOImmVal(V) != -1) return true;
+
+    // Otherwise, do not accept.
+    return false;
+  }
+
+  static inline unsigned getT2SOImmTwoPartFirst(unsigned Imm) {
+    assert (isT2SOImmTwoPartVal(Imm) &&
+            "Immedate cannot be encoded as two part immediate!");
+    // Try a shifter operand as one part
+    unsigned V = rotr32 (~255, getT2SOImmValRotate(Imm)) & Imm;
+    // If the rest is encodable as an immediate, then return it.
+    if (getT2SOImmVal(V) != -1) return V;
+
+    // Try masking out a splat value first.
+    if (getT2SOImmValSplatVal(Imm & 0xff00ff00U) != -1)
+      return Imm & 0xff00ff00U;
+
+    // The other splat is all that's left as an option.
+    assert (getT2SOImmValSplatVal(Imm & 0x00ff00ffU) != -1);
+    return Imm & 0x00ff00ffU;
+  }
+
+  static inline unsigned getT2SOImmTwoPartSecond(unsigned Imm) {
+    // Mask out the first hunk
+    Imm ^= getT2SOImmTwoPartFirst(Imm);
+    // Return what's left
+    assert (getT2SOImmVal(Imm) != -1 &&
+            "Unable to encode second part of T2 two part SO immediate");
+    return Imm;
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #2
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for most simple load/store instructions.
+  //
+  // addrmode2 := reg +/- reg shop imm
+  // addrmode2 := reg +/- imm12
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 12, the immediate in bits 0-11, and the shift op in 13-15.
+  //
+  // If this addressing mode is a frame index (before prolog/epilog insertion
+  // and code rewriting), this operand will have the form:  FI#, reg0, <offs>
+  // with no shift amount for the frame offset.
+  //
+  static inline unsigned getAM2Opc(AddrOpc Opc, unsigned Imm12, ShiftOpc SO) {
+    assert(Imm12 < (1 << 12) && "Imm too large!");
+    bool isSub = Opc == sub;
+    return Imm12 | ((int)isSub << 12) | (SO << 13);
+  }
+  static inline unsigned getAM2Offset(unsigned AM2Opc) {
+    return AM2Opc & ((1 << 12)-1);
+  }
+  static inline AddrOpc getAM2Op(unsigned AM2Opc) {
+    return ((AM2Opc >> 12) & 1) ? sub : add;
+  }
+  static inline ShiftOpc getAM2ShiftOpc(unsigned AM2Opc) {
+    return (ShiftOpc)(AM2Opc >> 13);
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #3
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for sign-extending loads, and load/store-pair instructions.
+  //
+  // addrmode3 := reg +/- reg
+  // addrmode3 := reg +/- imm8
+  //
+  // The first operand is always a Reg.  The second operand is a reg if in
+  // reg/reg form, otherwise it's reg#0.  The third field encodes the operation
+  // in bit 8, the immediate in bits 0-7.
+
+  /// getAM3Opc - This function encodes the addrmode3 opc field.
+  static inline unsigned getAM3Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM3Offset(unsigned AM3Opc) {
+    return AM3Opc & 0xFF;
+  }
+  static inline AddrOpc getAM3Op(unsigned AM3Opc) {
+    return ((AM3Opc >> 8) & 1) ? sub : add;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #4
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for load / store multiple instructions.
+  //
+  // addrmode4 := reg, <mode>
+  //
+  // The four modes are:
+  //    IA - Increment after
+  //    IB - Increment before
+  //    DA - Decrement after
+  //    DB - Decrement before
+  //
+  // If the 4th bit (writeback) is set, then the base register is updated after
+  // the memory transfer.
+
+  static inline AMSubMode getAM4SubMode(unsigned Mode) {
+    return (AMSubMode)(Mode & 0x7);
+  }
+
+  static inline unsigned getAM4ModeImm(AMSubMode SubMode, bool WB = false) {
+    return (int)SubMode | ((int)WB << 3);
+  }
+
+  static inline bool getAM4WBFlag(unsigned Mode) {
+    return (Mode >> 3) & 1;
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #5
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for coprocessor instructions, such as FP load/stores.
+  //
+  // addrmode5 := reg +/- imm8*4
+  //
+  // The first operand is always a Reg.  The second operand encodes the
+  // operation in bit 8 and the immediate in bits 0-7.
+  //
+  // This is also used for FP load/store multiple ops. The second operand
+  // encodes the writeback mode in bit 8 and the number of registers (or 2
+  // times the number of registers for DPR ops) in bits 0-7. In addition,
+  // bits 9-11 encode one of the following two sub-modes:
+  //
+  //    IA - Increment after
+  //    DB - Decrement before
+
+  /// getAM5Opc - This function encodes the addrmode5 opc field.
+  static inline unsigned getAM5Opc(AddrOpc Opc, unsigned char Offset) {
+    bool isSub = Opc == sub;
+    return ((int)isSub << 8) | Offset;
+  }
+  static inline unsigned char getAM5Offset(unsigned AM5Opc) {
+    return AM5Opc & 0xFF;
+  }
+  static inline AddrOpc getAM5Op(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1) ? sub : add;
+  }
+
+  /// getAM5Opc - This function encodes the addrmode5 opc field for VLDM and
+  /// VSTM instructions.
+  static inline unsigned getAM5Opc(AMSubMode SubMode, bool WB,
+                                   unsigned char Offset) {
+    assert((SubMode == ia || SubMode == db) &&
+           "Illegal addressing mode 5 sub-mode!");
+    return ((int)SubMode << 9) | ((int)WB << 8) | Offset;
+  }
+  static inline AMSubMode getAM5SubMode(unsigned AM5Opc) {
+    return (AMSubMode)((AM5Opc >> 9) & 0x7);
+  }
+  static inline bool getAM5WBFlag(unsigned AM5Opc) {
+    return ((AM5Opc >> 8) & 1);
+  }
+
+  //===--------------------------------------------------------------------===//
+  // Addressing Mode #6
+  //===--------------------------------------------------------------------===//
+  //
+  // This is used for NEON load / store instructions.
+  //
+  // addrmode6 := reg with optional writeback and alignment
+  //
+  // This is stored in four operands [regaddr, regupdate, opc, align].  The
+  // first is the address register.  The second register holds the value of
+  // a post-access increment for writeback or reg0 if no writeback or if the
+  // writeback increment is the size of the memory access.  The third
+  // operand encodes whether there is writeback to the address register. The
+  // fourth operand is the value of the alignment specifier to use or zero if
+  // no explicit alignment.
+
+  static inline unsigned getAM6Opc(bool WB = false) {
+    return (int)WB;
+  }
+
+  static inline bool getAM6WBFlag(unsigned Mode) {
+    return Mode & 1;
+  }
+
+} // end namespace ARM_AM
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
new file mode 100644
index 0000000..6fe7c2c
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -0,0 +1,1235 @@
+//===- ARMBaseInstrInfo.cpp - ARM Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMBaseInstrInfo.h"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+// Hidden, off-by-default flag gating the two-address -> three-address
+// conversion of indexed loads/stores (see convertToThreeAddress below).
+static cl::opt<bool>
+EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
+               cl::desc("Enable ARM 2-addr to 3-addr conv"));
+
+/// ARMBaseInstrInfo - Construct the base instruction info, registering the
+/// TableGen-generated instruction descriptor table (ARMInsts) and recording
+/// the subtarget for later feature queries.
+ARMBaseInstrInfo::ARMBaseInstrInfo(const ARMSubtarget& STI)
+  : TargetInstrInfoImpl(ARMInsts, array_lengthof(ARMInsts)),
+    Subtarget(STI) {
+}
+
+/// convertToThreeAddress - Try to rewrite a pre- or post-indexed ARM
+/// load/store (two-address: the base register is both read and written) into
+/// an un-indexed memory access plus an explicit ADD/SUB that performs the
+/// base-register update.  Kill/dead information is transferred to the new
+/// instructions via LiveVariables when provided.  Returns the first of the
+/// two replacement instructions (in program order), or NULL when the
+/// conversion is disabled, the opcode has no un-indexed form, or the offset
+/// cannot be encoded in a single add/sub.
+MachineInstr *
+ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
+                                        MachineBasicBlock::iterator &MBBI,
+                                        LiveVariables *LV) const {
+  // FIXME: Thumb2 support.
+
+  // Off by default; enabled with -enable-arm-3-addr-conv.
+  if (!EnableARM3Addr)
+    return NULL;
+
+  MachineInstr *MI = MBBI;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  unsigned TSFlags = MI->getDesc().TSFlags;
+  bool isPre = false;
+  switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
+  default: return NULL;
+  case ARMII::IndexModePre:
+    isPre = true;
+    break;
+  case ARMII::IndexModePost:
+    break;
+  }
+
+  // Try splitting an indexed load/store to an un-indexed one plus an add/sub
+  // operation.
+  unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
+  if (MemOpc == 0)
+    return NULL;
+
+  MachineInstr *UpdateMI = NULL;
+  MachineInstr *MemMI = NULL;
+  unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
+  const TargetInstrDesc &TID = MI->getDesc();
+  unsigned NumOps = TID.getNumOperands();
+  bool isLoad = !TID.mayStore();
+  // Operand layout: loads are [dst, base-writeback, base, ...]; stores are
+  // [base-writeback, src, base, ...].  The trailing operands are the offset
+  // register, the encoded offset immediate, and the predicate.
+  const MachineOperand &WB = isLoad ? MI->getOperand(1) : MI->getOperand(0);
+  const MachineOperand &Base = MI->getOperand(2);
+  const MachineOperand &Offset = MI->getOperand(NumOps-3);
+  unsigned WBReg = WB.getReg();
+  unsigned BaseReg = Base.getReg();
+  unsigned OffReg = Offset.getReg();
+  unsigned OffImm = MI->getOperand(NumOps-2).getImm();
+  ARMCC::CondCodes Pred = (ARMCC::CondCodes)MI->getOperand(NumOps-1).getImm();
+  switch (AddrMode) {
+  default:
+    assert(false && "Unknown indexed op!");
+    return NULL;
+  case ARMII::AddrMode2: {
+    bool isSub = ARM_AM::getAM2Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM2Offset(OffImm);
+    if (OffReg == 0) {
+      if (ARM_AM::getSOImmVal(Amt) == -1)
+        // Can't encode it in a so_imm operand. This transformation will
+        // add more than 1 instruction. Abandon!
+        return NULL;
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(Amt)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else if (Amt != 0) {
+      // Register offset with a shift amount: use a shifted-register add/sub.
+      ARM_AM::ShiftOpc ShOpc = ARM_AM::getAM2ShiftOpc(OffImm);
+      unsigned SOOpc = ARM_AM::getSORegOpc(ShOpc, Amt);
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrs : ARM::ADDrs), WBReg)
+        .addReg(BaseReg).addReg(OffReg).addReg(0).addImm(SOOpc)
+        .addImm(Pred).addReg(0).addReg(0);
+    } else
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  case ARMII::AddrMode3 : {
+    bool isSub = ARM_AM::getAM3Op(OffImm) == ARM_AM::sub;
+    unsigned Amt = ARM_AM::getAM3Offset(OffImm);
+    if (OffReg == 0)
+      // Immediate is 8-bits. It's guaranteed to fit in a so_imm operand.
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
+        .addReg(BaseReg).addImm(Amt)
+        .addImm(Pred).addReg(0).addReg(0);
+    else
+      UpdateMI = BuildMI(MF, MI->getDebugLoc(),
+                         get(isSub ? ARM::SUBrr : ARM::ADDrr), WBReg)
+        .addReg(BaseReg).addReg(OffReg)
+        .addImm(Pred).addReg(0).addReg(0);
+    break;
+  }
+  }
+
+  // NewMIs is filled in *reverse* program order (see the kill-transfer loop
+  // below, which relies on that).
+  std::vector<MachineInstr*> NewMIs;
+  if (isPre) {
+    // Pre-indexed: the base update executes first, memory access second.
+    if (isLoad)
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(WBReg).addReg(0).addImm(0).addImm(Pred);
+    NewMIs.push_back(MemMI);
+    NewMIs.push_back(UpdateMI);
+  } else {
+    // Post-indexed: memory access uses the old base, then the base updates.
+    if (isLoad)
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc), MI->getOperand(0).getReg())
+        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+    else
+      MemMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(MemOpc)).addReg(MI->getOperand(1).getReg())
+        .addReg(BaseReg).addReg(0).addImm(0).addImm(Pred);
+    if (WB.isDead())
+      UpdateMI->getOperand(0).setIsDead();
+    NewMIs.push_back(UpdateMI);
+    NewMIs.push_back(MemMI);
+  }
+
+  // Transfer LiveVariables states, kill / dead info.
+  if (LV) {
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.getReg() &&
+          TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+        unsigned Reg = MO.getReg();
+
+        LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);
+        if (MO.isDef()) {
+          // The writeback register is now defined by UpdateMI; any other def
+          // (the loaded value) moves to MemMI.
+          MachineInstr *NewMI = (Reg == WBReg) ? UpdateMI : MemMI;
+          if (MO.isDead())
+            LV->addVirtualRegisterDead(Reg, NewMI);
+        }
+        if (MO.isUse() && MO.isKill()) {
+          for (unsigned j = 0; j < 2; ++j) {
+            // Look at the two new MI's in reverse order.
+            MachineInstr *NewMI = NewMIs[j];
+            if (!NewMI->readsRegister(Reg))
+              continue;
+            LV->addVirtualRegisterKilled(Reg, NewMI);
+            if (VI.removeKill(MI))
+              VI.Kills.push_back(NewMI);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Insert both new instructions before MBBI in execution order (NewMIs is
+  // reversed).  NOTE(review): the original indexed MI is presumably erased by
+  // the caller of convertToThreeAddress -- confirm against the
+  // TargetInstrInfo contract.
+  MFI->insert(MBBI, NewMIs[1]);
+  MFI->insert(MBBI, NewMIs[0]);
+  return NewMIs[0];
+}
+
+// Branch analysis.
+
+/// AnalyzeBranch - Examine the terminators of MBB.  When the control flow is
+/// a recognizable shape (fall-through, unconditional branch, conditional
+/// branch with fall-through, or conditional + unconditional pair), report it
+/// through TBB/FBB/Cond and return false.  Return true when the branching
+/// cannot be understood (e.g. indirect branches).  When AllowModify is set,
+/// a dead trailing unconditional branch may be erased as a side effect.
+bool
+ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (isUncondBranchOpcode(LastOpc)) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    if (isCondBranchOpcode(LastOpc)) {
+      // Block ends with fall-through condbranch.
+      // Cond carries the two ARM predicate components: cond-code imm and the
+      // flags register.
+      TBB = LastInst->getOperand(0).getMBB();
+      Cond.push_back(LastInst->getOperand(1));
+      Cond.push_back(LastInst->getOperand(2));
+      return false;
+    }
+    return true;  // Can't handle indirect branch.
+  }
+
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  // NOTE(review): SecondLastInst was just initialized from a valid iterator,
+  // so the null test below is always true; redundant but harmless.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with a B and a Bcc, handle it.
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB =  SecondLastInst->getOperand(0).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(1));
+    Cond.push_back(SecondLastInst->getOperand(2));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // ...likewise if it ends with a branch table followed by an unconditional
+  // branch. The branch folder can create these, and we must get rid of them for
+  // correctness of Thumb constant islands.
+  if ((isJumpTableBranchOpcode(SecondLastOpc) ||
+       isIndirectBranchOpcode(SecondLastOpc)) &&
+      isUncondBranchOpcode(LastOpc)) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    // Still report "can't analyze": the jump-table branch itself is opaque.
+    return true;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+
+/// RemoveBranch - Delete the branch instructions terminating MBB: first the
+/// trailing unconditional or conditional branch, then (if present) a
+/// conditional branch immediately before it.  Returns the number of
+/// branches removed (0, 1, or 2).
+unsigned ARMBaseInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (!isUncondBranchOpcode(I->getOpcode()) &&
+      !isCondBranchOpcode(I->getOpcode()))
+    return 0;
+
+  // Remove the branch.
+  I->eraseFromParent();
+
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  // Only a conditional branch can precede the branch just removed.
+  if (!isCondBranchOpcode(I->getOpcode()))
+    return 1;
+
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+/// InsertBranch - Append a branch sequence to MBB: an unconditional branch to
+/// TBB when Cond is empty, a conditional branch to TBB when FBB is null, or a
+/// conditional branch to TBB plus an unconditional branch to FBB for the
+/// two-way case.  Branch opcodes are chosen for ARM, Thumb1, or Thumb2 based
+/// on the current function's mode.  Returns the number of instructions added.
+unsigned
+ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                               MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME this should probably have a DebugLoc argument
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+
+  ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+  int BOpc   = !AFI->isThumbFunction()
+    ? ARM::B : (AFI->isThumb2Function() ? ARM::t2B : ARM::tB);
+  int BccOpc = !AFI->isThumbFunction()
+    ? ARM::Bcc : (AFI->isThumb2Function() ? ARM::t2Bcc : ARM::tBcc);
+
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "ARM branch conditions have two components!");
+
+  if (FBB == 0) {
+    if (Cond.empty()) // Unconditional branch?
+      BuildMI(&MBB, dl, get(BOpc)).addMBB(TBB);
+    else
+      BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+    return 1;
+  }
+
+  // Two-way conditional branch.
+  BuildMI(&MBB, dl, get(BccOpc)).addMBB(TBB)
+    .addImm(Cond[0].getImm()).addReg(Cond[1].getReg());
+  BuildMI(&MBB, dl, get(BOpc)).addMBB(FBB);
+  return 2;
+}
+
+/// ReverseBranchCondition - Invert the condition code stored in Cond[0]
+/// (e.g. EQ <-> NE, GE <-> LT).  Always succeeds for ARM, hence returns
+/// false.
+bool ARMBaseInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+  Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+  return false;
+}
+
+/// PredicateInstruction - Apply the predicate Pred (condition-code immediate
+/// followed by the flags register) to MI.  An unconditional branch is
+/// rewritten to its conditional form and the predicate operands appended;
+/// any other instruction has its existing predicate operand pair overwritten
+/// in place.  Returns false if MI carries no predicate operands.
+bool ARMBaseInstrInfo::
+PredicateInstruction(MachineInstr *MI,
+                     const SmallVectorImpl<MachineOperand> &Pred) const {
+  unsigned Opc = MI->getOpcode();
+  if (isUncondBranchOpcode(Opc)) {
+    MI->setDesc(get(getMatchingCondBranchOpcode(Opc)));
+    MI->addOperand(MachineOperand::CreateImm(Pred[0].getImm()));
+    MI->addOperand(MachineOperand::CreateReg(Pred[1].getReg(), false));
+    return true;
+  }
+
+  int PIdx = MI->findFirstPredOperandIdx();
+  if (PIdx != -1) {
+    // Predicate operands always come as a pair: cond-code imm, then reg.
+    MachineOperand &PMO = MI->getOperand(PIdx);
+    PMO.setImm(Pred[0].getImm());
+    MI->getOperand(PIdx+1).setReg(Pred[1].getReg());
+    return true;
+  }
+  return false;
+}
+
+/// SubsumesPredicate - Return true if the first predicate is at least as
+/// permissive as the second, i.e. CC1 holds whenever CC2 does: AL subsumes
+/// everything, HS subsumes HI, LS subsumes LO and EQ, GE subsumes GT, and
+/// LE subsumes LT.
+bool ARMBaseInstrInfo::
+SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                  const SmallVectorImpl<MachineOperand> &Pred2) const {
+  if (Pred1.size() > 2 || Pred2.size() > 2)
+    return false;
+
+  ARMCC::CondCodes CC1 = (ARMCC::CondCodes)Pred1[0].getImm();
+  ARMCC::CondCodes CC2 = (ARMCC::CondCodes)Pred2[0].getImm();
+  if (CC1 == CC2)
+    return true;
+
+  switch (CC1) {
+  default:
+    return false;
+  case ARMCC::AL:
+    return true;
+  case ARMCC::HS:
+    return CC2 == ARMCC::HI;
+  case ARMCC::LS:
+    return CC2 == ARMCC::LO || CC2 == ARMCC::EQ;
+  case ARMCC::GE:
+    return CC2 == ARMCC::GT;
+  case ARMCC::LE:
+    return CC2 == ARMCC::LT;
+  }
+}
+
+/// DefinesPredicate - Collect every CPSR operand of MI into Pred and return
+/// true if any was found, i.e. the instruction may write the predicate
+/// (flags) register.
+bool ARMBaseInstrInfo::DefinesPredicate(MachineInstr *MI,
+                                    std::vector<MachineOperand> &Pred) const {
+  // FIXME: This confuses implicit_def with optional CPSR def.
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.getImplicitDefs() && !TID.hasOptionalDef())
+    return false;
+
+  bool Found = false;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.getReg() == ARM::CPSR) {
+      Pred.push_back(MO);
+      Found = true;
+    }
+  }
+
+  return Found;
+}
+
+/// isPredicable - Return true if the specified instruction can be predicated.
+/// By default, this returns true for every instruction with a
+/// PredicateOperand.
+bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isPredicable())
+    return false;
+
+  // NEON-domain instructions are treated as predicable only in Thumb2
+  // functions (presumably because Thumb2 IT blocks can predicate them while
+  // ARM-mode NEON encodings cannot -- confirm).
+  if ((TID.TSFlags & ARMII::DomainMask) == ARMII::DomainNEON) {
+    ARMFunctionInfo *AFI =
+      MI->getParent()->getParent()->getInfo<ARMFunctionInfo>();
+    return AFI->isThumb2Function();
+  }
+  return true;
+}
+
+/// getNumJTEntries - Return the number of basic-block entries in jump table
+/// JTI.  Declared out-of-line and non-inlined deliberately:
+/// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
+DISABLE_INLINE
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI);
+static unsigned getNumJTEntries(const std::vector<MachineJumpTableEntry> &JT,
+                                unsigned JTI) {
+  assert(JTI < JT.size());
+  return JT[JTI].MBBs.size();
+}
+
+/// GetInstSize - Return the size of the specified MachineInstr.
+///
+/// The basic size class comes from the TSFlags Size field; pseudo and
+/// special instructions (inline asm, constant-pool entries, SJLJ setjmp
+/// expansions, jump-table branches) are measured individually.
+unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MachineBasicBlock &MBB = *MI->getParent();
+  const MachineFunction *MF = MBB.getParent();
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+
+  // Basic size info comes from the TSFlags field.
+  const TargetInstrDesc &TID = MI->getDesc();
+  unsigned TSFlags = TID.TSFlags;
+
+  unsigned Opc = MI->getOpcode();
+  switch ((TSFlags & ARMII::SizeMask) >> ARMII::SizeShift) {
+  default: {
+    // If this machine instr is an inline asm, measure it.
+    if (MI->getOpcode() == ARM::INLINEASM)
+      return getInlineAsmLength(MI->getOperand(0).getSymbolName(), *MAI);
+    if (MI->isLabel())
+      return 0;
+    switch (Opc) {
+    default:
+      llvm_unreachable("Unknown or unset size field for instr!");
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+    case TargetOpcode::DBG_LABEL:
+    case TargetOpcode::EH_LABEL:
+      return 0;
+    }
+    break;
+  }
+  case ARMII::Size8Bytes: return 8;          // ARM instruction x 2.
+  case ARMII::Size4Bytes: return 4;          // ARM / Thumb2 instruction.
+  case ARMII::Size2Bytes: return 2;          // Thumb1 instruction.
+  case ARMII::SizeSpecial: {
+    switch (Opc) {
+    case ARM::CONSTPOOL_ENTRY:
+      // If this machine instr is a constant pool entry, its size is recorded as
+      // operand #2.
+      return MI->getOperand(2).getImm();
+    case ARM::Int_eh_sjlj_setjmp:
+      return 24;
+    case ARM::tInt_eh_sjlj_setjmp:
+      return 14;
+    case ARM::t2Int_eh_sjlj_setjmp:
+      return 14;
+    case ARM::BR_JTr:
+    case ARM::BR_JTm:
+    case ARM::BR_JTadd:
+    case ARM::tBR_JTr:
+    case ARM::t2BR_JT:
+    case ARM::t2TBB:
+    case ARM::t2TBH: {
+      // These are jumptable branches, i.e. a branch followed by an inlined
+      // jumptable. The size is 4 + 4 * number of entries. For TBB, each
+      // entry is one byte; TBH two byte each.
+      unsigned EntrySize = (Opc == ARM::t2TBB)
+        ? 1 : ((Opc == ARM::t2TBH) ? 2 : 4);
+      unsigned NumOps = TID.getNumOperands();
+      // NOTE(review): this copies the MachineOperand by value; a const
+      // reference would avoid the copy.
+      MachineOperand JTOP =
+        MI->getOperand(NumOps - (TID.isPredicable() ? 3 : 2));
+      unsigned JTI = JTOP.getIndex();
+      const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+      assert(MJTI != 0);
+      const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+      assert(JTI < JT.size());
+      // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
+      // 4 aligned. The assembler / linker may add 2 byte padding just before
+      // the JT entries.  The size does not include this padding; the
+      // constant islands pass does separate bookkeeping for it.
+      // FIXME: If we know the size of the function is less than (1 << 16) *2
+      // bytes, we can use 16-bit entries instead. Then there won't be an
+      // alignment issue.
+      unsigned InstSize = (Opc == ARM::tBR_JTr || Opc == ARM::t2BR_JT) ? 2 : 4;
+      unsigned NumEntries = getNumJTEntries(JT, JTI);
+      if (Opc == ARM::t2TBB && (NumEntries & 1))
+        // Make sure the instruction that follows TBB is 2-byte aligned.
+        // FIXME: Constant island pass should insert an "ALIGN" instruction
+        // instead.
+        ++NumEntries;
+      return NumEntries * EntrySize + InstSize;
+    }
+    default:
+      // Otherwise, pseudo-instruction sizes are zero.
+      return 0;
+    }
+  }
+  }
+  return 0; // Not reached
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+///
+/// Recognizes the VFP/NEON VMOV family and the ARM/Thumb/Thumb2 MOVr
+/// variants; sub-register indices are always reported as 0.
+bool
+ARMBaseInstrInfo::isMoveInstr(const MachineInstr &MI,
+                              unsigned &SrcReg, unsigned &DstReg,
+                              unsigned& SrcSubIdx, unsigned& DstSubIdx) const {
+  SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+  switch (MI.getOpcode()) {
+  default: break;
+  case ARM::VMOVS:
+  case ARM::VMOVD:
+  case ARM::VMOVDneon:
+  case ARM::VMOVQ: {
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    return true;
+  }
+  case ARM::MOVr:
+  case ARM::tMOVr:
+  case ARM::tMOVgpr2tgpr:
+  case ARM::tMOVtgpr2gpr:
+  case ARM::tMOVgpr2gpr:
+  case ARM::t2MOVr: {
+    assert(MI.getDesc().getNumOperands() >= 2 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           "Invalid ARM MOV instruction");
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    return true;
+  }
+  }
+
+  return false;
+}
+
+/// isLoadFromStackSlot - If MI is a direct load from a stack slot (a frame
+/// index base with a zero offset and, where applicable, no offset register),
+/// set FrameIndex and return the destination register; otherwise return 0.
+unsigned
+ARMBaseInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::LDR:
+  case ARM::t2LDRs:  // FIXME: don't use t2LDRs to access frame.
+    // Addr-mode-2 form: [FI, offset-reg, offset-imm] must be reg0 / imm0.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::t2LDRi12:
+  case ARM::tRestore:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VLDRD:
+  case ARM::VLDRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+/// isStoreToStackSlot - If MI is a direct store to a stack slot (a frame
+/// index base with a zero offset and, where applicable, no offset register),
+/// set FrameIndex and return the register being stored; otherwise return 0.
+unsigned
+ARMBaseInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                     int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case ARM::STR:
+  case ARM::t2STRs: // FIXME: don't use t2STRs to access frame.
+    // Addr-mode-2 form: [FI, offset-reg, offset-imm] must be reg0 / imm0.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isReg() &&
+        MI->getOperand(3).isImm() &&
+        MI->getOperand(2).getReg() == 0 &&
+        MI->getOperand(3).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::t2STRi12:
+  case ARM::tSpill:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  case ARM::VSTRD:
+  case ARM::VSTRS:
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+
+  return 0;
+}
+
+/// copyRegToReg - Emit a register-to-register copy from SrcReg to DestReg
+/// before I, selecting MOVr / VMOVS / VMOVD / VMOVDneon / VMOVQ from the
+/// destination (and source) register classes.  Cross-class copies are only
+/// allowed between same-size 8- or 16-byte (VFP/NEON) classes.  Returns
+/// false when no suitable copy instruction exists.
+bool
+ARMBaseInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I,
+                               unsigned DestReg, unsigned SrcReg,
+                               const TargetRegisterClass *DestRC,
+                               const TargetRegisterClass *SrcRC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (DestRC != SrcRC) {
+    if (DestRC->getSize() != SrcRC->getSize())
+      return false;
+
+    // Allow DPR / DPR_VFP2 / DPR_8 cross-class copies.
+    // Allow QPR / QPR_VFP2 / QPR_8 cross-class copies.
+    if (DestRC->getSize() != 8 && DestRC->getSize() != 16)
+      return false;
+  }
+
+  if (DestRC == ARM::GPRRegisterClass) {
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::MOVr),
+                                        DestReg).addReg(SrcReg)));
+  } else if (DestRC == ARM::SPRRegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVS), DestReg)
+                   .addReg(SrcReg));
+  } else if (DestRC == ARM::DPRRegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVD), DestReg)
+                   .addReg(SrcReg));
+  } else if (DestRC == ARM::DPR_VFP2RegisterClass ||
+             DestRC == ARM::DPR_8RegisterClass ||
+             SrcRC == ARM::DPR_VFP2RegisterClass ||
+             SrcRC == ARM::DPR_8RegisterClass) {
+    // Always use neon reg-reg move if source or dest is NEON-only regclass.
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVDneon),
+                           DestReg).addReg(SrcReg));
+  } else if (DestRC == ARM::QPRRegisterClass ||
+             DestRC == ARM::QPR_VFP2RegisterClass ||
+             DestRC == ARM::QPR_8RegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VMOVQ),
+                           DestReg).addReg(SrcReg));
+  } else {
+    return false;
+  }
+
+  return true;
+}
+
+/// storeRegToStackSlot - Spill SrcReg of class RC to frame index FI,
+/// inserting the store before I and attaching a MachineMemOperand that
+/// describes the stack-slot access.
+void ARMBaseInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                            MachineMemOperand::MOStore, 0,
+                            MFI.getObjectSize(FI),
+                            Align);
+
+  if (RC == ARM::GPRRegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::STR))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+  } else if (RC == ARM::DPRRegisterClass ||
+             RC == ARM::DPR_VFP2RegisterClass ||
+             RC == ARM::DPR_8RegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRD))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  } else if (RC == ARM::SPRRegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRS))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  } else {
+    assert((RC == ARM::QPRRegisterClass ||
+            RC == ARM::QPR_VFP2RegisterClass) && "Unknown regclass!");
+    // FIXME: Neon instructions should support predicates
+    if (Align >= 16
+        && (getRegisterInfo().canRealignStack(MF))) {
+      // 16-byte-aligned slot on a realignable stack: use NEON VST1q64.
+      // The 128 immediate is presumably the alignment in bits -- confirm
+      // against the VST1 operand encoding.
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VST1q64))
+                     .addFrameIndex(FI).addImm(0).addImm(0).addImm(128)
+                     .addMemOperand(MMO)
+                     .addReg(SrcReg, getKillRegState(isKill)));
+    } else {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VSTRQ)).
+                     addReg(SrcReg, getKillRegState(isKill))
+                     .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    }
+  }
+}
+
+/// loadRegFromStackSlot - Reload DestReg of class RC from frame index FI,
+/// inserting the load before I and attaching a MachineMemOperand that
+/// describes the stack-slot access.
+void ARMBaseInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                            MachineMemOperand::MOLoad, 0,
+                            MFI.getObjectSize(FI),
+                            Align);
+
+  if (RC == ARM::GPRRegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::LDR), DestReg)
+                   .addFrameIndex(FI).addReg(0).addImm(0).addMemOperand(MMO));
+  } else if (RC == ARM::DPRRegisterClass ||
+             RC == ARM::DPR_VFP2RegisterClass ||
+             RC == ARM::DPR_8RegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRD), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  } else if (RC == ARM::SPRRegisterClass) {
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRS), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  } else {
+    // NOTE(review): this assert accepts QPR_8 but the corresponding assert in
+    // storeRegToStackSlot does not -- confirm whether the asymmetry is
+    // intentional.
+    assert((RC == ARM::QPRRegisterClass ||
+            RC == ARM::QPR_VFP2RegisterClass ||
+            RC == ARM::QPR_8RegisterClass) && "Unknown regclass!");
+    if (Align >= 16
+        && (getRegisterInfo().canRealignStack(MF))) {
+      // 16-byte-aligned slot on a realignable stack: use NEON VLD1q64.
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
+                     .addFrameIndex(FI).addImm(0).addImm(0).addImm(128)
+                     .addMemOperand(MMO));
+    } else {
+      AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::VLDRQ), DestReg)
+                     .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    }
+  }
+}
+
+/// foldMemoryOperandImpl - Try to fold operand Ops[0] of a register move
+/// into a direct frame access: folding operand 0 (the destination) turns the
+/// move into a store of its source to FI; folding the other operand turns it
+/// into a load of its destination from FI.  Handles MOVr/t2MOVr, the Thumb
+/// GPR move variants, VMOVS, and VMOVD; any other opcode (including
+/// VMOVDneon/VMOVQ) yields NULL, meaning no folding was performed.
+MachineInstr *ARMBaseInstrInfo::
+foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                      const SmallVectorImpl<unsigned> &Ops, int FI) const {
+  if (Ops.size() != 1) return NULL;
+
+  unsigned OpNum = Ops[0];
+  unsigned Opc = MI->getOpcode();
+  MachineInstr *NewMI = NULL;
+  if (Opc == ARM::MOVr || Opc == ARM::t2MOVr) {
+    // If it is updating CPSR, then it cannot be folded.
+    if (MI->getOperand(4).getReg() == ARM::CPSR && !MI->getOperand(4).isDead())
+      return NULL;
+    unsigned Pred = MI->getOperand(2).getImm();
+    unsigned PredReg = MI->getOperand(3).getReg();
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      unsigned SrcSubReg = MI->getOperand(1).getSubReg();
+      bool isKill = MI->getOperand(1).isKill();
+      bool isUndef = MI->getOperand(1).isUndef();
+      if (Opc == ARM::MOVr)
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::STR))
+          .addReg(SrcReg,
+                  getKillRegState(isKill) | getUndefRegState(isUndef),
+                  SrcSubReg)
+          .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+      else // ARM::t2MOVr
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12))
+          .addReg(SrcReg,
+                  getKillRegState(isKill) | getUndefRegState(isUndef),
+                  SrcSubReg)
+          .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned DstSubReg = MI->getOperand(0).getSubReg();
+      bool isDead = MI->getOperand(0).isDead();
+      bool isUndef = MI->getOperand(0).isUndef();
+      if (Opc == ARM::MOVr)
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::LDR))
+          .addReg(DstReg,
+                  RegState::Define |
+                  getDeadRegState(isDead) |
+                  getUndefRegState(isUndef), DstSubReg)
+          .addFrameIndex(FI).addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+      else // ARM::t2MOVr
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12))
+          .addReg(DstReg,
+                  RegState::Define |
+                  getDeadRegState(isDead) |
+                  getUndefRegState(isUndef), DstSubReg)
+          .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+    }
+  } else if (Opc == ARM::tMOVgpr2gpr ||
+             Opc == ARM::tMOVtgpr2gpr ||
+             Opc == ARM::tMOVgpr2tgpr) {
+    // Thumb GPR moves fold to Thumb2 i12 frame accesses, always predicated
+    // AL with no predicate register.
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      unsigned SrcSubReg = MI->getOperand(1).getSubReg();
+      bool isKill = MI->getOperand(1).isKill();
+      bool isUndef = MI->getOperand(1).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2STRi12))
+        .addReg(SrcReg,
+                getKillRegState(isKill) | getUndefRegState(isUndef),
+                SrcSubReg)
+        .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned DstSubReg = MI->getOperand(0).getSubReg();
+      bool isDead = MI->getOperand(0).isDead();
+      bool isUndef = MI->getOperand(0).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::t2LDRi12))
+        .addReg(DstReg,
+                RegState::Define |
+                getDeadRegState(isDead) |
+                getUndefRegState(isUndef),
+                DstSubReg)
+        .addFrameIndex(FI).addImm(0).addImm(ARMCC::AL).addReg(0);
+    }
+  } else if (Opc == ARM::VMOVS) {
+    unsigned Pred = MI->getOperand(2).getImm();
+    unsigned PredReg = MI->getOperand(3).getReg();
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      unsigned SrcSubReg = MI->getOperand(1).getSubReg();
+      bool isKill = MI->getOperand(1).isKill();
+      bool isUndef = MI->getOperand(1).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRS))
+        .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef),
+                SrcSubReg)
+        .addFrameIndex(FI)
+        .addImm(0).addImm(Pred).addReg(PredReg);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned DstSubReg = MI->getOperand(0).getSubReg();
+      bool isDead = MI->getOperand(0).isDead();
+      bool isUndef = MI->getOperand(0).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRS))
+        .addReg(DstReg,
+                RegState::Define |
+                getDeadRegState(isDead) |
+                getUndefRegState(isUndef),
+                DstSubReg)
+        .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+    }
+  }
+  else if (Opc == ARM::VMOVD) {
+    unsigned Pred = MI->getOperand(2).getImm();
+    unsigned PredReg = MI->getOperand(3).getReg();
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      unsigned SrcSubReg = MI->getOperand(1).getSubReg();
+      bool isKill = MI->getOperand(1).isKill();
+      bool isUndef = MI->getOperand(1).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VSTRD))
+        .addReg(SrcReg,
+                getKillRegState(isKill) | getUndefRegState(isUndef),
+                SrcSubReg)
+        .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      unsigned DstSubReg = MI->getOperand(0).getSubReg();
+      bool isDead = MI->getOperand(0).isDead();
+      bool isUndef = MI->getOperand(0).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(), get(ARM::VLDRD))
+        .addReg(DstReg,
+                RegState::Define |
+                getDeadRegState(isDead) |
+                getUndefRegState(isUndef),
+                DstSubReg)
+        .addFrameIndex(FI).addImm(0).addImm(Pred).addReg(PredReg);
+    }
+  }
+
+  return NewMI;
+}
+
+MachineInstr*
+ARMBaseInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                        MachineInstr* MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        MachineInstr* LoadMI) const {
+  // FIXME: folding a load instruction (LoadMI) directly into MI is not
+  // implemented yet; returning null tells the caller no folding happened.
+  return 0;
+}
+
+bool
+ARMBaseInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                   const SmallVectorImpl<unsigned> &Ops) const {
+  // Only a single operand can be folded.
+  if (Ops.size() != 1) return false;
+
+  switch (MI->getOpcode()) {
+  case ARM::MOVr:
+  case ARM::t2MOVr:
+    // Foldable only if the CPSR def (operand 4) is absent or dead.
+    return MI->getOperand(4).getReg() != ARM::CPSR ||
+           MI->getOperand(4).isDead();
+  case ARM::tMOVgpr2gpr:
+  case ARM::tMOVtgpr2gpr:
+  case ARM::tMOVgpr2tgpr:
+  case ARM::VMOVS:
+  case ARM::VMOVD:
+    return true;
+  case ARM::VMOVDneon:
+  case ARM::VMOVQ:
+    return false; // FIXME
+  default:
+    return false;
+  }
+}
+
+/// Create a copy of a const pool value. Update CPI to the new index and return
+/// the label UID.
+static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
+  MachineConstantPool *MCP = MF.getConstantPool();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
+  assert(MCPE.isMachineConstantPoolEntry() &&
+         "Expecting a machine constantpool entry!");
+  ARMConstantPoolValue *ACPV =
+    static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+  // Give the copy its own pc-label UID so it cannot clash with the original.
+  unsigned PCLabelId = AFI->createConstPoolEntryUId();
+  ARMConstantPoolValue *NewCPV = 0;
+  // Clone according to the kind of value held by the entry.
+  if (ACPV->isGlobalValue())
+    NewCPV = new ARMConstantPoolValue(ACPV->getGV(), PCLabelId,
+                                      ARMCP::CPValue, 4);
+  else if (ACPV->isExtSymbol())
+    NewCPV = new ARMConstantPoolValue(MF.getFunction()->getContext(),
+                                      ACPV->getSymbol(), PCLabelId, 4);
+  else if (ACPV->isBlockAddress())
+    NewCPV = new ARMConstantPoolValue(ACPV->getBlockAddress(), PCLabelId,
+                                      ARMCP::CPBlockAddress, 4);
+  else
+    llvm_unreachable("Unexpected ARM constantpool value type!!");
+  // Return the new entry's index through the CPI reference parameter.
+  CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
+  return PCLabelId;
+}
+
+// Insert a copy of Orig before I defining DestReg:SubIdx.  Pc-relative
+// constant-pool loads (tLDRpci_pic / t2LDRpci_pic) are not plain-cloned:
+// their constant-pool entry is duplicated with a fresh pc-label UID.
+void ARMBaseInstrInfo::
+reMaterialize(MachineBasicBlock &MBB,
+              MachineBasicBlock::iterator I,
+              unsigned DestReg, unsigned SubIdx,
+              const MachineInstr *Orig,
+              const TargetRegisterInfo *TRI) const {
+  // Fold a sub-register index on a physical register into the register.
+  if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) {
+    DestReg = TRI->getSubReg(DestReg, SubIdx);
+    SubIdx = 0;
+  }
+
+  unsigned Opcode = Orig->getOpcode();
+  switch (Opcode) {
+  default: {
+    // Plain clone with the destination register rewritten.
+    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
+    MI->getOperand(0).setReg(DestReg);
+    MBB.insert(I, MI);
+    break;
+  }
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    MachineFunction &MF = *MBB.getParent();
+    unsigned CPI = Orig->getOperand(1).getIndex();
+    // duplicateCPV updates CPI to the new entry and returns a fresh label UID.
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    MachineInstrBuilder MIB = BuildMI(MBB, I, Orig->getDebugLoc(), get(Opcode),
+                                      DestReg)
+      .addConstantPoolIndex(CPI).addImm(PCLabelId);
+    (*MIB).setMemRefs(Orig->memoperands_begin(), Orig->memoperands_end());
+    break;
+  }
+  }
+
+  // Propagate any remaining sub-register index onto the new definition.
+  MachineInstr *NewMI = prior(I);
+  NewMI->getOperand(0).setSubReg(SubIdx);
+}
+
+// Clone Orig into MF.  For pc-relative constant-pool loads the constant-pool
+// entry is duplicated as well so the two instructions reference distinct
+// entries / pc-label UIDs.
+MachineInstr *
+ARMBaseInstrInfo::duplicate(MachineInstr *Orig, MachineFunction &MF) const {
+  MachineInstr *MI = TargetInstrInfoImpl::duplicate(Orig, MF);
+  switch(Orig->getOpcode()) {
+  case ARM::tLDRpci_pic:
+  case ARM::t2LDRpci_pic: {
+    unsigned CPI = Orig->getOperand(1).getIndex();
+    unsigned PCLabelId = duplicateCPV(MF, CPI);
+    // NOTE(review): the duplicated index/label are written into Orig, not
+    // into the new clone MI -- confirm this asymmetry is intentional.
+    Orig->getOperand(1).setIndex(CPI);
+    Orig->getOperand(2).setImm(PCLabelId);
+    break;
+  }
+  }
+  return MI;
+}
+
+// Compare two instructions for identity.  Constant-pool loads get special
+// handling: two loads may use different constant-pool indices whose
+// underlying ARMConstantPoolValues are nevertheless the same value.
+bool ARMBaseInstrInfo::isIdentical(const MachineInstr *MI0,
+                                  const MachineInstr *MI1,
+                                  const MachineRegisterInfo *MRI) const {
+  int Opcode = MI0->getOpcode();
+  if (Opcode == ARM::t2LDRpci ||
+      Opcode == ARM::t2LDRpci_pic ||
+      Opcode == ARM::tLDRpci ||
+      Opcode == ARM::tLDRpci_pic) {
+    if (MI1->getOpcode() != Opcode)
+      return false;
+    if (MI0->getNumOperands() != MI1->getNumOperands())
+      return false;
+
+    const MachineOperand &MO0 = MI0->getOperand(1);
+    const MachineOperand &MO1 = MI1->getOperand(1);
+    if (MO0.getOffset() != MO1.getOffset())
+      return false;
+
+    // Compare the actual constant-pool values, not just the indices.
+    const MachineFunction *MF = MI0->getParent()->getParent();
+    const MachineConstantPool *MCP = MF->getConstantPool();
+    int CPI0 = MO0.getIndex();
+    int CPI1 = MO1.getIndex();
+    const MachineConstantPoolEntry &MCPE0 = MCP->getConstants()[CPI0];
+    const MachineConstantPoolEntry &MCPE1 = MCP->getConstants()[CPI1];
+    ARMConstantPoolValue *ACPV0 =
+      static_cast<ARMConstantPoolValue*>(MCPE0.Val.MachineCPVal);
+    ARMConstantPoolValue *ACPV1 =
+      static_cast<ARMConstantPoolValue*>(MCPE1.Val.MachineCPVal);
+    return ACPV0->hasSameValue(ACPV1);
+  }
+
+  // Everything else defers to the default operand-by-operand comparison.
+  return TargetInstrInfoImpl::isIdentical(MI0, MI1, MRI);
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference (0 when the instruction has no predicate operands).
+ARMCC::CondCodes
+llvm::getInstrPredicate(const MachineInstr *MI, unsigned &PredReg) {
+  const int PredIdx = MI->findFirstPredOperandIdx();
+  if (PredIdx != -1) {
+    // The predicate register operand follows the condition-code immediate.
+    PredReg = MI->getOperand(PredIdx+1).getReg();
+    return (ARMCC::CondCodes)MI->getOperand(PredIdx).getImm();
+  }
+  PredReg = 0;
+  return ARMCC::AL;
+}
+
+
+// Map an unconditional branch opcode to its conditional counterpart.
+int llvm::getMatchingCondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::B:   return ARM::Bcc;
+  case ARM::tB:  return ARM::tBcc;
+  case ARM::t2B: return ARM::t2Bcc;
+  }
+
+  llvm_unreachable("Unknown unconditional branch opcode!");
+  return 0;
+}
+
+
+// Emit a sequence of ADDri / SUBri instructions materializing
+// DestReg = BaseReg + NumBytes, splitting NumBytes into chunks that each
+// encode as an ARM so_imm (rotated 8-bit) immediate.
+void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg, int NumBytes,
+                               ARMCC::CondCodes Pred, unsigned PredReg,
+                               const ARMBaseInstrInfo &TII) {
+  // Work with the magnitude; remember the sign to pick ADD vs. SUB.
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  while (NumBytes) {
+    // Peel off the largest chunk encodable as a rotated 8-bit immediate.
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(NumBytes);
+    unsigned ThisVal = NumBytes & ARM_AM::rotr32(0xFF, RotAmt);
+    assert(ThisVal && "Didn't extract field correctly");
+
+    // We will handle these bits from offset, clear them.
+    NumBytes &= ~ThisVal;
+
+    assert(ARM_AM::getSOImmVal(ThisVal) != -1 && "Bit extraction didn't work?");
+
+    // Build the new ADD / SUB.
+    unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri;
+    BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+      .addReg(BaseReg, RegState::Kill).addImm(ThisVal)
+      .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+    // Chain: subsequent chunks add onto the value just produced.
+    BaseReg = DestReg;
+  }
+}
+
+// Rewrite the frame-index operand of MI (at FrameRegIdx) to use FrameReg plus
+// Offset.  Returns true if the whole offset was folded into the instruction;
+// otherwise the unencodable remainder is returned through Offset.
+bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                unsigned FrameReg, int &Offset,
+                                const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const TargetInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrMode2.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrMode2;
+
+  if (Opcode == ARM::ADDri) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+    if (Offset == 0) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::MOVr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.RemoveOperand(FrameRegIdx+1);
+      Offset = 0;
+      return true;
+    } else if (Offset < 0) {
+      // Negative offset: flip ADDri to SUBri and work with the magnitude.
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(ARM::SUBri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getSOImmVal(Offset) != -1) {
+      // Replace the FrameIndex with sp / fp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, pull as much of the immediate into this ADDri/SUBri
+    // as possible.
+    unsigned RotAmt = ARM_AM::getSOImmValRotate(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xFF, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    // Get the properly encoded SOImmVal field.
+    assert(ARM_AM::getSOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+ } else {
+    unsigned ImmIdx = 0;
+    int InstrOffs = 0;
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    // Decode the current immediate operand for this addressing mode: ImmIdx
+    // locates it, NumBits is the encodable field width, Scale the implicit
+    // multiplier applied to the encoded value.
+    switch (AddrMode) {
+    case ARMII::AddrMode2: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM2Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM2Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 12;
+      break;
+    }
+    case ARMII::AddrMode3: {
+      ImmIdx = FrameRegIdx+2;
+      InstrOffs = ARM_AM::getAM3Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM3Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      break;
+    }
+    case ARMII::AddrMode4:
+    case ARMII::AddrMode6:
+      // Can't fold any offset even if it's zero.
+      return false;
+    case ARMII::AddrMode5: {
+      ImmIdx = FrameRegIdx+1;
+      InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
+      if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      break;
+    }
+    default:
+      llvm_unreachable("Unsupported addressing mode!");
+      break;
+    }
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+    }
+
+    // Attempt to fold address comp. if opcode has offset bits
+    if (NumBits > 0) {
+      // Common case: small offset, fits into instruction.
+      MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+      int ImmedOffset = Offset / Scale;
+      unsigned Mask = (1 << NumBits) - 1;
+      if ((unsigned)Offset <= Mask * Scale) {
+        // Replace the FrameIndex with sp
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        // The bit above the immediate field carries the add/sub sense.
+        if (isSub)
+          ImmedOffset |= 1 << NumBits;
+        ImmOp.ChangeToImmediate(ImmedOffset);
+        Offset = 0;
+        return true;
+      }
+
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      if (isSub)
+        ImmedOffset |= 1 << NumBits;
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask*Scale);
+    }
+  }
+
+  // Report back whatever portion of the offset is still outstanding.
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
new file mode 100644
index 0000000..0d9d4a7
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -0,0 +1,375 @@
+//===- ARMBaseInstrInfo.h - ARM Base Instruction Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Base ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEINSTRUCTIONINFO_H
+#define ARMBASEINSTRUCTIONINFO_H
+
+#include "ARM.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+/// ARMII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace ARMII {
+  enum {
+    //===------------------------------------------------------------------===//
+    // Instruction Flags.
+
+    //===------------------------------------------------------------------===//
+    // This four-bit field describes the addressing mode used.
+
+    AddrModeMask  = 0xf,
+    AddrModeNone    = 0,
+    AddrMode1       = 1,
+    AddrMode2       = 2,
+    AddrMode3       = 3,
+    AddrMode4       = 4,
+    AddrMode5       = 5,
+    AddrMode6       = 6,
+    AddrModeT1_1    = 7,
+    AddrModeT1_2    = 8,
+    AddrModeT1_4    = 9,
+    AddrModeT1_s    = 10, // i8 * 4 for pc and sp relative data
+    AddrModeT2_i12  = 11,
+    AddrModeT2_i8   = 12,
+    AddrModeT2_so   = 13,
+    AddrModeT2_pc   = 14, // +/- i12 for pc relative data
+    AddrModeT2_i8s4 = 15, // i8 * 4
+
+    // Size* - Flags to keep track of the size of an instruction.
+    SizeShift     = 4,
+    SizeMask      = 7 << SizeShift,
+    SizeSpecial   = 1,   // 0 byte pseudo or special case.
+    Size8Bytes    = 2,
+    Size4Bytes    = 3,
+    Size2Bytes    = 4,
+
+    // IndexMode - Unindex, pre-indexed, or post-indexed. Only valid for load
+    // and store ops
+    IndexModeShift = 7,
+    IndexModeMask  = 3 << IndexModeShift,
+    IndexModePre   = 1,
+    IndexModePost  = 2,
+
+    //===------------------------------------------------------------------===//
+    // Instruction encoding formats.
+    //
+    FormShift     = 9,
+    FormMask      = 0x3f << FormShift,
+
+    // Pseudo instructions
+    Pseudo        = 0  << FormShift,
+
+    // Multiply instructions
+    MulFrm        = 1  << FormShift,
+
+    // Branch instructions
+    BrFrm         = 2  << FormShift,
+    BrMiscFrm     = 3  << FormShift,
+
+    // Data Processing instructions
+    DPFrm         = 4  << FormShift,
+    DPSoRegFrm    = 5  << FormShift,
+
+    // Load and Store
+    LdFrm         = 6  << FormShift,
+    StFrm         = 7  << FormShift,
+    LdMiscFrm     = 8  << FormShift,
+    StMiscFrm     = 9  << FormShift,
+    LdStMulFrm    = 10 << FormShift,
+
+    // Load / store exclusive (note: form value 28, out of numeric sequence).
+    LdStExFrm     = 28 << FormShift,
+
+    // Miscellaneous arithmetic instructions
+    ArithMiscFrm  = 11 << FormShift,
+
+    // Extend instructions
+    ExtFrm        = 12 << FormShift,
+
+    // VFP formats
+    VFPUnaryFrm   = 13 << FormShift,
+    VFPBinaryFrm  = 14 << FormShift,
+    VFPConv1Frm   = 15 << FormShift,
+    VFPConv2Frm   = 16 << FormShift,
+    VFPConv3Frm   = 17 << FormShift,
+    VFPConv4Frm   = 18 << FormShift,
+    VFPConv5Frm   = 19 << FormShift,
+    VFPLdStFrm    = 20 << FormShift,
+    VFPLdStMulFrm = 21 << FormShift,
+    VFPMiscFrm    = 22 << FormShift,
+
+    // Thumb format
+    ThumbFrm      = 23 << FormShift,
+
+    // NEON format
+    NEONFrm       = 24 << FormShift,
+    NEONGetLnFrm  = 25 << FormShift,
+    NEONSetLnFrm  = 26 << FormShift,
+    NEONDupFrm    = 27 << FormShift,
+
+    //===------------------------------------------------------------------===//
+    // Misc flags.
+
+    // UnaryDP - Indicates this is a unary data processing instruction, i.e.
+    // it doesn't have a Rn operand.
+    UnaryDP       = 1 << 15,
+
+    // Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+    // a 16-bit Thumb instruction if certain conditions are met.
+    Xform16Bit    = 1 << 16,
+
+    //===------------------------------------------------------------------===//
+    // Code domain.
+    DomainShift   = 17,
+    DomainMask    = 3 << DomainShift,
+    DomainGeneral = 0 << DomainShift,
+    DomainVFP     = 1 << DomainShift,
+    DomainNEON    = 2 << DomainShift,
+
+    //===------------------------------------------------------------------===//
+    // Field shifts - such shifts are used to set field while generating
+    // machine instructions.
+    M_BitShift     = 5,
+    ShiftImmShift  = 5,
+    ShiftShift     = 7,
+    N_BitShift     = 7,
+    ImmHiShift     = 8,
+    SoRotImmShift  = 8,
+    RegRsShift     = 8,
+    ExtRotImmShift = 10,
+    RegRdLoShift   = 12,
+    RegRdShift     = 12,
+    RegRdHiShift   = 16,
+    RegRnShift     = 16,
+    S_BitShift     = 20,
+    W_BitShift     = 21,
+    AM3_I_BitShift = 22,
+    D_BitShift     = 22,
+    U_BitShift     = 23,
+    P_BitShift     = 24,
+    I_BitShift     = 25,
+    CondShift      = 28
+  };
+
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // ARM Specific MachineOperand flags.
+
+    MO_NO_FLAG,
+
+    /// MO_LO16 - On a symbol operand, this represents a relocation containing
+    /// lower 16 bit of the address. Used only via movw instruction.
+    MO_LO16,
+
+    /// MO_HI16 - On a symbol operand, this represents a relocation containing
+    /// higher 16 bit of the address. Used only via movt instruction.
+    MO_HI16
+  };
+}
+
+/// ARMBaseInstrInfo - Instruction-info functionality common to the ARM and
+/// Thumb targets.  Concrete subclasses supply the register info and the
+/// unindexed-opcode mapping via the pure virtual members below.
+class ARMBaseInstrInfo : public TargetInstrInfoImpl {
+  const ARMSubtarget& Subtarget;
+protected:
+  // Can be only subclassed.
+  explicit ARMBaseInstrInfo(const ARMSubtarget &STI);
+public:
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  virtual unsigned getUnindexedOpcode(unsigned Opc) const =0;
+
+  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
+                                              MachineBasicBlock::iterator &MBBI,
+                                              LiveVariables *LV) const;
+
+  virtual const ARMBaseRegisterInfo &getRegisterInfo() const =0;
+  const ARMSubtarget &getSubtarget() const { return Subtarget; }
+
+  // Branch analysis.
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond) const;
+
+  virtual
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  // Predication support.
+
+  // True when MI carries a predicate other than "always" (AL).
+  bool isPredicated(const MachineInstr *MI) const {
+    int PIdx = MI->findFirstPredOperandIdx();
+    return PIdx != -1 && MI->getOperand(PIdx).getImm() != ARMCC::AL;
+  }
+
+  // Condition code of MI's predicate operand, or AL when unpredicated.
+  ARMCC::CondCodes getPredicate(const MachineInstr *MI) const {
+    int PIdx = MI->findFirstPredOperandIdx();
+    return PIdx != -1 ? (ARMCC::CondCodes)MI->getOperand(PIdx).getImm()
+                      : ARMCC::AL;
+  }
+
+  virtual
+  bool PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const;
+
+  virtual
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+  virtual bool DefinesPredicate(MachineInstr *MI,
+                                std::vector<MachineOperand> &Pred) const;
+
+  virtual bool isPredicable(MachineInstr *MI) const;
+
+  /// GetInstSize - Returns the size of the specified MachineInstr.
+  ///
+  virtual unsigned GetInstSizeInBytes(const MachineInstr* MI) const;
+
+  /// Return true if the instruction is a register to register move and return
+  /// the source and dest operands and their sub-register indices by reference.
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg,
+                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+
+  virtual bool copyRegToReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator I,
+                            unsigned DestReg, unsigned SrcReg,
+                            const TargetRegisterClass *DestRC,
+                            const TargetRegisterClass *SrcRC) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC) const;
+
+  virtual bool canFoldMemoryOperand(const MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops) const;
+
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                              const SmallVectorImpl<unsigned> &Ops,
+                                              int FrameIndex) const;
+
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              MachineInstr* LoadMI) const;
+
+  virtual void reMaterialize(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MI,
+                             unsigned DestReg, unsigned SubIdx,
+                             const MachineInstr *Orig,
+                             const TargetRegisterInfo *TRI) const;
+
+  MachineInstr *duplicate(MachineInstr *Orig, MachineFunction &MF) const;
+
+  virtual bool isIdentical(const MachineInstr *MI, const MachineInstr *Other,
+                           const MachineRegisterInfo *MRI) const;
+};
+
+// Append the default "always execute" predicate operands: the ARMCC::AL
+// condition immediate followed by a zero (no) condition-code register.
+static inline
+const MachineInstrBuilder &AddDefaultPred(const MachineInstrBuilder &MIB) {
+  return MIB.addImm((int64_t)ARMCC::AL).addReg(0);
+}
+
+// Append a zero ("no register") operand as the default cc operand.
+static inline
+const MachineInstrBuilder &AddDefaultCC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+// Append an explicit CPSR def operand, optionally marked dead.
+static inline
+const MachineInstrBuilder &AddDefaultT1CC(const MachineInstrBuilder &MIB,
+                                          bool isDead = false) {
+  return MIB.addReg(ARM::CPSR, getDefRegState(true) | getDeadRegState(isDead));
+}
+
+// Append a zero ("no register") operand in place of a CPSR def.
+static inline
+const MachineInstrBuilder &AddNoT1CC(const MachineInstrBuilder &MIB) {
+  return MIB.addReg(0);
+}
+
+// Is Opc an unconditional direct branch (ARM, Thumb1 or Thumb2)?
+static inline
+bool isUncondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::B:
+  case ARM::tB:
+  case ARM::t2B:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Is Opc a conditional branch (ARM, Thumb1 or Thumb2)?
+static inline
+bool isCondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::Bcc:
+  case ARM::tBcc:
+  case ARM::t2Bcc:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Is Opc one of the jump-table branch pseudo opcodes?
+static inline
+bool isJumpTableBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::BR_JTr:
+  case ARM::BR_JTm:
+  case ARM::BR_JTadd:
+  case ARM::tBR_JTr:
+  case ARM::t2BR_JT:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Is Opc an indirect (register) branch?
+static inline
+bool isIndirectBranchOpcode(int Opc) {
+  switch (Opc) {
+  case ARM::BRIND:
+  case ARM::tBRIND:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// getInstrPredicate - If instruction is predicated, returns its predicate
+/// condition, otherwise returns AL. It also returns the condition code
+/// register by reference.
+ARMCC::CondCodes getInstrPredicate(const MachineInstr *MI, unsigned &PredReg);
+
+/// getMatchingCondBranchOpcode - Map an unconditional branch opcode to the
+/// corresponding conditional branch opcode.
+int getMatchingCondBranchOpcode(int Opc);
+
+/// emitARMRegPlusImmediate / emitT2RegPlusImmediate - Emits a series of
+/// instructions to materialize DestReg = BaseReg + immediate in ARM / Thumb2
+/// code.
+void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                             unsigned DestReg, unsigned BaseReg, int NumBytes,
+                             ARMCC::CondCodes Pred, unsigned PredReg,
+                             const ARMBaseInstrInfo &TII);
+
+void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                            unsigned DestReg, unsigned BaseReg, int NumBytes,
+                            ARMCC::CondCodes Pred, unsigned PredReg,
+                            const ARMBaseInstrInfo &TII);
+
+
+/// rewriteARMFrameIndex / rewriteT2FrameIndex -
+/// Rewrite MI to access 'Offset' bytes from the FP. Return false if the
+/// offset could not be handled directly in MI, and return the left-over
+/// portion by reference.
+bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                          unsigned FrameReg, int &Offset,
+                          const ARMBaseInstrInfo &TII);
+
+bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                         unsigned FrameReg, int &Offset,
+                         const ARMBaseInstrInfo &TII);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
new file mode 100644
index 0000000..cb0bd1d
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -0,0 +1,1502 @@
+//===- ARMBaseRegisterInfo.cpp - ARM Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<bool>
+ReuseFrameIndexVals("arm-reuse-frame-index-vals", cl::Hidden, cl::init(true),
+          cl::desc("Reuse repeated frame index values"));
+
+/// getRegisterNumbering - Given the enum value for some register (e.g.
+/// ARM::LR), return the hardware encoding number it corresponds to (e.g. 14).
+/// Core registers, D registers, and Q registers share one numbering space
+/// here; the S registers (single-precision VFP) have their own 0-31 space,
+/// and *isSPVFP (when non-null) is set to true to distinguish them.
+unsigned ARMBaseRegisterInfo::getRegisterNumbering(unsigned RegEnum,
+                                                   bool *isSPVFP) {
+  // Default to "not a single-precision VFP register".
+  if (isSPVFP)
+    *isSPVFP = false;
+
+  using namespace ARM;
+  switch (RegEnum) {
+  default:
+    llvm_unreachable("Unknown ARM register!");
+  case R0:  case D0:  case Q0:  return 0;
+  case R1:  case D1:  case Q1:  return 1;
+  case R2:  case D2:  case Q2:  return 2;
+  case R3:  case D3:  case Q3:  return 3;
+  case R4:  case D4:  case Q4:  return 4;
+  case R5:  case D5:  case Q5:  return 5;
+  case R6:  case D6:  case Q6:  return 6;
+  case R7:  case D7:  case Q7:  return 7;
+  case R8:  case D8:  case Q8:  return 8;
+  case R9:  case D9:  case Q9:  return 9;
+  case R10: case D10: case Q10: return 10;
+  case R11: case D11: case Q11: return 11;
+  case R12: case D12: case Q12: return 12;
+  case SP:  case D13: case Q13: return 13;
+  case LR:  case D14: case Q14: return 14;
+  case PC:  case D15: case Q15: return 15;
+
+  case D16: return 16;
+  case D17: return 17;
+  case D18: return 18;
+  case D19: return 19;
+  case D20: return 20;
+  case D21: return 21;
+  case D22: return 22;
+  case D23: return 23;
+  case D24: return 24;
+  case D25: return 25;
+  case D26: return 26; // Fixed: previously returned 27, duplicating D27.
+  case D27: return 27;
+  case D28: return 28;
+  case D29: return 29;
+  case D30: return 30;
+  case D31: return 31;
+
+  case S0: case S1: case S2: case S3:
+  case S4: case S5: case S6: case S7:
+  case S8: case S9: case S10: case S11:
+  case S12: case S13: case S14: case S15:
+  case S16: case S17: case S18: case S19:
+  case S20: case S21: case S22: case S23:
+  case S24: case S25: case S26: case S27:
+  case S28: case S29: case S30: case S31: {
+    // S registers overlap the D/Q numbering; flag them for the caller.
+    if (isSPVFP)
+      *isSPVFP = true;
+    switch (RegEnum) {
+    default: return 0; // Avoid compile time warning.
+    case S0: return 0;
+    case S1: return 1;
+    case S2: return 2;
+    case S3: return 3;
+    case S4: return 4;
+    case S5: return 5;
+    case S6: return 6;
+    case S7: return 7;
+    case S8: return 8;
+    case S9: return 9;
+    case S10: return 10;
+    case S11: return 11;
+    case S12: return 12;
+    case S13: return 13;
+    case S14: return 14;
+    case S15: return 15;
+    case S16: return 16;
+    case S17: return 17;
+    case S18: return 18;
+    case S19: return 19;
+    case S20: return 20;
+    case S21: return 21;
+    case S22: return 22;
+    case S23: return 23;
+    case S24: return 24;
+    case S25: return 25;
+    case S26: return 26;
+    case S27: return 27;
+    case S28: return 28;
+    case S29: return 29;
+    case S30: return 30;
+    case S31: return 31;
+    }
+  }
+  }
+}
+
+// Construct the shared ARM register info, recording the instruction info and
+// subtarget. The frame pointer is R7 on Darwin targets and in Thumb mode,
+// R11 otherwise.
+ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+                                         const ARMSubtarget &sti)
+  : ARMGenRegisterInfo(ARM::ADJCALLSTACKDOWN, ARM::ADJCALLSTACKUP),
+    TII(tii), STI(sti),
+    FramePtr((STI.isTargetDarwin() || STI.isThumb()) ? ARM::R7 : ARM::R11) {
+}
+
+/// getCalleeSavedRegs - Return a null-terminated list of callee-saved
+/// registers. Darwin gets its own list because its ABI deviates from the ARM
+/// standard ABI (R9 is not callee-saved there).
+const unsigned*
+ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const unsigned CalleeSavedRegs[] = {
+    ARM::LR, ARM::R11, ARM::R10, ARM::R9, ARM::R8,
+    ARM::R7, ARM::R6,  ARM::R5,  ARM::R4,
+
+    ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+    ARM::D11, ARM::D10, ARM::D9,  ARM::D8,
+    0
+  };
+
+  static const unsigned DarwinCalleeSavedRegs[] = {
+    // Darwin ABI deviates from ARM standard ABI. R9 is not a callee-saved
+    // register.
+    ARM::LR,  ARM::R7,  ARM::R6, ARM::R5, ARM::R4,
+    ARM::R11, ARM::R10, ARM::R8,
+
+    ARM::D15, ARM::D14, ARM::D13, ARM::D12,
+    ARM::D11, ARM::D10, ARM::D9,  ARM::D8,
+    0
+  };
+  return STI.isTargetDarwin() ? DarwinCalleeSavedRegs : CalleeSavedRegs;
+}
+
+/// getCalleeSavedRegClasses - Return register classes parallel (entry for
+/// entry) to the register lists returned by getCalleeSavedRegs. Thumb1
+/// variants use tGPR for the low registers R4-R7 since Thumb1 instructions
+/// can only address those.
+const TargetRegisterClass* const *
+ARMBaseRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    0
+  };
+
+  static const TargetRegisterClass * const ThumbCalleeSavedRegClasses[] = {
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::tGPRRegClass,
+    &ARM::tGPRRegClass,&ARM::tGPRRegClass,&ARM::tGPRRegClass,
+
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    0
+  };
+
+  static const TargetRegisterClass * const DarwinCalleeSavedRegClasses[] = {
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass, &ARM::GPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass, &ARM::GPRRegClass,
+
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    0
+  };
+
+  static const TargetRegisterClass * const DarwinThumbCalleeSavedRegClasses[] ={
+    &ARM::GPRRegClass,  &ARM::tGPRRegClass, &ARM::tGPRRegClass,
+    &ARM::tGPRRegClass, &ARM::tGPRRegClass, &ARM::GPRRegClass,
+    &ARM::GPRRegClass,  &ARM::GPRRegClass,
+
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass, &ARM::DPRRegClass,
+    0
+  };
+
+  if (STI.isThumb1Only()) {
+    return STI.isTargetDarwin()
+      ? DarwinThumbCalleeSavedRegClasses : ThumbCalleeSavedRegClasses;
+  }
+  return STI.isTargetDarwin()
+    ? DarwinCalleeSavedRegClasses : CalleeSavedRegClasses;
+}
+
+/// getReservedRegs - Build the set of registers the allocator must never use:
+/// SP and PC always, the frame pointer when one is required (always on
+/// Darwin), and R9 on subtargets that reserve it.
+BitVector ARMBaseRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const {
+  // FIXME: avoid re-calculating this everytime.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(ARM::SP);
+  Reserved.set(ARM::PC);
+  if (STI.isTargetDarwin() || hasFP(MF))
+    Reserved.set(FramePtr);
+  // Some targets reserve R9.
+  if (STI.isR9Reserved())
+    Reserved.set(ARM::R9);
+  return Reserved;
+}
+
+/// isReservedReg - Return true if the given register is reserved in this
+/// function. Mirrors the logic in getReservedRegs without building the
+/// whole BitVector.
+bool ARMBaseRegisterInfo::isReservedReg(const MachineFunction &MF,
+                                        unsigned Reg) const {
+  switch (Reg) {
+  default: break;
+  case ARM::SP:
+  case ARM::PC:
+    return true;
+  case ARM::R7:
+  case ARM::R11:
+    // Only reserved when this register actually is the frame pointer and a
+    // frame pointer is required (Darwin always requires one).
+    if (FramePtr == Reg && (STI.isTargetDarwin() || hasFP(MF)))
+      return true;
+    break;
+  case ARM::R9:
+    return STI.isR9Reserved();
+  }
+
+  return false;
+}
+
+/// getMatchingSuperRegClass - Given register class B of sub-register index
+/// SubIdx within super register class A, return the largest class of super
+/// registers whose SubIdx sub-registers are all in B. Returns null for
+/// unknown sub-indices.
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                              const TargetRegisterClass *B,
+                                              unsigned SubIdx) const {
+  switch (SubIdx) {
+  default: return 0;
+  case 1:
+  case 2:
+  case 3:
+  case 4:
+    // S sub-registers.
+    if (A->getSize() == 8) {
+      // A is a D (64-bit) register class.
+      if (B == &ARM::SPR_8RegClass)
+        return &ARM::DPR_8RegClass;
+      assert(B == &ARM::SPRRegClass && "Expecting SPR register class!");
+      if (A == &ARM::DPR_8RegClass)
+        return A;
+      return &ARM::DPR_VFP2RegClass;
+    }
+
+    assert(A->getSize() == 16 && "Expecting a Q register class!");
+    if (B == &ARM::SPR_8RegClass)
+      return &ARM::QPR_8RegClass;
+    return &ARM::QPR_VFP2RegClass;
+  case 5:
+  case 6:
+    // D sub-registers.
+    if (B == &ARM::DPR_VFP2RegClass)
+      return &ARM::QPR_VFP2RegClass;
+    if (B == &ARM::DPR_8RegClass)
+      return &ARM::QPR_8RegClass;
+    return A;
+  }
+  // Unreachable: every switch case above returns. Kept to silence compilers
+  // that do not see the switch as exhaustive.
+  return 0;
+}
+
+/// getPointerRegClass - Pointers are held in general-purpose registers on
+/// ARM regardless of Kind.
+const TargetRegisterClass *
+ARMBaseRegisterInfo::getPointerRegClass(unsigned Kind) const {
+  return ARM::GPRRegisterClass;
+}
+
+/// getAllocationOrder - Returns the register allocation order for a specified
+/// register class in the form of a pair of TargetRegisterClass iterators.
+/// When the hint asks for the even or odd member of a register pair, one of
+/// the static tables below is returned instead of the default order; the
+/// table chosen depends on whether a frame pointer is in use (and which
+/// register it is) and whether R9 is reserved on this subtarget.
+std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
+ARMBaseRegisterInfo::getAllocationOrder(const TargetRegisterClass *RC,
+                                        unsigned HintType, unsigned HintReg,
+                                        const MachineFunction &MF) const {
+  // Alternative register allocation orders when favoring even / odd registers
+  // of register pairs.
+
+  // No FP, R9 is available.
+  static const unsigned GPREven1[] = {
+    ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8, ARM::R10,
+    ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7,
+    ARM::R9, ARM::R11
+  };
+  static const unsigned GPROdd1[] = {
+    ARM::R1, ARM::R3, ARM::R5, ARM::R7, ARM::R9, ARM::R11,
+    ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
+    ARM::R8, ARM::R10
+  };
+
+  // FP is R7, R9 is available.
+  static const unsigned GPREven2[] = {
+    ARM::R0, ARM::R2, ARM::R4,          ARM::R8, ARM::R10,
+    ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6,
+    ARM::R9, ARM::R11
+  };
+  static const unsigned GPROdd2[] = {
+    ARM::R1, ARM::R3, ARM::R5,          ARM::R9, ARM::R11,
+    ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6,
+    ARM::R8, ARM::R10
+  };
+
+  // FP is R11, R9 is available.
+  static const unsigned GPREven3[] = {
+    ARM::R0, ARM::R2, ARM::R4, ARM::R6, ARM::R8,
+    ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7,
+    ARM::R9
+  };
+  // NOTE(review): ARM::R6 in the preferred-odd list below looks like it may
+  // have been intended as ARM::R7 (R6 is even) -- confirm against upstream.
+  static const unsigned GPROdd3[] = {
+    ARM::R1, ARM::R3, ARM::R5, ARM::R6, ARM::R9,
+    ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R7,
+    ARM::R8
+  };
+
+  // No FP, R9 is not available.
+  static const unsigned GPREven4[] = {
+    ARM::R0, ARM::R2, ARM::R4, ARM::R6,          ARM::R10,
+    ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8,
+    ARM::R11
+  };
+  static const unsigned GPROdd4[] = {
+    ARM::R1, ARM::R3, ARM::R5, ARM::R7,          ARM::R11,
+    ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
+    ARM::R10
+  };
+
+  // FP is R7, R9 is not available.
+  static const unsigned GPREven5[] = {
+    ARM::R0, ARM::R2, ARM::R4,                   ARM::R10,
+    ARM::R1, ARM::R3, ARM::R12,ARM::LR, ARM::R5, ARM::R6, ARM::R8,
+    ARM::R11
+  };
+  static const unsigned GPROdd5[] = {
+    ARM::R1, ARM::R3, ARM::R5,                   ARM::R11,
+    ARM::R0, ARM::R2, ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8,
+    ARM::R10
+  };
+
+  // FP is R11, R9 is not available.
+  static const unsigned GPREven6[] = {
+    ARM::R0, ARM::R2, ARM::R4, ARM::R6,
+    ARM::R1, ARM::R3, ARM::R10,ARM::R12,ARM::LR, ARM::R5, ARM::R7, ARM::R8
+  };
+  static const unsigned GPROdd6[] = {
+    ARM::R1, ARM::R3, ARM::R5, ARM::R7,
+    ARM::R0, ARM::R2, ARM::R10,ARM::R12,ARM::LR, ARM::R4, ARM::R6, ARM::R8
+  };
+
+
+  if (HintType == ARMRI::RegPairEven) {
+    if (isPhysicalRegister(HintReg) && getRegisterPairEven(HintReg, MF) == 0)
+      // It's no longer possible to fulfill this hint. Return the default
+      // allocation order.
+      return std::make_pair(RC->allocation_order_begin(MF),
+                            RC->allocation_order_end(MF));
+
+    if (!STI.isTargetDarwin() && !hasFP(MF)) {
+      if (!STI.isR9Reserved())
+        return std::make_pair(GPREven1,
+                              GPREven1 + (sizeof(GPREven1)/sizeof(unsigned)));
+      else
+        return std::make_pair(GPREven4,
+                              GPREven4 + (sizeof(GPREven4)/sizeof(unsigned)));
+    } else if (FramePtr == ARM::R7) {
+      if (!STI.isR9Reserved())
+        return std::make_pair(GPREven2,
+                              GPREven2 + (sizeof(GPREven2)/sizeof(unsigned)));
+      else
+        return std::make_pair(GPREven5,
+                              GPREven5 + (sizeof(GPREven5)/sizeof(unsigned)));
+    } else { // FramePtr == ARM::R11
+      if (!STI.isR9Reserved())
+        return std::make_pair(GPREven3,
+                              GPREven3 + (sizeof(GPREven3)/sizeof(unsigned)));
+      else
+        return std::make_pair(GPREven6,
+                              GPREven6 + (sizeof(GPREven6)/sizeof(unsigned)));
+    }
+  } else if (HintType == ARMRI::RegPairOdd) {
+    if (isPhysicalRegister(HintReg) && getRegisterPairOdd(HintReg, MF) == 0)
+      // It's no longer possible to fulfill this hint. Return the default
+      // allocation order.
+      return std::make_pair(RC->allocation_order_begin(MF),
+                            RC->allocation_order_end(MF));
+
+    if (!STI.isTargetDarwin() && !hasFP(MF)) {
+      if (!STI.isR9Reserved())
+        return std::make_pair(GPROdd1,
+                              GPROdd1 + (sizeof(GPROdd1)/sizeof(unsigned)));
+      else
+        return std::make_pair(GPROdd4,
+                              GPROdd4 + (sizeof(GPROdd4)/sizeof(unsigned)));
+    } else if (FramePtr == ARM::R7) {
+      if (!STI.isR9Reserved())
+        return std::make_pair(GPROdd2,
+                              GPROdd2 + (sizeof(GPROdd2)/sizeof(unsigned)));
+      else
+        return std::make_pair(GPROdd5,
+                              GPROdd5 + (sizeof(GPROdd5)/sizeof(unsigned)));
+    } else { // FramePtr == ARM::R11
+      if (!STI.isR9Reserved())
+        return std::make_pair(GPROdd3,
+                              GPROdd3 + (sizeof(GPROdd3)/sizeof(unsigned)));
+      else
+        return std::make_pair(GPROdd6,
+                              GPROdd6 + (sizeof(GPROdd6)/sizeof(unsigned)));
+    }
+  }
+  // No applicable hint: use the class's default allocation order.
+  return std::make_pair(RC->allocation_order_begin(MF),
+                        RC->allocation_order_end(MF));
+}
+
+/// ResolveRegAllocHint - Resolves the specified register allocation hint
+/// to a physical register. Returns the physical register if it is successful.
+/// Type 0 means "prefer this exact register"; the RegPairOdd / RegPairEven
+/// types resolve to the odd / even partner of Reg. Returns 0 on failure.
+unsigned
+ARMBaseRegisterInfo::ResolveRegAllocHint(unsigned Type, unsigned Reg,
+                                         const MachineFunction &MF) const {
+  if (Reg == 0 || !isPhysicalRegister(Reg))
+    return 0;
+  if (Type == 0)
+    return Reg;
+  else if (Type == (unsigned)ARMRI::RegPairOdd)
+    // Odd register.
+    return getRegisterPairOdd(Reg, MF);
+  else if (Type == (unsigned)ARMRI::RegPairEven)
+    // Even register.
+    return getRegisterPairEven(Reg, MF);
+  return 0;
+}
+
+/// UpdateRegAllocHint - After virtual register Reg has been replaced by
+/// NewReg (e.g. by coalescing), fix up the pair-allocation hint of Reg's
+/// partner so it now points at NewReg.
+void
+ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+                                        MachineFunction &MF) const {
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
+  if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
+       Hint.first == (unsigned)ARMRI::RegPairEven) &&
+      Hint.second && TargetRegisterInfo::isVirtualRegister(Hint.second)) {
+    // If 'Reg' is one of the even / odd register pair and it's now changed
+    // (e.g. coalesced) into a different register. The other register of the
+    // pair allocation hint must be updated to reflect the relationship
+    // change.
+    unsigned OtherReg = Hint.second;
+    Hint = MRI->getRegAllocationHint(OtherReg);
+    if (Hint.second == Reg)
+      // Make sure the pair has not already divorced.
+      MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
+  }
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.  Dynamic stack realignment
+/// and a taken frame address also force a frame pointer.
+///
+bool ARMBaseRegisterInfo::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return ((NoFramePointerElim && MFI->hasCalls())||
+          needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// canRealignStack - True when dynamic stack realignment is possible at all:
+/// it is requested (-realign-stack), the function is not Thumb1-only, and
+/// there are no variable sized objects (which pin the layout to SP).
+bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  return (RealignStack &&
+          !AFI->isThumb1OnlyFunction() &&
+          !MFI->hasVarSizedObjects());
+}
+
+/// needsStackRealignment - True when realignment is both possible (see
+/// canRealignStack) and actually required because some stack object is
+/// aligned more strictly than the target's default stack alignment.
+bool ARMBaseRegisterInfo::
+needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  return (RealignStack &&
+          !AFI->isThumb1OnlyFunction() &&
+          (MFI->getMaxAlignment() > StackAlign) &&
+          !MFI->hasVarSizedObjects());
+}
+
+/// cannotEliminateFrame - Return true if the stack frame of this function
+/// cannot be eliminated: frame pointer elimination is disabled while calls
+/// exist, there are variable sized objects, the frame address is taken, or
+/// the stack must be realigned.
+bool ARMBaseRegisterInfo::
+cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (NoFramePointerElim && MFI->hasCalls())
+    return true;
+  return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken()
+    || needsStackRealignment(MF);
+}
+
+/// estimateStackSize - Estimate and return the size of the frame by summing
+/// the sizes of all live stack objects (rounded up to each object's
+/// alignment) plus the largest fixed-object offset.
+/// NOTE(review): the MFI parameter is unused; the function re-fetches the
+/// frame info from MF as FFI.
+static unsigned estimateStackSize(MachineFunction &MF, MachineFrameInfo *MFI) {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  int Offset = 0;
+  // Fixed objects have negative indices; track the deepest fixed offset.
+  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+    int FixedOff = -FFI->getObjectOffset(i);
+    if (FixedOff > Offset) Offset = FixedOff;
+  }
+  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+    if (FFI->isDeadObjectIndex(i))
+      continue;
+    Offset += FFI->getObjectSize(i);
+    unsigned Align = FFI->getObjectAlignment(i);
+    // Adjust to alignment boundary
+    Offset = (Offset+Align-1)/Align*Align;
+  }
+  return (unsigned)Offset;
+}
+
+/// estimateRSStackSizeLimit - Look at each instruction that references stack
+/// frames and return the stack size limit beyond which some of these
+/// instructions will require a scratch register during their expansion later.
+/// The limit is the smallest immediate-offset range among the addressing
+/// modes actually used by frame-index operands in the function.
+unsigned
+ARMBaseRegisterInfo::estimateRSStackSizeLimit(MachineFunction &MF) const {
+  // Start from the widest ARM immediate range (12 bits).
+  unsigned Limit = (1 << 12) - 1;
+  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
+    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
+         I != E; ++I) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+        if (!I->getOperand(i).isFI()) continue;
+
+        const TargetInstrDesc &Desc = TII.get(I->getOpcode());
+        unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+        if (AddrMode == ARMII::AddrMode3 ||
+            AddrMode == ARMII::AddrModeT2_i8)
+          return (1 << 8) - 1;
+
+        if (AddrMode == ARMII::AddrMode5 ||
+            AddrMode == ARMII::AddrModeT2_i8s4)
+          Limit = std::min(Limit, ((1U << 8) - 1) * 4);
+
+        if (AddrMode == ARMII::AddrModeT2_i12 && hasFP(MF))
+          // When the stack offset is negative, we will end up using
+          // the i8 instructions instead.
+          return (1 << 8) - 1;
+
+        if (AddrMode == ARMII::AddrMode6)
+          // AddrMode6 has no immediate offset at all.
+          return 0;
+        break; // At most one FI per instruction
+      }
+    }
+  }
+
+  return Limit;
+}
+
+/// processFunctionBeforeCalleeSavedScan - Decide which callee-saved registers
+/// must be spilled before PEI scans them: marks the frame pointer, LR, and
+/// possibly extra callee-saved GPRs as used so they get spill slots, and
+/// reserves a scavenging frame index when register scavenging may be needed.
+void
+ARMBaseRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // This tells PEI to spill the FP as if it is any other callee-save register
+  // to take advantage the eliminateFrameIndex machinery. This also ensures it
+  // is spilled in the order specified by getCalleeSavedRegs() to make it easier
+  // to combine multiple loads / stores.
+  bool CanEliminateFrame = true;
+  bool CS1Spilled = false;
+  bool LRSpilled = false;
+  unsigned NumGPRSpills = 0;
+  SmallVector<unsigned, 4> UnspilledCS1GPRs;
+  SmallVector<unsigned, 4> UnspilledCS2GPRs;
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+
+  // Calculate and set max stack object alignment early, so we can decide
+  // whether we will need stack realignment (and thus FP).
+  if (RealignStack) {
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    MFI->calculateMaxStackAlignment();
+  }
+
+  // Spill R4 if Thumb2 function requires stack realignment - it will be used as
+  // scratch register.
+  // FIXME: It will be better just to find spare register here.
+  if (needsStackRealignment(MF) &&
+      AFI->isThumb2Function())
+    MF.getRegInfo().setPhysRegUsed(ARM::R4);
+
+  // Don't spill FP if the frame can be eliminated. This is determined
+  // by scanning the callee-save registers to see if any is used.
+  const unsigned *CSRegs = getCalleeSavedRegs();
+  const TargetRegisterClass* const *CSRegClasses = getCalleeSavedRegClasses();
+  for (unsigned i = 0; CSRegs[i]; ++i) {
+    unsigned Reg = CSRegs[i];
+    bool Spilled = false;
+    if (MF.getRegInfo().isPhysRegUsed(Reg)) {
+      AFI->setCSRegisterIsSpilled(Reg);
+      Spilled = true;
+      CanEliminateFrame = false;
+    } else {
+      // Check alias registers too.
+      for (const unsigned *Aliases = getAliasSet(Reg); *Aliases; ++Aliases) {
+        if (MF.getRegInfo().isPhysRegUsed(*Aliases)) {
+          Spilled = true;
+          CanEliminateFrame = false;
+        }
+      }
+    }
+
+    // Classify spilled / unspilled GPRs into "area 1" (LR, R4-R7 on Darwin;
+    // everything on other targets) and "area 2" (remaining Darwin GPRs).
+    if (CSRegClasses[i] == ARM::GPRRegisterClass ||
+        CSRegClasses[i] == ARM::tGPRRegisterClass) {
+      if (Spilled) {
+        NumGPRSpills++;
+
+        if (!STI.isTargetDarwin()) {
+          if (Reg == ARM::LR)
+            LRSpilled = true;
+          CS1Spilled = true;
+          continue;
+        }
+
+        // Keep track if LR and any of R4, R5, R6, and R7 is spilled.
+        switch (Reg) {
+        case ARM::LR:
+          LRSpilled = true;
+          // Fallthrough
+        case ARM::R4:
+        case ARM::R5:
+        case ARM::R6:
+        case ARM::R7:
+          CS1Spilled = true;
+          break;
+        default:
+          break;
+        }
+      } else {
+        if (!STI.isTargetDarwin()) {
+          UnspilledCS1GPRs.push_back(Reg);
+          continue;
+        }
+
+        switch (Reg) {
+        case ARM::R4:
+        case ARM::R5:
+        case ARM::R6:
+        case ARM::R7:
+        case ARM::LR:
+          UnspilledCS1GPRs.push_back(Reg);
+          break;
+        default:
+          UnspilledCS2GPRs.push_back(Reg);
+          break;
+        }
+      }
+    }
+  }
+
+  bool ForceLRSpill = false;
+  if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
+    unsigned FnSize = TII.GetFunctionSizeInBytes(MF);
+    // Force LR to be spilled if the Thumb function size is > 2048. This enables
+    // use of BL to implement far jump. If it turns out that it's not needed
+    // then the branch fix up path will undo it.
+    if (FnSize >= (1 << 11)) {
+      CanEliminateFrame = false;
+      ForceLRSpill = true;
+    }
+  }
+
+  bool ExtraCSSpill = false;
+  if (!CanEliminateFrame || cannotEliminateFrame(MF)) {
+    AFI->setHasStackFrame(true);
+
+    // If LR is not spilled, but at least one of R4, R5, R6, and R7 is spilled.
+    // Spill LR as well so we can fold BX_RET to the registers restore (LDM).
+    if (!LRSpilled && CS1Spilled) {
+      MF.getRegInfo().setPhysRegUsed(ARM::LR);
+      AFI->setCSRegisterIsSpilled(ARM::LR);
+      NumGPRSpills++;
+      UnspilledCS1GPRs.erase(std::find(UnspilledCS1GPRs.begin(),
+                                    UnspilledCS1GPRs.end(), (unsigned)ARM::LR));
+      ForceLRSpill = false;
+      ExtraCSSpill = true;
+    }
+
+    // Darwin ABI requires FP to point to the stack slot that contains the
+    // previous FP.
+    if (STI.isTargetDarwin() || hasFP(MF)) {
+      MF.getRegInfo().setPhysRegUsed(FramePtr);
+      NumGPRSpills++;
+    }
+
+    // If stack and double are 8-byte aligned and we are spilling an odd number
+    // of GPRs. Spill one extra callee save GPR so we won't have to pad between
+    // the integer and double callee save areas.
+    unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+    if (TargetAlign == 8 && (NumGPRSpills & 1)) {
+      if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
+        for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
+          unsigned Reg = UnspilledCS1GPRs[i];
+          // Don't spill high register if the function is thumb1
+          if (!AFI->isThumb1OnlyFunction() ||
+              isARMLowRegister(Reg) || Reg == ARM::LR) {
+            MF.getRegInfo().setPhysRegUsed(Reg);
+            AFI->setCSRegisterIsSpilled(Reg);
+            if (!isReservedReg(MF, Reg))
+              ExtraCSSpill = true;
+            break;
+          }
+        }
+      } else if (!UnspilledCS2GPRs.empty() &&
+                 !AFI->isThumb1OnlyFunction()) {
+        unsigned Reg = UnspilledCS2GPRs.front();
+        MF.getRegInfo().setPhysRegUsed(Reg);
+        AFI->setCSRegisterIsSpilled(Reg);
+        if (!isReservedReg(MF, Reg))
+          ExtraCSSpill = true;
+      }
+    }
+
+    // Estimate if we might need to scavenge a register at some point in order
+    // to materialize a stack offset. If so, either spill one additional
+    // callee-saved register or reserve a special spill slot to facilitate
+    // register scavenging. Thumb1 needs a spill slot for stack pointer
+    // adjustments also, even when the frame itself is small.
+    if (RS && !ExtraCSSpill) {
+      MachineFrameInfo  *MFI = MF.getFrameInfo();
+      // If any of the stack slot references may be out of range of an
+      // immediate offset, make sure a register (or a spill slot) is
+      // available for the register scavenger. Note that if we're indexing
+      // off the frame pointer, the effective stack size is 4 bytes larger
+      // since the FP points to the stack slot of the previous FP.
+      if (estimateStackSize(MF, MFI) + (hasFP(MF) ? 4 : 0)
+          >= estimateRSStackSizeLimit(MF)) {
+        // If any non-reserved CS register isn't spilled, just spill one or two
+        // extra. That should take care of it!
+        unsigned NumExtras = TargetAlign / 4;
+        SmallVector<unsigned, 2> Extras;
+        while (NumExtras && !UnspilledCS1GPRs.empty()) {
+          unsigned Reg = UnspilledCS1GPRs.back();
+          UnspilledCS1GPRs.pop_back();
+          if (!isReservedReg(MF, Reg)) {
+            Extras.push_back(Reg);
+            NumExtras--;
+          }
+        }
+        // For non-Thumb1 functions, also check for hi-reg CS registers
+        if (!AFI->isThumb1OnlyFunction()) {
+          while (NumExtras && !UnspilledCS2GPRs.empty()) {
+            unsigned Reg = UnspilledCS2GPRs.back();
+            UnspilledCS2GPRs.pop_back();
+            if (!isReservedReg(MF, Reg)) {
+              Extras.push_back(Reg);
+              NumExtras--;
+            }
+          }
+        }
+        if (Extras.size() && NumExtras == 0) {
+          for (unsigned i = 0, e = Extras.size(); i != e; ++i) {
+            MF.getRegInfo().setPhysRegUsed(Extras[i]);
+            AFI->setCSRegisterIsSpilled(Extras[i]);
+          }
+        } else if (!AFI->isThumb1OnlyFunction()) {
+          // note: Thumb1 functions spill to R12, not the stack.
+          // Reserve a slot closest to SP or frame pointer.
+          const TargetRegisterClass *RC = ARM::GPRRegisterClass;
+          RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                             RC->getAlignment(),
+                                                             false));
+        }
+      }
+    }
+  }
+
+  if (ForceLRSpill) {
+    MF.getRegInfo().setPhysRegUsed(ARM::LR);
+    AFI->setCSRegisterIsSpilled(ARM::LR);
+    AFI->setLRIsSpilledForFarJump(true);
+  }
+}
+
+/// getRARegister - The return address lives in LR on ARM.
+unsigned ARMBaseRegisterInfo::getRARegister() const {
+  return ARM::LR;
+}
+
+/// getFrameRegister - Return the register used to reference stack frames:
+/// the frame pointer when one is required (always on Darwin), SP otherwise.
+unsigned 
+ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  if (STI.isTargetDarwin() || hasFP(MF))
+    return FramePtr;
+  return ARM::SP;
+}
+
+/// getFrameIndexReference - Compute the offset of frame index FI and set
+/// FrameReg to the base register (SP or FP) that the offset is relative to.
+/// Callee-saved spill areas are addressed from their own area offsets.
+int
+ARMBaseRegisterInfo::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                            unsigned &FrameReg) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+  bool isFixed = MFI->isFixedObjectIndex(FI);
+
+  FrameReg = ARM::SP;
+  if (AFI->isGPRCalleeSavedArea1Frame(FI))
+    Offset -= AFI->getGPRCalleeSavedArea1Offset();
+  else if (AFI->isGPRCalleeSavedArea2Frame(FI))
+    Offset -= AFI->getGPRCalleeSavedArea2Offset();
+  else if (AFI->isDPRCalleeSavedAreaFrame(FI))
+    Offset -= AFI->getDPRCalleeSavedAreaOffset();
+  else if (needsStackRealignment(MF)) {
+    // When dynamically realigning the stack, use the frame pointer for
+    // parameters, and the stack pointer for locals.
+    assert (hasFP(MF) && "dynamic stack realignment without a FP!");
+    if (isFixed) {
+      FrameReg = getFrameRegister(MF);
+      Offset -= AFI->getFramePtrSpillOffset();
+    }
+  } else if (hasFP(MF) && AFI->hasStackFrame()) {
+    if (isFixed || MFI->hasVarSizedObjects()) {
+      // Use frame pointer to reference fixed objects unless this is a
+      // frameless function.
+      FrameReg = getFrameRegister(MF);
+      Offset -= AFI->getFramePtrSpillOffset();
+    } else if (AFI->isThumb2Function()) {
+      // In Thumb2 mode, the negative offset is very limited.
+      // Prefer FP-relative addressing only when the offset fits in [-255, 0).
+      int FPOffset = Offset - AFI->getFramePtrSpillOffset();
+      if (FPOffset >= -255 && FPOffset < 0) {
+        FrameReg = getFrameRegister(MF);
+        Offset = FPOffset;
+      }
+    }
+  }
+  return Offset;
+}
+
+
+/// getFrameIndexOffset - Convenience wrapper around getFrameIndexReference
+/// that discards the base register.
+int
+ARMBaseRegisterInfo::getFrameIndexOffset(const MachineFunction &MF,
+                                         int FI) const {
+  unsigned FrameReg;
+  return getFrameIndexReference(MF, FI, FrameReg);
+}
+
+/// getEHExceptionRegister - Not implemented for ARM; always aborts.
+unsigned ARMBaseRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+  return 0;
+}
+
+/// getEHHandlerRegister - Not implemented for ARM; always aborts.
+unsigned ARMBaseRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+  return 0;
+}
+
+/// getDwarfRegNum - Map a register to its DWARF number. The isEH flag is
+/// ignored; the tablegen'd mapping with flavour 0 is used for both cases.
+int ARMBaseRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  return ARMGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+/// getRegisterPairEven - Given the odd member of an even/odd register pair,
+/// return the even member, or 0 when the pair cannot be used (either member
+/// is a special or reserved register).
+unsigned ARMBaseRegisterInfo::getRegisterPairEven(unsigned Reg,
+                                              const MachineFunction &MF) const {
+  switch (Reg) {
+  default: break;
+  // Return 0 if either register of the pair is a special register.
+  // So no R12, etc.
+  case ARM::R1:
+    return ARM::R0;
+  case ARM::R3:
+    return ARM::R2;
+  case ARM::R5:
+    return ARM::R4;
+  case ARM::R7:
+    return isReservedReg(MF, ARM::R7)  ? 0 : ARM::R6;
+  case ARM::R9:
+    return isReservedReg(MF, ARM::R9)  ? 0 :ARM::R8;
+  case ARM::R11:
+    return isReservedReg(MF, ARM::R11) ? 0 : ARM::R10;
+
+  // S registers pair unconditionally.
+  case ARM::S1:
+    return ARM::S0;
+  case ARM::S3:
+    return ARM::S2;
+  case ARM::S5:
+    return ARM::S4;
+  case ARM::S7:
+    return ARM::S6;
+  case ARM::S9:
+    return ARM::S8;
+  case ARM::S11:
+    return ARM::S10;
+  case ARM::S13:
+    return ARM::S12;
+  case ARM::S15:
+    return ARM::S14;
+  case ARM::S17:
+    return ARM::S16;
+  case ARM::S19:
+    return ARM::S18;
+  case ARM::S21:
+    return ARM::S20;
+  case ARM::S23:
+    return ARM::S22;
+  case ARM::S25:
+    return ARM::S24;
+  case ARM::S27:
+    return ARM::S26;
+  case ARM::S29:
+    return ARM::S28;
+  case ARM::S31:
+    return ARM::S30;
+
+  // D registers pair unconditionally.
+  case ARM::D1:
+    return ARM::D0;
+  case ARM::D3:
+    return ARM::D2;
+  case ARM::D5:
+    return ARM::D4;
+  case ARM::D7:
+    return ARM::D6;
+  case ARM::D9:
+    return ARM::D8;
+  case ARM::D11:
+    return ARM::D10;
+  case ARM::D13:
+    return ARM::D12;
+  case ARM::D15:
+    return ARM::D14;
+  case ARM::D17:
+    return ARM::D16;
+  case ARM::D19:
+    return ARM::D18;
+  case ARM::D21:
+    return ARM::D20;
+  case ARM::D23:
+    return ARM::D22;
+  case ARM::D25:
+    return ARM::D24;
+  case ARM::D27:
+    return ARM::D26;
+  case ARM::D29:
+    return ARM::D28;
+  case ARM::D31:
+    return ARM::D30;
+  }
+
+  return 0;
+}
+
+/// getRegisterPairOdd - Return the odd-numbered register that pairs with
+/// Reg (the even half of a GPR, S, or D register pair), or 0 if the pair is
+/// unusable (special register, or the pair contains a reserved register).
+/// Mirror of getRegisterPairEven; used by the ARMRI allocation hints.
+unsigned ARMBaseRegisterInfo::getRegisterPairOdd(unsigned Reg,
+                                             const MachineFunction &MF) const {
+  switch (Reg) {
+  default: break;
+  // Return 0 if either register of the pair is a special register.
+  // So no R12, etc.
+  case ARM::R0:
+    return ARM::R1;
+  case ARM::R2:
+    return ARM::R3;
+  case ARM::R4:
+    return ARM::R5;
+  case ARM::R6:
+    return isReservedReg(MF, ARM::R7)  ? 0 : ARM::R7;
+  case ARM::R8:
+    return isReservedReg(MF, ARM::R9)  ? 0 :ARM::R9;
+  case ARM::R10:
+    return isReservedReg(MF, ARM::R11) ? 0 : ARM::R11;
+
+  case ARM::S0:
+    return ARM::S1;
+  case ARM::S2:
+    return ARM::S3;
+  case ARM::S4:
+    return ARM::S5;
+  case ARM::S6:
+    return ARM::S7;
+  case ARM::S8:
+    return ARM::S9;
+  case ARM::S10:
+    return ARM::S11;
+  case ARM::S12:
+    return ARM::S13;
+  case ARM::S14:
+    return ARM::S15;
+  case ARM::S16:
+    return ARM::S17;
+  case ARM::S18:
+    return ARM::S19;
+  case ARM::S20:
+    return ARM::S21;
+  case ARM::S22:
+    return ARM::S23;
+  case ARM::S24:
+    return ARM::S25;
+  case ARM::S26:
+    return ARM::S27;
+  case ARM::S28:
+    return ARM::S29;
+  case ARM::S30:
+    return ARM::S31;
+
+  case ARM::D0:
+    return ARM::D1;
+  case ARM::D2:
+    return ARM::D3;
+  case ARM::D4:
+    return ARM::D5;
+  case ARM::D6:
+    return ARM::D7;
+  case ARM::D8:
+    return ARM::D9;
+  case ARM::D10:
+    return ARM::D11;
+  case ARM::D12:
+    return ARM::D13;
+  case ARM::D14:
+    return ARM::D15;
+  case ARM::D16:
+    return ARM::D17;
+  case ARM::D18:
+    return ARM::D19;
+  case ARM::D20:
+    return ARM::D21;
+  case ARM::D22:
+    return ARM::D23;
+  case ARM::D24:
+    return ARM::D25;
+  case ARM::D26:
+    return ARM::D27;
+  case ARM::D28:
+    return ARM::D29;
+  case ARM::D30:
+    return ARM::D31;
+  }
+
+  // Reg is not the even half of any pair.
+  return 0;
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate.
+///
+/// A 32-bit ConstantInt holding Val is interned in the function's constant
+/// pool and an ARM::LDRcp is inserted before MBBI defining DestReg
+/// (optionally through sub-register SubIdx), predicated on Pred/PredReg.
+void ARMBaseRegisterInfo::
+emitLoadConstPool(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator &MBBI,
+                  DebugLoc dl,
+                  unsigned DestReg, unsigned SubIdx, int Val,
+                  ARMCC::CondCodes Pred,
+                  unsigned PredReg) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  Constant *C =
+        ConstantInt::get(Type::getInt32Ty(MF.getFunction()->getContext()), Val);
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);  // 4-byte aligned.
+
+  // Operands: def reg, constant-pool index, base reg (none), imm 0, pred.
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx)
+    .addReg(0).addImm(0).addImm(Pred).addReg(PredReg);
+}
+
+// requiresRegisterScavenging - Always enable the register scavenger:
+// eliminateFrameIndex may need a scratch register when an offset does not
+// fit in the instruction's immediate field.
+bool ARMBaseRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+// requiresFrameIndexScavenging - Allow frame-index elimination to create
+// virtual scratch registers (see eliminateFrameIndex) that are scavenged
+// afterwards, instead of requiring a pre-reserved emergency spill slot.
+bool ARMBaseRegisterInfo::
+requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+// not required, we reserve argument space for call sites in the function
+// immediately on entry to the current function. This eliminates the need for
+// add/sub sp brackets around call sites. Returns true if the call frame is
+// included as part of the stack frame.
+bool ARMBaseRegisterInfo::
+hasReservedCallFrame(MachineFunction &MF) const {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  unsigned CFSize = FFI->getMaxCallFrameSize();
+  // It's not always a good idea to include the call frame as part of the
+  // stack frame. ARM (especially Thumb) has small immediate offset to
+  // address the stack frame. So a large call frame can cause poor codegen
+  // and may even make it impossible to scavenge a register.
+  if (CFSize >= ((1 << 12) - 1) / 2)  // Half of imm12
+    return false;
+
+  // Variable-sized allocas move SP at run time, so the call frame cannot be
+  // pre-reserved on entry.
+  return !FFI->hasVarSizedObjects();
+}
+
+/// emitSPUpdate - Add NumBytes to SP, expanding through the ARM or the
+/// Thumb2 register-plus-immediate materialization helper depending on the
+/// current function's instruction set.
+static void
+emitSPUpdate(bool isARM,
+             MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+             DebugLoc dl, const ARMBaseInstrInfo &TII,
+             int NumBytes,
+             ARMCC::CondCodes Pred = ARMCC::AL, unsigned PredReg = 0) {
+  if (!isARM) {
+    emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                           Pred, PredReg, TII);
+    return;
+  }
+  emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, ARM::SP, NumBytes,
+                          Pred, PredReg, TII);
+}
+
+
+// eliminateCallFramePseudoInstr - Expand or erase the ADJCALLSTACKDOWN/UP
+// pseudo at I. When the call frame is pre-reserved (hasReservedCallFrame)
+// the pseudo is a no-op; otherwise it becomes an explicit SP adjustment.
+void ARMBaseRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      assert(!AFI->isThumb1OnlyFunction() &&
+             "This eliminateCallFramePseudoInstr does not support Thumb1!");
+      bool isARM = !AFI->isThumbFunction();
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      ARMCC::CondCodes Pred = (ARMCC::CondCodes)Old->getOperand(1).getImm();
+      // FIXME: Thumb2 version of ADJCALLSTACKUP and ADJCALLSTACKDOWN?
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        // Note: PredReg is operand 2 for ADJCALLSTACKDOWN.
+        unsigned PredReg = Old->getOperand(2).getReg();
+        emitSPUpdate(isARM, MBB, I, dl, TII, -Amount, Pred, PredReg);
+      } else {
+        // Note: PredReg is operand 3 for ADJCALLSTACKUP.
+        unsigned PredReg = Old->getOperand(3).getReg();
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(isARM, MBB, I, dl, TII, Amount, Pred, PredReg);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+// eliminateFrameIndex - Rewrite the frame-index operand of *II into a base
+// register plus immediate offset, emitting extra instructions when the
+// offset does not fit. Returns a virtual scratch register for the scavenger
+// to resolve (when ReuseFrameIndexVals), or 0.
+unsigned
+ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                         int SPAdj, int *Value,
+                                         RegScavenger *RS) const {
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This eliminateFrameIndex does not support Thumb1!");
+
+  // Find the operand that holds the frame index.
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+  unsigned FrameReg;
+
+  // Resolve the frame index to a base register (FP or SP) plus offset.
+  int Offset = getFrameIndexReference(MF, FrameIndex, FrameReg);
+  if (FrameReg != ARM::SP)
+    SPAdj = 0;
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  bool Done = false;
+  if (!AFI->isThumbFunction())
+    Done = rewriteARMFrameIndex(MI, i, FrameReg, Offset, TII);
+  else {
+    assert(AFI->isThumb2Function());
+    Done = rewriteT2FrameIndex(MI, i, FrameReg, Offset, TII);
+  }
+  if (Done)
+    return 0;
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert((Offset ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode4 ||
+          (MI.getDesc().TSFlags & ARMII::AddrModeMask) == ARMII::AddrMode6) &&
+         "This code isn't needed if offset already handled!");
+
+  unsigned ScratchReg = 0;
+  int PIdx = MI.findFirstPredOperandIdx();
+  ARMCC::CondCodes Pred = (PIdx == -1)
+    ? ARMCC::AL : (ARMCC::CondCodes)MI.getOperand(PIdx).getImm();
+  unsigned PredReg = (PIdx == -1) ? 0 : MI.getOperand(PIdx+1).getReg();
+  if (Offset == 0)
+    // Must be addrmode4/6.
+    MI.getOperand(i).ChangeToRegister(FrameReg, false, false, false);
+  else {
+    // Materialize FrameReg+Offset into a fresh virtual register and use it
+    // as the new base; the scavenger assigns it a physical register later.
+    ScratchReg = MF.getRegInfo().createVirtualRegister(ARM::GPRRegisterClass);
+    if (Value) *Value = Offset;
+    if (!AFI->isThumbFunction())
+      emitARMRegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                              Offset, Pred, PredReg, TII);
+    else {
+      assert(AFI->isThumb2Function());
+      emitT2RegPlusImmediate(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg,
+                             Offset, Pred, PredReg, TII);
+    }
+    MI.getOperand(i).ChangeToRegister(ScratchReg, false, false, true);
+    if (!ReuseFrameIndexVals)
+      ScratchReg = 0;
+  }
+  return ScratchReg;
+}
+
+/// Move iterator past the next bunch of callee save load / store ops for
+/// the particular spill area (1: integer area 1, 2: integer area 2,
+/// 3: fp area, 0: don't care).
+///
+/// MBBI advances while it points at an Opc1/Opc2 instruction whose operand 1
+/// is still a frame index (i.e. an unprocessed callee-save spill/restore)
+/// and, when Area is nonzero, whose register (operand 0) belongs to Area.
+static void movePastCSLoadStoreOps(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator &MBBI,
+                                   int Opc1, int Opc2, unsigned Area,
+                                   const ARMSubtarget &STI) {
+  while (MBBI != MBB.end() &&
+         ((MBBI->getOpcode() == Opc1) || (MBBI->getOpcode() == Opc2)) &&
+         MBBI->getOperand(1).isFI()) {
+    if (Area != 0) {
+      bool Done = false;
+      unsigned Category = 0;
+      switch (MBBI->getOperand(0).getReg()) {
+      case ARM::R4:  case ARM::R5:  case ARM::R6: case ARM::R7:
+      case ARM::LR:
+        Category = 1;
+        break;
+      case ARM::R8:  case ARM::R9:  case ARM::R10: case ARM::R11:
+        // R8-R11 live in spill area 2 on Darwin, area 1 elsewhere.
+        Category = STI.isTargetDarwin() ? 2 : 1;
+        break;
+      case ARM::D8:  case ARM::D9:  case ARM::D10: case ARM::D11:
+      case ARM::D12: case ARM::D13: case ARM::D14: case ARM::D15:
+        Category = 3;
+        break;
+      default:
+        Done = true;  // Not a callee-saved register we track.
+        break;
+      }
+      if (Done || Category != Area)
+        break;
+    }
+
+    ++MBBI;
+  }
+}
+
+// emitPrologue - Insert prologue code into the entry block: allocate the
+// vararg save area, push the callee-save spill areas, establish the frame
+// pointer, allocate locals, and dynamically realign the stack if required.
+void ARMBaseRegisterInfo::
+emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitPrologue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+  // Determine the size of each callee-save spill area and record which
+  // frame index belongs to which callee-save spill area.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  // Allocate the vararg register save area. This is not counted in NumBytes.
+  if (VARegSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -VARegSaveSize);
+
+  if (!AFI->hasStackFrame()) {
+    // Frameless function: just allocate the local area, if any.
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+    return;
+  }
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetDarwin()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      AFI->addDPRCalleeSavedAreaFrame(FI);
+      DPRCSSize += 8;
+    }
+  }
+
+  // Build the new SUBri to adjust SP for integer callee-save spill area 1.
+  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS1Size);
+  movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 1, STI);
+
+  // Set FP to point to the stack slot that contains the previous FP.
+  // For Darwin, FP is R7, which has now been stored in spill area 1.
+  // Otherwise, if this is not Darwin, all the callee-saved registers go
+  // into spill area 1, including the FP in R11.  In either case, it is
+  // now safe to emit this assignment.
+  if (STI.isTargetDarwin() || hasFP(MF)) {
+    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(ADDriOpc), FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0);
+    AddDefaultCC(AddDefaultPred(MIB));
+  }
+
+  // Build the new SUBri to adjust SP for integer callee-save spill area 2.
+  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -GPRCS2Size);
+
+  // Build the new SUBri to adjust SP for FP callee-save spill area.
+  movePastCSLoadStoreOps(MBB, MBBI, ARM::STR, ARM::t2STRi12, 2, STI);
+  emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRCSSize);
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+  movePastCSLoadStoreOps(MBB, MBBI, ARM::VSTRD, 0, 3, STI);
+  NumBytes = DPRCSOffset;
+  if (NumBytes) {
+    // Adjust SP after all the callee-save spills.
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, -NumBytes);
+  }
+
+  if (STI.isTargetELF() && hasFP(MF)) {
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+  }
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+
+  // If we need dynamic stack realignment, do it here.
+  if (needsStackRealignment(MF)) {
+    unsigned MaxAlign = MFI->getMaxAlignment();
+    assert (!AFI->isThumb1OnlyFunction());
+    if (!AFI->isThumbFunction()) {
+      // Emit bic sp, sp, MaxAlign
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::BICri), ARM::SP)
+                                  .addReg(ARM::SP, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+    } else {
+      // We cannot use sp as source/dest register here, thus we're emitting the
+      // following sequence:
+      // mov r4, sp
+      // bic r4, r4, MaxAlign
+      // mov sp, r4
+      // FIXME: It will be better just to find spare register here.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2tgpr), ARM::R4)
+        .addReg(ARM::SP, RegState::Kill);
+      AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl,
+                                          TII.get(ARM::t2BICri), ARM::R4)
+                                  .addReg(ARM::R4, RegState::Kill)
+                                  .addImm(MaxAlign-1)));
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
+        .addReg(ARM::R4, RegState::Kill);
+    }
+  }
+}
+
+/// isCalleeSavedRegister - Return true if Reg appears in the
+/// zero-terminated register list CSRegs.
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+  for (const unsigned *R = CSRegs; *R; ++R)
+    if (*R == Reg)
+      return true;
+  return false;
+}
+
+/// isCSRestore - Return true if MI reloads a callee-saved register from a
+/// frame-index stack slot (an LDR, t2LDRi12, or VLDRD whose operand 1 is
+/// still a frame index and whose operand 0 is in CSRegs).
+static bool isCSRestore(MachineInstr *MI,
+                        const ARMBaseInstrInfo &TII,
+                        const unsigned *CSRegs) {
+  int Opc = MI->getOpcode();
+  if (Opc != (int)ARM::VLDRD && Opc != (int)ARM::LDR &&
+      Opc != (int)ARM::t2LDRi12)
+    return false;
+  if (!MI->getOperand(1).isFI())
+    return false;
+  return isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs);
+}
+
+// emitEpilogue - Insert epilogue code before the return instruction in MBB:
+// rewind past the callee-save restores, deallocate locals (via SP or FP),
+// step SP back over each spill area, and release the vararg save area.
+void ARMBaseRegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert(MBBI->getDesc().isReturn() &&
+         "Can only insert epilog into returning blocks");
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  assert(!AFI->isThumb1OnlyFunction() &&
+         "This emitEpilogue does not support Thumb1!");
+  bool isARM = !AFI->isThumbFunction();
+
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    const unsigned *CSRegs = getCalleeSavedRegs();
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, TII, CSRegs));
+      if (!isCSRestore(MBBI, TII, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    // Darwin ABI requires FP to point to the stack slot that contains the
+    // previous FP.
+    bool HasFP = hasFP(MF);
+    if ((STI.isTargetDarwin() && NumBytes) || HasFP) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      // Reset SP based on frame pointer only if the stack frame extends beyond
+      // frame pointer stack slot or target is ELF and the function has FP.
+      if (HasFP ||
+          AFI->getGPRCalleeSavedArea2Size() ||
+          AFI->getDPRCalleeSavedAreaSize()  ||
+          AFI->getDPRCalleeSavedAreaOffset()) {
+        if (NumBytes) {
+          if (isARM)
+            emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                    ARMCC::AL, 0, TII);
+          else
+            emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
+                                   ARMCC::AL, 0, TII);
+        } else {
+          // Thumb2 or ARM.
+          if (isARM)
+            BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
+              .addReg(FramePtr)
+              .addImm((unsigned)ARMCC::AL).addReg(0).addReg(0);
+          else
+            BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr), ARM::SP)
+              .addReg(FramePtr);
+        }
+      }
+    } else if (NumBytes)
+      emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+
+    // Move SP to start of integer callee save spill area 2.
+    movePastCSLoadStoreOps(MBB, MBBI, ARM::VLDRD, 0, 3, STI);
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedAreaSize());
+
+    // Move SP to start of integer callee save spill area 1.
+    movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 2, STI);
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea2Size());
+
+    // Move SP to SP upon entry to the function.
+    movePastCSLoadStoreOps(MBB, MBBI, ARM::LDR, ARM::t2LDRi12, 1, STI);
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getGPRCalleeSavedArea1Size());
+  }
+
+  if (VARegSaveSize)
+    emitSPUpdate(isARM, MBB, MBBI, dl, TII, VARegSaveSize);
+}
+
+#include "ARMGenRegisterInfo.inc"
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
new file mode 100644
index 0000000..33ba21d
--- /dev/null
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -0,0 +1,163 @@
+//===- ARMBaseRegisterInfo.h - ARM Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the base ARM implementation of TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMBASEREGISTERINFO_H
+#define ARMBASEREGISTERINFO_H
+
+#include "ARM.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "ARMGenRegisterInfo.h.inc"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+  class Type;
+
+/// Register allocation hints.
+namespace ARMRI {
+  enum {
+    RegPairOdd  = 1,  // Prefer the odd member of an even/odd register pair.
+    RegPairEven = 2   // Prefer the even member of an even/odd register pair.
+  };
+}
+
+/// isARMLowRegister - Returns true if the register is low register r0-r7.
+/// Enumerated case-by-case rather than as a range check — register enum
+/// values are tablegen-assigned, presumably not guaranteed contiguous.
+static inline bool isARMLowRegister(unsigned Reg) {
+  using namespace ARM;
+  switch (Reg) {
+  case R0:  case R1:  case R2:  case R3:
+  case R4:  case R5:  case R6:  case R7:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// ARMBaseRegisterInfo - Register information shared by the ARM backends.
+/// The constructor is protected, so this class is only used through
+/// subclasses.
+struct ARMBaseRegisterInfo : public ARMGenRegisterInfo {
+protected:
+  const ARMBaseInstrInfo &TII;
+  const ARMSubtarget &STI;
+
+  /// FramePtr - ARM physical register used as frame ptr.
+  unsigned FramePtr;
+
+  // Can be only subclassed.
+  explicit ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii,
+                               const ARMSubtarget &STI);
+
+  // Return the opcode that implements 'Op', or 0 if no opcode
+  unsigned getOpcode(int Op) const;
+
+public:
+  /// getRegisterNumbering - Given the enum value for some register, e.g.
+  /// ARM::LR, return the number that it corresponds to (e.g. 14). It
+  /// also returns true in isSPVFP if the register is a single precision
+  /// VFP register.
+  static unsigned getRegisterNumbering(unsigned RegEnum, bool *isSPVFP = 0);
+
+  /// Code Generation virtual methods...
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const*
+  getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// getMatchingSuperRegClass - Return a subclass of the specified register
+  /// class A so that each register in it has a sub-register of the
+  /// specified sub-register index which is in the specified register class B.
+  virtual const TargetRegisterClass *
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const;
+
+  const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+
+  /// getAllocationOrder - Return the allocation order for RC, possibly
+  /// adjusted for the even/odd pairing hints (see ARMRI).
+  std::pair<TargetRegisterClass::iterator,TargetRegisterClass::iterator>
+  getAllocationOrder(const TargetRegisterClass *RC,
+                     unsigned HintType, unsigned HintReg,
+                     const MachineFunction &MF) const;
+
+  unsigned ResolveRegAllocHint(unsigned Type, unsigned Reg,
+                               const MachineFunction &MF) const;
+
+  void UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
+                          MachineFunction &MF) const;
+
+  // Frame pointer / stack realignment queries.
+  bool hasFP(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const;
+  bool needsStackRealignment(const MachineFunction &MF) const;
+
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+  bool isLowRegister(unsigned Reg) const;
+
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  virtual void emitLoadConstPool(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &MBBI,
+                                 DebugLoc dl,
+                                 unsigned DestReg, unsigned SubIdx,
+                                 int Val,
+                                 ARMCC::CondCodes Pred = ARMCC::AL,
+                                 unsigned PredReg = 0) const;
+
+  /// Frame lowering and pseudo-instruction elimination hooks.
+  virtual bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+
+  virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  virtual bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+
+  virtual bool hasReservedCallFrame(MachineFunction &MF) const;
+
+  virtual void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                             MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator I) const;
+
+  virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, int *Value = NULL,
+                                       RegScavenger *RS = NULL) const;
+
+  virtual void emitPrologue(MachineFunction &MF) const;
+  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+private:
+  unsigned estimateRSStackSizeLimit(MachineFunction &MF) const;
+
+  // Even/odd pairing helpers for the allocation hints: map a register to
+  // its pair partner, or 0 if it has no usable partner.
+  unsigned getRegisterPairEven(unsigned Reg, const MachineFunction &MF) const;
+
+  unsigned getRegisterPairOdd(unsigned Reg, const MachineFunction &MF) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h
new file mode 100644
index 0000000..3b38375
--- /dev/null
+++ b/lib/Target/ARM/ARMBuildAttrs.h
@@ -0,0 +1,64 @@
+//===-------- ARMBuildAttrs.h - ARM Build Attributes ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains enumerations and support routines for ARM build attributes
+// as defined in ARM ABI addenda document (ABI release 2.07).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ARMBUILDATTRS_H
+#define TARGET_ARMBUILDATTRS_H
+
+namespace ARMBuildAttrs {
+  // Build-attribute tag numbers as defined by the ARM ABI addenda document
+  // (ABI release 2.07). Gaps in the numbering are tags the ABI defines but
+  // this backend does not reference.
+  enum {
+    File                      = 1,
+    Section                   = 2,
+    Symbol                    = 3,
+    CPU_raw_name              = 4,
+    CPU_name                  = 5,
+    CPU_arch                  = 6,
+    CPU_arch_profile          = 7,
+    ARM_ISA_use               = 8,
+    THUMB_ISA_use             = 9,
+    VFP_arch                  = 10,
+    WMMX_arch                 = 11,
+    Advanced_SIMD_arch        = 12,
+    PCS_config                = 13,
+    ABI_PCS_R9_use            = 14,
+    ABI_PCS_RW_data           = 15,
+    ABI_PCS_RO_data           = 16,
+    ABI_PCS_GOT_use           = 17,
+    ABI_PCS_wchar_t           = 18,
+    ABI_FP_rounding           = 19,
+    ABI_FP_denormal           = 20,
+    ABI_FP_exceptions         = 21,
+    ABI_FP_user_exceptions    = 22,
+    ABI_FP_number_model       = 23,
+    ABI_align8_needed         = 24,
+    ABI_align8_preserved      = 25,
+    ABI_enum_size             = 26,
+    ABI_HardFP_use            = 27,
+    ABI_VFP_args              = 28,
+    ABI_WMMX_args             = 29,
+    ABI_optimization_goals    = 30,
+    ABI_FP_optimization_goals = 31,
+    compatibility             = 32,
+    CPU_unaligned_access      = 34,
+    VFP_HP_extension          = 36,
+    ABI_FP_16bit_format       = 38,
+    nodefaults                = 64,
+    also_compatible_with      = 65,
+    T2EE_use                  = 66,
+    conformance               = 67,
+    Virtualization_use        = 68,
+    MPextension_use           = 70
+  };
+}
+
+#endif // TARGET_ARMBUILDATTRS_H
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
new file mode 100644
index 0000000..8fdb07f
--- /dev/null
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -0,0 +1,132 @@
+//===- ARMCallingConv.td - Calling Conventions for ARM ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for ARM architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.  The
+/// string F is spliced verbatim into the generated C++ predicate.
+class CCIfSubtarget<string F, CCAction A>:
+  CCIf<!strconcat("State.getTarget().getSubtarget<ARMSubtarget>().", F), A>;
+
+/// CCIfAlign - Match if the original alignment of the arg is equal to Align.
+class CCIfAlign<string Align, CCAction A>:
+  CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+
+//===----------------------------------------------------------------------===//
+// ARM APCS Calling Convention
+//===----------------------------------------------------------------------===//
+def CC_ARM_APCS : CallingConv<[
+
+  // Promote small integer arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
+
+  // f32 travels in a GPR; i32 arguments use r0-r3 before the stack.
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  // Under APCS everything on the stack is only 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 4>>
+]>;
+
+def RetCC_ARM_APCS : CallingConv<[
+  // f32 is returned in a GPR.
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // f64/v2f64 return values are placed in GPR pairs by the custom handler.
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
+
+  // i32 in r0-r3; i64 in the (r0,r1) or (r2,r3) register pair.
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention, common parts
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_Common : CallingConv<[
+
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // i64/f64 is passed in even pairs of GPRs
+  // i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
+  // (and the same is true for f64 if VFP is not enabled)
+  // The [R0, R2] candidate list restricts an 8-aligned word to even
+  // registers; taking R2 shadows R1 so R1 acts as the pad register.
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
+  CCIfType<[i32], CCIf<"State.getNextStackOffset() == 0 &&"
+                       "ArgFlags.getOrigAlign() != 8",
+                       CCAssignToReg<[R0, R1, R2, R3]>>>,
+
+  // Stack slots: 8-byte alignment for 8-aligned halves, f64 and v2f64.
+  CCIfType<[i32], CCIfAlign<"8", CCAssignToStack<4, 8>>>,
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+  CCIfType<[v2f64], CCAssignToStack<16, 8>>
+]>;
+
+def RetCC_ARM_AAPCS_Common : CallingConv<[
+  // i32 in r0-r3; i64 in the (r0,r1) or (r2,r3) register pair.
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[R0, R2], [R1, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS (EABI) Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Soft-float AAPCS: f64/v2f64 are split across GPRs by the custom
+  // handler and f32 travels in a GPR; the rest uses the common rules.
+  CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Soft-float AAPCS returns: f64/v2f64 via the custom handler, f32 in a
+  // GPR, everything else through the common return rules.
+  CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
+  CCIfType<[f32], CCBitConvertToType<i32>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ARM AAPCS-VFP (EABI) Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_ARM_AAPCS_VFP : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Hard-float variant: FP/vector arguments go in VFP registers
+  // (q0-q3 / d0-d7 / s0-s15); anything left falls through to the
+  // common AAPCS rules.
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<CC_ARM_AAPCS_Common>
+]>;
+
+def RetCC_ARM_AAPCS_VFP : CallingConv<[
+  // Handle all vector types as either f64 or v2f64.
+  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+
+  // Hard-float variant: FP/vector values are returned in VFP registers,
+  // mirroring the argument assignment above.
+  CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
+                                 S9, S10, S11, S12, S13, S14, S15]>>,
+  CCDelegateTo<RetCC_ARM_AAPCS_Common>
+]>;
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
new file mode 100644
index 0000000..bd703f4
--- /dev/null
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -0,0 +1,1368 @@
+//===-- ARM/ARMCodeEmitter.cpp - Convert ARM code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the ARM machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMInstrInfo.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#ifndef NDEBUG
+#include <iomanip>
+#endif
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+
+  /// ARMCodeEmitter - MachineFunctionPass that lowers ARM machine
+  /// instructions to relocatable binary machine code through a
+  /// JITCodeEmitter.
+  class ARMCodeEmitter : public MachineFunctionPass {
+    ARMJITInfo                *JTI;
+    const ARMInstrInfo        *II;
+    const TargetData          *TD;
+    const ARMSubtarget        *Subtarget;
+    TargetMachine             &TM;
+    JITCodeEmitter            &MCE;
+    const std::vector<MachineConstantPoolEntry> *MCPEs;
+    const std::vector<MachineJumpTableEntry> *MJTEs;
+    bool IsPIC;
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    static char ID;
+  public:
+    // Fix: the original constructor left the Subtarget pointer member
+    // uninitialized (it is only assigned in runOnMachineFunction); zero it
+    // here like the other pointer members so it is never indeterminate.
+    ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+      : MachineFunctionPass(&ID), JTI(0), II((ARMInstrInfo*)tm.getInstrInfo()),
+        TD(tm.getTargetData()), Subtarget(0), TM(tm),
+        MCE(mce), MCPEs(0), MJTEs(0),
+        IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+    unsigned getBinaryCodeForInstr(const MachineInstr &MI);
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "ARM Machine Code Emitter";
+    }
+
+    /// emitInstruction - Dispatch a single MachineInstr to the per-format
+    /// emitter below, based on its TSFlags encoding-format field.
+    void emitInstruction(const MachineInstr &MI);
+
+  private:
+
+    void emitWordLE(unsigned Binary);
+    void emitDWordLE(uint64_t Binary);
+    void emitConstPoolInstruction(const MachineInstr &MI);
+    void emitMOVi2piecesInstruction(const MachineInstr &MI);
+    void emitLEApcrelJTInstruction(const MachineInstr &MI);
+    void emitPseudoMoveInstruction(const MachineInstr &MI);
+    void addPCLabel(unsigned LabelID);
+    void emitPseudoInstruction(const MachineInstr &MI);
+    unsigned getMachineSoRegOpValue(const MachineInstr &MI,
+                                    const TargetInstrDesc &TID,
+                                    const MachineOperand &MO,
+                                    unsigned OpIdx);
+
+    unsigned getMachineSoImmOpValue(unsigned SoImm);
+
+    unsigned getAddrModeSBit(const MachineInstr &MI,
+                             const TargetInstrDesc &TID) const;
+
+    void emitDataProcessingInstruction(const MachineInstr &MI,
+                                       unsigned ImplicitRd = 0,
+                                       unsigned ImplicitRn = 0);
+
+    void emitLoadStoreInstruction(const MachineInstr &MI,
+                                  unsigned ImplicitRd = 0,
+                                  unsigned ImplicitRn = 0);
+
+    void emitMiscLoadStoreInstruction(const MachineInstr &MI,
+                                      unsigned ImplicitRn = 0);
+
+    void emitLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+    void emitMulFrmInstruction(const MachineInstr &MI);
+
+    void emitExtendInstruction(const MachineInstr &MI);
+
+    void emitMiscArithInstruction(const MachineInstr &MI);
+
+    void emitBranchInstruction(const MachineInstr &MI);
+
+    void emitInlineJumpTable(unsigned JTIndex);
+
+    void emitMiscBranchInstruction(const MachineInstr &MI);
+
+    void emitVFPArithInstruction(const MachineInstr &MI);
+
+    void emitVFPConversionInstruction(const MachineInstr &MI);
+
+    void emitVFPLoadStoreInstruction(const MachineInstr &MI);
+
+    void emitVFPLoadStoreMultipleInstruction(const MachineInstr &MI);
+
+    void emitMiscInstruction(const MachineInstr &MI);
+
+    /// getMachineOpValue - Return binary encoding of operand. If the machine
+    /// operand requires relocation, record the relocation and return zero.
+    unsigned getMachineOpValue(const MachineInstr &MI,const MachineOperand &MO);
+    unsigned getMachineOpValue(const MachineInstr &MI, unsigned OpIdx) {
+      return getMachineOpValue(MI, MI.getOperand(OpIdx));
+    }
+
+    /// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+    ///
+    unsigned getShiftOp(unsigned Imm) const ;
+
+    /// Routines that handle operands which add machine relocations which are
+    /// fixed up by the relocation stage.
+    void emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+                           bool MayNeedFarStub,  bool Indirect,
+                           intptr_t ACPV = 0);
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc);
+    void emitJumpTableAddress(unsigned JTIndex, unsigned Reloc);
+    void emitMachineBasicBlock(MachineBasicBlock *BB, unsigned Reloc,
+                               intptr_t JTBase = 0);
+  };
+}
+
+char ARMCodeEmitter::ID = 0;
+
+/// createARMJITCodeEmitterPass - Return a pass that emits the collected ARM 
+/// code to the specified MCE object.
+FunctionPass *llvm::createARMJITCodeEmitterPass(ARMBaseTargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  // Ownership of the new pass transfers to the pass manager that adds it.
+  return new ARMCodeEmitter(TM, JCE);
+}
+
+bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  // NOTE(review): this condition is a tautology -- (X != A || X != B) with
+  // A != B is always true -- so the assert can never fire.  The intended
+  // predicate needs to be confirmed: the message says static/default only,
+  // yet PIC is handled below via IsPIC.
+  assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
+          MF.getTarget().getRelocationModel() != Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+  // Refresh per-function target state; the constructor cannot provide these.
+  JTI = ((ARMTargetMachine&)MF.getTarget()).getJITInfo();
+  II = ((ARMTargetMachine&)MF.getTarget()).getInstrInfo();
+  TD = ((ARMTargetMachine&)MF.getTarget()).getTargetData();
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  MCPEs = &MF.getConstantPool()->getConstants();
+  MJTEs = 0;
+  if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
+  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+  JTI->Initialize(MF, IsPIC);
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
+
+  // Emit every basic block; re-emit the whole function while finishFunction
+  // reports failure (presumably a buffer-space retry -- see JITCodeEmitter).
+  do {
+    DEBUG(errs() << "JITTing function '"
+          << MF.getFunction()->getName() << "'\n");
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end();
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I)
+        emitInstruction(*I);
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// getShiftOp - Return the shift opcode (bit[6:5]) of the immediate value.
+///
+unsigned ARMCodeEmitter::getShiftOp(unsigned Imm) const {
+  switch (ARM_AM::getAM2ShiftOpc(Imm)) {
+  default: llvm_unreachable("Unknown shift opc!");
+  case ARM_AM::asr: return 2;
+  case ARM_AM::lsl: return 0;
+  case ARM_AM::lsr: return 1;
+  // ror and rrx share encoding 0b11; rrx is ror with a zero shift amount.
+  case ARM_AM::ror:
+  case ARM_AM::rrx: return 3;
+  }
+  return 0; // Unreachable; silences "no return" warnings on some compilers.
+}
+
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+/// Registers and immediates encode directly; all address-like operands
+/// (globals, symbols, constant pool, jump tables, basic blocks) record a
+/// relocation and leave a zero placeholder in the instruction word.
+unsigned ARMCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                           const MachineOperand &MO) {
+  if (MO.isReg())
+    return ARMRegisterInfo::getRegisterNumbering(MO.getReg());
+  else if (MO.isImm())
+    return static_cast<unsigned>(MO.getImm());
+  else if (MO.isGlobal())
+    emitGlobalAddress(MO.getGlobal(), ARM::reloc_arm_branch, true, false);
+  else if (MO.isSymbol())
+    emitExternalSymbolAddress(MO.getSymbolName(), ARM::reloc_arm_branch);
+  else if (MO.isCPI()) {
+    const TargetInstrDesc &TID = MI.getDesc();
+    // For VFP load, the immediate offset is multiplied by 4.
+    unsigned Reloc =  ((TID.TSFlags & ARMII::FormMask) == ARMII::VFPLdStFrm)
+      ? ARM::reloc_arm_vfp_cp_entry : ARM::reloc_arm_cp_entry;
+    emitConstPoolAddress(MO.getIndex(), Reloc);
+  } else if (MO.isJTI())
+    emitJumpTableAddress(MO.getIndex(), ARM::reloc_arm_relative);
+  else if (MO.isMBB())
+    emitMachineBasicBlock(MO.getMBB(), ARM::reloc_arm_branch);
+  else {
+#ifndef NDEBUG
+    errs() << MO;
+#endif
+    llvm_unreachable(0);
+  }
+  return 0;
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream.
+/// Indirect selects an indirect-symbol relocation (e.g. via a stub) rather
+/// than a direct reference to GV; ACPV passes through as the relocation's
+/// constant value.
+void ARMCodeEmitter::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+                                       bool MayNeedFarStub, bool Indirect,
+                                       intptr_t ACPV) {
+  MachineRelocation MR = Indirect
+    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+                                           GV, ACPV, MayNeedFarStub)
+    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                               GV, ACPV, MayNeedFarStub);
+  MCE.addRelocation(MR);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol
+/// to be emitted to the current location in the function, and allow it to be
+/// PC relative.  ES is the symbol name; Reloc selects the relocation kind.
+void ARMCodeEmitter::emitExternalSymbolAddress(const char *ES, unsigned Reloc) {
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES));
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be
+/// PC relative.
+void ARMCodeEmitter::emitConstPoolAddress(unsigned CPI, unsigned Reloc) {
+  // Tell JIT emitter we'll resolve the address.
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, 0, true));
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+void ARMCodeEmitter::emitJumpTableAddress(unsigned JTIndex, unsigned Reloc) {
+  // Final 'true' mirrors emitConstPoolAddress: the target resolves this.
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTIndex, 0, true));
+}
+
+/// emitMachineBasicBlock - Emit the specified address basic block.
+/// JTBase is an optional jump-table base address carried in the relocation.
+void ARMCodeEmitter::emitMachineBasicBlock(MachineBasicBlock *BB,
+                                           unsigned Reloc, intptr_t JTBase) {
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             Reloc, BB, JTBase));
+}
+
+// emitWordLE - Emit one 32-bit little-endian instruction word, logging the
+// encoding when debug output is enabled.
+void ARMCodeEmitter::emitWordLE(unsigned Binary) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Binary) << "\n");
+  MCE.emitWordLE(Binary);
+}
+
+// emitDWordLE - Emit a 64-bit little-endian value (used for f64 constant
+// pool entries), logging the encoding when debug output is enabled.
+void ARMCodeEmitter::emitDWordLE(uint64_t Binary) {
+  DEBUG(errs() << "  0x";
+        errs().write_hex(Binary) << "\n");
+  MCE.emitDWordLE(Binary);
+}
+
+// emitInstruction - Emit one machine instruction, dispatching on its
+// encoding-format field (TSFlags & FormMask) to the matching per-format
+// emitter.  Debug-location bookkeeping brackets the emission.
+void ARMCodeEmitter::emitInstruction(const MachineInstr &MI) {
+  DEBUG(errs() << "JIT: " << (void*)MCE.getCurrentPCValue() << ":\t" << MI);
+
+  MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+  NumEmitted++;  // Keep track of the # of mi's emitted
+  switch (MI.getDesc().TSFlags & ARMII::FormMask) {
+  default: {
+    llvm_unreachable("Unhandled instruction encoding format!");
+    break;
+  }
+  case ARMII::Pseudo:
+    emitPseudoInstruction(MI);
+    break;
+  case ARMII::DPFrm:
+  case ARMII::DPSoRegFrm:
+    emitDataProcessingInstruction(MI);
+    break;
+  case ARMII::LdFrm:
+  case ARMII::StFrm:
+    emitLoadStoreInstruction(MI);
+    break;
+  case ARMII::LdMiscFrm:
+  case ARMII::StMiscFrm:
+    emitMiscLoadStoreInstruction(MI);
+    break;
+  case ARMII::LdStMulFrm:
+    emitLoadStoreMultipleInstruction(MI);
+    break;
+  case ARMII::MulFrm:
+    emitMulFrmInstruction(MI);
+    break;
+  case ARMII::ExtFrm:
+    emitExtendInstruction(MI);
+    break;
+  case ARMII::ArithMiscFrm:
+    emitMiscArithInstruction(MI);
+    break;
+  case ARMII::BrFrm:
+    emitBranchInstruction(MI);
+    break;
+  case ARMII::BrMiscFrm:
+    emitMiscBranchInstruction(MI);
+    break;
+  // VFP instructions.
+  case ARMII::VFPUnaryFrm:
+  case ARMII::VFPBinaryFrm:
+    emitVFPArithInstruction(MI);
+    break;
+  case ARMII::VFPConv1Frm:
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+  case ARMII::VFPConv4Frm:
+  case ARMII::VFPConv5Frm:
+    emitVFPConversionInstruction(MI);
+    break;
+  case ARMII::VFPLdStFrm:
+    emitVFPLoadStoreInstruction(MI);
+    break;
+  case ARMII::VFPLdStMulFrm:
+    emitVFPLoadStoreMultipleInstruction(MI);
+    break;
+  case ARMII::VFPMiscFrm:
+    emitMiscInstruction(MI);
+    break;
+  }
+  MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
+
+// emitConstPoolInstruction - Emit one CONSTPOOL_ENTRY pseudo: record the
+// island entry's address for later relocation, then emit either a zero
+// placeholder (for address-valued entries, fixed up by relocation) or the
+// literal bits of an integer/FP constant.
+void ARMCodeEmitter::emitConstPoolInstruction(const MachineInstr &MI) {
+  unsigned CPI = MI.getOperand(0).getImm();       // CP instruction index.
+  unsigned CPIndex = MI.getOperand(1).getIndex(); // Actual cp entry index.
+  const MachineConstantPoolEntry &MCPE = (*MCPEs)[CPIndex];
+
+  // Remember the CONSTPOOL_ENTRY address for later relocation.
+  JTI->addConstantPoolEntryAddr(CPI, MCE.getCurrentPCValue());
+
+  // Emit constpool island entry. In most cases, the actual values will be
+  // resolved and relocated after code emission.
+  if (MCPE.isMachineConstantPoolEntry()) {
+    ARMConstantPoolValue *ACPV =
+      static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
+
+    DEBUG(errs() << "  ** ARM constant pool #" << CPI << " @ "
+          << (void*)MCE.getCurrentPCValue() << " " << *ACPV << '\n');
+
+    assert(ACPV->isGlobalValue() && "unsupported constant pool value");
+    GlobalValue *GV = ACPV->getGV();
+    if (GV) {
+      Reloc::Model RelocM = TM.getRelocationModel();
+      emitGlobalAddress(GV, ARM::reloc_arm_machine_cp_entry,
+                        isa<Function>(GV),
+                        Subtarget->GVIsIndirectSymbol(GV, RelocM),
+                        (intptr_t)ACPV);
+     } else  {
+      emitExternalSymbolAddress(ACPV->getSymbol(), ARM::reloc_arm_absolute);
+    }
+    // Placeholder word; the relocation recorded above fills it in.
+    emitWordLE(0);
+  } else {
+    Constant *CV = MCPE.Val.ConstVal;
+
+    DEBUG({
+        errs() << "  ** Constant pool #" << CPI << " @ "
+               << (void*)MCE.getCurrentPCValue() << " ";
+        if (const Function *F = dyn_cast<Function>(CV))
+          errs() << F->getName();
+        else
+          errs() << *CV;
+        errs() << '\n';
+      });
+
+    if (GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+      emitGlobalAddress(GV, ARM::reloc_arm_absolute, isa<Function>(GV), false);
+      emitWordLE(0);
+    } else if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+      // Only the low 32 bits of the APInt are emitted here.
+      uint32_t Val = *(uint32_t*)CI->getValue().getRawData();
+      emitWordLE(Val);
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+      if (CFP->getType()->isFloatTy())
+        emitWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      else if (CFP->getType()->isDoubleTy())
+        emitDWordLE(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
+      else {
+        llvm_unreachable("Unable to handle this constantpool entry!");
+      }
+    } else {
+      llvm_unreachable("Unable to handle this constantpool entry!");
+    }
+  }
+}
+
+// emitMOVi2piecesInstruction - Materialize an immediate that is not a single
+// so_imm by emitting a 'mov' of the first piece followed by an 'orr' of the
+// second piece into the same destination register.
+void ARMCodeEmitter::emitMOVi2piecesInstruction(const MachineInstr &MI) {
+  const MachineOperand &MO0 = MI.getOperand(0);
+  const MachineOperand &MO1 = MI.getOperand(1);
+  // Bug fix: the assert used to call getSOImmVal(MO1.isImm()) -- validating
+  // the *boolean* isImm() result, not the immediate.  Validate that the
+  // immediate really decomposes into two so_imm pieces.
+  assert(MO1.isImm() && ARM_AM::isSOImmTwoPartVal(MO1.getImm()) &&
+                                            "Not a valid so_imm value!");
+  unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO1.getImm());
+  unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO1.getImm());
+
+  // Emit the 'mov' instruction.
+  unsigned Binary = 0xd << 21;  // mov: Insts{24-21} = 0b1101
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode so_imm.
+  // Set bit I(25) to identify this is the immediate form of <shifter_op>
+  Binary |= 1 << ARMII::I_BitShift;
+  Binary |= getMachineSoImmOpValue(V1);
+  emitWordLE(Binary);
+
+  // Now the 'orr' instruction, merging the second piece into Rd.
+  Binary = 0xc << 21;  // orr: Insts{24-21} = 0b1100
+
+  // Set the conditional execution predicate.
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRdShift;
+
+  // Encode Rn (same register as Rd: orr Rd, Rd, #V2).
+  Binary |= getMachineOpValue(MI, MO0) << ARMII::RegRnShift;
+
+  // Encode so_imm.
+  // Set bit I(25) to identify this is the immediate form of <shifter_op>
+  Binary |= 1 << ARMII::I_BitShift;
+  Binary |= getMachineSoImmOpValue(V2);
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitLEApcrelJTInstruction(const MachineInstr &MI) {
+  // It's basically add r, pc, (LJTI - $+8)
+
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  // Emit the 'add' instruction.
+  unsigned Binary = 0x4 << 21;  // add: Insts{24-21} = 0b0100
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, TID);
+
+  // Encode Rd.
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode Rn which is PC.
+  Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
+
+  // Encode the displacement as an immediate (I bit set); the relocation
+  // recorded here supplies the jump-table base offset at fixup time.
+  Binary |= 1 << ARMII::I_BitShift;
+  emitJumpTableAddress(MI.getOperand(1).getIndex(), ARM::reloc_arm_jt_base);
+
+  emitWordLE(Binary);
+}
+
+// emitPseudoMoveInstruction - Emit MOVrx / MOVsrl_flag / MOVsra_flag, the
+// single-register-shift move pseudos (rrx, lsr #1, asr #1).
+void ARMCodeEmitter::emitPseudoMoveInstruction(const MachineInstr &MI) {
+  unsigned Opcode = MI.getDesc().Opcode;
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  if (Opcode == ARM::MOVsrl_flag || Opcode == ARM::MOVsra_flag)
+    Binary |= 1 << ARMII::S_BitShift;
+
+  // Encode register def if there is one.
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRdShift;
+
+  // Encode the shift operation.
+  switch (Opcode) {
+  default: break;
+  case ARM::MOVrx:
+    // rrx
+    Binary |= 0x6 << 4;
+    break;
+  case ARM::MOVsrl_flag:
+    // lsr #1
+    Binary |= (0x2 << 4) | (1 << 7);
+    break;
+  case ARM::MOVsra_flag:
+    // asr #1
+    Binary |= (0x4 << 4) | (1 << 7);
+    break;
+  }
+
+  // Encode register Rm.
+  Binary |= getMachineOpValue(MI, 1);
+
+  emitWordLE(Binary);
+}
+
+// addPCLabel - Record the current PC as the address of pc-relative label
+// LabelID so PIC relocations can be resolved against it later.
+void ARMCodeEmitter::addPCLabel(unsigned LabelID) {
+  DEBUG(errs() << "  ** LPC" << LabelID << " @ "
+        << (void*)MCE.getCurrentPCValue() << '\n');
+  JTI->addPCLabelAddr(LabelID, MCE.getCurrentPCValue());
+}
+
+// emitPseudoInstruction - Expand and emit pseudo-instructions: labels,
+// constant-pool entries, the PIC pc-relative pseudos, and the multi-word
+// immediate / jump-table-address materializations.
+void ARMCodeEmitter::emitPseudoInstruction(const MachineInstr &MI) {
+  unsigned Opcode = MI.getDesc().Opcode;
+  switch (Opcode) {
+  default:
+    llvm_unreachable("ARMCodeEmitter::emitPseudoInstruction");
+  // FIXME: Add support for MOVimm32.
+  case TargetOpcode::INLINEASM: {
+    // We allow inline assembler nodes with empty bodies - they can
+    // implicitly define registers, which is ok for JIT.
+    if (MI.getOperand(0).getSymbolName()[0]) {
+      llvm_report_error("JIT does not support inline asm!");
+    }
+    break;
+  }
+  case TargetOpcode::DBG_LABEL:
+  case TargetOpcode::EH_LABEL:
+    MCE.emitLabel(MI.getOperand(0).getImm());
+    break;
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    // Do nothing.
+    break;
+  case ARM::CONSTPOOL_ENTRY:
+    emitConstPoolInstruction(MI);
+    break;
+  case ARM::PICADD: {
+    // Remember the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // PICADD is just an add instruction that implicitly reads pc.
+    emitDataProcessingInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDR:
+  case ARM::PICLDRB:
+  case ARM::PICSTR:
+  case ARM::PICSTRB: {
+    // Remember the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitLoadStoreInstruction(MI, 0, ARM::PC);
+    break;
+  }
+  case ARM::PICLDRH:
+  case ARM::PICLDRSH:
+  case ARM::PICLDRSB:
+  case ARM::PICSTRH: {
+    // Remember the address of the PC label for relocation later.
+    addPCLabel(MI.getOperand(2).getImm());
+    // These are just load / store instructions that implicitly read pc.
+    emitMiscLoadStoreInstruction(MI, ARM::PC);
+    break;
+  }
+  case ARM::MOVi2pieces:
+    // Two instructions to materialize a constant.
+    emitMOVi2piecesInstruction(MI);
+    break;
+  case ARM::LEApcrelJT:
+    // Materialize jumptable address.
+    emitLEApcrelJTInstruction(MI);
+    break;
+  case ARM::MOVrx:
+  case ARM::MOVsrl_flag:
+  case ARM::MOVsra_flag:
+    emitPseudoMoveInstruction(MI);
+    break;
+  }
+}
+
+// getMachineSoRegOpValue - Encode the register-shifted <shifter_op>:
+// Rm (from MO) plus the shift kind and either a shift register Rs or a
+// 5-bit shift immediate taken from operands OpIdx+1 and OpIdx+2.
+unsigned ARMCodeEmitter::getMachineSoRegOpValue(
+                                                const MachineInstr &MI,
+                                                const TargetInstrDesc &TID,
+                                                const MachineOperand &MO,
+                                                unsigned OpIdx) {
+  unsigned Binary = getMachineOpValue(MI, MO);
+
+  const MachineOperand &MO1 = MI.getOperand(OpIdx + 1);
+  const MachineOperand &MO2 = MI.getOperand(OpIdx + 2);
+  ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
+
+  // Encode the shift opcode.
+  unsigned SBits = 0;
+  unsigned Rs = MO1.getReg();
+  if (Rs) {
+    // Register-controlled shift.
+    // Set shift operand (bit[7:4]).
+    // LSL - 0001
+    // LSR - 0011
+    // ASR - 0101
+    // ROR - 0111
+    // RRX - 0110 and bit[11:8] clear.
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x1; break;
+    case ARM_AM::lsr: SBits = 0x3; break;
+    case ARM_AM::asr: SBits = 0x5; break;
+    case ARM_AM::ror: SBits = 0x7; break;
+    case ARM_AM::rrx: SBits = 0x6; break;
+    }
+  } else {
+    // Immediate-controlled shift.
+    // Set shift operand (bit[6:4]).
+    // LSL - 000
+    // LSR - 010
+    // ASR - 100
+    // ROR - 110
+    switch (SOpc) {
+    default: llvm_unreachable("Unknown shift opc!");
+    case ARM_AM::lsl: SBits = 0x0; break;
+    case ARM_AM::lsr: SBits = 0x2; break;
+    case ARM_AM::asr: SBits = 0x4; break;
+    case ARM_AM::ror: SBits = 0x6; break;
+    }
+  }
+  Binary |= SBits << 4;
+  if (SOpc == ARM_AM::rrx)
+    return Binary;
+
+  // Encode the shift operation Rs or shift_imm (except rrx).
+  if (Rs) {
+    // Encode Rs bit[11:8].
+    assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
+    return Binary |
+      (ARMRegisterInfo::getRegisterNumbering(Rs) << ARMII::RegRsShift);
+  }
+
+  // Encode shift_imm bit[11:7].
+  return Binary | ARM_AM::getSORegOffset(MO2.getImm()) << 7;
+}
+
+// getMachineSoImmOpValue - Encode an so_imm shifter operand as the
+// 4-bit rotate_imm (bits 11-8) plus the 8-bit immed_8 (bits 7-0).
+unsigned ARMCodeEmitter::getMachineSoImmOpValue(unsigned SoImm) {
+  int SoImmVal = ARM_AM::getSOImmVal(SoImm);
+  assert(SoImmVal != -1 && "Not a valid so_imm value!");
+
+  // Encode rotate_imm.  The field holds rotation/2, hence the >> 1.
+  unsigned Binary = (ARM_AM::getSOImmValRot((unsigned)SoImmVal) >> 1)
+    << ARMII::SoRotImmShift;
+
+  // Encode immed_8.
+  Binary |= ARM_AM::getSOImmValImm((unsigned)SoImmVal);
+  return Binary;
+}
+
+// getAddrModeSBit - Return the S bit (set-flags) mask if MI defines CPSR.
+// Scans the trailing (implicit) operands beyond the descriptor's fixed
+// operand count, from last to first.
+unsigned ARMCodeEmitter::getAddrModeSBit(const MachineInstr &MI,
+                                             const TargetInstrDesc &TID) const {
+  for (unsigned i = MI.getNumOperands(), e = TID.getNumOperands(); i != e; --i){
+    const MachineOperand &MO = MI.getOperand(i-1);
+    if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR)
+      return 1 << ARMII::S_BitShift;
+  }
+  return 0;
+}
+
+// emitDataProcessingInstruction - Emit a DPFrm/DPSoRegFrm instruction.
+// ImplicitRd/ImplicitRn substitute fixed registers (e.g. PC for PICADD)
+// in place of explicit Rd/Rn operands.
+void ARMCodeEmitter::emitDataProcessingInstruction(
+                                                   const MachineInstr &MI,
+                                                   unsigned ImplicitRd,
+                                                   unsigned ImplicitRn) {
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  if (TID.Opcode == ARM::BFC) {
+    llvm_report_error("ARMv6t2 JIT is not yet supported.");
+  }
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, TID);
+
+  // Encode register def if there is one.
+  unsigned NumDefs = TID.getNumDefs();
+  unsigned OpIdx = 0;
+  if (NumDefs)
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+  else if (ImplicitRd)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd)
+               << ARMII::RegRdShift);
+
+  // If this is a two-address operand, skip it. e.g. MOVCCr operand 1.
+  if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // Encode first non-shifter register operand if there is one.
+  bool isUnary = TID.TSFlags & ARMII::UnaryDP;
+  if (!isUnary) {
+    if (ImplicitRn)
+      // Special handling for implicit use (e.g. PC).
+      Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
+                 << ARMII::RegRnShift);
+    else {
+      Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRnShift;
+      ++OpIdx;
+    }
+  }
+
+  // Encode shifter operand: SoReg, plain register Rm, or so_imm.
+  const MachineOperand &MO = MI.getOperand(OpIdx);
+  if ((TID.TSFlags & ARMII::FormMask) == ARMII::DPSoRegFrm) {
+    // Encode SoReg.
+    emitWordLE(Binary | getMachineSoRegOpValue(MI, TID, MO, OpIdx));
+    return;
+  }
+
+  if (MO.isReg()) {
+    // Encode register Rm.
+    emitWordLE(Binary | ARMRegisterInfo::getRegisterNumbering(MO.getReg()));
+    return;
+  }
+
+  // Encode so_imm.
+  Binary |= getMachineSoImmOpValue((unsigned)MO.getImm());
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitLoadStoreInstruction(
+                                              const MachineInstr &MI,
+                                              unsigned ImplicitRd,
+                                              unsigned ImplicitRn) {
+  // Emit an addressing-mode-2 load/store.  ImplicitRd / ImplicitRn, when
+  // non-zero, name physical registers (e.g. PC) that are implicit in the
+  // operand list but must still be encoded in the Rd / Rn fields.
+  const TargetInstrDesc &TID = MI.getDesc();
+  unsigned Form = TID.TSFlags & ARMII::FormMask;
+  bool IsPrePost = (TID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Operand 0 of a pre- and post-indexed store is the address base
+  // writeback. Skip it.
+  bool Skipped = false;
+  if (IsPrePost && Form == ARMII::StFrm) {
+    ++OpIdx;
+    Skipped = true;
+  }
+
+  // Set first operand
+  if (ImplicitRd)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRd)
+               << ARMII::RegRdShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  // Set second operand
+  if (ImplicitRn)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
+               << ARMII::RegRnShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it. e.g. LDR_PRE.
+  if (!Skipped && TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // AM2Opc packs the add/sub flag, the immediate offset, and any shift
+  // opcode.  When Rn is the implicit PC (a constant-pool load) no offset
+  // operand is present, so the offset is taken as 0.
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  unsigned AM2Opc = (ImplicitRn == ARM::PC)
+    ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+  // Set bit U(23) according to sign of immed value (positive or negative).
+  Binary |= ((ARM_AM::getAM2Op(AM2Opc) == ARM_AM::add ? 1 : 0) <<
+             ARMII::U_BitShift);
+  if (!MO2.getReg()) { // is immediate
+    if (ARM_AM::getAM2Offset(AM2Opc))
+      // Set the value of offset_12 field
+      Binary |= ARM_AM::getAM2Offset(AM2Opc);
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Set bit I(25), because this is not in immediate encoding.
+  Binary |= 1 << ARMII::I_BitShift;
+  assert(TargetRegisterInfo::isPhysicalRegister(MO2.getReg()));
+  // Set bit[3:0] to the corresponding Rm register
+  Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg());
+
+  // If this instr is in scaled register offset/index instruction, set
+  // shift_immed(bit[11:7]) and shift(bit[6:5]) fields.
+  // NOTE(review): the ARMII::ShiftImmShift / ShiftShift enum names look
+  // swapped relative to the fields being set here — verify against ARMII.
+  if (unsigned ShImm = ARM_AM::getAM2Offset(AM2Opc)) {
+    Binary |= getShiftOp(AM2Opc) << ARMII::ShiftImmShift;  // shift
+    Binary |= ShImm              << ARMII::ShiftShift;     // shift_immed
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMiscLoadStoreInstruction(const MachineInstr &MI,
+                                                        unsigned ImplicitRn) {
+  // Emit an addressing-mode-3 (misc) load/store, e.g. the LDRD/STRD pair
+  // handled explicitly below.  ImplicitRn, when non-zero, names a physical
+  // register (e.g. PC) implicit in the operand list but encoded in Rn.
+  const TargetInstrDesc &TID = MI.getDesc();
+  unsigned Form = TID.TSFlags & ARMII::FormMask;
+  bool IsPrePost = (TID.TSFlags & ARMII::IndexModeMask) != 0;
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Operand 0 of a pre- and post-indexed store is the address base
+  // writeback. Skip it.
+  bool Skipped = false;
+  if (IsPrePost && Form == ARMII::StMiscFrm) {
+    ++OpIdx;
+    Skipped = true;
+  }
+
+  // Set first operand
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  // Skip LDRD and STRD's second operand.
+  if (TID.Opcode == ARM::LDRD || TID.Opcode == ARM::STRD)
+    ++OpIdx;
+
+  // Set second operand
+  if (ImplicitRn)
+    // Special handling for implicit use (e.g. PC).
+    Binary |= (ARMRegisterInfo::getRegisterNumbering(ImplicitRn)
+               << ARMII::RegRnShift);
+  else
+    Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRnShift;
+
+  // If this is a two-address operand, skip it. e.g. LDRH_POST.
+  if (!Skipped && TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // AM3Opc packs the add/sub flag and the immediate offset.  When Rn is the
+  // implicit PC (a constant-pool load) no offset operand is present.
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  unsigned AM3Opc = (ImplicitRn == ARM::PC)
+    ? 0 : MI.getOperand(OpIdx+1).getImm();
+
+  // Set bit U(23) according to sign of immed value (positive or negative)
+  Binary |= ((ARM_AM::getAM3Op(AM3Opc) == ARM_AM::add ? 1 : 0) <<
+             ARMII::U_BitShift);
+
+  // If this instr is in register offset/index encoding, set bit[3:0]
+  // to the corresponding Rm register.
+  if (MO2.getReg()) {
+    Binary |= ARMRegisterInfo::getRegisterNumbering(MO2.getReg());
+    emitWordLE(Binary);
+    return;
+  }
+
+  // This instr is in immediate offset/index encoding, set bit 22 to 1.
+  Binary |= 1 << ARMII::AM3_I_BitShift;
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(AM3Opc)) {
+    // The 8-bit offset is split across the immedH (high nibble) and
+    // immedL (low nibble) fields.
+    Binary |= (ImmOffs >> 4) << ARMII::ImmHiShift;  // immedH
+    Binary |= (ImmOffs & 0xF);                      // immedL
+  }
+
+  emitWordLE(Binary);
+}
+
+static unsigned getAddrModeUPBits(unsigned Mode) {
+  // Translate an AM4/AM5 sub-mode into the U (bit 23) and P (bit 24) bits:
+  //   IA - Increment after  - bit U = 1 and bit P = 0
+  //   IB - Increment before - bit U = 1 and bit P = 1
+  //   DA - Decrement after  - bit U = 0 and bit P = 0
+  //   DB - Decrement before - bit U = 0 and bit P = 1
+  switch (Mode) {
+  case ARM_AM::da: return 0;
+  case ARM_AM::db: return 0x1 << ARMII::P_BitShift;
+  case ARM_AM::ia: return 0x1 << ARMII::U_BitShift;
+  case ARM_AM::ib: return 0x3 << ARMII::U_BitShift;   // sets both U and P
+  default: llvm_unreachable("Unknown addressing sub-mode!");
+  }
+  return 0;
+}
+
+void ARMCodeEmitter::emitLoadStoreMultipleInstruction(
+                                                       const MachineInstr &MI) {
+  // Emit an LDM/STM-style (addressing mode 4) instruction: base register,
+  // sub-mode bits, optional writeback, and a 16-bit register-list bitmask.
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Set base address operand
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRnShift;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  const MachineOperand &MO = MI.getOperand(1);
+  Binary |= getAddrModeUPBits(ARM_AM::getAM4SubMode(MO.getImm()));
+
+  // Set bit W(21)
+  if (ARM_AM::getAM4WBFlag(MO.getImm()))
+    Binary |= 0x1 << ARMII::W_BitShift;
+
+  // Set registers.  The register list is assumed to start at operand 5 —
+  // NOTE(review): confirm against the LDM/STM instruction definitions
+  // (operands 2-4 presumably being predicate/writeback operands).
+  for (unsigned i = 5, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    // Implicit operands mark the end of the explicit register list.
+    if (!MO.isReg() || MO.isImplicit())
+      break;
+    unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(MO.getReg());
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           RegNum < 16);
+    Binary |= 0x1 << RegNum;
+  }
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMulFrmInstruction(const MachineInstr &MI) {
+  // Emit a multiply-form instruction (MUL/MLA and 32x32->64 variants).
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Encode S bit if MI modifies CPSR.
+  Binary |= getAddrModeSBit(MI, TID);
+
+  // 32x32->64bit operations have two destination registers. The number
+  // of register definitions will tell us if that's what we're dealing with.
+  unsigned OpIdx = 0;
+  if (TID.getNumDefs() == 2)
+    Binary |= getMachineOpValue (MI, OpIdx++) << ARMII::RegRdLoShift;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdHiShift;
+
+  // Encode Rm
+  Binary |= getMachineOpValue(MI, OpIdx++);
+
+  // Encode Rs
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRsShift;
+
+  // Many multiply instructions (e.g. MLA) have three src operands. Encode
+  // it as Rn (for multiply, that's in the same offset as RdLo).
+  if (TID.getNumOperands() > OpIdx &&
+      !TID.OpInfo[OpIdx].isPredicate() &&
+      !TID.OpInfo[OpIdx].isOptionalDef())
+    Binary |= getMachineOpValue(MI, OpIdx) << ARMII::RegRdLoShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitExtendInstruction(const MachineInstr &MI) {
+  // Emit an extend instruction, which takes one or two source registers
+  // plus an optional rotate amount (0, 8, 16, or 24 bits).
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  const MachineOperand &MO1 = MI.getOperand(OpIdx++);
+  const MachineOperand &MO2 = MI.getOperand(OpIdx);
+  if (MO2.isReg()) {
+    // Two register operand form.
+    // Encode Rn.
+    Binary |= getMachineOpValue(MI, MO1) << ARMII::RegRnShift;
+
+    // Encode Rm.
+    Binary |= getMachineOpValue(MI, MO2);
+    ++OpIdx;
+  } else {
+    // One-register form: the single source goes in Rm.
+    Binary |= getMachineOpValue(MI, MO1);
+  }
+
+  // Encode rot imm (0, 8, 16, or 24) if it has a rotate immediate operand.
+  // The field holds the rotation in units of 8 bits, hence the division.
+  if (MI.getOperand(OpIdx).isImm() &&
+      !TID.OpInfo[OpIdx].isPredicate() &&
+      !TID.OpInfo[OpIdx].isOptionalDef())
+    Binary |= (getMachineOpValue(MI, OpIdx) / 8) << ARMII::ExtRotImmShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMiscArithInstruction(const MachineInstr &MI) {
+  // Emit a misc-arithmetic instruction: Rd plus either a single Rm, or an
+  // Rn/Rm pair with a shift immediate.
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Encode Rd
+  Binary |= getMachineOpValue(MI, OpIdx++) << ARMII::RegRdShift;
+
+  // If the next operand is the last real one (i.e. what follows is only the
+  // predicate / optional def), this is the one-source-register form.
+  const MachineOperand &MO = MI.getOperand(OpIdx++);
+  if (OpIdx == TID.getNumOperands() ||
+      TID.OpInfo[OpIdx].isPredicate() ||
+      TID.OpInfo[OpIdx].isOptionalDef()) {
+    // Encode Rm and it's done.
+    Binary |= getMachineOpValue(MI, MO);
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Encode Rn.
+  Binary |= getMachineOpValue(MI, MO) << ARMII::RegRnShift;
+
+  // Encode Rm.
+  Binary |= getMachineOpValue(MI, OpIdx++);
+
+  // Encode shift_imm.
+  unsigned ShiftAmt = MI.getOperand(OpIdx).getImm();
+  assert(ShiftAmt < 32 && "shift_imm range is 0 to 31!");
+  Binary |= ShiftAmt << ARMII::ShiftShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitBranchInstruction(const MachineInstr &MI) {
+  // Emit a branch instruction with a signed 24-bit offset taken from
+  // operand 0 (presumably resolved via relocation by getMachineOpValue —
+  // see its handling of basic-block operands).
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  if (TID.Opcode == ARM::TPsoft) {
+    llvm_unreachable("ARM::TPsoft FIXME"); // FIXME
+  }
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Set signed_immed_24 field
+  Binary |= getMachineOpValue(MI, 0);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitInlineJumpTable(unsigned JTIndex) {
+  // Emit the words of jump table JTIndex inline at the current PC, one word
+  // per destination basic block.
+  // Remember the base address of the inline jump table.
+  uintptr_t JTBase = MCE.getCurrentPCValue();
+  JTI->addJumpTableBaseAddr(JTIndex, JTBase);
+  DEBUG(errs() << "  ** Jump Table #" << JTIndex << " @ " << (void*)JTBase
+               << '\n');
+
+  // Now emit the jump table entries.
+  const std::vector<MachineBasicBlock*> &MBBs = (*MJTEs)[JTIndex].MBBs;
+  for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
+    if (IsPIC)
+      // DestBB address - JT base.
+      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_pic_jt, JTBase);
+    else
+      // Absolute DestBB address.
+      emitMachineBasicBlock(MBBs[i], ARM::reloc_arm_absolute);
+    // Placeholder word; the real value is filled in when the relocation
+    // recorded above is resolved.
+    emitWordLE(0);
+  }
+}
+
+void ARMCodeEmitter::emitMiscBranchInstruction(const MachineInstr &MI) {
+  // Emit register branches (e.g. BX_RET) and the jump-table pseudo
+  // instructions, which expand into a load-to-PC plus an inline table.
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  // Handle jump tables.
+  if (TID.Opcode == ARM::BR_JTr || TID.Opcode == ARM::BR_JTadd) {
+    // First emit a ldr pc, [] instruction.
+    emitDataProcessingInstruction(MI, ARM::PC);
+
+    // Then emit the inline jump table.
+    unsigned JTIndex =
+      (TID.Opcode == ARM::BR_JTr)
+      ? MI.getOperand(1).getIndex() : MI.getOperand(2).getIndex();
+    emitInlineJumpTable(JTIndex);
+    return;
+  } else if (TID.Opcode == ARM::BR_JTm) {
+    // First emit a ldr pc, [] instruction.
+    emitLoadStoreInstruction(MI, ARM::PC);
+
+    // Then emit the inline jump table.
+    emitInlineJumpTable(MI.getOperand(3).getIndex());
+    return;
+  }
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  if (TID.Opcode == ARM::BX_RET)
+    // The return register is LR.
+    Binary |= ARMRegisterInfo::getRegisterNumbering(ARM::LR);
+  else
+    // otherwise, set the return register
+    Binary |= getMachineOpValue(MI, 0);
+
+  emitWordLE(Binary);
+}
+
+static unsigned encodeVFPRd(const MachineInstr &MI, unsigned OpIdx) {
+  // Encode a VFP destination register.  Double-precision registers fit in
+  // the 4-bit Rd field; single-precision registers put their top four bits
+  // in Rd and their lowest bit in the D bit.
+  bool isSPVFP = false;
+  unsigned RegD = ARMRegisterInfo::getRegisterNumbering(
+                                  MI.getOperand(OpIdx).getReg(), &isSPVFP);
+  if (isSPVFP)
+    return (((RegD & 0x1E) >> 1) << ARMII::RegRdShift) |
+           ((RegD & 0x01) << ARMII::D_BitShift);
+  return RegD << ARMII::RegRdShift;
+}
+
+static unsigned encodeVFPRn(const MachineInstr &MI, unsigned OpIdx) {
+  // Encode a VFP first-source register.  Double-precision registers fit in
+  // the 4-bit Rn field; single-precision registers put their top four bits
+  // in Rn and their lowest bit in the N bit.
+  bool isSPVFP = false;
+  unsigned RegN = ARMRegisterInfo::getRegisterNumbering(
+                                  MI.getOperand(OpIdx).getReg(), &isSPVFP);
+  if (isSPVFP)
+    return (((RegN & 0x1E) >> 1) << ARMII::RegRnShift) |
+           ((RegN & 0x01) << ARMII::N_BitShift);
+  return RegN << ARMII::RegRnShift;
+}
+
+static unsigned encodeVFPRm(const MachineInstr &MI, unsigned OpIdx) {
+  // Encode a VFP second-source register.  Double-precision registers fit in
+  // the low 4-bit Rm field; single-precision registers put their top four
+  // bits in Rm and their lowest bit in the M bit.
+  bool isSPVFP = false;
+  unsigned RegM = ARMRegisterInfo::getRegisterNumbering(
+                                  MI.getOperand(OpIdx).getReg(), &isSPVFP);
+  if (isSPVFP)
+    return ((RegM & 0x1E) >> 1) |
+           ((RegM & 0x01) << ARMII::M_BitShift);
+  return RegM;
+}
+
+void ARMCodeEmitter::emitVFPArithInstruction(const MachineInstr &MI) {
+  // Emit a VFP arithmetic instruction (unary or binary form).
+  const TargetInstrDesc &TID = MI.getDesc();
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+  // The TableGen'erated encoding must leave the D, N and M register-extension
+  // bits clear; the encodeVFPR* helpers below fill them in.  The previous
+  // check ANDed Binary with the shift *amounts* (e.g. ARMII::D_BitShift)
+  // rather than the bits at those positions, so it tested unrelated low bits
+  // instead of D/N/M.
+  assert((Binary & ((1 << ARMII::D_BitShift) |
+                    (1 << ARMII::N_BitShift) |
+                    (1 << ARMII::M_BitShift))) == 0 && "VFP encoding bug!");
+
+  // Encode Dd / Sd.
+  Binary |= encodeVFPRd(MI, OpIdx++);
+
+  // If this is a two-address operand, skip it, e.g. FMACD.
+  if (TID.getOperandConstraint(OpIdx, TOI::TIED_TO) != -1)
+    ++OpIdx;
+
+  // Encode Dn / Sn.
+  if ((TID.TSFlags & ARMII::FormMask) == ARMII::VFPBinaryFrm)
+    Binary |= encodeVFPRn(MI, OpIdx++);
+
+  if (OpIdx == TID.getNumOperands() ||
+      TID.OpInfo[OpIdx].isPredicate() ||
+      TID.OpInfo[OpIdx].isOptionalDef()) {
+    // FCMPEZD etc. has only one operand.
+    emitWordLE(Binary);
+    return;
+  }
+
+  // Encode Dm / Sm.
+  Binary |= encodeVFPRm(MI, OpIdx);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitVFPConversionInstruction(
+      const MachineInstr &MI) {
+  // Emit a VFP conversion instruction.  Which instruction field each operand
+  // lands in depends on the VFPConv form, hence the form-keyed switches.
+  const TargetInstrDesc &TID = MI.getDesc();
+  unsigned Form = TID.TSFlags & ARMII::FormMask;
+
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Place operand 0.
+  switch (Form) {
+  default: break;
+  case ARMII::VFPConv1Frm:
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+    // Encode Dd / Sd.
+    Binary |= encodeVFPRd(MI, 0);
+    break;
+  case ARMII::VFPConv4Frm:
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 0);
+    break;
+  case ARMII::VFPConv5Frm:
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 0);
+    break;
+  }
+
+  // Place operand 1.
+  switch (Form) {
+  default: break;
+  case ARMII::VFPConv1Frm:
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 1);
+    break;
+  case ARMII::VFPConv2Frm:
+  case ARMII::VFPConv3Frm:
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 1);
+    break;
+  case ARMII::VFPConv4Frm:
+  case ARMII::VFPConv5Frm:
+    // Encode Dd / Sd.
+    Binary |= encodeVFPRd(MI, 1);
+    break;
+  }
+
+  // Forms 3 and 5 have a third register operand.
+  if (Form == ARMII::VFPConv5Frm)
+    // Encode Dn / Sn.
+    Binary |= encodeVFPRn(MI, 2);
+  else if (Form == ARMII::VFPConv3Frm)
+    // Encode Dm / Sm.
+    Binary |= encodeVFPRm(MI, 2);
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitVFPLoadStoreInstruction(const MachineInstr &MI) {
+  // Emit a single VFP load/store (addressing mode 5: base register plus
+  // scaled 8-bit immediate offset).
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  unsigned OpIdx = 0;
+
+  // Encode Dd / Sd.
+  Binary |= encodeVFPRd(MI, OpIdx++);
+
+  // Encode address base.
+  const MachineOperand &Base = MI.getOperand(OpIdx++);
+  Binary |= getMachineOpValue(MI, Base) << ARMII::RegRnShift;
+
+  // If there is a non-zero immediate offset, encode it.
+  if (Base.isReg()) {
+    const MachineOperand &Offset = MI.getOperand(OpIdx);
+    if (unsigned ImmOffs = ARM_AM::getAM5Offset(Offset.getImm())) {
+      if (ARM_AM::getAM5Op(Offset.getImm()) == ARM_AM::add)
+        Binary |= 1 << ARMII::U_BitShift;
+      Binary |= ImmOffs;
+      emitWordLE(Binary);
+      return;
+    }
+  }
+
+  // If immediate offset is omitted, default to +0 (U bit set, offset 0).
+  Binary |= 1 << ARMII::U_BitShift;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitVFPLoadStoreMultipleInstruction(
+                                                       const MachineInstr &MI) {
+  // Emit a VFP load/store-multiple (FLDM/FSTM family, addressing mode 5).
+  // Part of binary is determined by TableGen.
+  unsigned Binary = getBinaryCodeForInstr(MI);
+
+  // Set the conditional execution predicate
+  Binary |= II->getPredicate(&MI) << ARMII::CondShift;
+
+  // Set base address operand
+  Binary |= getMachineOpValue(MI, 0) << ARMII::RegRnShift;
+
+  // Set addressing mode by modifying bits U(23) and P(24)
+  const MachineOperand &MO = MI.getOperand(1);
+  Binary |= getAddrModeUPBits(ARM_AM::getAM5SubMode(MO.getImm()));
+
+  // Set bit W(21)
+  if (ARM_AM::getAM5WBFlag(MO.getImm()))
+    Binary |= 0x1 << ARMII::W_BitShift;
+
+  // First register is encoded in Dd.
+  Binary |= encodeVFPRd(MI, 5);
+
+  // Number of registers are encoded in offset field.  Implicit operands mark
+  // the end of the explicit register list.
+  unsigned NumRegs = 1;
+  for (unsigned i = 6, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      break;
+    ++NumRegs;
+  }
+  // The offset field holds the transfer size in 32-bit words; each register
+  // counts as two words.  NOTE(review): that is only right for
+  // double-precision registers — confirm single-precision lists are not
+  // routed through here.
+  Binary |= NumRegs * 2;
+
+  emitWordLE(Binary);
+}
+
+void ARMCodeEmitter::emitMiscInstruction(const MachineInstr &MI) {
+  // Start from the TableGen'erated encoding and fold in the conditional
+  // execution predicate; nothing else needs to be filled in.
+  unsigned Encoding = getBinaryCodeForInstr(MI) |
+                      (II->getPredicate(&MI) << ARMII::CondShift);
+
+  emitWordLE(Encoding);
+}
+
+#include "ARMGenCodeEmitter.inc"
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
new file mode 100644
index 0000000..8fa3c04
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -0,0 +1,1825 @@
+//===-- ARMConstantIslandPass.cpp - ARM constant islands --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that splits the constant pool up into 'islands'
+// which are scattered through-out the function.  This is required due to the
+// limited pc-relative displacements that ARM has.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-cp-islands"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMInstrInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include <algorithm>
+using namespace llvm;
+
+// Pass-wide statistics, reported with -stats.
+STATISTIC(NumCPEs,       "Number of constpool entries");
+STATISTIC(NumSplit,      "Number of uncond branches inserted");
+STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
+STATISTIC(NumUBrFixed,   "Number of uncond branches fixed");
+STATISTIC(NumTBs,        "Number of table branches generated");
+STATISTIC(NumT2CPShrunk, "Number of Thumb2 constantpool instructions shrunk");
+STATISTIC(NumT2BrShrunk, "Number of Thumb2 immediate branches shrunk");
+STATISTIC(NumCBZ,        "Number of CBZ / CBNZ formed");
+STATISTIC(NumJTMoved,    "Number of jump table destination blocks moved");
+STATISTIC(NumJTInserted, "Number of jump table intermediate blocks inserted");
+
+
+// When enabled (the default), basic-block layout may be rearranged so that
+// Thumb2 jump tables can use the compact TB[BH] instructions.
+static cl::opt<bool>
+AdjustJumpTableBlocks("arm-adjust-jump-tables", cl::Hidden, cl::init(true),
+          cl::desc("Adjust basic block layout to better use TB[BH]"));
+
+namespace {
+  /// ARMConstantIslands - Due to limited PC-relative displacements, ARM
+  /// requires constant pool entries to be scattered among the instructions
+  /// inside a function.  To do this, it completely ignores the normal LLVM
+  /// constant pool; instead, it places constants wherever it feels like with
+  /// special instructions.
+  ///
+  /// The terminology used in this pass includes:
+  ///   Islands - Clumps of constants placed in the function.
+  ///   Water   - Potential places where an island could be formed.
+  ///   CPE     - A constant pool entry that has been placed somewhere, which
+  ///             tracks a list of users.
+  class ARMConstantIslands : public MachineFunctionPass {
+    /// BBSizes - The size of each MachineBasicBlock in bytes of code, indexed
+    /// by MBB Number.  The two-byte pads required for Thumb alignment are
+    /// counted as part of the following block (i.e., the offset and size for
+    /// a padded block will both be ==2 mod 4).
+    std::vector<unsigned> BBSizes;
+
+    /// BBOffsets - the offset of each MBB in bytes, starting from 0.
+    /// The two-byte pads required for Thumb alignment are counted as part of
+    /// the following block.
+    std::vector<unsigned> BBOffsets;
+
+    /// WaterList - A sorted list of basic blocks where islands could be placed
+    /// (i.e. blocks that don't fall through to the following block, due
+    /// to a return, unreachable, or unconditional branch).
+    std::vector<MachineBasicBlock*> WaterList;
+
+    /// NewWaterList - The subset of WaterList that was created since the
+    /// previous iteration by inserting unconditional branches.
+    SmallSet<MachineBasicBlock*, 4> NewWaterList;
+
+    typedef std::vector<MachineBasicBlock*>::iterator water_iterator;
+
+    /// CPUser - One user of a constant pool, keeping the machine instruction
+    /// pointer, the constant pool being referenced, and the max displacement
+    /// allowed from the instruction to the CP.  The HighWaterMark records the
+    /// highest basic block where a new CPEntry can be placed.  To ensure this
+    /// pass terminates, the CP entries are initially placed at the end of the
+    /// function and then move monotonically to lower addresses.  The
+    /// exception to this rule is when the current CP entry for a particular
+    /// CPUser is out of range, but there is another CP entry for the same
+    /// constant value in range.  We want to use the existing in-range CP
+    /// entry, but if it later moves out of range, the search for new water
+    /// should resume where it left off.  The HighWaterMark is used to record
+    /// that point.
+    struct CPUser {
+      MachineInstr *MI;
+      MachineInstr *CPEMI;
+      MachineBasicBlock *HighWaterMark;
+      unsigned MaxDisp;
+      bool NegOk;       // True if a negative displacement is acceptable.
+      bool IsSoImm;     // True if MaxDisp is a shifter-operand immediate.
+      CPUser(MachineInstr *mi, MachineInstr *cpemi, unsigned maxdisp,
+             bool neg, bool soimm)
+        : MI(mi), CPEMI(cpemi), MaxDisp(maxdisp), NegOk(neg), IsSoImm(soimm) {
+        HighWaterMark = CPEMI->getParent();
+      }
+    };
+
+    /// CPUsers - Keep track of all of the machine instructions that use various
+    /// constant pools and their max displacement.
+    std::vector<CPUser> CPUsers;
+
+    /// CPEntry - One per constant pool entry, keeping the machine instruction
+    /// pointer, the constpool index, and the number of CPUser's which
+    /// reference this entry.
+    struct CPEntry {
+      MachineInstr *CPEMI;
+      unsigned CPI;
+      unsigned RefCount;
+      CPEntry(MachineInstr *cpemi, unsigned cpi, unsigned rc = 0)
+        : CPEMI(cpemi), CPI(cpi), RefCount(rc) {}
+    };
+
+    /// CPEntries - Keep track of all of the constant pool entry machine
+    /// instructions. For each original constpool index (i.e. those that
+    /// existed upon entry to this pass), it keeps a vector of entries.
+    /// Original elements are cloned as we go along; the clones are
+    /// put in the vector of the original element, but have distinct CPIs.
+    std::vector<std::vector<CPEntry> > CPEntries;
+
+    /// ImmBranch - One per immediate branch, keeping the machine instruction
+    /// pointer, conditional or unconditional, the max displacement,
+    /// and (if isCond is true) the corresponding unconditional branch
+    /// opcode.
+    struct ImmBranch {
+      MachineInstr *MI;
+      unsigned MaxDisp : 31;
+      bool isCond : 1;
+      int UncondBr;
+      ImmBranch(MachineInstr *mi, unsigned maxdisp, bool cond, int ubr)
+        : MI(mi), MaxDisp(maxdisp), isCond(cond), UncondBr(ubr) {}
+    };
+
+    /// ImmBranches - Keep track of all the immediate branch instructions.
+    ///
+    std::vector<ImmBranch> ImmBranches;
+
+    /// PushPopMIs - Keep track of all the Thumb push / pop instructions.
+    ///
+    SmallVector<MachineInstr*, 4> PushPopMIs;
+
+    /// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
+    SmallVector<MachineInstr*, 4> T2JumpTables;
+
+    /// HasFarJump - True if any far jump instruction has been emitted during
+    /// the branch fix up pass.
+    bool HasFarJump;
+
+    /// HasInlineAsm - True if the function contains inline assembly.
+    bool HasInlineAsm;
+
+    // Cached target hooks and per-function info, set up in
+    // runOnMachineFunction.
+    const TargetInstrInfo *TII;
+    const ARMSubtarget *STI;
+    ARMFunctionInfo *AFI;
+    // Which flavor of Thumb (if any) the current function is compiled as.
+    bool isThumb;
+    bool isThumb1;
+    bool isThumb2;
+  public:
+    static char ID;   // Pass identification.
+    ARMConstantIslands() : MachineFunctionPass(&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "ARM constant island placement and branch shortening pass";
+    }
+
+  private:
+    // Helper stages of the pass; see the definitions below for details.
+    void DoInitialPlacement(MachineFunction &MF,
+                            std::vector<MachineInstr*> &CPEMIs);
+    CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
+    void JumpTableFunctionScan(MachineFunction &MF);
+    void InitialFunctionScan(MachineFunction &MF,
+                             const std::vector<MachineInstr*> &CPEMIs);
+    MachineBasicBlock *SplitBlockBeforeInstr(MachineInstr *MI);
+    void UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB);
+    void AdjustBBOffsetsAfter(MachineBasicBlock *BB, int delta);
+    bool DecrementOldEntry(unsigned CPI, MachineInstr* CPEMI);
+    int LookForExistingCPEntry(CPUser& U, unsigned UserOffset);
+    bool LookForWater(CPUser&U, unsigned UserOffset, water_iterator &WaterIter);
+    void CreateNewWater(unsigned CPUserIndex, unsigned UserOffset,
+                        MachineBasicBlock *&NewMBB);
+    bool HandleConstantPoolUser(MachineFunction &MF, unsigned CPUserIndex);
+    void RemoveDeadCPEMI(MachineInstr *CPEMI);
+    bool RemoveUnusedCPEntries();
+    bool CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
+                      MachineInstr *CPEMI, unsigned Disp, bool NegOk,
+                      bool DoDump = false);
+    bool WaterIsInRange(unsigned UserOffset, MachineBasicBlock *Water,
+                        CPUser &U);
+    bool OffsetIsInRange(unsigned UserOffset, unsigned TrialOffset,
+                         unsigned Disp, bool NegativeOK, bool IsSoImm = false);
+    bool BBIsInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
+    bool FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br);
+    bool FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br);
+    bool FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br);
+    bool UndoLRSpillRestore();
+    bool OptimizeThumb2Instructions(MachineFunction &MF);
+    bool OptimizeThumb2Branches(MachineFunction &MF);
+    bool ReorderThumb2JumpTables(MachineFunction &MF);
+    bool OptimizeThumb2JumpTables(MachineFunction &MF);
+    MachineBasicBlock *AdjustJTTargetBlockForward(MachineBasicBlock *BB,
+                                                  MachineBasicBlock *JTBB);
+
+    unsigned GetOffsetOf(MachineInstr *MI) const;
+    void dumpBBs();
+    void verify(MachineFunction &MF);
+  };
+  char ARMConstantIslands::ID = 0;
+}
+
+/// verify - check BBOffsets, BBSizes, alignment of islands
+void ARMConstantIslands::verify(MachineFunction &MF) {
+  // Sanity-check that per-block offsets/sizes are mutually consistent, that
+  // Thumb constant-pool islands keep their alignment invariant, and that
+  // every constant-pool user is still within range of its entry.
+  assert(BBOffsets.size() == BBSizes.size());
+  for (unsigned i = 1, e = BBOffsets.size(); i != e; ++i)
+    assert(BBOffsets[i-1]+BBSizes[i-1] == BBOffsets[i]);
+  if (!isThumb)
+    return;
+#ifndef NDEBUG
+  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    if (!MBB->empty() &&
+        MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
+      unsigned MBBId = MBB->getNumber();
+      // An island's offset and size must agree mod 4 (unless inline asm
+      // makes the recorded sizes unreliable).
+      assert(HasInlineAsm ||
+             (BBOffsets[MBBId]%4 == 0 && BBSizes[MBBId]%4 == 0) ||
+             (BBOffsets[MBBId]%4 != 0 && BBSizes[MBBId]%4 != 0));
+    }
+  }
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned UserOffset = GetOffsetOf(U.MI) + (isThumb ? 4 : 8);
+    unsigned CPEOffset  = GetOffsetOf(U.CPEMI);
+    unsigned Disp = UserOffset < CPEOffset ? CPEOffset - UserOffset :
+      UserOffset - CPEOffset;
+    // This previously read 'Disp <= U.MaxDisp || "..."', which is always
+    // true (a string literal is truthy), so the range check never fired.
+    // Use && so the assertion actually verifies the displacement.
+    assert(Disp <= U.MaxDisp && "Constant pool entry out of range!");
+  }
+#endif
+}
+
+/// print block size and offset information - debugging
+void ARMConstantIslands::dumpBBs() {
+  // One line per basic block: its index, byte offset, and byte size.
+  for (unsigned Idx = 0, NumBBs = BBOffsets.size(); Idx != NumBBs; ++Idx) {
+    DEBUG(errs() << "block " << Idx << " offset " << BBOffsets[Idx]
+                 << " size " << BBSizes[Idx] << "\n");
+  }
+}
+
+/// createARMConstantIslandPass - returns an instance of the constpool
+/// island pass.  The caller takes ownership of the returned pass.
+FunctionPass *llvm::createARMConstantIslandPass() {
+  return new ARMConstantIslands();
+}
+
/// runOnMachineFunction - Entry point for the pass: place constant pool
/// entries ("islands") inside the function body and lengthen out-of-range
/// pc-relative branches, iterating until all displacements fit.  Returns
/// true if the function was modified.
bool ARMConstantIslands::runOnMachineFunction(MachineFunction &MF) {
  MachineConstantPool &MCP = *MF.getConstantPool();

  // Cache target hooks and the per-function ARM info for use by helpers.
  TII = MF.getTarget().getInstrInfo();
  AFI = MF.getInfo<ARMFunctionInfo>();
  STI = &MF.getTarget().getSubtarget<ARMSubtarget>();

  isThumb = AFI->isThumbFunction();
  isThumb1 = AFI->isThumb1OnlyFunction();
  isThumb2 = AFI->isThumb2Function();

  HasFarJump = false;
  HasInlineAsm = false;

  // Renumber all of the machine basic blocks in the function, guaranteeing that
  // the numbers agree with the position of the block in the function.
  MF.RenumberBlocks();

  // Try to reorder and otherwise adjust the block layout to make good use
  // of the TB[BH] instructions.
  bool MadeChange = false;
  if (isThumb2 && AdjustJumpTableBlocks) {
    JumpTableFunctionScan(MF);
    MadeChange |= ReorderThumb2JumpTables(MF);
    // Data is out of date, so clear it. It'll be re-computed later.
    T2JumpTables.clear();
    // Blocks may have shifted around. Keep the numbering up to date.
    MF.RenumberBlocks();
  }

  // Thumb1 functions containing constant pools get 4-byte alignment.
  // This is so we can keep exact track of where the alignment padding goes.

  // ARM and Thumb2 functions need to be 4-byte aligned.
  if (!isThumb1)
    MF.EnsureAlignment(2);  // 2 = log2(4)

  // Perform the initial placement of the constant pool entries.  To start with,
  // we put them all at the end of the function.
  std::vector<MachineInstr*> CPEMIs;
  if (!MCP.isEmpty()) {
    DoInitialPlacement(MF, CPEMIs);
    // Thumb1 only needs the 4-byte alignment when it actually has a pool.
    if (isThumb1)
      MF.EnsureAlignment(2);  // 2 = log2(4)
  }

  /// The next UID to take is the first unused one.
  AFI->initConstPoolEntryUId(CPEMIs.size());

  // Do the initial scan of the function, building up information about the
  // sizes of each block, the location of all the water, and finding all of the
  // constant pool users.
  InitialFunctionScan(MF, CPEMIs);
  CPEMIs.clear();

  /// Remove dead constant pool entries.
  RemoveUnusedCPEntries();

  // Iteratively place constant pool entries and fix up branches until there
  // is no change.  The caps of 30 iterations below are safety nets; both
  // loops are expected to converge long before hitting them.
  unsigned NoCPIters = 0, NoBRIters = 0;
  while (true) {
    bool CPChange = false;
    for (unsigned i = 0, e = CPUsers.size(); i != e; ++i)
      CPChange |= HandleConstantPoolUser(MF, i);
    if (CPChange && ++NoCPIters > 30)
      llvm_unreachable("Constant Island pass failed to converge!");
    DEBUG(dumpBBs());

    // Clear NewWaterList now.  If we split a block for branches, it should
    // appear as "new water" for the next iteration of constant pool placement.
    NewWaterList.clear();

    bool BRChange = false;
    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
      BRChange |= FixUpImmediateBr(MF, ImmBranches[i]);
    if (BRChange && ++NoBRIters > 30)
      llvm_unreachable("Branch Fix Up pass failed to converge!");
    DEBUG(dumpBBs());

    if (!CPChange && !BRChange)
      break;
    MadeChange = true;
  }

  // Shrink 32-bit Thumb2 branch, load, and store instructions.
  if (isThumb2)
    MadeChange |= OptimizeThumb2Instructions(MF);

  // After a while, this might be made debug-only, but it is not expensive.
  verify(MF);

  // If LR has been forced spilled and no far jumps (i.e. BL) has been issued.
  // Undo the spill / restore of LR if possible.
  if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
    MadeChange |= UndoLRSpillRestore();

  // Release all per-function state.
  BBSizes.clear();
  BBOffsets.clear();
  WaterList.clear();
  CPUsers.clear();
  CPEntries.clear();
  ImmBranches.clear();
  PushPopMIs.clear();
  T2JumpTables.clear();

  return MadeChange;
}
+
/// DoInitialPlacement - Perform the initial placement of the constant pool
/// entries.  To start with, we put them all at the end of the function in a
/// newly created block, recording the CONSTPOOL_ENTRY for constant i at
/// CPEMIs[i].
void ARMConstantIslands::DoInitialPlacement(MachineFunction &MF,
                                        std::vector<MachineInstr*> &CPEMIs) {
  // Create the basic block to hold the CPE's.
  MachineBasicBlock *BB = MF.CreateMachineBasicBlock();
  MF.push_back(BB);

  // Add all of the constants from the constant pool to the end block, use an
  // identity mapping of CPI's to CPE's.
  const std::vector<MachineConstantPoolEntry> &CPs =
    MF.getConstantPool()->getConstants();

  const TargetData &TD = *MF.getTarget().getTargetData();
  for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
    unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
    // Verify that all constant pool entries are a multiple of 4 bytes.  If not,
    // we would have to pad them out or something so that instructions stay
    // aligned.
    assert((Size & 3) == 0 && "CP Entry not multiple of 4 bytes!");
    // CONSTPOOL_ENTRY operands: island label id, constpool index, byte size.
    MachineInstr *CPEMI =
      BuildMI(BB, DebugLoc::getUnknownLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
                           .addImm(i).addConstantPoolIndex(i).addImm(Size);
    CPEMIs.push_back(CPEMI);

    // Add a new CPEntry, but no corresponding CPUser yet.
    std::vector<CPEntry> CPEs;
    CPEs.push_back(CPEntry(CPEMI, i));
    CPEntries.push_back(CPEs);
    NumCPEs++;
    DEBUG(errs() << "Moved CPI#" << i << " to end of function as #" << i
                 << "\n");
  }
}
+
+/// BBHasFallthrough - Return true if the specified basic block can fallthrough
+/// into the block immediately after it.
+static bool BBHasFallthrough(MachineBasicBlock *MBB) {
+  // Get the next machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+  if (llvm::next(MBBI) == MBB->getParent()->end())  // Can't fall off end of function.
+    return false;
+
+  MachineBasicBlock *NextBB = llvm::next(MBBI);
+  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
+       E = MBB->succ_end(); I != E; ++I)
+    if (*I == NextBB)
+      return true;
+
+  return false;
+}
+
+/// findConstPoolEntry - Given the constpool index and CONSTPOOL_ENTRY MI,
+/// look up the corresponding CPEntry.
+ARMConstantIslands::CPEntry
+*ARMConstantIslands::findConstPoolEntry(unsigned CPI,
+                                        const MachineInstr *CPEMI) {
+  std::vector<CPEntry> &CPEs = CPEntries[CPI];
+  // Number of entries per constpool index should be small, just do a
+  // linear search.
+  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
+    if (CPEs[i].CPEMI == CPEMI)
+      return &CPEs[i];
+  }
+  return NULL;
+}
+
+/// JumpTableFunctionScan - Do a scan of the function, building up
+/// information about the sizes of each block and the locations of all
+/// the jump tables.
+void ARMConstantIslands::JumpTableFunctionScan(MachineFunction &MF) {
+  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+       MBBI != E; ++MBBI) {
+    MachineBasicBlock &MBB = *MBBI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+         I != E; ++I)
+      if (I->getDesc().isBranch() && I->getOpcode() == ARM::t2BR_JT)
+        T2JumpTables.push_back(I);
+  }
+}
+
/// InitialFunctionScan - Do the initial scan of the function, building up
/// information about the sizes of each block, the location of all the water,
/// and finding all of the constant pool users.  Populates BBSizes, BBOffsets,
/// WaterList, ImmBranches, PushPopMIs, T2JumpTables and CPUsers, and sets
/// HasInlineAsm.
void ARMConstantIslands::InitialFunctionScan(MachineFunction &MF,
                                 const std::vector<MachineInstr*> &CPEMIs) {
  // First thing, see if the function has any inline assembly in it. If so,
  // we have to be conservative about alignment assumptions, as we don't
  // know for sure the size of any instructions in the inline assembly.
  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
       MBBI != E; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I)
      if (I->getOpcode() == ARM::INLINEASM)
        HasInlineAsm = true;
  }

  // Now go back through the instructions and build up our data structures
  unsigned Offset = 0;
  for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
       MBBI != E; ++MBBI) {
    MachineBasicBlock &MBB = *MBBI;

    // If this block doesn't fall through into the next MBB, then this is
    // 'water' that a constant pool island could be placed.
    if (!BBHasFallthrough(&MBB))
      WaterList.push_back(&MBB);

    unsigned MBBSize = 0;
    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
         I != E; ++I) {
      // Add instruction size to MBBSize.
      MBBSize += TII->GetInstSizeInBytes(I);

      int Opc = I->getOpcode();
      if (I->getDesc().isBranch()) {
        // Bits/Scale give the branch's displacement field width and its
        // scaling; UOpc is the unconditional form used if the branch must
        // later be rewritten (e.g. a conditional inverted around a long jump).
        bool isCond = false;
        unsigned Bits = 0;
        unsigned Scale = 1;
        int UOpc = Opc;
        switch (Opc) {
        default:
          continue;  // Ignore other JT branches
        case ARM::tBR_JTr:
          // A Thumb1 table jump may involve padding; for the offsets to
          // be right, functions containing these must be 4-byte aligned.
          MF.EnsureAlignment(2U);
          if ((Offset+MBBSize)%4 != 0 || HasInlineAsm)
            // FIXME: Add a pseudo ALIGN instruction instead.
            MBBSize += 2;           // padding
          continue;   // Does not get an entry in ImmBranches
        case ARM::t2BR_JT:
          T2JumpTables.push_back(I);
          continue;   // Does not get an entry in ImmBranches
        case ARM::Bcc:
          isCond = true;
          UOpc = ARM::B;
          // Fallthrough
        case ARM::B:
          Bits = 24;
          Scale = 4;
          break;
        case ARM::tBcc:
          isCond = true;
          UOpc = ARM::tB;
          Bits = 8;
          Scale = 2;
          break;
        case ARM::tB:
          Bits = 11;
          Scale = 2;
          break;
        case ARM::t2Bcc:
          isCond = true;
          UOpc = ARM::t2B;
          Bits = 20;
          Scale = 2;
          break;
        case ARM::t2B:
          Bits = 24;
          Scale = 2;
          break;
        }

        // Record this immediate branch.  Max displacement is the largest
        // positive value of a (Bits)-bit signed field, scaled.
        unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
        ImmBranches.push_back(ImmBranch(I, MaxOffs, isCond, UOpc));
      }

      if (Opc == ARM::tPUSH || Opc == ARM::tPOP_RET)
        PushPopMIs.push_back(I);

      // Islands themselves are not constant pool users.
      if (Opc == ARM::CONSTPOOL_ENTRY)
        continue;

      // Scan the instructions for constant pool operands.
      for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
        if (I->getOperand(op).isCPI()) {
          // We found one.  The addressing mode tells us the max displacement
          // from the PC that this instruction permits.

          // Basic size info comes from the TSFlags field.
          unsigned Bits = 0;
          unsigned Scale = 1;
          bool NegOk = false;
          bool IsSoImm = false;

          switch (Opc) {
          default:
            llvm_unreachable("Unknown addressing mode for CP reference!");
            break;

          // Taking the address of a CP entry.
          case ARM::LEApcrel:
            // This takes a SoImm, which is 8 bit immediate rotated. We'll
            // pretend the maximum offset is 255 * 4. Since each instruction
            // 4 byte wide, this is always correct. We'll check for other
            // displacements that fits in a SoImm as well.
            Bits = 8;
            Scale = 4;
            NegOk = true;
            IsSoImm = true;
            break;
          case ARM::t2LEApcrel:
            Bits = 12;
            NegOk = true;
            break;
          case ARM::tLEApcrel:
            Bits = 8;
            Scale = 4;
            break;

          case ARM::LDR:
          case ARM::LDRcp:
          case ARM::t2LDRpci:
            Bits = 12;  // +-offset_12
            NegOk = true;
            break;

          case ARM::tLDRpci:
          case ARM::tLDRcp:
            Bits = 8;
            Scale = 4;  // +(offset_8*4)
            break;

          case ARM::VLDRD:
          case ARM::VLDRS:
            Bits = 8;
            Scale = 4;  // +-(offset_8*4)
            NegOk = true;
            break;
          }

          // Remember that this is a user of a CP entry.
          unsigned CPI = I->getOperand(op).getIndex();
          MachineInstr *CPEMI = CPEMIs[CPI];
          unsigned MaxOffs = ((1 << Bits)-1) * Scale;
          CPUsers.push_back(CPUser(I, CPEMI, MaxOffs, NegOk, IsSoImm));

          // Increment corresponding CPEntry reference count.
          CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
          assert(CPE && "Cannot find a corresponding CPEntry!");
          CPE->RefCount++;

          // Instructions can only use one CP entry, don't bother scanning the
          // rest of the operands.
          break;
        }
    }

    // In thumb mode, if this block is a constpool island, we may need padding
    // so it's aligned on 4 byte boundary.
    if (isThumb &&
        !MBB.empty() &&
        MBB.begin()->getOpcode() == ARM::CONSTPOOL_ENTRY &&
        ((Offset%4) != 0 || HasInlineAsm))
      MBBSize += 2;

    BBSizes.push_back(MBBSize);
    BBOffsets.push_back(Offset);
    Offset += MBBSize;
  }
}
+
+/// GetOffsetOf - Return the current offset of the specified machine instruction
+/// from the start of the function.  This offset changes as stuff is moved
+/// around inside the function.
+unsigned ARMConstantIslands::GetOffsetOf(MachineInstr *MI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+
+  // The offset is composed of two things: the sum of the sizes of all MBB's
+  // before this instruction's block, and the offset from the start of the block
+  // it is in.
+  unsigned Offset = BBOffsets[MBB->getNumber()];
+
+  // If we're looking for a CONSTPOOL_ENTRY in Thumb, see if this block has
+  // alignment padding, and compensate if so.
+  if (isThumb &&
+      MI->getOpcode() == ARM::CONSTPOOL_ENTRY &&
+      (Offset%4 != 0 || HasInlineAsm))
+    Offset += 2;
+
+  // Sum instructions before MI in MBB.
+  for (MachineBasicBlock::iterator I = MBB->begin(); ; ++I) {
+    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
+    if (&*I == MI) return Offset;
+    Offset += TII->GetInstSizeInBytes(I);
+  }
+}
+
+/// CompareMBBNumbers - Little predicate function to sort the WaterList by MBB
+/// ID.
+static bool CompareMBBNumbers(const MachineBasicBlock *LHS,
+                              const MachineBasicBlock *RHS) {
+  return LHS->getNumber() < RHS->getNumber();
+}
+
+/// UpdateForInsertedWaterBlock - When a block is newly inserted into the
+/// machine function, it upsets all of the block numbers.  Renumber the blocks
+/// and update the arrays that parallel this numbering.
+void ARMConstantIslands::UpdateForInsertedWaterBlock(MachineBasicBlock *NewBB) {
+  // Renumber the MBB's to keep them consequtive.
+  NewBB->getParent()->RenumberBlocks(NewBB);
+
+  // Insert a size into BBSizes to align it properly with the (newly
+  // renumbered) block numbers.
+  BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);
+
+  // Likewise for BBOffsets.
+  BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);
+
+  // Next, update WaterList.  Specifically, we need to add NewMBB as having
+  // available water after it.
+  water_iterator IP =
+    std::lower_bound(WaterList.begin(), WaterList.end(), NewBB,
+                     CompareMBBNumbers);
+  WaterList.insert(IP, NewBB);
+}
+
+
/// SplitBlockBeforeInstr - Split the basic block containing MI into two
/// blocks, which are joined by an unconditional branch.  Update data
/// structures and renumber blocks to account for this change and returns the
/// newly created block.  MI becomes the first instruction of the new block.
MachineBasicBlock *ARMConstantIslands::SplitBlockBeforeInstr(MachineInstr *MI) {
  MachineBasicBlock *OrigBB = MI->getParent();
  MachineFunction &MF = *OrigBB->getParent();

  // Create a new MBB for the code after the OrigBB.
  MachineBasicBlock *NewBB =
    MF.CreateMachineBasicBlock(OrigBB->getBasicBlock());
  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
  MF.insert(MBBI, NewBB);

  // Splice the instructions starting with MI over to NewBB.
  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());

  // Add an unconditional branch from OrigBB to NewBB.
  // Note the new unconditional branch is not being recorded.
  // There doesn't seem to be meaningful DebugInfo available; this doesn't
  // correspond to anything in the source.
  unsigned Opc = isThumb ? (isThumb2 ? ARM::t2B : ARM::tB) : ARM::B;
  BuildMI(OrigBB, DebugLoc::getUnknownLoc(), TII->get(Opc)).addMBB(NewBB);
  NumSplit++;

  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
  while (!OrigBB->succ_empty()) {
    MachineBasicBlock *Succ = *OrigBB->succ_begin();
    OrigBB->removeSuccessor(Succ);
    NewBB->addSuccessor(Succ);

    // This pass should be run after register allocation, so there should be no
    // PHI nodes to update.
    assert((Succ->empty() || !Succ->begin()->isPHI())
           && "PHI nodes should be eliminated by now!");
  }

  // OrigBB branches to NewBB.
  OrigBB->addSuccessor(NewBB);

  // Update internal data structures to account for the newly inserted MBB.
  // This is almost the same as UpdateForInsertedWaterBlock, except that
  // the Water goes after OrigBB, not NewBB.
  MF.RenumberBlocks(NewBB);

  // Insert a size into BBSizes to align it properly with the (newly
  // renumbered) block numbers.
  BBSizes.insert(BBSizes.begin()+NewBB->getNumber(), 0);

  // Likewise for BBOffsets.
  BBOffsets.insert(BBOffsets.begin()+NewBB->getNumber(), 0);

  // Next, update WaterList.  Specifically, we need to add OrigMBB as having
  // available water after it (but not if it's already there, which happens
  // when splitting before a conditional branch that is followed by an
  // unconditional branch - in that case we want to insert NewBB).
  water_iterator IP =
    std::lower_bound(WaterList.begin(), WaterList.end(), OrigBB,
                     CompareMBBNumbers);
  MachineBasicBlock* WaterBB = *IP;
  if (WaterBB == OrigBB)
    WaterList.insert(llvm::next(IP), NewBB);
  else
    WaterList.insert(IP, OrigBB);
  // Mark OrigBB as fresh water so the CPE high-water-mark check allows it.
  NewWaterList.insert(OrigBB);

  // Figure out how large the first NewMBB is.  (It cannot
  // contain a constpool_entry or tablejump.)
  unsigned NewBBSize = 0;
  for (MachineBasicBlock::iterator I = NewBB->begin(), E = NewBB->end();
       I != E; ++I)
    NewBBSize += TII->GetInstSizeInBytes(I);

  unsigned OrigBBI = OrigBB->getNumber();
  unsigned NewBBI = NewBB->getNumber();
  // Set the size of NewBB in BBSizes.
  BBSizes[NewBBI] = NewBBSize;

  // We removed instructions from UserMBB, subtract that off from its size.
  // Add 2 or 4 to the block to count the unconditional branch we added to it.
  int delta = isThumb1 ? 2 : 4;
  BBSizes[OrigBBI] -= NewBBSize - delta;

  // ...and adjust BBOffsets for NewBB accordingly.
  BBOffsets[NewBBI] = BBOffsets[OrigBBI] + BBSizes[OrigBBI];

  // All BBOffsets following these blocks must be modified.
  AdjustBBOffsetsAfter(NewBB, delta);

  return NewBB;
}
+
/// OffsetIsInRange - Checks whether UserOffset (the location of a constant pool
/// reference) is within MaxDisp of TrialOffset (a proposed location of a
/// constant pool entry).  NegativeOK permits the CPE to precede the user;
/// IsSoImm marks an ARM shifter-operand immediate user (rotated 8-bit, see
/// the LEApcrel handling in InitialFunctionScan) — currently only partially
/// exploited, per the FIXMEs below.
bool ARMConstantIslands::OffsetIsInRange(unsigned UserOffset,
                                         unsigned TrialOffset, unsigned MaxDisp,
                                         bool NegativeOK, bool IsSoImm) {
  // On Thumb offsets==2 mod 4 are rounded down by the hardware for
  // purposes of the displacement computation; compensate for that here.
  // Effectively, the valid range of displacements is 2 bytes smaller for such
  // references.
  unsigned TotalAdj = 0;
  if (isThumb && UserOffset%4 !=0) {
    UserOffset -= 2;
    TotalAdj = 2;
  }
  // CPEs will be rounded up to a multiple of 4.
  if (isThumb && TrialOffset%4 != 0) {
    TrialOffset += 2;
    TotalAdj += 2;
  }

  // In Thumb2 mode, later branch adjustments can shift instructions up and
  // cause alignment change. In the worst case scenario this can cause the
  // user's effective address to be subtracted by 2 and the CPE's address to
  // be plus 2.
  if (isThumb2 && TotalAdj != 4)
    MaxDisp -= (4 - TotalAdj);

  if (UserOffset <= TrialOffset) {
    // User before the Trial.
    if (TrialOffset - UserOffset <= MaxDisp)
      return true;
    // FIXME: Make use full range of soimm values.
  } else if (NegativeOK) {
    // Trial before the user; only legal when the addressing mode allows
    // negative displacements.
    if (UserOffset - TrialOffset <= MaxDisp)
      return true;
    // FIXME: Make use full range of soimm values.
  }
  return false;
}
+
+/// WaterIsInRange - Returns true if a CPE placed after the specified
+/// Water (a basic block) will be in range for the specific MI.
+
+bool ARMConstantIslands::WaterIsInRange(unsigned UserOffset,
+                                        MachineBasicBlock* Water, CPUser &U) {
+  unsigned MaxDisp = U.MaxDisp;
+  unsigned CPEOffset = BBOffsets[Water->getNumber()] +
+                       BBSizes[Water->getNumber()];
+
+  // If the CPE is to be inserted before the instruction, that will raise
+  // the offset of the instruction.
+  if (CPEOffset < UserOffset)
+    UserOffset += U.CPEMI->getOperand(2).getImm();
+
+  return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, U.NegOk, U.IsSoImm);
+}
+
+/// CPEIsInRange - Returns true if the distance between specific MI and
+/// specific ConstPool entry instruction can fit in MI's displacement field.
+bool ARMConstantIslands::CPEIsInRange(MachineInstr *MI, unsigned UserOffset,
+                                      MachineInstr *CPEMI, unsigned MaxDisp,
+                                      bool NegOk, bool DoDump) {
+  unsigned CPEOffset  = GetOffsetOf(CPEMI);
+  assert((CPEOffset%4 == 0 || HasInlineAsm) && "Misaligned CPE");
+
+  if (DoDump) {
+    DEBUG(errs() << "User of CPE#" << CPEMI->getOperand(0).getImm()
+                 << " max delta=" << MaxDisp
+                 << " insn address=" << UserOffset
+                 << " CPE address=" << CPEOffset
+                 << " offset=" << int(CPEOffset-UserOffset) << "\t" << *MI);
+  }
+
+  return OffsetIsInRange(UserOffset, CPEOffset, MaxDisp, NegOk);
+}
+
+#ifndef NDEBUG
+/// BBIsJumpedOver - Return true of the specified basic block's only predecessor
+/// unconditionally branches to its only successor.
+static bool BBIsJumpedOver(MachineBasicBlock *MBB) {
+  if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+    return false;
+
+  MachineBasicBlock *Succ = *MBB->succ_begin();
+  MachineBasicBlock *Pred = *MBB->pred_begin();
+  MachineInstr *PredMI = &Pred->back();
+  if (PredMI->getOpcode() == ARM::B || PredMI->getOpcode() == ARM::tB
+      || PredMI->getOpcode() == ARM::t2B)
+    return PredMI->getOperand(0).getMBB() == Succ;
+  return false;
+}
+#endif // NDEBUG
+
/// AdjustBBOffsetsAfter - Add delta bytes to the offset of every block after
/// BB.  On Thumb, also re-evaluate the 2-byte alignment padding carried by
/// constant pool islands and Thumb1 jump tables as they shift, folding any
/// padding change into delta; returns early once the running delta reaches 0.
void ARMConstantIslands::AdjustBBOffsetsAfter(MachineBasicBlock *BB,
                                              int delta) {
  MachineFunction::iterator MBBI = BB; MBBI = llvm::next(MBBI);
  for(unsigned i = BB->getNumber()+1, e = BB->getParent()->getNumBlockIDs();
      i < e; ++i) {
    BBOffsets[i] += delta;
    // If some existing blocks have padding, adjust the padding as needed, a
    // bit tricky.  delta can be negative so don't use % on that.
    if (!isThumb)
      continue;
    MachineBasicBlock *MBB = MBBI;
    if (!MBB->empty() && !HasInlineAsm) {
      // Constant pool entries require padding.
      if (MBB->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) {
        // Compare the block's alignment before and after the shift.
        unsigned OldOffset = BBOffsets[i] - delta;
        if ((OldOffset%4) == 0 && (BBOffsets[i]%4) != 0) {
          // add new padding
          BBSizes[i] += 2;
          delta += 2;
        } else if ((OldOffset%4) != 0 && (BBOffsets[i]%4) == 0) {
          // remove existing padding
          BBSizes[i] -= 2;
          delta -= 2;
        }
      }
      // Thumb1 jump tables require padding.  They should be at the end;
      // following unconditional branches are removed by AnalyzeBranch.
      MachineInstr *ThumbJTMI = prior(MBB->end());
      if (ThumbJTMI->getOpcode() == ARM::tBR_JTr) {
        unsigned NewMIOffset = GetOffsetOf(ThumbJTMI);
        unsigned OldMIOffset = NewMIOffset - delta;
        if ((OldMIOffset%4) == 0 && (NewMIOffset%4) != 0) {
          // remove existing padding
          BBSizes[i] -= 2;
          delta -= 2;
        } else if ((OldMIOffset%4) != 0 && (NewMIOffset%4) == 0) {
          // add new padding
          BBSizes[i] += 2;
          delta += 2;
        }
      }
      // Padding changes cancelled the shift; later offsets are unchanged.
      if (delta==0)
        return;
    }
    MBBI = llvm::next(MBBI);
  }
}
+
+/// DecrementOldEntry - find the constant pool entry with index CPI
+/// and instruction CPEMI, and decrement its refcount.  If the refcount
+/// becomes 0 remove the entry and instruction.  Returns true if we removed
+/// the entry, false if we didn't.
+
+bool ARMConstantIslands::DecrementOldEntry(unsigned CPI, MachineInstr *CPEMI) {
+  // Find the old entry. Eliminate it if it is no longer used.
+  CPEntry *CPE = findConstPoolEntry(CPI, CPEMI);
+  assert(CPE && "Unexpected!");
+  if (--CPE->RefCount == 0) {
+    RemoveDeadCPEMI(CPEMI);
+    CPE->CPEMI = NULL;
+    NumCPEs--;
+    return true;
+  }
+  return false;
+}
+
/// LookForExistingCPEntry - see if the currently referenced CPE is in range;
/// if not, see if an in-range clone of the CPE is in range, and if so,
/// change the data structures so the user references the clone.  Returns:
/// 0 = no existing entry found
/// 1 = entry found, and there were no code insertions or deletions
/// 2 = entry found, and there were code insertions or deletions
int ARMConstantIslands::LookForExistingCPEntry(CPUser& U, unsigned UserOffset)
{
  MachineInstr *UserMI = U.MI;
  MachineInstr *CPEMI  = U.CPEMI;

  // Check to see if the CPE is already in-range.
  if (CPEIsInRange(UserMI, UserOffset, CPEMI, U.MaxDisp, U.NegOk, true)) {
    DEBUG(errs() << "In range\n");
    return 1;
  }

  // No.  Look for previously created clones of the CPE that are in range.
  unsigned CPI = CPEMI->getOperand(1).getIndex();
  std::vector<CPEntry> &CPEs = CPEntries[CPI];
  for (unsigned i = 0, e = CPEs.size(); i != e; ++i) {
    // We already tried this one
    if (CPEs[i].CPEMI == CPEMI)
      continue;
    // Removing CPEs can leave empty entries, skip
    if (CPEs[i].CPEMI == NULL)
      continue;
    if (CPEIsInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.MaxDisp, U.NegOk)) {
      DEBUG(errs() << "Replacing CPE#" << CPI << " with CPE#"
                   << CPEs[i].CPI << "\n");
      // Point the CPUser node to the replacement
      U.CPEMI = CPEs[i].CPEMI;
      // Change the CPI in the instruction operand to refer to the clone.
      for (unsigned j = 0, e = UserMI->getNumOperands(); j != e; ++j)
        if (UserMI->getOperand(j).isCPI()) {
          UserMI->getOperand(j).setIndex(CPEs[i].CPI);
          break;
        }
      // Adjust the refcount of the clone...
      CPEs[i].RefCount++;
      // ...and the original.  If we didn't remove the old entry, none of the
      // addresses changed, so we don't need another pass.
      return DecrementOldEntry(CPI, CPEMI) ? 2 : 1;
    }
  }
  return 0;
}
+
+/// getUnconditionalBrDisp - Returns the maximum displacement that can fit in
+/// the specific unconditional branch instruction.
+static inline unsigned getUnconditionalBrDisp(int Opc) {
+  switch (Opc) {
+  case ARM::tB:
+    return ((1<<10)-1)*2;
+  case ARM::t2B:
+    return ((1<<23)-1)*2;
+  default:
+    break;
+  }
+
+  return ((1<<23)-1)*4;
+}
+
/// LookForWater - Look for an existing entry in the WaterList in which
/// we can place the CPE referenced from U so it's within range of U's MI.
/// Returns true if found, false if not.  If it returns true, WaterIter
/// is set to the WaterList entry.  For Thumb, prefer water that will not
/// introduce padding to water that will.  To ensure that this pass
/// terminates, the CPE location for a particular CPUser is only allowed to
/// move to a lower address, so search backward from the end of the list and
/// prefer the first water that is in range.
bool ARMConstantIslands::LookForWater(CPUser &U, unsigned UserOffset,
                                      water_iterator &WaterIter) {
  if (WaterList.empty())
    return false;

  // Fallback candidate: in-range water whose end is misaligned (would force
  // 2 bytes of padding on Thumb).  Only used if nothing better turns up.
  bool FoundWaterThatWouldPad = false;
  water_iterator IPThatWouldPad;
  for (water_iterator IP = prior(WaterList.end()),
         B = WaterList.begin();; --IP) {
    MachineBasicBlock* WaterBB = *IP;
    // Check if water is in range and is either at a lower address than the
    // current "high water mark" or a new water block that was created since
    // the previous iteration by inserting an unconditional branch.  In the
    // latter case, we want to allow resetting the high water mark back to
    // this new water since we haven't seen it before.  Inserting branches
    // should be relatively uncommon and when it does happen, we want to be
    // sure to take advantage of it for all the CPEs near that block, so that
    // we don't insert more branches than necessary.
    if (WaterIsInRange(UserOffset, WaterBB, U) &&
        (WaterBB->getNumber() < U.HighWaterMark->getNumber() ||
         NewWaterList.count(WaterBB))) {
      unsigned WBBId = WaterBB->getNumber();
      if (isThumb &&
          (BBOffsets[WBBId] + BBSizes[WBBId])%4 != 0) {
        // This is valid Water, but would introduce padding.  Remember
        // it in case we don't find any Water that doesn't do this.
        if (!FoundWaterThatWouldPad) {
          FoundWaterThatWouldPad = true;
          IPThatWouldPad = IP;
        }
      } else {
        WaterIter = IP;
        return true;
      }
    }
    // Stop after examining the first element (reverse scan complete).
    if (IP == B)
      break;
  }
  if (FoundWaterThatWouldPad) {
    WaterIter = IPThatWouldPad;
    return true;
  }
  return false;
}
+
+/// CreateNewWater - No existing WaterList entry will work for
+/// CPUsers[CPUserIndex], so create a place to put the CPE.  The end of the
+/// block is used if in range, and the conditional branch munged so control
+/// flow is correct.  Otherwise the block is split to create a hole with an
+/// unconditional branch around it.  In either case NewMBB is set to a
+/// block following which the new island can be inserted (the WaterList
+/// is not adjusted).
+void ARMConstantIslands::CreateNewWater(unsigned CPUserIndex,
+                                        unsigned UserOffset,
+                                        MachineBasicBlock *&NewMBB) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  MachineBasicBlock *UserMBB = UserMI->getParent();
+  // Blocks are laid out contiguously, so the end of this block is the start
+  // of the next one; the assert checks that invariant.
+  unsigned OffsetOfNextBlock = BBOffsets[UserMBB->getNumber()] +
+                               BBSizes[UserMBB->getNumber()];
+  assert(OffsetOfNextBlock== BBOffsets[UserMBB->getNumber()+1]);
+
+  // If the block does not end in an unconditional branch already, and if the
+  // end of the block is within range, make new water there.  (The addition
+  // below is for the unconditional branch we will be adding: 4 bytes on ARM +
+  // Thumb2, 2 on Thumb1.  Possible Thumb1 alignment padding is accounted for
+  // inside OffsetIsInRange.)
+  if (BBHasFallthrough(UserMBB) &&
+      OffsetIsInRange(UserOffset, OffsetOfNextBlock + (isThumb1 ? 2: 4),
+                      U.MaxDisp, U.NegOk, U.IsSoImm)) {
+    DEBUG(errs() << "Split at end of block\n");
+    if (&UserMBB->back() == UserMI)
+      assert(BBHasFallthrough(UserMBB) && "Expected a fallthrough BB!");
+    NewMBB = llvm::next(MachineFunction::iterator(UserMBB));
+    // Add an unconditional branch from UserMBB to fallthrough block.
+    // Record it for branch lengthening; this new branch will not get out of
+    // range, but if the preceding conditional branch is out of range, the
+    // targets will be exchanged, and the altered branch may be out of
+    // range, so the machinery has to know about it.
+    int UncondBr = isThumb ? ((isThumb2) ? ARM::t2B : ARM::tB) : ARM::B;
+    BuildMI(UserMBB, DebugLoc::getUnknownLoc(),
+            TII->get(UncondBr)).addMBB(NewMBB);
+    unsigned MaxDisp = getUnconditionalBrDisp(UncondBr);
+    ImmBranches.push_back(ImmBranch(&UserMBB->back(),
+                          MaxDisp, false, UncondBr));
+    // Account for the size of the branch we just appended.
+    int delta = isThumb1 ? 2 : 4;
+    BBSizes[UserMBB->getNumber()] += delta;
+    AdjustBBOffsetsAfter(UserMBB, delta);
+  } else {
+    // What a big block.  Find a place within the block to split it.
+    // This is a little tricky on Thumb1 since instructions are 2 bytes
+    // and constant pool entries are 4 bytes: if instruction I references
+    // island CPE, and instruction I+1 references CPE', it will
+    // not work well to put CPE as far forward as possible, since then
+    // CPE' cannot immediately follow it (that location is 2 bytes
+    // farther away from I+1 than CPE was from I) and we'd need to create
+    // a new island.  So, we make a first guess, then walk through the
+    // instructions between the one currently being looked at and the
+    // possible insertion point, and make sure any other instructions
+    // that reference CPEs will be able to use the same island area;
+    // if not, we back up the insertion point.
+
+    // The 4 in the following is for the unconditional branch we'll be
+    // inserting (allows for long branch on Thumb1).  Alignment of the
+    // island is handled inside OffsetIsInRange.
+    unsigned BaseInsertOffset = UserOffset + U.MaxDisp -4;
+    // This could point off the end of the block if we've already got
+    // constant pool entries following this block; only the last one is
+    // in the water list.  Back past any possible branches (allow for a
+    // conditional and a maximally long unconditional).
+    if (BaseInsertOffset >= BBOffsets[UserMBB->getNumber()+1])
+      BaseInsertOffset = BBOffsets[UserMBB->getNumber()+1] -
+                              (isThumb1 ? 6 : 8);
+    // EndInsertOffset is where the island would end once this CPE (whose
+    // byte size is operand 2 of the CONSTPOOL_ENTRY) is placed.
+    unsigned EndInsertOffset = BaseInsertOffset +
+           CPEMI->getOperand(2).getImm();
+    MachineBasicBlock::iterator MI = UserMI;
+    ++MI;
+    unsigned CPUIndex = CPUserIndex+1;
+    // Scan forward to the tentative split point, backing it up whenever an
+    // intervening CP user could not also reach the island there.
+    for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
+         Offset < BaseInsertOffset;
+         Offset += TII->GetInstSizeInBytes(MI),
+            MI = llvm::next(MI)) {
+      if (CPUIndex < CPUsers.size() && CPUsers[CPUIndex].MI == MI) {
+        // NOTE: this 'U' deliberately shadows the outer 'U'; it is the
+        // intervening constant-pool user being checked.
+        CPUser &U = CPUsers[CPUIndex];
+        if (!OffsetIsInRange(Offset, EndInsertOffset,
+                             U.MaxDisp, U.NegOk, U.IsSoImm)) {
+          BaseInsertOffset -= (isThumb1 ? 2 : 4);
+          EndInsertOffset  -= (isThumb1 ? 2 : 4);
+        }
+        // This is overly conservative, as we don't account for CPEMIs
+        // being reused within the block, but it doesn't matter much.
+        EndInsertOffset += CPUsers[CPUIndex].CPEMI->getOperand(2).getImm();
+        CPUIndex++;
+      }
+    }
+    DEBUG(errs() << "Split in middle of big block\n");
+    NewMBB = SplitBlockBeforeInstr(prior(MI));
+  }
+}
+
+/// HandleConstantPoolUser - Analyze the specified user, checking to see if it
+/// is out-of-range.  If so, pick up the constant pool value and move it some
+/// place in-range.  Return true if we changed any addresses (thus must run
+/// another pass of branch lengthening), false otherwise.
+bool ARMConstantIslands::HandleConstantPoolUser(MachineFunction &MF,
+                                                unsigned CPUserIndex) {
+  CPUser &U = CPUsers[CPUserIndex];
+  MachineInstr *UserMI = U.MI;
+  MachineInstr *CPEMI  = U.CPEMI;
+  // CONSTPOOL_ENTRY operands: 1 = constant pool index, 2 = size in bytes.
+  unsigned CPI = CPEMI->getOperand(1).getIndex();
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  // Compute this only once, it's expensive.  The 4 or 8 is the value the
+  // hardware keeps in the PC.
+  unsigned UserOffset = GetOffsetOf(UserMI) + (isThumb ? 4 : 8);
+
+  // See if the current entry is within range, or there is a clone of it
+  // in range.
+  // Result 1 means an entry is already usable (nothing changed); result 2
+  // means an existing entry was made usable, which changed addresses, so
+  // another iteration is required.
+  int result = LookForExistingCPEntry(U, UserOffset);
+  if (result==1) return false;
+  else if (result==2) return true;
+
+  // No existing clone of this CPE is within range.
+  // We will be generating a new clone.  Get a UID for it.
+  unsigned ID = AFI->createConstPoolEntryUId();
+
+  // Look for water where we can place this CPE.
+  MachineBasicBlock *NewIsland = MF.CreateMachineBasicBlock();
+  MachineBasicBlock *NewMBB;
+  water_iterator IP;
+  if (LookForWater(U, UserOffset, IP)) {
+    DEBUG(errs() << "found water in range\n");
+    MachineBasicBlock *WaterBB = *IP;
+
+    // If the original WaterList entry was "new water" on this iteration,
+    // propagate that to the new island.  This is just keeping NewWaterList
+    // updated to match the WaterList, which will be updated below.
+    if (NewWaterList.count(WaterBB)) {
+      NewWaterList.erase(WaterBB);
+      NewWaterList.insert(NewIsland);
+    }
+    // The new CPE goes before the following block (NewMBB).
+    NewMBB = llvm::next(MachineFunction::iterator(WaterBB));
+
+  } else {
+    // No water found.
+    DEBUG(errs() << "No water found\n");
+    CreateNewWater(CPUserIndex, UserOffset, NewMBB);
+
+    // SplitBlockBeforeInstr adds to WaterList, which is important when it is
+    // called while handling branches so that the water will be seen on the
+    // next iteration for constant pools, but in this context, we don't want
+    // it.  Check for this so it will be removed from the WaterList.
+    // Also remove any entry from NewWaterList.
+    MachineBasicBlock *WaterBB = prior(MachineFunction::iterator(NewMBB));
+    IP = std::find(WaterList.begin(), WaterList.end(), WaterBB);
+    if (IP != WaterList.end())
+      NewWaterList.erase(WaterBB);
+
+    // We are adding new water.  Update NewWaterList.
+    NewWaterList.insert(NewIsland);
+  }
+
+  // Remove the original WaterList entry; we want subsequent insertions in
+  // this vicinity to go after the one we're about to insert.  This
+  // considerably reduces the number of times we have to move the same CPE
+  // more than once and is also important to ensure the algorithm terminates.
+  if (IP != WaterList.end())
+    WaterList.erase(IP);
+
+  // Okay, we know we can put an island before NewMBB now, do it!
+  MF.insert(NewMBB, NewIsland);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  UpdateForInsertedWaterBlock(NewIsland);
+
+  // Decrement the old entry, and remove it if refcount becomes 0.
+  DecrementOldEntry(CPI, CPEMI);
+
+  // Now that we have an island to add the CPE to, clone the original CPE and
+  // add it to the island.
+  U.HighWaterMark = NewIsland;
+  U.CPEMI = BuildMI(NewIsland, DebugLoc::getUnknownLoc(),
+                    TII->get(ARM::CONSTPOOL_ENTRY))
+                .addImm(ID).addConstantPoolIndex(CPI).addImm(Size);
+  // The clone starts with refcount 1 (this user).
+  CPEntries[CPI].push_back(CPEntry(U.CPEMI, ID, 1));
+  NumCPEs++;
+
+  BBOffsets[NewIsland->getNumber()] = BBOffsets[NewMBB->getNumber()];
+  // Compensate for .align 2 in thumb mode.
+  if (isThumb && (BBOffsets[NewIsland->getNumber()]%4 != 0 || HasInlineAsm))
+    Size += 2;
+  // Increase the size of the island block to account for the new entry.
+  BBSizes[NewIsland->getNumber()] += Size;
+  AdjustBBOffsetsAfter(NewIsland, Size);
+
+  // Finally, change the CPI in the instruction operand to be ID.
+  for (unsigned i = 0, e = UserMI->getNumOperands(); i != e; ++i)
+    if (UserMI->getOperand(i).isCPI()) {
+      UserMI->getOperand(i).setIndex(ID);
+      break;
+    }
+
+  DEBUG(errs() << "  Moved CPE to #" << ID << " CPI=" << CPI
+           << '\t' << *UserMI);
+
+  return true;
+}
+
+/// RemoveDeadCPEMI - Remove a dead constant pool entry instruction. Update
+/// sizes and offsets of impacted basic blocks.
+void ARMConstantIslands::RemoveDeadCPEMI(MachineInstr *CPEMI) {
+  MachineBasicBlock *CPEBB = CPEMI->getParent();
+  // Operand 2 of a CONSTPOOL_ENTRY is its size in bytes.
+  unsigned Size = CPEMI->getOperand(2).getImm();
+  CPEMI->eraseFromParent();
+  BBSizes[CPEBB->getNumber()] -= Size;
+  // All succeeding offsets have the current size value added in, fix this.
+  if (CPEBB->empty()) {
+    // In thumb1 mode, the size of island may be padded by two to compensate for
+    // the alignment requirement.  Then it will now be 2 when the block is
+    // empty, so fix this: fold any residual padding into the size being
+    // subtracted and zero the block's recorded size.
+    if (BBSizes[CPEBB->getNumber()] != 0) {
+      Size += BBSizes[CPEBB->getNumber()];
+      BBSizes[CPEBB->getNumber()] = 0;
+    }
+  }
+  // Shift every following block back by the bytes we just removed.
+  AdjustBBOffsetsAfter(CPEBB, -Size);
+  // An island has only one predecessor BB and one successor BB. Check if
+  // this BB's predecessor jumps directly to this BB's successor. This
+  // shouldn't happen currently.
+  assert(!BBIsJumpedOver(CPEBB) && "How did this happen?");
+  // FIXME: remove the empty blocks after all the work is done?
+}
+
+/// RemoveUnusedCPEntries - Remove constant pool entries whose refcounts
+/// are zero.  Returns true if any entry was removed (so offsets changed).
+bool ARMConstantIslands::RemoveUnusedCPEntries() {
+  // Fix: this was declared 'unsigned' although it holds and returns a
+  // boolean; use the function's actual return type.
+  bool MadeChange = false;
+  for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
+    std::vector<CPEntry> &CPEs = CPEntries[i];
+    for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
+      // Only entries that still have an instruction and no remaining users
+      // can be deleted; RemoveDeadCPEMI fixes up block sizes and offsets.
+      if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
+        RemoveDeadCPEMI(CPEs[j].CPEMI);
+        CPEs[j].CPEMI = NULL;
+        MadeChange = true;
+      }
+    }
+  }
+  return MadeChange;
+}
+
+/// BBIsInRange - Returns true if the distance between specific MI and
+/// specific BB can fit in MI's displacement field.
+bool ARMConstantIslands::BBIsInRange(MachineInstr *MI,MachineBasicBlock *DestBB,
+                                     unsigned MaxDisp) {
+  // The hardware PC reads ahead of the branch: 4 bytes in Thumb mode,
+  // 8 in ARM mode.
+  unsigned From = GetOffsetOf(MI) + (isThumb ? 4 : 8);
+  unsigned To   = BBOffsets[DestBB->getNumber()];
+
+  DEBUG(errs() << "Branch of destination BB#" << DestBB->getNumber()
+               << " from BB#" << MI->getParent()->getNumber()
+               << " max delta=" << MaxDisp
+               << " from " << GetOffsetOf(MI) << " to " << To
+               << " offset " << int(To-From) << "\t" << *MI);
+
+  // Forward and backward branches are both limited to MaxDisp bytes.
+  return (From <= To) ? (To - From <= MaxDisp) : (From - To <= MaxDisp);
+}
+
+/// FixUpImmediateBr - Fix up an immediate branch whose destination is too far
+/// away to fit in its displacement field.
+bool ARMConstantIslands::FixUpImmediateBr(MachineFunction &MF, ImmBranch &Br) {
+  MachineInstr *BrMI = Br.MI;
+  MachineBasicBlock *Target = BrMI->getOperand(0).getMBB();
+
+  // Nothing to do when the destination is already reachable.
+  if (BBIsInRange(BrMI, Target, Br.MaxDisp))
+    return false;
+
+  // Dispatch on branch kind; each helper reports whether it changed layout.
+  return Br.isCond ? FixUpConditionalBr(MF, Br) : FixUpUnconditionalBr(MF, Br);
+}
+
+/// FixUpUnconditionalBr - Fix up an unconditional branch whose destination is
+/// too far away to fit in its displacement field. If the LR register has been
+/// spilled in the epilogue, then we can use BL to implement a far jump.
+/// Otherwise, add an intermediate branch instruction to a branch.
+bool
+ARMConstantIslands::FixUpUnconditionalBr(MachineFunction &MF, ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *MBB = MI->getParent();
+  // Only Thumb1 tB branches are ever short enough to need this rewrite.
+  if (!isThumb1)
+    llvm_unreachable("FixUpUnconditionalBr is Thumb1 only!");
+
+  // Use BL to implement far jump.  The BL encoding gives a +/- 4MB range.
+  Br.MaxDisp = (1 << 21) * 2;
+  MI->setDesc(TII->get(ARM::tBfar));
+  // The far branch is 2 bytes larger than the tB it replaces.
+  BBSizes[MBB->getNumber()] += 2;
+  AdjustBBOffsetsAfter(MBB, 2);
+  // Record that BL now clobbers LR; UndoLRSpillRestore must not run.
+  HasFarJump = true;
+  NumUBrFixed++;
+
+  DEBUG(errs() << "  Changed B to long jump " << *MI);
+
+  return true;
+}
+
+/// FixUpConditionalBr - Fix up a conditional branch whose destination is too
+/// far away to fit in its displacement field. It is converted to an inverse
+/// conditional branch + an unconditional branch to the destination.
+bool
+ARMConstantIslands::FixUpConditionalBr(MachineFunction &MF, ImmBranch &Br) {
+  MachineInstr *MI = Br.MI;
+  MachineBasicBlock *DestBB = MI->getOperand(0).getMBB();
+
+  // Add an unconditional branch to the destination and invert the branch
+  // condition to jump over it:
+  // blt L1
+  // =>
+  // bge L2
+  // b   L1
+  // L2:
+  // Operand 1 is the condition code, operand 2 the predicate register.
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(1).getImm();
+  CC = ARMCC::getOppositeCondition(CC);
+  unsigned CCReg = MI->getOperand(2).getReg();
+
+  // If the branch is at the end of its MBB and that has a fall-through block,
+  // direct the updated conditional branch to the fall-through block. Otherwise,
+  // split the MBB before the next instruction.
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineInstr *BMI = &MBB->back();
+  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
+
+  NumCBrFixed++;
+  if (BMI != MI) {
+    // The condition checks that MI is immediately followed by the block's
+    // final instruction, and that final instruction is the matching
+    // unconditional branch.
+    if (llvm::next(MachineBasicBlock::iterator(MI)) == prior(MBB->end()) &&
+        BMI->getOpcode() == Br.UncondBr) {
+      // Last MI in the BB is an unconditional branch. Can we simply invert the
+      // condition and swap destinations:
+      // beq L1
+      // b   L2
+      // =>
+      // bne L2
+      // b   L1
+      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
+      if (BBIsInRange(MI, NewDest, Br.MaxDisp)) {
+        DEBUG(errs() << "  Invert Bcc condition and swap its destination with "
+                     << *BMI);
+        BMI->getOperand(0).setMBB(DestBB);
+        MI->getOperand(0).setMBB(NewDest);
+        MI->getOperand(1).setImm(CC);
+        return true;
+      }
+    }
+  }
+
+  if (NeedSplit) {
+    SplitBlockBeforeInstr(MI);
+    // No need for the branch to the next block. We're adding an unconditional
+    // branch to the destination.
+    int delta = TII->GetInstSizeInBytes(&MBB->back());
+    BBSizes[MBB->getNumber()] -= delta;
+    MachineBasicBlock* SplitBB = llvm::next(MachineFunction::iterator(MBB));
+    AdjustBBOffsetsAfter(SplitBB, -delta);
+    MBB->back().eraseFromParent();
+    // BBOffsets[SplitBB] is wrong temporarily, fixed below
+  }
+  MachineBasicBlock *NextBB = llvm::next(MachineFunction::iterator(MBB));
+
+  DEBUG(errs() << "  Insert B to BB#" << DestBB->getNumber()
+               << " also invert condition and change dest. to BB#"
+               << NextBB->getNumber() << "\n");
+
+  // Insert a new conditional branch and a new unconditional branch.
+  // Also update the ImmBranch as well as adding a new entry for the new branch.
+  BuildMI(MBB, DebugLoc::getUnknownLoc(),
+          TII->get(MI->getOpcode()))
+    .addMBB(NextBB).addImm(CC).addReg(CCReg);
+  Br.MI = &MBB->back();
+  BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
+  BuildMI(MBB, DebugLoc::getUnknownLoc(), TII->get(Br.UncondBr)).addMBB(DestBB);
+  BBSizes[MBB->getNumber()] += TII->GetInstSizeInBytes(&MBB->back());
+  unsigned MaxDisp = getUnconditionalBrDisp(Br.UncondBr);
+  ImmBranches.push_back(ImmBranch(&MBB->back(), MaxDisp, false, Br.UncondBr));
+
+  // Remove the old conditional branch.  It may or may not still be in MBB.
+  BBSizes[MI->getParent()->getNumber()] -= TII->GetInstSizeInBytes(MI);
+  MI->eraseFromParent();
+
+  // The net size change is an addition of one unconditional branch.
+  int delta = TII->GetInstSizeInBytes(&MBB->back());
+  AdjustBBOffsetsAfter(MBB, delta);
+  return true;
+}
+
+/// UndoLRSpillRestore - Remove Thumb push / pop instructions that only spills
+/// LR / restores LR to pc. FIXME: This is done here because it's only possible
+/// to do this if tBfar is not used.
+bool ARMConstantIslands::UndoLRSpillRestore() {
+  bool MadeChange = false;
+  for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
+    MachineInstr *MI = PushPopMIs[i];
+    // A "pop {pc}" with no other registers: the first two operands are the
+    // predicate, the third is a zero since there is no writeback.  Check the
+    // operand count *before* dereferencing operand 3 so we never index past
+    // the end of a shorter instruction (the original checked in the opposite
+    // order).
+    if (MI->getOpcode() == ARM::tPOP_RET &&
+        MI->getNumExplicitOperands() == 4 &&
+        MI->getOperand(3).getReg() == ARM::PC) {
+      // Replace the pop-to-pc with a plain bx lr return and delete the pop.
+      BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET));
+      MI->eraseFromParent();
+      MadeChange = true;
+    }
+  }
+  return MadeChange;
+}
+
+/// OptimizeThumb2Instructions - Shrink 32-bit Thumb2 constant-pool loads and
+/// address computations to their 16-bit Thumb1 equivalents when the operand
+/// register and displacement permit it.  Returns true if anything changed.
+bool ARMConstantIslands::OptimizeThumb2Instructions(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  // Shrink ADR and LDR from constantpool.
+  for (unsigned i = 0, e = CPUsers.size(); i != e; ++i) {
+    CPUser &U = CPUsers[i];
+    unsigned Opcode = U.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2LEApcrel:
+      // The 16-bit forms only encode low registers (r0-r7) and an
+      // 8-bit immediate scaled by 4.
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLEApcrel;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    case ARM::t2LDRpci:
+      if (isARMLowRegister(U.MI->getOperand(0).getReg())) {
+        NewOpc = ARM::tLDRpci;
+        Bits = 8;
+        Scale = 4;
+      }
+      break;
+    }
+
+    if (!NewOpc)
+      continue;
+
+    // +4 is the Thumb-mode PC read-ahead.
+    unsigned UserOffset = GetOffsetOf(U.MI) + 4;
+    unsigned MaxOffs = ((1 << Bits) - 1) * Scale;
+    // FIXME: Check if offset is multiple of scale if scale is not 4.
+    if (CPEIsInRange(U.MI, UserOffset, U.CPEMI, MaxOffs, false, true)) {
+      U.MI->setDesc(TII->get(NewOpc));
+      // The narrow encoding is 2 bytes smaller than the wide one.
+      MachineBasicBlock *MBB = U.MI->getParent();
+      BBSizes[MBB->getNumber()] -= 2;
+      AdjustBBOffsetsAfter(MBB, -2);
+      ++NumT2CPShrunk;
+      MadeChange = true;
+    }
+  }
+
+  MadeChange |= OptimizeThumb2Branches(MF);
+  MadeChange |= OptimizeThumb2JumpTables(MF);
+  return MadeChange;
+}
+
+/// OptimizeThumb2Branches - Shrink 32-bit Thumb2 branches to 16-bit forms
+/// where the displacement fits, and fold a preceding "cmp rN, #0" plus a
+/// tBcc into a single cbz / cbnz.  Returns true if anything changed.
+bool ARMConstantIslands::OptimizeThumb2Branches(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i) {
+    ImmBranch &Br = ImmBranches[i];
+    unsigned Opcode = Br.MI->getOpcode();
+    unsigned NewOpc = 0;
+    unsigned Scale = 1;
+    unsigned Bits = 0;
+    switch (Opcode) {
+    default: break;
+    case ARM::t2B:
+      // tB: 11-bit signed immediate, scaled by 2.
+      NewOpc = ARM::tB;
+      Bits = 11;
+      Scale = 2;
+      break;
+    case ARM::t2Bcc: {
+      // tBcc: 8-bit signed immediate, scaled by 2.
+      NewOpc = ARM::tBcc;
+      Bits = 8;
+      Scale = 2;
+      break;
+    }
+    }
+    if (NewOpc) {
+      // Signed displacement: one bit goes to the sign.
+      unsigned MaxOffs = ((1 << (Bits-1))-1) * Scale;
+      MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+      if (BBIsInRange(Br.MI, DestBB, MaxOffs)) {
+        Br.MI->setDesc(TII->get(NewOpc));
+        // The 16-bit encoding is 2 bytes smaller.
+        MachineBasicBlock *MBB = Br.MI->getParent();
+        BBSizes[MBB->getNumber()] -= 2;
+        AdjustBBOffsetsAfter(MBB, -2);
+        ++NumT2BrShrunk;
+        MadeChange = true;
+      }
+    }
+
+    // Second stage: try to turn "cmp rN, #0; bcc" into cbz/cbnz.
+    // Re-read the opcode, it may have just been shrunk to tBcc above.
+    Opcode = Br.MI->getOpcode();
+    if (Opcode != ARM::tBcc)
+      continue;
+
+    NewOpc = 0;
+    unsigned PredReg = 0;
+    ARMCC::CondCodes Pred = llvm::getInstrPredicate(Br.MI, PredReg);
+    if (Pred == ARMCC::EQ)
+      NewOpc = ARM::tCBZ;
+    else if (Pred == ARMCC::NE)
+      NewOpc = ARM::tCBNZ;
+    if (!NewOpc)
+      continue;
+    MachineBasicBlock *DestBB = Br.MI->getOperand(0).getMBB();
+    // Check if the distance is within 126. Subtract starting offset by 2
+    // because the cmp will be eliminated.  cbz/cbnz can only branch forward.
+    unsigned BrOffset = GetOffsetOf(Br.MI) + 4 - 2;
+    unsigned DestOffset = BBOffsets[DestBB->getNumber()];
+    if (BrOffset < DestOffset && (DestOffset - BrOffset) <= 126) {
+      MachineBasicBlock::iterator CmpMI = Br.MI; --CmpMI;
+      if (CmpMI->getOpcode() == ARM::tCMPzi8) {
+        unsigned Reg = CmpMI->getOperand(0).getReg();
+        Pred = llvm::getInstrPredicate(CmpMI, PredReg);
+        // The compare must be unpredicated, against zero, and on a low
+        // register, or cbz/cbnz cannot encode it.
+        if (Pred == ARMCC::AL &&
+            CmpMI->getOperand(1).getImm() == 0 &&
+            isARMLowRegister(Reg)) {
+          MachineBasicBlock *MBB = Br.MI->getParent();
+          MachineInstr *NewBR =
+            BuildMI(*MBB, CmpMI, Br.MI->getDebugLoc(), TII->get(NewOpc))
+            .addReg(Reg).addMBB(DestBB, Br.MI->getOperand(0).getTargetFlags());
+          CmpMI->eraseFromParent();
+          Br.MI->eraseFromParent();
+          Br.MI = NewBR;
+          // cmp (2) + bcc (2) are replaced by one 2-byte cbz/cbnz.
+          BBSizes[MBB->getNumber()] -= 2;
+          AdjustBBOffsetsAfter(MBB, -2);
+          ++NumCBZ;
+          MadeChange = true;
+        }
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+/// OptimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
+/// jumptables when it's possible.
+bool ARMConstantIslands::OptimizeThumb2JumpTables(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  // FIXME: After the tables are shrunk, can we get rid some of the
+  // constantpool tables?
+  MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+  if (MJTI == 0) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const TargetInstrDesc &TID = MI->getDesc();
+    unsigned NumOps = TID.getNumOperands();
+    // The jump-table index operand sits before the trailing operands
+    // (one more of them when the instruction is predicable).
+    unsigned JTOpIdx = NumOps - (TID.isPredicable() ? 3 : 2);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    // Determine whether every target offset fits in a byte (tbb) or a
+    // halfword (tbh) table entry; both encode forward displacements only.
+    bool ByteOk = true;
+    bool HalfWordOk = true;
+    unsigned JTOffset = GetOffsetOf(MI) + 4;
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      unsigned DstOffset = BBOffsets[MBB->getNumber()];
+      // Negative offset is not ok. FIXME: We should change BB layout to make
+      // sure all the branches are forward.
+      if (ByteOk && (DstOffset - JTOffset) > ((1<<8)-1)*2)
+        ByteOk = false;
+      unsigned TBHLimit = ((1<<16)-1)*2;
+      if (HalfWordOk && (DstOffset - JTOffset) > TBHLimit)
+        HalfWordOk = false;
+      if (!ByteOk && !HalfWordOk)
+        break;
+    }
+
+    if (ByteOk || HalfWordOk) {
+      MachineBasicBlock *MBB = MI->getParent();
+      unsigned BaseReg = MI->getOperand(0).getReg();
+      bool BaseRegKill = MI->getOperand(0).isKill();
+      // If the base register outlives this instruction we cannot delete the
+      // instructions that compute it, so the rewrite is not worthwhile.
+      if (!BaseRegKill)
+        continue;
+      unsigned IdxReg = MI->getOperand(1).getReg();
+      bool IdxRegKill = MI->getOperand(1).isKill();
+      MachineBasicBlock::iterator PrevI = MI;
+      if (PrevI == MBB->begin())
+        continue;
+
+      MachineInstr *AddrMI = --PrevI;
+      bool OptOk = true;
+      // Examine the instruction that calculate the jumptable entry address.
+      // If it's not the one just before the t2BR_JT, we won't delete it, then
+      // it's not worth doing the optimization.
+      for (unsigned k = 0, eee = AddrMI->getNumOperands(); k != eee; ++k) {
+        const MachineOperand &MO = AddrMI->getOperand(k);
+        if (!MO.isReg() || !MO.getReg())
+          continue;
+        // It must define only BaseReg and use only IdxReg (or dying regs);
+        // anything else means its result is still needed elsewhere.
+        if (MO.isDef() && MO.getReg() != BaseReg) {
+          OptOk = false;
+          break;
+        }
+        if (MO.isUse() && !MO.isKill() && MO.getReg() != IdxReg) {
+          OptOk = false;
+          break;
+        }
+      }
+      if (!OptOk)
+        continue;
+
+      // The previous instruction should be a tLEApcrel or t2LEApcrelJT, we want
+      // to delete it as well.
+      MachineInstr *LeaMI = --PrevI;
+      if ((LeaMI->getOpcode() != ARM::tLEApcrelJT &&
+           LeaMI->getOpcode() != ARM::t2LEApcrelJT) ||
+          LeaMI->getOperand(0).getReg() != BaseReg)
+        OptOk = false;
+
+      if (!OptOk)
+        continue;
+
+      unsigned Opc = ByteOk ? ARM::t2TBB : ARM::t2TBH;
+      MachineInstr *NewJTMI = BuildMI(MBB, MI->getDebugLoc(), TII->get(Opc))
+        .addReg(IdxReg, getKillRegState(IdxRegKill))
+        .addJumpTableIndex(JTI, JTOP.getTargetFlags())
+        .addImm(MI->getOperand(JTOpIdx+1).getImm());
+      // FIXME: Insert an "ALIGN" instruction to ensure the next instruction
+      // is 2-byte aligned. For now, asm printer will fix it up.
+      unsigned NewSize = TII->GetInstSizeInBytes(NewJTMI);
+      unsigned OrigSize = TII->GetInstSizeInBytes(AddrMI);
+      OrigSize += TII->GetInstSizeInBytes(LeaMI);
+      OrigSize += TII->GetInstSizeInBytes(MI);
+
+      // All three original instructions are replaced by the single tbb/tbh.
+      AddrMI->eraseFromParent();
+      LeaMI->eraseFromParent();
+      MI->eraseFromParent();
+
+      int delta = OrigSize - NewSize;
+      BBSizes[MBB->getNumber()] -= delta;
+      AdjustBBOffsetsAfter(MBB, -delta);
+
+      ++NumTBs;
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
+
+/// ReorderThumb2JumpTables - Adjust the function's block layout to ensure that
+/// jump tables always branch forwards, since that's what tbb and tbh need.
+bool ARMConstantIslands::ReorderThumb2JumpTables(MachineFunction &MF) {
+  bool MadeChange = false;
+
+  MachineJumpTableInfo *MJTI = MF.getJumpTableInfo();
+  if (MJTI == 0) return false;
+
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
+    MachineInstr *MI = T2JumpTables[i];
+    const TargetInstrDesc &TID = MI->getDesc();
+    unsigned NumOps = TID.getNumOperands();
+    // The jump-table index operand sits before the trailing operands
+    // (one more of them when the instruction is predicable).
+    unsigned JTOpIdx = NumOps - (TID.isPredicable() ? 3 : 2);
+    MachineOperand JTOP = MI->getOperand(JTOpIdx);
+    unsigned JTI = JTOP.getIndex();
+    assert(JTI < JT.size());
+
+    // We prefer if target blocks for the jump table come after the jump
+    // instruction so we can use TB[BH]. Loop through the target blocks
+    // and try to adjust them such that that's true.
+    int JTNumber = MI->getParent()->getNumber();
+    const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+    for (unsigned j = 0, ee = JTBBs.size(); j != ee; ++j) {
+      MachineBasicBlock *MBB = JTBBs[j];
+      int DTNumber = MBB->getNumber();
+
+      if (DTNumber < JTNumber) {
+        // The destination precedes the switch. Try to move the block forward
+        // so we have a positive offset.
+        MachineBasicBlock *NewBB =
+          AdjustJTTargetBlockForward(MBB, MI->getParent());
+        if (NewBB)
+          MJTI->ReplaceMBBInJumpTable(JTI, JTBBs[j], NewBB);
+        // Layout changed either way: a NULL NewBB means the existing block
+        // itself was moved after the jump table.
+        MadeChange = true;
+      }
+    }
+  }
+
+  return MadeChange;
+}
+
+/// AdjustJTTargetBlockForward - Make the jump-table target BB reachable by a
+/// forward branch from JTBB: either physically move BB after JTBB, or insert
+/// a trampoline block after JTBB that branches back to BB.  Returns the new
+/// trampoline block, or NULL if BB itself was moved.
+MachineBasicBlock *ARMConstantIslands::
+AdjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB)
+{
+  MachineFunction &MF = *BB->getParent();
+
+  // If the destination block is terminated by an unconditional branch,
+  // try to move it; otherwise, create a new block following the jump
+  // table that branches back to the actual target. This is a very simple
+  // heuristic. FIXME: We can definitely improve it.
+  MachineBasicBlock *TBB = 0, *FBB = 0;
+  SmallVector<MachineOperand, 4> Cond;
+  SmallVector<MachineOperand, 4> CondPrior;
+  MachineFunction::iterator BBi = BB;
+  MachineFunction::iterator OldPrior = prior(BBi);
+
+  // If the block terminator isn't analyzable, don't try to move the block
+  // (AnalyzeBranch returns true on failure).
+  bool B = TII->AnalyzeBranch(*BB, TBB, FBB, Cond);
+
+  // If the block ends in an unconditional branch, move it. The prior block
+  // has to have an analyzable terminator for us to move this one. Be paranoid
+  // and make sure we're not trying to move the entry block of the function.
+  if (!B && Cond.empty() && BB != MF.begin() &&
+      !TII->AnalyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
+    BB->moveAfter(JTBB);
+    // Both the block that used to precede BB and BB itself may now need
+    // different fall-through/branch terminators.
+    OldPrior->updateTerminator();
+    BB->updateTerminator();
+    // Update numbering to account for the block being moved.
+    MF.RenumberBlocks();
+    ++NumJTMoved;
+    return NULL;
+  }
+
+  // Create a new MBB for the code after the jump BB.
+  MachineBasicBlock *NewBB =
+    MF.CreateMachineBasicBlock(JTBB->getBasicBlock());
+  MachineFunction::iterator MBBI = JTBB; ++MBBI;
+  MF.insert(MBBI, NewBB);
+
+  // Add an unconditional branch from NewBB to BB.
+  // There doesn't seem to be meaningful DebugInfo available; this doesn't
+  // correspond directly to anything in the source.
+  assert (isThumb2 && "Adjusting for TB[BH] but not in Thumb2?");
+  BuildMI(NewBB, DebugLoc::getUnknownLoc(), TII->get(ARM::t2B)).addMBB(BB);
+
+  // Update internal data structures to account for the newly inserted MBB.
+  MF.RenumberBlocks(NewBB);
+
+  // Update the CFG.
+  NewBB->addSuccessor(BB);
+  JTBB->removeSuccessor(BB);
+  JTBB->addSuccessor(NewBB);
+
+  ++NumJTInserted;
+  return NewBB;
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.cpp b/lib/Target/ARM/ARMConstantPoolValue.cpp
new file mode 100644
index 0000000..90dd0c7
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -0,0 +1,121 @@
+//===- ARMConstantPoolValue.cpp - ARM constantpool value --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMConstantPoolValue.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/Constant.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Type.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
+// Constructor for Constant-backed entries (globals, block addresses, etc.).
+// CVal is referenced, not owned; S stays NULL for this kind.
+ARMConstantPoolValue::ARMConstantPoolValue(Constant *cval, unsigned id,
+                                           ARMCP::ARMCPKind K,
+                                           unsigned char PCAdj,
+                                           const char *Modif,
+                                           bool AddCA)
+  : MachineConstantPoolValue((const Type*)cval->getType()),
+    CVal(cval), S(NULL), LabelId(id), Kind(K), PCAdjust(PCAdj),
+    Modifier(Modif), AddCurrentAddress(AddCA) {}
+
+// Constructor for external-symbol entries.  The symbol name is strdup'd
+// here and released in the destructor via free().
+ARMConstantPoolValue::ARMConstantPoolValue(LLVMContext &C,
+                                           const char *s, unsigned id,
+                                           unsigned char PCAdj,
+                                           const char *Modif,
+                                           bool AddCA)
+  : MachineConstantPoolValue((const Type*)Type::getInt32Ty(C)),
+    CVal(NULL), S(strdup(s)), LabelId(id), Kind(ARMCP::CPExtSymbol),
+    PCAdjust(PCAdj), Modifier(Modif), AddCurrentAddress(AddCA) {}
+
+// Convenience constructor for a plain (non-PC-adjusted) global value entry.
+// Fix: AddCurrentAddress was previously left uninitialized by this
+// constructor, yet it is read later (e.g. in print()); initialize it to
+// false like the other non-PC-relative defaults.
+ARMConstantPoolValue::ARMConstantPoolValue(GlobalValue *gv, const char *Modif)
+  : MachineConstantPoolValue((const Type*)Type::getInt32Ty(gv->getContext())),
+    CVal(gv), S(NULL), LabelId(0), Kind(ARMCP::CPValue), PCAdjust(0),
+    Modifier(Modif), AddCurrentAddress(false) {}
+
+// getGV - Return the wrapped constant as a GlobalValue, or NULL when this
+// entry wraps something else (or nothing, for external symbols).
+GlobalValue *ARMConstantPoolValue::getGV() const {
+  return dyn_cast_or_null<GlobalValue>(CVal);
+}
+
+// getBlockAddress - Return the wrapped constant as a BlockAddress, or NULL
+// when this entry wraps something else (or nothing, for external symbols).
+BlockAddress *ARMConstantPoolValue::getBlockAddress() const {
+  return dyn_cast_or_null<BlockAddress>(CVal);
+}
+
+/// getExistingMachineCPValue - Search the machine constant pool for an
+/// already-present entry equal to this one with at least the requested
+/// alignment; return its index, or -1 if none exists.
+int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+                                                    unsigned Alignment) {
+  unsigned AlignMask = Alignment - 1;
+  // Fix: bind by reference instead of copying the entire constant table on
+  // every query.
+  const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
+  for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+    if (Constants[i].isMachineConstantPoolEntry() &&
+        (Constants[i].getAlignment() & AlignMask) == 0) {
+      ARMConstantPoolValue *CPV =
+        (ARMConstantPoolValue *)Constants[i].Val.MachineCPVal;
+      // Fix: Kind is not compared here, so one side's S/Modifier may be NULL
+      // while the other's is not; guard strcmp so we never pass it a NULL
+      // (which is undefined behavior) — unequal pointers with a NULL on one
+      // side simply don't match.
+      if (CPV->CVal == CVal &&
+          CPV->LabelId == LabelId &&
+          CPV->PCAdjust == PCAdjust &&
+          (CPV->S == S || (CPV->S && S && strcmp(CPV->S, S) == 0)) &&
+          (CPV->Modifier == Modifier ||
+           (CPV->Modifier && Modifier &&
+            strcmp(CPV->Modifier, Modifier) == 0)))
+        return i;
+    }
+  }
+
+  return -1;
+}
+
+// S was strdup'd by the external-symbol constructor; release it.  For the
+// other kinds S is NULL and free(NULL) is a no-op.
+ARMConstantPoolValue::~ARMConstantPoolValue() {
+  free((void*)S);
+}
+
+// AddSelectionDAGCSEId - Fold the identity of this entry into a FoldingSet
+// ID so SelectionDAG can CSE equivalent constant-pool values.
+// NOTE(review): Kind, Modifier and AddCurrentAddress are not added to the
+// ID here, unlike the equality checks elsewhere in this file — confirm that
+// is intended.
+void
+ARMConstantPoolValue::AddSelectionDAGCSEId(FoldingSetNodeID &ID) {
+  ID.AddPointer(CVal);
+  ID.AddPointer(S);
+  ID.AddInteger(LabelId);
+  ID.AddInteger(PCAdjust);
+}
+
+// hasSameValue - Return true if this entry denotes the same value as ACPV.
+// Because Kind is compared first, S is either NULL on both sides or non-NULL
+// on both sides (only the CPExtSymbol constructor sets S), so the strcmp is
+// only reached with two valid strings.
+bool
+ARMConstantPoolValue::hasSameValue(ARMConstantPoolValue *ACPV) {
+  if (ACPV->Kind == Kind &&
+      ACPV->CVal == CVal &&
+      ACPV->PCAdjust == PCAdjust &&
+      (ACPV->S == S || strcmp(ACPV->S, S) == 0) &&
+      (ACPV->Modifier == Modifier || strcmp(ACPV->Modifier, Modifier) == 0)) {
+    if (ACPV->LabelId == LabelId)
+      return true;
+    // Two PC relative constpool entries containing the same GV address or
+    // external symbols. FIXME: What about blockaddress?
+    if (Kind == ARMCP::CPValue || Kind == ARMCP::CPExtSymbol)
+      return true;
+  }
+  return false;
+}
+
+// dump - Print this entry to stderr for debugging (delegates to print()).
+void ARMConstantPoolValue::dump() const {
+  errs() << "  " << *this;
+}
+
+
+// print - Write a human-readable form of this entry, e.g.
+// "gv(gotoff)-(LPC3+8-.)" for a PC-relative, modified global.
+void ARMConstantPoolValue::print(raw_ostream &O) const {
+  // Constant-backed entries print the value's name; symbol entries print S.
+  if (CVal)
+    O << CVal->getName();
+  else
+    O << S;
+  if (Modifier) O << "(" << Modifier << ")";
+  if (PCAdjust != 0) {
+    O << "-(LPC" << LabelId << "+" << (unsigned)PCAdjust;
+    if (AddCurrentAddress) O << "-.";
+    O << ")";
+  }
+}
diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h
new file mode 100644
index 0000000..741acde
--- /dev/null
+++ b/lib/Target/ARM/ARMConstantPoolValue.h
@@ -0,0 +1,100 @@
+//===- ARMConstantPoolValue.h - ARM constantpool value ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific constantpool value class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+#define LLVM_TARGET_ARM_CONSTANTPOOLVALUE_H
+
+#include "llvm/CodeGen/MachineConstantPool.h"
+
+namespace llvm {
+
+class Constant;
+class BlockAddress;
+class GlobalValue;
+class LLVMContext;
+
+namespace ARMCP {
+  // ARMCPKind - The flavor of entity an ARMConstantPoolValue refers to.
+  enum ARMCPKind {
+    CPValue,        // A global value (see isGlobalValue/getGV).
+    CPExtSymbol,    // An external symbol string (see getSymbol).
+    CPBlockAddress, // A block address (see getBlockAddress).
+    CPLSDA          // Language-specific data area, for exception handling.
+  };
+}
+
+/// ARMConstantPoolValue - ARM specific constantpool value. This is used to
+/// represent PC-relative displacement between the address of the load
+/// instruction and the constant being loaded, i.e. (&GV-(LPIC+8)).
+class ARMConstantPoolValue : public MachineConstantPoolValue {
+  Constant *CVal;          // Constant being loaded.
+  const char *S;           // ExtSymbol being loaded.
+  unsigned LabelId;        // Label id of the load.
+  ARMCP::ARMCPKind Kind;   // Kind of constant.
+  unsigned char PCAdjust;  // Extra adjustment if constantpool is pc-relative.
+                           // 8 for ARM, 4 for Thumb.
+  const char *Modifier;    // GV modifier i.e. (&GV(modifier)-(LPIC+8))
+  bool AddCurrentAddress;  // True if "." must be subtracted in the printout.
+
+public:
+  // Constructor for global values, block addresses, and LSDA entries.
+  ARMConstantPoolValue(Constant *cval, unsigned id,
+                       ARMCP::ARMCPKind Kind = ARMCP::CPValue,
+                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       bool AddCurrentAddress = false);
+  // Constructor for external symbols.
+  ARMConstantPoolValue(LLVMContext &C, const char *s, unsigned id,
+                       unsigned char PCAdj = 0, const char *Modifier = NULL,
+                       bool AddCurrentAddress = false);
+  ARMConstantPoolValue(GlobalValue *GV, const char *Modifier);
+  ARMConstantPoolValue();
+  ~ARMConstantPoolValue();
+
+  GlobalValue *getGV() const;
+  const char *getSymbol() const { return S; }
+  BlockAddress *getBlockAddress() const;
+  const char *getModifier() const { return Modifier; }
+  bool hasModifier() const { return Modifier != NULL; }
+  bool mustAddCurrentAddress() const { return AddCurrentAddress; }
+  unsigned getLabelId() const { return LabelId; }
+  unsigned char getPCAdjustment() const { return PCAdjust; }
+  // Kind predicates.  All are const for consistency; the previous revision
+  // left isBlockAddress/isLSDA non-const, which blocked calls through a
+  // const pointer even though they do not mutate the object.
+  bool isGlobalValue() const { return Kind == ARMCP::CPValue; }
+  bool isExtSymbol() const { return Kind == ARMCP::CPExtSymbol; }
+  bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; }
+  bool isLSDA() const { return Kind == ARMCP::CPLSDA; }
+
+  virtual unsigned getRelocationInfo() const {
+    // FIXME: This is conservatively claiming that these entries require a
+    // relocation, we may be able to do better than this.
+    return 2;
+  }
+
+  virtual int getExistingMachineCPValue(MachineConstantPool *CP,
+                                        unsigned Alignment);
+
+  virtual void AddSelectionDAGCSEId(FoldingSetNodeID &ID);
+
+  /// hasSameValue - Return true if this ARM constpool value
+  /// can share the same constantpool entry as another ARM constpool value.
+  bool hasSameValue(ARMConstantPoolValue *ACPV);
+
+  // Convenience overload that tolerates a null stream pointer.
+  void print(raw_ostream *O) const { if (O) print(*O); }
+  void print(raw_ostream &O) const;
+  void dump() const;
+};
+
+// Stream-insertion support so constant-pool values print with <<.
+inline raw_ostream &operator<<(raw_ostream &O, const ARMConstantPoolValue &V) {
+  V.print(O);
+  return O;
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
new file mode 100644
index 0000000..1b8727d
--- /dev/null
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -0,0 +1,128 @@
+//===-- ARMExpandPseudoInsts.cpp - Expand pseudo instructions -----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands pseudo instructions into target
+// instructions to allow proper scheduling, if-conversion, and other late
+// optimizations. This pass should be run after register allocation but before
+// the post-regalloc scheduling pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-pseudo"
+#include "ARM.h"
+#include "ARMBaseInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+namespace {
+  /// ARMExpandPseudo - Machine-function pass that rewrites pseudo
+  /// instructions (tLDRpci_pic / t2LDRpci_pic / t2MOVi32imm) into real
+  /// target instructions after register allocation.
+  class ARMExpandPseudo : public MachineFunctionPass {
+  public:
+    static char ID;
+    ARMExpandPseudo() : MachineFunctionPass(&ID) {}
+
+    const TargetInstrInfo *TII;  // Cached per-function in runOnMachineFunction.
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM pseudo instruction expansion pass";
+    }
+
+  private:
+    // ExpandMBB - Expand all pseudos in one block; returns true on change.
+    bool ExpandMBB(MachineBasicBlock &MBB);
+  };
+  char ARMExpandPseudo::ID = 0;
+}
+
+// ExpandMBB - Walk one basic block, replacing each recognized pseudo
+// instruction with the equivalent sequence of real instructions.  Returns
+// true if any instruction was rewritten.
+bool ARMExpandPseudo::ExpandMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineInstr &MI = *MBBI;
+    // Capture the next iterator up front: MI may be erased below.
+    MachineBasicBlock::iterator NMBBI = llvm::next(MBBI);
+
+    unsigned Opcode = MI.getOpcode();
+    switch (Opcode) {
+    default: break;
+    // PIC constant-pool load pseudos: expand into a plain constant-pool
+    // load followed by a tPICADD of the pic label address.
+    case ARM::tLDRpci_pic:
+    case ARM::t2LDRpci_pic: {
+      unsigned NewLdOpc = (Opcode == ARM::tLDRpci_pic)
+        ? ARM::tLDRpci : ARM::t2LDRpci;
+      unsigned DstReg = MI.getOperand(0).getReg();
+      // If the destination is dead the pseudo has no effect; just erase it.
+      if (!MI.getOperand(0).isDead()) {
+        MachineInstr *NewMI =
+          AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(),
+                                 TII->get(NewLdOpc), DstReg)
+                         .addOperand(MI.getOperand(1)));
+        // Keep the original load's memory operands on the new load.
+        NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::tPICADD))
+          .addReg(DstReg, getDefRegState(true))
+          .addReg(DstReg)
+          .addOperand(MI.getOperand(2));
+      }
+      MI.eraseFromParent();
+      Modified = true;
+      break;
+    }
+    // 32-bit immediate (or global address) materialization pseudo: expand
+    // into a MOVW (low 16 bits) / MOVT (high 16 bits) pair.
+    case ARM::t2MOVi32imm: {
+      unsigned DstReg = MI.getOperand(0).getReg();
+      // Dead destination: nothing to materialize.
+      if (!MI.getOperand(0).isDead()) {
+        const MachineOperand &MO = MI.getOperand(1);
+        MachineInstrBuilder LO16, HI16;
+
+        LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVi16),
+                       DstReg);
+        HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::t2MOVTi16))
+          .addReg(DstReg, getDefRegState(true)).addReg(DstReg);
+
+        if (MO.isImm()) {
+          // Plain immediate: split into halves.
+          unsigned Imm = MO.getImm();
+          unsigned Lo16 = Imm & 0xffff;
+          unsigned Hi16 = (Imm >> 16) & 0xffff;
+          LO16 = LO16.addImm(Lo16);
+          HI16 = HI16.addImm(Hi16);
+        } else {
+          // Global address: re-target with :lower16:/:upper16: flags.
+          GlobalValue *GV = MO.getGlobal();
+          unsigned TF = MO.getTargetFlags();
+          LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
+          HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+          // FIXME: What about memoperands?
+        }
+        AddDefaultPred(LO16);
+        AddDefaultPred(HI16);
+      }
+      MI.eraseFromParent();
+      Modified = true;
+    }
+    // FIXME(review): stale note? t2MOVi32imm is handled above; the ARM-mode
+    // MOVi32imm pseudo is what remains unexpanded here.
+    }
+    MBBI = NMBBI;
+  }
+
+  return Modified;
+}
+
+// runOnMachineFunction - Expand pseudo instructions in every basic block of
+// the function.  Returns true if anything was changed.
+bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+
+  bool AnyChange = false;
+  MachineFunction::iterator BB = MF.begin(), BBEnd = MF.end();
+  while (BB != BBEnd) {
+    if (ExpandMBB(*BB))
+      AnyChange = true;
+    ++BB;
+  }
+  return AnyChange;
+}
+
+/// createARMExpandPseudoPass - returns an instance of the pseudo instruction
+/// expansion pass.  Ownership transfers to the caller (normally the pass
+/// manager, which deletes it).
+FunctionPass *llvm::createARMExpandPseudoPass() {
+  return new ARMExpandPseudo();
+}
diff --git a/lib/Target/ARM/ARMFrameInfo.h b/lib/Target/ARM/ARMFrameInfo.h
new file mode 100644
index 0000000..d5dae24
--- /dev/null
+++ b/lib/Target/ARM/ARMFrameInfo.h
@@ -0,0 +1,32 @@
+//===-- ARMTargetFrameInfo.h - Define TargetFrameInfo for ARM ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM-specific parameters of the TargetFrameInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_FRAMEINFO_H
+#define ARM_FRAMEINFO_H
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "llvm/Target/TargetFrameInfo.h"
+
+namespace llvm {
+
+// ARMFrameInfo - ARM stack-frame description: the stack grows down and the
+// required alignment comes from the subtarget.  The trailing (0, 4) are
+// presumably the local-area offset and transient stack alignment — confirm
+// against TargetFrameInfo's constructor.
+class ARMFrameInfo : public TargetFrameInfo {
+public:
+  explicit ARMFrameInfo(const ARMSubtarget &ST)
+    : TargetFrameInfo(StackGrowsDown, ST.getStackAlignment(), 0, 4) {
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
new file mode 100644
index 0000000..a458269
--- /dev/null
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -0,0 +1,1968 @@
+//===-- ARMISelDAGToDAG.cpp - A dag to dag inst selector for ARM ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMISelLowering.h"
+#include "ARMTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===--------------------------------------------------------------------===//
+/// ARMDAGToDAGISel - ARM specific code to select ARM machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class ARMDAGToDAGISel : public SelectionDAGISel {
+  ARMBaseTargetMachine &TM;
+
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const ARMSubtarget *Subtarget;
+
+public:
+  explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
+                           CodeGenOpt::Level OptLevel)
+    : SelectionDAGISel(tm, OptLevel), TM(tm),
+    Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
+  }
+
+  virtual const char *getPassName() const {
+    return "ARM Instruction Selection";
+  }
+
+  /// getI32Imm - Return a target constant of type i32 with the specified
+  /// value.
+  inline SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+  SDNode *Select(SDNode *N);
+  virtual void InstructionSelect();
+  bool SelectShifterOperandReg(SDNode *Op, SDValue N, SDValue &A,
+                               SDValue &B, SDValue &C);
+  bool SelectAddrMode2(SDNode *Op, SDValue N, SDValue &Base,
+                       SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode2Offset(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode3(SDNode *Op, SDValue N, SDValue &Base,
+                       SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                             SDValue &Offset, SDValue &Opc);
+  bool SelectAddrMode4(SDNode *Op, SDValue N, SDValue &Addr,
+                       SDValue &Mode);
+  bool SelectAddrMode5(SDNode *Op, SDValue N, SDValue &Base,
+                       SDValue &Offset);
+  bool SelectAddrMode6(SDNode *Op, SDValue N, SDValue &Addr, SDValue &Update,
+                       SDValue &Opc, SDValue &Align);
+
+  bool SelectAddrModePC(SDNode *Op, SDValue N, SDValue &Offset,
+                        SDValue &Label);
+
+  bool SelectThumbAddrModeRR(SDNode *Op, SDValue N, SDValue &Base,
+                             SDValue &Offset);
+  bool SelectThumbAddrModeRI5(SDNode *Op, SDValue N, unsigned Scale,
+                              SDValue &Base, SDValue &OffImm,
+                              SDValue &Offset);
+  bool SelectThumbAddrModeS1(SDNode *Op, SDValue N, SDValue &Base,
+                             SDValue &OffImm, SDValue &Offset);
+  bool SelectThumbAddrModeS2(SDNode *Op, SDValue N, SDValue &Base,
+                             SDValue &OffImm, SDValue &Offset);
+  bool SelectThumbAddrModeS4(SDNode *Op, SDValue N, SDValue &Base,
+                             SDValue &OffImm, SDValue &Offset);
+  bool SelectThumbAddrModeSP(SDNode *Op, SDValue N, SDValue &Base,
+                             SDValue &OffImm);
+
+  bool SelectT2ShifterOperandReg(SDNode *Op, SDValue N,
+                                 SDValue &BaseReg, SDValue &Opc);
+  bool SelectT2AddrModeImm12(SDNode *Op, SDValue N, SDValue &Base,
+                             SDValue &OffImm);
+  bool SelectT2AddrModeImm8(SDNode *Op, SDValue N, SDValue &Base,
+                            SDValue &OffImm);
+  bool SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                 SDValue &OffImm);
+  bool SelectT2AddrModeImm8s4(SDNode *Op, SDValue N, SDValue &Base,
+                              SDValue &OffImm);
+  bool SelectT2AddrModeSoReg(SDNode *Op, SDValue N, SDValue &Base,
+                             SDValue &OffReg, SDValue &ShImm);
+
+  // Include the pieces autogenerated from the target description.
+#include "ARMGenDAGISel.inc"
+
+private:
+  /// SelectARMIndexedLoad - Indexed (pre/post inc/dec) load matching code for
+  /// ARM.
+  SDNode *SelectARMIndexedLoad(SDNode *N);
+  SDNode *SelectT2IndexedLoad(SDNode *N);
+
+  /// SelectDYN_ALLOC - Select dynamic alloc for Thumb.
+  SDNode *SelectDYN_ALLOC(SDNode *N);
+
+  /// SelectVLD - Select NEON load intrinsics.  NumVecs should
+  /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// loads of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs == 2, QOpcodes1 is not used.
+  SDNode *SelectVLD(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
+                    unsigned *QOpcodes0, unsigned *QOpcodes1);
+
+  /// SelectVST - Select NEON store intrinsics.  NumVecs should
+  /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// stores of D registers and even subregs and odd subregs of Q registers.
+  /// For NumVecs == 2, QOpcodes1 is not used.
+  SDNode *SelectVST(SDNode *N, unsigned NumVecs, unsigned *DOpcodes,
+                    unsigned *QOpcodes0, unsigned *QOpcodes1);
+
+  /// SelectVLDSTLane - Select NEON load/store lane intrinsics.  NumVecs should
+  /// be 2, 3 or 4.  The opcode arrays specify the instructions used for
+  /// load/store of D registers and even subregs and odd subregs of Q registers.
+  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, unsigned NumVecs,
+                          unsigned *DOpcodes, unsigned *QOpcodes0,
+                          unsigned *QOpcodes1);
+
+  /// SelectV6T2BitfieldExtractOp - Select SBFX/UBFX instructions for ARM.
+  SDNode *SelectV6T2BitfieldExtractOp(SDNode *N, unsigned Opc);
+
+  /// SelectCMOVOp - Select CMOV instructions for ARM.
+  SDNode *SelectCMOVOp(SDNode *N);
+  SDNode *SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                              ARMCC::CondCodes CCVal, SDValue CCR,
+                              SDValue InFlag);
+  SDNode *SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                               ARMCC::CondCodes CCVal, SDValue CCR,
+                               SDValue InFlag);
+  SDNode *SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                              ARMCC::CondCodes CCVal, SDValue CCR,
+                              SDValue InFlag);
+  SDNode *SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                               ARMCC::CondCodes CCVal, SDValue CCR,
+                               SDValue InFlag);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                            char ConstraintCode,
+                                            std::vector<SDValue> &OutOps);
+
+  /// PairDRegs - Insert a pair of double registers into an implicit def to
+  /// form a quad register.
+  SDNode *PairDRegs(EVT VT, SDValue V0, SDValue V1);
+};
+}
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit
+/// constant operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  if (N->getOpcode() != ISD::Constant || N->getValueType(0) != MVT::i32)
+    return false;
+  Imm = cast<ConstantSDNode>(N)->getZExtValue();
+  return true;
+}
+
+// isInt32Immediate - This method tests to see if the operand is a 32-bit
+// constant. If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has an immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  return N->getOpcode() == Opc &&
+         isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+
+// InstructionSelect - Run instruction selection over the whole DAG, then
+// delete any nodes left dead by the replacement of selected nodes.
+void ARMDAGToDAGISel::InstructionSelect() {
+  SelectRoot(*CurDAG);
+  CurDAG->RemoveDeadNodes();
+}
+
+// SelectShifterOperandReg - Match an ARM "so_reg" shifter operand: BaseReg
+// shifted by either an immediate or another register.  On success, Opc
+// encodes the shift kind and immediate amount via ARM_AM::getSORegOpc.
+bool ARMDAGToDAGISel::SelectShifterOperandReg(SDNode *Op,
+                                              SDValue N,
+                                              SDValue &BaseReg,
+                                              SDValue &ShReg,
+                                              SDValue &Opc) {
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+
+  // Don't match base register only case. That is matched to a separate
+  // lower complexity pattern with explicit register operand.
+  if (ShOpcVal == ARM_AM::no_shift) return false;
+
+  BaseReg = N.getOperand(0);
+  unsigned ShImmVal = 0;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    // Shift-by-immediate: the shift-register slot gets the zero register.
+    ShReg = CurDAG->getRegister(0, MVT::i32);
+    ShImmVal = RHS->getZExtValue() & 31;
+  } else {
+    // Shift-by-register: no immediate amount.
+    ShReg = N.getOperand(1);
+  }
+  Opc = CurDAG->getTargetConstant(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal),
+                                  MVT::i32);
+  return true;
+}
+
+// SelectAddrMode2 - Match an ARM addressing-mode 2 memory operand:
+// [Base, +/- imm12] or [Base, +/- OffsetReg (optionally shifted)].  Opc
+// packs the add/sub direction, shift amount and shift kind via
+// ARM_AM::getAM2Opc.
+bool ARMDAGToDAGISel::SelectAddrMode2(SDNode *Op, SDValue N,
+                                      SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::MUL) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      // X * [3,5,9] -> X + X * [2,4,8] etc.
+      int RHSC = (int)RHS->getZExtValue();
+      if (RHSC & 1) {
+        RHSC = RHSC & ~1;
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        if (isPowerOf2_32(RHSC)) {
+          // Base and offset are the same register: X + (X << log2(RHSC)).
+          unsigned ShAmt = Log2_32(RHSC);
+          Base = Offset = N.getOperand(0);
+          Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt,
+                                                            ARM_AM::lsl),
+                                          MVT::i32);
+          return true;
+        }
+      }
+    }
+  }
+
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+    // Not an add/sub: use the whole value as the base with a zero offset.
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               !(Subtarget->useMovt() &&
+                 N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      // Look through the wrapper — except when the global will be
+      // materialized with movw/movt instead of a constant-pool load.
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0,
+                                                      ARM_AM::no_shift),
+                                    MVT::i32);
+    return true;
+  }
+
+  // Match simple R +/- imm12 operands.
+  if (N.getOpcode() == ISD::ADD)
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if ((RHSC >= 0 && RHSC < 0x1000) ||
+          (RHSC < 0 && RHSC > -0x1000)) { // 12 bits.
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        }
+        Offset = CurDAG->getRegister(0, MVT::i32);
+
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, RHSC,
+                                                          ARM_AM::no_shift),
+                                        MVT::i32);
+        return true;
+      }
+    }
+
+  // Otherwise this is R +/- [possibly shifted] R.
+  ARM_AM::AddrOpc AddSub = N.getOpcode() == ISD::ADD ? ARM_AM::add:ARM_AM::sub;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(1));
+  unsigned ShAmt = 0;
+
+  Base   = N.getOperand(0);
+  Offset = N.getOperand(1);
+
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh =
+           dyn_cast<ConstantSDNode>(N.getOperand(1).getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      Offset = N.getOperand(1).getOperand(0);
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  // Try matching (R shl C) + (R).
+  if (N.getOpcode() == ISD::ADD && ShOpcVal == ARM_AM::no_shift) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0));
+    if (ShOpcVal != ARM_AM::no_shift) {
+      // Check to see if the RHS of the shift is a constant, if not, we can't
+      // fold it.
+      if (ConstantSDNode *Sh =
+          dyn_cast<ConstantSDNode>(N.getOperand(0).getOperand(1))) {
+        ShAmt = Sh->getZExtValue();
+        Offset = N.getOperand(0).getOperand(0);
+        Base = N.getOperand(1);
+      } else {
+        ShOpcVal = ARM_AM::no_shift;
+      }
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+// SelectAddrMode2Offset - Match the offset operand of a pre/post-indexed
+// AM2 load/store.  The add/sub direction is taken from the parent memory
+// node's addressing mode, not from N itself.
+bool ARMDAGToDAGISel::SelectAddrMode2Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    // Constant offset: encode it as imm12 with the zero register.
+    int Val = (int)C->getZExtValue();
+    if (Val >= 0 && Val < 0x1000) { // 12 bits.
+      Offset = CurDAG->getRegister(0, MVT::i32);
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, Val,
+                                                        ARM_AM::no_shift),
+                                      MVT::i32);
+      return true;
+    }
+  }
+
+  // Register offset, possibly with a constant shift folded in.
+  Offset = N;
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+  unsigned ShAmt = 0;
+  if (ShOpcVal != ARM_AM::no_shift) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      Offset = N.getOperand(0);
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(AddSub, ShAmt, ShOpcVal),
+                                  MVT::i32);
+  return true;
+}
+
+
+// SelectAddrMode3 - Match an ARM addressing-mode 3 memory operand:
+// [Base, +/- Reg] or [Base, +/- imm8], encoded via ARM_AM::getAM3Opc.
+bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N,
+                                      SDValue &Base, SDValue &Offset,
+                                      SDValue &Opc) {
+  if (N.getOpcode() == ISD::SUB) {
+    // X - C  is canonicalize to X + -C, no need to handle it here.
+    Base = N.getOperand(0);
+    Offset = N.getOperand(1);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32);
+    return true;
+  }
+
+  if (N.getOpcode() != ISD::ADD) {
+    // Not an add: whole value is the base, zero offset.
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    }
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if ((RHSC >= 0 && RHSC < 256) ||
+        (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      Offset = CurDAG->getRegister(0, MVT::i32);
+
+      ARM_AM::AddrOpc AddSub = ARM_AM::add;
+      if (RHSC < 0) {
+        AddSub = ARM_AM::sub;
+        RHSC = - RHSC;
+      }
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, RHSC),MVT::i32);
+      return true;
+    }
+  }
+
+  // Fall back to register + register.
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32);
+  return true;
+}
+
+// SelectAddrMode3Offset - Match the offset operand of a pre/post-indexed
+// AM3 load/store; direction comes from the parent node's addressing mode.
+bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
+                                            SDValue &Offset, SDValue &Opc) {
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  ARM_AM::AddrOpc AddSub = (AM == ISD::PRE_INC || AM == ISD::POST_INC)
+    ? ARM_AM::add : ARM_AM::sub;
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    // imm8 constant offset: encode with the zero register as Offset.
+    int Val = (int)C->getZExtValue();
+    if (Val >= 0 && Val < 256) {
+      Offset = CurDAG->getRegister(0, MVT::i32);
+      Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, Val), MVT::i32);
+      return true;
+    }
+  }
+
+  // Register offset.
+  Offset = N;
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(AddSub, 0), MVT::i32);
+  return true;
+}
+
+// SelectAddrMode4 - AM4 takes the address as-is with a zero mode operand;
+// this matcher always succeeds.
+bool ARMDAGToDAGISel::SelectAddrMode4(SDNode *Op, SDValue N,
+                                      SDValue &Addr, SDValue &Mode) {
+  Addr = N;
+  Mode = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+// SelectAddrMode5 - Match an AM5 memory operand: [Base, +/- imm8], where
+// the immediate is implicitly scaled by 4 (it must be word-aligned).
+bool ARMDAGToDAGISel::SelectAddrMode5(SDNode *Op, SDValue N,
+                                      SDValue &Base, SDValue &Offset) {
+  if (N.getOpcode() != ISD::ADD) {
+    // Not an add: whole value is the base, zero offset.
+    Base = N;
+    if (N.getOpcode() == ISD::FrameIndex) {
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               !(Subtarget->useMovt() &&
+                 N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      // Look through the wrapper unless the global is materialized via
+      // movw/movt.
+      Base = N.getOperand(0);
+    }
+    Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                       MVT::i32);
+    return true;
+  }
+
+  // If the RHS is +/- imm8, fold into addr mode.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if ((RHSC & 3) == 0) {  // The constant is implicitly multiplied by 4.
+      RHSC >>= 2;
+      if ((RHSC >= 0 && RHSC < 256) ||
+          (RHSC < 0 && RHSC > -256)) { // note -256 itself isn't allowed.
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        }
+
+        ARM_AM::AddrOpc AddSub = ARM_AM::add;
+        if (RHSC < 0) {
+          AddSub = ARM_AM::sub;
+          RHSC = - RHSC;
+        }
+        Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
+                                           MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  // Could not fold the add: use it whole as the base with zero offset.
+  Base = N;
+  Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
+                                     MVT::i32);
+  return true;
+}
+
+// SelectAddrMode6 - Match an AM6 operand: the address as-is, with the
+// writeback register, writeback flag and alignment all defaulted off.
+// Always succeeds.
+bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Op, SDValue N,
+                                      SDValue &Addr, SDValue &Update,
+                                      SDValue &Opc, SDValue &Align) {
+  Addr = N;
+  // Default to no writeback.
+  Update = CurDAG->getRegister(0, MVT::i32);
+  Opc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(false), MVT::i32);
+  // Default to no alignment.
+  Align = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+// SelectAddrModePC - Match a single-use (ARMISD::PIC_ADD base, label-id)
+// node, splitting it into the base offset and the pic label constant.
+bool ARMDAGToDAGISel::SelectAddrModePC(SDNode *Op, SDValue N,
+                                       SDValue &Offset, SDValue &Label) {
+  if (N.getOpcode() == ARMISD::PIC_ADD && N.hasOneUse()) {
+    Offset = N.getOperand(0);
+    SDValue N1 = N.getOperand(1);
+    Label  = CurDAG->getTargetConstant(cast<ConstantSDNode>(N1)->getZExtValue(),
+                                       MVT::i32);
+    return true;
+  }
+  return false;
+}
+
+// SelectThumbAddrModeRR - Match a Thumb [reg, reg] address.  An ADD splits
+// directly into base + offset; the only non-ADD form accepted is a literal
+// zero, matched as [0, 0].  (A previously-declared DebugLoc local was never
+// used — parent load/store debug locations are handled by the callers — so
+// it has been removed.)
+bool ARMDAGToDAGISel::SelectThumbAddrModeRR(SDNode *Op, SDValue N,
+                                            SDValue &Base, SDValue &Offset){
+  if (N.getOpcode() != ISD::ADD) {
+    // Only a constant zero is representable without an add.
+    ConstantSDNode *NC = dyn_cast<ConstantSDNode>(N);
+    if (!NC || NC->getZExtValue() != 0)
+      return false;
+
+    Base = Offset = N;
+    return true;
+  }
+
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  return true;
+}
+
+// SelectThumbAddrModeRI5 - Match a Thumb1 [reg + imm5 * Scale] or
+// [reg + reg] address.  Exactly one of OffImm / Offset is meaningful on
+// success; the other is set to zero.
+bool
+ARMDAGToDAGISel::SelectThumbAddrModeRI5(SDNode *Op, SDValue N,
+                                        unsigned Scale, SDValue &Base,
+                                        SDValue &OffImm, SDValue &Offset) {
+  if (Scale == 4) {
+    // Word accesses have dedicated SP-relative and constant-pool forms;
+    // decline so the better patterns get a chance.
+    SDValue TmpBase, TmpOffImm;
+    if (SelectThumbAddrModeSP(Op, N, TmpBase, TmpOffImm))
+      return false;  // We want to select tLDRspi / tSTRspi instead.
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        N.getOperand(0).getOpcode() == ISD::TargetConstantPool)
+      return false;  // We want to select tLDRpci instead.
+  }
+
+  if (N.getOpcode() != ISD::ADD) {
+    // Not an add: look through the wrapper (unless movw/movt will
+    // materialize the global) and use a zero offset.
+    if (N.getOpcode() == ARMISD::Wrapper &&
+        !(Subtarget->useMovt() &&
+          N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      Base = N.getOperand(0);
+    } else
+      Base = N;
+
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  // Thumb does not have [sp, r] address mode.
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  RegisterSDNode *RHSR = dyn_cast<RegisterSDNode>(N.getOperand(1));
+  if ((LHSR && LHSR->getReg() == ARM::SP) ||
+      (RHSR && RHSR->getReg() == ARM::SP)) {
+    Base = N;
+    Offset = CurDAG->getRegister(0, MVT::i32);
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  // If the RHS is + imm5 * scale, fold into addr mode.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if ((RHSC & (Scale-1)) == 0) {  // The constant is implicitly multiplied.
+      RHSC /= Scale;
+      if (RHSC >= 0 && RHSC < 32) {
+        Base = N.getOperand(0);
+        Offset = CurDAG->getRegister(0, MVT::i32);
+        OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  // Fall back to the [reg + reg] form.
+  Base = N.getOperand(0);
+  Offset = N.getOperand(1);
+  OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+/// SelectThumbAddrModeS1 - Byte-sized (scale 1) form of the Thumb1
+/// reg + imm5 addressing mode.
+bool ARMDAGToDAGISel::SelectThumbAddrModeS1(SDNode *Op, SDValue N,
+                                            SDValue &Base, SDValue &OffImm,
+                                            SDValue &Offset) {
+  return SelectThumbAddrModeRI5(Op, N, 1, Base, OffImm, Offset);
+}
+
+/// SelectThumbAddrModeS2 - Halfword-sized (scale 2) form of the Thumb1
+/// reg + imm5 addressing mode.
+bool ARMDAGToDAGISel::SelectThumbAddrModeS2(SDNode *Op, SDValue N,
+                                            SDValue &Base, SDValue &OffImm,
+                                            SDValue &Offset) {
+  return SelectThumbAddrModeRI5(Op, N, 2, Base, OffImm, Offset);
+}
+
+/// SelectThumbAddrModeS4 - Word-sized (scale 4) form of the Thumb1
+/// reg + imm5 addressing mode.
+bool ARMDAGToDAGISel::SelectThumbAddrModeS4(SDNode *Op, SDValue N,
+                                            SDValue &Base, SDValue &OffImm,
+                                            SDValue &Offset) {
+  return SelectThumbAddrModeRI5(Op, N, 4, Base, OffImm, Offset);
+}
+
+/// SelectThumbAddrModeSP - Match a Thumb1 SP- or frame-index-relative
+/// address of the form base + (imm8 * 4). Returns false if the address
+/// cannot take this form.
+bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDNode *Op, SDValue N,
+                                           SDValue &Base, SDValue &OffImm) {
+  // A bare frame index is base + 0.
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  // The LHS must be either a frame index or the SP register itself.
+  RegisterSDNode *LHSR = dyn_cast<RegisterSDNode>(N.getOperand(0));
+  if (N.getOperand(0).getOpcode() == ISD::FrameIndex ||
+      (LHSR && LHSR->getReg() == ARM::SP)) {
+    // If the RHS is + imm8 * scale, fold into addr mode.
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if ((RHSC & 3) == 0) {  // The constant is implicitly multiplied.
+        RHSC >>= 2;
+        if (RHSC >= 0 && RHSC < 256) {
+          // Encodable as an unsigned 8-bit word offset.
+          Base = N.getOperand(0);
+          if (Base.getOpcode() == ISD::FrameIndex) {
+            int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+            Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+          }
+          OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+          return true;
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
+bool ARMDAGToDAGISel::SelectT2ShifterOperandReg(SDNode *Op, SDValue N,
+                                                SDValue &BaseReg,
+                                                SDValue &Opc) {
+  // A plain register (no shift) is matched by a separate, lower-complexity
+  // pattern with an explicit register operand, so reject it here.
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N);
+  if (ShOpcVal == ARM_AM::no_shift)
+    return false;
+
+  BaseReg = N.getOperand(0);
+
+  // Only an immediate shift amount can be encoded in the Thumb2 shifter
+  // operand; a register-specified amount cannot be folded.
+  ConstantSDNode *ShAmtNode = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!ShAmtNode)
+    return false;
+
+  unsigned ShImmVal = ShAmtNode->getZExtValue() & 31;
+  Opc = getI32Imm(ARM_AM::getSORegOpc(ShOpcVal, ShImmVal));
+  return true;
+}
+
+/// SelectT2AddrModeImm12 - Match a Thumb2 base + unsigned imm12 address.
+/// This is the catch-all T2 addressing mode: it only fails when the address
+/// should be handled by t2LDRi8 (negative offset) or t2LDRpci instead.
+bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDNode *Op, SDValue N,
+                                            SDValue &Base, SDValue &OffImm) {
+  // Match simple R + imm12 operands.
+
+  // Base only.
+  if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB) {
+    if (N.getOpcode() == ISD::FrameIndex) {
+      // Match frame index...
+      int FI = cast<FrameIndexSDNode>(N)->getIndex();
+      Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+      return true;
+    } else if (N.getOpcode() == ARMISD::Wrapper &&
+               !(Subtarget->useMovt() &&
+                 N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
+      // Strip the wrapper except around movt/movw-materialized globals.
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::TargetConstantPool)
+        return false;  // We want to select t2LDRpci instead.
+    } else
+      Base = N;
+    OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    if (SelectT2AddrModeImm8(Op, N, Base, OffImm))
+      // Let t2LDRi8 handle (R - imm8).
+      return false;
+
+    int RHSC = (int)RHS->getZExtValue();
+    // Normalize (sub R, C) to an addition of -C before the range check.
+    if (N.getOpcode() == ISD::SUB)
+      RHSC = -RHSC;
+
+    if (RHSC >= 0 && RHSC < 0x1000) { // 12 bits (unsigned)
+      Base   = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  // Base only.
+  Base = N;
+  OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+/// SelectT2AddrModeImm8 - Match a Thumb2 base + negative imm8 address
+/// (offset in [-255, -1]). Non-negative offsets are rejected so that
+/// t2LDRi12 can handle them.
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDNode *Op, SDValue N,
+                                           SDValue &Base, SDValue &OffImm) {
+  // Match simple R - imm8 operands.
+  if (N.getOpcode() == ISD::ADD || N.getOpcode() == ISD::SUB) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getSExtValue();
+      // Normalize (sub R, C) to an addition of -C before the range check.
+      if (N.getOpcode() == ISD::SUB)
+        RHSC = -RHSC;
+
+      if ((RHSC >= -255) && (RHSC < 0)) { // 8 bits (always negative)
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// SelectT2AddrModeImm8Offset - Match the imm8 offset operand of a Thumb2
+/// pre/post-indexed load or store. Op must be a LOAD or STORE node; the
+/// offset is negated for decrementing address modes.
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8Offset(SDNode *Op, SDValue N,
+                                                 SDValue &OffImm){
+  unsigned Opcode = Op->getOpcode();
+  ISD::MemIndexedMode AM = (Opcode == ISD::LOAD)
+    ? cast<LoadSDNode>(Op)->getAddressingMode()
+    : cast<StoreSDNode>(Op)->getAddressingMode();
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N)) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC >= 0 && RHSC < 0x100) { // 8 bits.
+      // Incrementing modes keep the offset as-is; decrementing modes
+      // encode it as a negative immediate.
+      OffImm = ((AM == ISD::PRE_INC) || (AM == ISD::POST_INC))
+        ? CurDAG->getTargetConstant(RHSC, MVT::i32)
+        : CurDAG->getTargetConstant(-RHSC, MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// SelectT2AddrModeImm8s4 - Match a Thumb2 base + (imm8 << 2) address,
+/// i.e. a word-aligned byte offset in (-0x400, 0x400). Used by doubleword
+/// load/store forms.
+bool ARMDAGToDAGISel::SelectT2AddrModeImm8s4(SDNode *Op, SDValue N,
+                                             SDValue &Base, SDValue &OffImm) {
+  if (N.getOpcode() == ISD::ADD) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      // Offset must be a multiple of 4 and fit in 8 bits after scaling.
+      if (((RHSC & 0x3) == 0) &&
+          ((RHSC >= 0 && RHSC < 0x400) || (RHSC < 0 && RHSC > -0x400))) { // 8 bits.
+        Base   = N.getOperand(0);
+        OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
+        return true;
+      }
+    }
+  } else if (N.getOpcode() == ISD::SUB) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int RHSC = (int)RHS->getZExtValue();
+      if (((RHSC & 0x3) == 0) && (RHSC >= 0 && RHSC < 0x400)) { // 8 bits.
+        Base   = N.getOperand(0);
+        // Subtraction is encoded as a negative immediate offset.
+        OffImm = CurDAG->getTargetConstant(-RHSC, MVT::i32);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+/// SelectT2AddrModeSoReg - Match a Thumb2 register + shifted-register
+/// address: (R + R) or (R + (R << [1,2,3])). Constant offsets are left for
+/// the immediate addressing modes.
+bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDNode *Op, SDValue N,
+                                            SDValue &Base,
+                                            SDValue &OffReg, SDValue &ShImm) {
+  // (R - imm8) should be handled by t2LDRi8. The rest are handled by t2LDRi12.
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  // Leave (R + imm12) for t2LDRi12, (R - imm8) for t2LDRi8.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC >= 0 && RHSC < 0x1000) // 12 bits (unsigned)
+      return false;
+    else if (RHSC < 0 && RHSC >= -255) // 8 bits
+      return false;
+  }
+
+  // Look for (R + R) or (R + (R << [1,2,3])).
+  unsigned ShAmt = 0;
+  Base   = N.getOperand(0);
+  OffReg = N.getOperand(1);
+
+  // Swap if it is ((R << c) + R).
+  ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(OffReg);
+  if (ShOpcVal != ARM_AM::lsl) {
+    ShOpcVal = ARM_AM::getShiftOpcForNode(Base);
+    if (ShOpcVal == ARM_AM::lsl)
+      std::swap(Base, OffReg);
+  }
+
+  if (ShOpcVal == ARM_AM::lsl) {
+    // Check to see if the RHS of the shift is a constant, if not, we can't fold
+    // it.
+    if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(OffReg.getOperand(1))) {
+      ShAmt = Sh->getZExtValue();
+      if (ShAmt >= 4) {
+        // Only shifts of 1-3 are encodable; keep the shift node as the
+        // plain offset register instead.
+        ShAmt = 0;
+        ShOpcVal = ARM_AM::no_shift;
+      } else
+        OffReg = OffReg.getOperand(0);
+    } else {
+      ShOpcVal = ARM_AM::no_shift;
+    }
+  }
+
+  ShImm = CurDAG->getTargetConstant(ShAmt, MVT::i32);
+
+  return true;
+}
+
+//===--------------------------------------------------------------------===//
+
+/// getAL - Returns a ARMCC::AL immediate node.
+static inline SDValue getAL(SelectionDAG *CurDAG) {
+  // The "always" condition code as a target constant, used to build the
+  // default predicate operands of ARM machine nodes.
+  return CurDAG->getTargetConstant(static_cast<uint64_t>(ARMCC::AL), MVT::i32);
+}
+
+/// SelectARMIndexedLoad - Try to select an ARM-mode pre/post-indexed load
+/// (LDR/LDRB/LDRH/LDRSH/LDRSB _PRE/_POST). Returns the new machine node, or
+/// NULL if N is unindexed or the offset does not fit the addressing mode.
+SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return NULL;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  SDValue Offset, AMOpc;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  // i32 uses addrmode2; i16 (and sext i8) use addrmode3; zext i8/i1 use
+  // addrmode2 with the byte opcodes.
+  if (LoadedVT == MVT::i32 &&
+      SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) {
+    Opcode = isPre ? ARM::LDR_PRE : ARM::LDR_POST;
+    Match = true;
+  } else if (LoadedVT == MVT::i16 &&
+             SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+    Match = true;
+    Opcode = (LD->getExtensionType() == ISD::SEXTLOAD)
+      ? (isPre ? ARM::LDRSH_PRE : ARM::LDRSH_POST)
+      : (isPre ? ARM::LDRH_PRE : ARM::LDRH_POST);
+  } else if (LoadedVT == MVT::i8 || LoadedVT == MVT::i1) {
+    if (LD->getExtensionType() == ISD::SEXTLOAD) {
+      if (SelectAddrMode3Offset(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRSB_PRE : ARM::LDRSB_POST;
+      }
+    } else {
+      if (SelectAddrMode2Offset(N, LD->getOffset(), Offset, AMOpc)) {
+        Match = true;
+        Opcode = isPre ? ARM::LDRB_PRE : ARM::LDRB_POST;
+      }
+    }
+  }
+
+  if (Match) {
+    SDValue Chain = LD->getChain();
+    SDValue Base = LD->getBasePtr();
+    // Operands: base, offset reg, addrmode imm, predicate (AL, no CPSR),
+    // chain. Results: loaded value, updated base, chain.
+    SDValue Ops[]= { Base, Offset, AMOpc, getAL(CurDAG),
+                     CurDAG->getRegister(0, MVT::i32), Chain };
+    return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32,
+                                  MVT::Other, Ops, 6);
+  }
+
+  return NULL;
+}
+
+/// SelectT2IndexedLoad - Try to select a Thumb2 pre/post-indexed load
+/// (t2LDR*_PRE/_POST). Returns the new machine node, or NULL if N is
+/// unindexed, the offset does not fit imm8, or the type is unhandled.
+SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  if (AM == ISD::UNINDEXED)
+    return NULL;
+
+  EVT LoadedVT = LD->getMemoryVT();
+  bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
+  SDValue Offset;
+  bool isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
+  unsigned Opcode = 0;
+  bool Match = false;
+  if (SelectT2AddrModeImm8Offset(N, LD->getOffset(), Offset)) {
+    // Pick the opcode by memory type and sign/zero extension.
+    switch (LoadedVT.getSimpleVT().SimpleTy) {
+    case MVT::i32:
+      Opcode = isPre ? ARM::t2LDR_PRE : ARM::t2LDR_POST;
+      break;
+    case MVT::i16:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSH_PRE : ARM::t2LDRSH_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRH_PRE : ARM::t2LDRH_POST;
+      break;
+    case MVT::i8:
+    case MVT::i1:
+      if (isSExtLd)
+        Opcode = isPre ? ARM::t2LDRSB_PRE : ARM::t2LDRSB_POST;
+      else
+        Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
+      break;
+    default:
+      return NULL;
+    }
+    Match = true;
+  }
+
+  if (Match) {
+    SDValue Chain = LD->getChain();
+    SDValue Base = LD->getBasePtr();
+    // Operands: base, imm offset, predicate (AL, no CPSR), chain.
+    // Results: loaded value, updated base, chain.
+    SDValue Ops[]= { Base, Offset, getAL(CurDAG),
+                     CurDAG->getRegister(0, MVT::i32), Chain };
+    return CurDAG->getMachineNode(Opcode, N->getDebugLoc(), MVT::i32, MVT::i32,
+                                  MVT::Other, Ops, 5);
+  }
+
+  return NULL;
+}
+
+/// SelectDYN_ALLOC - Lower a dynamic stack allocation node into SP
+/// adjustment instructions. Operands: chain, size, alignment. A negative
+/// alignment operand signals that SP must be aligned first.
+SDNode *ARMDAGToDAGISel::SelectDYN_ALLOC(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+  SDValue Size = N->getOperand(1);
+  SDValue Align = N->getOperand(2);
+  SDValue SP = CurDAG->getRegister(ARM::SP, MVT::i32);
+  int32_t AlignVal = cast<ConstantSDNode>(Align)->getSExtValue();
+  if (AlignVal < 0)
+    // We need to align the stack. Use Thumb1 tAND which is the only thumb
+    // instruction that can read and write SP. This matches to a pseudo
+    // instruction that has a chain to ensure the result is written back to
+    // the stack pointer.
+    SP = SDValue(CurDAG->getMachineNode(ARM::tANDsp, dl, VT, SP, Align), 0);
+
+  bool isC = isa<ConstantSDNode>(Size);
+  // Non-constant sizes get ~0 here, which deliberately fails the <= 508
+  // check below.
+  uint32_t C = isC ? cast<ConstantSDNode>(Size)->getZExtValue() : ~0UL;
+  // Handle the most common case for both Thumb1 and Thumb2:
+  // tSUBspi - immediate is between 0 ... 508 inclusive.
+  if (C <= 508 && ((C & 3) == 0))
+    // FIXME: tSUBspi encode scale 4 implicitly.
+    return CurDAG->SelectNodeTo(N, ARM::tSUBspi_, VT, MVT::Other, SP,
+                                CurDAG->getTargetConstant(C/4, MVT::i32),
+                                Chain);
+
+  if (Subtarget->isThumb1Only()) {
+    // Use tADDspr since Thumb1 does not have a sub r, sp, r. ARMISelLowering
+    // should have negated the size operand already. FIXME: We can't insert
+    // new target independent node at this stage so we are forced to negate
+    // it earlier. Is there a better solution?
+    return CurDAG->SelectNodeTo(N, ARM::tADDspr_, VT, MVT::Other, SP, Size,
+                                Chain);
+  } else if (Subtarget->isThumb2()) {
+    if (isC && Predicate_t2_so_imm(Size.getNode())) {
+      // t2SUBrSPi
+      SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+      return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi_, VT, MVT::Other, Ops, 3);
+    } else if (isC && Predicate_imm0_4095(Size.getNode())) {
+      // t2SUBrSPi12
+      SDValue Ops[] = { SP, CurDAG->getTargetConstant(C, MVT::i32), Chain };
+      return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPi12_, VT, MVT::Other, Ops, 3);
+    } else {
+      // t2SUBrSPs
+      SDValue Ops[] = { SP, Size,
+                        getI32Imm(ARM_AM::getSORegOpc(ARM_AM::lsl,0)), Chain };
+      return CurDAG->SelectNodeTo(N, ARM::t2SUBrSPs_, VT, MVT::Other, Ops, 4);
+    }
+  }
+
+  // FIXME: Add ADD / SUB sp instructions for ARM.
+  return 0;
+}
+
+/// PairDRegs - Insert a pair of double registers into an implicit def to
+/// form a quad register.
+SDNode *ARMDAGToDAGISel::PairDRegs(EVT VT, SDValue V0, SDValue V1) {
+  DebugLoc Loc = V0.getNode()->getDebugLoc();
+
+  // Start from an IMPLICIT_DEF of the quad type, then insert each D register
+  // into its subregister slot.
+  SDValue Quad =
+    SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, VT), 0);
+  SDValue Idx0 = CurDAG->getTargetConstant(ARM::DSUBREG_0, MVT::i32);
+  SDValue Idx1 = CurDAG->getTargetConstant(ARM::DSUBREG_1, MVT::i32);
+  SDNode *WithLo = CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, Loc,
+                                          VT, Quad, V0, Idx0);
+  return CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, Loc,
+                                VT, SDValue(WithLo, 0), V1, Idx1);
+}
+
+/// GetNEONSubregVT - Given a type for a 128-bit NEON vector, return the type
+/// for a 64-bit subregister of the vector.
+static EVT GetNEONSubregVT(EVT VT) {
+  // Halve the element count; the element type is unchanged.
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled NEON type");
+  case MVT::v16i8: return MVT::v8i8;
+  case MVT::v8i16: return MVT::v4i16;
+  case MVT::v4f32: return MVT::v2f32;
+  case MVT::v4i32: return MVT::v2i32;
+  case MVT::v2i64: return MVT::v1i64;
+  }
+}
+
+/// SelectVLD - Select a NEON VLD2/VLD3/VLD4 intrinsic node. DOpcodes holds
+/// the opcodes for D-register vectors, QOpcodes0/QOpcodes1 those for the
+/// even/odd halves of Q-register vectors, each indexed by element size.
+/// Returns NULL after wiring results via ReplaceUses for the Q cases.
+SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
+                                   unsigned *DOpcodes, unsigned *QOpcodes0,
+                                   unsigned *QOpcodes1) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, MemUpdate, MemOpc, Align;
+  if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, MemUpdate, MemOpc, Align))
+    return NULL;
+
+  SDValue Chain = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  bool is64BitVector = VT.is64BitVector();
+
+  // Map the vector type to an index into the opcode tables.
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  }
+
+  // Predicate operands: condition code 14 (AL) and no CPSR register.
+  SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32);
+  SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+  if (is64BitVector) {
+    // D-register case: one instruction producing NumVecs values + chain.
+    unsigned Opc = DOpcodes[OpcodeIndex];
+    const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Align,
+                            Pred, PredReg, Chain };
+    std::vector<EVT> ResTys(NumVecs, VT);
+    ResTys.push_back(MVT::Other);
+    return CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 7);
+  }
+
+  EVT RegVT = GetNEONSubregVT(VT);
+  if (NumVecs == 2) {
+    // Quad registers are directly supported for VLD2,
+    // loading 2 pairs of D regs.
+    unsigned Opc = QOpcodes0[OpcodeIndex];
+    const SDValue Ops[] = { MemAddr, MemUpdate, MemOpc, Align,
+                            Pred, PredReg, Chain };
+    std::vector<EVT> ResTys(4, VT);
+    ResTys.push_back(MVT::Other);
+    SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops, 7);
+    Chain = SDValue(VLd, 4);
+
+    // Combine the even and odd subregs to produce the result.
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+      SDNode *Q = PairDRegs(VT, SDValue(VLd, 2*Vec), SDValue(VLd, 2*Vec+1));
+      ReplaceUses(SDValue(N, Vec), SDValue(Q, 0));
+    }
+  } else {
+    // Otherwise, quad registers are loaded with two separate instructions,
+    // where one loads the even registers and the other loads the odd registers.
+
+    // Enable writeback to the address register.
+    MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+    // Each instruction yields NumVecs D values, the updated address, and
+    // a chain.
+    std::vector<EVT> ResTys(NumVecs, RegVT);
+    ResTys.push_back(MemAddr.getValueType());
+    ResTys.push_back(MVT::Other);
+
+    // Load the even subregs.
+    unsigned Opc = QOpcodes0[OpcodeIndex];
+    const SDValue OpsA[] = { MemAddr, MemUpdate, MemOpc, Align,
+                             Pred, PredReg, Chain };
+    SDNode *VLdA = CurDAG->getMachineNode(Opc, dl, ResTys, OpsA, 7);
+    Chain = SDValue(VLdA, NumVecs+1);
+
+    // Load the odd subregs, starting from the address the first load
+    // wrote back.
+    Opc = QOpcodes1[OpcodeIndex];
+    const SDValue OpsB[] = { SDValue(VLdA, NumVecs), MemUpdate, MemOpc,
+                             Align, Pred, PredReg, Chain };
+    SDNode *VLdB = CurDAG->getMachineNode(Opc, dl, ResTys, OpsB, 7);
+    Chain = SDValue(VLdB, NumVecs+1);
+
+    // Combine the even and odd subregs to produce the result.
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+      SDNode *Q = PairDRegs(VT, SDValue(VLdA, Vec), SDValue(VLdB, Vec));
+      ReplaceUses(SDValue(N, Vec), SDValue(Q, 0));
+    }
+  }
+  ReplaceUses(SDValue(N, NumVecs), Chain);
+  return NULL;
+}
+
+/// SelectVST - Select a NEON VST2/VST3/VST4 intrinsic node. DOpcodes holds
+/// the opcodes for D-register vectors, QOpcodes0/QOpcodes1 those for the
+/// even/odd halves of Q-register vectors, each indexed by element size.
+SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
+                                   unsigned *DOpcodes, unsigned *QOpcodes0,
+                                   unsigned *QOpcodes1) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VST NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, MemUpdate, MemOpc, Align;
+  if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, MemUpdate, MemOpc, Align))
+    return NULL;
+
+  SDValue Chain = N->getOperand(0);
+  // The vectors to store begin at operand 3.
+  EVT VT = N->getOperand(3).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+
+  // Map the vector type to an index into the opcode tables.
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vst type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+  case MVT::v1i64: OpcodeIndex = 3; break;
+    // Quad-register operations:
+  case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8i16: OpcodeIndex = 1; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 2; break;
+  }
+
+  // Predicate operands: condition code 14 (AL) and no CPSR register.
+  SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32);
+  SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(MemUpdate);
+  Ops.push_back(MemOpc);
+  Ops.push_back(Align);
+
+  if (is64BitVector) {
+    // D-register case: a single store instruction.
+    unsigned Opc = DOpcodes[OpcodeIndex];
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+      Ops.push_back(N->getOperand(Vec+3));
+    Ops.push_back(Pred);
+    Ops.push_back(PredReg);
+    Ops.push_back(Chain);
+    return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+7);
+  }
+
+  EVT RegVT = GetNEONSubregVT(VT);
+  if (NumVecs == 2) {
+    // Quad registers are directly supported for VST2,
+    // storing 2 pairs of D regs.
+    unsigned Opc = QOpcodes0[OpcodeIndex];
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+      // Pass each Q value as its even then odd D subregister.
+      Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+                                                   N->getOperand(Vec+3)));
+      Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+                                                   N->getOperand(Vec+3)));
+    }
+    Ops.push_back(Pred);
+    Ops.push_back(PredReg);
+    Ops.push_back(Chain);
+    return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), 11);
+  }
+
+  // Otherwise, quad registers are stored with two separate instructions,
+  // where one stores the even registers and the other stores the odd registers.
+
+  // Enable writeback to the address register.
+  MemOpc = CurDAG->getTargetConstant(ARM_AM::getAM6Opc(true), MVT::i32);
+
+  // Store the even subregs.
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    Ops.push_back(CurDAG->getTargetExtractSubreg(ARM::DSUBREG_0, dl, RegVT,
+                                                 N->getOperand(Vec+3)));
+  Ops.push_back(Pred);
+  Ops.push_back(PredReg);
+  Ops.push_back(Chain);
+  unsigned Opc = QOpcodes0[OpcodeIndex];
+  SDNode *VStA = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
+                                        MVT::Other, Ops.data(), NumVecs+7);
+  Chain = SDValue(VStA, 1);
+
+  // Store the odd subregs, reusing the Ops array in place: result 0 of the
+  // first store is the written-back address.
+  Ops[0] = SDValue(VStA, 0); // MemAddr
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+    Ops[Vec+4] = CurDAG->getTargetExtractSubreg(ARM::DSUBREG_1, dl, RegVT,
+                                                N->getOperand(Vec+3));
+  Ops[NumVecs+4] = Pred;
+  Ops[NumVecs+5] = PredReg;
+  Ops[NumVecs+6] = Chain;
+  Opc = QOpcodes1[OpcodeIndex];
+  SDNode *VStB = CurDAG->getMachineNode(Opc, dl, MemAddr.getValueType(),
+                                        MVT::Other, Ops.data(), NumVecs+7);
+  Chain = SDValue(VStB, 1);
+  ReplaceUses(SDValue(N, 0), Chain);
+  return NULL;
+}
+
+/// SelectVLDSTLane - Select a NEON load/store-lane intrinsic (VLDn.lane /
+/// VSTn.lane). For Q-register vectors the lane is accessed through the
+/// appropriate D subregister; QOpcodes0 handles lanes in the even (low)
+/// subreg, QOpcodes1 lanes in the odd (high) subreg.
+SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
+                                         unsigned NumVecs, unsigned *DOpcodes,
+                                         unsigned *QOpcodes0,
+                                         unsigned *QOpcodes1) {
+  assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue MemAddr, MemUpdate, MemOpc, Align;
+  if (!SelectAddrMode6(N, N->getOperand(2), MemAddr, MemUpdate, MemOpc, Align))
+    return NULL;
+
+  SDValue Chain = N->getOperand(0);
+  // The lane number follows the NumVecs vector operands.
+  unsigned Lane =
+    cast<ConstantSDNode>(N->getOperand(NumVecs+3))->getZExtValue();
+  EVT VT = IsLoad ? N->getValueType(0) : N->getOperand(3).getValueType();
+  bool is64BitVector = VT.is64BitVector();
+
+  // Quad registers are handled by load/store of subregs. Find the subreg info.
+  unsigned NumElts = 0;
+  int SubregIdx = 0;
+  EVT RegVT = VT;
+  if (!is64BitVector) {
+    RegVT = GetNEONSubregVT(VT);
+    NumElts = RegVT.getVectorNumElements();
+    SubregIdx = (Lane < NumElts) ? ARM::DSUBREG_0 : ARM::DSUBREG_1;
+  }
+
+  // Map the vector type to an index into the opcode tables.
+  unsigned OpcodeIndex;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("unhandled vld/vst lane type");
+    // Double-register operations:
+  case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4i16: OpcodeIndex = 1; break;
+  case MVT::v2f32:
+  case MVT::v2i32: OpcodeIndex = 2; break;
+    // Quad-register operations:
+  case MVT::v8i16: OpcodeIndex = 0; break;
+  case MVT::v4f32:
+  case MVT::v4i32: OpcodeIndex = 1; break;
+  }
+
+  // Predicate operands: condition code 14 (AL) and no CPSR register.
+  SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32);
+  SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+
+  SmallVector<SDValue, 9> Ops;
+  Ops.push_back(MemAddr);
+  Ops.push_back(MemUpdate);
+  Ops.push_back(MemOpc);
+  Ops.push_back(Align);
+
+  unsigned Opc = 0;
+  if (is64BitVector) {
+    Opc = DOpcodes[OpcodeIndex];
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+      Ops.push_back(N->getOperand(Vec+3));
+  } else {
+    // Check if this is loading the even or odd subreg of a Q register.
+    if (Lane < NumElts) {
+      Opc = QOpcodes0[OpcodeIndex];
+    } else {
+      // Renumber the lane relative to the high D subregister.
+      Lane -= NumElts;
+      Opc = QOpcodes1[OpcodeIndex];
+    }
+    // Extract the subregs of the input vector.
+    for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
+      Ops.push_back(CurDAG->getTargetExtractSubreg(SubregIdx, dl, RegVT,
+                                                   N->getOperand(Vec+3)));
+  }
+  Ops.push_back(getI32Imm(Lane));
+  Ops.push_back(Pred);
+  Ops.push_back(PredReg);
+  Ops.push_back(Chain);
+
+  if (!IsLoad)
+    return CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops.data(), NumVecs+8);
+
+  std::vector<EVT> ResTys(NumVecs, RegVT);
+  ResTys.push_back(MVT::Other);
+  SDNode *VLdLn =
+    CurDAG->getMachineNode(Opc, dl, ResTys, Ops.data(), NumVecs+8);
+  // For a 64-bit vector load to D registers, nothing more needs to be done.
+  if (is64BitVector)
+    return VLdLn;
+
+  // For 128-bit vectors, take the 64-bit results of the load and insert them
+  // as subregs into the result.
+  for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
+    SDValue QuadVec = CurDAG->getTargetInsertSubreg(SubregIdx, dl, VT,
+                                                    N->getOperand(Vec+3),
+                                                    SDValue(VLdLn, Vec));
+    ReplaceUses(SDValue(N, Vec), QuadVec);
+  }
+
+  Chain = SDValue(VLdLn, NumVecs);
+  ReplaceUses(SDValue(N, NumVecs), Chain);
+  return NULL;
+}
+
+/// SelectV6T2BitfieldExtractOp - Try to select N (assumed by the caller to
+/// be a shift-right whose operand 0 is a SHL) as a bitfield extract with
+/// opcode Opc. Requires ARMv6T2. Returns NULL if there is no match.
+SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
+                                                     unsigned Opc) {
+  if (!Subtarget->hasV6T2Ops())
+    return NULL;
+
+  unsigned Shl_imm = 0;
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+    assert(Shl_imm > 0 && Shl_imm < 32 && "bad amount in shift node!");
+    unsigned Srl_imm = 0;
+    if (isInt32Immediate(N->getOperand(1), Srl_imm)) {
+      assert(Srl_imm > 0 && Srl_imm < 32 && "bad amount in shift node!");
+      // (x << Shl) >> Srl extracts Width bits starting at bit LSB.
+      unsigned Width = 32 - Srl_imm;
+      int LSB = Srl_imm - Shl_imm;
+      if (LSB < 0)
+        // The field would start below bit 0; not a bitfield extract.
+        return NULL;
+      SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                        CurDAG->getTargetConstant(LSB, MVT::i32),
+                        CurDAG->getTargetConstant(Width, MVT::i32),
+                        getAL(CurDAG), Reg0 };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+    }
+  }
+  return NULL;
+}
+
+/// SelectT2CMOVShiftOp - Try to select a Thumb2 conditional move whose true
+/// operand is a shifted register, using the t2MOVCC{lsl,lsr,asr,ror}
+/// pseudo matching the shift kind. Returns 0 if TrueVal is not a shifter
+/// operand.
+SDNode *ARMDAGToDAGISel::
+SelectT2CMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                    ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  SDValue CPTmp0;
+  SDValue CPTmp1;
+  if (SelectT2ShifterOperandReg(N, TrueVal, CPTmp0, CPTmp1)) {
+    // CPTmp1 is the encoded so_reg operand; split it back into shift kind
+    // and amount.
+    unsigned SOVal = cast<ConstantSDNode>(CPTmp1)->getZExtValue();
+    unsigned SOShOp = ARM_AM::getSORegShOp(SOVal);
+    unsigned Opc = 0;
+    switch (SOShOp) {
+    case ARM_AM::lsl: Opc = ARM::t2MOVCClsl; break;
+    case ARM_AM::lsr: Opc = ARM::t2MOVCClsr; break;
+    case ARM_AM::asr: Opc = ARM::t2MOVCCasr; break;
+    case ARM_AM::ror: Opc = ARM::t2MOVCCror; break;
+    default:
+      llvm_unreachable("Unknown so_reg opcode!");
+      break;
+    }
+    SDValue SOShImm =
+      CurDAG->getTargetConstant(ARM_AM::getSORegOffset(SOVal), MVT::i32);
+    SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+    SDValue Ops[] = { FalseVal, CPTmp0, SOShImm, CC, CCR, InFlag };
+    return CurDAG->SelectNodeTo(N, Opc, MVT::i32,Ops, 6);
+  }
+  return 0;
+}
+
+SDNode *ARMDAGToDAGISel::
+SelectARMCMOVShiftOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                     ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  // Try to fold the true operand as an ARM shifter operand; bail out if it
+  // does not match so a simpler CMOV form can be tried instead.
+  SDValue ShReg;
+  SDValue ShAmtReg;
+  SDValue ShOpc;
+  if (!SelectShifterOperandReg(N, TrueVal, ShReg, ShAmtReg, ShOpc))
+    return 0;
+
+  SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+  SDValue Ops[] = { FalseVal, ShReg, ShAmtReg, ShOpc, CC, CCR, InFlag };
+  return CurDAG->SelectNodeTo(N, ARM::MOVCCs, MVT::i32, Ops, 7);
+}
+
+SDNode *ARMDAGToDAGISel::
+SelectT2CMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                    ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  // The true operand must be a constant that is encodable as a Thumb2
+  // modified immediate (t2_so_imm) to use t2MOVCCi.
+  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueVal);
+  if (!TrueC)
+    return 0;
+  if (!Predicate_t2_so_imm(TrueVal.getNode()))
+    return 0;
+
+  SDValue Imm = CurDAG->getTargetConstant(TrueC->getZExtValue(), MVT::i32);
+  SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+  SDValue Ops[] = { FalseVal, Imm, CC, CCR, InFlag };
+  return CurDAG->SelectNodeTo(N,
+                              ARM::t2MOVCCi, MVT::i32, Ops, 5);
+}
+
+SDNode *ARMDAGToDAGISel::
+SelectARMCMOVSoImmOp(SDNode *N, SDValue FalseVal, SDValue TrueVal,
+                     ARMCC::CondCodes CCVal, SDValue CCR, SDValue InFlag) {
+  // The true operand must be a constant that is encodable as an ARM
+  // shifter-operand immediate (so_imm) to use MOVCCi.
+  ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueVal);
+  if (!TrueC)
+    return 0;
+  if (!Predicate_so_imm(TrueVal.getNode()))
+    return 0;
+
+  SDValue Imm = CurDAG->getTargetConstant(TrueC->getZExtValue(), MVT::i32);
+  SDValue CC = CurDAG->getTargetConstant(CCVal, MVT::i32);
+  SDValue Ops[] = { FalseVal, Imm, CC, CCR, InFlag };
+  return CurDAG->SelectNodeTo(N,
+                              ARM::MOVCCi, MVT::i32, Ops, 5);
+}
+
+SDNode *ARMDAGToDAGISel::SelectCMOVOp(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  SDValue FalseVal = N->getOperand(0);
+  SDValue TrueVal  = N->getOperand(1);
+  SDValue CC = N->getOperand(2);
+  SDValue CCR = N->getOperand(3);
+  SDValue InFlag = N->getOperand(4);
+  assert(CC.getOpcode() == ISD::Constant);
+  assert(CCR.getOpcode() == ISD::Register);
+  ARMCC::CondCodes CCVal =
+    (ARMCC::CondCodes)cast<ConstantSDNode>(CC)->getZExtValue();
+
+  if (!Subtarget->isThumb1Only() && VT == MVT::i32) {
+    // Pattern: (ARMcmov:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+    // Emits: (MOVCCs:i32 GPR:i32:$false, so_reg:i32:$true, (imm:i32):$cc)
+    // Pattern complexity = 18  cost = 1  size = 0
+    SDValue CPTmp0;
+    SDValue CPTmp1;
+    SDValue CPTmp2;
+    if (Subtarget->isThumb()) {
+      SDNode *Res = SelectT2CMOVShiftOp(N, FalseVal, TrueVal,
+                                        CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectT2CMOVShiftOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    } else {
+      SDNode *Res = SelectARMCMOVShiftOp(N, FalseVal, TrueVal,
+                                         CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectARMCMOVShiftOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    }
+
+    // Pattern: (ARMcmov:i32 GPR:i32:$false,
+    //             (imm:i32)<<P:Predicate_so_imm>>:$true,
+    //             (imm:i32):$cc)
+    // Emits: (MOVCCi:i32 GPR:i32:$false,
+    //           (so_imm:i32 (imm:i32):$true), (imm:i32):$cc)
+    // Pattern complexity = 10  cost = 1  size = 0
+    if (Subtarget->isThumb()) {
+      SDNode *Res = SelectT2CMOVSoImmOp(N, FalseVal, TrueVal,
+                                        CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectT2CMOVSoImmOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    } else {
+      SDNode *Res = SelectARMCMOVSoImmOp(N, FalseVal, TrueVal,
+                                         CCVal, CCR, InFlag);
+      if (!Res)
+        Res = SelectARMCMOVSoImmOp(N, TrueVal, FalseVal,
+                               ARMCC::getOppositeCondition(CCVal), CCR, InFlag);
+      if (Res)
+        return Res;
+    }
+  }
+
+  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Emits: (MOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Pattern complexity = 6  cost = 1  size = 0
+  //
+  // Pattern: (ARMcmov:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Emits: (tMOVCCr:i32 GPR:i32:$false, GPR:i32:$true, (imm:i32):$cc)
+  // Pattern complexity = 6  cost = 11  size = 0
+  //
+  // Also FCPYScc and FCPYDcc.
+  SDValue Tmp2 = CurDAG->getTargetConstant(CCVal, MVT::i32);
+  SDValue Ops[] = { FalseVal, TrueVal, Tmp2, CCR, InFlag };
+  unsigned Opc = 0;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: assert(false && "Illegal conditional move type!");
+    break;
+  case MVT::i32:
+    Opc = Subtarget->isThumb()
+      ? (Subtarget->hasThumb2() ? ARM::t2MOVCCr : ARM::tMOVCCr_pseudo)
+      : ARM::MOVCCr;
+    break;
+  case MVT::f32:
+    Opc = ARM::VMOVScc;
+    break;
+  case MVT::f64:
+    Opc = ARM::VMOVDcc;
+    break;
+  }
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
+}
+
+SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+    bool UseCP = true;
+    if (Subtarget->hasThumb2())
+      // Thumb2-aware targets have the MOVT instruction, so all immediates can
+      // be done with MOV + MOVT, at worst.
+      UseCP = 0;
+    else {
+      if (Subtarget->isThumb()) {
+        UseCP = (Val > 255 &&                          // MOV
+                 ~Val > 255 &&                         // MOV + MVN
+                 !ARM_AM::isThumbImmShiftedVal(Val));  // MOV + LSL
+      } else
+        UseCP = (ARM_AM::getSOImmVal(Val) == -1 &&     // MOV
+                 ARM_AM::getSOImmVal(~Val) == -1 &&    // MVN
+                 !ARM_AM::isSOImmTwoPartVal(Val));     // two instrs.
+    }
+
+    if (UseCP) {
+      SDValue CPIdx =
+        CurDAG->getTargetConstantPool(ConstantInt::get(
+                                  Type::getInt32Ty(*CurDAG->getContext()), Val),
+                                      TLI.getPointerTy());
+
+      SDNode *ResNode;
+      if (Subtarget->isThumb1Only()) {
+        SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32);
+        SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+        SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() };
+        ResNode = CurDAG->getMachineNode(ARM::tLDRcp, dl, MVT::i32, MVT::Other,
+                                         Ops, 4);
+      } else {
+        SDValue Ops[] = {
+          CPIdx,
+          CurDAG->getRegister(0, MVT::i32),
+          CurDAG->getTargetConstant(0, MVT::i32),
+          getAL(CurDAG),
+          CurDAG->getRegister(0, MVT::i32),
+          CurDAG->getEntryNode()
+        };
+        ResNode=CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other,
+                                       Ops, 6);
+      }
+      ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
+      return NULL;
+    }
+
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::FrameIndex: {
+    // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy());
+    if (Subtarget->isThumb1Only()) {
+      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, TFI,
+                                  CurDAG->getTargetConstant(0, MVT::i32));
+    } else {
+      unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
+                      ARM::t2ADDri : ARM::ADDri);
+      SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+    }
+  }
+  case ARMISD::DYN_ALLOC:
+    return SelectDYN_ALLOC(N);
+  case ISD::SRL:
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N,
+                      Subtarget->isThumb() ? ARM::t2UBFX : ARM::UBFX))
+      return I;
+    break;
+  case ISD::SRA:
+    if (SDNode *I = SelectV6T2BitfieldExtractOp(N,
+                      Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX))
+      return I;
+    break;
+  case ISD::MUL:
+    if (Subtarget->isThumb1Only())
+      break;
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+      unsigned RHSV = C->getZExtValue();
+      if (!RHSV) break;
+      if (isPowerOf2_32(RHSV-1)) {  // 2^n+1?
+        unsigned ShImm = Log2_32(RHSV-1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::ADDrs, MVT::i32, Ops, 7);
+        }
+      }
+      if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
+        unsigned ShImm = Log2_32(RHSV+1);
+        if (ShImm >= 32)
+          break;
+        SDValue V = N->getOperand(0);
+        ShImm = ARM_AM::getSORegOpc(ARM_AM::lsl, ShImm);
+        SDValue ShImmOp = CurDAG->getTargetConstant(ShImm, MVT::i32);
+        SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+        if (Subtarget->isThumb()) {
+          SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 5);
+        } else {
+          SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
+          return CurDAG->SelectNodeTo(N, ARM::RSBrs, MVT::i32, Ops, 7);
+        }
+      }
+    }
+    break;
+  case ISD::AND: {
+    // (and (or x, c2), c1) and top 16-bits of c1 and c2 match, lower 16-bits
+    // of c1 are 0xffff, and lower 16-bit of c2 are 0. That is, the top 16-bits
+    // are entirely contributed by c2 and lower 16-bits are entirely contributed
+    // by x. That's equal to (or (and x, 0xffff), (and c1, 0xffff0000)).
+    // Select it to: "movt x, ((c1 & 0xffff) >> 16)
+    EVT VT = N->getValueType(0);
+    if (VT != MVT::i32)
+      break;
+    unsigned Opc = (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ? ARM::t2MOVTi16
+      : (Subtarget->hasV6T2Ops() ? ARM::MOVTi16 : 0);
+    if (!Opc)
+      break;
+    SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+    ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+    if (!N1C)
+      break;
+    if (N0.getOpcode() == ISD::OR && N0.getNode()->hasOneUse()) {
+      SDValue N2 = N0.getOperand(1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+      if (!N2C)
+        break;
+      unsigned N1CVal = N1C->getZExtValue();
+      unsigned N2CVal = N2C->getZExtValue();
+      if ((N1CVal & 0xffff0000U) == (N2CVal & 0xffff0000U) &&
+          (N1CVal & 0xffffU) == 0xffffU &&
+          (N2CVal & 0xffffU) == 0x0U) {
+        SDValue Imm16 = CurDAG->getTargetConstant((N2CVal & 0xFFFF0000U) >> 16,
+                                                  MVT::i32);
+        SDValue Ops[] = { N0.getOperand(0), Imm16,
+                          getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+        return CurDAG->getMachineNode(Opc, dl, VT, Ops, 4);
+      }
+    }
+    break;
+  }
+  case ARMISD::VMOVRRD:
+    return CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
+                                  N->getOperand(0), getAL(CurDAG),
+                                  CurDAG->getRegister(0, MVT::i32));
+  case ISD::UMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops,4);
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::UMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+    }
+  }
+  case ISD::SMUL_LOHI: {
+    if (Subtarget->isThumb1Only())
+      break;
+    if (Subtarget->isThumb()) {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops,4);
+    } else {
+      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                        getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
+                        CurDAG->getRegister(0, MVT::i32) };
+      return CurDAG->getMachineNode(ARM::SMULL, dl, MVT::i32, MVT::i32, Ops, 5);
+    }
+  }
+  case ISD::LOAD: {
+    SDNode *ResNode = 0;
+    if (Subtarget->isThumb() && Subtarget->hasThumb2())
+      ResNode = SelectT2IndexedLoad(N);
+    else
+      ResNode = SelectARMIndexedLoad(N);
+    if (ResNode)
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  }
+  case ARMISD::BRCOND: {
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (tBcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc)
+    // Emits: (t2Bcc:void (bb:Other):$dst, (imm:i32):$cc)
+    // Pattern complexity = 6  cost = 1  size = 0
+
+    unsigned Opc = Subtarget->isThumb() ?
+      ((Subtarget->hasThumb2()) ? ARM::t2Bcc : ARM::tBcc) : ARM::Bcc;
+    SDValue Chain = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    SDValue InFlag = N->getOperand(4);
+    assert(N1.getOpcode() == ISD::BasicBlock);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getZExtValue()),
+                               MVT::i32);
+    SDValue Ops[] = { N1, Tmp2, N3, Chain, InFlag };
+    SDNode *ResNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
+                                             MVT::Flag, Ops, 5);
+    Chain = SDValue(ResNode, 0);
+    if (N->getNumValues() == 2) {
+      InFlag = SDValue(ResNode, 1);
+      ReplaceUses(SDValue(N, 1), InFlag);
+    }
+    ReplaceUses(SDValue(N, 0),
+                SDValue(Chain.getNode(), Chain.getResNo()));
+    return NULL;
+  }
+  case ARMISD::CMOV:
+    return SelectCMOVOp(N);
+  case ARMISD::CNEG: {
+    EVT VT = N->getValueType(0);
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    SDValue N3 = N->getOperand(3);
+    SDValue InFlag = N->getOperand(4);
+    assert(N2.getOpcode() == ISD::Constant);
+    assert(N3.getOpcode() == ISD::Register);
+
+    SDValue Tmp2 = CurDAG->getTargetConstant(((unsigned)
+                               cast<ConstantSDNode>(N2)->getZExtValue()),
+                               MVT::i32);
+    SDValue Ops[] = { N0, N1, Tmp2, N3, InFlag };
+    unsigned Opc = 0;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: assert(false && "Illegal conditional move type!");
+      break;
+    case MVT::f32:
+      Opc = ARM::VNEGScc;
+      break;
+    case MVT::f64:
+      Opc = ARM::VNEGDcc;
+      break;
+    }
+    return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 5);
+  }
+
+  case ARMISD::VZIP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return NULL;
+    case MVT::v8i8:  Opc = ARM::VZIPd8; break;
+    case MVT::v4i16: Opc = ARM::VZIPd16; break;
+    case MVT::v2f32:
+    case MVT::v2i32: Opc = ARM::VZIPd32; break;
+    case MVT::v16i8: Opc = ARM::VZIPq8; break;
+    case MVT::v8i16: Opc = ARM::VZIPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VZIPq32; break;
+    }
+    SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4);
+  }
+  case ARMISD::VUZP: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return NULL;
+    case MVT::v8i8:  Opc = ARM::VUZPd8; break;
+    case MVT::v4i16: Opc = ARM::VUZPd16; break;
+    case MVT::v2f32:
+    case MVT::v2i32: Opc = ARM::VUZPd32; break;
+    case MVT::v16i8: Opc = ARM::VUZPq8; break;
+    case MVT::v8i16: Opc = ARM::VUZPq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VUZPq32; break;
+    }
+    SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4);
+  }
+  case ARMISD::VTRN: {
+    unsigned Opc = 0;
+    EVT VT = N->getValueType(0);
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: return NULL;
+    case MVT::v8i8:  Opc = ARM::VTRNd8; break;
+    case MVT::v4i16: Opc = ARM::VTRNd16; break;
+    case MVT::v2f32:
+    case MVT::v2i32: Opc = ARM::VTRNd32; break;
+    case MVT::v16i8: Opc = ARM::VTRNq8; break;
+    case MVT::v8i16: Opc = ARM::VTRNq16; break;
+    case MVT::v4f32:
+    case MVT::v4i32: Opc = ARM::VTRNq32; break;
+    }
+    SDValue Pred = CurDAG->getTargetConstant(14, MVT::i32);
+    SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+    SDValue Ops[] = { N->getOperand(0), N->getOperand(1), Pred, PredReg };
+    return CurDAG->getMachineNode(Opc, dl, VT, VT, Ops, 4);
+  }
+
+  case ISD::INTRINSIC_VOID:
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+
+    case Intrinsic::arm_neon_vld2: {
+      unsigned DOpcodes[] = { ARM::VLD2d8, ARM::VLD2d16,
+                              ARM::VLD2d32, ARM::VLD2d64 };
+      unsigned QOpcodes[] = { ARM::VLD2q8, ARM::VLD2q16, ARM::VLD2q32 };
+      return SelectVLD(N, 2, DOpcodes, QOpcodes, 0);
+    }
+
+    case Intrinsic::arm_neon_vld3: {
+      unsigned DOpcodes[] = { ARM::VLD3d8, ARM::VLD3d16,
+                              ARM::VLD3d32, ARM::VLD3d64 };
+      unsigned QOpcodes0[] = { ARM::VLD3q8a, ARM::VLD3q16a, ARM::VLD3q32a };
+      unsigned QOpcodes1[] = { ARM::VLD3q8b, ARM::VLD3q16b, ARM::VLD3q32b };
+      return SelectVLD(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld4: {
+      unsigned DOpcodes[] = { ARM::VLD4d8, ARM::VLD4d16,
+                              ARM::VLD4d32, ARM::VLD4d64 };
+      unsigned QOpcodes0[] = { ARM::VLD4q8a, ARM::VLD4q16a, ARM::VLD4q32a };
+      unsigned QOpcodes1[] = { ARM::VLD4q8b, ARM::VLD4q16b, ARM::VLD4q32b };
+      return SelectVLD(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld2lane: {
+      unsigned DOpcodes[] = { ARM::VLD2LNd8, ARM::VLD2LNd16, ARM::VLD2LNd32 };
+      unsigned QOpcodes0[] = { ARM::VLD2LNq16a, ARM::VLD2LNq32a };
+      unsigned QOpcodes1[] = { ARM::VLD2LNq16b, ARM::VLD2LNq32b };
+      return SelectVLDSTLane(N, true, 2, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld3lane: {
+      unsigned DOpcodes[] = { ARM::VLD3LNd8, ARM::VLD3LNd16, ARM::VLD3LNd32 };
+      unsigned QOpcodes0[] = { ARM::VLD3LNq16a, ARM::VLD3LNq32a };
+      unsigned QOpcodes1[] = { ARM::VLD3LNq16b, ARM::VLD3LNq32b };
+      return SelectVLDSTLane(N, true, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vld4lane: {
+      unsigned DOpcodes[] = { ARM::VLD4LNd8, ARM::VLD4LNd16, ARM::VLD4LNd32 };
+      unsigned QOpcodes0[] = { ARM::VLD4LNq16a, ARM::VLD4LNq32a };
+      unsigned QOpcodes1[] = { ARM::VLD4LNq16b, ARM::VLD4LNq32b };
+      return SelectVLDSTLane(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst2: {
+      unsigned DOpcodes[] = { ARM::VST2d8, ARM::VST2d16,
+                              ARM::VST2d32, ARM::VST2d64 };
+      unsigned QOpcodes[] = { ARM::VST2q8, ARM::VST2q16, ARM::VST2q32 };
+      return SelectVST(N, 2, DOpcodes, QOpcodes, 0);
+    }
+
+    case Intrinsic::arm_neon_vst3: {
+      unsigned DOpcodes[] = { ARM::VST3d8, ARM::VST3d16,
+                              ARM::VST3d32, ARM::VST3d64 };
+      unsigned QOpcodes0[] = { ARM::VST3q8a, ARM::VST3q16a, ARM::VST3q32a };
+      unsigned QOpcodes1[] = { ARM::VST3q8b, ARM::VST3q16b, ARM::VST3q32b };
+      return SelectVST(N, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst4: {
+      unsigned DOpcodes[] = { ARM::VST4d8, ARM::VST4d16,
+                              ARM::VST4d32, ARM::VST4d64 };
+      unsigned QOpcodes0[] = { ARM::VST4q8a, ARM::VST4q16a, ARM::VST4q32a };
+      unsigned QOpcodes1[] = { ARM::VST4q8b, ARM::VST4q16b, ARM::VST4q32b };
+      return SelectVST(N, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst2lane: {
+      unsigned DOpcodes[] = { ARM::VST2LNd8, ARM::VST2LNd16, ARM::VST2LNd32 };
+      unsigned QOpcodes0[] = { ARM::VST2LNq16a, ARM::VST2LNq32a };
+      unsigned QOpcodes1[] = { ARM::VST2LNq16b, ARM::VST2LNq32b };
+      return SelectVLDSTLane(N, false, 2, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst3lane: {
+      unsigned DOpcodes[] = { ARM::VST3LNd8, ARM::VST3LNd16, ARM::VST3LNd32 };
+      unsigned QOpcodes0[] = { ARM::VST3LNq16a, ARM::VST3LNq32a };
+      unsigned QOpcodes1[] = { ARM::VST3LNq16b, ARM::VST3LNq32b };
+      return SelectVLDSTLane(N, false, 3, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+
+    case Intrinsic::arm_neon_vst4lane: {
+      unsigned DOpcodes[] = { ARM::VST4LNd8, ARM::VST4LNd16, ARM::VST4LNd32 };
+      unsigned QOpcodes0[] = { ARM::VST4LNq16a, ARM::VST4LNq32a };
+      unsigned QOpcodes1[] = { ARM::VST4LNq16b, ARM::VST4LNq32b };
+      return SelectVLDSTLane(N, false, 4, DOpcodes, QOpcodes0, QOpcodes1);
+    }
+    }
+  }
+  }
+
+  return SelectCode(N);
+}
+
/// SelectInlineAsmMemoryOperand - Lower an inline-asm 'm' (memory) operand.
/// The address is passed through unchanged; returns false to indicate the
/// operand was handled.
bool ARMDAGToDAGISel::
SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
                             std::vector<SDValue> &OutOps) {
  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
  // Require the address to be in a register.  That is safe for all ARM
  // variants and it is hard to do anything much smarter without knowing
  // how the operand is used.
  OutOps.push_back(Op);
  return false;
}
+
/// createARMISelDag - This pass converts a legalized DAG into a
/// ARM-specific DAG, ready for instruction scheduling.
///
/// \param TM       the ARM target machine the selector will query.
/// \param OptLevel optimization level; forwarded to the selector.
FunctionPass *llvm::createARMISelDag(ARMBaseTargetMachine &TM,
                                     CodeGenOpt::Level OptLevel) {
  return new ARMDAGToDAGISel(TM, OptLevel);
}
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
new file mode 100644
index 0000000..614e684
--- /dev/null
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -0,0 +1,4573 @@
+//===-- ARMISelLowering.cpp - ARM DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMISelLowering.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMPerfectShuffle.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMTargetMachine.h"
+#include "ARMTargetObjectFile.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instruction.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+using namespace llvm;
+
// Forward declarations of the custom f64 calling-convention handlers
// (APCS/AAPCS flavors, for both arguments and return values).  They are
// declared here so earlier code in this file can reference them; presumably
// the tablegen'd calling-convention tables dispatch to these — the
// definitions are not visible in this chunk.
static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
                                   CCValAssign::LocInfo &LocInfo,
                                   ISD::ArgFlagsTy &ArgFlags,
                                   CCState &State);
static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
                                    CCValAssign::LocInfo &LocInfo,
                                    ISD::ArgFlagsTy &ArgFlags,
                                    CCState &State);
static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
                                      CCValAssign::LocInfo &LocInfo,
                                      ISD::ArgFlagsTy &ArgFlags,
                                      CCState &State);
static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
                                       CCValAssign::LocInfo &LocInfo,
                                       ISD::ArgFlagsTy &ArgFlags,
                                       CCState &State);
+
+void ARMTargetLowering::addTypeForNEON(EVT VT, EVT PromotedLdStVT,
+                                       EVT PromotedBitwiseVT) {
+  if (VT != PromotedLdStVT) {
+    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+    AddPromotedToType (ISD::LOAD, VT.getSimpleVT(),
+                       PromotedLdStVT.getSimpleVT());
+
+    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+    AddPromotedToType (ISD::STORE, VT.getSimpleVT(),
+                       PromotedLdStVT.getSimpleVT());
+  }
+
+  EVT ElemTy = VT.getVectorElementType();
+  if (ElemTy != MVT::i64 && ElemTy != MVT::f64)
+    setOperationAction(ISD::VSETCC, VT.getSimpleVT(), Custom);
+  if (ElemTy == MVT::i8 || ElemTy == MVT::i16)
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  if (ElemTy != MVT::i32) {
+    setOperationAction(ISD::SINT_TO_FP, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::UINT_TO_FP, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Expand);
+  }
+  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Expand);
+  if (VT.isInteger()) {
+    setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
+    setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+  }
+
+  // Promote all bit-wise operations.
+  if (VT.isInteger() && VT != PromotedBitwiseVT) {
+    setOperationAction(ISD::AND, VT.getSimpleVT(), Promote);
+    AddPromotedToType (ISD::AND, VT.getSimpleVT(),
+                       PromotedBitwiseVT.getSimpleVT());
+    setOperationAction(ISD::OR,  VT.getSimpleVT(), Promote);
+    AddPromotedToType (ISD::OR,  VT.getSimpleVT(),
+                       PromotedBitwiseVT.getSimpleVT());
+    setOperationAction(ISD::XOR, VT.getSimpleVT(), Promote);
+    AddPromotedToType (ISD::XOR, VT.getSimpleVT(),
+                       PromotedBitwiseVT.getSimpleVT());
+  }
+
+  // Neon does not support vector divide/remainder operations.
+  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::FDIV, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+}
+
/// addDRTypeForNEON - Register VT in the 64-bit D register class and set up
/// its NEON legalization (loads/stores promoted to f64, bitwise ops promoted
/// to v2i32).
void ARMTargetLowering::addDRTypeForNEON(EVT VT) {
  addRegisterClass(VT, ARM::DPRRegisterClass);
  addTypeForNEON(VT, MVT::f64, MVT::v2i32);
}
+
/// addQRTypeForNEON - Register VT in the 128-bit Q register class and set up
/// its NEON legalization (loads/stores promoted to v2f64, bitwise ops
/// promoted to v4i32).
void ARMTargetLowering::addQRTypeForNEON(EVT VT) {
  addRegisterClass(VT, ARM::QPRRegisterClass);
  addTypeForNEON(VT, MVT::v2f64, MVT::v4i32);
}
+
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+  if (TM.getSubtarget<ARMSubtarget>().isTargetDarwin())
+    return new TargetLoweringObjectFileMachO();
+  return new ARMElfTargetObjectFile();
+}
+
+ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(TM)) {
+  Subtarget = &TM.getSubtarget<ARMSubtarget>();
+
+  if (Subtarget->isTargetDarwin()) {
+    // Uses VFP for Thumb libfuncs if available.
+    if (Subtarget->isThumb() && Subtarget->hasVFP2()) {
+      // Single-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
+      setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
+      setLibcallName(RTLIB::MUL_F32, "__mulsf3vfp");
+      setLibcallName(RTLIB::DIV_F32, "__divsf3vfp");
+
+      // Double-precision floating-point arithmetic.
+      setLibcallName(RTLIB::ADD_F64, "__adddf3vfp");
+      setLibcallName(RTLIB::SUB_F64, "__subdf3vfp");
+      setLibcallName(RTLIB::MUL_F64, "__muldf3vfp");
+      setLibcallName(RTLIB::DIV_F64, "__divdf3vfp");
+
+      // Single-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F32, "__eqsf2vfp");
+      setLibcallName(RTLIB::UNE_F32, "__nesf2vfp");
+      setLibcallName(RTLIB::OLT_F32, "__ltsf2vfp");
+      setLibcallName(RTLIB::OLE_F32, "__lesf2vfp");
+      setLibcallName(RTLIB::OGE_F32, "__gesf2vfp");
+      setLibcallName(RTLIB::OGT_F32, "__gtsf2vfp");
+      setLibcallName(RTLIB::UO_F32,  "__unordsf2vfp");
+      setLibcallName(RTLIB::O_F32,   "__unordsf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
+
+      // Double-precision comparisons.
+      setLibcallName(RTLIB::OEQ_F64, "__eqdf2vfp");
+      setLibcallName(RTLIB::UNE_F64, "__nedf2vfp");
+      setLibcallName(RTLIB::OLT_F64, "__ltdf2vfp");
+      setLibcallName(RTLIB::OLE_F64, "__ledf2vfp");
+      setLibcallName(RTLIB::OGE_F64, "__gedf2vfp");
+      setLibcallName(RTLIB::OGT_F64, "__gtdf2vfp");
+      setLibcallName(RTLIB::UO_F64,  "__unorddf2vfp");
+      setLibcallName(RTLIB::O_F64,   "__unorddf2vfp");
+
+      setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
+      setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
+      setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
+
+      // Floating-point to integer conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      setLibcallName(RTLIB::FPTOSINT_F64_I32, "__fixdfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F64_I32, "__fixunsdfsivfp");
+      setLibcallName(RTLIB::FPTOSINT_F32_I32, "__fixsfsivfp");
+      setLibcallName(RTLIB::FPTOUINT_F32_I32, "__fixunssfsivfp");
+
+      // Conversions between floating types.
+      setLibcallName(RTLIB::FPROUND_F64_F32, "__truncdfsf2vfp");
+      setLibcallName(RTLIB::FPEXT_F32_F64,   "__extendsfdf2vfp");
+
+      // Integer to floating-point conversions.
+      // i64 conversions are done via library routines even when generating VFP
+      // instructions, so use the same ones.
+      // FIXME: There appears to be some naming inconsistency in ARM libgcc:
+      // e.g., __floatunsidf vs. __floatunssidfvfp.
+      setLibcallName(RTLIB::SINTTOFP_I32_F64, "__floatsidfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F64, "__floatunssidfvfp");
+      setLibcallName(RTLIB::SINTTOFP_I32_F32, "__floatsisfvfp");
+      setLibcallName(RTLIB::UINTTOFP_I32_F32, "__floatunssisfvfp");
+    }
+  }
+
+  // These libcalls are not available in 32-bit.
+  setLibcallName(RTLIB::SHL_I128, 0);
+  setLibcallName(RTLIB::SRL_I128, 0);
+  setLibcallName(RTLIB::SRA_I128, 0);
+
+  // Libcalls should use the AAPCS base standard ABI, even if hard float
+  // is in effect, as per the ARM RTABI specification, section 4.1.2.
+  if (Subtarget->isAAPCS_ABI()) {
+    for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
+      setLibcallCallingConv(static_cast<RTLIB::Libcall>(i),
+                            CallingConv::ARM_AAPCS);
+    }
+  }
+
+  if (Subtarget->isThumb1Only())
+    addRegisterClass(MVT::i32, ARM::tGPRRegisterClass);
+  else
+    addRegisterClass(MVT::i32, ARM::GPRRegisterClass);
+  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
+    addRegisterClass(MVT::f32, ARM::SPRRegisterClass);
+    addRegisterClass(MVT::f64, ARM::DPRRegisterClass);
+
+    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  }
+
+  if (Subtarget->hasNEON()) {
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
+
+    // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
+    // neither Neon nor VFP support any arithmetic operations on it.
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Expand);
+    setOperationAction(ISD::VSETCC, MVT::v2f64, Expand);
+    setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
+    setOperationAction(ISD::FLOG10, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
+    setOperationAction(ISD::FEXP2, MVT::v2f64, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v2f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v2f64, Expand);
+
+    // Neon does not support some operations on v1i64 and v2i64 types.
+    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
+    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    setOperationAction(ISD::VSETCC, MVT::v1i64, Expand);
+    setOperationAction(ISD::VSETCC, MVT::v2i64, Expand);
+
+    setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+    setTargetDAGCombine(ISD::SHL);
+    setTargetDAGCombine(ISD::SRL);
+    setTargetDAGCombine(ISD::SRA);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
+  }
+
+  computeRegisterProperties();
+
+  // ARM does not have f32 extending load.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  // ARM does not have i1 sign extending load.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // ARM supports all 4 flavors of integer indexed load / store.
+  if (!Subtarget->isThumb1Only()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im,  MVT::i1,  Legal);
+      setIndexedLoadAction(im,  MVT::i8,  Legal);
+      setIndexedLoadAction(im,  MVT::i16, Legal);
+      setIndexedLoadAction(im,  MVT::i32, Legal);
+      setIndexedStoreAction(im, MVT::i1,  Legal);
+      setIndexedStoreAction(im, MVT::i8,  Legal);
+      setIndexedStoreAction(im, MVT::i16, Legal);
+      setIndexedStoreAction(im, MVT::i32, Legal);
+    }
+  }
+
+  // i64 operation support.
+  if (Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::MUL,     MVT::i64, Expand);
+    setOperationAction(ISD::MULHU,   MVT::i32, Expand);
+    setOperationAction(ISD::MULHS,   MVT::i32, Expand);
+    setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+    setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  } else {
+    setOperationAction(ISD::MUL,     MVT::i64, Expand);
+    setOperationAction(ISD::MULHU,   MVT::i32, Expand);
+    if (!Subtarget->hasV6Ops())
+      setOperationAction(ISD::MULHS, MVT::i32, Expand);
+  }
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  setOperationAction(ISD::SRL,       MVT::i64, Custom);
+  setOperationAction(ISD::SRA,       MVT::i64, Custom);
+
+  // ARM does not have ROTL.
+  setOperationAction(ISD::ROTL,  MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ,  MVT::i32, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only())
+    setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+
+  // Only ARMv6 has BSWAP.
+  if (!Subtarget->hasV6Ops())
+    setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+  // These are expanded into libcalls.
+  setOperationAction(ISD::SDIV,  MVT::i32, Expand);
+  setOperationAction(ISD::UDIV,  MVT::i32, Expand);
+  setOperationAction(ISD::SREM,  MVT::i32, Expand);
+  setOperationAction(ISD::UREM,  MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
+  setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32, Custom);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VASTART,            MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,              MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,             MVT::Other, Expand);
+  setOperationAction(ISD::VAEND,              MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+  setOperationAction(ISD::EHSELECTION,        MVT::i32,   Expand);
+  // FIXME: Shouldn't need this, since no register is used, but the legalizer
+  // doesn't yet know how to not do that for SjLj.
+  setExceptionSelectorRegister(ARM::R0);
+  if (Subtarget->isThumb())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+  setOperationAction(ISD::MEMBARRIER,         MVT::Other, Custom);
+
+  if (!Subtarget->hasV6Ops() && !Subtarget->isThumb2()) {
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
+  }
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only())
+    // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
+    // iff target supports vfp2.
+    setOperationAction(ISD::BIT_CONVERT, MVT::i64, Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  setOperationAction(ISD::SETCC,     MVT::i32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f32, Expand);
+  setOperationAction(ISD::SETCC,     MVT::f64, Expand);
+  setOperationAction(ISD::SELECT,    MVT::i32, Expand);
+  setOperationAction(ISD::SELECT,    MVT::f32, Expand);
+  setOperationAction(ISD::SELECT,    MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::BRCOND,    MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,     MVT::i32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f32,   Custom);
+  setOperationAction(ISD::BR_CC,     MVT::f64,   Custom);
+  setOperationAction(ISD::BR_JT,     MVT::Other, Custom);
+
+  // We don't support sin/cos/fmod/copysign/pow
+  setOperationAction(ISD::FSIN,      MVT::f64, Expand);
+  setOperationAction(ISD::FSIN,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,      MVT::f64, Expand);
+  setOperationAction(ISD::FREM,      MVT::f64, Expand);
+  setOperationAction(ISD::FREM,      MVT::f32, Expand);
+  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+  }
+  setOperationAction(ISD::FPOW,      MVT::f64, Expand);
+  setOperationAction(ISD::FPOW,      MVT::f32, Expand);
+
+  // int <-> fp are custom expanded into bit_convert + ARMISD ops.
+  if (!UseSoftFloat && Subtarget->hasVFP2() && !Subtarget->isThumb1Only()) {
+    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  // ARMISD::VMOVRRD  - No need to call setTargetDAGCombine
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
+
+  setStackPointerRegisterToSaveRestore(ARM::SP);
+  setSchedulingPreference(SchedulingForRegPressure);
+
+  // FIXME: If-converter should use instruction latency to determine
+  // profitability rather than relying on fixed limits.
+  if (Subtarget->getCPUString() == "generic") {
+    // Generic (and overly aggressive) if-conversion limits.
+    setIfCvtBlockSizeLimit(10);
+    setIfCvtDupBlockSizeLimit(2);
+  } else if (Subtarget->hasV6Ops()) {
+    setIfCvtBlockSizeLimit(2);
+    setIfCvtDupBlockSizeLimit(1);
+  } else {
+    setIfCvtBlockSizeLimit(3);
+    setIfCvtDupBlockSizeLimit(2);
+  }
+
+  maxStoresPerMemcpy = 1;   //// temporary - rewrite interface to use type
+  // Do not enable CodePlacementOpt for now: it currently runs after the
+  // ARMConstantIslandPass and messes up branch relaxation and placement
+  // of constant islands.
+  // benefitFromCodePlacementOpt = true;
+}
+
+const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case ARMISD::Wrapper:       return "ARMISD::Wrapper";
+  case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
+  case ARMISD::CALL:          return "ARMISD::CALL";
+  case ARMISD::CALL_PRED:     return "ARMISD::CALL_PRED";
+  case ARMISD::CALL_NOLINK:   return "ARMISD::CALL_NOLINK";
+  case ARMISD::tCALL:         return "ARMISD::tCALL";
+  case ARMISD::BRCOND:        return "ARMISD::BRCOND";
+  case ARMISD::BR_JT:         return "ARMISD::BR_JT";
+  case ARMISD::BR2_JT:        return "ARMISD::BR2_JT";
+  case ARMISD::RET_FLAG:      return "ARMISD::RET_FLAG";
+  case ARMISD::PIC_ADD:       return "ARMISD::PIC_ADD";
+  case ARMISD::CMP:           return "ARMISD::CMP";
+  case ARMISD::CMPZ:          return "ARMISD::CMPZ";
+  case ARMISD::CMPFP:         return "ARMISD::CMPFP";
+  case ARMISD::CMPFPw0:       return "ARMISD::CMPFPw0";
+  case ARMISD::FMSTAT:        return "ARMISD::FMSTAT";
+  case ARMISD::CMOV:          return "ARMISD::CMOV";
+  case ARMISD::CNEG:          return "ARMISD::CNEG";
+
+  case ARMISD::RBIT:          return "ARMISD::RBIT";
+
+  case ARMISD::FTOSI:         return "ARMISD::FTOSI";
+  case ARMISD::FTOUI:         return "ARMISD::FTOUI";
+  case ARMISD::SITOF:         return "ARMISD::SITOF";
+  case ARMISD::UITOF:         return "ARMISD::UITOF";
+
+  case ARMISD::SRL_FLAG:      return "ARMISD::SRL_FLAG";
+  case ARMISD::SRA_FLAG:      return "ARMISD::SRA_FLAG";
+  case ARMISD::RRX:           return "ARMISD::RRX";
+
+  case ARMISD::VMOVRRD:         return "ARMISD::VMOVRRD";
+  case ARMISD::VMOVDRR:         return "ARMISD::VMOVDRR";
+
+  case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP";
+  case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP";
+
+  case ARMISD::THREAD_POINTER:return "ARMISD::THREAD_POINTER";
+
+  case ARMISD::DYN_ALLOC:     return "ARMISD::DYN_ALLOC";
+
+  case ARMISD::MEMBARRIER:    return "ARMISD::MEMBARRIER";
+  case ARMISD::SYNCBARRIER:   return "ARMISD::SYNCBARRIER";
+
+  case ARMISD::VCEQ:          return "ARMISD::VCEQ";
+  case ARMISD::VCGE:          return "ARMISD::VCGE";
+  case ARMISD::VCGEU:         return "ARMISD::VCGEU";
+  case ARMISD::VCGT:          return "ARMISD::VCGT";
+  case ARMISD::VCGTU:         return "ARMISD::VCGTU";
+  case ARMISD::VTST:          return "ARMISD::VTST";
+
+  case ARMISD::VSHL:          return "ARMISD::VSHL";
+  case ARMISD::VSHRs:         return "ARMISD::VSHRs";
+  case ARMISD::VSHRu:         return "ARMISD::VSHRu";
+  case ARMISD::VSHLLs:        return "ARMISD::VSHLLs";
+  case ARMISD::VSHLLu:        return "ARMISD::VSHLLu";
+  case ARMISD::VSHLLi:        return "ARMISD::VSHLLi";
+  case ARMISD::VSHRN:         return "ARMISD::VSHRN";
+  case ARMISD::VRSHRs:        return "ARMISD::VRSHRs";
+  case ARMISD::VRSHRu:        return "ARMISD::VRSHRu";
+  case ARMISD::VRSHRN:        return "ARMISD::VRSHRN";
+  case ARMISD::VQSHLs:        return "ARMISD::VQSHLs";
+  case ARMISD::VQSHLu:        return "ARMISD::VQSHLu";
+  case ARMISD::VQSHLsu:       return "ARMISD::VQSHLsu";
+  case ARMISD::VQSHRNs:       return "ARMISD::VQSHRNs";
+  case ARMISD::VQSHRNu:       return "ARMISD::VQSHRNu";
+  case ARMISD::VQSHRNsu:      return "ARMISD::VQSHRNsu";
+  case ARMISD::VQRSHRNs:      return "ARMISD::VQRSHRNs";
+  case ARMISD::VQRSHRNu:      return "ARMISD::VQRSHRNu";
+  case ARMISD::VQRSHRNsu:     return "ARMISD::VQRSHRNsu";
+  case ARMISD::VGETLANEu:     return "ARMISD::VGETLANEu";
+  case ARMISD::VGETLANEs:     return "ARMISD::VGETLANEs";
+  case ARMISD::VDUP:          return "ARMISD::VDUP";
+  case ARMISD::VDUPLANE:      return "ARMISD::VDUPLANE";
+  case ARMISD::VEXT:          return "ARMISD::VEXT";
+  case ARMISD::VREV64:        return "ARMISD::VREV64";
+  case ARMISD::VREV32:        return "ARMISD::VREV32";
+  case ARMISD::VREV16:        return "ARMISD::VREV16";
+  case ARMISD::VZIP:          return "ARMISD::VZIP";
+  case ARMISD::VUZP:          return "ARMISD::VUZP";
+  case ARMISD::VTRN:          return "ARMISD::VTRN";
+  }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned ARMTargetLowering::getFunctionAlignment(const Function *F) const {
+  // Thumb functions are given log2 alignment 0, ARM functions 1.
+  const ARMSubtarget &ST = getTargetMachine().getSubtarget<ARMSubtarget>();
+  if (ST.isThumb())
+    return 0;
+  return 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
+static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code!");
+  // Equality tests map directly.
+  case ISD::SETEQ:  return ARMCC::EQ;
+  case ISD::SETNE:  return ARMCC::NE;
+  // Signed comparisons.
+  case ISD::SETLT:  return ARMCC::LT;
+  case ISD::SETLE:  return ARMCC::LE;
+  case ISD::SETGT:  return ARMCC::GT;
+  case ISD::SETGE:  return ARMCC::GE;
+  // Unsigned comparisons.
+  case ISD::SETULT: return ARMCC::LO;
+  case ISD::SETULE: return ARMCC::LS;
+  case ISD::SETUGT: return ARMCC::HI;
+  case ISD::SETUGE: return ARMCC::HS;
+  }
+}
+
+/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
+///
+/// Some FP conditions have no single ARM condition-code equivalent and
+/// need a pair of predicates: in that case CondCode2 is set to the second
+/// condition that must also be tested; otherwise it is left as ARMCC::AL.
+static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
+                        ARMCC::CondCodes &CondCode2) {
+  // Default: a single condition suffices.
+  CondCode2 = ARMCC::AL;
+  switch (CC) {
+  default: llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+  case ISD::SETGT:
+  case ISD::SETOGT: CondCode = ARMCC::GT; break;
+  case ISD::SETGE:
+  case ISD::SETOGE: CondCode = ARMCC::GE; break;
+  case ISD::SETOLT: CondCode = ARMCC::MI; break;
+  case ISD::SETOLE: CondCode = ARMCC::LS; break;
+  // Ordered-and-unequal has no single CC: check less-than OR greater-than.
+  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+  case ISD::SETO:   CondCode = ARMCC::VC; break;
+  case ISD::SETUO:  CondCode = ARMCC::VS; break;
+  // Unordered-or-equal: equal OR unordered (VS, per the SETUO mapping).
+  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+  case ISD::SETUGT: CondCode = ARMCC::HI; break;
+  case ISD::SETUGE: CondCode = ARMCC::PL; break;
+  case ISD::SETLT:
+  case ISD::SETULT: CondCode = ARMCC::LT; break;
+  case ISD::SETLE:
+  case ISD::SETULE: CondCode = ARMCC::LE; break;
+  case ISD::SETNE:
+  case ISD::SETUNE: CondCode = ARMCC::NE; break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "ARMGenCallingConv.inc"
+
+// APCS f64 is in register pairs, possibly split to stack
+//
+// Assign one f64 to two consecutive words from r0-r3, spilling to the
+// stack as registers run out.  Returns false only when CanFail is set and
+// no register is available, letting the caller handle the failure.
+static bool f64AssignAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                          CCValAssign::LocInfo &LocInfo,
+                          CCState &State, bool CanFail) {
+  static const unsigned RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+
+  // Try to get the first register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else {
+    // Report failure when permitted (first half of a value); the 2nd half
+    // of a v2f64 is assigned with CanFail == false and must not fail.
+    if (CanFail)
+      return false;
+
+    // Put the whole thing on the stack.
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 4),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Try to get the second register.  If it is unavailable only the second
+  // word (4 bytes) spills, since the first word already got a register.
+  if (unsigned Reg = State.AllocateReg(RegList, 4))
+    State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+  else
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(4, 4),
+                                           LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags,
+                                   CCState &State) {
+  // Place the first (or only) f64; this assignment is allowed to fail so
+  // the generic calling-convention code can take over.
+  if (!f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  // Anything but a v2f64 is fully handled by the single assignment above.
+  if (LocVT != MVT::v2f64)
+    return true;
+  // The second f64 of a v2f64 must be placed unconditionally.
+  return f64AssignAPCS(ValNo, ValVT, LocVT, LocInfo, State, false);
+}
+
+// AAPCS f64 is in aligned register pairs
+//
+// Assign one f64 to an aligned {hi,lo} pair (r0/r1 or r2/r3), or spill the
+// whole value when no pair remains and failure is not permitted.
+static bool f64AssignAAPCS(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                           CCValAssign::LocInfo &LocInfo,
+                           CCState &State, bool CanFail) {
+  static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
+  static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
+
+  unsigned HiReg = State.AllocateReg(HiRegList, LoRegList, 2);
+  if (!HiReg) {
+    // No aligned pair left.  The first half may report failure (CanFail);
+    // the 2nd half of a v2f64 must be placed, so spill all 8 bytes with
+    // 8-byte alignment.
+    if (CanFail)
+      return false;
+
+    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
+                                           State.AllocateStack(8, 8),
+                                           LocVT, LocInfo));
+    return true;
+  }
+
+  // Identify which pair was handed out so the matching low register is
+  // assigned alongside it.
+  unsigned Index = 0;
+  while (Index < 2 && HiRegList[Index] != HiReg)
+    ++Index;
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[Index],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                                    CCValAssign::LocInfo &LocInfo,
+                                    ISD::ArgFlagsTy &ArgFlags,
+                                    CCState &State) {
+  // Place the first f64 half; failure here lets the generic code retry.
+  if (!f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, true))
+    return false;
+  // A single f64 is done; a v2f64 still owes its second element.
+  if (LocVT != MVT::v2f64)
+    return true;
+  // The second f64 of a v2f64 must succeed (spills rather than failing).
+  return f64AssignAAPCS(ValNo, ValVT, LocVT, LocInfo, State, false);
+}
+
+// Assign an f64 return value to an aligned {hi,lo} register pair (r0/r1 or
+// r2/r3).  Returns false when no pair is free so the caller can bail out.
+static bool f64RetAssign(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                         CCValAssign::LocInfo &LocInfo, CCState &State) {
+  static const unsigned HiRegList[] = { ARM::R0, ARM::R2 };
+  static const unsigned LoRegList[] = { ARM::R1, ARM::R3 };
+
+  unsigned HiReg = State.AllocateReg(HiRegList, LoRegList, 2);
+  if (!HiReg)
+    return false; // we didn't handle it
+
+  // Work out which pair was allocated to pick the matching low register.
+  unsigned Index = 0;
+  while (Index < 2 && HiRegList[Index] != HiReg)
+    ++Index;
+
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, HiReg, LocVT, LocInfo));
+  State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, LoRegList[Index],
+                                         LocVT, LocInfo));
+  return true;
+}
+
+static bool RetCC_ARM_APCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                                      CCValAssign::LocInfo &LocInfo,
+                                      ISD::ArgFlagsTy &ArgFlags,
+                                      CCState &State) {
+  // First (or only) f64 of the return value.
+  if (!f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State))
+    return false;
+  // A v2f64 return carries a second f64 that needs its own register pair.
+  if (LocVT != MVT::v2f64)
+    return true;
+  return f64RetAssign(ValNo, ValVT, LocVT, LocInfo, State);
+}
+
+static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                                       CCValAssign::LocInfo &LocInfo,
+                                       ISD::ArgFlagsTy &ArgFlags,
+                                       CCState &State) {
+  // AAPCS f64 returns use the same r0-r3 pairing as APCS, so delegate to
+  // the APCS handler.
+  return RetCC_ARM_APCS_Custom_f64(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
+                                   State);
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
+/// CallingConvention value.
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+                                                 bool Return,
+                                                 bool isVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast: {
+    // Use target triple & subtarget features to do actual dispatch.
+    if (!Subtarget->isAAPCS_ABI())
+      return Return ? RetCC_ARM_APCS : CC_ARM_APCS;
+    // VFP argument passing needs VFP2 hardware, a hard-float ABI, and a
+    // non-variadic function.
+    bool UseVFP = Subtarget->hasVFP2() && FloatABIType == FloatABI::Hard &&
+                  !isVarArg;
+    if (UseVFP)
+      return Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP;
+    return Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS;
+  }
+  case CallingConv::ARM_AAPCS_VFP:
+    return Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP;
+  case CallingConv::ARM_AAPCS:
+    return Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS;
+  case CallingConv::ARM_APCS:
+    return Return ? RetCC_ARM_APCS : CC_ARM_APCS;
+  }
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins,
+                           CCAssignFnForNode(CallConv, /* Return*/ true,
+                                             isVarArg));
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign VA = RVLocs[i];
+
+    SDValue Val;
+    if (VA.needsCustom()) {
+      // Handle f64 or half of a v2f64.  Custom locations come as pairs of
+      // i32 registers (see f64RetAssign); read both halves and rebuild the
+      // f64 with VMOVDRR.  Chain and glue are re-threaded through every
+      // copy so the register reads stay ordered.
+      SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Lo.getValue(1);
+      InFlag = Lo.getValue(2);
+      VA = RVLocs[++i]; // skip ahead to next loc
+      SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
+                                      InFlag);
+      Chain = Hi.getValue(1);
+      InFlag = Hi.getValue(2);
+      Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+
+      if (VA.getLocVT() == MVT::v2f64) {
+        // A v2f64 spans two custom f64 locations: insert the f64 built
+        // above into lane 0, then read two more registers for lane 1.
+        SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+        Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(0, MVT::i32));
+
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Lo.getValue(1);
+        InFlag = Lo.getValue(2);
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
+        Chain = Hi.getValue(1);
+        InFlag = Hi.getValue(2);
+        Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
+                          DAG.getConstant(1, MVT::i32));
+      }
+    } else {
+      // Common case: the value lives in a single register of its LocVT.
+      Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
+                               InFlag);
+      Chain = Val.getValue(1);
+      InFlag = Val.getValue(2);
+    }
+
+    // Undo any conversion the calling convention applied to the value.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), Val);
+      break;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address
+/// specified by "Src" to address "Dst" of size "Size".  Alignment
+/// information is specified by the specific parameter attribute.  The copy
+/// will be passed as a byval function parameter.  Sometimes what we are
+/// copying is the end of a larger object, the part that does not fit in
+/// registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          DebugLoc dl) {
+  // Emit a memcpy of the byval size at the byval alignment; never forced
+  // inline, and with no source/destination value info attached.
+  SDValue Size = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, Size, Flags.getByValAlign(),
+                       /*AlwaysInline=*/false, NULL, 0, NULL, 0);
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack.
+SDValue
+ARMTargetLowering::LowerMemOpCallTo(SDValue Chain,
+                                    SDValue StackPtr, SDValue Arg,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    ISD::ArgFlagsTy Flags) {
+  // Address of the argument's slot: SP + assigned stack offset.
+  unsigned Offset = VA.getLocMemOffset();
+  SDValue Slot = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr,
+                             DAG.getIntPtrConstant(Offset));
+  // byval aggregates are copied with a memcpy rather than a plain store.
+  if (Flags.isByVal())
+    return CreateCopyOfByValArgument(Arg, Slot, Chain, Flags, DAG, dl);
+  return DAG.getStore(Chain, dl, Arg, Slot,
+                      PseudoSourceValue::getStack(), Offset);
+}
+
+/// PassF64ArgInRegs - Split an f64 argument into two i32 halves via
+/// VMOVRRD and hand them to the calling convention: the first half goes in
+/// VA's register; the second goes in NextVA's register, or is stored to
+/// NextVA's stack slot when no register was available for it.
+void ARMTargetLowering::PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+                                         SDValue Chain, SDValue &Arg,
+                                         RegsToPassVector &RegsToPass,
+                                         CCValAssign &VA, CCValAssign &NextVA,
+                                         SDValue &StackPtr,
+                                         SmallVector<SDValue, 8> &MemOpChains,
+                                         ISD::ArgFlagsTy Flags) {
+
+  // VMOVRRD yields the two i32 halves of the f64 as results 0 and 1.
+  SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                              DAG.getVTList(MVT::i32, MVT::i32), Arg);
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
+
+  if (NextVA.isRegLoc())
+    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
+  else {
+    assert(NextVA.isMemLoc());
+    // Lazily materialize the stack pointer the first time it is needed.
+    if (StackPtr.getNode() == 0)
+      StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
+                                           dl, DAG, NextVA,
+                                           Flags));
+  }
+}
+
+/// LowerCall - Lowering a call into a callseq_start <-
+/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
+/// nodes.
+SDValue
+ARMTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool &isTailCall,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals) {
+  // ARM target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs,
+                             CCAssignFnForNode(CallConv, /* Return*/ false,
+                                               isVarArg));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  SDValue StackPtr = DAG.getRegister(ARM::SP, MVT::i32);
+
+  RegsToPassVector RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization, arguments are handled later.
+  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
+       i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = Outs[realArgIdx].Val;
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(0, MVT::i32));
+        SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                  DAG.getConstant(1, MVT::i32));
+
+        PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
+                         VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+
+        VA = ArgLocs[++i]; // skip ahead to next loc
+        if (VA.isRegLoc()) {
+          PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
+                           VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
+        } else {
+          assert(VA.isMemLoc());
+          if (StackPtr.getNode() == 0)
+            StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+          MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
+                                                 dl, DAG, VA, Flags));
+        }
+      } else {
+        PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+                         StackPtr, MemOpChains, Flags);
+      }
+    } else if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
+
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  bool isDirect = false;
+  bool isARMFunc = false;
+  bool isLocalARMFunc = false;
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    GlobalValue *GV = G->getGlobal();
+    isDirect = true;
+    bool isExt = GV->isDeclaration() || GV->isWeakForLinker();
+    bool isStub = (isExt && Subtarget->isTargetDarwin()) &&
+                   getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // ARM call to a local ARM function is predicable.
+    isLocalARMFunc = !Subtarget->isThumb() && !isExt;
+    // tBX takes a register source operand.
+    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+      unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV,
+                                                           ARMPCLabelIndex,
+                                                           ARMCP::CPValue, 4);
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           PseudoSourceValue::getConstantPool(), 0);
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
+                           getPointerTy(), Callee, PICLabel);
+   } else
+      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy());
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    isDirect = true;
+    bool isStub = Subtarget->isTargetDarwin() &&
+                  getTargetMachine().getRelocationModel() != Reloc::Static;
+    isARMFunc = !Subtarget->isThumb() || isStub;
+    // tBX takes a register source operand.
+    const char *Sym = S->getSymbol();
+    if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
+      unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+      ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
+                                                       Sym, ARMPCLabelIndex, 4);
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, getPointerTy(), 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      Callee = DAG.getLoad(getPointerTy(), dl,
+                           DAG.getEntryNode(), CPAddr,
+                           PseudoSourceValue::getConstantPool(), 0);
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Callee = DAG.getNode(ARMISD::PIC_ADD, dl,
+                           getPointerTy(), Callee, PICLabel);
+    } else
+      Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+  }
+
+  // FIXME: handle tail calls differently.
+  unsigned CallOpc;
+  if (Subtarget->isThumb()) {
+    if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
+      CallOpc = ARMISD::CALL_NOLINK;
+    else
+      CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
+  } else {
+    CallOpc = (isDirect || Subtarget->hasV5TOps())
+      ? (isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL)
+      : ARMISD::CALL_NOLINK;
+  }
+  if (CallOpc == ARMISD::CALL_NOLINK && !Subtarget->isThumb1Only()) {
+    // implicit def LR - LR mustn't be allocated as GRP:$dst of CALL_NOLINK
+    Chain = DAG.getCopyToReg(Chain, dl, ARM::LR, DAG.getUNDEF(MVT::i32),InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag),
+                      &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins,
+                         dl, DAG, InVals);
+}
+
+/// LowerReturn - Lower outgoing return values into the ARM return sequence.
+/// Each value is copied into the physical register(s) assigned by the calling
+/// convention; the copies are glued together through the Flag value so they
+/// stay adjacent, and the sequence ends with an ARMISD::RET_FLAG node.
+SDValue
+ARMTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represent the assignment of the return value to a location.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slots.
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+                 *DAG.getContext());
+
+  // Analyze outgoing return values.
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv, /* Return */ true,
+                                               isVarArg));
+
+  // If this is the first return lowered for this function, add
+  // the regs to the liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  // Note: i indexes RVLocs while realRVLocIdx indexes Outs.  Custom-handled
+  // values (f64/v2f64 split into i32 pairs) consume extra RVLocs via the
+  // ++i increments below, but only a single Outs entry.
+  for (unsigned i = 0, realRVLocIdx = 0;
+       i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    SDValue Arg = Outs[realRVLocIdx].Val;
+
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.needsCustom()) {
+      if (VA.getLocVT() == MVT::v2f64) {
+        // Extract the first half and return it in two registers.
+        SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                                   DAG.getConstant(0, MVT::i32));
+        SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                       DAG.getVTList(MVT::i32, MVT::i32), Half);
+
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
+        Flag = Chain.getValue(1);
+        VA = RVLocs[++i]; // skip ahead to next loc
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(1), Flag);
+        Flag = Chain.getValue(1);
+        VA = RVLocs[++i]; // skip ahead to next loc
+
+        // Extract the 2nd half and fall through to handle it as an f64 value.
+        Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+                          DAG.getConstant(1, MVT::i32));
+      }
+      // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
+      // available.
+      SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
+                                  DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
+      Flag = Chain.getValue(1);
+      VA = RVLocs[++i]; // skip ahead to next loc
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
+                               Flag);
+    } else
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
+
+    // Guarantee that all emitted copies are
+    // stuck together, avoiding something bad.
+    Flag = Chain.getValue(1);
+  }
+
+  // Emit the return node; if any copies were emitted, glue the return to the
+  // last copy via Flag so nothing can be scheduled between them.
+  SDValue result;
+  if (Flag.getNode())
+    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+  else // Return Void
+    result = DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, Chain);
+
+  return result;
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol nodes are
+// lowered to their target-specific counterparts wrapped in an
+// ARMISD::Wrapper node.  The wrapper is needed because Select(N) would
+// otherwise return N unchanged; the raw TargetConstantPool etc. nodes may
+// only appear as part of an addressing mode.  The wrapped nodes are later
+// selected into MOVi.
+static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+  ConstantPoolSDNode *CPNode = cast<ConstantPoolSDNode>(Op);
+  EVT PtrVT = Op.getValueType();
+  // FIXME there is no actual debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  // Machine constant-pool entries and plain IR constants use different
+  // getTargetConstantPool overloads.
+  SDValue Target = CPNode->isMachineConstantPoolEntry()
+    ? DAG.getTargetConstantPool(CPNode->getMachineCPVal(), PtrVT,
+                                CPNode->getAlignment())
+    : DAG.getTargetConstantPool(CPNode->getConstVal(), PtrVT,
+                                CPNode->getAlignment());
+  return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Target);
+}
+
+/// LowerBlockAddress - Lower an ISD::BlockAddress node by materializing the
+/// address from the constant pool.  Under static relocation the raw block
+/// address is stored directly; otherwise a PC-relative constant-pool entry is
+/// emitted and the loaded value is fixed up with ARMISD::PIC_ADD against a
+/// fresh PC label.
+SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = 0;
+  DebugLoc DL = Op.getDebugLoc();
+  EVT PtrVT = getPointerTy();
+  BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  SDValue CPAddr;
+  if (RelocM == Reloc::Static) {
+    CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
+  } else {
+    // Fold the PC read-ahead into the constant-pool entry: reading pc yields
+    // the current instruction + 8 in ARM state, + 4 in Thumb state.
+    unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(BA, ARMPCLabelIndex,
+                                                         ARMCP::CPBlockAddress,
+                                                         PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  }
+  CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
+  SDValue Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), CPAddr,
+                               PseudoSourceValue::getConstantPool(), 0);
+  if (RelocM == Reloc::Static)
+    return Result;
+  // PIC: add the PC label value to the loaded (PC-relative) address.
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model: build a
+// "tlsgd"-flavored constant-pool entry for the global, form its PC-relative
+// address, and call __tls_get_addr with that address as the sole argument.
+// The call's i32 result is the address of the TLS variable.
+SDValue
+ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                                 SelectionDAG &DAG) {
+  DebugLoc dl = GA->getDebugLoc();
+  EVT PtrVT = getPointerTy();
+  // PC reads as instruction + 8 (ARM) / + 4 (Thumb); bake that into the entry.
+  unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+  ARMConstantPoolValue *CPV =
+    new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
+                             ARMCP::CPValue, PCAdj, "tlsgd", true);
+  SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
+  Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument,
+                         PseudoSourceValue::getConstantPool(), 0);
+  // Thread subsequent operations on the load's output chain.
+  SDValue Chain = Argument.getValue(1);
+
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
+
+  // call __tls_get_addr.
+  ArgListTy Args;
+  ArgListEntry Entry;
+  Entry.Node = Argument;
+  Entry.Ty = (const Type *) Type::getInt32Ty(*DAG.getContext());
+  Args.push_back(Entry);
+  // FIXME: is there useful debug info available here?
+  std::pair<SDValue, SDValue> CallResult =
+    LowerCallTo(Chain, (const Type *) Type::getInt32Ty(*DAG.getContext()),
+                false, false, false, false,
+                0, CallingConv::C, false, /*isReturnValueUsed=*/true,
+                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl,
+                DAG.GetOrdering(Chain.getNode()));
+  // CallResult.first is the returned value; .second (the chain) is dropped.
+  return CallResult.first;
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" or
+// "local exec" model.  Both compute an offset that is added to the thread
+// pointer; initial exec loads the offset indirectly through a "gottpoff"
+// entry, local exec uses a direct "tpoff" entry.
+SDValue
+ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                        SelectionDAG &DAG) {
+  GlobalValue *GV = GA->getGlobal();
+  DebugLoc dl = GA->getDebugLoc();
+  SDValue Offset;
+  SDValue Chain = DAG.getEntryNode();
+  EVT PtrVT = getPointerTy();
+  // Get the Thread Pointer
+  SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+
+  // Declarations (defined in another module) get the initial exec model;
+  // locally defined globals can use the cheaper local exec model.
+  if (GV->isDeclaration()) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+    // Initial exec model.
+    unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
+    ARMConstantPoolValue *CPV =
+      new ARMConstantPoolValue(GA->getGlobal(), ARMPCLabelIndex,
+                               ARMCP::CPValue, PCAdj, "gottpoff", true);
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         PseudoSourceValue::getConstantPool(), 0);
+    Chain = Offset.getValue(1);
+
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+    Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
+
+    // Second load: dereference the GOT slot to get the tp-relative offset.
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         PseudoSourceValue::getConstantPool(), 0);
+  } else {
+    // local exec model
+    ARMConstantPoolValue *CPV = new ARMConstantPoolValue(GV, "tpoff");
+    Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
+    Offset = DAG.getLoad(PtrVT, dl, Chain, Offset,
+                         PseudoSourceValue::getConstantPool(), 0);
+  }
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+/// LowerGlobalTLSAddress - Dispatch a TLS global address to the appropriate
+/// lowering: PIC code uses the "General Dynamic" model, everything else the
+/// initial/local exec models.
+SDValue
+ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
+  // TODO: implement the "local dynamic" model
+  assert(Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF targets");
+  GlobalAddressSDNode *TLSNode = cast<GlobalAddressSDNode>(Op);
+  bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+  if (IsPIC)
+    return LowerToTLSGeneralDynamicModel(TLSNode, DAG);
+  return LowerToTLSExecModels(TLSNode, DAG);
+}
+
+/// LowerGlobalAddressELF - Lower an ISD::GlobalAddress for ELF targets.
+/// PIC code goes through the GOT (or GOTOFF for locally bound globals);
+/// non-PIC code either materializes the address with movt/movw when
+/// available or loads it from the constant pool.
+SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
+                                                 SelectionDAG &DAG) {
+  EVT PtrVT = getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  if (RelocM == Reloc::PIC_) {
+    // Locally bound globals can use the GOT-relative offset directly;
+    // others need an extra load through their GOT slot (below).
+    bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility();
+    ARMConstantPoolValue *CPV =
+      new ARMConstantPoolValue(GV, UseGOTOFF ? "GOTOFF" : "GOT");
+    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
+                                 CPAddr,
+                                 PseudoSourceValue::getConstantPool(), 0);
+    SDValue Chain = Result.getValue(1);
+    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(PtrVT);
+    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result, GOT);
+    if (!UseGOTOFF)
+      // Dereference the GOT slot to obtain the actual global address.
+      Result = DAG.getLoad(PtrVT, dl, Chain, Result,
+                           PseudoSourceValue::getGOT(), 0);
+    return Result;
+  } else {
+    // If we have T2 ops, we can materialize the address directly via movt/movw
+    // pair. This is always cheaper.
+    if (Subtarget->useMovt()) {
+      return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
+                         DAG.getTargetGlobalAddress(GV, PtrVT));
+    } else {
+      SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                         PseudoSourceValue::getConstantPool(), 0);
+    }
+  }
+}
+
+/// LowerGlobalAddressDarwin - Lower an ISD::GlobalAddress for Darwin targets.
+/// The address is loaded from a constant-pool entry (PC-adjusted under PIC),
+/// then fixed up with PIC_ADD when needed; globals accessed through an
+/// indirect symbol (non-lazy pointer) require one further load.
+SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
+                                                    SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = 0;
+  EVT PtrVT = getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+  SDValue CPAddr;
+  if (RelocM == Reloc::Static)
+    CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+  else {
+    ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+    // Only PIC needs the pc read-ahead adjustment (8 in ARM, 4 in Thumb).
+    unsigned PCAdj = (RelocM != Reloc::PIC_) ? 0 : (Subtarget->isThumb()?4:8);
+    ARMConstantPoolValue *CPV =
+      new ARMConstantPoolValue(GV, ARMPCLabelIndex, ARMCP::CPValue, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  }
+  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+
+  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                               PseudoSourceValue::getConstantPool(), 0);
+  SDValue Chain = Result.getValue(1);
+
+  if (RelocM == Reloc::PIC_) {
+    SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+    Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+  }
+
+  // Indirect symbols hold a pointer to the global; load through it.
+  if (Subtarget->GVIsIndirectSymbol(GV, RelocM))
+    Result = DAG.getLoad(PtrVT, dl, Chain, Result,
+                         PseudoSourceValue::getGOT(), 0);
+
+  return Result;
+}
+
+/// LowerGLOBAL_OFFSET_TABLE - Materialize the address of _GLOBAL_OFFSET_TABLE_
+/// for ELF targets: load the PC-relative offset from a constant-pool entry
+/// and add the PC via ARMISD::PIC_ADD.
+SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
+                                                    SelectionDAG &DAG){
+  assert(Subtarget->isTargetELF() &&
+         "GLOBAL OFFSET TABLE not implemented for non-ELF targets");
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+  EVT PtrVT = getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  // pc reads as instruction + 8 in ARM state, + 4 in Thumb state.
+  unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
+  ARMConstantPoolValue *CPV = new ARMConstantPoolValue(*DAG.getContext(),
+                                                       "_GLOBAL_OFFSET_TABLE_",
+                                                       ARMPCLabelIndex, PCAdj);
+  SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+  CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+  SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                               PseudoSourceValue::getConstantPool(), 0);
+  SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+  return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+}
+
+/// LowerINTRINSIC_WO_CHAIN - Custom-lower the handful of chainless ARM
+/// intrinsics handled here (thread pointer, SjLj EH LSDA address, SjLj
+/// setjmp); all other intrinsics return an empty SDValue and get default
+/// handling.
+SDValue
+ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                           const ARMSubtarget *Subtarget) {
+  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc dl = Op.getDebugLoc();
+  switch (IntNo) {
+  default: return SDValue();    // Don't custom lower most intrinsics.
+  case Intrinsic::arm_thread_pointer: {
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
+  }
+  case Intrinsic::eh_sjlj_lsda: {
+    // Load the address of this function's LSDA (exception table) from a
+    // constant-pool entry, PC-adjusted and PIC_ADD-fixed when PIC.
+    MachineFunction &MF = DAG.getMachineFunction();
+    ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+    unsigned ARMPCLabelIndex = AFI->createConstPoolEntryUId();
+    EVT PtrVT = getPointerTy();
+    DebugLoc dl = Op.getDebugLoc();  // NOTE: shadows the outer dl (same value)
+    Reloc::Model RelocM = getTargetMachine().getRelocationModel();
+    SDValue CPAddr;
+    unsigned PCAdj = (RelocM != Reloc::PIC_)
+      ? 0 : (Subtarget->isThumb() ? 4 : 8);
+    ARMConstantPoolValue *CPV =
+      new ARMConstantPoolValue(MF.getFunction(), ARMPCLabelIndex,
+                               ARMCP::CPLSDA, PCAdj);
+    CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+    SDValue Result =
+      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr,
+                  PseudoSourceValue::getConstantPool(), 0);
+    SDValue Chain = Result.getValue(1);
+
+    if (RelocM == Reloc::PIC_) {
+      SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32);
+      Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
+    }
+    return Result;
+  }
+  case Intrinsic::eh_sjlj_setjmp:
+    // Thumb passes the current SP as an extra operand; ARM passes 0.
+    // NOTE(review): the precise consumer of Val is the EH_SJLJ_SETJMP
+    // expansion elsewhere — confirm there if changing this.
+    SDValue Val = Subtarget->isThumb() ?
+      DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::SP, MVT::i32) :
+      DAG.getConstant(0, MVT::i32);
+    return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl, MVT::i32, Op.getOperand(1),
+                       Val);
+  }
+}
+
+/// LowerMEMBARRIER - Lower a memory-barrier node into either an
+/// ARMISD::SYNCBARRIER (operand 5 nonzero: device barrier) or an
+/// ARMISD::MEMBARRIER.  Pre-v7 subtargets take an extra zero operand.
+static SDValue LowerMEMBARRIER(SDValue Op, SelectionDAG &DAG,
+                          const ARMSubtarget *Subtarget) {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Chain = Op.getOperand(0);
+  unsigned DeviceBarrier =
+    cast<ConstantSDNode>(Op.getOperand(5))->getZExtValue();
+  // Both barrier kinds are emitted identically apart from the opcode.
+  unsigned Opc = DeviceBarrier ? ARMISD::SYNCBARRIER : ARMISD::MEMBARRIER;
+  if (Subtarget->hasV7Ops())
+    return DAG.getNode(Opc, dl, MVT::Other, Chain);
+  return DAG.getNode(Opc, dl, MVT::Other, Chain,
+                     DAG.getConstant(0, MVT::i32));
+}
+
+/// LowerVASTART - va_start simply stores the address of the
+/// VarArgsFrameIndex slot into the va_list memory location given by the
+/// second operand.
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                            unsigned VarArgsFrameIndex) {
+  DebugLoc dl = Op.getDebugLoc();
+  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue FrameAddr = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+  return DAG.getStore(Op.getOperand(0), dl, FrameAddr, Op.getOperand(1),
+                      SrcVal, 0);
+}
+
+/// LowerDYNAMIC_STACKALLOC - Lower a dynamic stack allocation into an
+/// ARMISD::DYN_ALLOC node bracketed by CALLSEQ_START/CALLSEQ_END so nothing
+/// else touches the stack pointer while it moves.
+SDValue
+ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  EVT VT = Node->getValueType(0);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  SDValue Align = Op.getOperand(2);
+
+  // Chain the dynamic stack allocation so that it doesn't modify the stack
+  // pointer when other instructions are using the stack.
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+  unsigned AlignVal = cast<ConstantSDNode>(Align)->getZExtValue();
+  unsigned StackAlign = getTargetMachine().getFrameInfo()->getStackAlignment();
+  if (AlignVal > StackAlign)
+    // Do this now since selection pass cannot introduce new target
+    // independent node.
+    // NOTE(review): -AlignVal looks like an alignment mask for the
+    // DYN_ALLOC expansion to AND with — confirm against its custom inserter.
+    Align = DAG.getConstant(-(uint64_t)AlignVal, VT);
+
+  // In Thumb1 mode, there isn't a "sub r, sp, r" instruction, we will end up
+  // using a "add r, sp, r" instead. Negate the size now so we don't have to
+  // do even more horrible hack later.
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  if (AFI->isThumb1OnlyFunction()) {
+    bool Negate = true;
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Size);
+    if (C) {
+      // Small word-aligned constants (<= 508, the "sub sp, #imm" reach)
+      // are kept positive and handled directly.
+      uint32_t Val = C->getZExtValue();
+      if (Val <= 508 && ((Val & 3) == 0))
+        Negate = false;
+    }
+    if (Negate)
+      Size = DAG.getNode(ISD::SUB, dl, VT, DAG.getConstant(0, VT), Size);
+  }
+
+  SDVTList VTList = DAG.getVTList(VT, MVT::Other);
+  SDValue Ops1[] = { Chain, Size, Align };
+  SDValue Res = DAG.getNode(ARMISD::DYN_ALLOC, dl, VTList, Ops1, 3);
+  Chain = Res.getValue(1);
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+                             DAG.getIntPtrConstant(0, true), SDValue());
+  // Return both the allocated pointer and the updated chain.
+  SDValue Ops2[] = { Res, Chain };
+  return DAG.getMergeValues(Ops2, 2, dl);
+}
+
+/// GetF64FormalArgument - Reassemble an f64 formal argument whose two i32
+/// halves were assigned by the calling convention to VA and NextVA.  The
+/// first half is always in a register; the second half is either in a
+/// register or spilled to a fixed stack slot.  The halves are joined with
+/// ARMISD::VMOVDRR.
+SDValue
+ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                        SDValue &Root, SelectionDAG &DAG,
+                                        DebugLoc dl) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Thumb1 functions are restricted to the low registers.
+  TargetRegisterClass *RC;
+  if (AFI->isThumb1OnlyFunction())
+    RC = ARM::tGPRRegisterClass;
+  else
+    RC = ARM::GPRRegisterClass;
+
+  // Transform the arguments stored in physical registers into virtual ones.
+  unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+
+  SDValue ArgValue2;
+  if (NextVA.isMemLoc()) {
+    // Second half lives on the stack: materialize a fixed frame object at
+    // its incoming offset and load from it.
+    unsigned ArgSize = NextVA.getLocVT().getSizeInBits()/8;
+    MachineFrameInfo *MFI = MF.getFrameInfo();
+    int FI = MFI->CreateFixedObject(ArgSize, NextVA.getLocMemOffset(),
+                                    true, false);
+
+    // Create load node to retrieve arguments from the stack.
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    ArgValue2 = DAG.getLoad(MVT::i32, dl, Root, FIN,
+                            PseudoSourceValue::getFixedStack(FI), 0);
+  } else {
+    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
+    ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
+  }
+
+  return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
+}
+
+SDValue
+ARMTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins,
+                                CCAssignFnForNode(CallConv, /* Return*/ false,
+                                                  isVarArg));
+
+  SmallVector<SDValue, 16> ArgValues;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments stored in registers.
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+
+      SDValue ArgValue;
+      if (VA.needsCustom()) {
+        // f64 and vector types are split up into multiple registers or
+        // combinations of registers and stack slots.
+        RegVT = MVT::i32;
+
+        if (VA.getLocVT() == MVT::v2f64) {
+          SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                                   Chain, DAG, dl);
+          VA = ArgLocs[++i]; // skip ahead to next loc
+          SDValue ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
+                                                   Chain, DAG, dl);
+          ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue1, DAG.getIntPtrConstant(0));
+          ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
+                                 ArgValue, ArgValue2, DAG.getIntPtrConstant(1));
+        } else
+          ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+
+      } else {
+        TargetRegisterClass *RC;
+
+        if (RegVT == MVT::f32)
+          RC = ARM::SPRRegisterClass;
+        else if (RegVT == MVT::f64)
+          RC = ARM::DPRRegisterClass;
+        else if (RegVT == MVT::v2f64)
+          RC = ARM::QPRRegisterClass;
+        else if (RegVT == MVT::i32)
+          RC = (AFI->isThumb1OnlyFunction() ?
+                ARM::tGPRRegisterClass : ARM::GPRRegisterClass);
+        else
+          llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
+
+        // Transform the arguments in physical registers into virtual ones.
+        unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+        ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      }
+
+      // If this is an 8 or 16-bit value, it is really passed promoted
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::SExt:
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::ZExt:
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+        break;
+      }
+
+      InVals.push_back(ArgValue);
+
+    } else { // VA.isRegLoc()
+
+      // sanity check
+      assert(VA.isMemLoc());
+      assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
+
+      unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
+      int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(),
+                                      true, false);
+
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
+                                   PseudoSourceValue::getFixedStack(FI), 0));
+    }
+  }
+
+  // varargs
+  if (isVarArg) {
+    static const unsigned GPRArgRegs[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3
+    };
+
+    unsigned NumGPRs = CCInfo.getFirstUnallocated
+      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+
+    unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+    unsigned VARegSize = (4 - NumGPRs) * 4;
+    unsigned VARegSaveSize = (VARegSize + Align - 1) & ~(Align - 1);
+    unsigned ArgOffset = CCInfo.getNextStackOffset();
+    if (VARegSaveSize) {
+      // If this function is vararg, store any remaining integer argument regs
+      // to their spots on the stack so that they may be loaded by deferencing
+      // the result of va_next.
+      AFI->setVarArgsRegSaveSize(VARegSaveSize);
+      VarArgsFrameIndex = MFI->CreateFixedObject(VARegSaveSize, ArgOffset +
+                                                 VARegSaveSize - VARegSize,
+                                                 true, false);
+      SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+
+      SmallVector<SDValue, 4> MemOps;
+      for (; NumGPRs < 4; ++NumGPRs) {
+        TargetRegisterClass *RC;
+        if (AFI->isThumb1OnlyFunction())
+          RC = ARM::tGPRRegisterClass;
+        else
+          RC = ARM::GPRRegisterClass;
+
+        unsigned VReg = MF.addLiveIn(GPRArgRegs[NumGPRs], RC);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                        PseudoSourceValue::getFixedStack(VarArgsFrameIndex), 0);
+        MemOps.push_back(Store);
+        FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), FIN,
+                          DAG.getConstant(4, getPointerTy()));
+      }
+      if (!MemOps.empty())
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            &MemOps[0], MemOps.size());
+    } else
+      // This will point to the next argument passed via stack.
+      VarArgsFrameIndex = MFI->CreateFixedObject(4, ArgOffset, true, false);
+  }
+
+  return Chain;
+}
+
+/// isFloatingPointZero - Return true if this is +0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isPosZero();
+  else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
+    // Maybe this has already been legalized into the constant pool?
+    // Operand 1 of a load is its base pointer; look through an
+    // ARMISD::Wrapper for a constant-pool entry holding +0.0.
+    if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
+      SDValue WrapperOp = Op.getOperand(1).getOperand(0);
+      if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(WrapperOp))
+        if (ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
+          return CFP->getValueAPF().isPosZero();
+    }
+  }
+  return false;
+}
+
+/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
+/// the given operands.  On return, ARMCC holds the ARMCC::CondCodes value
+/// (as an i32 constant node) to pair with the returned compare node.
+SDValue
+ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                             SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl) {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    unsigned C = RHSC->getZExtValue();
+    if (!isLegalICmpImmediate(C)) {
+      // Constant does not fit, try adjusting it by one?  Each rewrite below
+      // is only valid when the constant does not wrap around the end of its
+      // (signed or unsigned) range, so guard the boundary value explicitly.
+      switch (CC) {
+      default: break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        // (x >= INT_MIN) has no (x > INT_MIN-1) equivalent; C-1 would wrap.
+        if (C != 0x80000000U && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if (C != 0 && isLegalICmpImmediate(C-1)) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          RHS = DAG.getConstant(C-1, MVT::i32);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        // (x <= INT_MAX) has no (x < INT_MAX+1) equivalent; C+1 would wrap.
+        if (C != 0x7fffffffU && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if (C != 0xffffffffU && isLegalICmpImmediate(C+1)) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          RHS = DAG.getConstant(C+1, MVT::i32);
+        }
+        break;
+      }
+    }
+  }
+
+  ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  ARMISD::NodeType CompareType;
+  switch (CondCode) {
+  default:
+    CompareType = ARMISD::CMP;
+    break;
+  case ARMCC::EQ:
+  case ARMCC::NE:
+    // Uses only Z Flag, so CMPZ is sufficient.
+    CompareType = ARMISD::CMPZ;
+    break;
+  }
+  ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  return DAG.getNode(CompareType, dl, MVT::Flag, LHS, RHS);
+}
+
+/// Returns an appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
+static SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
+                         DebugLoc dl) {
+  SDValue Cmp;
+  // Comparing against +0.0 can use the compare-with-zero form (CMPFPw0).
+  if (!isFloatingPointZero(RHS))
+    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Flag, LHS, RHS);
+  else
+    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Flag, LHS);
+  // FMSTAT transfers the VFP status flags to the ARM CPSR condition flags.
+  return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Flag, Cmp);
+}
+
+// Lower ISD::SELECT_CC to an ARMISD::CMOV (or a pair of them for FP
+// conditions that need two ARM predicates).
+SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Integer compares need only a single CMP + CMOV.
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue ARMCC;
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl);
+    return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC, CCR,Cmp);
+  }
+
+  // FP conditions may map to two ARM condition codes (CondCode2 != AL).
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue Result = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
+                                 ARMCC, CCR, Cmp);
+  if (CondCode2 != ARMCC::AL) {
+    SDValue ARMCC2 = DAG.getConstant(CondCode2, MVT::i32);
+    // FIXME: Needs another CMP because flag can have but one use.
+    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+    // Chain a second CMOV so either condition selects TrueVal.
+    Result = DAG.getNode(ARMISD::CMOV, dl, VT,
+                         Result, TrueVal, ARMCC2, CCR, Cmp2);
+  }
+  return Result;
+}
+
+// Lower ISD::BR_CC to ARMISD::BRCOND (one branch per required ARM
+// condition code; FP conditions may need two).
+SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+  SDValue  Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue    LHS = Op.getOperand(2);
+  SDValue    RHS = Op.getOperand(3);
+  SDValue   Dest = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Integer compare: single CMP + conditional branch.
+  if (LHS.getValueType() == MVT::i32) {
+    SDValue ARMCC;
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMCC, DAG, dl);
+    return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other,
+                       Chain, Dest, ARMCC, CCR,Cmp);
+  }
+
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+  ARMCC::CondCodes CondCode, CondCode2;
+  FPCCToARMCC(CC, CondCode, CondCode2);
+
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue ARMCC = DAG.getConstant(CondCode, MVT::i32);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  // The extra Flag result lets the second branch reuse the compare flags.
+  SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDValue Ops[] = { Chain, Dest, ARMCC, CCR, Cmp };
+  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+  if (CondCode2 != ARMCC::AL) {
+    ARMCC = DAG.getConstant(CondCode2, MVT::i32);
+    // Branch again on the second condition, consuming the glued flags.
+    SDValue Ops[] = { Res, Dest, ARMCC, CCR, Res.getValue(1) };
+    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+  }
+  return Res;
+}
+
+// Lower ISD::BR_JT (jump-table branch).  The entry address is computed as
+// table base + index*4; the load/indirect-branch shape depends on subtarget
+// and relocation model.
+SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Table = Op.getOperand(1);
+  SDValue Index = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT PTy = getPointerTy();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
+  ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
+  SDValue UId = DAG.getConstant(AFI->createJumpTableUId(), PTy);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
+  Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI, UId);
+  // Each jump-table entry is 4 bytes.
+  Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, PTy));
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Index, Table);
+  if (Subtarget->isThumb2()) {
+    // Thumb2 uses a two-level jump. That is, it jumps into the jump table
+    // which does another jump to the destination. This also makes it easier
+    // to translate it to TBB / TBH later.
+    // FIXME: This might not work if the function is extremely large.
+    return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
+                       Addr, Op.getOperand(2), JTI, UId);
+  }
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    // PIC: entries are offsets relative to the table; load and add the base.
+    Addr = DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
+                       PseudoSourceValue::getJumpTable(), 0);
+    Chain = Addr.getValue(1);
+    Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr, Table);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+  } else {
+    // Static: entries are absolute addresses; load and branch directly.
+    Addr = DAG.getLoad(PTy, dl, Chain, Addr,
+                       PseudoSourceValue::getJumpTable(), 0);
+    Chain = Addr.getValue(1);
+    return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI, UId);
+  }
+}
+
+// Lower FP_TO_SINT/FP_TO_UINT.  The VFP convert leaves the result in an
+// f32 register, so bit-convert it back to i32.
+static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc =
+    Op.getOpcode() == ISD::FP_TO_SINT ? ARMISD::FTOSI : ARMISD::FTOUI;
+  Op = DAG.getNode(Opc, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+}
+
+// Lower SINT_TO_FP/UINT_TO_FP.  The VFP convert expects its i32 input in
+// an f32 register, hence the bit-convert first.
+static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc =
+    Op.getOpcode() == ISD::SINT_TO_FP ? ARMISD::SITOF : ARMISD::UITOF;
+
+  Op = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(Opc, dl, VT, Op);
+}
+
+static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+  // Implement fcopysign with a fabs and a conditional fneg.
+  // NOTE(review): the sign of Tmp1 is detected with an ordered (< 0.0)
+  // compare, so a sign operand of -0.0 or NaN will not trigger the negate —
+  // confirm this matches the intended copysign semantics for this target.
+  SDValue Tmp0 = Op.getOperand(0);
+  SDValue Tmp1 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  EVT SrcVT = Tmp1.getValueType();
+  SDValue AbsVal = DAG.getNode(ISD::FABS, dl, VT, Tmp0);
+  SDValue Cmp = getVFPCmp(Tmp1, DAG.getConstantFP(0.0, SrcVT), DAG, dl);
+  SDValue ARMCC = DAG.getConstant(ARMCC::LT, MVT::i32);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  // CNEG: negate AbsVal when the condition holds, else keep AbsVal.
+  return DAG.getNode(ARMISD::CNEG, dl, VT, AbsVal, AbsVal, ARMCC, CCR, Cmp);
+}
+
+// Lower ISD::FRAMEADDR: start from the frame-pointer register and follow
+// saved-FP links one load per requested depth level.
+SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  // Thumb and Darwin use R7 as the frame pointer; other ARM targets use R11.
+  unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetDarwin())
+    ? ARM::R7 : ARM::R11;
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
+  return FrameAddr;
+}
+
+/// EmitTargetCodeForMemcpy - Inline-expand a small constant-size memcpy as
+/// batches of i32 loads/stores (combinable into ldm/stm), with i16/i8 ops
+/// for the 1-3 trailing bytes.  Returns a null SDValue to fall back to the
+/// generic expansion when alignment < 4 or the size is unknown/too large.
+SDValue
+ARMTargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                           SDValue Chain,
+                                           SDValue Dst, SDValue Src,
+                                           SDValue Size, unsigned Align,
+                                           bool AlwaysInline,
+                                         const Value *DstSV, uint64_t DstSVOff,
+                                         const Value *SrcSV, uint64_t SrcSVOff){
+  // Do repeated 4-byte loads and stores. To be improved.
+  // This requires 4-byte alignment.
+  if ((Align & 3) != 0)
+    return SDValue();
+  // This requires the copy size to be a constant, preferrably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
+    return SDValue();
+
+  unsigned BytesLeft = SizeVal & 3;
+  unsigned NumMemOps = SizeVal >> 2;
+  unsigned EmittedNumMemOps = 0;
+  EVT VT = MVT::i32;
+  unsigned VTSize = 4;
+  unsigned i = 0;
+  const unsigned MAX_LOADS_IN_LDM = 6;
+  SDValue TFOps[MAX_LOADS_IN_LDM];
+  SDValue Loads[MAX_LOADS_IN_LDM];
+  uint64_t SrcOff = 0, DstOff = 0;
+
+  // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
+  // same number of stores.  The loads and stores will get combined into
+  // ldm/stm later on.
+  while (EmittedNumMemOps < NumMemOps) {
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      Loads[i] = DAG.getLoad(VT, dl, Chain,
+                             DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                         DAG.getConstant(SrcOff, MVT::i32)),
+                             SrcSV, SrcSVOff + SrcOff);
+      TFOps[i] = Loads[i].getValue(1);
+      SrcOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    for (i = 0;
+         i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
+      TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                           DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                       DAG.getConstant(DstOff, MVT::i32)),
+                           DstSV, DstSVOff + DstOff);
+      DstOff += VTSize;
+    }
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+    EmittedNumMemOps += i;
+  }
+
+  if (BytesLeft == 0)
+    return Chain;
+
+  // Issue loads / stores for the trailing (1 - 3) bytes.
+  unsigned BytesLeftSave = BytesLeft;
+  i = 0;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    Loads[i] = DAG.getLoad(VT, dl, Chain,
+                           DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
+                                       DAG.getConstant(SrcOff, MVT::i32)),
+                           SrcSV, SrcSVOff + SrcOff);
+    TFOps[i] = Loads[i].getValue(1);
+    ++i;
+    SrcOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+
+  // Replay the same size sequence for the stores.
+  i = 0;
+  BytesLeft = BytesLeftSave;
+  while (BytesLeft) {
+    if (BytesLeft >= 2) {
+      VT = MVT::i16;
+      VTSize = 2;
+    } else {
+      VT = MVT::i8;
+      VTSize = 1;
+    }
+
+    TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
+                            DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
+                                        DAG.getConstant(DstOff, MVT::i32)),
+                            DstSV, DstSVOff + DstOff);
+    ++i;
+    DstOff += VTSize;
+    BytesLeft -= VTSize;
+  }
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+}
+
+// Expand a BIT_CONVERT between i64 and f64 into the register-pair moves
+// VMOVDRR (i64->f64) and VMOVRRD (f64->i64).
+static SDValue ExpandBIT_CONVERT(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  DebugLoc dl = N->getDebugLoc();
+  if (N->getValueType(0) == MVT::f64) {
+    // Turn i64->f64 into VMOVDRR.
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(0, MVT::i32));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op,
+                             DAG.getConstant(1, MVT::i32));
+    return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
+  }
+
+  // Turn f64->i64 into VMOVRRD.
+  SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                            DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
+
+  // Merge the pieces into a single i64 value.
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
+}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Zero vectors are used to represent vector negation and in those cases
+  // will be implemented with the NEON VNEG instruction.  However, VNEG does
+  // not support i64 elements, so sometimes the zero vectors will need to be
+  // explicitly constructed.  For those cases, and potentially other uses in
+  // the future, always build zero vectors as <16 x i8> or <8 x i8> bitcasted
+  // to their dest type.  This ensures they get CSE'd.
+  SDValue Vec;
+  SDValue Cst = DAG.getTargetConstant(0, MVT::i8);
+  SmallVector<SDValue, 8> Ops;
+  MVT TVT;
+
+  // 64-bit (D register) vs. 128-bit (Q register) canonical form.
+  if (VT.getSizeInBits() == 64) {
+    Ops.assign(8, Cst); TVT = MVT::v8i8;
+  } else {
+    Ops.assign(16, Cst); TVT = MVT::v16i8;
+  }
+  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
+
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Always build ones vectors as <16 x i8> or <8 x i8> bitcasted to their
+  // dest type. This ensures they get CSE'd.
+  SDValue Vec;
+  SDValue Cst = DAG.getTargetConstant(0xFF, MVT::i8);
+  SmallVector<SDValue, 8> Ops;
+  MVT TVT;
+
+  // 64-bit (D register) vs. 128-bit (Q register) canonical form.
+  if (VT.getSizeInBits() == 64) {
+    Ops.assign(8, Cst); TVT = MVT::v8i8;
+  } else {
+    Ops.assign(16, Cst); TVT = MVT::v16i8;
+  }
+  Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, TVT, &Ops[0], Ops.size());
+
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMCC;
+  // SRA_PARTS shifts the high word arithmetically, SRL_PARTS logically.
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  // FalseVal covers ShAmt < VTBits: lo = (lo >> amt) | (hi << (VTBits-amt)).
+  // TrueVal covers ShAmt >= VTBits: lo = hi >> (amt - VTBits).
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i32));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+
+  // Select between the two forms on (ShAmt - VTBits) >= 0.
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
+                          ARMCC, DAG, dl);
+  SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMCC,
+                           CCR, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i32 values and take a 2 x i32 value to shift plus a shift amount.
+SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  SDValue ARMCC;
+
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+  // FalseVal covers ShAmt < VTBits: hi = (hi << amt) | (lo >> (VTBits-amt)).
+  // Tmp3 covers ShAmt >= VTBits: hi = lo << (amt - VTBits).
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
+                                 DAG.getConstant(VTBits, MVT::i32), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i32));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+  // Select between the two forms on (ShAmt - VTBits) >= 0.
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  SDValue Cmp = getARMCmp(ExtraShAmt, DAG.getConstant(0, MVT::i32), ISD::SETGE,
+                          ARMCC, DAG, dl);
+  SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Hi = DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, Tmp3, ARMCC,
+                           CCR, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+// Lower CTTZ as cttz(x) = ctlz(rbit(x)) using the v6t2 RBIT instruction;
+// without v6t2 fall back to the generic expansion.
+static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
+                         const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  if (!ST->hasV6T2Ops())
+    return SDValue();
+
+  SDValue rbit = DAG.getNode(ARMISD::RBIT, dl, VT, N->getOperand(0));
+  return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
+}
+
+// Lower shifts: NEON vector shifts via the vshift intrinsics, and 64-bit
+// scalar shift-by-one via SRL_FLAG/SRA_FLAG + RRX.
+static SDValue LowerShift(SDNode *N, SelectionDAG &DAG,
+                          const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+  DebugLoc dl = N->getDebugLoc();
+
+  // Lower vector shifts on NEON to use VSHL.
+  if (VT.isVector()) {
+    assert(ST->hasNEON() && "unexpected vector shift");
+
+    // Left shifts translate directly to the vshiftu intrinsic.
+    if (N->getOpcode() == ISD::SHL)
+      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                         DAG.getConstant(Intrinsic::arm_neon_vshiftu, MVT::i32),
+                         N->getOperand(0), N->getOperand(1));
+
+    assert((N->getOpcode() == ISD::SRA ||
+            N->getOpcode() == ISD::SRL) && "unexpected vector shift opcode");
+
+    // NEON uses the same intrinsics for both left and right shifts.  For
+    // right shifts, the shift amounts are negative, so negate the vector of
+    // shift amounts.
+    EVT ShiftVT = N->getOperand(1).getValueType();
+    SDValue NegatedCount = DAG.getNode(ISD::SUB, dl, ShiftVT,
+                                       getZeroVector(ShiftVT, DAG, dl),
+                                       N->getOperand(1));
+    Intrinsic::ID vshiftInt = (N->getOpcode() == ISD::SRA ?
+                               Intrinsic::arm_neon_vshifts :
+                               Intrinsic::arm_neon_vshiftu);
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                       DAG.getConstant(vshiftInt, MVT::i32),
+                       N->getOperand(0), NegatedCount);
+  }
+
+  // We can get here for a node like i32 = ISD::SHL i32, i64
+  if (VT != MVT::i64)
+    return SDValue();
+
+  assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+         "Unknown shift to lower!");
+
+  // We only lower SRA, SRL of 1 here, all others use generic lowering.
+  if (!isa<ConstantSDNode>(N->getOperand(1)) ||
+      cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() != 1)
+    return SDValue();
+
+  // If we are in thumb mode, we don't have RRX.
+  if (ST->isThumb1Only()) return SDValue();
+
+  // Okay, we have a 64-bit SRA or SRL of 1.  Lower this to an RRX expr.
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                             DAG.getConstant(0, MVT::i32));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(0),
+                             DAG.getConstant(1, MVT::i32));
+
+  // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
+  // captures the result into a carry flag.
+  unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
+  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Flag), &Hi, 1);
+
+  // The low part is an ARMISD::RRX operand, which shifts the carry in.
+  Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
+
+  // Merge the pieces into a single i64 value.
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+// Lower vector setcc to the NEON compare nodes (VCEQ/VCGT/VCGE/...).  Some
+// conditions are synthesized by swapping operands, inverting the result,
+// or OR-ing two compares.
+static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDValue TmpOp0, TmpOp1;
+  bool Invert = false;
+  bool Swap = false;
+  unsigned Opc = 0;
+
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  EVT VT = Op.getValueType();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Op.getOperand(1).getValueType().isFloatingPoint()) {
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal FP comparison"); break;
+    case ISD::SETUNE:
+    case ISD::SETNE:  Invert = true; // Fallthrough
+    case ISD::SETOEQ:
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETOLT:
+    case ISD::SETLT: Swap = true; // Fallthrough
+    case ISD::SETOGT:
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETOLE:
+    case ISD::SETLE:  Swap = true; // Fallthrough
+    case ISD::SETOGE:
+    case ISD::SETGE: Opc = ARMISD::VCGE; break;
+    case ISD::SETUGE: Swap = true; // Fallthrough
+    case ISD::SETULE: Invert = true; Opc = ARMISD::VCGT; break;
+    case ISD::SETUGT: Swap = true; // Fallthrough
+    case ISD::SETULT: Invert = true; Opc = ARMISD::VCGE; break;
+    case ISD::SETUEQ: Invert = true; // Fallthrough
+    case ISD::SETONE:
+      // Expand this to (OLT | OGT).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp0, TmpOp1);
+      break;
+    case ISD::SETUO: Invert = true; // Fallthrough
+    case ISD::SETO:
+      // Expand this to (OLT | OGE).
+      TmpOp0 = Op0;
+      TmpOp1 = Op1;
+      Opc = ISD::OR;
+      Op0 = DAG.getNode(ARMISD::VCGT, dl, VT, TmpOp1, TmpOp0);
+      Op1 = DAG.getNode(ARMISD::VCGE, dl, VT, TmpOp0, TmpOp1);
+      break;
+    }
+  } else {
+    // Integer comparisons.
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Illegal integer comparison"); break;
+    case ISD::SETNE:  Invert = true;
+    case ISD::SETEQ:  Opc = ARMISD::VCEQ; break;
+    case ISD::SETLT:  Swap = true;
+    case ISD::SETGT:  Opc = ARMISD::VCGT; break;
+    case ISD::SETLE:  Swap = true;
+    case ISD::SETGE:  Opc = ARMISD::VCGE; break;
+    case ISD::SETULT: Swap = true;
+    case ISD::SETUGT: Opc = ARMISD::VCGTU; break;
+    case ISD::SETULE: Swap = true;
+    case ISD::SETUGE: Opc = ARMISD::VCGEU; break;
+    }
+
+    // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
+    if (Opc == ARMISD::VCEQ) {
+
+      // AndOp is null (default SDValue) unless one operand is all-zeros.
+      SDValue AndOp;
+      if (ISD::isBuildVectorAllZeros(Op1.getNode()))
+        AndOp = Op0;
+      else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
+        AndOp = Op1;
+
+      // Ignore bitconvert.
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::BIT_CONVERT)
+        AndOp = AndOp.getOperand(0);
+
+      if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
+        Opc = ARMISD::VTST;
+        Op0 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(0));
+        Op1 = DAG.getNode(ISD::BIT_CONVERT, dl, VT, AndOp.getOperand(1));
+        // VTST tests (and != 0); flip Invert since VCEQ tested (== 0).
+        Invert = !Invert;
+      }
+    }
+  }
+
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  return Result;
+}
+
+/// isVMOVSplat - Check if the specified splat value corresponds to an immediate
+/// VMOV instruction, and if so, return the constant being splatted.
+static SDValue isVMOVSplat(uint64_t SplatBits, uint64_t SplatUndef,
+                           unsigned SplatBitSize, SelectionDAG &DAG) {
+  switch (SplatBitSize) {
+  case 8:
+    // Any 1-byte value is OK.
+    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
+    return DAG.getTargetConstant(SplatBits, MVT::i8);
+
+  case 16:
+    // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
+    if ((SplatBits & ~0xff) == 0 ||
+        (SplatBits & ~0xff00) == 0)
+      return DAG.getTargetConstant(SplatBits, MVT::i16);
+    break;
+
+  case 32:
+    // NEON's 32-bit VMOV supports splat values where:
+    // * only one byte is nonzero, or
+    // * the least significant byte is 0xff and the second byte is nonzero, or
+    // * the least significant 2 bytes are 0xff and the third is nonzero.
+    if ((SplatBits & ~0xff) == 0 ||
+        (SplatBits & ~0xff00) == 0 ||
+        (SplatBits & ~0xff0000) == 0 ||
+        (SplatBits & ~0xff000000) == 0)
+      return DAG.getTargetConstant(SplatBits, MVT::i32);
+
+    // Undef bits in the low byte may be treated as 0xff ("VMOV.I32 0x..ff").
+    if ((SplatBits & ~0xffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xff) == 0xff)
+      return DAG.getTargetConstant(SplatBits | 0xff, MVT::i32);
+
+    // Likewise for the low two bytes.
+    if ((SplatBits & ~0xffffff) == 0 &&
+        ((SplatBits | SplatUndef) & 0xffff) == 0xffff)
+      return DAG.getTargetConstant(SplatBits | 0xffff, MVT::i32);
+
+    // Note: there are a few 32-bit splat values (specifically: 00ffff00,
+    // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
+    // VMOV.I32.  A (very) minor optimization would be to replicate the value
+    // and fall through here to test for a valid 64-bit splat.  But, then the
+    // caller would also need to check and handle the change in size.
+    break;
+
+  case 64: {
+    // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
+    uint64_t BitMask = 0xff;
+    uint64_t Val = 0;
+    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
+      // Undef bytes may be treated as 0xff; partially-set bytes are invalid.
+      if (((SplatBits | SplatUndef) & BitMask) == BitMask)
+        Val |= BitMask;
+      else if ((SplatBits & BitMask) != 0)
+        return SDValue();
+      BitMask <<= 8;
+    }
+    return DAG.getTargetConstant(Val, MVT::i64);
+  }
+
+  default:
+    llvm_unreachable("unexpected size for isVMOVSplat");
+    break;
+  }
+
+  return SDValue();
+}
+
+/// getVMOVImm - If this is a build_vector of constants which can be
+/// formed by using a VMOV instruction of the specified element size,
+/// return the constant being splatted.  The ByteSize field indicates the
+/// number of bytes of each element [1248].
+SDValue ARM::getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N);
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs, ByteSize * 8))
+    return SDValue();
+
+  // Reject splats wider than the requested element size.
+  if (SplatBitSize > ByteSize * 8)
+    return SDValue();
+
+  return isVMOVSplat(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+                     SplatBitSize, DAG);
+}
+
+// isVEXTMask - Check if the shuffle mask M is a VEXT (vector extract)
+// pattern on VT.  On success, Imm is the starting-element immediate and
+// ReverseVEXT indicates the two source operands must be swapped.
+static bool isVEXTMask(const SmallVectorImpl<int> &M, EVT VT,
+                       bool &ReverseVEXT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+  ReverseVEXT = false;
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, it may still be
+    // a VEXT but the source vectors must be swapped.
+    ExpectedElt += 1;
+    if (ExpectedElt == NumElts * 2) {
+      ExpectedElt = 0;
+      ReverseVEXT = true;
+    }
+
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  // Adjust the index value if the source operands will be swapped.
+  if (ReverseVEXT)
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isVREVMask - Check if a vector shuffle corresponds to a VREV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isVREVMask(const SmallVectorImpl<int> &M, EVT VT,
+                       unsigned BlockSize) {
+  assert((BlockSize==16 || BlockSize==32 || BlockSize==64) &&
+         "Only possible block sizes for VREV are: 16, 32, 64");
+
+  // VREV does not operate on 64-bit elements.
+  unsigned EltBits = VT.getVectorElementType().getSizeInBits();
+  if (EltBits == 64)
+    return false;
+
+  // The first mask entry fixes how many elements form one reversed block;
+  // the block must be larger than one element and exactly BlockSize bits.
+  unsigned EltsPerBlock = M[0] + 1;
+  if (BlockSize <= EltBits || BlockSize != EltsPerBlock * EltBits)
+    return false;
+
+  // Every index must be the mirror of its position within its block.
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 0; i != NumElts; ++i) {
+    unsigned BlockStart = i - i % EltsPerBlock;
+    unsigned Mirrored = BlockStart + (EltsPerBlock - 1 - i % EltsPerBlock);
+    if (static_cast<unsigned>(M[i]) != Mirrored)
+      return false;
+  }
+
+  return true;
+}
+
+static bool isVTRNMask(const SmallVectorImpl<int> &M, EVT VT,
+                       unsigned &WhichResult) {
+  // VTRN does not support 64-bit elements.
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  // A VTRN result interleaves corresponding even (result 0) or odd
+  // (result 1) elements of the two sources: <i+r, i+NumElts+r> for each
+  // even position i, where r selects the result.
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2)
+    if (static_cast<unsigned>(M[i]) != i + WhichResult ||
+        static_cast<unsigned>(M[i+1]) != i + NumElts + WhichResult)
+      return false;
+
+  return true;
+}
+
+/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isVTRN_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
+                                unsigned &WhichResult) {
+  // VTRN does not support 64-bit elements.
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  // With both sources identical, each interleaved pair repeats a single
+  // index: <i+r, i+r> for each even position i.
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2)
+    if (static_cast<unsigned>(M[i]) != i + WhichResult ||
+        static_cast<unsigned>(M[i+1]) != i + WhichResult)
+      return false;
+
+  return true;
+}
+
+static bool isVUZPMask(const SmallVectorImpl<int> &M, EVT VT,
+                       unsigned &WhichResult) {
+  // VUZP does not support 64-bit elements.
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  // A VUZP result takes all even (result 0) or all odd (result 1)
+  // elements of the concatenated sources.
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i)
+    if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
+      return false;
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isVUZP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
+                                unsigned &WhichResult) {
+  // VUZP does not support 64-bit elements.
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  // With both sources identical, each half of the mask repeats the same
+  // even (result 0) or odd (result 1) index sequence.
+  unsigned Half = VT.getVectorNumElements() / 2;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned Part = 0; Part != 2; ++Part) {
+    unsigned Expected = WhichResult;
+    for (unsigned i = 0; i != Half; ++i) {
+      if (static_cast<unsigned>(M[Part * Half + i]) != Expected)
+        return false;
+      Expected += 2;
+    }
+  }
+
+  // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+static bool isVZIPMask(const SmallVectorImpl<int> &M, EVT VT,
+                       unsigned &WhichResult) {
+  // VZIP does not support 64-bit elements.
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  // A VZIP result interleaves one half of each source:
+  // <Idx, Idx+NumElts, Idx+1, Idx+NumElts+1, ...>, with Idx starting at 0
+  // for the first result and at NumElts/2 for the second.
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Expected = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i < NumElts; i += 2, ++Expected)
+    if (static_cast<unsigned>(M[i]) != Expected ||
+        static_cast<unsigned>(M[i+1]) != Expected + NumElts)
+      return false;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isVZIP_v_undef_Mask(const SmallVectorImpl<int> &M, EVT VT,
+                                unsigned &WhichResult) {
+  // VZIP does not support 64-bit elements.
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  // With both sources identical, the zip repeats each index twice:
+  // <Idx, Idx, Idx+1, Idx+1, ...>.
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Expected = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i < NumElts; i += 2, ++Expected)
+    if (static_cast<unsigned>(M[i]) != Expected ||
+        static_cast<unsigned>(M[i+1]) != Expected)
+      return false;
+
+  // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
+  if (VT.is64BitVector() && EltSz == 32)
+    return false;
+
+  return true;
+}
+
+
+// BuildSplat - Build a canonical splat of the constant Val with result type
+// VT.  All-zeros and all-ones splats go through getZeroVector/getOnesVector;
+// any other value is splatted as a BUILD_VECTOR of the canonical integer
+// vector type for the element size and then bit-converted to VT, so equal
+// splats of different vector types CSE to one node.
+static SDValue BuildSplat(SDValue Val, EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  // Canonicalize all-zeros and all-ones vectors.
+  ConstantSDNode *ConstVal = cast<ConstantSDNode>(Val.getNode());
+  if (ConstVal->isNullValue())
+    return getZeroVector(VT, DAG, dl);
+  if (ConstVal->isAllOnesValue())
+    return getOnesVector(VT, DAG, dl);
+
+  // Pick the canonical integer vector type: same total width as VT,
+  // element width equal to Val's type.
+  EVT CanonicalVT;
+  if (VT.is64BitVector()) {
+    switch (Val.getValueType().getSizeInBits()) {
+    case 8:  CanonicalVT = MVT::v8i8; break;
+    case 16: CanonicalVT = MVT::v4i16; break;
+    case 32: CanonicalVT = MVT::v2i32; break;
+    case 64: CanonicalVT = MVT::v1i64; break;
+    default: llvm_unreachable("unexpected splat element type"); break;
+    }
+  } else {
+    assert(VT.is128BitVector() && "unknown splat vector size");
+    switch (Val.getValueType().getSizeInBits()) {
+    case 8:  CanonicalVT = MVT::v16i8; break;
+    case 16: CanonicalVT = MVT::v8i16; break;
+    case 32: CanonicalVT = MVT::v4i32; break;
+    case 64: CanonicalVT = MVT::v2i64; break;
+    default: llvm_unreachable("unexpected splat element type"); break;
+    }
+  }
+
+  // Build a canonical splat for this value.
+  SmallVector<SDValue, 8> Ops;
+  Ops.assign(CanonicalVT.getVectorNumElements(), Val);
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, &Ops[0],
+                            Ops.size());
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Res);
+}
+
+// LowerBUILD_VECTOR - Custom lowering for BUILD_VECTOR.
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.
+static SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  // First try a constant splat that a NEON VMOV-immediate can materialize.
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      SDValue Val = isVMOVSplat(SplatBits.getZExtValue(),
+                                SplatUndef.getZExtValue(), SplatBitSize, DAG);
+      if (Val.getNode())
+        return BuildSplat(Val, VT, DAG, dl);
+    }
+  }
+
+  // If there are only 2 elements in a 128-bit vector, insert them into an
+  // undef vector.  This handles the common case for 128-bit vector argument
+  // passing, where the insertions should be translated to subreg accesses
+  // with no real instructions.
+  if (VT.is128BitVector() && Op.getNumOperands() == 2) {
+    SDValue Val = DAG.getUNDEF(VT);
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+    // Skip inserts of undef operands; the UNDEF base already covers them.
+    if (Op0.getOpcode() != ISD::UNDEF)
+      Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op0,
+                        DAG.getIntPtrConstant(0));
+    if (Op1.getOpcode() != ISD::UNDEF)
+      Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, Op1,
+                        DAG.getIntPtrConstant(1));
+    return Val;
+  }
+
+  return SDValue();
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+ARMTargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  // 4-element shuffles can be synthesized from the perfect-shuffle table
+  // whenever the best entry costs at most 4 operations.
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (M[i] < 0)
+        PFIndexes[i] = 8;  // undef mask entries are encoded as 8
+      else
+        PFIndexes[i] = M[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);  // top two bits hold the cost
+
+    if (Cost <= 4)
+      return true;
+  }
+
+  bool ReverseVEXT;
+  unsigned Imm, WhichResult;
+
+  // Otherwise the mask is legal iff it matches one of the NEON permute
+  // patterns handled by LowerVECTOR_SHUFFLE.
+  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isVREVMask(M, VT, 64) ||
+          isVREVMask(M, VT, 32) ||
+          isVREVMask(M, VT, 16) ||
+          isVEXTMask(M, VT, ReverseVEXT, Imm) ||
+          isVTRNMask(M, VT, WhichResult) ||
+          isVUZPMask(M, VT, WhichResult) ||
+          isVZIPMask(M, VT, WhichResult) ||
+          isVTRN_v_undef_Mask(M, VT, WhichResult) ||
+          isVUZP_v_undef_Mask(M, VT, WhichResult) ||
+          isVZIP_v_undef_Mask(M, VT, WhichResult));
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      DebugLoc dl) {
+  // PFEntry layout: bits 31-30 cost, 29-26 opcode, 25-13 LHS table index,
+  // 12-0 RHS table index (indices are recursively expanded below).
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  // Opcodes emitted by the perfect-shuffle table generator.
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  if (OpNum == OP_COPY) {
+    // The identity shuffles: <0,1,2,3> means LHS, <4,5,6,7> means RHS.
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  // Recursively materialize the two operand shuffles before combining them.
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3:
+    // Lane number is encoded by the opcode's offset from OP_VDUP0.
+    return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
+                       OpLHS, DAG.getConstant(OpNum-OP_VDUP0, MVT::i32));
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3:
+    // Extract amount is the opcode's offset from OP_VEXT1, plus one.
+    return DAG.getNode(ARMISD::VEXT, dl, VT,
+                       OpLHS, OpRHS,
+                       DAG.getConstant(OpNum-OP_VEXT1+1, MVT::i32));
+  case OP_VUZPL:
+  case OP_VUZPR:
+    // The two-result permutes select result 0 (left) or 1 (right).
+    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
+  case OP_VZIPL:
+  case OP_VZIPR:
+    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
+  case OP_VTRNL:
+  case OP_VTRNR:
+    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                       OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
+  }
+}
+
+// LowerVECTOR_SHUFFLE - Lower a VECTOR_SHUFFLE to the matching NEON
+// target-specific node (VDUP/VDUPLANE, VEXT, VREV, VTRN, VUZP, VZIP), or
+// synthesize a 4-element shuffle from the perfect-shuffle table.  Returns
+// a null SDValue for masks the default expansion must handle.
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+  SmallVector<int, 8> ShuffleMask;
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  // FIXME: floating-point vectors should be canonicalized to integer vectors
+  // of the same time so that they get CSEd properly.
+  SVN->getMask(ShuffleMask);
+
+  // Splats become VDUP (from a scalar) or VDUPLANE (from a vector lane).
+  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
+    int Lane = SVN->getSplatIndex();
+    // If this is undef splat, generate it via "just" vdup, if possible.
+    if (Lane == -1) Lane = 0;
+
+    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
+    }
+    return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
+                       DAG.getConstant(Lane, MVT::i32));
+  }
+
+  // Consecutive-element extraction becomes VEXT (operands swapped when the
+  // run wraps around the concatenated sources).
+  bool ReverseVEXT;
+  unsigned Imm;
+  if (isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
+    if (ReverseVEXT)
+      std::swap(V1, V2);
+    return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
+                       DAG.getConstant(Imm, MVT::i32));
+  }
+
+  // Block-reversal masks become the VREV variants.
+  if (isVREVMask(ShuffleMask, VT, 64))
+    return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
+  if (isVREVMask(ShuffleMask, VT, 32))
+    return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
+  if (isVREVMask(ShuffleMask, VT, 16))
+    return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
+
+  // Check for Neon shuffles that modify both input vectors in place.
+  // If both results are used, i.e., if there are two shuffles with the same
+  // source operands and with masks corresponding to both results of one of
+  // these operations, DAG memoization will ensure that a single node is
+  // used for both shuffles.
+  unsigned WhichResult;
+  if (isVTRNMask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                       V1, V2).getValue(WhichResult);
+  if (isVUZPMask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                       V1, V2).getValue(WhichResult);
+  if (isVZIPMask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                       V1, V2).getValue(WhichResult);
+
+  // Same permutes in the canonical "shuffle v, undef" form: feed V1 to
+  // both operands.
+  if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
+                       V1, V1).getValue(WhichResult);
+  if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
+                       V1, V1).getValue(WhichResult);
+  if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
+    return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
+                       V1, V1).getValue(WhichResult);
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (ShuffleMask[i] < 0)
+        PFIndexes[i] = 8;  // undef mask entries are encoded as 8
+      else
+        PFIndexes[i] = ShuffleMask[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  return SDValue();
+}
+
+// LowerEXTRACT_VECTOR_ELT - Lower extraction of a sub-32-bit vector element
+// into an i32 result via the VGETLANEu (zero-extending get-lane) node.
+static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Vec = Op.getOperand(0);
+  SDValue Lane = Op.getOperand(1);
+  // Only i8/i16 elements widened to i32 are custom-lowered here.
+  assert(VT == MVT::i32 &&
+         Vec.getValueType().getVectorElementType().getSizeInBits() < 32 &&
+         "unexpected type for custom-lowering vector extract");
+  return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
+}
+
+// LowerCONCAT_VECTORS - Concatenate two 64-bit vectors into one 128-bit
+// result by inserting each half (bit-converted to f64) into a v2f64 undef,
+// then bit-converting back to the requested type.
+static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  // The only time a CONCAT_VECTORS operation can have legal types is when
+  // two 64-bit vectors are concatenated to a 128-bit vector.
+  assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
+         "unexpected CONCAT_VECTORS");
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Val = DAG.getUNDEF(MVT::v2f64);
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  // Skip inserts for undef halves; the UNDEF base already covers them.
+  if (Op0.getOpcode() != ISD::UNDEF)
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op0),
+                      DAG.getIntPtrConstant(0));
+  if (Op1.getOpcode() != ISD::UNDEF)
+    Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
+                      DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op1),
+                      DAG.getIntPtrConstant(1));
+  return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Val);
+}
+
+/// LowerOperation - Dispatch custom lowering for all operations the ARM
+/// target marks as Custom.  A node that breaks out of the switch (e.g.
+/// RETURNADDR) returns a null SDValue, telling the legalizer to use its
+/// default expansion.
+SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Don't know how to custom lower this!");
+  case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:
+    // Darwin and ELF use different global-address lowering schemes.
+    return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) :
+      LowerGlobalAddressELF(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
+  case ISD::BR_CC:         return LowerBR_CC(Op, DAG);
+  case ISD::BR_JT:         return LowerBR_JT(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::VASTART:       return LowerVASTART(Op, DAG, VarArgsFrameIndex);
+  case ISD::MEMBARRIER:    return LowerMEMBARRIER(Op, DAG, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FCOPYSIGN:     return LowerFCOPYSIGN(Op, DAG);
+  case ISD::RETURNADDR:    break;  // fall through to default expansion
+  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
+  case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
+                                                               Subtarget);
+  case ISD::BIT_CONVERT:   return ExpandBIT_CONVERT(Op.getNode(), DAG);
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA:           return LowerShift(Op.getNode(), DAG, Subtarget);
+  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
+  case ISD::CTTZ:          return LowerCTTZ(Op.getNode(), DAG, Subtarget);
+  case ISD::VSETCC:        return LowerVSETCC(Op, DAG);
+  case ISD::BUILD_VECTOR:  return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
+  }
+  return SDValue();
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void ARMTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+    return;
+  case ISD::BIT_CONVERT:
+    Results.push_back(ExpandBIT_CONVERT(N, DAG));
+    return;
+  case ISD::SRL:
+  case ISD::SRA: {
+    // LowerShift may fail; appending nothing tells the legalizer to use
+    // its default expansion for this node.
+    SDValue Res = LowerShift(N, DAG, Subtarget);
+    if (Res.getNode())
+      Results.push_back(Res);
+    return;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
+// EmitAtomicCmpSwap - Expand an ATOMIC_CMP_SWAP pseudo instruction into a
+// load-exclusive / store-exclusive loop:
+//   loop1: ldrex dest, [ptr]; cmp dest, oldval; bne exit
+//   loop2: strex scratch, newval, [ptr]; cmp scratch, #0; bne loop1
+// Size (1, 2, or 4 bytes) selects the B/H/word variants of LDREX/STREX.
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
+                                     MachineBasicBlock *BB,
+                                     unsigned Size) const {
+  unsigned dest    = MI->getOperand(0).getReg();
+  unsigned ptr     = MI->getOperand(1).getReg();
+  unsigned oldval  = MI->getOperand(2).getReg();
+  unsigned newval  = MI->getOperand(3).getReg();
+  // Scratch register receives the strex success/failure result.
+  unsigned scratch = BB->getParent()->getRegInfo()
+    .createVirtualRegister(ARM::GPRRegisterClass);
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+
+  unsigned ldrOpc, strOpc;
+  switch (Size) {
+  default: llvm_unreachable("unsupported size for AtomicCmpSwap!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    // Bug fix: this previously selected t2LDREXB (a load-exclusive) as the
+    // Thumb2 store opcode; it must be the store-exclusive t2STREXB.
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    break;
+  }
+
+  MachineFunction *MF = BB->getParent();
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It; // insert the new blocks after the current block
+
+  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loop1MBB);
+  MF->insert(It, loop2MBB);
+  MF->insert(It, exitMBB);
+  // The exit block inherits all of the original block's successors.
+  exitMBB->transferSuccessors(BB);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loop1MBB
+  BB->addSuccessor(loop1MBB);
+
+  // loop1MBB:
+  //   ldrex dest, [ptr]
+  //   cmp dest, oldval
+  //   bne exitMBB
+  BB = loop1MBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
+                 .addReg(dest).addReg(oldval));
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(exitMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  BB->addSuccessor(loop2MBB);
+  BB->addSuccessor(exitMBB);
+
+  // loop2MBB:
+  //   strex scratch, newval, [ptr]
+  //   cmp scratch, #0
+  //   bne loop1MBB
+  BB = loop2MBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval)
+                 .addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  // A non-zero strex result means the exclusive store failed; retry.
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loop1MBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+  BB->addSuccessor(loop1MBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MF->DeleteMachineInstr(MI);   // The instruction is gone now.
+
+  return BB;
+}
+
+// EmitAtomicBinary - Expand an atomic read-modify-write pseudo (add, sub,
+// and, or, xor, nand-as-bic, or swap when BinOpcode is 0) into a
+// load-exclusive / binop / store-exclusive retry loop.
+MachineBasicBlock *
+ARMTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                    unsigned Size, unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *MF = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It; // insert the new blocks after the current block
+
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptr = MI->getOperand(1).getReg();
+  unsigned incr = MI->getOperand(2).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // Select the byte/halfword/word exclusive load/store opcodes.
+  bool isThumb2 = Subtarget->isThumb2();
+  unsigned ldrOpc, strOpc;
+  switch (Size) {
+  // Fixed message: this previously said "AtomicCmpSwap" (copy-paste from
+  // EmitAtomicCmpSwap above).
+  default: llvm_unreachable("unsupported size for AtomicBinary!");
+  case 1:
+    ldrOpc = isThumb2 ? ARM::t2LDREXB : ARM::LDREXB;
+    strOpc = isThumb2 ? ARM::t2STREXB : ARM::STREXB;
+    break;
+  case 2:
+    ldrOpc = isThumb2 ? ARM::t2LDREXH : ARM::LDREXH;
+    strOpc = isThumb2 ? ARM::t2STREXH : ARM::STREXH;
+    break;
+  case 4:
+    ldrOpc = isThumb2 ? ARM::t2LDREX : ARM::LDREX;
+    strOpc = isThumb2 ? ARM::t2STREX : ARM::STREX;
+    break;
+  }
+
+  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+  MF->insert(It, loopMBB);
+  MF->insert(It, exitMBB);
+  // The exit block inherits all of the original block's successors.
+  exitMBB->transferSuccessors(BB);
+
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  unsigned scratch = RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+  // For a plain swap (BinOpcode==0) the stored value is just incr.
+  unsigned scratch2 = (!BinOpcode) ? incr :
+    RegInfo.createVirtualRegister(ARM::GPRRegisterClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   ldrex dest, ptr
+  //   <binop> scratch2, dest, incr
+  //   strex scratch, scratch2, ptr
+  //   cmp scratch, #0
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  AddDefaultPred(BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr));
+  if (BinOpcode) {
+    // operand order needs to go the other way for NAND
+    // (NAND is implemented as BIC with the operands reversed).
+    // NOTE(review): the trailing .addReg(0) presumably fills the optional
+    // cc_out ('s' bit) operand of the data-processing instructions.
+    if (BinOpcode == ARM::BICrr || BinOpcode == ARM::t2BICrr)
+      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
+                     addReg(incr).addReg(dest)).addReg(0);
+    else
+      AddDefaultPred(BuildMI(BB, dl, TII->get(BinOpcode), scratch2).
+                     addReg(dest).addReg(incr)).addReg(0);
+  }
+
+  AddDefaultPred(BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2)
+                 .addReg(ptr));
+  AddDefaultPred(BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
+                 .addReg(scratch).addImm(0));
+  // A non-zero strex result means the exclusive store failed; retry.
+  BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
+    .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
+
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+
+  MF->DeleteMachineInstr(MI);   // The instruction is gone now.
+
+  return BB;
+}
+
+MachineBasicBlock *
+ARMTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  bool isThumb2 = Subtarget->isThumb2();
+  switch (MI->getOpcode()) {
+  default:
+    MI->dump();
+    llvm_unreachable("Unexpected instr type to insert");
+
+  case ARM::ATOMIC_LOAD_ADD_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+  case ARM::ATOMIC_LOAD_ADD_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+  case ARM::ATOMIC_LOAD_ADD_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ADDrr : ARM::ADDrr);
+
+  case ARM::ATOMIC_LOAD_AND_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMIC_LOAD_AND_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+  case ARM::ATOMIC_LOAD_AND_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ANDrr : ARM::ANDrr);
+
+  case ARM::ATOMIC_LOAD_OR_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMIC_LOAD_OR_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+  case ARM::ATOMIC_LOAD_OR_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2ORRrr : ARM::ORRrr);
+
+  case ARM::ATOMIC_LOAD_XOR_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMIC_LOAD_XOR_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+  case ARM::ATOMIC_LOAD_XOR_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2EORrr : ARM::EORrr);
+
+  case ARM::ATOMIC_LOAD_NAND_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+  case ARM::ATOMIC_LOAD_NAND_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+  case ARM::ATOMIC_LOAD_NAND_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2BICrr : ARM::BICrr);
+
+  case ARM::ATOMIC_LOAD_SUB_I8:
+     return EmitAtomicBinary(MI, BB, 1, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+  case ARM::ATOMIC_LOAD_SUB_I16:
+     return EmitAtomicBinary(MI, BB, 2, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+  case ARM::ATOMIC_LOAD_SUB_I32:
+     return EmitAtomicBinary(MI, BB, 4, isThumb2 ? ARM::t2SUBrr : ARM::SUBrr);
+
+  case ARM::ATOMIC_SWAP_I8:  return EmitAtomicBinary(MI, BB, 1, 0);
+  case ARM::ATOMIC_SWAP_I16: return EmitAtomicBinary(MI, BB, 2, 0);
+  case ARM::ATOMIC_SWAP_I32: return EmitAtomicBinary(MI, BB, 4, 0);
+
+  case ARM::ATOMIC_CMP_SWAP_I8:  return EmitAtomicCmpSwap(MI, BB, 1);
+  case ARM::ATOMIC_CMP_SWAP_I16: return EmitAtomicCmpSwap(MI, BB, 2);
+  case ARM::ATOMIC_CMP_SWAP_I32: return EmitAtomicCmpSwap(MI, BB, 4);
+
+  case ARM::tMOVCCr_pseudo: {
+    // To "insert" a SELECT_CC instruction, we actually have to insert the
+    // diamond control-flow pattern.  The incoming instruction knows the
+    // destination vreg to set, the condition code register to branch on, the
+    // true/false values to select between, and a branch opcode to use.
+    const BasicBlock *LLVM_BB = BB->getBasicBlock();
+    MachineFunction::iterator It = BB;
+    ++It;
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB  = BB;
+    MachineFunction *F = BB->getParent();
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);
+    BuildMI(BB, dl, TII->get(ARM::tBcc)).addMBB(sinkMBB)
+      .addImm(MI->getOperand(3).getImm()).addReg(MI->getOperand(4).getReg());
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+    // Update machine-CFG edges by first adding all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    // Also inform sdisel of the edge changes.
+    for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 
+           E = BB->succ_end(); I != E; ++I) {
+      EM->insert(std::make_pair(*I, sinkMBB));
+      sinkMBB->addSuccessor(*I);
+    }
+    // Next, remove all successors of the current block, and add the true
+    // and fallthrough blocks as its successors.
+    while (!BB->succ_empty())
+      BB->removeSuccessor(BB->succ_begin());
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(BB, dl, TII->get(ARM::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+
+    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+    return BB;
+  }
+
+  case ARM::tANDsp:
+  case ARM::tADDspr_:
+  case ARM::tSUBspi_:
+  case ARM::t2SUBrSPi_:
+  case ARM::t2SUBrSPi12_:
+  case ARM::t2SUBrSPs_: {
+    MachineFunction *MF = BB->getParent();
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    bool DstIsDead = MI->getOperand(0).isDead();
+    bool SrcIsKill = MI->getOperand(1).isKill();
+
+    if (SrcReg != ARM::SP) {
+      // Copy the source to SP from virtual register.
+      const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(SrcReg);
+      unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+        ? ARM::tMOVtgpr2gpr : ARM::tMOVgpr2gpr;
+      BuildMI(BB, dl, TII->get(CopyOpc), ARM::SP)
+        .addReg(SrcReg, getKillRegState(SrcIsKill));
+    }
+
+    unsigned OpOpc = 0;
+    bool NeedPred = false, NeedCC = false, NeedOp3 = false;
+    switch (MI->getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected pseudo instruction!");
+    case ARM::tANDsp:
+      OpOpc = ARM::tAND;
+      NeedPred = true;
+      break;
+    case ARM::tADDspr_:
+      OpOpc = ARM::tADDspr;
+      break;
+    case ARM::tSUBspi_:
+      OpOpc = ARM::tSUBspi;
+      break;
+    case ARM::t2SUBrSPi_:
+      OpOpc = ARM::t2SUBrSPi;
+      NeedPred = true; NeedCC = true;
+      break;
+    case ARM::t2SUBrSPi12_:
+      OpOpc = ARM::t2SUBrSPi12;
+      NeedPred = true;
+      break;
+    case ARM::t2SUBrSPs_:
+      OpOpc = ARM::t2SUBrSPs;
+      NeedPred = true; NeedCC = true; NeedOp3 = true;
+      break;
+    }
+    MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(OpOpc), ARM::SP);
+    if (OpOpc == ARM::tAND)
+      AddDefaultT1CC(MIB);
+    MIB.addReg(ARM::SP);
+    MIB.addOperand(MI->getOperand(2));
+    if (NeedOp3)
+      MIB.addOperand(MI->getOperand(3));
+    if (NeedPred)
+      AddDefaultPred(MIB);
+    if (NeedCC)
+      AddDefaultCC(MIB);
+
+    // Copy the result from SP to virtual register.
+    const TargetRegisterClass *RC = MF->getRegInfo().getRegClass(DstReg);
+    unsigned CopyOpc = (RC == ARM::tGPRRegisterClass)
+      ? ARM::tMOVgpr2tgpr : ARM::tMOVgpr2gpr;
+    BuildMI(BB, dl, TII->get(CopyOpc))
+      .addReg(DstReg, getDefRegState(true) | getDeadRegState(DstIsDead))
+      .addReg(ARM::SP);
+    MF->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+    return BB;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Optimization Hooks
+//===----------------------------------------------------------------------===//
+
/// combineSelectAndUse - Try to fold (Opc OtherOp, (select ...)) where one
/// arm of the select is a zero constant, i.e. the pattern the callers
/// describe as: (add (select cc, 0, c), x) -> (select cc, x, (add x, c)).
/// N is the add/sub node, Slct its select(_cc) operand, OtherOp the other
/// operand.  Returns the replacement value, or an empty SDValue on failure.
static
SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp,
                            TargetLowering::DAGCombinerInfo &DCI) {
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = N->getValueType(0);
  unsigned Opc = N->getOpcode();
  bool isSlctCC = Slct.getOpcode() == ISD::SELECT_CC;
  // True/false values: SELECT_CC carries them at operands 2/3, SELECT at 1/2.
  SDValue LHS = isSlctCC ? Slct.getOperand(2) : Slct.getOperand(1);
  SDValue RHS = isSlctCC ? Slct.getOperand(3) : Slct.getOperand(2);
  ISD::CondCode CC = ISD::SETCC_INVALID;

  // Recover the condition code: directly from SELECT_CC, or from a SETCC
  // feeding a plain SELECT.  Stays SETCC_INVALID if the condition is opaque.
  if (isSlctCC) {
    CC = cast<CondCodeSDNode>(Slct.getOperand(4))->get();
  } else {
    SDValue CCOp = Slct.getOperand(0);
    if (CCOp.getOpcode() == ISD::SETCC)
      CC = cast<CondCodeSDNode>(CCOp.getOperand(2))->get();
  }

  bool DoXform = false;
  bool InvCC = false;
  // SUB is not commutative, so the select must be the subtrahend there.
  assert ((Opc == ISD::ADD || (Opc == ISD::SUB && Slct == N->getOperand(1))) &&
          "Bad input!");

  if (LHS.getOpcode() == ISD::Constant &&
      cast<ConstantSDNode>(LHS)->isNullValue()) {
    // True arm is zero: fold directly, keeping the condition as-is.
    DoXform = true;
  } else if (CC != ISD::SETCC_INVALID &&
             RHS.getOpcode() == ISD::Constant &&
             cast<ConstantSDNode>(RHS)->isNullValue()) {
    // False arm is zero: swap the arms and invert the condition so the zero
    // arm becomes the "true" side.  Requires a known, invertible CC.
    std::swap(LHS, RHS);
    SDValue Op0 = Slct.getOperand(0);
    EVT OpVT = isSlctCC ? Op0.getValueType() :
                          Op0.getOperand(0).getValueType();
    bool isInt = OpVT.isInteger();
    CC = ISD::getSetCCInverse(CC, isInt);

    if (!TLI.isCondCodeLegal(CC, OpVT))
      return SDValue();         // Inverse operator isn't legal.

    DoXform = true;
    InvCC = true;
  }

  if (DoXform) {
    // Result = Opc(OtherOp, non-zero arm); select between OtherOp and it.
    SDValue Result = DAG.getNode(Opc, RHS.getDebugLoc(), VT, OtherOp, RHS);
    if (isSlctCC)
      return DAG.getSelectCC(N->getDebugLoc(), OtherOp, Result,
                             Slct.getOperand(0), Slct.getOperand(1), CC);
    SDValue CCOp = Slct.getOperand(0);
    if (InvCC)
      // Rebuild the SETCC with the inverted condition.
      CCOp = DAG.getSetCC(Slct.getDebugLoc(), CCOp.getValueType(),
                          CCOp.getOperand(0), CCOp.getOperand(1), CC);
    return DAG.getNode(ISD::SELECT, N->getDebugLoc(), VT,
                       CCOp, OtherOp, Result);
  }
  return SDValue();
}
+
+/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
+static SDValue PerformADDCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // added by evan in r37685 with no testcase.
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+
+  // fold (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
+  if (N0.getOpcode() == ISD::SELECT && N0.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N0, N1, DCI);
+    if (Result.getNode()) return Result;
+  }
+  if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
+    if (Result.getNode()) return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
+static SDValue PerformSUBCombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI) {
+  // added by evan in r37685 with no testcase.
+  SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+
+  // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
+  if (N1.getOpcode() == ISD::SELECT && N1.getNode()->hasOneUse()) {
+    SDValue Result = combineSelectAndUse(N, N1, N0, DCI);
+    if (Result.getNode()) return Result;
+  }
+
+  return SDValue();
+}
+
+/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
+/// ARMISD::VMOVRRD.
+static SDValue PerformVMOVRRDCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  // fmrrd(fmdrr x, y) -> x,y
+  SDValue InDouble = N->getOperand(0);
+  if (InDouble.getOpcode() == ARMISD::VMOVDRR)
+    return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
+  return SDValue();
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BIT_CONVERT)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN || ! BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation.  That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt-1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation.  For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (! getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  if (isIntrinsic)
+    Cnt = -Cnt;
+  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits/2 : ElementBits));
+}
+
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
/// Recognizes NEON vector-shift intrinsics whose shift amount is a constant
/// splat and rewrites them to the corresponding ARMISD shift node with an
/// immediate count.
static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
  switch (IntNo) {
  default:
    // Don't do anything for most intrinsics.
    break;

  // Vector shifts: check for immediate versions and lower them.
  // Note: This is done during DAG combining instead of DAG legalizing because
  // the build_vectors for 64-bit vector element shift counts are generally
  // not legal, and it is hard to see their values after they get legalized to
  // loads from a constant pool.
  case Intrinsic::arm_neon_vshifts:
  case Intrinsic::arm_neon_vshiftu:
  case Intrinsic::arm_neon_vshiftls:
  case Intrinsic::arm_neon_vshiftlu:
  case Intrinsic::arm_neon_vshiftn:
  case Intrinsic::arm_neon_vrshifts:
  case Intrinsic::arm_neon_vrshiftu:
  case Intrinsic::arm_neon_vrshiftn:
  case Intrinsic::arm_neon_vqshifts:
  case Intrinsic::arm_neon_vqshiftu:
  case Intrinsic::arm_neon_vqshiftsu:
  case Intrinsic::arm_neon_vqshiftns:
  case Intrinsic::arm_neon_vqshiftnu:
  case Intrinsic::arm_neon_vqshiftnsu:
  case Intrinsic::arm_neon_vqrshiftns:
  case Intrinsic::arm_neon_vqrshiftnu:
  case Intrinsic::arm_neon_vqrshiftnsu: {
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    // First switch: validate the constant shift count for each intrinsic
    // class, returning SDValue() (no combine) when the count is not a valid
    // immediate.  vshift/vshiftu can be either direction, so the opcode is
    // chosen here; the other cases set the opcode in the second switch.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
        VShiftOpc = ARMISD::VSHL;
        break;
      }
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ?
                     ARMISD::VSHRs : ARMISD::VSHRu);
        break;
      }
      return SDValue();

    case Intrinsic::arm_neon_vshiftls:
    case Intrinsic::arm_neon_vshiftlu:
      if (isVShiftLImm(N->getOperand(2), VT, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for vshll intrinsic");

    case Intrinsic::arm_neon_vrshifts:
    case Intrinsic::arm_neon_vrshiftu:
      if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshifts:
    case Intrinsic::arm_neon_vqshiftu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      return SDValue();

    case Intrinsic::arm_neon_vqshiftsu:
      if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
        break;
      llvm_unreachable("invalid shift count for vqshlu intrinsic");

    case Intrinsic::arm_neon_vshiftn:
    case Intrinsic::arm_neon_vrshiftn:
    case Intrinsic::arm_neon_vqshiftns:
    case Intrinsic::arm_neon_vqshiftnu:
    case Intrinsic::arm_neon_vqshiftnsu:
    case Intrinsic::arm_neon_vqrshiftns:
    case Intrinsic::arm_neon_vqrshiftnu:
    case Intrinsic::arm_neon_vqrshiftnsu:
      // Narrowing shifts require an immediate right shift.
      if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
        break;
      llvm_unreachable("invalid shift count for narrowing vector shift intrinsic");

    default:
      llvm_unreachable("unhandled vector shift");
    }

    // Second switch: pick the ARMISD opcode for the intrinsics whose opcode
    // was not already determined above.
    switch (IntNo) {
    case Intrinsic::arm_neon_vshifts:
    case Intrinsic::arm_neon_vshiftu:
      // Opcode already set above.
      break;
    case Intrinsic::arm_neon_vshiftls:
    case Intrinsic::arm_neon_vshiftlu:
      // A long shift by exactly the element width uses the dedicated VSHLLi.
      if (Cnt == VT.getVectorElementType().getSizeInBits())
        VShiftOpc = ARMISD::VSHLLi;
      else
        VShiftOpc = (IntNo == Intrinsic::arm_neon_vshiftls ?
                     ARMISD::VSHLLs : ARMISD::VSHLLu);
      break;
    case Intrinsic::arm_neon_vshiftn:
      VShiftOpc = ARMISD::VSHRN; break;
    case Intrinsic::arm_neon_vrshifts:
      VShiftOpc = ARMISD::VRSHRs; break;
    case Intrinsic::arm_neon_vrshiftu:
      VShiftOpc = ARMISD::VRSHRu; break;
    case Intrinsic::arm_neon_vrshiftn:
      VShiftOpc = ARMISD::VRSHRN; break;
    case Intrinsic::arm_neon_vqshifts:
      VShiftOpc = ARMISD::VQSHLs; break;
    case Intrinsic::arm_neon_vqshiftu:
      VShiftOpc = ARMISD::VQSHLu; break;
    case Intrinsic::arm_neon_vqshiftsu:
      VShiftOpc = ARMISD::VQSHLsu; break;
    case Intrinsic::arm_neon_vqshiftns:
      VShiftOpc = ARMISD::VQSHRNs; break;
    case Intrinsic::arm_neon_vqshiftnu:
      VShiftOpc = ARMISD::VQSHRNu; break;
    case Intrinsic::arm_neon_vqshiftnsu:
      VShiftOpc = ARMISD::VQSHRNsu; break;
    case Intrinsic::arm_neon_vqrshiftns:
      VShiftOpc = ARMISD::VQRSHRNs; break;
    case Intrinsic::arm_neon_vqrshiftnu:
      VShiftOpc = ARMISD::VQRSHRNu; break;
    case Intrinsic::arm_neon_vqrshiftnsu:
      VShiftOpc = ARMISD::VQRSHRNsu; break;
    }

    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
  }

  case Intrinsic::arm_neon_vshiftins: {
    // Shift-and-insert: operand 3 is the shift count; left gives VSLI,
    // right gives VSRI.
    EVT VT = N->getOperand(1).getValueType();
    int64_t Cnt;
    unsigned VShiftOpc = 0;

    if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
      VShiftOpc = ARMISD::VSLI;
    else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
      VShiftOpc = ARMISD::VSRI;
    else {
      llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
    }

    return DAG.getNode(VShiftOpc, N->getDebugLoc(), N->getValueType(0),
                       N->getOperand(1), N->getOperand(2),
                       DAG.getConstant(Cnt, MVT::i32));
  }

  case Intrinsic::arm_neon_vqrshifts:
  case Intrinsic::arm_neon_vqrshiftu:
    // No immediate versions of these to check for.
    break;
  }

  return SDValue();
}
+
+/// PerformShiftCombine - Checks for immediate versions of vector shifts and
+/// lowers them.  As with the vector shift intrinsics, this is done during DAG
+/// combining instead of DAG legalizing because the build_vectors for 64-bit
+/// vector element shift counts are generally not legal, and it is hard to see
+/// their values after they get legalized to loads from a constant pool.
+static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+                                   const ARMSubtarget *ST) {
+  EVT VT = N->getValueType(0);
+
+  // Nothing to be done for scalar shifts.
+  if (! VT.isVector())
+    return SDValue();
+
+  assert(ST->hasNEON() && "unexpected vector shift");
+  int64_t Cnt;
+
+  switch (N->getOpcode()) {
+  default: llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
+      return DAG.getNode(ARMISD::VSHL, N->getDebugLoc(), VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    break;
+
+  case ISD::SRA:
+  case ISD::SRL:
+    if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
+      unsigned VShiftOpc = (N->getOpcode() == ISD::SRA ?
+                            ARMISD::VSHRs : ARMISD::VSHRu);
+      return DAG.getNode(VShiftOpc, N->getDebugLoc(), VT, N->getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    }
+  }
+  return SDValue();
+}
+
+/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
+/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
+static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
+                                    const ARMSubtarget *ST) {
+  SDValue N0 = N->getOperand(0);
+
+  // Check for sign- and zero-extensions of vector extract operations of 8-
+  // and 16-bit vector elements.  NEON supports these directly.  They are
+  // handled during DAG combining because type legalization will promote them
+  // to 32-bit types and it is messy to recognize the operations after that.
+  if (ST->hasNEON() && N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    SDValue Vec = N0.getOperand(0);
+    SDValue Lane = N0.getOperand(1);
+    EVT VT = N->getValueType(0);
+    EVT EltVT = N0.getValueType();
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+    if (VT == MVT::i32 &&
+        (EltVT == MVT::i8 || EltVT == MVT::i16) &&
+        TLI.isTypeLegal(Vec.getValueType())) {
+
+      unsigned Opc = 0;
+      switch (N->getOpcode()) {
+      default: llvm_unreachable("unexpected opcode");
+      case ISD::SIGN_EXTEND:
+        Opc = ARMISD::VGETLANEs;
+        break;
+      case ISD::ZERO_EXTEND:
+      case ISD::ANY_EXTEND:
+        Opc = ARMISD::VGETLANEu;
+        break;
+      }
+      return DAG.getNode(Opc, N->getDebugLoc(), VT, Vec, Lane);
+    }
+  }
+
+  return SDValue();
+}
+
+SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::ADD:      return PerformADDCombine(N, DCI);
+  case ISD::SUB:      return PerformSUBCombine(N, DCI);
+  case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return PerformIntrinsicCombine(N, DCI.DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+    return PerformShiftCombine(N, DCI.DAG, Subtarget);
+  case ISD::SIGN_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::ANY_EXTEND:
+    return PerformExtendCombine(N, DCI.DAG, Subtarget);
+  }
+  return SDValue();
+}
+
+bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
+  if (!Subtarget->hasV6Ops())
+    // Pre-v6 does not support unaligned mem access.
+    return false;
+  else {
+    // v6+ may or may not support unaligned mem access depending on the system
+    // configuration.
+    // FIXME: This is pretty conservative. Should we provide cmdline option to
+    // control the behaviour?
+    if (!Subtarget->isTargetDarwin())
+      return false;
+  }
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:
+    return false;
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    return true;
+  // FIXME: VLD1 etc with standard alignment is legal.
+  }
+}
+
+static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
+  if (V < 0)
+    return false;
+
+  unsigned Scale = 1;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    // Scale == 1;
+    break;
+  case MVT::i16:
+    // Scale == 2;
+    Scale = 2;
+    break;
+  case MVT::i32:
+    // Scale == 4;
+    Scale = 4;
+    break;
+  }
+
+  if ((V & (Scale - 1)) != 0)
+    return false;
+  V /= Scale;
+  return V == (V & ((1LL << 5) - 1));
+}
+
+static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
+                                      const ARMSubtarget *Subtarget) {
+  bool isNeg = false;
+  if (V < 0) {
+    isNeg = true;
+    V = - V;
+  }
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    // + imm12 or - imm8
+    if (isNeg)
+      return V == (V & ((1LL << 8) - 1));
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    // Same as ARM mode. FIXME: NEON?
+    if (!Subtarget->hasVFP2())
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+static bool isLegalAddressImmediate(int64_t V, EVT VT,
+                                    const ARMSubtarget *Subtarget) {
+  if (V == 0)
+    return true;
+
+  if (!VT.isSimple())
+    return false;
+
+  if (Subtarget->isThumb1Only())
+    return isLegalT1AddressImmediate(V, VT);
+  else if (Subtarget->isThumb2())
+    return isLegalT2AddressImmediate(V, VT, Subtarget);
+
+  // ARM mode.
+  if (V < 0)
+    V = - V;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i32:
+    // +- imm12
+    return V == (V & ((1LL << 12) - 1));
+  case MVT::i16:
+    // +- imm8
+    return V == (V & ((1LL << 8) - 1));
+  case MVT::f32:
+  case MVT::f64:
+    if (!Subtarget->hasVFP2()) // FIXME: NEON?
+      return false;
+    if ((V & 3) != 0)
+      return false;
+    V >>= 2;
+    return V == (V & ((1LL << 8) - 1));
+  }
+}
+
+bool ARMTargetLowering::isLegalT2ScaledAddressingMode(const AddrMode &AM,
+                                                      EVT VT) const {
+  int Scale = AM.Scale;
+  if (Scale < 0)
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+  case MVT::i16:
+  case MVT::i32:
+    if (Scale == 1)
+      return true;
+    // r + r << imm
+    Scale = Scale & ~1;
+    return Scale == 2 || Scale == 4 || Scale == 8;
+  case MVT::i64:
+    // r + r
+    if (((unsigned)AM.HasBaseReg + Scale) <= 2)
+      return true;
+    return false;
+  case MVT::isVoid:
+    // Note, we allow "void" uses (basically, uses that aren't loads or
+    // stores), because arm allows folding a scale into many arithmetic
+    // operations.  This should be made more precise and revisited later.
+
+    // Allow r << imm, but the imm has to be a multiple of two.
+    if (Scale & 1) return false;
+    return isPowerOf2_32(Scale);
+  }
+}
+
/// isLegalAddressingMode - Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool ARMTargetLowering::isLegalAddressingMode(const AddrMode &AM,
                                              const Type *Ty) const {
  // The base offset must itself be a legal immediate for this type/subtarget.
  EVT VT = getValueType(Ty, true);
  if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
    return false;

  // Can never fold addr of global into load/store.
  if (AM.BaseGV)
    return false;

  switch (AM.Scale) {
  case 0:  // no scale reg, must be "r+i" or "r", or "i".
    break;
  case 1:
    // Thumb1 cannot use a scaled register at all.
    if (Subtarget->isThumb1Only())
      return false;
    // FALL THROUGH.
  default:
    // ARM doesn't support any R+R*scale+imm addr modes.
    if (AM.BaseOffs)
      return false;

    if (!VT.isSimple())
      return false;

    if (Subtarget->isThumb2())
      return isLegalT2ScaledAddressingMode(AM, VT);

    // ARM-mode scaled addressing, by access type.
    int Scale = AM.Scale;
    switch (VT.getSimpleVT().SimpleTy) {
    default: return false;
    case MVT::i1:
    case MVT::i8:
    case MVT::i32:
      // Negative scales encode subtracted index registers; use the magnitude.
      if (Scale < 0) Scale = -Scale;
      if (Scale == 1)
        return true;
      // r + r << imm
      return isPowerOf2_32(Scale & ~1);
    case MVT::i16:
    case MVT::i64:
      // r + r
      if (((unsigned)AM.HasBaseReg + Scale) <= 2)
        return true;
      return false;

    case MVT::isVoid:
      // Note, we allow "void" uses (basically, uses that aren't loads or
      // stores), because arm allows folding a scale into many arithmetic
      // operations.  This should be made more precise and revisited later.

      // Allow r << imm, but the imm has to be a multiple of two.
      if (Scale & 1) return false;
      return isPowerOf2_32(Scale);
    }
    break;
  }
  return true;
}
+
+/// isLegalICmpImmediate - Return true if the specified immediate is legal
+/// icmp immediate, that is the target has icmp instructions which can compare
+/// a register against the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  if (!Subtarget->isThumb())
+    return ARM_AM::getSOImmVal(Imm) != -1;
+  if (Subtarget->isThumb2())
+    return ARM_AM::getT2SOImmVal(Imm) != -1; 
+  return Imm >= 0 && Imm <= 255;
+}
+
/// getARMIndexedAddressParts - Decompose an ADD/SUB pointer node into a base
/// and offset usable as an ARM-mode indexed (pre/post-inc) address.  Returns
/// true on success, setting Base, Offset, and isInc.
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                      bool isSEXTLoad, SDValue &Base,
                                      SDValue &Offset, bool &isInc,
                                      SelectionDAG &DAG) {
  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
    return false;

  if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
    // AddressingMode 3
    Base = Ptr->getOperand(0);
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      // AM3 takes an 8-bit offset; a negative constant is expected only on
      // an ADD node (asserted below) and becomes a decrement.
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -256) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
        return true;
      }
    }
    // Otherwise use the second operand as a register offset.
    isInc = (Ptr->getOpcode() == ISD::ADD);
    Offset = Ptr->getOperand(1);
    return true;
  } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
    // AddressingMode 2
    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
      // AM2 takes a 12-bit offset; same negative-constant handling as above.
      int RHSC = (int)RHS->getZExtValue();
      if (RHSC < 0 && RHSC > -0x1000) {
        assert(Ptr->getOpcode() == ISD::ADD);
        isInc = false;
        Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
        Base = Ptr->getOperand(0);
        return true;
      }
    }

    if (Ptr->getOpcode() == ISD::ADD) {
      isInc = true;
      // If operand 0 is a shift, make it the offset so the shift can be
      // folded into the addressing mode.
      ARM_AM::ShiftOpc ShOpcVal= ARM_AM::getShiftOpcForNode(Ptr->getOperand(0));
      if (ShOpcVal != ARM_AM::no_shift) {
        Base = Ptr->getOperand(1);
        Offset = Ptr->getOperand(0);
      } else {
        Base = Ptr->getOperand(0);
        Offset = Ptr->getOperand(1);
      }
      return true;
    }

    isInc = (Ptr->getOpcode() == ISD::ADD);
    Base = Ptr->getOperand(0);
    Offset = Ptr->getOperand(1);
    return true;
  }

  // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
  return false;
}
+
+static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
+                                     bool isSEXTLoad, SDValue &Base,
+                                     SDValue &Offset, bool &isInc,
+                                     SelectionDAG &DAG) {
+  if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
+    return false;
+
+  Base = Ptr->getOperand(0);
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Ptr->getOperand(1))) {
+    int RHSC = (int)RHS->getZExtValue();
+    if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
+      assert(Ptr->getOpcode() == ISD::ADD);
+      isInc = false;
+      Offset = DAG.getConstant(-RHSC, RHS->getValueType(0));
+      return true;
+    } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
+      isInc = Ptr->getOpcode() == ISD::ADD;
+      Offset = DAG.getConstant(RHSC, RHS->getValueType(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool
+ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                             SDValue &Offset,
+                                             ISD::MemIndexedMode &AM,
+                                             SelectionDAG &DAG) const {
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT  = LD->getMemoryVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                       Offset, isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
+                                        Offset, isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
+  return true;
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                   SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   SelectionDAG &DAG) const {
+  if (Subtarget->isThumb1Only())
+    return false;
+
+  EVT VT;
+  SDValue Ptr;
+  bool isSEXTLoad = false;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT  = LD->getMemoryVT();
+    isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT  = ST->getMemoryVT();
+  } else
+    return false;
+
+  bool isInc;
+  bool isLegal = false;
+  if (Subtarget->isThumb2())
+    isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  else
+    isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
+                                        isInc, DAG);
+  if (!isLegal)
+    return false;
+
+  AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
+}
+
+void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                       const APInt &Mask,
+                                                       APInt &KnownZero,
+                                                       APInt &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case ARMISD::CMOV: {
+    // Bits are known zero/one if known on the LHS and RHS.
+    DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
+    if (KnownZero == 0 && KnownOne == 0) return;
+
+    APInt KnownZeroRHS, KnownOneRHS;
+    DAG.ComputeMaskedBits(Op.getOperand(1), Mask,
+                          KnownZeroRHS, KnownOneRHS, Depth+1);
+    KnownZero &= KnownZeroRHS;
+    KnownOne  &= KnownOneRHS;
+    return;
+  }
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           ARM Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+ARMTargetLowering::ConstraintType
+ARMTargetLowering::getConstraintType(const std::string &Constraint) const {
+  // Single-letter 'l' and 'w' name ARM register classes; everything else
+  // is resolved by the target-independent code.
+  if (Constraint.size() == 1 &&
+      (Constraint[0] == 'l' || Constraint[0] == 'w'))
+    return C_RegisterClass;
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                EVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC ARM Constraint Letters
+    const char Letter = Constraint[0];
+    if (Letter == 'l') {
+      // 'l': low registers in Thumb mode, any GPR otherwise.
+      const TargetRegisterClass *RC = Subtarget->isThumb()
+                                        ? ARM::tGPRRegisterClass
+                                        : ARM::GPRRegisterClass;
+      return std::make_pair(0U, RC);
+    }
+    if (Letter == 'r')
+      return std::make_pair(0U, ARM::GPRRegisterClass);
+    if (Letter == 'w') {
+      // 'w': a VFP/NEON register of the value's width.
+      if (VT == MVT::f32)
+        return std::make_pair(0U, ARM::SPRRegisterClass);
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, ARM::DPRRegisterClass);
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, ARM::QPRRegisterClass);
+    }
+  }
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+std::vector<unsigned> ARMTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  EVT VT) const {
+  // Only single-letter constraints map to explicit register lists here.
+  if (Constraint.size() != 1)
+    return std::vector<unsigned>();
+
+  const char Letter = Constraint[0];  // GCC ARM constraint letter.
+
+  // 'l': the Thumb-addressable low registers r0-r7.
+  if (Letter == 'l')
+    return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+                                 ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+                                 0);
+
+  // 'r': any general-purpose register.
+  if (Letter == 'r')
+    return make_vector<unsigned>(ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+                                 ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+                                 ARM::R8, ARM::R9, ARM::R10, ARM::R11,
+                                 ARM::R12, ARM::LR, 0);
+
+  // 'w': a VFP/NEON register list selected by the value's width.
+  if (Letter == 'w') {
+    if (VT == MVT::f32)
+      return make_vector<unsigned>(ARM::S0, ARM::S1, ARM::S2, ARM::S3,
+                                   ARM::S4, ARM::S5, ARM::S6, ARM::S7,
+                                   ARM::S8, ARM::S9, ARM::S10, ARM::S11,
+                                   ARM::S12,ARM::S13,ARM::S14,ARM::S15,
+                                   ARM::S16,ARM::S17,ARM::S18,ARM::S19,
+                                   ARM::S20,ARM::S21,ARM::S22,ARM::S23,
+                                   ARM::S24,ARM::S25,ARM::S26,ARM::S27,
+                                   ARM::S28,ARM::S29,ARM::S30,ARM::S31, 0);
+    if (VT.getSizeInBits() == 64)
+      return make_vector<unsigned>(ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+                                   ARM::D4, ARM::D5, ARM::D6, ARM::D7,
+                                   ARM::D8, ARM::D9, ARM::D10,ARM::D11,
+                                   ARM::D12,ARM::D13,ARM::D14,ARM::D15, 0);
+    if (VT.getSizeInBits() == 128)
+      return make_vector<unsigned>(ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3,
+                                   ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7, 0);
+  }
+
+  return std::vector<unsigned>();
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+///
+/// Op         - the asm operand; must be a ConstantSDNode for the immediate
+///              constraints handled here.
+/// Constraint - a single GCC ARM constraint letter.
+/// hasMemory  - true if one of the inline asm's constraints is 'm'.
+/// Ops        - receives the lowered operand on success; left untouched on
+///              failure.
+void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+                                                     char Constraint,
+                                                     bool hasMemory,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  SDValue Result(0, 0);
+
+  switch (Constraint) {
+  default: break;
+  // All of these constraints describe integer immediates, so the operand
+  // must be a constant.
+  case 'I': case 'J': case 'K': case 'L':
+  case 'M': case 'N': case 'O':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    int64_t CVal64 = C->getSExtValue();
+    int CVal = (int) CVal64;
+    // None of these constraints allow values larger than 32 bits.  Check
+    // that the value fits in an int.
+    if (CVal != CVal64)
+      return;
+
+    // Each case below breaks out of this switch when CVal satisfies the
+    // constraint for the current subtarget, or returns (rejecting the
+    // operand) when it does not.
+    switch (Constraint) {
+      case 'I':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between 0 and 255, for ADD
+          // immediates.
+          if (CVal >= 0 && CVal <= 255)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getT2SOImmVal(CVal) != -1)
+            break;
+        } else {
+          // A constant that can be used as an immediate value in a
+          // data-processing instruction.
+          if (ARM_AM::getSOImmVal(CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'J':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between -255 and -1, for negated ADD
+          // immediates. This can be used in GCC with an "n" modifier that
+          // prints the negated value, for use with SUB instructions. It is
+          // not useful otherwise but is implemented for compatibility.
+          if (CVal >= -255 && CVal <= -1)
+            break;
+        } else {
+          // This must be a constant between -4095 and 4095. It is not clear
+          // what this constraint is intended for. Implemented for
+          // compatibility with GCC.
+          if (CVal >= -4095 && CVal <= 4095)
+            break;
+        }
+        return;
+
+      case 'K':
+        if (Subtarget->isThumb1Only()) {
+          // A 32-bit value where only one byte has a nonzero value. Exclude
+          // zero to match GCC. This constraint is used by GCC internally for
+          // constants that can be loaded with a move/shift combination.
+          // It is not useful otherwise but is implemented for compatibility.
+          if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getT2SOImmVal(~CVal) != -1)
+            break;
+        } else {
+          // A constant whose bitwise inverse can be used as an immediate
+          // value in a data-processing instruction. This can be used in GCC
+          // with a "B" modifier that prints the inverted value, for use with
+          // BIC and MVN instructions. It is not useful otherwise but is
+          // implemented for compatibility.
+          if (ARM_AM::getSOImmVal(~CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'L':
+        if (Subtarget->isThumb1Only()) {
+          // This must be a constant between -7 and 7,
+          // for 3-operand ADD/SUB immediate instructions.
+          // NOTE(review): the upper bound is exclusive (CVal < 7), so 7
+          // itself is rejected; GCC documents 'L' as the inclusive range
+          // -7..7 — confirm whether this is intentional conservatism.
+          if (CVal >= -7 && CVal < 7)
+            break;
+        } else if (Subtarget->isThumb2()) {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getT2SOImmVal(-CVal) != -1)
+            break;
+        } else {
+          // A constant whose negation can be used as an immediate value in a
+          // data-processing instruction. This can be used in GCC with an "n"
+          // modifier that prints the negated value, for use with SUB
+          // instructions. It is not useful otherwise but is implemented for
+          // compatibility.
+          if (ARM_AM::getSOImmVal(-CVal) != -1)
+            break;
+        }
+        return;
+
+      case 'M':
+        if (Subtarget->isThumb()) { // FIXME thumb2
+          // This must be a multiple of 4 between 0 and 1020, for
+          // ADD sp + immediate.
+          if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
+            break;
+        } else {
+          // A power of two or a constant between 0 and 32.  This is used in
+          // GCC for the shift amount on shifted register operands, but it is
+          // useful in general for any shift amounts.
+          // NOTE(review): (CVal & (CVal - 1)) == 0 also accepts 0, and for
+          // CVal == INT_MIN the subtraction overflows (UB) — confirm these
+          // inputs cannot reach here or tighten the check.
+          if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
+            break;
+        }
+        return;
+
+      case 'N':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a constant between 0 and 31, for shift amounts.
+          if (CVal >= 0 && CVal <= 31)
+            break;
+        }
+        return;
+
+      case 'O':
+        if (Subtarget->isThumb()) {  // FIXME thumb2
+          // This must be a multiple of 4 between -508 and 508, for
+          // ADD/SUB sp = sp + immediate.
+          if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
+            break;
+        }
+        return;
+    }
+    // The constant satisfied its constraint; emit it as a target constant.
+    Result = DAG.getTargetConstant(CVal, Op.getValueType());
+    break;
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+  // Fall back to the generic lowering (handles 'i', 'X', etc.).
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
+                                                      Ops, DAG);
+}
+
+/// isOffsetFoldingLegal - ARM cannot fold constant offsets into global
+/// addresses yet, so always decline.
+bool ARMTargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  return false;
+}
+
+/// getVFPf32Imm - Return the 8-bit VMOV.f32 (fconsts) encoding of FPImm,
+/// or -1 if the value cannot be encoded.  The immediate represents
+/// (-1)^a * (16+UInt(e:f:g:h))/16 * 2^(UInt(NOT(b):c:d)-3) for the
+/// encoded byte abcdefgh.
+int ARM::getVFPf32Imm(const APFloat &FPImm) {
+  const APInt Bits = FPImm.bitcastToAPInt();
+  uint32_t SignBit = Bits.lshr(31).getZExtValue() & 1;
+  int32_t Exponent = (Bits.lshr(23).getSExtValue() & 0xff) - 127;
+  int64_t Frac = Bits.getZExtValue() & 0x7fffff;  // low 23 mantissa bits
+
+  // Only the top 4 mantissa bits (e:f:g:h) may be nonzero; the low 19
+  // bits must be clear.
+  if (Frac & 0x7ffff)
+    return -1;
+  Frac >>= 19;
+  if ((Frac & 0xf) != Frac)
+    return -1;
+
+  // Three bits of exponent: exp == UInt(NOT(b):c:d)-3.
+  if (Exponent < -3 || Exponent > 4)
+    return -1;
+  int32_t EncExp = ((Exponent + 3) & 0x7) ^ 4;  // flip bit b
+
+  return ((int)SignBit << 7) | (EncExp << 4) | (int)Frac;
+}
+
+/// getVFPf64Imm - Return the 8-bit VMOV.f64 (fconstd) encoding of FPImm,
+/// or -1 if the value cannot be encoded.
+int ARM::getVFPf64Imm(const APFloat &FPImm) {
+  const APInt Bits = FPImm.bitcastToAPInt();
+  uint64_t SignBit = Bits.lshr(63).getZExtValue() & 1;
+  int64_t Exponent = (Bits.lshr(52).getSExtValue() & 0x7ff) - 1023;
+  uint64_t Frac = Bits.getZExtValue() & 0xfffffffffffffLL;  // low 52 bits
+
+  // Only the top 4 mantissa bits (e:f:g:h) may be nonzero; the low 48
+  // bits must be clear.  mantissa = (16+UInt(e:f:g:h))/16.
+  if (Frac & 0xffffffffffffLL)
+    return -1;
+  Frac >>= 48;
+  if ((Frac & 0xf) != Frac)
+    return -1;
+
+  // Three bits of exponent: exp == UInt(NOT(b):c:d)-3.
+  if (Exponent < -3 || Exponent > 4)
+    return -1;
+  int64_t EncExp = ((Exponent + 3) & 0x7) ^ 4;  // flip bit b
+
+  return ((int)SignBit << 7) | ((int)EncExp << 4) | (int)Frac;
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (!Subtarget->hasVFP3())
+    return false;  // fconsts/fconstd require VFP3.
+  if (VT == MVT::f32)
+    return ARM::getVFPf32Imm(Imm) != -1;
+  return VT == MVT::f64 && ARM::getVFPf64Imm(Imm) != -1;
+}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
new file mode 100644
index 0000000..3c5df45
--- /dev/null
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -0,0 +1,349 @@
+//===-- ARMISelLowering.h - ARM DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that ARM uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMISELLOWERING_H
+#define ARMISELLOWERING_H
+
+#include "ARMSubtarget.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include <vector>
+
+namespace llvm {
+  class ARMConstantPoolValue;
+
+  namespace ARMISD {
+    // ARM Specific DAG Nodes
+    // Note: enumerators take implicit sequential values starting at
+    // FIRST_NUMBER, so the order here is significant.
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      Wrapper,      // Wrapper - A wrapper node for TargetConstantPool,
+                    // TargetExternalSymbol, and TargetGlobalAddress.
+      WrapperJT,    // WrapperJT - A wrapper node for TargetJumpTable
+
+      CALL,         // Function call.
+      CALL_PRED,    // Function call that's predicable.
+      CALL_NOLINK,  // Function call with branch not branch-and-link.
+      tCALL,        // Thumb function call.
+      BRCOND,       // Conditional branch.
+      BR_JT,        // Jumptable branch.
+      BR2_JT,       // Jumptable branch (2 level - jumptable entry is a jump).
+      RET_FLAG,     // Return with a flag operand.
+
+      PIC_ADD,      // Add with a PC operand and a PIC label.
+
+      CMP,          // ARM compare instructions.
+      CMPZ,         // ARM compare that sets only Z flag.
+      CMPFP,        // ARM VFP compare instruction, sets FPSCR.
+      CMPFPw0,      // ARM VFP compare against zero instruction, sets FPSCR.
+      FMSTAT,       // ARM fmstat instruction.
+      CMOV,         // ARM conditional move instructions.
+      CNEG,         // ARM conditional negate instructions.
+
+      RBIT,         // ARM bitreverse instruction
+
+      FTOSI,        // FP to sint within a FP register.
+      FTOUI,        // FP to uint within a FP register.
+      SITOF,        // sint to FP within a FP register.
+      UITOF,        // uint to FP within a FP register.
+
+      SRL_FLAG,     // V,Flag = srl_flag X -> srl X, 1 + save carry out.
+      SRA_FLAG,     // V,Flag = sra_flag X -> sra X, 1 + save carry out.
+      RRX,          // V = RRX X, Flag     -> srl X, 1 + shift in carry flag.
+
+      VMOVRRD,      // double to two gprs.
+      VMOVDRR,      // Two gprs to double.
+
+      EH_SJLJ_SETJMP,    // SjLj exception handling setjmp.
+      EH_SJLJ_LONGJMP,   // SjLj exception handling longjmp.
+
+      THREAD_POINTER,    // Thread pointer (used by the TLS lowering;
+                         // see LowerGlobalTLSAddress).
+
+      DYN_ALLOC,    // Dynamic allocation on the stack.
+
+      MEMBARRIER,   // Memory barrier
+      SYNCBARRIER,  // Memory sync barrier
+
+      VCEQ,         // Vector compare equal.
+      VCGE,         // Vector compare greater than or equal.
+      VCGEU,        // Vector compare unsigned greater than or equal.
+      VCGT,         // Vector compare greater than.
+      VCGTU,        // Vector compare unsigned greater than.
+      VTST,         // Vector test bits.
+
+      // Vector shift by immediate:
+      VSHL,         // ...left
+      VSHRs,        // ...right (signed)
+      VSHRu,        // ...right (unsigned)
+      VSHLLs,       // ...left long (signed)
+      VSHLLu,       // ...left long (unsigned)
+      VSHLLi,       // ...left long (with maximum shift count)
+      VSHRN,        // ...right narrow
+
+      // Vector rounding shift by immediate:
+      VRSHRs,       // ...right (signed)
+      VRSHRu,       // ...right (unsigned)
+      VRSHRN,       // ...right narrow
+
+      // Vector saturating shift by immediate:
+      VQSHLs,       // ...left (signed)
+      VQSHLu,       // ...left (unsigned)
+      VQSHLsu,      // ...left (signed to unsigned)
+      VQSHRNs,      // ...right narrow (signed)
+      VQSHRNu,      // ...right narrow (unsigned)
+      VQSHRNsu,     // ...right narrow (signed to unsigned)
+
+      // Vector saturating rounding shift by immediate:
+      VQRSHRNs,     // ...right narrow (signed)
+      VQRSHRNu,     // ...right narrow (unsigned)
+      VQRSHRNsu,    // ...right narrow (signed to unsigned)
+
+      // Vector shift and insert:
+      VSLI,         // ...left
+      VSRI,         // ...right
+
+      // Vector get lane (VMOV scalar to ARM core register)
+      // (These are used for 8- and 16-bit element types only.)
+      VGETLANEu,    // zero-extend vector extract element
+      VGETLANEs,    // sign-extend vector extract element
+
+      // Vector duplicate:
+      VDUP,
+      VDUPLANE,
+
+      // Vector shuffles:
+      VEXT,         // extract
+      VREV64,       // reverse elements within 64-bit doublewords
+      VREV32,       // reverse elements within 32-bit words
+      VREV16,       // reverse elements within 16-bit halfwords
+      VZIP,         // zip (interleave)
+      VUZP,         // unzip (deinterleave)
+      VTRN          // transpose
+    };
+  }
+
+  /// Define some predicates that are used for node matching.
+  namespace ARM {
+    /// getVMOVImm - If this is a build_vector of constants which can be
+    /// formed by using a VMOV instruction of the specified element size,
+    /// return the constant being splatted.  The ByteSize field indicates the
+    /// number of bytes of each element [1248].
+    /// Returns a null SDValue when no such VMOV encoding exists.
+    SDValue getVMOVImm(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+
+    /// getVFPf32Imm / getVFPf64Imm - If the given fp immediate can be
+    /// materialized with a VMOV.f32 / VMOV.f64 (i.e. fconsts / fconstd)
+    /// instruction, returns its 8-bit integer representation. Otherwise,
+    /// returns -1.
+    int getVFPf32Imm(const APFloat &FPImm);
+    int getVFPf64Imm(const APFloat &FPImm);
+  }
+
+  //===--------------------------------------------------------------------===//
+  //  ARMTargetLowering - ARM Implementation of the TargetLowering interface
+
+  /// ARMTargetLowering - ARM-specific implementation of the TargetLowering
+  /// interface: custom DAG lowering, inline asm constraint handling,
+  /// addressing-mode legality and custom instruction insertion.
+  class ARMTargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+  public:
+    explicit ARMTargetLowering(TargetMachine &TM);
+
+    /// LowerOperation - Custom-lower operations marked Custom for ARM.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG);
+
+    /// PerformDAGCombine - Target-specific DAG combines.
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    /// getTargetNodeName - Return the name of the given ARMISD opcode.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                         MachineBasicBlock *MBB,
+                       DenseMap<MachineBasicBlock*, MachineBasicBlock*>*) const;
+
+    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses of the specified type.
+    /// FIXME: Add getOptimalMemOpType to implement memcpy with NEON?
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT) const;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+    bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can compare
+    /// a register against the immediate without having to materialize the
+    /// immediate into a register.
+    virtual bool isLegalICmpImmediate(int64_t Imm) const;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                           SDValue &Offset,
+                                           ISD::MemIndexedMode &AM,
+                                           SelectionDAG &DAG) const;
+
+    /// getPostIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if this node can be
+    /// combined with a load / store to form a post-indexed load / store.
+    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                            SDValue &Base, SDValue &Offset,
+                                            ISD::MemIndexedMode &AM,
+                                            SelectionDAG &DAG) const;
+
+    /// computeMaskedBitsForTargetNode - Compute known-zero/known-one bits
+    /// for ARM-specific nodes (currently only ARMISD::CMOV).
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                const APInt &Mask,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth) const;
+
+
+    // Inline assembly support: classify GCC ARM constraint letters and map
+    // them to register classes / explicit register lists.
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   EVT VT) const;
+    std::vector<unsigned>
+    getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                      EVT VT) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              char ConstraintLetter,
+                                              bool hasMemory,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+
+    /// getSubtarget - Return the ARM subtarget this lowering was built for.
+    virtual const ARMSubtarget* getSubtarget() {
+      return Subtarget;
+    }
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const;
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+  private:
+    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const ARMSubtarget *Subtarget;
+
+    /// ARMPCLabelIndex - Keep track of the number of ARM PC labels created.
+    ///
+    unsigned ARMPCLabelIndex;
+
+    // Register NEON vector types and their promoted load/store and bitwise
+    // operation types (D = 64-bit, Q = 128-bit registers).
+    void addTypeForNEON(EVT VT, EVT PromotedLdStVT, EVT PromotedBitwiseVT);
+    void addDRTypeForNEON(EVT VT);
+    void addQRTypeForNEON(EVT VT);
+
+    typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
+    // Helpers for passing/receiving f64 values split across register pairs.
+    void PassF64ArgInRegs(DebugLoc dl, SelectionDAG &DAG,
+                          SDValue Chain, SDValue &Arg,
+                          RegsToPassVector &RegsToPass,
+                          CCValAssign &VA, CCValAssign &NextVA,
+                          SDValue &StackPtr,
+                          SmallVector<SDValue, 8> &MemOpChains,
+                          ISD::ArgFlagsTy Flags);
+    SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
+                                 SDValue &Root, SelectionDAG &DAG, DebugLoc dl);
+
+    /// CCAssignFnForNode - Pick the calling-convention assignment function
+    /// for the given convention / direction (argument vs. return).
+    CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return, bool isVarArg) const;
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags);
+    // Per-node custom lowering routines dispatched from LowerOperation.
+    SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
+                                    const ARMSubtarget *Subtarget);
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
+                                            SelectionDAG &DAG);
+    SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA,
+                                   SelectionDAG &DAG);
+    SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG);
+
+    SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                      SDValue Chain,
+                                      SDValue Dst, SDValue Src,
+                                      SDValue Size, unsigned Align,
+                                      bool AlwaysInline,
+                                      const Value *DstSV, uint64_t DstSVOff,
+                                      const Value *SrcSV, uint64_t SrcSVOff);
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg,
+                bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    /// getARMCmp - Build an ARM compare node; ARMCC receives the ARM
+    /// condition code corresponding to CC.
+    SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                      SDValue &ARMCC, SelectionDAG &DAG, DebugLoc dl);
+
+    // Custom inserters for atomic operations (expanded as ldrex/strex
+    // loops by the .cpp implementation — confirm there).
+    MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned Size) const;
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+                                        MachineBasicBlock *BB,
+                                        unsigned Size,
+                                        unsigned BinOpcode) const;
+
+  };
+}
+
+#endif  // ARMISELLOWERING_H
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
new file mode 100644
index 0000000..169eeed
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -0,0 +1,1551 @@
+//===- ARMInstrFormats.td - ARM Instruction Formats --------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// ARM Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.  The 5-bit value is stashed in TSFlags, so the numbering of
+// the Format defs below must stay in sync with the matching enum in
+// ARMInstrInfo.h (see the "ARM Instruction flags" note further down).
+class Format<bits<5> val> {
+  bits<5> Value = val;
+}
+
+def Pseudo        : Format<0>;
+def MulFrm        : Format<1>;
+def BrFrm         : Format<2>;
+def BrMiscFrm     : Format<3>;
+
+def DPFrm         : Format<4>;
+def DPSoRegFrm    : Format<5>;
+
+def LdFrm         : Format<6>;
+def StFrm         : Format<7>;
+def LdMiscFrm     : Format<8>;
+def StMiscFrm     : Format<9>;
+def LdStMulFrm    : Format<10>;
+
+// Note: the numeric values are not sequential here; LdStExFrm was assigned
+// the next free value (28) when it was added, so it sits out of order.  Do
+// not renumber -- the values must match the Format enum in ARMInstrInfo.h.
+def LdStExFrm     : Format<28>;
+
+def ArithMiscFrm  : Format<11>;
+def ExtFrm        : Format<12>;
+
+def VFPUnaryFrm   : Format<13>;
+def VFPBinaryFrm  : Format<14>;
+def VFPConv1Frm   : Format<15>;
+def VFPConv2Frm   : Format<16>;
+def VFPConv3Frm   : Format<17>;
+def VFPConv4Frm   : Format<18>;
+def VFPConv5Frm   : Format<19>;
+def VFPLdStFrm    : Format<20>;
+def VFPLdStMulFrm : Format<21>;
+def VFPMiscFrm    : Format<22>;
+
+def ThumbFrm      : Format<23>;
+
+def NEONFrm       : Format<24>;
+def NEONGetLnFrm  : Format<25>;
+def NEONSetLnFrm  : Format<26>;
+def NEONDupFrm    : Format<27>;
+
+// Misc flags.  These are mixin classes: an instruction definition inherits
+// from one of them to flip the corresponding bit in its record.
+
+// UnaryDP - Indicates this is a unary data processing instruction, i.e.
+// it doesn't have a Rn operand.
+class UnaryDP    { bit isUnaryDataProc = 1; }
+
+// Xform16Bit - Indicates this Thumb2 instruction may be transformed into
+// a 16-bit Thumb instruction if certain conditions are met.
+class Xform16Bit { bit canXformTo16Bit = 1; }
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction flags.  These need to match ARMInstrInfo.h.
+//
+
+// Addressing mode.  The 4-bit value goes into TSFlags and must match the
+// ARMII::AddrMode enum in ARMInstrInfo.h.
+class AddrMode<bits<4> val> {
+  bits<4> Value = val;
+}
+def AddrModeNone  : AddrMode<0>;
+def AddrMode1     : AddrMode<1>;
+def AddrMode2     : AddrMode<2>;
+def AddrMode3     : AddrMode<3>;
+def AddrMode4     : AddrMode<4>;
+def AddrMode5     : AddrMode<5>;
+def AddrMode6     : AddrMode<6>;
+def AddrModeT1_1  : AddrMode<7>;
+def AddrModeT1_2  : AddrMode<8>;
+def AddrModeT1_4  : AddrMode<9>;
+def AddrModeT1_s  : AddrMode<10>;
+def AddrModeT2_i12: AddrMode<11>;
+def AddrModeT2_i8 : AddrMode<12>;
+def AddrModeT2_so : AddrMode<13>;
+def AddrModeT2_pc : AddrMode<14>;
+def AddrModeT2_i8s4 : AddrMode<15>;
+
+// Instruction size.  Encodes the byte size of the instruction (or that the
+// size is special/unknown); values must match ARMII in ARMInstrInfo.h.
+class SizeFlagVal<bits<3> val> {
+  bits<3> Value = val;
+}
+def SizeInvalid  : SizeFlagVal<0>;  // Unset.
+def SizeSpecial  : SizeFlagVal<1>;  // Pseudo or special.
+def Size8Bytes   : SizeFlagVal<2>;
+def Size4Bytes   : SizeFlagVal<3>;
+def Size2Bytes   : SizeFlagVal<4>;
+
+// Load / store index mode: none, pre-indexed (writeback before access), or
+// post-indexed (writeback after access).  Values must match ARMInstrInfo.h.
+class IndexMode<bits<2> val> {
+  bits<2> Value = val;
+}
+def IndexModeNone : IndexMode<0>;
+def IndexModePre  : IndexMode<1>;
+def IndexModePost : IndexMode<2>;
+
+// Instruction execution domain: which functional-unit domain(s) the
+// instruction may execute in.  Values must match ARMInstrInfo.h.
+class Domain<bits<2> val> {
+  bits<2> Value = val;
+}
+def GenericDomain : Domain<0>;
+def VFPDomain     : Domain<1>; // Instructions in VFP domain only
+def NeonDomain    : Domain<2>; // Instructions in Neon domain only
+def VFPNeonDomain : Domain<3>; // Instructions in both VFP & Neon domains
+
+//===----------------------------------------------------------------------===//
+
+// ARM special operands.
+//
+
+// ARM Predicate operand. Default to 14 = always (AL). Second part is CC
+// register whose default is 0 (no register), i.e. the instruction is
+// unconditional unless the predicate is rewritten by if-conversion.
+def pred : PredicateOperand<OtherVT, (ops i32imm, CCR),
+                                     (ops (i32 14), (i32 zero_reg))> {
+  let PrintMethod = "printPredicateOperand";
+}
+
+// Conditional code result for instructions whose 's' bit is set, e.g. subs.
+// Defaults to zero_reg, meaning CPSR is NOT written unless flipped.
+def cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 zero_reg))> {
+  let PrintMethod = "printSBitModifierOperand";
+}
+
+// Same as cc_out except it defaults to setting CPSR.
+def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
+  let PrintMethod = "printSBitModifierOperand";
+}
+
+//===----------------------------------------------------------------------===//
+
+// ARM Instruction templates.
+//
+
+// InstTemplate - Common base of all ARM-target instruction definitions.
+// Carries the target-specific TSFlags fields (addressing mode, size, index
+// mode, format, domain) plus the ARM-specific attribute bits.
+class InstTemplate<AddrMode am, SizeFlagVal sz, IndexMode im,
+                   Format f, Domain d, string cstr, InstrItinClass itin>
+  : Instruction {
+  let Namespace = "ARM";
+
+  // TSFlagsFields
+  AddrMode AM = am;
+  bits<4> AddrModeBits = AM.Value;
+  
+  SizeFlagVal SZ = sz;
+  bits<3> SizeFlag = SZ.Value;
+
+  IndexMode IM = im;
+  bits<2> IndexModeBits = IM.Value;
+  
+  Format F = f;
+  bits<5> Form = F.Value;
+
+  Domain D = d;
+  bits<2> Dom = D.Value;
+
+  //
+  // Attributes specific to ARM instructions...
+  //
+  bit isUnaryDataProc = 0;
+  bit canXformTo16Bit = 0;
+  
+  let Constraints = cstr;
+  let Itinerary = itin;
+}
+
+// Encoding - Mixin supplying the 32-bit instruction-encoding field that the
+// machine code emitter fills in via "let Inst{...}" assignments.
+class Encoding {
+  field bits<32> Inst;
+}
+
+// InstARM - Base for ARM-mode (and Thumb2) instructions: template + encoding.
+class InstARM<AddrMode am, SizeFlagVal sz, IndexMode im,
+              Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin>, Encoding;
+
+// This Encoding-less class is used by Thumb1 to specify the encoding bits later
+// on by adding flavors to specific instructions.
+class InstThumb<AddrMode am, SizeFlagVal sz, IndexMode im,
+                Format f, Domain d, string cstr, InstrItinClass itin>
+  : InstTemplate<am, sz, im, f, d, cstr, itin>;
+
+// PseudoInst - Instruction with no machine encoding; expanded or handled
+// specially before/during emission (size is SizeSpecial).
+class PseudoInst<dag oops, dag iops, InstrItinClass itin, 
+                 string asm, list<dag> pattern>
+  : InstARM<AddrModeNone, SizeSpecial, IndexModeNone, Pseudo, GenericDomain, 
+            "", itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+}
+
+// Almost all ARM instructions are predicable: class I appends the implicit
+// 'pred' operand ($p) to the input list and splices "${p}" into the asm
+// string so the condition suffix prints after the opcode.
+class I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+        IndexMode im, Format f, InstrItinClass itin, 
+        string opc, string asm, string cstr,
+        list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString   = !strconcat(opc, !strconcat("${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+// A few are not predicable: no pred operand is added and isPredicable is
+// explicitly cleared.
+class InoP<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+        IndexMode im, Format f, InstrItinClass itin, 
+        string opc, string asm, string cstr,
+        list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = !strconcat(opc, asm);
+  let Pattern = pattern;
+  let isPredicable = 0;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Same as I except it can optionally modify CPSR. Note it's modeled as
+// an input operand since by default it's a zero register. It will
+// become an implicit def once it's "flipped".  Appends both pred:$p and
+// cc_out:$s, printed as "${p}${s}" after the opcode.
+class sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string opc, string asm, string cstr,
+         list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p, cc_out:$s));
+  let AsmString   = !strconcat(opc, !strconcat("${p}${s}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Special cases: XI takes the fully-formed asm string as-is, with no
+// predicate operand or suffix substitution.
+class XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+         IndexMode im, Format f, InstrItinClass itin,
+         string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsARM];
+}
+
+// Convenience shorthands: 4-byte, AddrModeNone, no index mode.
+// AI = predicable, AsI = predicable + optional CPSR def, AXI = raw asm,
+// AInoP = not predicable.
+class AI<dag oops, dag iops, Format f, InstrItinClass itin,
+         string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
+class AsI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+       opc, asm, "", pattern>;
+class AXI<dag oops, dag iops, Format f, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern>;
+class AInoP<dag oops, dag iops, Format f, InstrItinClass itin,
+         string opc, string asm, list<dag> pattern>
+  : InoP<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
+
+// Ctrl flow instructions.  The 4-bit opcod occupies the primary opcode
+// field, Inst{27-24}.
+class ABI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+class ABXI<bits<4> opcod, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, BrFrm, itin,
+       asm, "", pattern> {
+  let Inst{27-24} = opcod;
+}
+// ABXIx2 - two-instruction (8-byte) branch sequence, no fixed encoding bits.
+class ABXIx2<dag oops, dag iops, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, Size8Bytes, IndexModeNone, BrMiscFrm, itin,
+       asm, "", pattern>;
+
+// BR_JT instructions (jump tables); size is special since the table follows.
+class JTI<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : XI<oops, iops, AddrModeNone, SizeSpecial, IndexModeNone, BrMiscFrm, itin,
+       asm, "", pattern>;
+
+
+// Atomic load/store instructions (ldrex/strex family).  opcod selects the
+// access size variant via Inst{22-21}; Inst{20} is the L (load) bit.
+
+class AIldrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20} = 1;
+  let Inst{11-0}  = 0b111110011111;
+}
+class AIstrex<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, LdStExFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-23} = 0b00011;
+  let Inst{22-21} = opcod;
+  let Inst{20} = 0;
+  let Inst{11-4}  = 0b11111001;
+}
+
+// addrmode1 (data-processing) instructions.  opcod fills the DP opcode field
+// Inst{24-21}; Inst{27-26} = 00 selects the data-processing instruction space.
+class AI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = {0,0};
+}
+class AsI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+       opc, asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = {0,0};
+}
+class AXI1<bits<4> opcod, dag oops, dag iops, Format f, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode1, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{24-21} = opcod;
+  let Inst{27-26} = {0,0};
+}
+// AI1x2 - 8-byte (two-instruction) addrmode1 sequence; no fixed bits.
+class AI1x2<dag oops, dag iops, Format f, InstrItinClass itin, 
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode1, Size8Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
+
+
+// addrmode2 loads and stores.  Inst{27-26} = 01 selects the single
+// load/store instruction space.  The class names encode the variant:
+// ld/st = load/store, w/b = word/byte; AXI* variants take raw asm strings.
+class AI2<dag oops, dag iops, Format f, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-26} = {0,1};
+}
+
+// loads
+// AI2ldw - word load, offset addressing (no writeback).
+class AI2ldw<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AXI2ldw<dag oops, dag iops, Format f, InstrItinClass itin, 
+              string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+// AI2ldb - byte load, offset addressing (no writeback).
+class AI2ldb<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AXI2ldb<dag oops, dag iops, Format f, InstrItinClass itin, 
+              string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+
+// stores
+// AI2stw - word store, offset addressing (no writeback).
+class AI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AXI2stw<dag oops, dag iops, Format f, InstrItinClass itin,
+              string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+// AI2stb - byte store, offset addressing (no writeback).
+class AI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AXI2stb<dag oops, dag iops, Format f, InstrItinClass itin,
+              string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode2, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+
+// Pre-indexed loads (writeback before the access: W=1, P=1).  The cstr
+// argument carries the tied-operand constraint for the updated base.
+class AI2ldwpr<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AI2ldbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+
+// Pre-indexed stores
+class AI2stwpr<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AI2stbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-26} = {0,1};
+}
+
+// Post-indexed loads (writeback after the access: P=0; W=0 means normal
+// post-indexed addressing, not user-mode access).
+class AI2ldwpo<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AI2ldbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-26} = {0,1};
+}
+
+// Post-indexed stores
+class AI2stwpo<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 0; // B bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-26} = {0,1};
+}
+class AI2stbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode2, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{22}    = 1; // B bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-26} = {0,1};
+}
+
+// addrmode3 (misc halfword / signed-byte / doubleword) instructions.
+// Variant selection is via the H/S bits (Inst{5}/Inst{6}) with Inst{4} and
+// Inst{7} both 1, per the ARM extra-load/store encoding space.
+class AI3<dag oops, dag iops, Format f, InstrItinClass itin, 
+          string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern>;
+class AXI3<dag oops, dag iops, Format f, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern>;
+
+// loads
+// AI3ldh - halfword load (ldrh).
+class AI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+// NOTE(review): the AXI3ld* variants below, unlike their AI3ld* twins, do
+// not set Inst{27-25} -- confirm this is intentional for their users.
+class AXI3ldh<dag oops, dag iops, Format f, InstrItinClass itin,
+              string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+}
+// AI3ldsh - signed halfword load (ldrsh).
+class AI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+class AXI3ldsh<dag oops, dag iops, Format f, InstrItinClass itin,
+               string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+}
+// AI3ldsb - signed byte load (ldrsb).
+class AI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 0; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+class AXI3ldsb<dag oops, dag iops, Format f, InstrItinClass itin,
+               string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 0; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+}
+// AI3ldd - doubleword load (ldrd): encoded with L=0 and S=1/H=0 in this
+// space, per the ARM extra-load/store encodings.
+class AI3ldd<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 0; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+
+// stores
+// AI3sth - halfword store (strh).
+class AI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+// NOTE(review): AXI3sth, unlike AI3sth, does not set Inst{27-25} --
+// confirm this is intentional for its users.
+class AXI3sth<dag oops, dag iops, Format f, InstrItinClass itin,
+              string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+}
+// AI3std - doubleword store (strd).
+class AI3std<dag oops, dag iops, Format f, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModeNone, f, itin,
+      opc, asm, "", pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 0; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+
+// Pre-indexed addrmode3 loads (P=1, W=1); cstr ties the updated base.
+class AI3ldhpr<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+class AI3ldshpr<dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+class AI3ldsbpr<dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 0; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+
+// Pre-indexed addrmode3 stores (P=1, W=1).
+class AI3sthpr<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePre, f, itin,
+      opc, asm, cstr, pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 1; // P bit
+  let Inst{27-25} = 0b000;
+}
+}
+
+// Post-indexed addrmode3 loads (P=0, W=1); cstr ties the updated base.
+class AI3ldhpo<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-25} = 0b000;
+}
+class AI3ldshpo<dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-25} = 0b000;
+}
+class AI3ldsbpo<dag oops, dag iops, Format f, InstrItinClass itin,
+                string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 0; // H bit
+  let Inst{6}     = 1; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 1; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-25} = 0b000;
+}
+
+// Post-indexed addrmode3 stores (P=0, W=1).
+class AI3sthpo<dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : I<oops, iops, AddrMode3, Size4Bytes, IndexModePost, f, itin,
+      opc, asm, cstr,pattern> {
+  let Inst{4}     = 1;
+  let Inst{5}     = 1; // H bit
+  let Inst{6}     = 0; // S bit
+  let Inst{7}     = 1;
+  let Inst{20}    = 0; // L bit
+  let Inst{21}    = 1; // W bit
+  let Inst{24}    = 0; // P bit
+  let Inst{27-25} = 0b000;
+}
+
+
+// addrmode4 (load/store multiple) instructions.  Inst{27-25} = 100 selects
+// the LDM/STM space; Inst{20} is the L bit, Inst{22} the S (PSR/user) bit.
+class AXI4ld<dag oops, dag iops, Format f, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{20}    = 1; // L bit
+  let Inst{22}    = 0; // S bit
+  let Inst{27-25} = 0b100;
+}
+class AXI4st<dag oops, dag iops, Format f, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : XI<oops, iops, AddrMode4, Size4Bytes, IndexModeNone, f, itin,
+       asm, "", pattern> {
+  let Inst{20}    = 0; // L bit
+  let Inst{22}    = 0; // S bit
+  let Inst{27-25} = 0b100;
+}
+
+// Unsigned multiply, multiply-accumulate instructions.  Inst{7-4} = 1001
+// is the multiply signature; opcod fills Inst{27-21}.
+class AMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{20}    = 0; // S bit
+  let Inst{27-21} = opcod;
+}
+// AsMul1I - multiply with optional CPSR update.  Note Inst{20} (S bit) is
+// deliberately left unset here; presumably it is filled in from the cc_out
+// operand when the instruction is "flipped" -- see class sI above.
+class AsMul1I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : sI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+       opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{27-21} = opcod;
+}
+
+// Most significant word multiply
+class AMul2I<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b1001;
+  let Inst{20}    = 1;
+  let Inst{27-21} = opcod;
+}
+
+// SMUL<x><y> / SMULW<y> / SMLA<x><y> / SMLAW<x><y>
+class AMulxyI<bits<7> opcod, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, MulFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{4}     = 0;
+  let Inst{7}     = 1;
+  let Inst{20}    = 0;
+  let Inst{27-21} = opcod;
+}
+
+// Extend instructions (sxtb/uxth etc.); opcod fills Inst{27-20}.
+class AExtI<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ExtFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{7-4}   = 0b0111;
+  let Inst{27-20} = opcod;
+}
+
+// Misc Arithmetic instructions (clz, rev, etc.).
+class AMiscA1I<bits<8> opcod, dag oops, dag iops, InstrItinClass itin,
+               string opc, string asm, list<dag> pattern>
+  : I<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, ArithMiscFrm, itin,
+      opc, asm, "", pattern> {
+  let Inst{27-20} = opcod;
+}
+
+//===----------------------------------------------------------------------===//
+
+// ARMPat - Same as Pat<>, but requires that the compiler be in ARM mode.
+class ARMPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM];
+}
+// ARMV5TEPat - ARM mode and the ARMv5TE DSP extensions.
+class ARMV5TEPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV5TE];
+}
+// ARMV6Pat - ARM mode and ARMv6 or later.
+class ARMV6Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsARM, HasV6];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Thumb Instruction Format Definitions.
+//
+
+// TI - Thumb instruction.
+
+// ThumbI - Base for instructions valid in any Thumb mode (Thumb1 or Thumb2);
+// Encoding-less (see InstThumb) so Thumb1 flavors can add bits later.
+class ThumbI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+             InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb];
+}
+
+class TI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>;
+
+// Two-address instructions: source $lhs is tied to result $dst.
+class TIt<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "$lhs = $dst", pattern>;
+
+// tBL, tBX 32-bit instructions: two 16-bit halves, so this one carries an
+// Encoding mixin and fixes the opcode bits of both halves.
+class TIx2<bits<5> opcod1, bits<2> opcod2, bit opcod3,
+    dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+    : ThumbI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>, Encoding {
+  let Inst{31-27} = opcod1;
+  let Inst{15-14} = opcod2;
+  let Inst{12} = opcod3;
+}
+
+// BR_JT instructions (jump tables); size is special.
+class TJTI<dag oops, dag iops, InstrItinClass itin, string asm, list<dag> pattern>
+  : ThumbI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
+
+// Thumb1 only
+class Thumb1I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+              InstrItinClass itin, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb1Only];
+}
+
+// T1I - 16-bit Thumb1 instruction.
+class T1I<dag oops, dag iops, InstrItinClass itin,
+          string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, asm, "", pattern>;
+// T1Ix2 - 32-bit (two 16-bit halves) Thumb1 instruction.
+class T1Ix2<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
+// T1JTI - Thumb1 jump-table instruction; size is computed specially.
+class T1JTI<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
+
+// Two-address instructions
+class T1It<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : Thumb1I<oops, iops, AddrModeNone, Size2Bytes, itin, 
+            asm, "$lhs = $dst", pattern>;
+
+// Thumb1 instruction that can either be predicated or set CPSR.
+// Appends an optional CPSR-def operand (s_cc_out:$s) to the outputs and a
+// predicate operand (pred:$p) to the inputs; asm is "opc${s}${p}asm".
+class Thumb1sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = !con(oops, (ops s_cc_out:$s));
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString = !strconcat(opc, !strconcat("${s}${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb1Only];
+}
+
+// T1sI - 16-bit Thumb1sI instruction.
+class T1sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1sIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1sI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
+            "$lhs = $dst", pattern>;
+
+// Thumb1 instruction that can be predicated.
+// Appends a predicate operand (pred:$p) to the inputs; asm is "opc${p}asm".
+class Thumb1pI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstThumb<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb1Only];
+}
+
+// T1pI - 16-bit predicable Thumb1 instruction, no memory addressing mode.
+class T1pI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm, "", pattern>;
+
+// Two-address instructions
+class T1pIt<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeNone, Size2Bytes, itin, opc, asm,
+            "$lhs = $dst", pattern>;
+
+// Addressing-mode-specific variants; AddrModeT1_1/_2/_4 presumably
+// correspond to byte/halfword/word-scaled immediates and AddrModeT1_s to
+// SP-relative addressing (matches the T1LdSt*Imm/T1LdStSP encodings below)
+// -- TODO confirm against ARMAddressingModes.
+class T1pI1<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeT1_1, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pI2<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeT1_2, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pI4<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeT1_4, Size2Bytes, itin, opc, asm, "", pattern>;
+class T1pIs<dag oops, dag iops, 
+            InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : Thumb1pI<oops, iops, AddrModeT1_s, Size2Bytes, itin, opc, asm, "", pattern>;
+
+// Encoding16 - 16-bit instruction encodings only use bits {15-0} of the
+// 32-bit Inst field; force the upper half to zero.
+class Encoding16 : Encoding {
+  let Inst{31-16} = 0x0000;
+}
+
+// A6.2 16-bit Thumb instruction encoding
+class T1Encoding<bits<6> opcode> : Encoding16 {
+  let Inst{15-10} = opcode;
+}
+
+// A6.2.1 Shift (immediate), add, subtract, move, and compare encoding.
+class T1General<bits<5> opcode> : Encoding16 {
+  let Inst{15-14} = 0b00;
+  let Inst{13-9} = opcode;
+}
+
+// A6.2.2 Data-processing encoding.
+class T1DataProcessing<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010000;
+  let Inst{9-6} = opcode;
+}
+
+// A6.2.3 Special data instructions and branch and exchange encoding.
+class T1Special<bits<4> opcode> : Encoding16 {
+  let Inst{15-10} = 0b010001;
+  let Inst{9-6} = opcode;
+}
+
+// A6.2.4 Load/store single data item encoding.
+class T1LoadStore<bits<4> opA, bits<3> opB> : Encoding16 {
+  let Inst{15-12} = opA;
+  let Inst{11-9} = opB;
+}
+// Common opA values, named by addressing form:
+class T1LdSt<bits<3> opB> : T1LoadStore<0b0101, opB>;
+class T1LdSt4Imm<bits<3> opB> : T1LoadStore<0b0110, opB>; // Immediate, 4 bytes
+class T1LdSt1Imm<bits<3> opB> : T1LoadStore<0b0111, opB>; // Immediate, 1 byte
+class T1LdSt2Imm<bits<3> opB> : T1LoadStore<0b1000, opB>; // Immediate, 2 bytes
+class T1LdStSP<bits<3> opB> : T1LoadStore<0b1001, opB>;   // SP relative
+
+// A6.2.5 Miscellaneous 16-bit instructions encoding.
+class T1Misc<bits<7> opcode> : Encoding16 {
+  let Inst{15-12} = 0b1011;
+  let Inst{11-5} = opcode;
+}
+
+// Thumb2I - Thumb2 instruction. Almost all Thumb2 instructions are predicable.
+// Appends pred:$p to the inputs; asm is "opc${p}asm".
+class Thumb2I<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+              InstrItinClass itin,
+              string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+// Same as Thumb2I except it can optionally modify CPSR. Note it's modeled as
+// an input operand since by default it's a zero register. It will
+// become an implicit def once it's "flipped".
+// FIXME: This uses unified syntax so {s} comes before {p}. We should make it
+// more consistent.
+class Thumb2sI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+               InstrItinClass itin,
+               string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p, cc_out:$s));
+  let AsmString   = !strconcat(opc, !strconcat("${s}${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+// Special cases: raw asm string, no implicit predicate operand.
+class Thumb2XI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+               InstrItinClass itin,
+               string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+// ThumbXI - Like Thumb2XI (no predicate operand) but restricted to Thumb1.
+class ThumbXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+               InstrItinClass itin,
+               string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, IndexModeNone, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb1Only];
+}
+
+// 4-byte Thumb2 instruction aliases, one per T2 addressing mode
+// (i12/i8/so/pc immediate and shifted-operand forms).
+class T2I<dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii12<dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i12, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ii8<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Iso<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_so, Size4Bytes, itin, opc, asm, "", pattern>;
+class T2Ipc<dag oops, dag iops, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_pc, Size4Bytes, itin, opc, asm, "", pattern>;
+// T2Ii8s4 - AddrModeT2_i8s4 form (presumably an 8-bit immediate scaled by 4,
+// per the mode name -- TODO confirm); encodes the P/W/load control bits.
+class T2Ii8s4<bit P, bit W, bit load, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeT2_i8s4, Size4Bytes, itin, opc, asm, "",
+            pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = P;
+  let Inst{23} = ?; // The U bit.
+  let Inst{22} = 1;
+  let Inst{21} = W;
+  let Inst{20} = load;
+}
+
+class T2sI<dag oops, dag iops, InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : Thumb2sI<oops, iops, AddrModeNone, Size4Bytes, itin, opc, asm, "", pattern>;
+
+class T2XI<dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, Size4Bytes, itin, asm, "", pattern>;
+// T2JTI - Thumb2 jump-table instruction; size is computed specially.
+class T2JTI<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : Thumb2XI<oops, iops, AddrModeNone, SizeSpecial, itin, asm, "", pattern>;
+
+// T2Ix2 - 8-byte (two 32-bit instructions) Thumb2 form.
+class T2Ix2<dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, list<dag> pattern>
+  : Thumb2I<oops, iops, AddrModeNone, Size8Bytes, itin, opc, asm, "", pattern>;
+
+
+// T2Iidxldst - Thumb2 indexed load / store instructions.
+// Predicable (pred:$p appended to inputs); the index mode is a parameter
+// since these exist in pre- and post-indexed forms.
+class T2Iidxldst<bit signed, bits<2> opcod, bit load, bit pre,
+                 dag oops, dag iops,
+                 AddrMode am, IndexMode im, InstrItinClass itin,
+                 string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, Size4Bytes, im, ThumbFrm, GenericDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString = !strconcat(opc, !strconcat("${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [IsThumb2];
+  let Inst{31-27} = 0b11111;
+  let Inst{26-25} = 0b00;
+  let Inst{24} = signed;
+  let Inst{23} = 0;
+  let Inst{22-21} = opcod;
+  let Inst{20} = load;
+  let Inst{11} = 1;
+  // (P, W) = (1, 1) Pre-indexed or (0, 1) Post-indexed
+  let Inst{10} = pre; // The P bit.
+  let Inst{8} = 1; // The W bit (writeback always on for indexed forms).
+}
+
+// Tv5Pat - Same as Pat<>, but requires V5T Thumb mode (Thumb1 on a v5T or
+// later subtarget).
+class Tv5Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb1Only, HasV5T];
+}
+
+// T1Pat - Same as Pat<>, but requires that the compiler be in Thumb1 mode.
+class T1Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb1Only];
+}
+
+// T2Pat - Same as Pat<>, but requires that the compiler be in Thumb2 mode.
+class T2Pat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [IsThumb2];
+}
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM VFP Instruction templates.
+//
+
+// Almost all VFP instructions are predicable.
+// VFPI - Base class for VFP instructions: pred:$p appended to inputs,
+// asm is "opc${p}asm", predicated on HasVFP2.
+class VFPI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+           IndexMode im, Format f, InstrItinClass itin,
+           string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString   = !strconcat(opc, !strconcat("${p}", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// Special cases: raw asm string, no implicit predicate operand.
+class VFPXI<dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+            IndexMode im, Format f, InstrItinClass itin,
+            string asm, string cstr, list<dag> pattern>
+  : InstARM<am, sz, im, f, VFPDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString   = asm;
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasVFP2];
+}
+
+// VFPAI - 4-byte VFP instruction with no memory addressing mode.
+class VFPAI<dag oops, dag iops, Format f, InstrItinClass itin,
+            string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrModeNone, Size4Bytes, IndexModeNone, f, itin,
+         opc, asm, "", pattern>;
+
+// ARM VFP addrmode5 loads and stores.
+// The D-suffixed classes (bits{11-8} = 0b1011) are the double-precision
+// forms; the S-suffixed ones (0b1010) are single-precision.
+class ADI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+      VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-8}  = 0b1011;
+
+  // 64-bit loads & stores operate on both NEON and VFP pipelines.
+  let Dom = VFPNeonDomain.Value;
+}
+
+class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
+           InstrItinClass itin,
+           string opc, string asm, list<dag> pattern>
+  : VFPI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+      VFPLdStFrm, itin, opc, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-24} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-8}  = 0b1010;
+}
+
+// Load / store multiple (non-predicated X variants of the above).
+class AXDI5<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+       VFPLdStMulFrm, itin, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-25} = 0b110;
+  let Inst{11-8}  = 0b1011;
+
+  // 64-bit loads & stores operate on both NEON and VFP pipelines.
+  let Dom = VFPNeonDomain.Value;
+}
+
+class AXSI5<dag oops, dag iops, InstrItinClass itin,
+            string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrMode5, Size4Bytes, IndexModeNone,
+       VFPLdStMulFrm, itin, asm, "", pattern> {
+  // TODO: Mark the instructions with the appropriate subtarget info.
+  let Inst{27-25} = 0b110;
+  let Inst{11-8}  = 0b1010;
+}
+
+// VFP arithmetic forms. Naming: AD* = double precision (bits{11-8} = 0b1011),
+// AS* = single precision (0b1010); *uI = unary, *bI = binary; trailing 'n'
+// marks variants disabled when NEON is used for single-precision FP.
+
+// Double precision, unary
+class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = 0b1011;
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Double precision, binary
+class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+       dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-8}  = 0b1011;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
+// Single precision, unary
+class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+           string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPUnaryFrm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
+// Single precision unary, if no NEON
+// Same as ASuI except not available if NEON is enabled
+class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
+            string asm, list<dag> pattern>
+  : ASuI<opcod1, opcod2, opcod3, opcod4, opcod5, oops, iops, itin, opc, asm,
+         pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+// Single precision, binary
+class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
+           InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, VFPBinaryFrm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-8}  = 0b1010;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
+// Single precision binary, if no NEON
+// Same as ASbI except not available if NEON is enabled
+class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
+       dag iops, InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : ASbI<opcod1, opcod2, op6, op4, oops, iops, itin, opc, asm, pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+// VFP conversion instructions. The VFPConv1Frm..VFPConv5Frm formats
+// presumably distinguish the different operand arrangements of the
+// conversion encodings -- TODO confirm against the format definitions.
+class AVConv1I<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+               dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+               list<dag> pattern>
+  : VFPAI<oops, iops, VFPConv1Frm, itin, opc, asm, pattern> {
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-8}  = opcod4;
+  let Inst{6}     = 1;
+  let Inst{4}     = 0;
+}
+
+// VFP conversion instructions, if no NEON
+class AVConv1In<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<4> opcod4,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string asm, list<dag> pattern>
+  : AVConv1I<opcod1, opcod2, opcod3, opcod4, oops, iops, itin, opc, asm,
+             pattern> {
+  list<Predicate> Predicates = [HasVFP2,DontUseNEONForFP];
+}
+
+// AVConvXI - Common base for the AVConv2I..AVConv5I aliases below; the
+// format is passed through as a parameter.
+class AVConvXI<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, Format f,
+               InstrItinClass itin,
+               string opc, string asm, list<dag> pattern>
+  : VFPAI<oops, iops, f, itin, opc, asm, pattern> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8}  = opcod2;
+  let Inst{4}     = 1;
+}
+
+class AVConv2I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv2Frm, itin, opc, asm, pattern>;
+
+class AVConv3I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops, 
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv3Frm, itin, opc, asm, pattern>;
+
+class AVConv4I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv4Frm, itin, opc, asm, pattern>;
+
+class AVConv5I<bits<8> opcod1, bits<4> opcod2, dag oops, dag iops,
+               InstrItinClass itin, string opc, string asm, list<dag> pattern>
+  : AVConvXI<opcod1, opcod2, oops, iops, VFPConv5Frm, itin, opc, asm, pattern>;
+
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM NEON Instruction templates.
+//
+
+// NeonI - Base class for NEON instructions. Predicable (pred:$p appended);
+// the assembly string is assembled as "opc${p}.dt\tasm".
+class NeonI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, Size4Bytes, im, NEONFrm, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString = !strconcat(
+                     !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)),
+                     !strconcat("\t", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+}
+
+// Same as NeonI except it does not have a "data type" specifier.
+// Assembly is "opc${p}\tasm".
+class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, InstrItinClass itin,
+            string opc, string asm, string cstr, list<dag> pattern>
+  : InstARM<am, Size4Bytes, im, NEONFrm, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString = !strconcat(!strconcat(opc, "${p}"), !strconcat("\t", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+}
+
+// NI - NEON instruction without a datatype suffix or memory addressing mode.
+class NI<dag oops, dag iops, InstrItinClass itin, string opc, string asm,
+         list<dag> pattern>
+  : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm, "",
+          pattern> {
+}
+
+// NI4 - Same as NI but using AddrMode4.
+class NI4<dag oops, dag iops, InstrItinClass itin, string opc,
+          string asm, list<dag> pattern>
+  : NeonXI<oops, iops, AddrMode4, IndexModeNone, itin, opc, asm, "",
+          pattern> {
+}
+
+// NLdSt - NEON load/store (AddrMode6); top byte fixed at 0b11110100.
+class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrMode6, IndexModeNone, itin, opc, dt, asm, cstr,
+          pattern> {
+  let Inst{31-24} = 0b11110100;
+  let Inst{23} = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8} = op11_8;
+  let Inst{7-4} = op7_4;
+}
+
+// NDataI - NEON data-processing instruction (top bits 0b1111001).
+class NDataI<dag oops, dag iops, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NeonI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, dt, asm,
+         cstr, pattern> {
+  let Inst{31-25} = 0b1111001;
+}
+
+// NDataXI - Same as NDataI but without a datatype suffix.
+class NDataXI<dag oops, dag iops, InstrItinClass itin,
+             string opc, string asm, string cstr, list<dag> pattern>
+  : NeonXI<oops, iops, AddrModeNone, IndexModeNone, itin, opc, asm,
+         cstr, pattern> {
+  let Inst{31-25} = 0b1111001;
+}
+
+// NEON "one register and a modified immediate" format.
+class N1ModImm<bit op23, bits<3> op21_19, bits<4> op11_8, bit op7, bit op6,
+               bit op5, bit op4,
+               dag oops, dag iops, InstrItinClass itin,
+               string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{23} = op23;
+  let Inst{21-19} = op21_19;
+  let Inst{11-8} = op11_8;
+  let Inst{7} = op7;
+  let Inst{6} = op6;
+  let Inst{5} = op5;
+  let Inst{4} = op4;
+}
+
+// NEON 2 vector register format.
+class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+          bits<5> op11_7, bit op6, bit op4,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7} = op11_7;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
+// Same as N2V except it doesn't have a datatype suffix.
+class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
+          bits<5> op11_7, bit op6, bit op4,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> {
+  let Inst{24-23} = op24_23;
+  let Inst{21-20} = op21_20;
+  let Inst{19-18} = op19_18;
+  let Inst{17-16} = op17_16;
+  let Inst{11-7} = op11_7;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
+// NEON 2 vector register with immediate.
+class N2VImm<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             dag oops, dag iops, InstrItinClass itin,
+             string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24} = op24;
+  let Inst{23} = op23;
+  let Inst{11-8} = op11_8;
+  let Inst{7} = op7;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
+// NEON 3 vector register format.
+class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string dt, string asm, string cstr, list<dag> pattern>
+  : NDataI<oops, iops, itin, opc, dt, asm, cstr, pattern> {
+  let Inst{24} = op24;
+  let Inst{23} = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8} = op11_8;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
+// Same as N3V except it doesn't have a data type suffix.
+class N3VX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
+          dag oops, dag iops, InstrItinClass itin,
+          string opc, string asm, string cstr, list<dag> pattern>
+  : NDataXI<oops, iops, itin, opc, asm, cstr, pattern> {
+  let Inst{24} = op24;
+  let Inst{23} = op23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8} = op11_8;
+  let Inst{6} = op6;
+  let Inst{4} = op4;
+}
+
+// NEON VMOVs between scalar and core registers.
+// Predicable; assembly is "opc${p}.dt\tasm". Uses GenericDomain since these
+// move between the core and vector register files.
+class NVLaneOp<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+               dag oops, dag iops, Format f, InstrItinClass itin,
+               string opc, string dt, string asm, list<dag> pattern>
+  : InstARM<AddrModeNone, Size4Bytes, IndexModeNone, f, GenericDomain,
+    "", itin> {
+  let Inst{27-20} = opcod1;
+  let Inst{11-8} = opcod2;
+  let Inst{6-5} = opcod3;
+  let Inst{4} = 1;
+
+  let OutOperandList = oops;
+  let InOperandList = !con(iops, (ops pred:$p));
+  let AsmString = !strconcat(
+                     !strconcat(!strconcat(opc, "${p}"), !strconcat(".", dt)),
+                     !strconcat("\t", asm));
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+}
+// Lane-operation aliases; only the Format differs between them.
+class NVGetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONGetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVSetLane<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+                dag oops, dag iops, InstrItinClass itin,
+                string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONSetLnFrm, itin,
+             opc, dt, asm, pattern>;
+class NVDup<bits<8> opcod1, bits<4> opcod2, bits<2> opcod3,
+            dag oops, dag iops, InstrItinClass itin,
+            string opc, string dt, string asm, list<dag> pattern>
+  : NVLaneOp<opcod1, opcod2, opcod3, oops, iops, NEONDupFrm, itin,
+             opc, dt, asm, pattern>;
+
+// NEONFPPat - Same as Pat<>, but requires that the compiler be using NEON
+// for single-precision FP.
+class NEONFPPat<dag pattern, dag result> : Pat<pattern, result> {
+  list<Predicate> Predicates = [HasNEON,UseNEONForFP];
+}
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
new file mode 100644
index 0000000..85f6b40
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -0,0 +1,86 @@
+//===- ARMInstrInfo.cpp - ARM Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMInstrInfo.h"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+using namespace llvm;
+
+// Construct ARM-mode instruction info; the register info member is built
+// with a back-reference to this instruction info and the same subtarget.
+ARMInstrInfo::ARMInstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+}
+
+/// getUnindexedOpcode - Map a pre/post-indexed load or store opcode to its
+/// plain (non-writeback) counterpart. Returns 0 if the opcode has no
+/// unindexed form.
+unsigned ARMInstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  switch (Opc) {
+  default: break;
+  case ARM::LDR_PRE:
+  case ARM::LDR_POST:
+    return ARM::LDR;
+  case ARM::LDRH_PRE:
+  case ARM::LDRH_POST:
+    return ARM::LDRH;
+  case ARM::LDRB_PRE:
+  case ARM::LDRB_POST:
+    return ARM::LDRB;
+  case ARM::LDRSH_PRE:
+  case ARM::LDRSH_POST:
+    return ARM::LDRSH;
+  case ARM::LDRSB_PRE:
+  case ARM::LDRSB_POST:
+    return ARM::LDRSB;
+  case ARM::STR_PRE:
+  case ARM::STR_POST:
+    return ARM::STR;
+  case ARM::STRH_PRE:
+  case ARM::STRH_POST:
+    return ARM::STRH;
+  case ARM::STRB_PRE:
+  case ARM::STRB_POST:
+    return ARM::STRB;
+  }
+
+  // Not an indexed memory opcode.
+  return 0;
+}
+
+/// reMaterialize - Re-emit Orig at insertion point I, defining DestReg
+/// (with optional SubIdx). Only MOVi2pieces needs ARM-specific handling;
+/// everything else is handled by the base class.
+void ARMInstrInfo::
+reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+              unsigned DestReg, unsigned SubIdx, const MachineInstr *Orig,
+              const TargetRegisterInfo *TRI) const {
+  DebugLoc dl = Orig->getDebugLoc();
+  unsigned Opcode = Orig->getOpcode();
+  switch (Opcode) {
+  default:
+    break;
+  case ARM::MOVi2pieces: {
+    // Rematerialize the two-instruction immediate move as a constant-pool
+    // load instead. Operands: (1) immediate, (2) condition code,
+    // (3) predicate register -- per the emitLoadConstPool call below.
+    RI.emitLoadConstPool(MBB, I, dl,
+                         DestReg, SubIdx,
+                         Orig->getOperand(1).getImm(),
+                         (ARMCC::CondCodes)Orig->getOperand(2).getImm(),
+                         Orig->getOperand(3).getReg());
+    // The helper inserted the load just before I; patch its def's subreg
+    // index to match the requested one.
+    MachineInstr *NewMI = prior(I);
+    NewMI->getOperand(0).setSubReg(SubIdx);
+    return;
+  }
+  }
+
+  return ARMBaseInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, TRI);
+}
+
diff --git a/lib/Target/ARM/ARMInstrInfo.h b/lib/Target/ARM/ARMInstrInfo.h
new file mode 100644
index 0000000..d4199d1
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.h
@@ -0,0 +1,49 @@
+//===- ARMInstrInfo.h - ARM Instruction Information -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTRUCTIONINFO_H
+#define ARMINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "ARM.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+/// ARMInstrInfo - ARM-mode implementation of TargetInstrInfo, layered on
+/// the shared ARMBaseInstrInfo.
+class ARMInstrInfo : public ARMBaseInstrInfo {
+  ARMRegisterInfo RI;  // ARM-mode register info, constructed alongside this.
+public:
+  explicit ARMInstrInfo(const ARMSubtarget &STI);
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is no such opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const;
+
+  // Re-emit Orig at MI, defining DestReg; handles ARM-specific pseudos and
+  // defers the rest to ARMBaseInstrInfo.
+  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                     unsigned DestReg, unsigned SubIdx,
+                     const MachineInstr *Orig,
+                     const TargetRegisterInfo *TRI) const;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const ARMRegisterInfo &getRegisterInfo() const { return RI; }
+};
+
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
new file mode 100644
index 0000000..852c74e
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -0,0 +1,1961 @@
+//===- ARMInstrInfo.td - Target Description for ARM Target -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// ARM specific DAG Nodes.
+//
+
+// Type profiles.
+// SDTypeProfile<NumResults, NumOperands, Constraints>; indices used in the
+// constraint lists count results first, then operands.  NumOperands == -1
+// means variadic.
+def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_ARMCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>;
+
+def SDT_ARMSaveCallPC : SDTypeProfile<0, 1, []>;
+
+// Calls: variadic -- operand 0 is the integer callee, the rest are arguments.
+def SDT_ARMcall    : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+
+// Conditional move: result, two source values of the same type, and an i32
+// (the condition-code operand).
+def SDT_ARMCMov    : SDTypeProfile<1, 3,
+                                   [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                    SDTCisVT<3, i32>]>;
+
+def SDT_ARMBrcond  : SDTypeProfile<0, 2,
+                                   [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+
+def SDT_ARMBrJT    : SDTypeProfile<0, 3,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>]>;
+
+def SDT_ARMBr2JT   : SDTypeProfile<0, 4,
+                                  [SDTCisPtrTy<0>, SDTCisVT<1, i32>,
+                                   SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+
+// Compare: no result value (the comparison is communicated out-of-band, see
+// ARMcmp below); both operands must have the same type.
+def SDT_ARMCmp     : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
+                                          SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
+
+def SDT_ARMThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>;
+def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>,
+                                                 SDTCisInt<2>]>;
+
+// Memory barriers: the v7 forms take no operand, the v6 forms take one
+// integer operand.
+def SDT_ARMMEMBARRIERV7  : SDTypeProfile<0, 0, []>;
+def SDT_ARMSYNCBARRIERV7 : SDTypeProfile<0, 0, []>;
+def SDT_ARMMEMBARRIERV6  : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_ARMSYNCBARRIERV6 : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+// Node definitions.
+// Each def binds a target-specific SelectionDAG node (ARMISD::*) to a type
+// profile.  SDNPInFlag/SDNPOutFlag model the glue operand/result used here to
+// thread the condition flags (CPSR) between nodes.
+def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
+def ARMWrapperJT     : SDNode<"ARMISD::WrapperJT",   SDTIntBinOp>;
+
+def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart,
+                              [SDNPHasChain, SDNPOutFlag]>;
+def ARMcallseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_ARMCallSeqEnd,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def ARMcall          : SDNode<"ARMISD::CALL", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def ARMcall_pred    : SDNode<"ARMISD::CALL_PRED", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def ARMcall_nolink   : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def ARMretflag       : SDNode<"ARMISD::RET_FLAG", SDTNone,
+                              [SDNPHasChain, SDNPOptInFlag]>;
+
+// Conditional move / conditional negate: consume the flags produced by a
+// preceding ARMcmp via the glue edge.
+def ARMcmov          : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
+                              [SDNPInFlag]>;
+def ARMcneg          : SDNode<"ARMISD::CNEG", SDT_ARMCMov,
+                              [SDNPInFlag]>;
+
+def ARMbrcond        : SDNode<"ARMISD::BRCOND", SDT_ARMBrcond,
+                              [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+def ARMbrjt          : SDNode<"ARMISD::BR_JT", SDT_ARMBrJT,
+                              [SDNPHasChain]>;
+def ARMbr2jt         : SDNode<"ARMISD::BR2_JT", SDT_ARMBr2JT,
+                              [SDNPHasChain]>;
+
+// Compare nodes produce their result purely as a flag (glue) output.
+// CMPZ is a compare against zero and is commutative.
+def ARMcmp           : SDNode<"ARMISD::CMP", SDT_ARMCmp,
+                              [SDNPOutFlag]>;
+
+def ARMcmpZ          : SDNode<"ARMISD::CMPZ", SDT_ARMCmp,
+                              [SDNPOutFlag,SDNPCommutative]>;
+
+def ARMpic_add       : SDNode<"ARMISD::PIC_ADD", SDT_ARMPICAdd>;
+
+def ARMsrl_flag      : SDNode<"ARMISD::SRL_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
+def ARMsra_flag      : SDNode<"ARMISD::SRA_FLAG", SDTIntUnaryOp, [SDNPOutFlag]>;
+def ARMrrx           : SDNode<"ARMISD::RRX"     , SDTIntUnaryOp, [SDNPInFlag ]>;
+
+def ARMthread_pointer: SDNode<"ARMISD::THREAD_POINTER", SDT_ARMThreadPointer>;
+def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp>;
+
+// NOTE: the V7 and V6 barrier nodes intentionally share the same ARMISD
+// opcode strings but use different type profiles (the V6 forms carry an
+// extra integer operand).
+def ARMMemBarrierV7  : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV7,
+                              [SDNPHasChain]>;
+def ARMSyncBarrierV7 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV7,
+                              [SDNPHasChain]>;
+def ARMMemBarrierV6  : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIERV6,
+                              [SDNPHasChain]>;
+def ARMSyncBarrierV6 : SDNode<"ARMISD::SYNCBARRIER", SDT_ARMMEMBARRIERV6,
+                              [SDNPHasChain]>;
+
+def ARMrbit          : SDNode<"ARMISD::RBIT", SDTIntUnaryOp>;
+
+//===----------------------------------------------------------------------===//
+// ARM Instruction Predicate Definitions.
+//
+// Each predicate is a C++ expression evaluated against the current
+// ARMSubtarget at instruction-selection time.
+def HasV5T    : Predicate<"Subtarget->hasV5TOps()">;
+def HasV5TE   : Predicate<"Subtarget->hasV5TEOps()">;
+def HasV6     : Predicate<"Subtarget->hasV6Ops()">;
+def HasV6T2   : Predicate<"Subtarget->hasV6T2Ops()">;
+def NoV6T2    : Predicate<"!Subtarget->hasV6T2Ops()">;
+def HasV7     : Predicate<"Subtarget->hasV7Ops()">;
+def HasVFP2   : Predicate<"Subtarget->hasVFP2()">;
+def HasVFP3   : Predicate<"Subtarget->hasVFP3()">;
+def HasNEON   : Predicate<"Subtarget->hasNEON()">;
+def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
+def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
+def IsThumb   : Predicate<"Subtarget->isThumb()">;
+def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
+def IsThumb2  : Predicate<"Subtarget->isThumb2()">;
+def IsARM     : Predicate<"!Subtarget->isThumb()">;
+def IsDarwin    : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin : Predicate<"!Subtarget->isTargetDarwin()">;
+// These two inspect the node being matched (N) rather than the subtarget:
+// whether result #1 of the node (the carry) has any uses.
+def CarryDefIsUnused : Predicate<"!N->hasAnyUseOfValue(1)">;
+def CarryDefIsUsed   : Predicate<"N->hasAnyUseOfValue(1)">;
+
+// FIXME: Eventually this will be just "hasV6T2Ops".
+def UseMovt   : Predicate<"Subtarget->useMovt()">;
+def DontUseMovt : Predicate<"!Subtarget->useMovt()">;
+
+//===----------------------------------------------------------------------===//
+// ARM Flag Definitions.
+
+// RegConstraint - Mixin that sets an instruction's Constraints string,
+// e.g. "$src = $dst" to tie two operands to the same register.
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+
+//===----------------------------------------------------------------------===//
+//  ARM specific transformation functions and pattern fragments.
+//
+
+// so_imm_neg_XFORM - Return a so_imm value packed into the format described for
+// so_imm_neg def below.
+def so_imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
+}]>;
+
+// so_imm_not_XFORM - Return a so_imm value packed into the format described for
+// so_imm_not def below.
+def so_imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32);
+}]>;
+
+// rot_imm predicate - True if the 32-bit immediate is equal to 8, 16, or 24.
+// (The only rotate amounts the sxtb/uxtb-family instructions accept.)
+def rot_imm : PatLeaf<(i32 imm), [{
+  int32_t v = (int32_t)N->getZExtValue();
+  return v == 8 || v == 16 || v == 24;
+}]>;
+
+/// imm1_15 predicate - True if the 32-bit immediate is in the range [1,15].
+def imm1_15 : PatLeaf<(i32 imm), [{
+  return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 16;
+}]>;
+
+/// imm16_31 predicate - True if the 32-bit immediate is in the range [16,31].
+def imm16_31 : PatLeaf<(i32 imm), [{
+  return (int32_t)N->getZExtValue() >= 16 && (int32_t)N->getZExtValue() < 32;
+}]>;
+
+// so_imm_neg - An immediate whose negation is a legal shifter-operand
+// immediate (getSOImmVal returns -1 for unencodable values); the attached
+// xform rewrites the node to that negated value.
+def so_imm_neg : 
+  PatLeaf<(imm), [{
+    return ARM_AM::getSOImmVal(-(int)N->getZExtValue()) != -1;
+  }], so_imm_neg_XFORM>;
+
+// so_imm_not - Same idea for the bitwise complement (e.g. to turn a MOV of
+// an unencodable immediate into an MVN of an encodable one).
+def so_imm_not :
+  PatLeaf<(imm), [{
+    return ARM_AM::getSOImmVal(~(int)N->getZExtValue()) != -1;
+  }], so_imm_not_XFORM>;
+
+// sext_16_node predicate - True if the SDNode is sign-extended 16 or more bits.
+def sext_16_node : PatLeaf<(i32 GPR:$a), [{
+  return CurDAG->ComputeNumSignBits(SDValue(N,0)) >= 17;
+}]>;
+
+/// bf_inv_mask_imm predicate - An AND mask to clear an arbitrary width
+/// bitfield, e.g., 0xf000ffff.  Accepts an immediate made of a (possibly
+/// empty) run of 1's at the top, a (possibly empty) run of 1's at the bottom,
+/// and nothing but 0's in between -- i.e. the complement of one contiguous
+/// bitfield.  0xffffffff is rejected (no field to clear).
+def bf_inv_mask_imm : Operand<i32>,
+                      PatLeaf<(imm), [{ 
+  uint32_t v = (uint32_t)N->getZExtValue();
+  if (v == 0xffffffff)
+    return 0;
+  // There can be 1's on either or both "outsides"; all the "inside" bits
+  // must be 0's.  Shift unsigned constants: '1 << 31' on a signed int is
+  // undefined behavior in C++.
+  unsigned int lsb = 0, msb = 31;
+  while (v & (1u << msb)) --msb;   // skip the run of 1's at the top
+  while (v & (1u << lsb)) ++lsb;   // skip the run of 1's at the bottom
+  for (unsigned int i = lsb; i <= msb; ++i) {
+    if (v & (1u << i))
+      return 0;                    // stray 1 inside the field
+  }
+  return 1;
+}]> {
+  let PrintMethod = "printBitfieldInvMaskImmOperand";
+}
+
+/// Split a 32-bit immediate into two 16 bit parts.
+def lo16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() & 0xffff,
+                                   MVT::i32);
+}]>;
+
+def hi16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((uint32_t)N->getZExtValue() >> 16, MVT::i32);
+}]>;
+
+// lo16AllZero - Matches an immediate whose low 16 bits are all zero; the
+// attached hi16 xform then yields just the top half.
+def lo16AllZero : PatLeaf<(i32 imm), [{
+  // Returns true if all low 16-bits are 0.
+  return (((uint32_t)N->getZExtValue()) & 0xFFFFUL) == 0;
+}], hi16>;
+
+/// imm0_65535 predicate - True if the 32-bit immediate is in the range 
+/// [0,65535].
+def imm0_65535 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getZExtValue() < 65536;
+}]>;
+
+// Helpers for writing binary / unary operator pattern fragments with named
+// $LHS/$RHS (resp. $Src) operands.
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag <dag res> : PatFrag<(ops node:$Src), res>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+// Each operand may name a PrintMethod, a hook on the ARM asm printer used to
+// emit it.
+
+// Branch target.
+def brtarget : Operand<OtherVT>;
+
+// A list of registers separated by comma. Used by load/store multiple.
+def reglist : Operand<i32> {
+  let PrintMethod = "printRegisterList";
+}
+
+// An operand for the CONSTPOOL_ENTRY pseudo-instruction.
+def cpinst_operand : Operand<i32> {
+  let PrintMethod = "printCPInstOperand";
+}
+
+// Jump-table block operands (ARM and Thumb2 variants of the printer hook).
+def jtblock_operand : Operand<i32> {
+  let PrintMethod = "printJTBlockOperand";
+}
+def jt2block_operand : Operand<i32> {
+  let PrintMethod = "printJT2BlockOperand";
+}
+
+// Local PC labels.
+def pclabel : Operand<i32> {
+  let PrintMethod = "printPCLabel";
+}
+
+// shifter_operand operands: so_reg and so_imm.
+// so_reg matches a register shifted by a register or an immediate; the
+// ComplexPattern hook SelectShifterOperandReg folds shl/srl/sra/rotr nodes
+// into the three MI operands (base reg, shift reg, shift-op immediate).
+def so_reg : Operand<i32>,    // reg reg imm
+            ComplexPattern<i32, 3, "SelectShifterOperandReg",
+                            [shl,srl,sra,rotr]> {
+  let PrintMethod = "printSORegOperand";
+  let MIOperandInfo = (ops GPR, GPR, i32imm);
+}
+
+// so_imm - Match a 32-bit shifter_operand immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits.  so_imm values are
+// represented in the imm field in the same 12-bit form that they are encoded
+// into so_imm instructions: the 8-bit immediate is the least significant bits
+// [bits 0-7], the 4-bit shift amount is the next 4 bits [bits 8-11].
+def so_imm : Operand<i32>,
+             PatLeaf<(imm), [{
+      return ARM_AM::getSOImmVal(N->getZExtValue()) != -1;
+    }]> {
+  let PrintMethod = "printSOImmOperand";
+}
+
+// Break so_imm's up into two pieces.  This handles immediates with up to 16
+// bits set in them.  This uses so_imm2part to match and so_imm2part_[12] to
+// get the first/second pieces.
+def so_imm2part : Operand<i32>,
+                  PatLeaf<(imm), [{
+      return ARM_AM::isSOImmTwoPartVal((unsigned)N->getZExtValue());
+    }]> {
+  let PrintMethod = "printSOImm2PartOperand";
+}
+
+def so_imm2part_1 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getSOImmTwoPartFirst((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def so_imm2part_2 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getSOImmTwoPartSecond((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+// Same two-part decomposition, but applied to the negation of the immediate.
+def so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{
+      return ARM_AM::isSOImmTwoPartVal(-(int)N->getZExtValue());
+    }]> {
+  let PrintMethod = "printSOImm2PartOperand";
+}
+
+def so_neg_imm2part_1 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getSOImmTwoPartFirst(-(int)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def so_neg_imm2part_2 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getSOImmTwoPartSecond(-(int)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+/// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
+def imm0_31 : Operand<i32>, PatLeaf<(imm), [{
+  // Compare as unsigned: an immediate with the sign bit set would cast to a
+  // negative int32_t and wrongly pass a signed '< 32' check.
+  return (uint32_t)N->getZExtValue() < 32;
+}]>;
+
+// Define ARM specific addressing modes.
+// Each addrmode is a ComplexPattern: the named C++ hook in ARMISelDAGToDAG
+// matches an address expression and produces the listed MI operands.
+
+// addrmode2 := reg +/- reg shop imm
+// addrmode2 := reg +/- imm12
+//
+def addrmode2 : Operand<i32>,
+                ComplexPattern<i32, 3, "SelectAddrMode2", []> {
+  let PrintMethod = "printAddrMode2Operand";
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+// Offset-only form, used by the pre/post-indexed variants of addrmode2.
+def am2offset : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode2Offset", []> {
+  let PrintMethod = "printAddrMode2OffsetOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode3 := reg +/- reg
+// addrmode3 := reg +/- imm8
+//
+def addrmode3 : Operand<i32>,
+                ComplexPattern<i32, 3, "SelectAddrMode3", []> {
+  let PrintMethod = "printAddrMode3Operand";
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+def am3offset : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode3Offset", []> {
+  let PrintMethod = "printAddrMode3OffsetOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode4 := reg, <mode|W>
+//
+def addrmode4 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode4", []> {
+  let PrintMethod = "printAddrMode4Operand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode5 := reg +/- imm8*4
+//
+def addrmode5 : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode5", []> {
+  let PrintMethod = "printAddrMode5Operand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// addrmode6 := reg with optional writeback
+//
+def addrmode6 : Operand<i32>,
+                ComplexPattern<i32, 4, "SelectAddrMode6", []> {
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, GPR:$upd, i32imm, i32imm);
+}
+
+// addrmodepc := pc + reg
+//
+def addrmodepc : Operand<i32>,
+                 ComplexPattern<i32, 2, "SelectAddrModePC", []> {
+  let PrintMethod = "printAddrModePCOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// An immediate printed without the usual '#' prefix.
+def nohash_imm : Operand<i32> {
+  let PrintMethod = "printNoHashImmediate";
+}
+
+//===----------------------------------------------------------------------===//
+
+include "ARMInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+/// AsI1_bin_irs - Defines a set of (op r, {so_imm|r|so_reg}) patterns for a
+/// binop that produces a value.  Suffixes name the second-operand form:
+/// ri = reg/immediate, rr = reg/reg, rs = reg/shifted-reg.
+multiclass AsI1_bin_irs<bits<4> opcod, string opc, PatFrag opnode,
+                        bit Commutable = 0> {
+  def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+               IIC_iALUi, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
+    let Inst{25} = 1;   // I bit: shifter operand is an immediate
+  }
+  def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
+               IIC_iALUr, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
+    let Inst{11-4} = 0b00000000;  // no shift applied to the register operand
+    let Inst{25} = 0;   // I bit clear: register shifter operand
+    let isCommutable = Commutable;
+  }
+  def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+               IIC_iALUsr, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
+    let Inst{25} = 0;
+  }
+}
+
+/// AI1_bin_s_irs - Similar to AsI1_bin_irs except it sets the 's' bit so the
+/// instruction modifies the CPSR register.
+let Defs = [CPSR] in {
+multiclass AI1_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
+                         bit Commutable = 0> {
+  def ri : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+               IIC_iALUi, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]> {
+    let Inst{20} = 1;   // S bit: update CPSR
+    let Inst{25} = 1;   // I bit: immediate shifter operand
+  }
+  def rr : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b), DPFrm,
+               IIC_iALUr, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]> {
+    let isCommutable = Commutable;
+    let Inst{11-4} = 0b00000000;  // no shift on the register operand
+    let Inst{20} = 1;   // S bit
+    let Inst{25} = 0;
+  }
+  def rs : AI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+               IIC_iALUsr, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]> {
+    let Inst{20} = 1;   // S bit
+    let Inst{25} = 0;
+  }
+}
+}
+
+/// AI1_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
+/// patterns. Similar to AsI1_bin_irs except the instruction does not produce
+/// an explicit result, it only implicitly sets CPSR.
+let Defs = [CPSR] in {
+multiclass AI1_cmp_irs<bits<4> opcod, string opc, PatFrag opnode,
+                       bit Commutable = 0> {
+  def ri : AI1<opcod, (outs), (ins GPR:$a, so_imm:$b), DPFrm, IIC_iCMPi,
+               opc, "\t$a, $b",
+               [(opnode GPR:$a, so_imm:$b)]> {
+    let Inst{20} = 1;   // S bit: update CPSR (always set for compares)
+    let Inst{25} = 1;   // I bit: immediate shifter operand
+  }
+  def rr : AI1<opcod, (outs), (ins GPR:$a, GPR:$b), DPFrm, IIC_iCMPr,
+               opc, "\t$a, $b",
+               [(opnode GPR:$a, GPR:$b)]> {
+    let Inst{11-4} = 0b00000000;  // no shift on the register operand
+    let Inst{20} = 1;
+    let Inst{25} = 0;
+    let isCommutable = Commutable;
+  }
+  def rs : AI1<opcod, (outs), (ins GPR:$a, so_reg:$b), DPSoRegFrm, IIC_iCMPsr,
+               opc, "\t$a, $b",
+               [(opnode GPR:$a, so_reg:$b)]> {
+    let Inst{20} = 1;
+    let Inst{25} = 0;
+  }
+}
+}
+
+/// AI_unary_rrot - A unary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+/// Used for the sign/zero-extend family; both forms require ARM mode and v6.
+/// FIXME: Remove the 'r' variant. Its rot_imm is zero.
+multiclass AI_unary_rrot<bits<8> opcod, string opc, PatFrag opnode> {
+  def r     : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src),
+                 IIC_iUNAr, opc, "\t$dst, $src",
+                 [(set GPR:$dst, (opnode GPR:$src))]>,
+              Requires<[IsARM, HasV6]> {
+    let Inst{11-10} = 0b00;       // rotate field = 0
+    let Inst{19-16} = 0b1111;     // Rn = 0b1111 marks the unary (extend) form
+  }
+  def r_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$src, i32imm:$rot),
+                 IIC_iUNAsi, opc, "\t$dst, $src, ror $rot",
+                 [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]>,
+              Requires<[IsARM, HasV6]> {
+    let Inst{19-16} = 0b1111;
+  }
+}
+
+/// AI_bin_rrot - A binary operation with two forms: one whose operand is a
+/// register and one whose operand is a register rotated by 8/16/24.
+/// Binary counterpart of AI_unary_rrot (extend-and-add family).
+multiclass AI_bin_rrot<bits<8> opcod, string opc, PatFrag opnode> {
+  def rr     : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS),
+                  IIC_iALUr, opc, "\t$dst, $LHS, $RHS",
+                  [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]>,
+               Requires<[IsARM, HasV6]> {
+    let Inst{11-10} = 0b00;       // rotate field = 0
+  }
+  def rr_rot : AExtI<opcod, (outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot),
+                  IIC_iALUsi, opc, "\t$dst, $LHS, $RHS, ror $rot",
+                  [(set GPR:$dst, (opnode GPR:$LHS,
+                                          (rotr GPR:$RHS, rot_imm:$rot)))]>,
+                  Requires<[IsARM, HasV6]>;
+}
+
+/// AI1_adde_sube_irs - Define instructions and patterns for adde and sube.
+/// All variants read the incoming carry (Uses = [CPSR]).  The plain variants
+/// match only when the node's own carry result is unused; the S-suffixed
+/// variants below additionally set CPSR and match when the carry is used.
+let Uses = [CPSR] in {
+multiclass AI1_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             bit Commutable = 0> {
+  def ri : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
+                DPFrm, IIC_iALUi, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
+               Requires<[IsARM, CarryDefIsUnused]> {
+    let Inst{25} = 1;   // I bit: immediate shifter operand
+  }
+  def rr : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+                DPFrm, IIC_iALUr, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
+               Requires<[IsARM, CarryDefIsUnused]> {
+    let isCommutable = Commutable;
+    let Inst{11-4} = 0b00000000;  // no shift on the register operand
+    let Inst{25} = 0;
+  }
+  def rs : AsI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
+                DPSoRegFrm, IIC_iALUsr, opc, "\t$dst, $a, $b",
+               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
+               Requires<[IsARM, CarryDefIsUnused]> {
+    let Inst{25} = 0;
+  }
+}
+// Carry setting variants
+let Defs = [CPSR] in {
+multiclass AI1_adde_sube_s_irs<bits<4> opcod, string opc, PatFrag opnode,
+                             bit Commutable = 0> {
+  def Sri : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
+                DPFrm, IIC_iALUi, !strconcat(opc, "\t$dst, $a, $b"),
+               [(set GPR:$dst, (opnode GPR:$a, so_imm:$b))]>,
+               Requires<[IsARM, CarryDefIsUsed]> {
+    let Defs = [CPSR];
+    let Inst{20} = 1;   // S bit: produce the outgoing carry in CPSR
+    let Inst{25} = 1;
+  }
+  def Srr : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+                DPFrm, IIC_iALUr, !strconcat(opc, "\t$dst, $a, $b"),
+               [(set GPR:$dst, (opnode GPR:$a, GPR:$b))]>,
+               Requires<[IsARM, CarryDefIsUsed]> {
+    let Defs = [CPSR];
+    let Inst{11-4} = 0b00000000;
+    let Inst{20} = 1;
+    let Inst{25} = 0;
+  }
+  def Srs : AXI1<opcod, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
+                DPSoRegFrm, IIC_iALUsr, !strconcat(opc, "\t$dst, $a, $b"),
+               [(set GPR:$dst, (opnode GPR:$a, so_reg:$b))]>,
+               Requires<[IsARM, CarryDefIsUsed]> {
+    let Defs = [CPSR];
+    let Inst{20} = 1;
+    let Inst{25} = 0;
+  }
+}
+}
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+/// CONSTPOOL_ENTRY - This instruction represents a floating constant pool in
+/// the function.  The first operand is the ID# for this instruction, the second
+/// is the index into the MachineConstantPool that this is, the third is the
+/// size in bytes of this constant pool entry.
+let neverHasSideEffects = 1, isNotDuplicable = 1 in
+def CONSTPOOL_ENTRY :
+PseudoInst<(outs), (ins cpinst_operand:$instid, cpinst_operand:$cpidx,
+                    i32imm:$size), NoItinerary,
+           "${instid:label} ${cpidx:cpentry}", []>;
+
+// Call-frame setup/teardown pseudos.  They only adjust SP; the asm strings
+// are '@'-prefixed so they appear as comments in the emitted assembly.
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary,
+           "@ ADJCALLSTACKUP $amt1",
+           [(ARMcallseq_end timm:$amt1, timm:$amt2)]>;
+
+def ADJCALLSTACKDOWN : 
+PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
+           "@ ADJCALLSTACKDOWN $amt",
+           [(ARMcallseq_start timm:$amt)]>;
+}
+
+// Address computation and loads and stores in PIC mode.
+// Each pseudo emits a local label followed by a pc-relative instruction, so
+// none of them may be duplicated (the label must stay unique).
+let isNotDuplicable = 1 in {
+def PICADD : AXI1<0b0100, (outs GPR:$dst), (ins GPR:$a, pclabel:$cp, pred:$p),
+                  Pseudo, IIC_iALUr, "\n$cp:\n\tadd$p\t$dst, pc, $a",
+                   [(set GPR:$dst, (ARMpic_add GPR:$a, imm:$cp))]>;
+
+// PIC loads: prefer these over generic load selection when the address is
+// pc-relative (hence AddedComplexity).
+let AddedComplexity = 10 in {
+def PICLDR  : AXI2ldw<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+                  Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldr$p\t$dst, $addr",
+                  [(set GPR:$dst, (load addrmodepc:$addr))]>;
+
+def PICLDRH : AXI3ldh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+                Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrh${p}\t$dst, $addr",
+                  [(set GPR:$dst, (zextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRB : AXI2ldb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+                Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrb${p}\t$dst, $addr",
+                  [(set GPR:$dst, (zextloadi8 addrmodepc:$addr))]>;
+
+def PICLDRSH : AXI3ldsh<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+               Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsh${p}\t$dst, $addr",
+                  [(set GPR:$dst, (sextloadi16 addrmodepc:$addr))]>;
+
+def PICLDRSB : AXI3ldsb<(outs GPR:$dst), (ins addrmodepc:$addr, pred:$p),
+               Pseudo, IIC_iLoadr, "\n${addr:label}:\n\tldrsb${p}\t$dst, $addr",
+                  [(set GPR:$dst, (sextloadi8 addrmodepc:$addr))]>;
+}
+// PIC stores (word, halfword, byte).
+let AddedComplexity = 10 in {
+def PICSTR  : AXI2stw<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+               Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstr$p\t$src, $addr",
+               [(store GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRH : AXI3sth<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+               Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrh${p}\t$src, $addr",
+               [(truncstorei16 GPR:$src, addrmodepc:$addr)]>;
+
+def PICSTRB : AXI2stb<(outs), (ins GPR:$src, addrmodepc:$addr, pred:$p),
+               Pseudo, IIC_iStorer, "\n${addr:label}:\n\tstrb${p}\t$src, $addr",
+               [(truncstorei8 GPR:$src, addrmodepc:$addr)]>;
+}
+} // isNotDuplicable = 1
+
+
+// LEApcrel - Load a pc-relative address into a register without offending the
+// assembler.  Emits a .set of the pc-relative delta (label - (local+8); the
+// +8 accounts for the ARM pc read-ahead) and an add of that delta to pc.
+def LEApcrel : AXI1<0x0, (outs GPR:$dst), (ins i32imm:$label, pred:$p),
+                    Pseudo, IIC_iALUi,
+           !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, ($label-(",
+                                 "${:private}PCRELL${:uid}+8))\n"),
+                      !strconcat("${:private}PCRELL${:uid}:\n\t",
+                                 "add$p\t$dst, pc, #${:private}PCRELV${:uid}")),
+                   []>;
+
+// LEApcrelJT - Same, for jump-table labels (label name is suffixed with $id).
+def LEApcrelJT : AXI1<0x0, (outs GPR:$dst),
+                           (ins i32imm:$label, nohash_imm:$id, pred:$p),
+          Pseudo, IIC_iALUi,
+   !strconcat(!strconcat(".set ${:private}PCRELV${:uid}, "
+                         "(${label}_${id}-(",
+                                  "${:private}PCRELL${:uid}+8))\n"),
+                       !strconcat("${:private}PCRELL${:uid}:\n\t",
+                                  "add$p\t$dst, pc, #${:private}PCRELV${:uid}")),
+                   []> {
+    let Inst{25} = 1;   // I bit: immediate operand
+}
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+// Function return: "bx lr", selected from the ARMretflag node.
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in
+  def BX_RET : AI<(outs), (ins), BrMiscFrm, IIC_Br, 
+                  "bx", "\tlr", [(ARMretflag)]> {
+  let Inst{3-0}   = 0b1110;     // Rm = lr (r14)
+  let Inst{7-4}   = 0b0001;     // BX
+  let Inst{19-8}  = 0b111111111111;
+  let Inst{27-20} = 0b00010010;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def BRIND : AXI<(outs), (ins GPR:$dst), BrMiscFrm, IIC_Br, "bx\t$dst",
+                  [(brind GPR:$dst)]> {
+    let Inst{7-4}   = 0b0001;
+    let Inst{19-8}  = 0b111111111111;
+    let Inst{27-20} = 0b00010010;
+    let Inst{31-28} = 0b1110;   // condition = AL (unpredicated)
+  }
+}
+
+// FIXME: remove when we have a way to marking a MI with these properties.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+// LDM_RET - load-multiple that pops pc, i.e. a pop-style function return.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+  def LDM_RET : AXI4ld<(outs),
+                    (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+                    LdStMulFrm, IIC_Br, "ldm${addr:submode}${p}\t$addr, $wb",
+                    []>;
+
+// On non-Darwin platforms R9 is callee-saved.
+// Calls clobber the caller-saved GPRs, the VFP/NEON D registers, CPSR and
+// FPSCR.  Note R9 is *not* in this clobber list (contrast the Darwin set
+// below).
+let isCall = 1,
+  Defs = [R0,  R1,  R2,  R3,  R12, LR,
+          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+          D16, D17, D18, D19, D20, D21, D22, D23,
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+  def BL  : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+                IIC_Br, "bl\t${func:call}",
+                [(ARMcall tglobaladdr:$func)]>,
+            Requires<[IsARM, IsNotDarwin]> {
+    let Inst{31-28} = 0b1110;   // condition = AL (BL itself is unpredicated)
+  }
+
+  def BL_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+                   IIC_Br, "bl", "\t${func:call}",
+                   [(ARMcall_pred tglobaladdr:$func)]>,
+                Requires<[IsARM, IsNotDarwin]>;
+
+  // ARMv5T and above
+  def BLX : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
+                IIC_Br, "blx\t$func",
+                [(ARMcall GPR:$func)]>,
+            Requires<[IsARM, HasV5T, IsNotDarwin]> {
+    let Inst{7-4}   = 0b0011;   // BLX (register)
+    let Inst{19-8}  = 0b111111111111;
+    let Inst{27-20} = 0b00010010;
+  }
+
+  // ARMv4T
+  // Two-instruction sequence: set up lr manually, then branch (no BLX).
+  def BX : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+                  IIC_Br, "mov\tlr, pc\n\tbx\t$func",
+                  [(ARMcall_nolink GPR:$func)]>,
+           Requires<[IsARM, IsNotDarwin]> {
+    let Inst{7-4}   = 0b0001;   // BX
+    let Inst{19-8}  = 0b111111111111;
+    let Inst{27-20} = 0b00010010;
+  }
+}
+
+// On Darwin R9 is call-clobbered.
+// Identical to the set above except R9 is added to the clobber list; the
+// *r9 opcode variants exist solely to carry that difference.
+let isCall = 1,
+  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
+          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+          D16, D17, D18, D19, D20, D21, D22, D23,
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+  def BLr9  : ABXI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+                IIC_Br, "bl\t${func:call}",
+                [(ARMcall tglobaladdr:$func)]>, Requires<[IsARM, IsDarwin]> {
+    let Inst{31-28} = 0b1110;   // condition = AL
+  }
+
+  def BLr9_pred : ABI<0b1011, (outs), (ins i32imm:$func, variable_ops),
+                   IIC_Br, "bl", "\t${func:call}",
+                   [(ARMcall_pred tglobaladdr:$func)]>,
+                  Requires<[IsARM, IsDarwin]>;
+
+  // ARMv5T and above
+  def BLXr9 : AXI<(outs), (ins GPR:$func, variable_ops), BrMiscFrm,
+                IIC_Br, "blx\t$func",
+                [(ARMcall GPR:$func)]>, Requires<[IsARM, HasV5T, IsDarwin]> {
+    let Inst{7-4}   = 0b0011;   // BLX (register)
+    let Inst{19-8}  = 0b111111111111;
+    let Inst{27-20} = 0b00010010;
+  }
+
+  // ARMv4T
+  def BXr9 : ABXIx2<(outs), (ins GPR:$func, variable_ops),
+                  IIC_Br, "mov\tlr, pc\n\tbx\t$func",
+                  [(ARMcall_nolink GPR:$func)]>, Requires<[IsARM, IsDarwin]> {
+    let Inst{7-4}   = 0b0001;   // BX
+    let Inst{19-8}  = 0b111111111111;
+    let Inst{27-20} = 0b00010010;
+  }
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  // B is "predicable" since it can be xformed into a Bcc.
+  let isBarrier = 1 in {
+    let isPredicable = 1 in
+    def B : ABXI<0b1010, (outs), (ins brtarget:$target), IIC_Br,
+                "b\t$target", [(br bb:$target)]>;
+
+  // Jump-table dispatch.  Each form embeds the table ($jt) after the branch
+  // and may not be duplicated (the table is emitted once).
+  let isNotDuplicable = 1, isIndirectBranch = 1 in {
+  // Register already holds the target address: mov pc, $target.
+  def BR_JTr : JTI<(outs), (ins GPR:$target, jtblock_operand:$jt, i32imm:$id),
+                    IIC_Br, "mov\tpc, $target \n$jt",
+                    [(ARMbrjt GPR:$target, tjumptable:$jt, imm:$id)]> {
+    let Inst{11-4}  = 0b00000000;
+    let Inst{15-12} = 0b1111;   // Rd = pc
+    let Inst{20}    = 0; // S Bit
+    let Inst{24-21} = 0b1101;   // MOV
+    let Inst{27-25} = 0b000;
+  }
+  // Target loaded from memory: ldr pc, $target.
+  def BR_JTm : JTI<(outs),
+                   (ins addrmode2:$target, jtblock_operand:$jt, i32imm:$id),
+                   IIC_Br, "ldr\tpc, $target \n$jt",
+                   [(ARMbrjt (i32 (load addrmode2:$target)), tjumptable:$jt,
+                     imm:$id)]> {
+    let Inst{15-12} = 0b1111;   // Rt = pc
+    let Inst{20}    = 1; // L bit
+    let Inst{21}    = 0; // W bit
+    let Inst{22}    = 0; // B bit
+    let Inst{24}    = 1; // P bit
+    let Inst{27-25} = 0b011;
+  }
+  // Target computed as base + index: add pc, $target, $idx.
+  def BR_JTadd : JTI<(outs),
+                   (ins GPR:$target, GPR:$idx, jtblock_operand:$jt, i32imm:$id),
+                    IIC_Br, "add\tpc, $target, $idx \n$jt",
+                    [(ARMbrjt (add GPR:$target, GPR:$idx), tjumptable:$jt,
+                      imm:$id)]> {
+    let Inst{15-12} = 0b1111;   // Rd = pc
+    let Inst{20}    = 0; // S bit
+    let Inst{24-21} = 0b0100;   // ADD
+    let Inst{27-25} = 0b000;
+  }
+  } // isNotDuplicable = 1, isIndirectBranch = 1
+  } // isBarrier = 1
+
+  // FIXME: should be able to write a pattern for ARMBrcond, but can't use
+  // a two-value operand where a dag node expects two operands. :( 
+  def Bcc : ABI<0b1010, (outs), (ins brtarget:$target),
+               IIC_Br, "b", "\t$target",
+               [/*(ARMbrcond bb:$target, imm:$cc, CCR:$ccr)*/]>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+// Load
+// Basic word load from an addrmode2 address.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in 
+def LDR  : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
+               "ldr", "\t$dst, $addr",
+               [(set GPR:$dst, (load addrmode2:$addr))]>;
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    mayHaveSideEffects = 1  in
+def LDRcp : AI2ldw<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, IIC_iLoadr,
+                 "ldr", "\t$dst, $addr", []>;
+
+// Loads with zero extension
+def LDRH  : AI3ldh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
+                  IIC_iLoadr, "ldrh", "\t$dst, $addr",
+                  [(set GPR:$dst, (zextloadi16 addrmode3:$addr))]>;
+
+def LDRB  : AI2ldb<(outs GPR:$dst), (ins addrmode2:$addr), LdFrm, 
+                  IIC_iLoadr, "ldrb", "\t$dst, $addr",
+                  [(set GPR:$dst, (zextloadi8 addrmode2:$addr))]>;
+
+// Loads with sign extension
+def LDRSH : AI3ldsh<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoadr, "ldrsh", "\t$dst, $addr",
+                   [(set GPR:$dst, (sextloadi16 addrmode3:$addr))]>;
+
+def LDRSB : AI3ldsb<(outs GPR:$dst), (ins addrmode3:$addr), LdMiscFrm,
+                   IIC_iLoadr, "ldrsb", "\t$dst, $addr",
+                   [(set GPR:$dst, (sextloadi8 addrmode3:$addr))]>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+// Load doubleword
+// NOTE(review): only $dst1 is named in the asm string; presumably the
+// printer/encoding supplies the second register of the pair -- confirm.
+def LDRD : AI3ldd<(outs GPR:$dst1, GPR:$dst2), (ins addrmode3:$addr), LdMiscFrm,
+                 IIC_iLoadr, "ldrd", "\t$dst1, $addr",
+                 []>, Requires<[IsARM, HasV5TE]>;
+
+// Indexed loads
+// _PRE forms tie the writeback result to the address base via the
+// "$addr.base = $base_wb" constraint; _POST forms take base and offset
+// as separate operands and tie "$base = $base_wb".
+def LDR_PRE  : AI2ldwpr<(outs GPR:$dst, GPR:$base_wb),
+                     (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
+                     "ldr", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDR_POST : AI2ldwpo<(outs GPR:$dst, GPR:$base_wb),
+                     (ins GPR:$base, am2offset:$offset), LdFrm, IIC_iLoadru,
+                     "ldr", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRH_PRE  : AI3ldhpr<(outs GPR:$dst, GPR:$base_wb),
+                     (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
+                     "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRH_POST : AI3ldhpo<(outs GPR:$dst, GPR:$base_wb),
+                     (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
+                    "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRB_PRE  : AI2ldbpr<(outs GPR:$dst, GPR:$base_wb),
+                     (ins addrmode2:$addr), LdFrm, IIC_iLoadru,
+                     "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRB_POST : AI2ldbpo<(outs GPR:$dst, GPR:$base_wb),
+                     (ins GPR:$base,am2offset:$offset), LdFrm, IIC_iLoadru,
+                    "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRSH_PRE : AI3ldshpr<(outs GPR:$dst, GPR:$base_wb),
+                      (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
+                      "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRSH_POST: AI3ldshpo<(outs GPR:$dst, GPR:$base_wb),
+                      (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
+                   "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
+
+def LDRSB_PRE : AI3ldsbpr<(outs GPR:$dst, GPR:$base_wb),
+                      (ins addrmode3:$addr), LdMiscFrm, IIC_iLoadru,
+                      "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb", []>;
+
+def LDRSB_POST: AI3ldsbpo<(outs GPR:$dst, GPR:$base_wb),
+                      (ins GPR:$base,am3offset:$offset), LdMiscFrm, IIC_iLoadru,
+                   "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb", []>;
+}
+
+// Store
+// Basic word store to an addrmode2 address.
+def STR  : AI2stw<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
+               "str", "\t$src, $addr",
+               [(store GPR:$src, addrmode2:$addr)]>;
+
+// Stores with truncate
+def STRH : AI3sth<(outs), (ins GPR:$src, addrmode3:$addr), StMiscFrm, IIC_iStorer,
+               "strh", "\t$src, $addr",
+               [(truncstorei16 GPR:$src, addrmode3:$addr)]>;
+
+def STRB : AI2stb<(outs), (ins GPR:$src, addrmode2:$addr), StFrm, IIC_iStorer,
+               "strb", "\t$src, $addr",
+               [(truncstorei8 GPR:$src, addrmode2:$addr)]>;
+
+// Store doubleword
+// NOTE(review): like LDRD, only $src1 is named in the asm string;
+// presumably the register pair is handled by the printer -- confirm.
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def STRD : AI3std<(outs), (ins GPR:$src1, GPR:$src2, addrmode3:$addr),
+               StMiscFrm, IIC_iStorer,
+               "strd", "\t$src1, $addr", []>, Requires<[IsARM, HasV5TE]>;
+
+// Indexed stores
+// Unlike the indexed loads, these have ISel patterns (pre_store/post_store
+// and the truncating variants); each produces the updated base in $base_wb.
+def STR_PRE  : AI2stwpr<(outs GPR:$base_wb),
+                     (ins GPR:$src, GPR:$base, am2offset:$offset), 
+                     StFrm, IIC_iStoreru,
+                    "str", "\t$src, [$base, $offset]!", "$base = $base_wb",
+                    [(set GPR:$base_wb,
+                      (pre_store GPR:$src, GPR:$base, am2offset:$offset))]>;
+
+def STR_POST : AI2stwpo<(outs GPR:$base_wb),
+                     (ins GPR:$src, GPR:$base,am2offset:$offset), 
+                     StFrm, IIC_iStoreru,
+                    "str", "\t$src, [$base], $offset", "$base = $base_wb",
+                    [(set GPR:$base_wb,
+                      (post_store GPR:$src, GPR:$base, am2offset:$offset))]>;
+
+def STRH_PRE : AI3sthpr<(outs GPR:$base_wb),
+                     (ins GPR:$src, GPR:$base,am3offset:$offset), 
+                     StMiscFrm, IIC_iStoreru,
+                     "strh", "\t$src, [$base, $offset]!", "$base = $base_wb",
+                    [(set GPR:$base_wb,
+                      (pre_truncsti16 GPR:$src, GPR:$base,am3offset:$offset))]>;
+
+def STRH_POST: AI3sthpo<(outs GPR:$base_wb),
+                     (ins GPR:$src, GPR:$base,am3offset:$offset), 
+                     StMiscFrm, IIC_iStoreru,
+                     "strh", "\t$src, [$base], $offset", "$base = $base_wb",
+                    [(set GPR:$base_wb, (post_truncsti16 GPR:$src,
+                                         GPR:$base, am3offset:$offset))]>;
+
+def STRB_PRE : AI2stbpr<(outs GPR:$base_wb),
+                     (ins GPR:$src, GPR:$base,am2offset:$offset), 
+                     StFrm, IIC_iStoreru,
+                     "strb", "\t$src, [$base, $offset]!", "$base = $base_wb",
+                    [(set GPR:$base_wb, (pre_truncsti8 GPR:$src,
+                                         GPR:$base, am2offset:$offset))]>;
+
+def STRB_POST: AI2stbpo<(outs GPR:$base_wb),
+                     (ins GPR:$src, GPR:$base,am2offset:$offset), 
+                     StFrm, IIC_iStoreru,
+                     "strb", "\t$src, [$base], $offset", "$base = $base_wb",
+                    [(set GPR:$base_wb, (post_truncsti8 GPR:$src,
+                                         GPR:$base, am2offset:$offset))]>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// Load/store multiple.  ${addr:submode} prints the addressing submode
+// (ia/ib/da/db).  NOTE(review): the reglist operand is named $wb here --
+// it holds the register list, not a writeback result; the actual registers
+// come in via variable_ops.  Confirm against the operand printer.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def LDM : AXI4ld<(outs),
+               (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+               LdStMulFrm, IIC_iLoadm, "ldm${addr:submode}${p}\t$addr, $wb",
+               []>;
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def STM : AXI4st<(outs),
+               (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+               LdStMulFrm, IIC_iStorem, "stm${addr:submode}${p}\t$addr, $wb",
+               []>;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+// Register, shifted-register, and immediate moves.  Inst{25} is the
+// immediate ('I') bit: 1 = immediate operand, 0 = register/shift operand.
+let neverHasSideEffects = 1 in
+def MOVr : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
+                "mov", "\t$dst, $src", []>, UnaryDP {
+  let Inst{11-4} = 0b00000000;  // no shift
+  let Inst{25} = 0;
+}
+
+def MOVs : AsI1<0b1101, (outs GPR:$dst), (ins so_reg:$src), 
+                DPSoRegFrm, IIC_iMOVsr,
+                "mov", "\t$dst, $src", [(set GPR:$dst, so_reg:$src)]>, UnaryDP {
+  let Inst{25} = 0;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVi : AsI1<0b1101, (outs GPR:$dst), (ins so_imm:$src), DPFrm, IIC_iMOVi,
+                "mov", "\t$dst, $src", [(set GPR:$dst, so_imm:$src)]>, UnaryDP {
+  let Inst{25} = 1;
+}
+
+// MOVW: load a 16-bit immediate into the low half, zeroing the top (v6T2+).
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVi16 : AI1<0b1000, (outs GPR:$dst), (ins i32imm:$src), 
+                 DPFrm, IIC_iMOVi,
+                 "movw", "\t$dst, $src",
+                 [(set GPR:$dst, imm0_65535:$src)]>,
+                 Requires<[IsARM, HasV6T2]>, UnaryDP {
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+}
+
+// MOVT: write the top 16 bits, preserving the bottom 16 (hence the
+// read-modify-write $src = $dst constraint).
+let Constraints = "$src = $dst" in
+def MOVTi16 : AI1<0b1010, (outs GPR:$dst), (ins GPR:$src, i32imm:$imm),
+                  DPFrm, IIC_iMOVi,
+                  "movt", "\t$dst, $imm",
+                  [(set GPR:$dst,
+                        (or (and GPR:$src, 0xffff), 
+                            lo16AllZero:$imm))]>, UnaryDP,
+                  Requires<[IsARM, HasV6T2]> {
+  let Inst{20} = 0;
+  let Inst{25} = 1;
+}
+
+// (or x, 0xffff0000) is MOVT with an all-ones top half.
+def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>,
+      Requires<[IsARM, HasV6T2]>;
+
+// RRX shifts right by one, rotating the carry flag in -- hence Uses=[CPSR].
+let Uses = [CPSR] in
+def MOVrx : AsI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, IIC_iMOVsi,
+                 "mov", "\t$dst, $src, rrx",
+                 [(set GPR:$dst, (ARMrrx GPR:$src))]>, UnaryDP;
+
+// These aren't really mov instructions, but we have to define them this way
+// due to flag operands.
+
+let Defs = [CPSR] in {
+def MOVsrl_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo, 
+                      IIC_iMOVsi, "movs", "\t$dst, $src, lsr #1",
+                      [(set GPR:$dst, (ARMsrl_flag GPR:$src))]>, UnaryDP;
+def MOVsra_flag : AI1<0b1101, (outs GPR:$dst), (ins GPR:$src), Pseudo,
+                      IIC_iMOVsi, "movs", "\t$dst, $src, asr #1",
+                      [(set GPR:$dst, (ARMsra_flag GPR:$src))]>, UnaryDP;
+}
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+// Sign extension: matched from sext_inreg; the _rrot multiclasses also
+// provide rotated-source variants.
+defm SXTB  : AI_unary_rrot<0b01101010,
+                           "sxtb", UnOpFrag<(sext_inreg node:$Src, i8)>>;
+defm SXTH  : AI_unary_rrot<0b01101011,
+                           "sxth", UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+// Sign-extend-and-add forms.
+defm SXTAB : AI_bin_rrot<0b01101010,
+               "sxtab", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+defm SXTAH : AI_bin_rrot<0b01101011,
+               "sxtah", BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+
+// TODO: SXT(A){B|H}16
+
+// Zero extenders
+
+// AddedComplexity prefers these over plain AND-immediate selections.
+let AddedComplexity = 16 in {
+defm UXTB   : AI_unary_rrot<0b01101110,
+                            "uxtb"  , UnOpFrag<(and node:$Src, 0x000000FF)>>;
+defm UXTH   : AI_unary_rrot<0b01101111,
+                            "uxth"  , UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+defm UXTB16 : AI_unary_rrot<0b01101100,
+                            "uxtb16", UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+// Shift-then-mask forms of UXTB16 expressed via the rotate operand.
+def : ARMV6Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
+               (UXTB16r_rot GPR:$Src, 24)>;
+def : ARMV6Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
+               (UXTB16r_rot GPR:$Src, 8)>;
+
+defm UXTAB : AI_bin_rrot<0b01101110, "uxtab",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+defm UXTAH : AI_bin_rrot<0b01101111, "uxtah",
+                        BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+}
+
+// This isn't safe in general, the add is two 16-bit units, not a 32-bit add.
+//defm UXTAB16 : xxx<"uxtab16", 0xff00ff>;
+
+// TODO: UXT(A){B|H}16
+
+// Signed/unsigned bit-field extract (v6T2+).  No ISel pattern; selected in
+// C++ / by peepholes.  The two differ only in bit 22 of the class opcode
+// (0b0111101 vs 0b0111111).
+def SBFX  : I<(outs GPR:$dst),
+              (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+               "sbfx", "\t$dst, $src, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  let Inst{27-21} = 0b0111101;
+  let Inst{6-4}   = 0b101;
+}
+
+def UBFX  : I<(outs GPR:$dst),
+              (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iALUi,
+               "ubfx", "\t$dst, $src, $lsb, $width", "", []>,
+               Requires<[IsARM, HasV6T2]> {
+  let Inst{27-21} = 0b0111111;
+  let Inst{6-4}   = 0b101;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+// Basic add/sub.  The trailing "1" argument marks the operation as
+// commutable (presumably consumed by the multiclass -- confirm there).
+defm ADD  : AsI1_bin_irs<0b0100, "add",
+                         BinOpFrag<(add  node:$LHS, node:$RHS)>, 1>;
+defm SUB  : AsI1_bin_irs<0b0010, "sub",
+                         BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set.
+defm ADDS : AI1_bin_s_irs<0b0100, "adds",
+                          BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
+defm SUBS : AI1_bin_s_irs<0b0010, "subs",
+                          BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+
+// Carry-using add/sub (adde/sube), with and without the 's' bit.
+defm ADC : AI1_adde_sube_irs<0b0101, "adc",
+                             BinOpFrag<(adde node:$LHS, node:$RHS)>, 1>;
+defm SBC : AI1_adde_sube_irs<0b0110, "sbc",
+                             BinOpFrag<(sube node:$LHS, node:$RHS)>>;
+defm ADCS : AI1_adde_sube_s_irs<0b0101, "adcs",
+                             BinOpFrag<(adde node:$LHS, node:$RHS)>, 1>;
+defm SBCS : AI1_adde_sube_s_irs<0b0110, "sbcs",
+                             BinOpFrag<(sube node:$LHS, node:$RHS)>>;
+
+// Reverse subtract: dst = $b - $a.
+// These don't define reg/reg forms, because they are handled above.
+def RSBri : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+                  IIC_iALUi, "rsb", "\t$dst, $a, $b",
+                  [(set GPR:$dst, (sub so_imm:$b, GPR:$a))]> {
+    let Inst{25} = 1;
+}
+
+def RSBrs : AsI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+                  IIC_iALUsr, "rsb", "\t$dst, $a, $b",
+                  [(set GPR:$dst, (sub so_reg:$b, GPR:$a))]> {
+    let Inst{25} = 0;
+}
+
+// RSB with 's' bit set.
+let Defs = [CPSR] in {
+def RSBSri : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_imm:$b), DPFrm,
+                 IIC_iALUi, "rsbs", "\t$dst, $a, $b",
+                 [(set GPR:$dst, (subc so_imm:$b, GPR:$a))]> {
+    let Inst{20} = 1;
+    let Inst{25} = 1;
+}
+def RSBSrs : AI1<0b0011, (outs GPR:$dst), (ins GPR:$a, so_reg:$b), DPSoRegFrm,
+                 IIC_iALUsr, "rsbs", "\t$dst, $a, $b",
+                 [(set GPR:$dst, (subc so_reg:$b, GPR:$a))]> {
+    let Inst{20} = 1;
+    let Inst{25} = 0;
+}
+}
+
+// Reverse subtract with carry.
+let Uses = [CPSR] in {
+def RSCri : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
+                 DPFrm, IIC_iALUi, "rsc", "\t$dst, $a, $b",
+                 [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>,
+                 Requires<[IsARM, CarryDefIsUnused]> {
+    let Inst{25} = 1;
+}
+def RSCrs : AsI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
+                 DPSoRegFrm, IIC_iALUsr, "rsc", "\t$dst, $a, $b",
+                 [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>,
+                 Requires<[IsARM, CarryDefIsUnused]> {
+    let Inst{25} = 0;
+}
+}
+
+// FIXME: Allow these to be predicated.
+let Defs = [CPSR], Uses = [CPSR] in {
+def RSCSri : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_imm:$b),
+                  DPFrm, IIC_iALUi, "rscs\t$dst, $a, $b",
+                  [(set GPR:$dst, (sube so_imm:$b, GPR:$a))]>,
+                  Requires<[IsARM, CarryDefIsUnused]> {
+    let Inst{20} = 1;
+    let Inst{25} = 1;
+}
+def RSCSrs : AXI1<0b0111, (outs GPR:$dst), (ins GPR:$a, so_reg:$b),
+                  DPSoRegFrm, IIC_iALUsr, "rscs\t$dst, $a, $b",
+                  [(set GPR:$dst, (sube so_reg:$b, GPR:$a))]>,
+                  Requires<[IsARM, CarryDefIsUnused]> {
+    let Inst{20} = 1;
+    let Inst{25} = 0;
+}
+}
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+def : ARMPat<(add    GPR:$src, so_imm_neg:$imm),
+             (SUBri  GPR:$src, so_imm_neg:$imm)>;
+
+//def : ARMPat<(addc   GPR:$src, so_imm_neg:$imm),
+//             (SUBSri GPR:$src, so_imm_neg:$imm)>;
+//def : ARMPat<(adde   GPR:$src, so_imm_neg:$imm),
+//             (SBCri  GPR:$src, so_imm_neg:$imm)>;
+
+// Note: These are implemented in C++ code, because they have to generate
+// ADD/SUBrs instructions, which use a complex pattern that a xform function
+// cannot produce.
+// (mul X, 2^n+1) -> (add (X << n), X)
+// (mul X, 2^n-1) -> (rsb X, (X << n))
+
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+// Standard logical ops; BIC is non-commutable (and x, (not y)).
+defm AND   : AsI1_bin_irs<0b0000, "and",
+                          BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm ORR   : AsI1_bin_irs<0b1100, "orr",
+                          BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
+defm EOR   : AsI1_bin_irs<0b0001, "eor",
+                          BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+defm BIC   : AsI1_bin_irs<0b1110, "bic",
+                          BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+// Bit-field clear (v6T2+): matched from AND with an inverted contiguous
+// mask (bf_inv_mask_imm).  Read-modify-write, hence $src = $dst.
+def BFC    : I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+               AddrMode1, Size4Bytes, IndexModeNone, DPFrm, IIC_iUNAsi,
+               "bfc", "\t$dst, $imm", "$src = $dst",
+               [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]>,
+               Requires<[IsARM, HasV6T2]> {
+  let Inst{27-21} = 0b0111110;
+  let Inst{6-0}   = 0b0011111;
+}
+
+// Bitwise NOT: register, shifted-register, and immediate forms.
+def  MVNr  : AsI1<0b1111, (outs GPR:$dst), (ins GPR:$src), DPFrm, IIC_iMOVr,
+                  "mvn", "\t$dst, $src",
+                  [(set GPR:$dst, (not GPR:$src))]>, UnaryDP {
+  let Inst{25} = 0;
+  let Inst{11-4} = 0b00000000;
+}
+def  MVNs  : AsI1<0b1111, (outs GPR:$dst), (ins so_reg:$src), DPSoRegFrm,
+                  IIC_iMOVsr, "mvn", "\t$dst, $src",
+                  [(set GPR:$dst, (not so_reg:$src))]>, UnaryDP {
+  let Inst{25} = 0;
+}
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def  MVNi  : AsI1<0b1111, (outs GPR:$dst), (ins so_imm:$imm), DPFrm, 
+                  IIC_iMOVi, "mvn", "\t$dst, $imm",
+                  [(set GPR:$dst, so_imm_not:$imm)]>,UnaryDP {
+    let Inst{25} = 1;
+}
+
+// (and x, imm) where ~imm is a valid so_imm becomes BIC.
+def : ARMPat<(and   GPR:$src, so_imm_not:$imm),
+             (BICri GPR:$src, so_imm_not:$imm)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+
+// 32x32->32 multiply and multiply-accumulate.
+let isCommutable = 1 in
+def MUL   : AsMul1I<0b0000000, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+                   IIC_iMUL32, "mul", "\t$dst, $a, $b",
+                   [(set GPR:$dst, (mul GPR:$a, GPR:$b))]>;
+
+def MLA   : AsMul1I<0b0000001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+                    IIC_iMAC32, "mla", "\t$dst, $a, $b, $c",
+                   [(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]>;
+
+// Multiply-subtract (v6T2+): dst = $c - $a * $b.
+def MLS   : AMul1I<0b0000011, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+                   IIC_iMAC32, "mls", "\t$dst, $a, $b, $c",
+                   [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]>,
+                   Requires<[IsARM, HasV6T2]>;
+
+// Extra precision multiplies with low / high results
+// (64-bit results split across $ldst/$hdst; no ISel patterns here --
+// selected from mulhs/mulhu etc. in C++.)
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def SMULL : AsMul1I<0b0000110, (outs GPR:$ldst, GPR:$hdst),
+                               (ins GPR:$a, GPR:$b), IIC_iMUL64,
+                    "smull", "\t$ldst, $hdst, $a, $b", []>;
+
+def UMULL : AsMul1I<0b0000100, (outs GPR:$ldst, GPR:$hdst),
+                               (ins GPR:$a, GPR:$b), IIC_iMUL64,
+                    "umull", "\t$ldst, $hdst, $a, $b", []>;
+}
+
+// Multiply + accumulate
+def SMLAL : AsMul1I<0b0000111, (outs GPR:$ldst, GPR:$hdst),
+                               (ins GPR:$a, GPR:$b), IIC_iMAC64,
+                    "smlal", "\t$ldst, $hdst, $a, $b", []>;
+
+def UMLAL : AsMul1I<0b0000101, (outs GPR:$ldst, GPR:$hdst),
+                               (ins GPR:$a, GPR:$b), IIC_iMAC64,
+                    "umlal", "\t$ldst, $hdst, $a, $b", []>;
+
+def UMAAL : AMul1I <0b0000010, (outs GPR:$ldst, GPR:$hdst),
+                               (ins GPR:$a, GPR:$b), IIC_iMAC64,
+                    "umaal", "\t$ldst, $hdst, $a, $b", []>,
+                    Requires<[IsARM, HasV6]>;
+} // neverHasSideEffects
+
+// Most significant word multiply
+def SMMUL : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+               IIC_iMUL32, "smmul", "\t$dst, $a, $b",
+               [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{7-4}   = 0b0001;
+  let Inst{15-12} = 0b1111;  // Ra field unused (SBO)
+}
+
+def SMMLA : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+               IIC_iMAC32, "smmla", "\t$dst, $a, $b, $c",
+               [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{7-4}   = 0b0001;
+}
+
+
+def SMMLS : AMul2I <0b0111010, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c),
+               IIC_iMAC32, "smmls", "\t$dst, $a, $b, $c",
+               [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]>,
+            Requires<[IsARM, HasV6]> {
+  let Inst{7-4}   = 0b1101;
+}
+
+// Halfword multiplies SMUL<x><y>/SMULW<y> (v5TE+).  Inst{5}/Inst{6} are the
+// x/y half-select bits: 0 = bottom 16 bits (sext_inreg), 1 = top 16 bits
+// (sra 16).  The W forms multiply 32x16 and keep the top 32 of the 48-bit
+// product (hence the outer sra 16).
+multiclass AI_smul<string opc, PatFrag opnode> {
+  def BB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+              IIC_iMUL32, !strconcat(opc, "bb"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+                                      (sext_inreg GPR:$b, i16)))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 0;
+             let Inst{6} = 0;
+           }
+
+  def BT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+              IIC_iMUL32, !strconcat(opc, "bt"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+                                      (sra GPR:$b, (i32 16))))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 0;
+             let Inst{6} = 1;
+           }
+
+  def TB : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+              IIC_iMUL32, !strconcat(opc, "tb"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+                                      (sext_inreg GPR:$b, i16)))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 1;
+             let Inst{6} = 0;
+           }
+
+  def TT : AMulxyI<0b0001011, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+              IIC_iMUL32, !strconcat(opc, "tt"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+                                      (sra GPR:$b, (i32 16))))]>,
+            Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 1;
+             let Inst{6} = 1;
+           }
+
+  def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+              IIC_iMUL16, !strconcat(opc, "wb"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (sra (opnode GPR:$a,
+                                    (sext_inreg GPR:$b, i16)), (i32 16)))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 1;
+             let Inst{6} = 0;
+           }
+
+  def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b),
+              IIC_iMUL16, !strconcat(opc, "wt"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (sra (opnode GPR:$a,
+                                    (sra GPR:$b, (i32 16))), (i32 16)))]>,
+            Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 1;
+             let Inst{6} = 1;
+           }
+}
+
+
+// Halfword multiply-accumulate SMLA<x><y>/SMLAW<y> (v5TE+).  Note the WB/WT
+// forms use Inst{5} = 0 (unlike AI_smul's WB/WT), matching the distinct
+// SMLAW<y> (1y00) vs SMULW<y> (1y10) encodings.
+multiclass AI_smla<string opc, PatFrag opnode> {
+  def BB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+              IIC_iMAC16, !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc,
+                               (opnode (sext_inreg GPR:$a, i16),
+                                       (sext_inreg GPR:$b, i16))))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 0;
+             let Inst{6} = 0;
+           }
+
+  def BT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+              IIC_iMAC16, !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
+                                                     (sra GPR:$b, (i32 16)))))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 0;
+             let Inst{6} = 1;
+           }
+
+  def TB : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+              IIC_iMAC16, !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+                                                 (sext_inreg GPR:$b, i16))))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 1;
+             let Inst{6} = 0;
+           }
+
+  def TT : AMulxyI<0b0001000, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+              IIC_iMAC16, !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+                                                    (sra GPR:$b, (i32 16)))))]>,
+            Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 1;
+             let Inst{6} = 1;
+           }
+
+  def WB : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+              IIC_iMAC16, !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                       (sext_inreg GPR:$b, i16)), (i32 16))))]>,
+           Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 0;
+             let Inst{6} = 0;
+           }
+
+  def WT : AMulxyI<0b0001001, (outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc),
+              IIC_iMAC16, !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                         (sra GPR:$b, (i32 16))), (i32 16))))]>,
+            Requires<[IsARM, HasV5TE]> {
+             let Inst{5} = 0;
+             let Inst{6} = 1;
+           }
+}
+
+defm SMUL : AI_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+defm SMLA : AI_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>;
+
+// TODO: Halfword multiple accumulate long: SMLAL<x><y>
+// TODO: Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+// Count leading zeros (v5T+).  The all-ones fields below are SBO bits.
+def CLZ  : AMiscA1I<0b000010110, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+              "clz", "\t$dst, $src",
+              [(set GPR:$dst, (ctlz GPR:$src))]>, Requires<[IsARM, HasV5T]> {
+  let Inst{7-4}   = 0b0001;
+  let Inst{11-8}  = 0b1111;
+  let Inst{19-16} = 0b1111;
+}
+
+// Reverse the bit order of a word (v6T2+).
+def RBIT : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+              "rbit", "\t$dst, $src",
+              [(set GPR:$dst, (ARMrbit GPR:$src))]>,
+           Requires<[IsARM, HasV6T2]> {
+  let Inst{7-4}   = 0b0011;
+  let Inst{11-8}  = 0b1111;
+  let Inst{19-16} = 0b1111;
+}
+
+// Byte-reverse a word (v6+): matched from bswap.
+def REV  : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+              "rev", "\t$dst, $src",
+              [(set GPR:$dst, (bswap GPR:$src))]>, Requires<[IsARM, HasV6]> {
+  let Inst{7-4}   = 0b0011;
+  let Inst{11-8}  = 0b1111;
+  let Inst{19-16} = 0b1111;
+}
+
+// Byte-reverse each halfword independently; matched from the expanded
+// shift/mask/or dag below.
+def REV16 : AMiscA1I<0b01101011, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+               "rev16", "\t$dst, $src",
+               [(set GPR:$dst,
+                   (or (and (srl GPR:$src, (i32 8)), 0xFF),
+                       (or (and (shl GPR:$src, (i32 8)), 0xFF00),
+                           (or (and (srl GPR:$src, (i32 8)), 0xFF0000),
+                               (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>,
+               Requires<[IsARM, HasV6]> {
+  let Inst{7-4}   = 0b1011;
+  let Inst{11-8}  = 0b1111;
+  let Inst{19-16} = 0b1111;
+}
+
+// Byte-reverse the bottom halfword and sign-extend the result.
+def REVSH : AMiscA1I<0b01101111, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
+               "revsh", "\t$dst, $src",
+               [(set GPR:$dst,
+                  (sext_inreg
+                    (or (srl (and GPR:$src, 0xFF00), (i32 8)),
+                        (shl GPR:$src, (i32 8))), i16))]>,
+               Requires<[IsARM, HasV6]> {
+  let Inst{7-4}   = 0b1011;
+  let Inst{11-8}  = 0b1111;
+  let Inst{19-16} = 0b1111;
+}
+
+// Pack low halfword of $src1 with (shifted) high halfword of $src2.
+def PKHBT : AMiscA1I<0b01101000, (outs GPR:$dst),
+                                 (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+               IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, LSL $shamt",
+               [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
+                                   (and (shl GPR:$src2, (i32 imm:$shamt)),
+                                        0xFFFF0000)))]>,
+               Requires<[IsARM, HasV6]> {
+  let Inst{6-4} = 0b001;
+}
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)),
+               (PKHBT GPR:$src1, GPR:$src2, 0)>;
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
+               (PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>;
+
+
+// Pack high halfword of $src1 with (arithmetically shifted) low halfword
+// of $src2.
+def PKHTB : AMiscA1I<0b01101000, (outs GPR:$dst),
+                                 (ins GPR:$src1, GPR:$src2, i32imm:$shamt),
+               IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, ASR $shamt",
+               [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
+                                   (and (sra GPR:$src2, imm16_31:$shamt),
+                                        0xFFFF)))]>, Requires<[IsARM, HasV6]> {
+  let Inst{6-4} = 0b101;
+}
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))),
+               (PKHTB GPR:$src1, GPR:$src2, 16)>;
+def : ARMV6Pat<(or (and GPR:$src1, 0xFFFF0000),
+                   (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
+               (PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+defm CMP  : AI1_cmp_irs<0b1010, "cmp",
+                        BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+//FIXME: Disable CMN, as CCodes are backwards from compare expectations
+//       Compare-to-zero still works out, just not the relationals
+//defm CMN  : AI1_cmp_irs<0b1011, "cmn",
+//                        BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
+
+// Note that TST/TEQ don't set all the same flags that CMP does!
+defm TST  : AI1_cmp_irs<0b1000, "tst",
+                        BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>, 1>;
+defm TEQ  : AI1_cmp_irs<0b1001, "teq",
+                        BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>, 1>;
+
+defm CMPz  : AI1_cmp_irs<0b1010, "cmp",
+                         BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>;
+defm CMNz  : AI1_cmp_irs<0b1011, "cmn",
+                         BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
+
+//def : ARMPat<(ARMcmp GPR:$src, so_imm_neg:$imm),
+//             (CMNri  GPR:$src, so_imm_neg:$imm)>;
+
+def : ARMPat<(ARMcmpZ GPR:$src, so_imm_neg:$imm),
+             (CMNzri  GPR:$src, so_imm_neg:$imm)>;
+
+
+// Conditional moves
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :(
+//
+// $false is tied to $dst (RegConstraint below): when the predicate fails the
+// instruction is a no-op and $dst keeps the $false value, giving conditional
+// move semantics.  These are selected manually rather than via the
+// commented-out ARMcmov patterns.
+def MOVCCr : AI1<0b1101, (outs GPR:$dst), (ins GPR:$false, GPR:$true), DPFrm,
+                IIC_iCMOVr, "mov", "\t$dst, $true",
+      [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $dst">, UnaryDP {
+  // Plain register operand: no shift (bits 11-4 zero), I bit (25) clear.
+  let Inst{11-4} = 0b00000000;
+  let Inst{25} = 0;
+}
+
+// Shifted-register-operand form; I bit clear.
+def MOVCCs : AI1<0b1101, (outs GPR:$dst),
+                        (ins GPR:$false, so_reg:$true), DPSoRegFrm, IIC_iCMOVsr,
+                "mov", "\t$dst, $true",
+   [/*(set GPR:$dst, (ARMcmov GPR:$false, so_reg:$true, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $dst">, UnaryDP {
+  let Inst{25} = 0;
+}
+
+// Immediate form; I bit (25) set selects the immediate shifter operand.
+def MOVCCi : AI1<0b1101, (outs GPR:$dst),
+                        (ins GPR:$false, so_imm:$true), DPFrm, IIC_iCMOVi,
+                "mov", "\t$dst, $true",
+   [/*(set GPR:$dst, (ARMcmov GPR:$false, so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $dst">, UnaryDP {
+  let Inst{25} = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+// hasSideEffects keeps the barriers from being dead-code eliminated or
+// reordered, since they have no modeled register outputs.
+let hasSideEffects = 1 in {
+// ARMv7 full-system data memory barrier.  Bits 31-4 plus the 0b1111 option
+// field below form the DMB SY encoding (0xf57ff05f).
+def Int_MemBarrierV7 : AInoP<(outs), (ins),
+                        Pseudo, NoItinerary,
+                        "dmb", "",
+                        [(ARMMemBarrierV7)]>,
+                        Requires<[IsARM, HasV7]> {
+  let Inst{31-4} = 0xf57ff05;
+  // FIXME: add support for options other than a full system DMB
+  let Inst{3-0} = 0b1111;
+}
+
+// ARMv7 full-system data synchronization barrier (DSB SY, 0xf57ff04f).
+def Int_SyncBarrierV7 : AInoP<(outs), (ins),
+                        Pseudo, NoItinerary,
+                        "dsb", "",
+                        [(ARMSyncBarrierV7)]>,
+                        Requires<[IsARM, HasV7]> {
+  let Inst{31-4} = 0xf57ff04;
+  // FIXME: add support for options other than a full system DSB
+  let Inst{3-0} = 0b1111;
+}
+
+// ARMv6 barriers are expressed as CP15 c7/c10 writes; $zero supplies the
+// (ignored) source register for the mcr.  opc2=5 is the DMB operation.
+def Int_MemBarrierV6 : AInoP<(outs), (ins GPR:$zero),
+                       Pseudo, NoItinerary,
+                       "mcr", "\tp15, 0, $zero, c7, c10, 5",
+                       [(ARMMemBarrierV6 GPR:$zero)]>,
+                       Requires<[IsARM, HasV6]> {
+  // FIXME: add support for options other than a full system DMB
+  // FIXME: add encoding
+}
+
+// opc2=4 is the DSB operation on the same CP15 register.
+def Int_SyncBarrierV6 : AInoP<(outs), (ins GPR:$zero),
+                        Pseudo, NoItinerary,
+                        "mcr", "\tp15, 0, $zero, c7, c10, 4",
+                        [(ARMSyncBarrierV6 GPR:$zero)]>,
+                        Requires<[IsARM, HasV6]> {
+  // FIXME: add support for options other than a full system DSB
+  // FIXME: add encoding
+}
+}
+
+// Atomic read-modify-write, swap, and compare-and-swap pseudo-instructions.
+// usesCustomInserter = 1: each pseudo is expanded after selection (in the
+// target's EmitInstrWithCustomInserter hook) into a real code sequence,
+// presumably an ldrex/strex retry loop -- the pseudos themselves carry no
+// encoding.  The CPSR use reflects that the expansion involves flag-setting
+// compares/branches.
+let usesCustomInserter = 1 in {
+  let Uses = [CPSR] in {
+    def ATOMIC_LOAD_ADD_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_ADD_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_add_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_SUB_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_sub_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_AND_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_and_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_OR_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_or_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_XOR_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_xor_8 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_NAND_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_nand_8 GPR:$ptr, GPR:$incr))]>;
+    // 16-bit variants of the same six RMW operations.
+    def ATOMIC_LOAD_ADD_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_ADD_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_add_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_SUB_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_sub_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_AND_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_and_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_OR_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_or_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_XOR_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_xor_16 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_NAND_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_nand_16 GPR:$ptr, GPR:$incr))]>;
+    // 32-bit variants.
+    def ATOMIC_LOAD_ADD_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_ADD_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_add_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_SUB_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_SUB_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_sub_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_AND_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_AND_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_and_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_OR_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_OR_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_or_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_XOR_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_XOR_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_xor_32 GPR:$ptr, GPR:$incr))]>;
+    def ATOMIC_LOAD_NAND_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$incr), NoItinerary,
+      "${:comment} ATOMIC_LOAD_NAND_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_load_nand_32 GPR:$ptr, GPR:$incr))]>;
+
+    // Unconditional exchange: returns the old value at $ptr.
+    def ATOMIC_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_SWAP_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_swap_8 GPR:$ptr, GPR:$new))]>;
+    def ATOMIC_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_SWAP_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_swap_16 GPR:$ptr, GPR:$new))]>;
+    def ATOMIC_SWAP_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_SWAP_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_swap_32 GPR:$ptr, GPR:$new))]>;
+
+    // Compare-and-swap: store $new only if *$ptr == $old; returns old value.
+    def ATOMIC_CMP_SWAP_I8 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_CMP_SWAP_I8 PSEUDO!",
+      [(set GPR:$dst, (atomic_cmp_swap_8 GPR:$ptr, GPR:$old, GPR:$new))]>;
+    def ATOMIC_CMP_SWAP_I16 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_CMP_SWAP_I16 PSEUDO!",
+      [(set GPR:$dst, (atomic_cmp_swap_16 GPR:$ptr, GPR:$old, GPR:$new))]>;
+    def ATOMIC_CMP_SWAP_I32 : PseudoInst<
+      (outs GPR:$dst), (ins GPR:$ptr, GPR:$old, GPR:$new), NoItinerary,
+      "${:comment} ATOMIC_CMP_SWAP_I32 PSEUDO!",
+      [(set GPR:$dst, (atomic_cmp_swap_32 GPR:$ptr, GPR:$old, GPR:$new))]>;
+}
+}
+
+// Exclusive load instructions.  The 2-bit size field passed to AIldrex
+// selects the access width: 0b00 = word, 0b01 = doubleword, 0b10 = byte,
+// 0b11 = halfword.  No patterns: these are emitted by the atomic pseudo
+// expansion, so mayLoad is set explicitly.
+let mayLoad = 1 in {
+def LDREXB : AIldrex<0b10, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
+                    "ldrexb", "\t$dest, [$ptr]",
+                    []>;
+def LDREXH : AIldrex<0b11, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
+                    "ldrexh", "\t$dest, [$ptr]",
+                    []>;
+def LDREX  : AIldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), NoItinerary,
+                    "ldrex", "\t$dest, [$ptr]",
+                    []>;
+// Doubleword form writes a register pair.
+def LDREXD : AIldrex<0b01, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr),
+                    NoItinerary,
+                    "ldrexd", "\t$dest, $dest2, [$ptr]",
+                    []>;
+}
+
+// Exclusive stores.  $success receives 0/1 status; it is marked earlyclobber
+// so the register allocator never assigns it to one of the source registers
+// (the status is written before the sources are necessarily dead).
+let mayStore = 1, Constraints = "@earlyclobber $success" in {
+def STREXB : AIstrex<0b10, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                    NoItinerary,
+                    "strexb", "\t$success, $src, [$ptr]",
+                    []>;
+def STREXH : AIstrex<0b11, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                    NoItinerary,
+                    "strexh", "\t$success, $src, [$ptr]",
+                    []>;
+def STREX  : AIstrex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                    NoItinerary,
+                    "strex", "\t$success, $src, [$ptr]",
+                    []>;
+def STREXD : AIstrex<0b01, (outs GPR:$success),
+                    (ins GPR:$src, GPR:$src2, GPR:$ptr),
+                    NoItinerary,
+                    "strexd", "\t$success, $src, $src2, [$ptr]",
+                    []>;
+}
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// Hence only R0, R12, LR and CPSR appear in Defs; the thread pointer is
+// returned in R0.
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR] in {
+  def TPsoft : ABXI<0b1011, (outs), (ins), IIC_Br,
+               "bl\t__aeabi_read_tp",
+               [(set R0, ARMthread_pointer)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved registers, which is exactly what we want.
+//   A constant value is passed in $val, and we use the location as a scratch.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
+    D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
+    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+    D31 ] in {
+  // Emitted sequence: save SP into the buffer (+8), compute and save the
+  // longjmp resume address (+4), then return 0; the "add pc, pc, #0" slot
+  // is the point a longjmp resumes at, after which R0 is set to 1.
+  def Int_eh_sjlj_setjmp : XI<(outs), (ins GPR:$src, GPR:$val),
+                               AddrModeNone, SizeSpecial, IndexModeNone,
+                               Pseudo, NoItinerary,
+                               "str\tsp, [$src, #+8] @ eh_setjmp begin\n\t"
+                               "add\t$val, pc, #8\n\t"
+                               "str\t$val, [$src, #+4]\n\t"
+                               "mov\tr0, #0\n\t"
+                               "add\tpc, pc, #0\n\t"
+                               "mov\tr0, #1 @ eh_setjmp end", "",
+                         [(set R0, (ARMeh_sjlj_setjmp GPR:$src, GPR:$val))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Large immediate handling.
+
+// Two piece so_imms.
+// Pre-V6T2 (no movw/movt): materialize a 32-bit constant that splits into
+// two shifter-operand immediates as a mov/orr pair.  AI1x2 is a two-
+// instruction pseudo; marking it rematerializable lets the allocator recompute
+// the constant instead of spilling it.
+let isReMaterializable = 1 in
+def MOVi2pieces : AI1x2<(outs GPR:$dst), (ins so_imm2part:$src), 
+                         Pseudo, IIC_iMOVi,
+                         "mov", "\t$dst, $src",
+                         [(set GPR:$dst, so_imm2part:$src)]>,
+                  Requires<[IsARM, NoV6T2]>;
+
+// Fold a two-piece immediate RHS into two chained instructions, applying
+// the first and second pieces (so_imm2part_1/_2) in turn.
+def : ARMPat<(or GPR:$LHS, so_imm2part:$RHS),
+             (ORRri (ORRri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                    (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(xor GPR:$LHS, so_imm2part:$RHS),
+             (EORri (EORri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                    (so_imm2part_2 imm:$RHS))>;
+def : ARMPat<(add GPR:$LHS, so_imm2part:$RHS),
+             (ADDri (ADDri GPR:$LHS, (so_imm2part_1 imm:$RHS)),
+                    (so_imm2part_2 imm:$RHS))>;
+// Addition of a negative two-piece immediate becomes two subtractions.
+def : ARMPat<(add GPR:$LHS, so_neg_imm2part:$RHS),
+             (SUBri (SUBri GPR:$LHS, (so_neg_imm2part_1 imm:$RHS)),
+                    (so_neg_imm2part_2 imm:$RHS))>;
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction, the benefit is that it can be remat'd
+// as a single unit instead of having to handle reg inputs.
+// FIXME: Remove this when we can do generalized remat.
+let isReMaterializable = 1 in
+def MOVi32imm : AI1x2<(outs GPR:$dst), (ins i32imm:$src), Pseudo, IIC_iMOVi,
+                    "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}",
+                     [(set GPR:$dst, (i32 imm:$src))]>,
+               Requires<[IsARM, HasV6T2]>;
+
+// ConstantPool, GlobalAddress, and JumpTable
+// Global addresses go through LEApcrel (pc-relative constant-pool load) by
+// default, or directly through MOVi32imm when -arm-use-movt is in effect.
+def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>,
+            Requires<[IsARM, DontUseMovt]>;
+def : ARMPat<(ARMWrapper  tconstpool  :$dst), (LEApcrel tconstpool  :$dst)>;
+def : ARMPat<(ARMWrapper  tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>,
+            Requires<[IsARM, UseMovt]>;
+def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+             (LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// TODO: add,sub,and, 3-instr forms?
+
+
+// Direct calls
+// Darwin uses the BLr9 variant (r9 has reserved/special status there).
+def : ARMPat<(ARMcall texternalsym:$func), (BL texternalsym:$func)>,
+      Requires<[IsARM, IsNotDarwin]>;
+def : ARMPat<(ARMcall texternalsym:$func), (BLr9 texternalsym:$func)>,
+      Requires<[IsARM, IsDarwin]>;
+
+// zextload i1 -> zextload i8
+// An i1 load is just a byte load; the high bits are already zero.
+def : ARMPat<(zextloadi1 addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+
+// extload -> zextload
+// Any-extending loads may legally zero-extend, so reuse the zext forms.
+def : ARMPat<(extloadi1  addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+def : ARMPat<(extloadi8  addrmode2:$addr),  (LDRB addrmode2:$addr)>;
+def : ARMPat<(extloadi16 addrmode3:$addr),  (LDRH addrmode3:$addr)>;
+
+// PC-relative (PIC) addressed variants.
+def : ARMPat<(extloadi8  addrmodepc:$addr), (PICLDRB addrmodepc:$addr)>;
+def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
+
+// smul* and smla*
+// Naming: B = bottom halfword, T = top halfword of the operand, so e.g.
+// SMULBT multiplies the sign-extended low half of $a by the high half of $b.
+// (sra (shl x, 16), 16) is the DAG idiom for sign-extending the low
+// halfword; (sra x, 16) selects the (sign-extended) high halfword.
+// sext_16_node matches values already known to be sign-extended from 16 bits.
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                      (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
+                 (SMULBB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                      (sra GPR:$b, (i32 16))),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
+                 (SMULBT GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)),
+                      (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                 (SMULTB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
+                (SMULTB GPR:$a, GPR:$b)>;
+// SMULWB: 32x16 multiply returning the top 32 bits of the 48-bit product,
+// which the DAG expresses as a 32-bit multiply followed by sra 16.
+def : ARMV5TEPat<(sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                      (i32 16)),
+                 (SMULWB GPR:$a, GPR:$b)>;
+def : ARMV5TEPat<(sra (mul GPR:$a, sext_16_node:$b), (i32 16)),
+                 (SMULWB GPR:$a, GPR:$b)>;
+
+// Multiply-accumulate forms: the same multiplies fused with an add of $acc.
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                           (sra (shl GPR:$b, (i32 16)), (i32 16)))),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, sext_16_node:$b)),
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra (shl GPR:$a, (i32 16)), (i32 16)),
+                           (sra GPR:$b, (i32 16)))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, (i32 16)),
+                           (sra (shl GPR:$b, (i32 16)), (i32 16)))),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, (sra (shl GPR:$b, (i32 16)), (i32 16))),
+                           (i32 16))),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+def : ARMV5TEPat<(add GPR:$acc,
+                      (sra (mul GPR:$a, sext_16_node:$b), (i32 16))),
+                 (SMLAWB GPR:$a, GPR:$b, GPR:$acc)>;
+
+//===----------------------------------------------------------------------===//
+// Thumb Support
+//
+
+include "ARMInstrThumb.td"
+
+//===----------------------------------------------------------------------===//
+// Thumb2 Support
+//
+
+include "ARMInstrThumb2.td"
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//
+
+include "ARMInstrVFP.td"
+
+//===----------------------------------------------------------------------===//
+// Advanced SIMD (NEON) Support
+//
+
+include "ARMInstrNEON.td"
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
new file mode 100644
index 0000000..e2be7ba
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -0,0 +1,3225 @@
+//===- ARMInstrNEON.td - NEON support for ARM -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM NEON instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// NEON-specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// Vector compares: integer result, both inputs the same type (the result
+// type may differ from the inputs, e.g. f32 inputs with an i32 mask result).
+def SDTARMVCMP    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<1, 2>]>;
+
+def NEONvceq      : SDNode<"ARMISD::VCEQ", SDTARMVCMP>;
+def NEONvcge      : SDNode<"ARMISD::VCGE", SDTARMVCMP>;
+def NEONvcgeu     : SDNode<"ARMISD::VCGEU", SDTARMVCMP>;
+def NEONvcgt      : SDNode<"ARMISD::VCGT", SDTARMVCMP>;
+def NEONvcgtu     : SDNode<"ARMISD::VCGTU", SDTARMVCMP>;
+def NEONvtst      : SDNode<"ARMISD::VTST", SDTARMVCMP>;
+
+// Types for vector shift by immediates.  The "SHX" version is for long and
+// narrow operations where the source and destination vectors have different
+// types.  The "SHINS" version is for shift and insert operations.
+def SDTARMVSH     : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHX    : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def SDTARMVSHINS  : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+
+// Plain left/right shifts by immediate (s = signed, u = unsigned shifts).
+def NEONvshl      : SDNode<"ARMISD::VSHL", SDTARMVSH>;
+def NEONvshrs     : SDNode<"ARMISD::VSHRs", SDTARMVSH>;
+def NEONvshru     : SDNode<"ARMISD::VSHRu", SDTARMVSH>;
+// Widening (long) shifts and narrowing shift-right.
+def NEONvshlls    : SDNode<"ARMISD::VSHLLs", SDTARMVSHX>;
+def NEONvshllu    : SDNode<"ARMISD::VSHLLu", SDTARMVSHX>;
+def NEONvshlli    : SDNode<"ARMISD::VSHLLi", SDTARMVSHX>;
+def NEONvshrn     : SDNode<"ARMISD::VSHRN", SDTARMVSHX>;
+
+// Rounding shift variants.
+def NEONvrshrs    : SDNode<"ARMISD::VRSHRs", SDTARMVSH>;
+def NEONvrshru    : SDNode<"ARMISD::VRSHRu", SDTARMVSH>;
+def NEONvrshrn    : SDNode<"ARMISD::VRSHRN", SDTARMVSHX>;
+
+// Saturating shift variants.
+def NEONvqshls    : SDNode<"ARMISD::VQSHLs", SDTARMVSH>;
+def NEONvqshlu    : SDNode<"ARMISD::VQSHLu", SDTARMVSH>;
+def NEONvqshlsu   : SDNode<"ARMISD::VQSHLsu", SDTARMVSH>;
+def NEONvqshrns   : SDNode<"ARMISD::VQSHRNs", SDTARMVSHX>;
+def NEONvqshrnu   : SDNode<"ARMISD::VQSHRNu", SDTARMVSHX>;
+def NEONvqshrnsu  : SDNode<"ARMISD::VQSHRNsu", SDTARMVSHX>;
+
+// Saturating rounding narrowing shifts.
+def NEONvqrshrns  : SDNode<"ARMISD::VQRSHRNs", SDTARMVSHX>;
+def NEONvqrshrnu  : SDNode<"ARMISD::VQRSHRNu", SDTARMVSHX>;
+def NEONvqrshrnsu : SDNode<"ARMISD::VQRSHRNsu", SDTARMVSHX>;
+
+// Shift-and-insert (vsli/vsri) take an extra vector operand to insert into.
+def NEONvsli      : SDNode<"ARMISD::VSLI", SDTARMVSHINS>;
+def NEONvsri      : SDNode<"ARMISD::VSRI", SDTARMVSHINS>;
+
+// Extract a lane to an i32 GPR value, zero- (u) or sign- (s) extended.
+def SDTARMVGETLN  : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+                                         SDTCisVT<2, i32>]>;
+def NEONvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
+def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
+
+def NEONvdup      : SDNode<"ARMISD::VDUP", SDTypeProfile<1, 1, [SDTCisVec<0>]>>;
+
+// VDUPLANE can produce a quad-register result from a double-register source,
+// so the result is not constrained to match the source.
+def NEONvduplane  : SDNode<"ARMISD::VDUPLANE",
+                           SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                                SDTCisVT<2, i32>]>>;
+
+// Concatenate-and-extract (vext): two vectors plus an i32 byte/lane index.
+def SDTARMVEXT    : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisVT<3, i32>]>;
+def NEONvext      : SDNode<"ARMISD::VEXT", SDTARMVEXT>;
+
+// Single-operand shuffles: element reversal within 64/32/16-bit groups.
+def SDTARMVSHUF   : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
+def NEONvrev64    : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
+def NEONvrev32    : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
+def NEONvrev16    : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
+
+// Two-result shuffles (vzip/vuzp/vtrn): two inputs, two outputs, all the
+// same vector type.
+def SDTARMVSHUF2  : SDTypeProfile<2, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+                                         SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>]>;
+def NEONzip       : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
+def NEONuzp       : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
+def NEONtrn       : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
+
+//===----------------------------------------------------------------------===//
+// NEON operand definitions
+//===----------------------------------------------------------------------===//
+
+// addrmode_neonldstm := reg
+//
+/* TODO: Take advantage of vldm.
+def addrmode_neonldstm : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrModeNeonLdStM", []> {
+  let PrintMethod = "printAddrNeonLdStMOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+*/
+
+// Immediate operands printed in hexadecimal, one per element width; used by
+// the NEON instruction printer via the custom PrintMethod hooks below.
+def h8imm  : Operand<i8> {
+  let PrintMethod = "printHex8ImmOperand";
+}
+def h16imm : Operand<i16> {
+  let PrintMethod = "printHex16ImmOperand";
+}
+def h32imm : Operand<i32> {
+  let PrintMethod = "printHex32ImmOperand";
+}
+def h64imm : Operand<i64> {
+  let PrintMethod = "printHex64ImmOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// NEON load / store instructions
+//===----------------------------------------------------------------------===//
+
+/* TODO: Take advantage of vldm.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+def VLDMD : NI<(outs),
+               (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+               IIC_fpLoadm,
+               "vldm", "${addr:submode} ${addr:base}, $dst1",
+               []> {
+  let Inst{27-25} = 0b110;
+  let Inst{20}    = 1;
+  let Inst{11-9}  = 0b101;
+}
+
+def VLDMS : NI<(outs),
+               (ins addrmode_neonldstm:$addr, reglist:$dst1, variable_ops),
+               IIC_fpLoadm,
+               "vldm", "${addr:submode} ${addr:base}, $dst1",
+               []> {
+  let Inst{27-25} = 0b110;
+  let Inst{20}    = 1;
+  let Inst{11-9}  = 0b101;
+}
+}
+*/
+
+// Use vldmia to load a Q register as a D register pair.
+// The ${dst:dregpair} modifier prints the Q register as its two D halves.
+// Bit 20 (L) distinguishes load (1) from store (0).
+def VLDRQ : NI4<(outs QPR:$dst), (ins addrmode4:$addr),
+               IIC_fpLoadm,
+               "vldmia", "$addr, ${dst:dregpair}",
+               [(set QPR:$dst, (v2f64 (load addrmode4:$addr)))]> {
+  let Inst{27-25} = 0b110;
+  let Inst{24}    = 0; // P bit
+  let Inst{23}    = 1; // U bit
+  let Inst{20}    = 1;
+  let Inst{11-8}  = 0b1011;
+}
+
+// Use vstmia to store a Q register as a D register pair.
+def VSTRQ : NI4<(outs), (ins QPR:$src, addrmode4:$addr),
+               IIC_fpStorem,
+               "vstmia", "$addr, ${src:dregpair}",
+               [(store (v2f64 QPR:$src), addrmode4:$addr)]> {
+  let Inst{27-25} = 0b110;
+  let Inst{24}    = 0; // P bit
+  let Inst{23}    = 1; // U bit
+  let Inst{20}    = 0;
+  let Inst{11-8}  = 0b1011;
+}
+
+//   VLD1     : Vector Load (multiple single elements)
+// op7_4 carries the element-size field; Dt is the printed data-type suffix
+// ("8"/"16"/"32"/"64").  The D and Q classes differ in the 4-bit opcode
+// (0b0111 = one register, 0b1010 = two registers) and register class.
+class VLD1D<bits<4> op7_4, string OpcodeStr, string Dt,
+            ValueType Ty, Intrinsic IntOp>
+  : NLdSt<0,0b10,0b0111,op7_4, (outs DPR:$dst), (ins addrmode6:$addr), IIC_VLD1,
+          OpcodeStr, Dt, "\\{$dst\\}, $addr", "",
+          [(set DPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
+class VLD1Q<bits<4> op7_4, string OpcodeStr, string Dt,
+            ValueType Ty, Intrinsic IntOp>
+  : NLdSt<0,0b10,0b1010,op7_4, (outs QPR:$dst), (ins addrmode6:$addr), IIC_VLD1,
+          OpcodeStr, Dt, "${dst:dregpair}, $addr", "",
+          [(set QPR:$dst, (Ty (IntOp addrmode6:$addr)))]>;
+
+// Note: the f32 variants (VLD1df/VLD1qf) share the 32-bit integer encoding;
+// they exist only to give the selection pattern a float vector type.
+def  VLD1d8   : VLD1D<0b0000, "vld1", "8",  v8i8,  int_arm_neon_vld1>;
+def  VLD1d16  : VLD1D<0b0100, "vld1", "16", v4i16, int_arm_neon_vld1>;
+def  VLD1d32  : VLD1D<0b1000, "vld1", "32", v2i32, int_arm_neon_vld1>;
+def  VLD1df   : VLD1D<0b1000, "vld1", "32", v2f32, int_arm_neon_vld1>;
+def  VLD1d64  : VLD1D<0b1100, "vld1", "64", v1i64, int_arm_neon_vld1>;
+
+def  VLD1q8   : VLD1Q<0b0000, "vld1", "8",  v16i8, int_arm_neon_vld1>;
+def  VLD1q16  : VLD1Q<0b0100, "vld1", "16", v8i16, int_arm_neon_vld1>;
+def  VLD1q32  : VLD1Q<0b1000, "vld1", "32", v4i32, int_arm_neon_vld1>;
+def  VLD1qf   : VLD1Q<0b1000, "vld1", "32", v4f32, int_arm_neon_vld1>;
+def  VLD1q64  : VLD1Q<0b1100, "vld1", "64", v2i64, int_arm_neon_vld1>;
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+
+//   VLD2     : Vector Load (multiple 2-element structures)
+// No selection patterns: these are selected manually, hence the enclosing
+// mayLoad/hasExtraDefRegAllocReq region.
+class VLD2D<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b10,0b1000,op7_4, (outs DPR:$dst1, DPR:$dst2),
+          (ins addrmode6:$addr), IIC_VLD2,
+          OpcodeStr, Dt, "\\{$dst1, $dst2\\}, $addr", "", []>;
+class VLD2Q<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b10,0b0011,op7_4,
+          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$addr), IIC_VLD2,
+          OpcodeStr, Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr",
+          "", []>;
+
+def  VLD2d8   : VLD2D<0b0000, "vld2", "8">;
+def  VLD2d16  : VLD2D<0b0100, "vld2", "16">;
+def  VLD2d32  : VLD2D<0b1000, "vld2", "32">;
+// There is no 64-bit vld2; a two-register vld1.64 gives the same effect.
+def  VLD2d64  : NLdSt<0,0b10,0b1010,0b1100, (outs DPR:$dst1, DPR:$dst2),
+                      (ins addrmode6:$addr), IIC_VLD1,
+                      "vld1", "64", "\\{$dst1, $dst2\\}, $addr", "", []>;
+
+def  VLD2q8   : VLD2Q<0b0000, "vld2", "8">;
+def  VLD2q16  : VLD2Q<0b0100, "vld2", "16">;
+def  VLD2q32  : VLD2Q<0b1000, "vld2", "32">;
+
+//   VLD3     : Vector Load (multiple 3-element structures)
+class VLD3D<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b10,0b0100,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+          (ins addrmode6:$addr), IIC_VLD3,
+          OpcodeStr, Dt, "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>;
+// Writeback form: the "$addr.addr = $wb" constraint ties the base-register
+// sub-operand of the address to the $wb output.
+class VLD3WB<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b10,0b0101,op7_4, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, GPR:$wb),
+          (ins addrmode6:$addr), IIC_VLD3,
+          OpcodeStr, Dt, "\\{$dst1, $dst2, $dst3\\}, $addr",
+          "$addr.addr = $wb", []>;
+
+def  VLD3d8   : VLD3D<0b0000, "vld3", "8">;
+def  VLD3d16  : VLD3D<0b0100, "vld3", "16">;
+def  VLD3d32  : VLD3D<0b1000, "vld3", "32">;
+// No 64-bit vld3; use a three-register vld1.64.
+def  VLD3d64  : NLdSt<0,0b10,0b0110,0b1100,
+                      (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+                      (ins addrmode6:$addr), IIC_VLD1,
+                      "vld1", "64", "\\{$dst1, $dst2, $dst3\\}, $addr", "", []>;
+
+// vld3 to double-spaced even registers.
+def  VLD3q8a  : VLD3WB<0b0000, "vld3", "8">;
+def  VLD3q16a : VLD3WB<0b0100, "vld3", "16">;
+def  VLD3q32a : VLD3WB<0b1000, "vld3", "32">;
+
+// vld3 to double-spaced odd registers.
+// NOTE(review): the a/b variants have identical fields here; presumably the
+// even/odd register spacing is established when the registers are assigned
+// during selection -- confirm.
+def  VLD3q8b  : VLD3WB<0b0000, "vld3", "8">;
+def  VLD3q16b : VLD3WB<0b0100, "vld3", "16">;
+def  VLD3q32b : VLD3WB<0b1000, "vld3", "32">;
+
+//   VLD4     : Vector Load (multiple 4-element structures)
+class VLD4D<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b10,0b0000,op7_4,
+          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+          (ins addrmode6:$addr), IIC_VLD4,
+          OpcodeStr, Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr",
+          "", []>;
+class VLD4WB<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b10,0b0001,op7_4,
+          (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb),
+          (ins addrmode6:$addr), IIC_VLD4,
+          OpcodeStr, Dt, "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr",
+          "$addr.addr = $wb", []>;
+
+def  VLD4d8   : VLD4D<0b0000, "vld4", "8">;
+def  VLD4d16  : VLD4D<0b0100, "vld4", "16">;
+def  VLD4d32  : VLD4D<0b1000, "vld4", "32">;
+// No 64-bit vld4; use a four-register vld1.64.
+def  VLD4d64  : NLdSt<0,0b10,0b0010,0b1100,
+                      (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+                      (ins addrmode6:$addr), IIC_VLD1,
+                      "vld1", "64", "\\{$dst1, $dst2, $dst3, $dst4\\}, $addr",
+                      "", []>;
+
+// vld4 to double-spaced even registers.
+def  VLD4q8a  : VLD4WB<0b0000, "vld4", "8">;
+def  VLD4q16a : VLD4WB<0b0100, "vld4", "16">;
+def  VLD4q32a : VLD4WB<0b1000, "vld4", "32">;
+
+// vld4 to double-spaced odd registers (see NOTE on the vld3 b-variants).
+def  VLD4q8b  : VLD4WB<0b0000, "vld4", "8">;
+def  VLD4q16b : VLD4WB<0b0100, "vld4", "16">;
+def  VLD4q32b : VLD4WB<0b1000, "vld4", "32">;
+
+//   VLD1LN   : Vector Load (single element to one lane)
+//   FIXME: Not yet implemented.
+
+//   VLD2LN   : Vector Load (single 2-element structure to one lane)
+// Lane loads only replace one lane, so the destination registers are tied
+// to matching source registers ($srcN = $dstN) to keep the other lanes live.
+// The {?,?,?,?} low bits hold the lane number, refined per element size by
+// the Inst{5}/Inst{6} overrides below.
+class VLD2LN<bits<4> op11_8, string OpcodeStr, string Dt>
+  : NLdSt<1,0b10,op11_8,{?,?,?,?}, (outs DPR:$dst1, DPR:$dst2),
+            (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+            IIC_VLD2,
+            OpcodeStr, Dt, "\\{$dst1[$lane], $dst2[$lane]\\}, $addr",
+            "$src1 = $dst1, $src2 = $dst2", []>;
+
+// vld2 to single-spaced registers.
+def VLD2LNd8  : VLD2LN<0b0001, "vld2", "8">;
+def VLD2LNd16 : VLD2LN<0b0101, "vld2", "16"> {
+  let Inst{5} = 0;
+}
+def VLD2LNd32 : VLD2LN<0b1001, "vld2", "32"> {
+  let Inst{6} = 0;
+}
+
+// vld2 to double-spaced even registers.
+// The spacing bit (Inst{5} for 16-bit, Inst{6} for 32-bit) selects
+// double-spaced registers; 8-bit elements have no double-spaced form.
+def VLD2LNq16a: VLD2LN<0b0101, "vld2", "16"> {
+  let Inst{5} = 1;
+}
+def VLD2LNq32a: VLD2LN<0b1001, "vld2", "32"> {
+  let Inst{6} = 1;
+}
+
+// vld2 to double-spaced odd registers.
+// NOTE(review): encoded identically to the even (a) variants above;
+// presumably distinguished by register assignment -- confirm.
+def VLD2LNq16b: VLD2LN<0b0101, "vld2", "16"> {
+  let Inst{5} = 1;
+}
+def VLD2LNq32b: VLD2LN<0b1001, "vld2", "32"> {
+  let Inst{6} = 1;
+}
+
+//   VLD3LN   : Vector Load (single 3-element structure to one lane)
+class VLD3LN<bits<4> op11_8, string OpcodeStr, string Dt>
+  : NLdSt<1,0b10,op11_8,{?,?,?,?}, (outs DPR:$dst1, DPR:$dst2, DPR:$dst3),
+            (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+            nohash_imm:$lane), IIC_VLD3,
+            OpcodeStr, Dt,
+            "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane]\\}, $addr",
+            "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3", []>;
+
+// vld3 to single-spaced registers.
+def VLD3LNd8  : VLD3LN<0b0010, "vld3", "8"> {
+  let Inst{4} = 0;
+}
+def VLD3LNd16 : VLD3LN<0b0110, "vld3", "16"> {
+  let Inst{5-4} = 0b00;
+}
+def VLD3LNd32 : VLD3LN<0b1010, "vld3", "32"> {
+  let Inst{6-4} = 0b000;
+}
+
+// vld3 to double-spaced even registers.
+def VLD3LNq16a: VLD3LN<0b0110, "vld3", "16"> {
+  let Inst{5-4} = 0b10;
+}
+def VLD3LNq32a: VLD3LN<0b1010, "vld3", "32"> {
+  let Inst{6-4} = 0b100;
+}
+
+// vld3 to double-spaced odd registers (encoded like the a-variants; see
+// NOTE on VLD2LN above).
+def VLD3LNq16b: VLD3LN<0b0110, "vld3", "16"> {
+  let Inst{5-4} = 0b10;
+}
+def VLD3LNq32b: VLD3LN<0b1010, "vld3", "32"> {
+  let Inst{6-4} = 0b100;
+}
+
+//   VLD4LN   : Vector Load (single 4-element structure to one lane)
+// Loads one 4-element structure from memory into lane $lane of four D
+// registers; all four sources are tied to the destinations so the other
+// lanes survive.  op11_8 selects the element size.
+class VLD4LN<bits<4> op11_8, string OpcodeStr, string Dt>
+  : NLdSt<1,0b10,op11_8,{?,?,?,?},
+            (outs DPR:$dst1, DPR:$dst2, DPR:$dst3, DPR:$dst4),
+            (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+            nohash_imm:$lane), IIC_VLD4,
+            OpcodeStr, Dt,
+          "\\{$dst1[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $addr",
+            "$src1 = $dst1, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>;
+
+// vld4 to single-spaced registers.
+// Inst{5} (16-bit) / Inst{6} (32-bit) = 0 selects single register spacing.
+def VLD4LNd8  : VLD4LN<0b0011, "vld4", "8">;
+def VLD4LNd16 : VLD4LN<0b0111, "vld4", "16"> {
+  let Inst{5} = 0;
+}
+def VLD4LNd32 : VLD4LN<0b1011, "vld4", "32"> {
+  let Inst{6} = 0;
+}
+
+// vld4 to double-spaced even registers.
+// Inst{5}/Inst{6} = 1 selects double register spacing; the "a"/"b"
+// variants share encodings (see VLD2LN note).
+def VLD4LNq16a: VLD4LN<0b0111, "vld4", "16"> {
+  let Inst{5} = 1;
+}
+def VLD4LNq32a: VLD4LN<0b1011, "vld4", "32"> {
+  let Inst{6} = 1;
+}
+
+// vld4 to double-spaced odd registers.
+def VLD4LNq16b: VLD4LN<0b0111, "vld4", "16"> {
+  let Inst{5} = 1;
+}
+def VLD4LNq32b: VLD4LN<0b1011, "vld4", "32"> {
+  let Inst{6} = 1;
+}
+
+//   VLD1DUP  : Vector Load (single element to all lanes)
+//   VLD2DUP  : Vector Load (single 2-element structure to all lanes)
+//   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
+//   VLD4DUP  : Vector Load (single 4-element structure to all lanes)
+//   FIXME: Not yet implemented.
+} // mayLoad = 1, hasExtraDefRegAllocReq = 1
+
+//   VST1     : Vector Store (multiple single elements)
+// VST1D stores one D register; VST1Q stores one Q register (printed as a
+// D-register pair via the "dregpair" modifier).  Both carry patterns
+// matching the int_arm_neon_vst1 intrinsic.
+class VST1D<bits<4> op7_4, string OpcodeStr, string Dt,
+            ValueType Ty, Intrinsic IntOp>
+  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$addr, DPR:$src), IIC_VST,
+          OpcodeStr, Dt, "\\{$src\\}, $addr", "",
+          [(IntOp addrmode6:$addr, (Ty DPR:$src))]>;
+class VST1Q<bits<4> op7_4, string OpcodeStr, string Dt,
+            ValueType Ty, Intrinsic IntOp>
+  : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$addr, QPR:$src), IIC_VST,
+          OpcodeStr, Dt, "${src:dregpair}, $addr", "",
+          [(IntOp addrmode6:$addr, (Ty QPR:$src))]>;
+
+let hasExtraSrcRegAllocReq = 1 in {
+// Note: VST1df/VST1qf reuse the 32-bit encodings — f32 and i32 elements
+// share the same element-size encoding; only the pattern type differs.
+def  VST1d8   : VST1D<0b0000, "vst1", "8",  v8i8,  int_arm_neon_vst1>;
+def  VST1d16  : VST1D<0b0100, "vst1", "16", v4i16, int_arm_neon_vst1>;
+def  VST1d32  : VST1D<0b1000, "vst1", "32", v2i32, int_arm_neon_vst1>;
+def  VST1df   : VST1D<0b1000, "vst1", "32", v2f32, int_arm_neon_vst1>;
+def  VST1d64  : VST1D<0b1100, "vst1", "64", v1i64, int_arm_neon_vst1>;
+
+def  VST1q8   : VST1Q<0b0000, "vst1", "8",  v16i8, int_arm_neon_vst1>;
+def  VST1q16  : VST1Q<0b0100, "vst1", "16", v8i16, int_arm_neon_vst1>;
+def  VST1q32  : VST1Q<0b1000, "vst1", "32", v4i32, int_arm_neon_vst1>;
+def  VST1qf   : VST1Q<0b1000, "vst1", "32", v4f32, int_arm_neon_vst1>;
+def  VST1q64  : VST1Q<0b1100, "vst1", "64", v2i64, int_arm_neon_vst1>;
+} // hasExtraSrcRegAllocReq
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in {
+
+//   VST2     : Vector Store (multiple 2-element structures)
+// VST2D stores from two D registers, VST2Q from four (a Q-register pair).
+// No selection patterns here; []. These are matched elsewhere.
+class VST2D<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b00,0b1000,op7_4, (outs),
+          (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST,
+          OpcodeStr, Dt, "\\{$src1, $src2\\}, $addr", "", []>;
+class VST2Q<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b00,0b0011,op7_4, (outs),
+          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST,
+          OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr",
+          "", []>;
+
+def  VST2d8   : VST2D<0b0000, "vst2", "8">;
+def  VST2d16  : VST2D<0b0100, "vst2", "16">;
+def  VST2d32  : VST2D<0b1000, "vst2", "32">;
+// 64-bit "vst2" is encoded/printed as a two-register vst1.
+def  VST2d64  : NLdSt<0,0b00,0b1010,0b1100, (outs),
+                      (ins addrmode6:$addr, DPR:$src1, DPR:$src2), IIC_VST,
+                      "vst1", "64", "\\{$src1, $src2\\}, $addr", "", []>;
+
+def  VST2q8   : VST2Q<0b0000, "vst2", "8">;
+def  VST2q16  : VST2Q<0b0100, "vst2", "16">;
+def  VST2q32  : VST2Q<0b1000, "vst2", "32">;
+
+//   VST3     : Vector Store (multiple 3-element structures)
+// VST3D is the plain form; VST3WB additionally writes the updated base
+// address back to $wb ("$addr.addr = $wb" ties the writeback result to
+// the address-mode base register).
+class VST3D<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b00,0b0100,op7_4, (outs),
+          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
+          OpcodeStr, Dt, "\\{$src1, $src2, $src3\\}, $addr", "", []>;
+class VST3WB<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b00,0b0101,op7_4, (outs GPR:$wb),
+          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3), IIC_VST,
+          OpcodeStr, Dt, "\\{$src1, $src2, $src3\\}, $addr",
+          "$addr.addr = $wb", []>;
+
+def  VST3d8   : VST3D<0b0000, "vst3", "8">;
+def  VST3d16  : VST3D<0b0100, "vst3", "16">;
+def  VST3d32  : VST3D<0b1000, "vst3", "32">;
+// 64-bit "vst3" is encoded/printed as a three-register vst1.
+def  VST3d64  : NLdSt<0,0b00,0b0110,0b1100, (outs),
+                      (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3),
+                      IIC_VST,
+                      "vst1", "64", "\\{$src1, $src2, $src3\\}, $addr", "", []>;
+
+// vst3 to double-spaced even registers.
+// Even/odd variants share encodings (see VLD4q note).
+def  VST3q8a  : VST3WB<0b0000, "vst3", "8">;
+def  VST3q16a : VST3WB<0b0100, "vst3", "16">;
+def  VST3q32a : VST3WB<0b1000, "vst3", "32">;
+
+// vst3 to double-spaced odd registers.
+def  VST3q8b  : VST3WB<0b0000, "vst3", "8">;
+def  VST3q16b : VST3WB<0b0100, "vst3", "16">;
+def  VST3q32b : VST3WB<0b1000, "vst3", "32">;
+
+//   VST4     : Vector Store (multiple 4-element structures)
+// VST4D is the plain form; VST4WB writes the updated base address back
+// through $wb (same writeback scheme as VST3WB).
+class VST4D<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b00,0b0000,op7_4, (outs),
+          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST,
+          OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr",
+          "", []>;
+class VST4WB<bits<4> op7_4, string OpcodeStr, string Dt>
+  : NLdSt<0,0b00,0b0001,op7_4, (outs GPR:$wb),
+          (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4),
+          IIC_VST,
+          OpcodeStr, Dt, "\\{$src1, $src2, $src3, $src4\\}, $addr",
+          "$addr.addr = $wb", []>;
+
+def  VST4d8   : VST4D<0b0000, "vst4", "8">;
+def  VST4d16  : VST4D<0b0100, "vst4", "16">;
+def  VST4d32  : VST4D<0b1000, "vst4", "32">;
+// 64-bit "vst4" is encoded/printed as a four-register vst1.
+def  VST4d64  : NLdSt<0,0b00,0b0010,0b1100, (outs),
+                      (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+                       DPR:$src4), IIC_VST,
+                      "vst1", "64", "\\{$src1, $src2, $src3, $src4\\}, $addr",
+                      "", []>;
+
+// vst4 to double-spaced even registers.
+// Even/odd variants share encodings (see VLD4q note).
+def  VST4q8a  : VST4WB<0b0000, "vst4", "8">;
+def  VST4q16a : VST4WB<0b0100, "vst4", "16">;
+def  VST4q32a : VST4WB<0b1000, "vst4", "32">;
+
+// vst4 to double-spaced odd registers.
+def  VST4q8b  : VST4WB<0b0000, "vst4", "8">;
+def  VST4q16b : VST4WB<0b0100, "vst4", "16">;
+def  VST4q32b : VST4WB<0b1000, "vst4", "32">;
+
+//   VST1LN   : Vector Store (single element from one lane)
+//   FIXME: Not yet implemented.
+
+//   VST2LN   : Vector Store (single 2-element structure from one lane)
+// Stores lane $lane of two D registers to memory.  Unlike the VLDnLN
+// classes there are no tied operands: a store does not modify registers.
+class VST2LN<bits<4> op11_8, string OpcodeStr, string Dt>
+  : NLdSt<1,0b00,op11_8,{?,?,?,?}, (outs),
+            (ins addrmode6:$addr, DPR:$src1, DPR:$src2, nohash_imm:$lane),
+            IIC_VST,
+            OpcodeStr, Dt, "\\{$src1[$lane], $src2[$lane]\\}, $addr",
+            "", []>;
+
+// vst2 to single-spaced registers.
+// Inst{5} (16-bit) / Inst{6} (32-bit) = 0 selects single register spacing.
+def VST2LNd8  : VST2LN<0b0001, "vst2", "8">;
+def VST2LNd16 : VST2LN<0b0101, "vst2", "16"> {
+  let Inst{5} = 0;
+}
+def VST2LNd32 : VST2LN<0b1001, "vst2", "32"> {
+  let Inst{6} = 0;
+}
+
+// vst2 to double-spaced even registers.
+// Inst{5}/Inst{6} = 1 selects double spacing; "a"/"b" share encodings.
+def VST2LNq16a: VST2LN<0b0101, "vst2", "16"> {
+  let Inst{5} = 1;
+}
+def VST2LNq32a: VST2LN<0b1001, "vst2", "32"> {
+  let Inst{6} = 1;
+}
+
+// vst2 to double-spaced odd registers.
+def VST2LNq16b: VST2LN<0b0101, "vst2", "16"> {
+  let Inst{5} = 1;
+}
+def VST2LNq32b: VST2LN<0b1001, "vst2", "32"> {
+  let Inst{6} = 1;
+}
+
+//   VST3LN   : Vector Store (single 3-element structure from one lane)
+// Stores lane $lane of three D registers to memory.
+class VST3LN<bits<4> op11_8, string OpcodeStr, string Dt>
+  : NLdSt<1,0b00,op11_8,{?,?,?,?}, (outs),
+            (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3,
+            nohash_imm:$lane), IIC_VST,
+            OpcodeStr, Dt,
+            "\\{$src1[$lane], $src2[$lane], $src3[$lane]\\}, $addr", "", []>;
+
+// vst3 to single-spaced registers.
+// Low Inst bits = 0 selects single register spacing.
+def VST3LNd8  : VST3LN<0b0010, "vst3", "8"> {
+  let Inst{4} = 0;
+}
+def VST3LNd16 : VST3LN<0b0110, "vst3", "16"> {
+  let Inst{5-4} = 0b00;
+}
+def VST3LNd32 : VST3LN<0b1010, "vst3", "32"> {
+  let Inst{6-4} = 0b000;
+}
+
+// vst3 to double-spaced even registers.
+// High bit of the spacing field = 1; "a"/"b" share encodings.
+def VST3LNq16a: VST3LN<0b0110, "vst3", "16"> {
+  let Inst{5-4} = 0b10;
+}
+def VST3LNq32a: VST3LN<0b1010, "vst3", "32"> {
+  let Inst{6-4} = 0b100;
+}
+
+// vst3 to double-spaced odd registers.
+def VST3LNq16b: VST3LN<0b0110, "vst3", "16"> {
+  let Inst{5-4} = 0b10;
+}
+def VST3LNq32b: VST3LN<0b1010, "vst3", "32"> {
+  let Inst{6-4} = 0b100;
+}
+
+//   VST4LN   : Vector Store (single 4-element structure from one lane)
+// Stores lane $lane of four D registers to memory.
+class VST4LN<bits<4> op11_8, string OpcodeStr, string Dt>
+  : NLdSt<1,0b00,op11_8,{?,?,?,?}, (outs),
+            (ins addrmode6:$addr, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4,
+            nohash_imm:$lane), IIC_VST,
+            OpcodeStr, Dt,
+          "\\{$src1[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $addr",
+            "", []>;
+
+// vst4 to single-spaced registers.
+// Inst{5} (16-bit) / Inst{6} (32-bit) = 0 selects single register spacing.
+def VST4LNd8  : VST4LN<0b0011, "vst4", "8">;
+def VST4LNd16 : VST4LN<0b0111, "vst4", "16"> {
+  let Inst{5} = 0;
+}
+def VST4LNd32 : VST4LN<0b1011, "vst4", "32"> {
+  let Inst{6} = 0;
+}
+
+// vst4 to double-spaced even registers.
+// Inst{5}/Inst{6} = 1 selects double spacing; "a"/"b" share encodings.
+def VST4LNq16a: VST4LN<0b0111, "vst4", "16"> {
+  let Inst{5} = 1;
+}
+def VST4LNq32a: VST4LN<0b1011, "vst4", "32"> {
+  let Inst{6} = 1;
+}
+
+// vst4 to double-spaced odd registers.
+def VST4LNq16b: VST4LN<0b0111, "vst4", "16"> {
+  let Inst{5} = 1;
+}
+def VST4LNq32b: VST4LN<0b1011, "vst4", "32"> {
+  let Inst{6} = 1;
+}
+
+} // mayStore = 1, hasExtraSrcRegAllocReq = 1
+
+
+//===----------------------------------------------------------------------===//
+// NEON pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Extract D sub-registers of Q registers.
+// (arm_dsubreg_0 is 5; arm_dsubreg_1 is 6)
+// Each XForm maps a lane number on a Q register to the sub-register index
+// (5 or 6) of the D register that holds that lane: lane / lanes-per-D + 5.
+def DSubReg_i8_reg  : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(5 + N->getZExtValue() / 8, MVT::i32);
+}]>;
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(5 + N->getZExtValue() / 4, MVT::i32);
+}]>;
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(5 + N->getZExtValue() / 2, MVT::i32);
+}]>;
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(5 + N->getZExtValue(), MVT::i32);
+}]>;
+// Selects the D sub-register the lane is NOT in (0 <-> 1).
+def DSubReg_f64_other_reg : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(5 + (1 - N->getZExtValue()), MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+// (arm_ssubreg_0 is 1; arm_ssubreg_1 is 2; etc.)
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(1 + N->getZExtValue(), MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+// Masks the Q-register lane down to its position within one D register.
+def SubReg_i8_lane  : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 7, MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 3, MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() & 1, MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Classes
+//===----------------------------------------------------------------------===//
+
+// Basic 2-register operations, both double- and quad-register.
+// N2VD/N2VQ wrap the N2V format with a "$dst, $src" unary pattern for an
+// SDNode operator; the D form fixes bit 6 to 0, the Q form to 1.
+class N2VD<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src))))]>;
+class N2VQ<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+           bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,string Dt,
+           ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VUNAQ, OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src))))]>;
+
+// Basic 2-register operations, scalar single-precision.
+// Uses DPR_VFP2 (D regs overlapping the VFP2 S regs) and no pattern; the
+// scalar use is matched via N2VDsPat below.
+class N2VDs<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+            bits<2> op17_16, bits<5> op11_7, bit op4, string OpcodeStr,string Dt,
+            ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src),
+        IIC_VUNAD, OpcodeStr, Dt, "$dst, $src", "", []>;
+
+// Maps a scalar op on an S register onto the D-register NEON instruction:
+// insert the scalar into lane 0, run Inst, extract lane 0.
+class N2VDsPat<SDNode OpNode, ValueType ResTy, ValueType OpTy, NeonI Inst>
+  : NEONFPPat<(ResTy (OpNode SPR:$a)),
+       (EXTRACT_SUBREG
+           (Inst (INSERT_SUBREG (OpTy (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)),
+        arm_ssubreg_0)>;
+
+// Basic 2-register intrinsics, both double- and quad-register.
+// Same shape as N2VD/N2VQ but the pattern calls an Intrinsic and the
+// itinerary is a parameter.
+class N2VDInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4, 
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
+        (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
+class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
+        (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+
+// Basic 2-register intrinsics, scalar single-precision
+// No pattern; scalar use is matched via N2VDIntsPat below.
+class N2VDInts<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op4, 
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), itin,
+        OpcodeStr, Dt, "$dst, $src", "", []>;
+
+// Scalar f32 op via lane 0 of a D register (insert / Inst / extract).
+class N2VDIntsPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a)),
+       (EXTRACT_SUBREG
+           (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0)),
+        arm_ssubreg_0)>;
+
+// Narrow 2-register intrinsics.
+// Q-register source, D-register result (element narrowing).
+class N2VNInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyD, ValueType TyQ, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs DPR:$dst),
+        (ins QPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src))))]>;
+
+// Long 2-register intrinsics (currently only used for VMOVL).
+// D-register source, Q-register result (element widening).
+class N2VLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+              bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, op6, op4, (outs QPR:$dst),
+        (ins DPR:$src), itin, OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src))))]>;
+
+// 2-register shuffles (VTRN/VZIP/VUZP), both double- and quad-register.
+// Both operands are updated in place, so both sources are tied to both
+// destinations.  No patterns; these are selected elsewhere.
+class N2VDShuffle<bits<2> op19_18, bits<5> op11_7, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 0, 0, (outs DPR:$dst1, DPR:$dst2),
+        (ins DPR:$src1, DPR:$src2), IIC_VPERMD, 
+        OpcodeStr, Dt, "$dst1, $dst2",
+        "$src1 = $dst1, $src2 = $dst2", []>;
+class N2VQShuffle<bits<2> op19_18, bits<5> op11_7,
+                  InstrItinClass itin, string OpcodeStr, string Dt>
+  : N2V<0b11, 0b11, op19_18, 0b10, op11_7, 1, 0, (outs QPR:$dst1, QPR:$dst2),
+        (ins QPR:$src1, QPR:$src2), itin, 
+        OpcodeStr, Dt, "$dst1, $dst2",
+        "$src1 = $dst1, $src2 = $dst2", []>;
+
+// Basic 3-register operations, both double- and quad-register.
+// Binary "$dst, $src1, $src2" over an SDNode; Commutable flows into
+// isCommutable so the scheduler/isel may swap operands.
+class N3VD<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, 
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
+  let isCommutable = Commutable;
+}
+// Same as N3VD but no data type.
+class N3VDX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, 
+        OpcodeStr, "$dst, $src1, $src2", "",
+        [(set DPR:$dst, (ResTy (OpNode (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
+  let isCommutable = Commutable;
+}
+// Scalar ("by lane") variant: the second operand is lane $lane of a D
+// register, matched via NEONvduplane.  DPR_VFP2 restricts the scalar to
+// D0-D15 (the lane-indexable range); never commutable.
+class N3VDSL<bits<2> op21_20, bits<4> op11_8, 
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType Ty, SDNode ShOp>
+  : N3V<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (Ty DPR:$dst),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
+                                          imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+// 16-bit-element scalar variant: DPR_8 further restricts the scalar to
+// D0-D7 as the encoding requires for 16-bit lanes.
+class N3VDSL16<bits<2> op21_20, bits<4> op11_8, 
+               string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N3V<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+        IIC_VMULi16D,
+        OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (Ty DPR:$dst),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (NEONvduplane (Ty DPR_8:$src2),
+                                          imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+
+// Quad-register counterparts of N3VD/N3VDX/N3VDSL/N3VDSL16 (bit 6 = 1);
+// the scalar operand of the SL forms remains a D register.
+class N3VQ<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr, string Dt,
+           ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, 
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
+  let isCommutable = Commutable;
+}
+class N3VQX<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           InstrItinClass itin, string OpcodeStr,
+           ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3VX<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, 
+        OpcodeStr, "$dst, $src1, $src2", "",
+        [(set QPR:$dst, (ResTy (OpNode (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
+  let isCommutable = Commutable;
+}
+class N3VQSL<bits<2> op21_20, bits<4> op11_8, 
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3V<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (ResTy QPR:$dst),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+                                                imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQSL16<bits<2> op21_20, bits<4> op11_8, 
+               string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, SDNode ShOp>
+  : N3V<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+        IIC_VMULi16Q,
+        OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (ResTy QPR:$dst),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+                                                imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+
+// Basic 3-register operations, scalar single-precision
+// Pattern-less DPR_VFP2 form; scalar f32 use is matched via N3VDsPat,
+// which inserts both S operands into lane 0, runs Inst, and extracts
+// lane 0 of the result.
+class N3VDs<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+           string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+           SDNode OpNode, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src1, DPR_VFP2:$src2), IIC_VBIND,
+        OpcodeStr, Dt, "$dst, $src1, $src2", "", []> {
+  let isCommutable = Commutable;
+}
+class N3VDsPat<SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$a, SPR:$b)),
+       (EXTRACT_SUBREG
+           (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a, arm_ssubreg_0),
+                 (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b, arm_ssubreg_0)),
+        arm_ssubreg_0)>;
+
+// Basic 3-register intrinsics, both double- and quad-register.
+// Binary intrinsic form of N3VD; the SL/SL16 variants take the second
+// operand from lane $lane of a D register (DPR_VFP2 = D0-D15 for 32-bit
+// lanes, DPR_8 = D0-D7 for 16-bit lanes) and are never commutable.
+class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy,
+              Intrinsic IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), itin, 
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1), (OpTy DPR:$src2))))]> {
+  let isCommutable = Commutable;
+}
+class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, 
+                string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
+  : N3V<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (Ty DPR:$dst),
+              (Ty (IntOp (Ty DPR:$src1),
+                         (Ty (NEONvduplane (Ty DPR_VFP2:$src2),
+                                           imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType Ty, Intrinsic IntOp>
+  : N3V<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (Ty DPR:$dst),
+              (Ty (IntOp (Ty DPR:$src1),
+                         (Ty (NEONvduplane (Ty DPR_8:$src2),
+                                           imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+
+// Quad-register counterparts of N3VDInt/N3VDIntSL/N3VDIntSL16 (bit 6 = 1);
+// the lane operand of the SL forms remains a D register.
+class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy,
+              Intrinsic IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), itin, 
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1), (OpTy QPR:$src2))))]> {
+  let isCommutable = Commutable;
+}
+class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin, 
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst), (ins QPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (ResTy QPR:$dst),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (ResTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+class N3VQIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst), (ins QPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (ResTy QPR:$dst),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (ResTy (NEONvduplane (OpTy DPR_8:$src2),
+                                                 imm:$lane)))))]> {
+  let isCommutable = 0;
+}
+
+// Multiply-Add/Sub operations, both double- and quad-register.
+// dst = OpNode(src1, MulOp(src2, src3)); $src1 is the accumulator and is
+// tied to $dst.  The SL/SL16 forms take src3 from lane $lane of a D
+// register (DPR_VFP2 / DPR_8 register-class restrictions as above).
+class N3VDMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt,
+                ValueType Ty, SDNode MulOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set DPR:$dst, (Ty (OpNode DPR:$src1,
+                             (Ty (MulOp DPR:$src2, DPR:$src3)))))]>;
+class N3VDMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt,
+                  ValueType Ty, SDNode MulOp, SDNode ShOp>
+  : N3V<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$dst),
+        (ins DPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+        [(set (Ty DPR:$dst),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$src2,
+                                   (Ty (NEONvduplane (Ty DPR_VFP2:$src3),
+                                                     imm:$lane)))))))]>;
+class N3VDMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType Ty, SDNode MulOp, SDNode ShOp>
+  : N3V<0, 1, op21_20, op11_8, 1, 0,
+        (outs DPR:$dst),
+        (ins DPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+        [(set (Ty DPR:$dst),
+              (Ty (ShOp (Ty DPR:$src1),
+                        (Ty (MulOp DPR:$src2,
+                                   (Ty (NEONvduplane (Ty DPR_8:$src3),
+                                                     imm:$lane)))))))]>;
+
+// Quad-register counterparts of N3VDMulOp/SL/SL16 (bit 6 = 1); same tied
+// accumulator scheme, with src3 still taken from a D-register lane.
+class N3VQMulOp<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                InstrItinClass itin, string OpcodeStr, string Dt, ValueType Ty,
+                SDNode MulOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set QPR:$dst, (Ty (OpNode QPR:$src1,
+                             (Ty (MulOp QPR:$src2, QPR:$src3)))))]>;
+class N3VQMulOpSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                  SDNode MulOp, SDNode ShOp>
+  : N3V<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst),
+        (ins QPR:$src1, QPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+        [(set (ResTy QPR:$dst),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$src2,
+                                         (ResTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+                                                              imm:$lane)))))))]>;
+class N3VQMulOpSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                    string OpcodeStr, string Dt,
+                    ValueType ResTy, ValueType OpTy,
+                    SDNode MulOp, SDNode ShOp>
+  : N3V<1, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst),
+        (ins QPR:$src1, QPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+        [(set (ResTy QPR:$dst),
+              (ResTy (ShOp (ResTy QPR:$src1),
+                           (ResTy (MulOp QPR:$src2,
+                                         (ResTy (NEONvduplane (OpTy DPR_8:$src3),
+                                                              imm:$lane)))))))]>;
+
+// Multiply-Add/Sub operations, scalar single-precision
+// Pattern-less DPR_VFP2 form; N3VDMulOpsPat matches the scalar f32
+// fused sequence OpNode(acc, MulNode(a, b)) by inserting all three S
+// operands into lane 0 of D registers, running Inst, and extracting
+// lane 0 of the result.
+class N3VDMulOps<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+                 InstrItinClass itin, string OpcodeStr, string Dt,
+                 ValueType Ty, SDNode MulOp, SDNode OpNode>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR_VFP2:$dst),
+        (ins DPR_VFP2:$src1, DPR_VFP2:$src2, DPR_VFP2:$src3), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst", []>;
+
+class N3VDMulOpsPat<SDNode MulNode, SDNode OpNode, NeonI Inst>
+  : NEONFPPat<(f32 (OpNode SPR:$acc, (f32 (MulNode SPR:$a, SPR:$b)))),
+      (EXTRACT_SUBREG
+          (Inst (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$acc, arm_ssubreg_0),
+                (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$a,   arm_ssubreg_0),
+                (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$b,   arm_ssubreg_0)),
+       arm_ssubreg_0)>;
+
+// Neon 3-argument intrinsics, both double- and quad-register.
+// The destination register is also used as the first source operand register.
+// dst = IntOp(src1, src2, src3) with $src1 tied to $dst.
+class N3VDInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src1),
+                                      (OpTy DPR:$src2), (OpTy DPR:$src3))))]>;
+class N3VQInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 1, op4,
+        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src1),
+                                      (OpTy QPR:$src2), (OpTy QPR:$src3))))]>;
+
+// Neon Long 3-argument intrinsic.  The destination register is
+// a quad-register and is also used as the first source operand register.
+// Long 3-argument intrinsic: Q-register accumulator (tied to $dst), two
+// D-register sources that are widened to the Q element type.
+class N3VLInt3<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+               InstrItinClass itin, string OpcodeStr, string Dt,
+               ValueType TyQ, ValueType TyD, Intrinsic IntOp>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$dst), (ins QPR:$src1, DPR:$src2, DPR:$src3), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3", "$src1 = $dst",
+        [(set QPR:$dst,
+          (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2), (TyD DPR:$src3))))]>;
+// Long 3-argument intrinsic with the third operand being a single element
+// ($src3[$lane]) splatted via NEONvduplane; $src3 comes from the restricted
+// DPR_VFP2 class -- presumably so the register number fits the scalar
+// encoding, TODO confirm against the ARM ARM.
+class N3VLInt3SL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst),
+        (ins QPR:$src1, DPR:$src2, DPR_VFP2:$src3, nohash_imm:$lane), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+        [(set (ResTy QPR:$dst),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$src2),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$src3),
+                                                imm:$lane)))))]>;
+// Same as N3VLInt3SL but the scalar operand uses the even more restricted
+// DPR_8 class (used by the 16-bit-element variants).
+class N3VLInt3SL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                   string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                   Intrinsic IntOp>
+  : N3V<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst),
+        (ins QPR:$src1, DPR:$src2, DPR_8:$src3, nohash_imm:$lane), itin,
+        OpcodeStr, Dt, "$dst, $src2, $src3[$lane]", "$src1 = $dst",
+        [(set (ResTy QPR:$dst),
+              (ResTy (IntOp (ResTy QPR:$src1),
+                            (OpTy DPR:$src2),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$src3),
+                                                imm:$lane)))))]>;
+
+
+// Narrowing 3-register intrinsics.
+// Narrowing: two Q-register sources produce a D-register result.
+class N3VNInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              string OpcodeStr, string Dt, ValueType TyD, ValueType TyQ,
+              Intrinsic IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs DPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VBINi4D,
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set DPR:$dst, (TyD (IntOp (TyQ QPR:$src1), (TyQ QPR:$src2))))]> {
+  // Commutativity is decided per instantiation.
+  let isCommutable = Commutable;
+}
+
+// Long 3-register intrinsics.
+// Long: two D-register sources produce a Q-register (widened) result.
+class N3VLInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType TyQ, ValueType TyD, Intrinsic IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$dst), (ins DPR:$src1, DPR:$src2), itin,
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set QPR:$dst, (TyQ (IntOp (TyD DPR:$src1), (TyD DPR:$src2))))]> {
+  // Commutativity is decided per instantiation.
+  let isCommutable = Commutable;
+}
+// Long intrinsic with the second operand a single element $src2[$lane],
+// splatted via NEONvduplane; $src2 uses the restricted DPR_VFP2 class.
+class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N3V<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst), (ins DPR:$src1, DPR_VFP2:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (ResTy QPR:$dst),
+              (ResTy (IntOp (OpTy DPR:$src1),
+                            (OpTy (NEONvduplane (OpTy DPR_VFP2:$src2),
+                                                imm:$lane)))))]>;
+// Same as N3VLIntSL but the scalar operand uses the DPR_8 class
+// (16-bit-element variants).
+class N3VLIntSL16<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
+                  string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                  Intrinsic IntOp>
+  : N3V<op24, 1, op21_20, op11_8, 1, 0,
+        (outs QPR:$dst), (ins DPR:$src1, DPR_8:$src2, nohash_imm:$lane),
+        itin, OpcodeStr, Dt, "$dst, $src1, $src2[$lane]", "",
+        [(set (ResTy QPR:$dst),
+              (ResTy (IntOp (OpTy DPR:$src1),
+                            (OpTy (NEONvduplane (OpTy DPR_8:$src2),
+                                                imm:$lane)))))]>;
+
+// Wide 3-register intrinsics.
+// Wide: a Q-register source plus a D-register source produce a Q result.
+class N3VWInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
+              string OpcodeStr, string Dt, ValueType TyQ, ValueType TyD,
+              Intrinsic IntOp, bit Commutable>
+  : N3V<op24, op23, op21_20, op11_8, 0, op4,
+        (outs QPR:$dst), (ins QPR:$src1, DPR:$src2), IIC_VSUBiD,
+        OpcodeStr, Dt, "$dst, $src1, $src2", "",
+        [(set QPR:$dst, (TyQ (IntOp (TyQ QPR:$src1), (TyD DPR:$src2))))]> {
+  // Commutativity is decided per instantiation.
+  let isCommutable = Commutable;
+}
+
+// Pairwise long 2-register intrinsics, both double- and quad-register.
+// 64-bit (D register) pairwise-long intrinsic, single source operand.
+class N2VDPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src))))]>;
+// 128-bit (Q register) variant of the above.
+class N2VQPLInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                bits<2> op17_16, bits<5> op11_7, bit op4,
+                string OpcodeStr, string Dt,
+                ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src))))]>;
+
+// Pairwise long 2-register accumulate intrinsics,
+// both double- and quad-register.
+// The destination register is also used as the first source operand register.
+// 64-bit (D register) pairwise-long accumulate; $src1 is tied to $dst.
+class N2VDPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 0, op4,
+        (outs DPR:$dst), (ins DPR:$src1, DPR:$src2), IIC_VPALiD,
+        OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst",
+        [(set DPR:$dst, (ResTy (IntOp (ResTy DPR:$src1), (OpTy DPR:$src2))))]>;
+// 128-bit (Q register) variant; note the matching Q itinerary IIC_VPALiQ.
+class N2VQPLInt2<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
+                 bits<2> op17_16, bits<5> op11_7, bit op4,
+                 string OpcodeStr, string Dt,
+                 ValueType ResTy, ValueType OpTy, Intrinsic IntOp>
+  : N2V<op24_23, op21_20, op19_18, op17_16, op11_7, 1, op4,
+        (outs QPR:$dst), (ins QPR:$src1, QPR:$src2), IIC_VPALiQ,
+        OpcodeStr, Dt, "$dst, $src2", "$src1 = $dst",
+        [(set QPR:$dst, (ResTy (IntOp (ResTy QPR:$src1), (OpTy QPR:$src2))))]>;
+
+// Shift by immediate,
+// both double- and quad-register.
+// 64-bit (D register) shift by immediate; $SIMM is the raw shift amount.
+class N2VDSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), itin,
+           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
+           [(set DPR:$dst, (Ty (OpNode (Ty DPR:$src), (i32 imm:$SIMM))))]>;
+// 128-bit (Q register) variant of the above.
+class N2VQSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType Ty, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
+           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
+           [(set QPR:$dst, (Ty (OpNode (Ty QPR:$src), (i32 imm:$SIMM))))]>;
+
+// Long shift by immediate.
+// Long shift by immediate: D-register source, widened Q-register result.
+class N2VLSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs QPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VSHLiD,
+           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
+           [(set QPR:$dst, (ResTy (OpNode (OpTy DPR:$src),
+                                          (i32 imm:$SIMM))))]>;
+
+// Narrow shift by immediate.
+// Narrow shift by immediate: Q-register source, narrowed D-register result.
+class N2VNSh<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6, bit op4,
+             InstrItinClass itin, string OpcodeStr, string Dt,
+             ValueType ResTy, ValueType OpTy, SDNode OpNode>
+  : N2VImm<op24, op23, op11_8, op7, op6, op4,
+           (outs DPR:$dst), (ins QPR:$src, i32imm:$SIMM), itin,
+           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
+           [(set DPR:$dst, (ResTy (OpNode (OpTy QPR:$src),
+                                          (i32 imm:$SIMM))))]>;
+
+// Shift right by immediate and accumulate,
+// both double- and quad-register.
+// 64-bit (D register) shift-right-and-accumulate: $dst = $src1 + ($src2 shift
+// $SIMM); $src1 is tied to $dst.
+class N2VDShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
+           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VPALiD,
+           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
+           [(set DPR:$dst, (Ty (add DPR:$src1,
+                                (Ty (ShOp DPR:$src2, (i32 imm:$SIMM))))))]>;
+// 128-bit (Q register) shift-right-and-accumulate: $dst = $src1 + ($src2
+// shift $SIMM); $src1 is tied to $dst. Use the quad-register accumulate
+// itinerary IIC_VPALiQ, consistent with the other D/Q pairs here (e.g.
+// N2VQPLInt2 uses IIC_VPALiQ, N2VQShIns uses IIC_VSHLiQ).
+class N2VQShAdd<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
+           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VPALiQ,
+           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
+           [(set QPR:$dst, (Ty (add QPR:$src1,
+                                (Ty (ShOp QPR:$src2, (i32 imm:$SIMM))))))]>;
+
+// Shift by immediate and insert,
+// both double- and quad-register.
+// 64-bit (D register) shift-and-insert (VSLI/VSRI style): ShOp takes the
+// tied accumulator, the source and the shift amount; $src1 is tied to $dst.
+class N2VDShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4, (outs DPR:$dst),
+           (ins DPR:$src1, DPR:$src2, i32imm:$SIMM), IIC_VSHLiD,
+           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
+           [(set DPR:$dst, (Ty (ShOp DPR:$src1, DPR:$src2, (i32 imm:$SIMM))))]>;
+// 128-bit (Q register) variant of the above.
+class N2VQShIns<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+                string OpcodeStr, string Dt, ValueType Ty, SDNode ShOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4, (outs QPR:$dst),
+           (ins QPR:$src1, QPR:$src2, i32imm:$SIMM), IIC_VSHLiQ,
+           OpcodeStr, Dt, "$dst, $src2, $SIMM", "$src1 = $dst",
+           [(set QPR:$dst, (Ty (ShOp QPR:$src1, QPR:$src2, (i32 imm:$SIMM))))]>;
+
+// Convert, with fractional bits immediate,
+// both double- and quad-register.
+// 64-bit (D register) convert with fractional-bits immediate.
+class N2VCvtD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              Intrinsic IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 0, op4,
+           (outs DPR:$dst), (ins DPR:$src, i32imm:$SIMM), IIC_VUNAD,
+           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
+           [(set DPR:$dst, (ResTy (IntOp (OpTy DPR:$src), (i32 imm:$SIMM))))]>;
+// 128-bit (Q register) variant of the above.
+class N2VCvtQ<bit op24, bit op23, bits<4> op11_8, bit op7, bit op4,
+              string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+              Intrinsic IntOp>
+  : N2VImm<op24, op23, op11_8, op7, 1, op4,
+           (outs QPR:$dst), (ins QPR:$src, i32imm:$SIMM), IIC_VUNAQ,
+           OpcodeStr, Dt, "$dst, $src, $SIMM", "",
+           [(set QPR:$dst, (ResTy (IntOp (OpTy QPR:$src), (i32 imm:$SIMM))))]>;
+
+//===----------------------------------------------------------------------===//
+// Multiclasses
+//===----------------------------------------------------------------------===//
+
+// Abbreviations used in multiclass suffixes:
+//   Q = quarter int (8 bit) elements
+//   H = half int (16 bit) elements
+//   S = single int (32 bit) elements
+//   D = double int (64 bit) elements
+
+// Neon 3-register vector operations.
+
+// First with only element sizes of 8, 16 and 32 bits:
+// Instantiates the D and Q forms for 8/16/32-bit elements; the 0b00/0b01/0b10
+// values of op21_20 select the element size in the encoding.
+multiclass N3V_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                   InstrItinClass itinD16, InstrItinClass itinD32,
+                   InstrItinClass itinQ16, InstrItinClass itinQ32,
+                   string OpcodeStr, string Dt,
+                   SDNode OpNode, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v8i8  : N3VD<op24, op23, 0b00, op11_8, op4, itinD16,
+                   OpcodeStr, !strconcat(Dt, "8"),
+                   v8i8, v8i8, OpNode, Commutable>;
+  def v4i16 : N3VD<op24, op23, 0b01, op11_8, op4, itinD16,
+                 OpcodeStr, !strconcat(Dt, "16"),
+                 v4i16, v4i16, OpNode, Commutable>;
+  def v2i32 : N3VD<op24, op23, 0b10, op11_8, op4, itinD32,
+                 OpcodeStr, !strconcat(Dt, "32"),
+                 v2i32, v2i32, OpNode, Commutable>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQ<op24, op23, 0b00, op11_8, op4, itinQ16,
+                  OpcodeStr, !strconcat(Dt, "8"),
+                  v16i8, v16i8, OpNode, Commutable>;
+  def v8i16 : N3VQ<op24, op23, 0b01, op11_8, op4, itinQ16,
+                 OpcodeStr, !strconcat(Dt, "16"),
+                 v8i16, v8i16, OpNode, Commutable>;
+  def v4i32 : N3VQ<op24, op23, 0b10, op11_8, op4, itinQ32,
+                 OpcodeStr, !strconcat(Dt, "32"),
+                 v4i32, v4i32, OpNode, Commutable>;
+}
+
+// Scalar (by-lane) D and Q forms for 16- and 32-bit elements only.
+multiclass N3VSL_HS<bits<4> op11_8, string OpcodeStr, string Dt, SDNode ShOp> {
+  def v4i16 : N3VDSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"),
+                       v4i16, ShOp>;
+  def v2i32 : N3VDSL<0b10, op11_8, IIC_VMULi32D, OpcodeStr, !strconcat(Dt,"32"),
+                     v2i32, ShOp>;
+  def v8i16 : N3VQSL16<0b01, op11_8, OpcodeStr, !strconcat(Dt, "16"),
+                       v8i16, v4i16, ShOp>;
+  def v4i32 : N3VQSL<0b10, op11_8, IIC_VMULi32Q, OpcodeStr, !strconcat(Dt,"32"),
+                     v4i32, v2i32, ShOp>;
+}
+
+// ....then also with element size 64 bits:
+// Extends N3V_QHS with the 64-bit-element forms (op21_20 = 0b11); the D
+// itinerary is reused for both 16- and 32-bit slots of the base multiclass.
+multiclass N3V_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                    InstrItinClass itinD, InstrItinClass itinQ,
+                    string OpcodeStr, string Dt,
+                    SDNode OpNode, bit Commutable = 0>
+  : N3V_QHS<op24, op23, op11_8, op4, itinD, itinD, itinQ, itinQ,
+            OpcodeStr, Dt, OpNode, Commutable> {
+  def v1i64 : N3VD<op24, op23, 0b11, op11_8, op4, itinD,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v1i64, v1i64, OpNode, Commutable>;
+  def v2i64 : N3VQ<op24, op23, 0b11, op11_8, op4, itinQ,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v2i64, v2i64, OpNode, Commutable>;
+}
+
+
+// Neon Narrowing 2-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+// The Dt suffix names the SOURCE element size (16/32/64); results narrow to
+// half that width (e.g. ".16" narrows v8i16 to v8i8).
+multiclass N2VNInt_HSD<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                       bits<5> op11_7, bit op6, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       Intrinsic IntOp> {
+  def v8i8  : N2VNInt<op24_23, op21_20, 0b00, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp>;
+  def v4i16 : N2VNInt<op24_23, op21_20, 0b01, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp>;
+  def v2i32 : N2VNInt<op24_23, op21_20, 0b10, op17_16, op11_7, op6, op4,
+                      itin, OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp>;
+}
+
+
+// Neon Lengthening 2-register vector intrinsic (currently specific to VMOVL).
+//   source operand element sizes of 16, 32 and 64 bits:
+// The Dt suffix names the source element size (8/16/32); results lengthen to
+// double that width. Defs are named by the widened result type.
+multiclass N2VLInt_QHS<bits<2> op24_23, bits<5> op11_7, bit op6, bit op4,
+                       string OpcodeStr, string Dt, Intrinsic IntOp> {
+  def v8i16 : N2VLInt<op24_23, 0b00, 0b10, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
+  def v4i32 : N2VLInt<op24_23, 0b01, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i64 : N2VLInt<op24_23, 0b10, 0b00, 0b00, op11_7, op6, op4, IIC_VQUNAiD,
+                      OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+
+// Neon 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+// D and Q intrinsic forms for 16- and 32-bit elements only (no 8-bit).
+multiclass N3VInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                     InstrItinClass itinD16, InstrItinClass itinD32,
+                     InstrItinClass itinQ16, InstrItinClass itinQ32,
+                     string OpcodeStr, string Dt,
+                     Intrinsic IntOp, bit Commutable = 0> {
+  // 64-bit vector types.
+  def v4i16 : N3VDInt<op24, op23, 0b01, op11_8, op4, itinD16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i16, v4i16, IntOp, Commutable>;
+  def v2i32 : N3VDInt<op24, op23, 0b10, op11_8, op4, itinD32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i32, v2i32, IntOp, Commutable>;
+
+  // 128-bit vector types.
+  def v8i16 : N3VQInt<op24, op23, 0b01, op11_8, op4, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i16, v8i16, IntOp, Commutable>;
+  def v4i32 : N3VQInt<op24, op23, 0b10, op11_8, op4, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i32, v4i32, IntOp, Commutable>;
+}
+
+// Scalar (by-lane) intrinsic D and Q forms for 16- and 32-bit elements.
+multiclass N3VIntSL_HS<bits<4> op11_8,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt, Intrinsic IntOp> {
+  def v4i16 : N3VDIntSL16<0b01, op11_8, itinD16,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i16, IntOp>;
+  def v2i32 : N3VDIntSL<0b10, op11_8, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, IntOp>;
+  def v8i16 : N3VQIntSL16<0b01, op11_8, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, IntOp>;
+  def v4i32 : N3VQIntSL<0b10, op11_8, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+// Extends N3VInt_HS with the 8-bit-element D and Q forms (op21_20 = 0b00).
+multiclass N3VInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                      InstrItinClass itinD16, InstrItinClass itinD32,
+                      InstrItinClass itinQ16, InstrItinClass itinQ32,
+                      string OpcodeStr, string Dt,
+                      Intrinsic IntOp, bit Commutable = 0>
+  : N3VInt_HS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32,
+              OpcodeStr, Dt, IntOp, Commutable> {
+  def v8i8  : N3VDInt<op24, op23, 0b00, op11_8, op4, itinD16,
+                     OpcodeStr, !strconcat(Dt, "8"),
+                     v8i8, v8i8, IntOp, Commutable>;
+  def v16i8 : N3VQInt<op24, op23, 0b00, op11_8, op4, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v16i8, v16i8, IntOp, Commutable>;
+}
+
+// ....then also with element size of 64 bits:
+// Extends N3VInt_QHS with the 64-bit-element forms (op21_20 = 0b11); these
+// reuse the 32-bit itineraries.
+multiclass N3VInt_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itinD16, InstrItinClass itinD32,
+                       InstrItinClass itinQ16, InstrItinClass itinQ32,
+                       string OpcodeStr, string Dt,
+                       Intrinsic IntOp, bit Commutable = 0>
+  : N3VInt_QHS<op24, op23, op11_8, op4, itinD16, itinD32, itinQ16, itinQ32,
+               OpcodeStr, Dt, IntOp, Commutable> {
+  def v1i64 : N3VDInt<op24, op23, 0b11, op11_8, op4, itinD32,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v1i64, v1i64, IntOp, Commutable>;
+  def v2i64 : N3VQInt<op24, op23, 0b11, op11_8, op4, itinQ32,
+                   OpcodeStr, !strconcat(Dt, "64"),
+                   v2i64, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Narrowing 3-register vector intrinsics,
+//   source operand element sizes of 16, 32 and 64 bits:
+// Dt suffix names the SOURCE element size; results narrow to half width.
+multiclass N3VNInt_HSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       string OpcodeStr, string Dt,
+                       Intrinsic IntOp, bit Commutable = 0> {
+  def v8i8  : N3VNInt<op24, op23, 0b00, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v8i8, v8i16, IntOp, Commutable>;
+  def v4i16 : N3VNInt<op24, op23, 0b01, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v4i16, v4i32, IntOp, Commutable>;
+  def v2i32 : N3VNInt<op24, op23, 0b10, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "64"),
+                      v2i32, v2i64, IntOp, Commutable>;
+}
+
+
+// Neon Long 3-register vector intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+// Long intrinsic forms for 16- and 32-bit source elements; defs are named by
+// the widened (Q) result type.
+multiclass N3VLInt_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                      InstrItinClass itin, string OpcodeStr, string Dt,
+                      Intrinsic IntOp, bit Commutable = 0> {
+  def v4i32 : N3VLInt<op24, op23, 0b01, op11_8, op4, itin,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, IntOp, Commutable>;
+  def v2i64 : N3VLInt<op24, op23, 0b10, op11_8, op4, itin,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, IntOp, Commutable>;
+}
+
+// Long scalar (by-lane) intrinsic forms; note defs here are named by the
+// SOURCE element type (v4i16/v2i32), unlike N3VLInt_HS above.
+multiclass N3VLIntSL_HS<bit op24, bits<4> op11_8,
+                        InstrItinClass itin, string OpcodeStr, string Dt,
+                        Intrinsic IntOp> {
+  def v4i16 : N3VLIntSL16<op24, 0b01, op11_8, itin,
+                          OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLIntSL<op24, 0b10, op11_8, itin,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+// Extends N3VLInt_HS with the 8-bit-source form (op21_20 = 0b00).
+multiclass N3VLInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       InstrItinClass itin, string OpcodeStr, string Dt,
+                       Intrinsic IntOp, bit Commutable = 0>
+  : N3VLInt_HS<op24, op23, op11_8, op4, itin, OpcodeStr, Dt,
+               IntOp, Commutable> {
+  def v8i16 : N3VLInt<op24, op23, 0b00, op11_8, op4, itin,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, IntOp, Commutable>;
+}
+
+
+// Neon Wide 3-register vector intrinsics,
+//   source operand element sizes of 8, 16 and 32 bits:
+// Wide intrinsic forms; Dt suffix names the narrow (D) source element size.
+multiclass N3VWInt_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       string OpcodeStr, string Dt,
+                       Intrinsic IntOp, bit Commutable = 0> {
+  def v8i16 : N3VWInt<op24, op23, 0b00, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "8"),
+                      v8i16, v8i8, IntOp, Commutable>;
+  def v4i32 : N3VWInt<op24, op23, 0b01, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "16"),
+                      v4i32, v4i16, IntOp, Commutable>;
+  def v2i64 : N3VWInt<op24, op23, 0b10, op11_8, op4,
+                      OpcodeStr, !strconcat(Dt, "32"),
+                      v2i64, v2i32, IntOp, Commutable>;
+}
+
+
+// Neon Multiply-Op vector operations,
+//   element sizes of 8, 16 and 32 bits:
+// Multiply-op (e.g. multiply-accumulate) D and Q forms for 8/16/32-bit
+// elements; the multiply node is always the integer 'mul'.
+multiclass N3VMulOp_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        InstrItinClass itinD16, InstrItinClass itinD32,
+                        InstrItinClass itinQ16, InstrItinClass itinQ32,
+                        string OpcodeStr, string Dt, SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N3VDMulOp<op24, op23, 0b00, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, mul, OpNode>;
+  def v4i16 : N3VDMulOp<op24, op23, 0b01, op11_8, op4, itinD16,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, OpNode>;
+  def v2i32 : N3VDMulOp<op24, op23, 0b10, op11_8, op4, itinD32,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, OpNode>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQMulOp<op24, op23, 0b00, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, mul, OpNode>;
+  def v8i16 : N3VQMulOp<op24, op23, 0b01, op11_8, op4, itinQ16,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, mul, OpNode>;
+  def v4i32 : N3VQMulOp<op24, op23, 0b10, op11_8, op4, itinQ32,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, mul, OpNode>;
+}
+
+// Scalar (by-lane) multiply-op D and Q forms for 16- and 32-bit elements.
+multiclass N3VMulOpSL_HS<bits<4> op11_8,
+                         InstrItinClass itinD16, InstrItinClass itinD32,
+                         InstrItinClass itinQ16, InstrItinClass itinQ32,
+                         string OpcodeStr, string Dt, SDNode ShOp> {
+  def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16,
+                            OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>;
+  def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32,
+                          OpcodeStr, !strconcat(Dt, "32"), v2i32, mul, ShOp>;
+  def v8i16 : N3VQMulOpSL16<0b01, op11_8, itinQ16,
+                      OpcodeStr, !strconcat(Dt, "16"), v8i16, v4i16, mul, ShOp>;
+  def v4i32 : N3VQMulOpSL<0b10, op11_8, itinQ32,
+                      OpcodeStr, !strconcat(Dt, "32"), v4i32, v2i32, mul, ShOp>;
+}
+
+// Neon 3-argument intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+// 3-argument intrinsic D and Q forms; itineraries are fixed here (16-bit MAC
+// for the 8/16-bit elements, 32-bit MAC for the 32-bit elements).
+multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       string OpcodeStr, string Dt, Intrinsic IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N3VDInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>;
+  def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16Q,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>;
+  def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16Q,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
+  def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32Q,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
+}
+
+
+// Neon Long 3-argument intrinsics.
+
+// First with only element sizes of 16 and 32 bits:
+// Long 3-argument intrinsic forms for 16- and 32-bit source elements; defs
+// are named by the widened (Q) result type.
+multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                       string OpcodeStr, string Dt, Intrinsic IntOp> {
+  def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D,
+                       OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>;
+  // 32-bit elements use the 32-bit MAC itinerary, matching the by-lane
+  // multiclass N3VLInt3SL_HS (the original had IIC_VMACi16D here).
+  def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D,
+                       OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// Long 3-argument scalar (by-lane) intrinsic forms; defs are named by the
+// SOURCE element type, and itineraries track the element size (i16D/i32D).
+multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8,
+                         string OpcodeStr, string Dt, Intrinsic IntOp> {
+  def v4i16 : N3VLInt3SL16<op24, 0b01, op11_8, IIC_VMACi16D,
+                           OpcodeStr, !strconcat(Dt,"16"), v4i32, v4i16, IntOp>;
+  def v2i32 : N3VLInt3SL<op24, 0b10, op11_8, IIC_VMACi32D,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>;
+}
+
+// ....then also with element size of 8 bits:
+// Extends N3VLInt3_HS with the 8-bit-source form (op21_20 = 0b00).
+multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4,
+                        string OpcodeStr, string Dt, Intrinsic IntOp>
+  : N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, Dt, IntOp> {
+  def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D,
+                       OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>;
+}
+
+
+// Neon 2-register vector intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+// 2-register intrinsic D and Q forms for 8/16/32-bit elements; op19_18
+// (0b00/0b01/0b10) selects the element size in the encoding.
+multiclass N2VInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                      bits<5> op11_7, bit op4,
+                      InstrItinClass itinD, InstrItinClass itinQ,
+                      string OpcodeStr, string Dt, Intrinsic IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                      itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>;
+  def v4i16 : N2VDInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                   itinD, OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>;
+  def v2i32 : N2VDInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                   itinD, OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                    itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>;
+  def v8i16 : N2VQInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                   itinQ, OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>;
+  def v4i32 : N2VQInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                   itinQ, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+// Pairwise-long intrinsic forms: each def's result type has half the lanes
+// at double the width of its source (e.g. v8i8 -> v4i16).
+multiclass N2VPLInt_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                        bits<5> op11_7, bit op4,
+                        string OpcodeStr, string Dt, Intrinsic IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon Pairwise long 2-register accumulate intrinsics,
+//   element sizes of 8, 16 and 32 bits:
+// Pairwise-long accumulate intrinsic forms: same half-lanes/double-width
+// result shape as N2VPLInt_QHS, with the accumulator tied to $dst.
+multiclass N2VPLInt2_QHS<bits<2> op24_23, bits<2> op21_20, bits<2> op17_16,
+                         bits<5> op11_7, bit op4,
+                         string OpcodeStr, string Dt, Intrinsic IntOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v4i16, v8i8, IntOp>;
+  def v4i16 : N2VDPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v2i32, v4i16, IntOp>;
+  def v2i32 : N2VDPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v1i64, v2i32, IntOp>;
+
+  // 128-bit vector types.
+  def v16i8 : N2VQPLInt2<op24_23, op21_20, 0b00, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "8"), v8i16, v16i8, IntOp>;
+  def v8i16 : N2VQPLInt2<op24_23, op21_20, 0b01, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "16"), v4i32, v8i16, IntOp>;
+  def v4i32 : N2VQPLInt2<op24_23, op21_20, 0b10, op17_16, op11_7, op4,
+                         OpcodeStr, !strconcat(Dt, "32"), v2i64, v4i32, IntOp>;
+}
+
+
+// Neon 2-register vector shift by immediate,
+//   element sizes of 8, 16, 32 and 64 bits:
+// Shift-by-immediate forms for 8/16/32/64-bit elements. The element size is
+// encoded in the leading bits of the imm6 field (Inst{21-19}) rather than a
+// separate size field, hence the per-def 'let Inst' overrides below; the
+// 64-bit forms set op7 (the L bit) to 1 and leave all of imm6 for the shift.
+multiclass N2VSh_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                      InstrItinClass itin, string OpcodeStr, string Dt,
+                      SDNode OpNode> {
+  // 64-bit vector types.
+  def v8i8  : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "8"), v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "16"), v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDSh<op24, op23, op11_8, 0, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "32"), v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDSh<op24, op23, op11_8, 1, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "64"), v1i64, OpNode>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "8"), v16i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "16"), v8i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQSh<op24, op23, op11_8, 0, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQSh<op24, op23, op11_8, 1, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i64, OpNode>;
+                             // imm6 = xxxxxx
+}
+
+
+// Neon Shift-Accumulate vector operations,
+//   element sizes of 8, 16, 32 and 64 bits:
+// Element size is distinguished by fixed high bits of the imm6 field,
+// the same scheme used by N2VSh_QHSD above.
+multiclass N2VShAdd_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         string OpcodeStr, string Dt, SDNode ShOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShAdd<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShAdd<op24, op23, op11_8, 1, op4,
+                        OpcodeStr, !strconcat(Dt, "64"), v1i64, ShOp>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, !strconcat(Dt, "8"), v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, !strconcat(Dt, "16"), v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShAdd<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, !strconcat(Dt, "32"), v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShAdd<op24, op23, op11_8, 1, op4,
+                        OpcodeStr, !strconcat(Dt, "64"), v2i64, ShOp>;
+                             // imm6 = xxxxxx
+}
+
+
+// Neon Shift-Insert vector operations,
+//   element sizes of 8, 16, 32 and 64 bits:
+// Note: unlike the other shift multiclasses, this one takes no Dt prefix;
+// the size suffix strings ("8"/"16"/"32"/"64") are passed directly.
+// Element size is distinguished by fixed high bits of imm6, as in
+// N2VSh_QHSD above.
+multiclass N2VShIns_QHSD<bit op24, bit op23, bits<4> op11_8, bit op4,
+                         string OpcodeStr, SDNode ShOp> {
+  // 64-bit vector types.
+  def v8i8  : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, "8", v8i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, "16", v4i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VDShIns<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, "32", v2i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v1i64 : N2VDShIns<op24, op23, op11_8, 1, op4,
+                        OpcodeStr, "64", v1i64, ShOp>;
+                             // imm6 = xxxxxx
+
+  // 128-bit vector types.
+  def v16i8 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, "8", v16i8, ShOp> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v8i16 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, "16", v8i16, ShOp> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v4i32 : N2VQShIns<op24, op23, op11_8, 0, op4,
+                        OpcodeStr, "32", v4i32, ShOp> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+  def v2i64 : N2VQShIns<op24, op23, op11_8, 1, op4,
+                        OpcodeStr, "64", v2i64, ShOp>;
+                             // imm6 = xxxxxx
+}
+
+// Neon Shift Long operations,
+//   element sizes of 8, 16, 32 bits:
+// Each def widens: the result elements are twice the width of the source
+// elements (v8i8 -> v8i16, v4i16 -> v4i32, v2i32 -> v2i64).
+multiclass N2VLSh_QHS<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, string OpcodeStr, string Dt, SDNode OpNode> {
+  def v8i16 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                 OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i32 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                  OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i64 : N2VLSh<op24, op23, op11_8, op7, op6, op4,
+                  OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+// Neon Shift Narrow operations,
+//   element sizes of 16, 32, 64 bits:
+// Each def narrows: the result elements are half the width of the source
+// elements (v8i16 -> v8i8, v4i32 -> v4i16, v2i64 -> v2i32), and the Dt
+// suffix names the *source* element size.
+multiclass N2VNSh_HSD<bit op24, bit op23, bits<4> op11_8, bit op7, bit op6,
+                      bit op4, InstrItinClass itin, string OpcodeStr, string Dt,
+                      SDNode OpNode> {
+  def v8i8 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                    OpcodeStr, !strconcat(Dt, "16"), v8i8, v8i16, OpNode> {
+    let Inst{21-19} = 0b001; // imm6 = 001xxx
+  }
+  def v4i16 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "32"), v4i16, v4i32, OpNode> {
+    let Inst{21-20} = 0b01;  // imm6 = 01xxxx
+  }
+  def v2i32 : N2VNSh<op24, op23, op11_8, op7, op6, op4, itin,
+                     OpcodeStr, !strconcat(Dt, "64"), v2i32, v2i64, OpNode> {
+    let Inst{21} = 0b1;      // imm6 = 1xxxxx
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+//===----------------------------------------------------------------------===//
+
+// Vector Add Operations.
+// NOTE(review): the trailing 0/1 argument on these multiclasses appears to
+// be a commutability flag (it is 1 for add/halving-add and 0 for the wide
+// forms) — confirm against the N3V*/N3VInt* class definitions.
+
+//   VADD     : Vector Add (integer and floating-point)
+defm VADD     : N3V_QHSD<0, 0, 0b1000, 0, IIC_VBINiD, IIC_VBINiQ, "vadd", "i",
+                         add, 1>;
+def  VADDfd   : N3VD<0, 0, 0b00, 0b1101, 0, IIC_VBIND, "vadd", "f32",
+                     v2f32, v2f32, fadd, 1>;
+def  VADDfq   : N3VQ<0, 0, 0b00, 0b1101, 0, IIC_VBINQ, "vadd", "f32",
+                     v4f32, v4f32, fadd, 1>;
+//   VADDL    : Vector Add Long (Q = D + D)
+defm VADDLs   : N3VLInt_QHS<0,1,0b0000,0, IIC_VSHLiD, "vaddl", "s",
+                            int_arm_neon_vaddls, 1>;
+defm VADDLu   : N3VLInt_QHS<1,1,0b0000,0, IIC_VSHLiD, "vaddl", "u",
+                            int_arm_neon_vaddlu, 1>;
+//   VADDW    : Vector Add Wide (Q = Q + D)
+defm VADDWs   : N3VWInt_QHS<0,1,0b0001,0, "vaddw", "s", int_arm_neon_vaddws, 0>;
+defm VADDWu   : N3VWInt_QHS<1,1,0b0001,0, "vaddw", "u", int_arm_neon_vaddwu, 0>;
+//   VHADD    : Vector Halving Add
+defm VHADDs   : N3VInt_QHS<0,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vhadd", "s", int_arm_neon_vhadds, 1>;
+defm VHADDu   : N3VInt_QHS<1,0,0b0000,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vhadd", "u", int_arm_neon_vhaddu, 1>;
+//   VRHADD   : Vector Rounding Halving Add
+defm VRHADDs  : N3VInt_QHS<0,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vrhadd", "s", int_arm_neon_vrhadds, 1>;
+defm VRHADDu  : N3VInt_QHS<1,0,0b0001,0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vrhadd", "u", int_arm_neon_vrhaddu, 1>;
+//   VQADD    : Vector Saturating Add
+defm VQADDs   : N3VInt_QHSD<0,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                            IIC_VBINi4Q, "vqadd", "s", int_arm_neon_vqadds, 1>;
+defm VQADDu   : N3VInt_QHSD<1,0,0b0000,1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                            IIC_VBINi4Q, "vqadd", "u", int_arm_neon_vqaddu, 1>;
+//   VADDHN   : Vector Add and Narrow Returning High Half (D = Q + Q)
+defm VADDHN   : N3VNInt_HSD<0,1,0b0100,0, "vaddhn", "i",
+                            int_arm_neon_vaddhn, 1>;
+//   VRADDHN  : Vector Rounding Add and Narrow Returning High Half (D = Q + Q)
+defm VRADDHN  : N3VNInt_HSD<1,1,0b0100,0, "vraddhn", "i",
+                            int_arm_neon_vraddhn, 1>;
+
+// Vector Multiply Operations.
+
+//   VMUL     : Vector Multiply (integer, polynomial and floating-point)
+defm VMUL     : N3V_QHS<0, 0, 0b1001, 1, IIC_VMULi16D, IIC_VMULi32D,
+                        IIC_VMULi16Q, IIC_VMULi32Q, "vmul", "i", mul, 1>;
+def  VMULpd   : N3VDInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16D, "vmul", "p8",
+                        v8i8, v8i8, int_arm_neon_vmulp, 1>;
+def  VMULpq   : N3VQInt<1, 0, 0b00, 0b1001, 1, IIC_VMULi16Q, "vmul", "p8",
+                        v16i8, v16i8, int_arm_neon_vmulp, 1>;
+def  VMULfd   : N3VD<1, 0, 0b00, 0b1101, 1, IIC_VBIND, "vmul", "f32",
+                        v2f32, v2f32, fmul, 1>;
+def  VMULfq   : N3VQ<1, 0, 0b00, 0b1101, 1, IIC_VBINQ, "vmul", "f32",
+                        v4f32, v4f32, fmul, 1>;
+defm VMULsl  : N3VSL_HS<0b1000, "vmul", "i", mul>;
+def VMULslfd : N3VDSL<0b10, 0b1001, IIC_VBIND, "vmul", "f32", v2f32, fmul>;
+def VMULslfq : N3VQSL<0b10, 0b1001, IIC_VBINQ, "vmul", "f32", v4f32, v2f32, fmul>;
+// Fold a multiply by a lane duplicated from a Q register into the
+// lane-indexed multiply: the scalar operand is extracted as the D
+// subregister holding the lane, and the lane index is renumbered
+// relative to that subregister.
+def : Pat<(v8i16 (mul (v8i16 QPR:$src1),
+                      (v8i16 (NEONvduplane (v8i16 QPR:$src2), imm:$lane)))),
+          (v8i16 (VMULslv8i16 (v8i16 QPR:$src1),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                                     (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (mul (v4i32 QPR:$src1),
+                      (v4i32 (NEONvduplane (v4i32 QPR:$src2), imm:$lane)))),
+          (v4i32 (VMULslv4i32 (v4i32 QPR:$src1),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                                     (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
+                       (v4f32 (NEONvduplane (v4f32 QPR:$src2), imm:$lane)))),
+          (v4f32 (VMULslfq (v4f32 QPR:$src1),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src2,
+                                                  (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+//   VQDMULH  : Vector Saturating Doubling Multiply Returning High Half
+defm VQDMULH  : N3VInt_HS<0, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+                          IIC_VMULi16Q, IIC_VMULi32Q, 
+                          "vqdmulh", "s", int_arm_neon_vqdmulh, 1>;
+defm VQDMULHsl: N3VIntSL_HS<0b1100, IIC_VMULi16D, IIC_VMULi32D,
+                            IIC_VMULi16Q, IIC_VMULi32Q,
+                            "vqdmulh", "s",  int_arm_neon_vqdmulh>;
+// Fold vqdmulh by a duplicated Q-register lane into the lane-indexed form
+// (same subregister-extraction scheme as the VMUL lane patterns above).
+def : Pat<(v8i16 (int_arm_neon_vqdmulh (v8i16 QPR:$src1),
+                                       (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                            imm:$lane)))),
+          (v8i16 (VQDMULHslv8i16 (v8i16 QPR:$src1),
+                                 (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                                  (DSubReg_i16_reg imm:$lane))),
+                                 (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqdmulh (v4i32 QPR:$src1),
+                                       (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                            imm:$lane)))),
+          (v4i32 (VQDMULHslv4i32 (v4i32 QPR:$src1),
+                                 (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                                  (DSubReg_i32_reg imm:$lane))),
+                                 (SubReg_i32_lane imm:$lane)))>;
+
+//   VQRDMULH : Vector Rounding Saturating Doubling Multiply Returning High Half
+defm VQRDMULH   : N3VInt_HS<1, 0, 0b1011, 0, IIC_VMULi16D, IIC_VMULi32D,
+                            IIC_VMULi16Q, IIC_VMULi32Q,
+                            "vqrdmulh", "s", int_arm_neon_vqrdmulh, 1>;
+defm VQRDMULHsl : N3VIntSL_HS<0b1101, IIC_VMULi16D, IIC_VMULi32D,
+                              IIC_VMULi16Q, IIC_VMULi32Q,
+                              "vqrdmulh", "s",  int_arm_neon_vqrdmulh>;
+// Fold vqrdmulh by a duplicated Q-register lane into the lane-indexed form
+// (same subregister-extraction scheme as the VMUL lane patterns above).
+def : Pat<(v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$src1),
+                                        (v8i16 (NEONvduplane (v8i16 QPR:$src2),
+                                                             imm:$lane)))),
+          (v8i16 (VQRDMULHslv8i16 (v8i16 QPR:$src1),
+                                  (v4i16 (EXTRACT_SUBREG QPR:$src2,
+                                                         (DSubReg_i16_reg imm:$lane))),
+                                  (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
+                                        (v4i32 (NEONvduplane (v4i32 QPR:$src2),
+                                                             imm:$lane)))),
+          (v4i32 (VQRDMULHslv4i32 (v4i32 QPR:$src1),
+                                  (v2i32 (EXTRACT_SUBREG QPR:$src2,
+                                                  (DSubReg_i32_reg imm:$lane))),
+                                  (SubReg_i32_lane imm:$lane)))>;
+
+//   VMULL    : Vector Multiply Long (integer and polynomial) (Q = D * D)
+defm VMULLs   : N3VLInt_QHS<0,1,0b1100,0, IIC_VMULi16D, "vmull", "s",
+                            int_arm_neon_vmulls, 1>;
+defm VMULLu   : N3VLInt_QHS<1,1,0b1100,0, IIC_VMULi16D, "vmull", "u",
+                            int_arm_neon_vmullu, 1>;
+// Polynomial multiply long is only defined for 8-bit elements (p8).
+def  VMULLp   : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
+                        v8i16, v8i8, int_arm_neon_vmullp, 1>;
+defm VMULLsls : N3VLIntSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s",
+                             int_arm_neon_vmulls>;
+defm VMULLslu : N3VLIntSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u",
+                             int_arm_neon_vmullu>;
+
+//   VQDMULL  : Vector Saturating Doubling Multiply Long (Q = D * D)
+defm VQDMULL  : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, "vqdmull", "s",
+                           int_arm_neon_vqdmull, 1>;
+defm VQDMULLsl: N3VLIntSL_HS<0, 0b1011, IIC_VMULi16D, "vqdmull", "s",
+                             int_arm_neon_vqdmull>;
+
+// Vector Multiply-Accumulate and Multiply-Subtract Operations.
+
+//   VMLA     : Vector Multiply Accumulate (integer and floating-point)
+defm VMLA     : N3VMulOp_QHS<0, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAfd   : N3VDMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32",
+                          v2f32, fmul, fadd>;
+def  VMLAfq   : N3VQMulOp<0, 0, 0b00, 0b1101, 1, IIC_VMACQ, "vmla", "f32",
+                          v4f32, fmul, fadd>;
+defm VMLAsl   : N3VMulOpSL_HS<0b0000, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmla", "i", add>;
+def  VMLAslfd : N3VDMulOpSL<0b10, 0b0001, IIC_VMACD, "vmla", "f32",
+                            v2f32, fmul, fadd>;
+def  VMLAslfq : N3VQMulOpSL<0b10, 0b0001, IIC_VMACQ, "vmla", "f32",
+                            v4f32, v2f32, fmul, fadd>;
+
+// Fold multiply-accumulate by a duplicated Q-register lane into the
+// lane-indexed VMLA (same subregister-extraction scheme as VMUL above).
+def : Pat<(v8i16 (add (v8i16 QPR:$src1),
+                      (mul (v8i16 QPR:$src2),
+                           (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLAslv8i16 (v8i16 QPR:$src1),
+                              (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                                     (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (add (v4i32 QPR:$src1),
+                      (mul (v4i32 QPR:$src2),
+                           (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLAslv4i32 (v4i32 QPR:$src1),
+                              (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                                  (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fadd (v4f32 QPR:$src1),
+                       (fmul (v4f32 QPR:$src2),
+                             (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLAslfq (v4f32 QPR:$src1),
+                           (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                                  (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+//   VMLAL    : Vector Multiply Accumulate Long (Q += D * D)
+defm VMLALs   : N3VLInt3_QHS<0,1,0b1000,0, "vmlal", "s", int_arm_neon_vmlals>;
+defm VMLALu   : N3VLInt3_QHS<1,1,0b1000,0, "vmlal", "u", int_arm_neon_vmlalu>;
+
+// Lane-indexed (scalar) forms of VMLAL.
+defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>;
+defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>;
+
+//   VQDMLAL  : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D)
+defm VQDMLAL  : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal", "s",
+                            int_arm_neon_vqdmlal>;
+defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>;
+
+//   VMLS     : Vector Multiply Subtract (integer and floating-point)
+defm VMLS     : N3VMulOp_QHS<1, 0, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D,
+                             IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSfd   : N3VDMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32",
+                          v2f32, fmul, fsub>;
+def  VMLSfq   : N3VQMulOp<0, 0, 0b10, 0b1101, 1, IIC_VMACQ, "vmls", "f32",
+                          v4f32, fmul, fsub>;
+defm VMLSsl   : N3VMulOpSL_HS<0b0100, IIC_VMACi16D, IIC_VMACi32D,
+                              IIC_VMACi16Q, IIC_VMACi32Q, "vmls", "i", sub>;
+def  VMLSslfd : N3VDMulOpSL<0b10, 0b0101, IIC_VMACD, "vmls", "f32",
+                            v2f32, fmul, fsub>;
+def  VMLSslfq : N3VQMulOpSL<0b10, 0b0101, IIC_VMACQ, "vmls", "f32",
+                            v4f32, v2f32, fmul, fsub>;
+
+// Fold multiply-subtract by a duplicated Q-register lane into the
+// lane-indexed VMLS (same subregister-extraction scheme as VMUL above).
+def : Pat<(v8i16 (sub (v8i16 QPR:$src1),
+                      (mul (v8i16 QPR:$src2),
+                           (v8i16 (NEONvduplane (v8i16 QPR:$src3), imm:$lane))))),
+          (v8i16 (VMLSslv8i16 (v8i16 QPR:$src1),
+                              (v8i16 QPR:$src2),
+                              (v4i16 (EXTRACT_SUBREG QPR:$src3,
+                                                     (DSubReg_i16_reg imm:$lane))),
+                              (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4i32 (sub (v4i32 QPR:$src1),
+                      (mul (v4i32 QPR:$src2),
+                         (v4i32 (NEONvduplane (v4i32 QPR:$src3), imm:$lane))))),
+          (v4i32 (VMLSslv4i32 (v4i32 QPR:$src1),
+                              (v4i32 QPR:$src2),
+                              (v2i32 (EXTRACT_SUBREG QPR:$src3,
+                                                     (DSubReg_i32_reg imm:$lane))),
+                              (SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v4f32 (fsub (v4f32 QPR:$src1),
+                       (fmul (v4f32 QPR:$src2),
+                           (v4f32 (NEONvduplane (v4f32 QPR:$src3), imm:$lane))))),
+          (v4f32 (VMLSslfq (v4f32 QPR:$src1),
+                           (v4f32 QPR:$src2),
+                           (v2f32 (EXTRACT_SUBREG QPR:$src3,
+                                                  (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+//   VMLSL    : Vector Multiply Subtract Long (Q -= D * D)
+defm VMLSLs   : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl", "s", int_arm_neon_vmlsls>;
+defm VMLSLu   : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl", "u", int_arm_neon_vmlslu>;
+
+// Lane-indexed (scalar) forms of VMLSL.
+defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>;
+defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>;
+
+//   VQDMLSL  : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
+defm VQDMLSL  : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl", "s",
+                            int_arm_neon_vqdmlsl>;
+// Spell the bits<4> op11_8 operand as a full 4-bit literal, matching the
+// sibling lane defs (VQDMLALsl 0b0011, VMLSLsls 0b0110).  TableGen
+// zero-extends the previous 0b111 to 0b0111, so the encoding is unchanged.
+defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b0111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>;
+
+// Vector Subtract Operations.
+
+//   VSUB     : Vector Subtract (integer and floating-point)
+defm VSUB     : N3V_QHSD<1, 0, 0b1000, 0, IIC_VSUBiD, IIC_VSUBiQ,
+                         "vsub", "i", sub, 0>;
+def  VSUBfd   : N3VD<0, 0, 0b10, 0b1101, 0, IIC_VBIND, "vsub", "f32",
+                     v2f32, v2f32, fsub, 0>;
+def  VSUBfq   : N3VQ<0, 0, 0b10, 0b1101, 0, IIC_VBINQ, "vsub", "f32",
+                     v4f32, v4f32, fsub, 0>;
+//   VSUBL    : Vector Subtract Long (Q = D - D)
+defm VSUBLs   : N3VLInt_QHS<0,1,0b0010,0, IIC_VSHLiD, "vsubl", "s",
+                            int_arm_neon_vsubls, 1>;
+defm VSUBLu   : N3VLInt_QHS<1,1,0b0010,0, IIC_VSHLiD, "vsubl", "u",
+                            int_arm_neon_vsublu, 1>;
+//   VSUBW    : Vector Subtract Wide (Q = Q - D)
+defm VSUBWs   : N3VWInt_QHS<0,1,0b0011,0, "vsubw", "s", int_arm_neon_vsubws, 0>;
+defm VSUBWu   : N3VWInt_QHS<1,1,0b0011,0, "vsubw", "u", int_arm_neon_vsubwu, 0>;
+//   VHSUB    : Vector Halving Subtract
+defm VHSUBs   : N3VInt_QHS<0, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D,
+                           IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhsub", "s", int_arm_neon_vhsubs, 0>;
+defm VHSUBu   : N3VInt_QHS<1, 0, 0b0010, 0, IIC_VBINi4D, IIC_VBINi4D,
+                           IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vhsub", "u", int_arm_neon_vhsubu, 0>;
+//   VQSUB    : Vector Saturating Subtract
+defm VQSUBs   : N3VInt_QHSD<0, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D,
+                            IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqsub", "s", int_arm_neon_vqsubs, 0>;
+defm VQSUBu   : N3VInt_QHSD<1, 0, 0b0010, 1, IIC_VBINi4D, IIC_VBINi4D,
+                            IIC_VBINi4Q, IIC_VBINi4Q,
+                            "vqsub", "u", int_arm_neon_vqsubu, 0>;
+//   VSUBHN   : Vector Subtract and Narrow Returning High Half (D = Q - Q)
+defm VSUBHN   : N3VNInt_HSD<0,1,0b0110,0, "vsubhn", "i",
+                            int_arm_neon_vsubhn, 0>;
+//   VRSUBHN  : Vector Rounding Subtract and Narrow Returning High Half (D=Q-Q)
+defm VRSUBHN  : N3VNInt_HSD<1,1,0b0110,0, "vrsubhn", "i",
+                            int_arm_neon_vrsubhn, 0>;
+
+// Vector Comparisons.
+// The floating-point compares produce integer mask results (e.g. v2i32
+// from v2f32 operands), as reflected in the result/operand type pairs.
+
+//   VCEQ     : Vector Compare Equal
+defm VCEQ     : N3V_QHS<1, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                        IIC_VBINi4Q, "vceq", "i", NEONvceq, 1>;
+def  VCEQfd   : N3VD<0,0,0b00,0b1110,0, IIC_VBIND, "vceq", "f32", v2i32, v2f32,
+                     NEONvceq, 1>;
+def  VCEQfq   : N3VQ<0,0,0b00,0b1110,0, IIC_VBINQ, "vceq", "f32", v4i32, v4f32,
+                     NEONvceq, 1>;
+//   VCGE     : Vector Compare Greater Than or Equal
+defm VCGEs    : N3V_QHS<0, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                        IIC_VBINi4Q, "vcge", "s", NEONvcge, 0>;
+defm VCGEu    : N3V_QHS<1, 0, 0b0011, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, 
+                        IIC_VBINi4Q, "vcge", "u", NEONvcgeu, 0>;
+def  VCGEfd   : N3VD<1,0,0b00,0b1110,0, IIC_VBIND, "vcge", "f32",
+                     v2i32, v2f32, NEONvcge, 0>;
+def  VCGEfq   : N3VQ<1,0,0b00,0b1110,0, IIC_VBINQ, "vcge", "f32", v4i32, v4f32,
+                     NEONvcge, 0>;
+//   VCGT     : Vector Compare Greater Than
+defm VCGTs    : N3V_QHS<0, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, 
+                        IIC_VBINi4Q, "vcgt", "s", NEONvcgt, 0>;
+defm VCGTu    : N3V_QHS<1, 0, 0b0011, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, 
+                        IIC_VBINi4Q, "vcgt", "u", NEONvcgtu, 0>;
+def  VCGTfd   : N3VD<1,0,0b10,0b1110,0, IIC_VBIND, "vcgt", "f32", v2i32, v2f32,
+                     NEONvcgt, 0>;
+def  VCGTfq   : N3VQ<1,0,0b10,0b1110,0, IIC_VBINQ, "vcgt", "f32", v4i32, v4f32,
+                     NEONvcgt, 0>;
+//   VACGE    : Vector Absolute Compare Greater Than or Equal (aka VCAGE)
+def  VACGEd   : N3VDInt<1, 0, 0b00, 0b1110, 1, IIC_VBIND, "vacge", "f32",
+                        v2i32, v2f32, int_arm_neon_vacged, 0>;
+def  VACGEq   : N3VQInt<1, 0, 0b00, 0b1110, 1, IIC_VBINQ, "vacge", "f32",
+                        v4i32, v4f32, int_arm_neon_vacgeq, 0>;
+//   VACGT    : Vector Absolute Compare Greater Than (aka VCAGT)
+def  VACGTd   : N3VDInt<1, 0, 0b10, 0b1110, 1, IIC_VBIND, "vacgt", "f32",
+                        v2i32, v2f32, int_arm_neon_vacgtd, 0>;
+def  VACGTq   : N3VQInt<1, 0, 0b10, 0b1110, 1, IIC_VBINQ, "vacgt", "f32",
+                        v4i32, v4f32, int_arm_neon_vacgtq, 0>;
+//   VTST     : Vector Test Bits
+defm VTST     : N3V_QHS<0, 0, 0b1000, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q, 
+                        IIC_VBINi4Q, "vtst", "", NEONvtst, 1>;
+
+// Vector Bitwise Operations.
+// NOTE(review): the bitwise ops are defined only on v2i32/v4i32 here;
+// presumably other element types are handled by bitcasting elsewhere —
+// confirm against the surrounding file.
+
+//   VAND     : Vector Bitwise AND
+def  VANDd    : N3VDX<0, 0, 0b00, 0b0001, 1, IIC_VBINiD, "vand",
+                      v2i32, v2i32, and, 1>;
+def  VANDq    : N3VQX<0, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "vand",
+                      v4i32, v4i32, and, 1>;
+
+//   VEOR     : Vector Bitwise Exclusive OR
+def  VEORd    : N3VDX<1, 0, 0b00, 0b0001, 1, IIC_VBINiD, "veor",
+                      v2i32, v2i32, xor, 1>;
+def  VEORq    : N3VQX<1, 0, 0b00, 0b0001, 1, IIC_VBINiQ, "veor",
+                      v4i32, v4i32, xor, 1>;
+
+//   VORR     : Vector Bitwise OR
+def  VORRd    : N3VDX<0, 0, 0b10, 0b0001, 1, IIC_VBINiD, "vorr",
+                      v2i32, v2i32, or, 1>;
+def  VORRq    : N3VQX<0, 0, 0b10, 0b0001, 1, IIC_VBINiQ, "vorr",
+                      v4i32, v4i32, or, 1>;
+
+//   VBIC     : Vector Bitwise Bit Clear (AND NOT)
+// dst = src1 & ~src2 (vnot_conv handles the bitconverted NOT operand).
+def  VBICd    : N3VX<0, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
+                    (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+                    "vbic", "$dst, $src1, $src2", "",
+                    [(set DPR:$dst, (v2i32 (and DPR:$src1,
+                                                (vnot_conv DPR:$src2))))]>;
+def  VBICq    : N3VX<0, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
+                    (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+                    "vbic", "$dst, $src1, $src2", "",
+                    [(set QPR:$dst, (v4i32 (and QPR:$src1,
+                                                (vnot_conv QPR:$src2))))]>;
+
+//   VORN     : Vector Bitwise OR NOT
+// dst = src1 | ~src2.
+def  VORNd    : N3VX<0, 0, 0b11, 0b0001, 0, 1, (outs DPR:$dst),
+                    (ins DPR:$src1, DPR:$src2), IIC_VBINiD,
+                    "vorn", "$dst, $src1, $src2", "",
+                    [(set DPR:$dst, (v2i32 (or DPR:$src1,
+                                               (vnot_conv DPR:$src2))))]>;
+def  VORNq    : N3VX<0, 0, 0b11, 0b0001, 1, 1, (outs QPR:$dst),
+                    (ins QPR:$src1, QPR:$src2), IIC_VBINiQ,
+                    "vorn", "$dst, $src1, $src2", "",
+                    [(set QPR:$dst, (v4i32 (or QPR:$src1,
+                                               (vnot_conv QPR:$src2))))]>;
+
+//   VMVN     : Vector Bitwise NOT
+def  VMVNd    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 0, 0,
+                    (outs DPR:$dst), (ins DPR:$src), IIC_VSHLiD,
+                    "vmvn", "$dst, $src", "",
+                    [(set DPR:$dst, (v2i32 (vnot DPR:$src)))]>;
+def  VMVNq    : N2VX<0b11, 0b11, 0b00, 0b00, 0b01011, 1, 0,
+                    (outs QPR:$dst), (ins QPR:$src), IIC_VSHLiD,
+                    "vmvn", "$dst, $src", "",
+                    [(set QPR:$dst, (v4i32 (vnot QPR:$src)))]>;
+// Also select the bitconverted NOT form to the same instructions.
+def : Pat<(v2i32 (vnot_conv DPR:$src)), (VMVNd DPR:$src)>;
+def : Pat<(v4i32 (vnot_conv QPR:$src)), (VMVNq QPR:$src)>;
+
+//   VBSL     : Vector Bitwise Select
+// dst = (src2 & src1) | (src3 & ~src1); src1 is the mask and is tied to
+// dst, so the asm string only names the two selected operands.
+def  VBSLd    : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$dst),
+                    (ins DPR:$src1, DPR:$src2, DPR:$src3), IIC_VCNTiD,
+                    "vbsl", "$dst, $src2, $src3", "$src1 = $dst",
+                    [(set DPR:$dst,
+                      (v2i32 (or (and DPR:$src2, DPR:$src1),
+                                 (and DPR:$src3, (vnot_conv DPR:$src1)))))]>;
+def  VBSLq    : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$dst),
+                    (ins QPR:$src1, QPR:$src2, QPR:$src3), IIC_VCNTiQ,
+                    "vbsl", "$dst, $src2, $src3", "$src1 = $dst",
+                    [(set QPR:$dst,
+                      (v4i32 (or (and QPR:$src2, QPR:$src1),
+                                 (and QPR:$src3, (vnot_conv QPR:$src1)))))]>;
+
+//   VBIF     : Vector Bitwise Insert if False
+//              like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
+// NOTE(review): the comment above describes the intended operand roles;
+// the defs below still use the VBSL-style strings because these records
+// exist for disassembly only (see the note after VBIT).
+def  VBIFd    : N3VX<1, 0, 0b11, 0b0001, 0, 1,
+                     (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+                     IIC_VBINiD, "vbif", "$dst, $src2, $src3", "$src1 = $dst",
+                     [/* For disassembly only; pattern left blank */]>;
+def  VBIFq    : N3VX<1, 0, 0b11, 0b0001, 1, 1,
+                     (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+                     IIC_VBINiQ, "vbif", "$dst, $src2, $src3", "$src1 = $dst",
+                     [/* For disassembly only; pattern left blank */]>;
+
+//   VBIT     : Vector Bitwise Insert if True
+//              like VBSL but with: "vbit $dst, $src2, $src1", "$src3 = $dst",
+def  VBITd    : N3VX<1, 0, 0b10, 0b0001, 0, 1,
+                     (outs DPR:$dst), (ins DPR:$src1, DPR:$src2, DPR:$src3),
+                     IIC_VBINiD, "vbit", "$dst, $src2, $src3", "$src1 = $dst",
+                     [/* For disassembly only; pattern left blank */]>;
+def  VBITq    : N3VX<1, 0, 0b10, 0b0001, 1, 1,
+                     (outs QPR:$dst), (ins QPR:$src1, QPR:$src2, QPR:$src3),
+                     IIC_VBINiQ, "vbit", "$dst, $src2, $src3", "$src1 = $dst",
+                     [/* For disassembly only; pattern left blank */]>;
+
+// VBIT/VBIF are not yet implemented.  The TwoAddress pass will not go looking
+// for equivalent operations with different register constraints; it just
+// inserts copies.
+
+// Vector Absolute Differences.
+
+//   VABD     : Vector Absolute Difference
+defm VABDs    : N3VInt_QHS<0, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D,
+                           IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vabd", "s", int_arm_neon_vabds, 0>;
+defm VABDu    : N3VInt_QHS<1, 0, 0b0111, 0, IIC_VBINi4D, IIC_VBINi4D,
+                           IIC_VBINi4Q, IIC_VBINi4Q,
+                           "vabd", "u", int_arm_neon_vabdu, 0>;
+// The f32 forms reuse the signed intrinsic (int_arm_neon_vabds);
+// presumably the intrinsic is overloaded to cover the FP type — confirm.
+def  VABDfd   : N3VDInt<1, 0, 0b10, 0b1101, 0, IIC_VBIND,
+                        "vabd", "f32", v2f32, v2f32, int_arm_neon_vabds, 0>;
+def  VABDfq   : N3VQInt<1, 0, 0b10, 0b1101, 0, IIC_VBINQ,
+                        "vabd", "f32", v4f32, v4f32, int_arm_neon_vabds, 0>;
+
+//   VABDL    : Vector Absolute Difference Long (Q = | D - D |)
+defm VABDLs   : N3VLInt_QHS<0,1,0b0111,0, IIC_VBINi4Q,
+                            "vabdl", "s", int_arm_neon_vabdls, 0>;
+defm VABDLu   : N3VLInt_QHS<1,1,0b0111,0, IIC_VBINi4Q,
+                             "vabdl", "u", int_arm_neon_vabdlu, 0>;
+
+//   VABA     : Vector Absolute Difference and Accumulate
+defm VABAs    : N3VInt3_QHS<0,0,0b0111,1, "vaba", "s", int_arm_neon_vabas>;
+defm VABAu    : N3VInt3_QHS<1,0,0b0111,1, "vaba", "u", int_arm_neon_vabau>;
+
+//   VABAL    : Vector Absolute Difference and Accumulate Long (Q += | D - D |)
+defm VABALs   : N3VLInt3_QHS<0,1,0b0101,0, "vabal", "s", int_arm_neon_vabals>;
+defm VABALu   : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>;
+
+// Vector Maximum and Minimum.
+
+//   VMAX     : Vector Maximum
+// The trailing 1 flag differs from the 0 used by VABD above — presumably a
+// commutativity marker (max/min are commutative); confirm against the
+// N3VInt_QHS / N3VDInt multiclass definitions.
+defm VMAXs    : N3VInt_QHS<0, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vmax", "s", int_arm_neon_vmaxs, 1>;
+defm VMAXu    : N3VInt_QHS<1, 0, 0b0110, 0, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vmax", "u", int_arm_neon_vmaxu, 1>;
+def  VMAXfd   : N3VDInt<0, 0, 0b00, 0b1111, 0, IIC_VBIND, "vmax", "f32",
+                        v2f32, v2f32, int_arm_neon_vmaxs, 1>;
+def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, IIC_VBINQ, "vmax", "f32",
+                        v4f32, v4f32, int_arm_neon_vmaxs, 1>;
+
+//   VMIN     : Vector Minimum
+// Same encodings as VMAX except for the op4 bit (integer) / size field (FP).
+defm VMINs    : N3VInt_QHS<0, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vmin", "s", int_arm_neon_vmins, 1>;
+defm VMINu    : N3VInt_QHS<1, 0, 0b0110, 1, IIC_VBINi4D, IIC_VBINi4D, IIC_VBINi4Q,
+                           IIC_VBINi4Q, "vmin", "u", int_arm_neon_vminu, 1>;
+def  VMINfd   : N3VDInt<0, 0, 0b10, 0b1111, 0, IIC_VBIND, "vmin", "f32",
+                        v2f32, v2f32, int_arm_neon_vmins, 1>;
+def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, IIC_VBINQ, "vmin", "f32",
+                        v4f32, v4f32, int_arm_neon_vmins, 1>;
+
+// Vector Pairwise Operations.
+// Pairwise ops operate only on D registers (no Q variants below), and share
+// one intrinsic across element sizes; the 2-bit size field in each def
+// selects the element width (00 -> 8-bit, 01 -> 16-bit, 10 -> 32-bit, as
+// seen from the i8/i16/i32 suffixes paired with 0b00/0b01/0b10).
+
+//   VPADD    : Vector Pairwise Add
+def  VPADDi8  : N3VDInt<0, 0, 0b00, 0b1011, 1, IIC_VBINiD, "vpadd", "i8",
+                        v8i8, v8i8, int_arm_neon_vpadd, 0>;
+def  VPADDi16 : N3VDInt<0, 0, 0b01, 0b1011, 1, IIC_VBINiD, "vpadd", "i16",
+                        v4i16, v4i16, int_arm_neon_vpadd, 0>;
+def  VPADDi32 : N3VDInt<0, 0, 0b10, 0b1011, 1, IIC_VBINiD, "vpadd", "i32",
+                        v2i32, v2i32, int_arm_neon_vpadd, 0>;
+def  VPADDf   : N3VDInt<1, 0, 0b00, 0b1101, 0, IIC_VBIND, "vpadd", "f32",
+                        v2f32, v2f32, int_arm_neon_vpadd, 0>;
+
+//   VPADDL   : Vector Pairwise Add Long
+defm VPADDLs  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00100, 0, "vpaddl", "s",
+                             int_arm_neon_vpaddls>;
+defm VPADDLu  : N2VPLInt_QHS<0b11, 0b11, 0b00, 0b00101, 0, "vpaddl", "u",
+                             int_arm_neon_vpaddlu>;
+
+//   VPADAL   : Vector Pairwise Add and Accumulate Long
+defm VPADALs  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01100, 0, "vpadal", "s",
+                              int_arm_neon_vpadals>;
+defm VPADALu  : N2VPLInt2_QHS<0b11, 0b11, 0b00, 0b01101, 0, "vpadal", "u",
+                              int_arm_neon_vpadalu>;
+
+//   VPMAX    : Vector Pairwise Maximum
+def  VPMAXs8  : N3VDInt<0, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "s8",
+                        v8i8, v8i8, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs16 : N3VDInt<0, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "s16",
+                        v4i16, v4i16, int_arm_neon_vpmaxs, 0>;
+def  VPMAXs32 : N3VDInt<0, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "s32",
+                        v2i32, v2i32, int_arm_neon_vpmaxs, 0>;
+def  VPMAXu8  : N3VDInt<1, 0, 0b00, 0b1010, 0, IIC_VBINi4D, "vpmax", "u8",
+                        v8i8, v8i8, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu16 : N3VDInt<1, 0, 0b01, 0b1010, 0, IIC_VBINi4D, "vpmax", "u16",
+                        v4i16, v4i16, int_arm_neon_vpmaxu, 0>;
+def  VPMAXu32 : N3VDInt<1, 0, 0b10, 0b1010, 0, IIC_VBINi4D, "vpmax", "u32",
+                        v2i32, v2i32, int_arm_neon_vpmaxu, 0>;
+def  VPMAXf   : N3VDInt<1, 0, 0b00, 0b1111, 0, IIC_VBINi4D, "vpmax", "f32",
+                        v2f32, v2f32, int_arm_neon_vpmaxs, 0>;
+
+//   VPMIN    : Vector Pairwise Minimum
+// Integer VPMIN differs from VPMAX only in the op4 bit (1 vs 0); the FP
+// variant differs in the size field (0b10 vs 0b00).
+def  VPMINs8  : N3VDInt<0, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "s8",
+                        v8i8, v8i8, int_arm_neon_vpmins, 0>;
+def  VPMINs16 : N3VDInt<0, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "s16",
+                        v4i16, v4i16, int_arm_neon_vpmins, 0>;
+def  VPMINs32 : N3VDInt<0, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "s32",
+                        v2i32, v2i32, int_arm_neon_vpmins, 0>;
+def  VPMINu8  : N3VDInt<1, 0, 0b00, 0b1010, 1, IIC_VBINi4D, "vpmin", "u8",
+                        v8i8, v8i8, int_arm_neon_vpminu, 0>;
+def  VPMINu16 : N3VDInt<1, 0, 0b01, 0b1010, 1, IIC_VBINi4D, "vpmin", "u16",
+                        v4i16, v4i16, int_arm_neon_vpminu, 0>;
+def  VPMINu32 : N3VDInt<1, 0, 0b10, 0b1010, 1, IIC_VBINi4D, "vpmin", "u32",
+                        v2i32, v2i32, int_arm_neon_vpminu, 0>;
+def  VPMINf   : N3VDInt<1, 0, 0b10, 0b1111, 0, IIC_VBINi4D, "vpmin", "f32",
+                        v2f32, v2f32, int_arm_neon_vpmins, 0>;
+
+// Vector Reciprocal and Reciprocal Square Root Estimate and Step.
+// Estimate forms come in both integer ("u32") and float ("f32") flavors,
+// sharing the same intrinsic; the element types on each def pick the variant.
+// Step forms (VRECPS/VRSQRTS) are float-only three-register ops.
+
+//   VRECPE   : Vector Reciprocal Estimate
+def  VRECPEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, 
+                        IIC_VUNAD, "vrecpe", "u32",
+                        v2i32, v2i32, int_arm_neon_vrecpe>;
+def  VRECPEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01000, 0, 
+                        IIC_VUNAQ, "vrecpe", "u32",
+                        v4i32, v4i32, int_arm_neon_vrecpe>;
+def  VRECPEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAD, "vrecpe", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecpe>;
+def  VRECPEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01010, 0,
+                        IIC_VUNAQ, "vrecpe", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecpe>;
+
+//   VRECPS   : Vector Reciprocal Step
+def  VRECPSfd : N3VDInt<0, 0, 0b00, 0b1111, 1,
+                        IIC_VRECSD, "vrecps", "f32",
+                        v2f32, v2f32, int_arm_neon_vrecps, 1>;
+def  VRECPSfq : N3VQInt<0, 0, 0b00, 0b1111, 1,
+                        IIC_VRECSQ, "vrecps", "f32",
+                        v4f32, v4f32, int_arm_neon_vrecps, 1>;
+
+//   VRSQRTE  : Vector Reciprocal Square Root Estimate
+def  VRSQRTEd  : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAD, "vrsqrte", "u32",
+                         v2i32, v2i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEq  : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01001, 0,
+                         IIC_VUNAQ, "vrsqrte", "u32",
+                         v4i32, v4i32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfd : N2VDInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0,
+                         IIC_VUNAD, "vrsqrte", "f32",
+                         v2f32, v2f32, int_arm_neon_vrsqrte>;
+def  VRSQRTEfq : N2VQInt<0b11, 0b11, 0b10, 0b11, 0b01011, 0, 
+                         IIC_VUNAQ, "vrsqrte", "f32",
+                         v4f32, v4f32, int_arm_neon_vrsqrte>;
+
+//   VRSQRTS  : Vector Reciprocal Square Root Step
+def VRSQRTSfd : N3VDInt<0, 0, 0b10, 0b1111, 1,
+                        IIC_VRECSD, "vrsqrts", "f32",
+                        v2f32, v2f32, int_arm_neon_vrsqrts, 1>;
+def VRSQRTSfq : N3VQInt<0, 0, 0b10, 0b1111, 1,
+                        IIC_VRECSQ, "vrsqrts", "f32",
+                        v4f32, v4f32, int_arm_neon_vrsqrts, 1>;
+
+// Vector Shifts.
+
+//   VSHL     : Vector Shift
+// Register-shift form: shift amount comes from a second vector register,
+// hence the vshifts/vshiftu intrinsics rather than an immediate SDNode.
+defm VSHLs    : N3VInt_QHSD<0, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+                            IIC_VSHLiQ, "vshl", "s", int_arm_neon_vshifts, 0>;
+defm VSHLu    : N3VInt_QHSD<1, 0, 0b0100, 0, IIC_VSHLiD, IIC_VSHLiD, IIC_VSHLiQ,
+                            IIC_VSHLiQ, "vshl", "u", int_arm_neon_vshiftu, 0>;
+//   VSHL     : Vector Shift Left (Immediate)
+defm VSHLi    : N2VSh_QHSD<0, 1, 0b0101, 1, IIC_VSHLiD, "vshl", "i", NEONvshl>;
+//   VSHR     : Vector Shift Right (Immediate)
+defm VSHRs    : N2VSh_QHSD<0, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "s", NEONvshrs>;
+defm VSHRu    : N2VSh_QHSD<1, 1, 0b0000, 1, IIC_VSHLiD, "vshr", "u", NEONvshru>;
+
+//   VSHLL    : Vector Shift Left Long
+defm VSHLLs   : N2VLSh_QHS<0, 1, 0b1010, 0, 0, 1, "vshll", "s", NEONvshlls>;
+defm VSHLLu   : N2VLSh_QHS<1, 1, 0b1010, 0, 0, 1, "vshll", "u", NEONvshllu>;
+
+//   VSHLL    : Vector Shift Left Long (with maximum shift count)
+// Helper class: same as N2VLSh but pins Inst{21-16} to a fixed value, since
+// the max-shift encoding hardwires those bits instead of deriving them from
+// the immediate.
+class N2VLShMax<bit op24, bit op23, bits<6> op21_16, bits<4> op11_8, bit op7,
+                bit op6, bit op4, string OpcodeStr, string Dt, ValueType ResTy,
+                ValueType OpTy, SDNode OpNode>
+  : N2VLSh<op24, op23, op11_8, op7, op6, op4, OpcodeStr, Dt,
+           ResTy, OpTy, OpNode> {
+  let Inst{21-16} = op21_16;
+}
+def  VSHLLi8  : N2VLShMax<1, 1, 0b110010, 0b0011, 0, 0, 0, "vshll", "i8",
+                          v8i16, v8i8, NEONvshlli>;
+def  VSHLLi16 : N2VLShMax<1, 1, 0b110110, 0b0011, 0, 0, 0, "vshll", "i16",
+                          v4i32, v4i16, NEONvshlli>;
+def  VSHLLi32 : N2VLShMax<1, 1, 0b111010, 0b0011, 0, 0, 0, "vshll", "i32",
+                          v2i64, v2i32, NEONvshlli>;
+
+//   VSHRN    : Vector Shift Right and Narrow
+defm VSHRN    : N2VNSh_HSD<0,1,0b1000,0,0,1, IIC_VSHLiD, "vshrn", "i", NEONvshrn>;
+
+//   VRSHL    : Vector Rounding Shift
+defm VRSHLs   : N3VInt_QHSD<0,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+                            IIC_VSHLi4Q, "vrshl", "s", int_arm_neon_vrshifts, 0>;
+defm VRSHLu   : N3VInt_QHSD<1,0,0b0101,0, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+                            IIC_VSHLi4Q, "vrshl", "u", int_arm_neon_vrshiftu, 0>;
+//   VRSHR    : Vector Rounding Shift Right
+defm VRSHRs   : N2VSh_QHSD<0, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr", "s", NEONvrshrs>;
+defm VRSHRu   : N2VSh_QHSD<1, 1, 0b0010, 1, IIC_VSHLi4D, "vrshr", "u", NEONvrshru>;
+
+//   VRSHRN   : Vector Rounding Shift Right and Narrow
+defm VRSHRN   : N2VNSh_HSD<0, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vrshrn", "i",
+                           NEONvrshrn>;
+
+//   VQSHL    : Vector Saturating Shift
+// Register-shift form (intrinsic); the immediate forms follow below.
+defm VQSHLs   : N3VInt_QHSD<0,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+                            IIC_VSHLi4Q, "vqshl", "s", int_arm_neon_vqshifts, 0>;
+defm VQSHLu   : N3VInt_QHSD<1,0,0b0100,1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+                            IIC_VSHLi4Q, "vqshl", "u", int_arm_neon_vqshiftu, 0>;
+//   VQSHL    : Vector Saturating Shift Left (Immediate)
+defm VQSHLsi  : N2VSh_QHSD<0, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl", "s", NEONvqshls>;
+defm VQSHLui  : N2VSh_QHSD<1, 1, 0b0111, 1, IIC_VSHLi4D, "vqshl", "u", NEONvqshlu>;
+//   VQSHLU   : Vector Saturating Shift Left (Immediate, Unsigned)
+// Signed input, unsigned saturated result — hence the "s" type suffix on an
+// op whose mnemonic ends in "u".
+defm VQSHLsu  : N2VSh_QHSD<1, 1, 0b0110, 1, IIC_VSHLi4D, "vqshlu", "s", NEONvqshlsu>;
+
+//   VQSHRN   : Vector Saturating Shift Right and Narrow
+defm VQSHRNs  : N2VNSh_HSD<0, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "s",
+                           NEONvqshrns>;
+defm VQSHRNu  : N2VNSh_HSD<1, 1, 0b1001, 0, 0, 1, IIC_VSHLi4D, "vqshrn", "u",
+                           NEONvqshrnu>;
+
+//   VQSHRUN  : Vector Saturating Shift Right and Narrow (Unsigned)
+defm VQSHRUN  : N2VNSh_HSD<1, 1, 0b1000, 0, 0, 1, IIC_VSHLi4D, "vqshrun", "s",
+                           NEONvqshrnsu>;
+
+//   VQRSHL   : Vector Saturating Rounding Shift
+defm VQRSHLs  : N3VInt_QHSD<0, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+                            IIC_VSHLi4Q, "vqrshl", "s",
+                            int_arm_neon_vqrshifts, 0>;
+defm VQRSHLu  : N3VInt_QHSD<1, 0, 0b0101, 1, IIC_VSHLi4D, IIC_VSHLi4D, IIC_VSHLi4Q,
+                            IIC_VSHLi4Q, "vqrshl", "u",
+                            int_arm_neon_vqrshiftu, 0>;
+
+//   VQRSHRN  : Vector Saturating Rounding Shift Right and Narrow
+defm VQRSHRNs : N2VNSh_HSD<0, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "s",
+                           NEONvqrshrns>;
+defm VQRSHRNu : N2VNSh_HSD<1, 1, 0b1001, 0, 1, 1, IIC_VSHLi4D, "vqrshrn", "u",
+                           NEONvqrshrnu>;
+
+//   VQRSHRUN : Vector Saturating Rounding Shift Right and Narrow (Unsigned)
+defm VQRSHRUN : N2VNSh_HSD<1, 1, 0b1000, 0, 1, 1, IIC_VSHLi4D, "vqrshrun", "s",
+                           NEONvqrshrnsu>;
+
+//   VSRA     : Vector Shift Right and Accumulate
+defm VSRAs    : N2VShAdd_QHSD<0, 1, 0b0001, 1, "vsra", "s", NEONvshrs>;
+defm VSRAu    : N2VShAdd_QHSD<1, 1, 0b0001, 1, "vsra", "u", NEONvshru>;
+//   VRSRA    : Vector Rounding Shift Right and Accumulate
+defm VRSRAs   : N2VShAdd_QHSD<0, 1, 0b0011, 1, "vrsra", "s", NEONvrshrs>;
+defm VRSRAu   : N2VShAdd_QHSD<1, 1, 0b0011, 1, "vrsra", "u", NEONvrshru>;
+
+//   VSLI     : Vector Shift Left and Insert
+defm VSLI     : N2VShIns_QHSD<1, 1, 0b0101, 1, "vsli", NEONvsli>;
+//   VSRI     : Vector Shift Right and Insert
+defm VSRI     : N2VShIns_QHSD<1, 1, 0b0100, 1, "vsri", NEONvsri>;
+
+// Vector Absolute and Saturating Absolute.
+
+//   VABS     : Vector Absolute Value
+// Integer variants come from the multiclass; the f32 D/Q forms are defined
+// explicitly below, all mapping to the same int_arm_neon_vabs intrinsic.
+defm VABS     : N2VInt_QHS<0b11, 0b11, 0b01, 0b00110, 0, 
+                           IIC_VUNAiD, IIC_VUNAiQ, "vabs", "s",
+                           int_arm_neon_vabs>;
+def  VABSfd   : N2VDInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                        IIC_VUNAD, "vabs", "f32",
+                        v2f32, v2f32, int_arm_neon_vabs>;
+def  VABSfq   : N2VQInt<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                        IIC_VUNAQ, "vabs", "f32",
+                        v4f32, v4f32, int_arm_neon_vabs>;
+
+//   VQABS    : Vector Saturating Absolute Value
+defm VQABS    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01110, 0, 
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqabs", "s",
+                           int_arm_neon_vqabs>;
+
+// Vector Negate.
+
+// vneg matches a subtraction from an all-zeros vector (0 - x); vneg_conv is
+// the same but with a bitconverted zero vector (immAllZerosV_bc), which the
+// extra Pat<> entries below use to catch the bitcast form of the pattern.
+def vneg      : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
+def vneg_conv : PatFrag<(ops node:$in), (sub immAllZerosV_bc, node:$in)>;
+
+// Template classes for the integer D- and Q-register negate instructions;
+// `size` selects the element width, `Ty` the matched vector type.
+class VNEGD<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 0, 0, (outs DPR:$dst), (ins DPR:$src),
+        IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (Ty (vneg DPR:$src)))]>;
+class VNEGQ<bits<2> size, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, size, 0b01, 0b00111, 1, 0, (outs QPR:$dst), (ins QPR:$src),
+        IIC_VSHLiD, OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (Ty (vneg QPR:$src)))]>;
+
+//   VNEG     : Vector Negate
+def  VNEGs8d  : VNEGD<0b00, "vneg", "s8", v8i8>;
+def  VNEGs16d : VNEGD<0b01, "vneg", "s16", v4i16>;
+def  VNEGs32d : VNEGD<0b10, "vneg", "s32", v2i32>;
+def  VNEGs8q  : VNEGQ<0b00, "vneg", "s8", v16i8>;
+def  VNEGs16q : VNEGQ<0b01, "vneg", "s16", v8i16>;
+def  VNEGs32q : VNEGQ<0b10, "vneg", "s32", v4i32>;
+
+//   VNEG     : Vector Negate (floating-point)
+def  VNEGf32d : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+                    (outs DPR:$dst), (ins DPR:$src), IIC_VUNAD,
+                    "vneg", "f32", "$dst, $src", "",
+                    [(set DPR:$dst, (v2f32 (fneg DPR:$src)))]>;
+def  VNEGf32q : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 1, 0,
+                    (outs QPR:$dst), (ins QPR:$src), IIC_VUNAQ,
+                    "vneg", "f32", "$dst, $src", "",
+                    [(set QPR:$dst, (v4f32 (fneg QPR:$src)))]>;
+
+// Select the same VNEG instructions for the bitconverted-zero form.
+def : Pat<(v8i8 (vneg_conv DPR:$src)), (VNEGs8d DPR:$src)>;
+def : Pat<(v4i16 (vneg_conv DPR:$src)), (VNEGs16d DPR:$src)>;
+def : Pat<(v2i32 (vneg_conv DPR:$src)), (VNEGs32d DPR:$src)>;
+def : Pat<(v16i8 (vneg_conv QPR:$src)), (VNEGs8q QPR:$src)>;
+def : Pat<(v8i16 (vneg_conv QPR:$src)), (VNEGs16q QPR:$src)>;
+def : Pat<(v4i32 (vneg_conv QPR:$src)), (VNEGs32q QPR:$src)>;
+
+//   VQNEG    : Vector Saturating Negate
+defm VQNEG    : N2VInt_QHS<0b11, 0b11, 0b00, 0b01111, 0, 
+                           IIC_VQUNAiD, IIC_VQUNAiQ, "vqneg", "s",
+                           int_arm_neon_vqneg>;
+
+// Vector Bit Counting Operations.
+
+//   VCLS     : Vector Count Leading Sign Bits
+defm VCLS     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01000, 0, 
+                           IIC_VCNTiD, IIC_VCNTiQ, "vcls", "s",
+                           int_arm_neon_vcls>;
+//   VCLZ     : Vector Count Leading Zeros
+defm VCLZ     : N2VInt_QHS<0b11, 0b11, 0b00, 0b01001, 0, 
+                           IIC_VCNTiD, IIC_VCNTiQ, "vclz", "i",
+                           int_arm_neon_vclz>;
+//   VCNT     : Vector Count One Bits
+// Popcount is defined only for 8-bit elements, so there is no _QHS
+// multiclass here — just explicit D and Q defs with the "8" type suffix.
+def  VCNTd    : N2VDInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0, 
+                        IIC_VCNTiD, "vcnt", "8",
+                        v8i8, v8i8, int_arm_neon_vcnt>;
+def  VCNTq    : N2VQInt<0b11, 0b11, 0b00, 0b00, 0b01010, 0,
+                        IIC_VCNTiQ, "vcnt", "8",
+                        v16i8, v16i8, int_arm_neon_vcnt>;
+
+// Vector Move Operations.
+
+//   VMOV     : Vector Move (Register)
+// Register-to-register moves carry no selection patterns; they exist for
+// the assembler/disassembler and for copies inserted post-ISel.
+
+def  VMOVDneon: N3VX<0, 0, 0b10, 0b0001, 0, 1, (outs DPR:$dst), (ins DPR:$src),
+                    IIC_VMOVD, "vmov", "$dst, $src", "", []>;
+def  VMOVQ    : N3VX<0, 0, 0b10, 0b0001, 1, 1, (outs QPR:$dst), (ins QPR:$src),
+                    IIC_VMOVD, "vmov", "$dst, $src", "", []>;
+
+//   VMOV     : Vector Move (Immediate)
+// Each VMOV_get_immN xform asks ARM::getVMOVImm whether the build_vector
+// can be encoded as a VMOV immediate with N-byte elements; the matching
+// PatLeaf predicate accepts the node only when getVMOVImm returns a
+// non-null node (i.e. an encoding exists).
+
+// VMOV_get_imm8 xform function: convert build_vector to VMOV.i8 imm.
+def VMOV_get_imm8 : SDNodeXForm<build_vector, [{
+  return ARM::getVMOVImm(N, 1, *CurDAG);
+}]>;
+def vmovImm8 : PatLeaf<(build_vector), [{
+  return ARM::getVMOVImm(N, 1, *CurDAG).getNode() != 0;
+}], VMOV_get_imm8>;
+
+// VMOV_get_imm16 xform function: convert build_vector to VMOV.i16 imm.
+def VMOV_get_imm16 : SDNodeXForm<build_vector, [{
+  return ARM::getVMOVImm(N, 2, *CurDAG);
+}]>;
+def vmovImm16 : PatLeaf<(build_vector), [{
+  return ARM::getVMOVImm(N, 2, *CurDAG).getNode() != 0;
+}], VMOV_get_imm16>;
+
+// VMOV_get_imm32 xform function: convert build_vector to VMOV.i32 imm.
+def VMOV_get_imm32 : SDNodeXForm<build_vector, [{
+  return ARM::getVMOVImm(N, 4, *CurDAG);
+}]>;
+def vmovImm32 : PatLeaf<(build_vector), [{
+  return ARM::getVMOVImm(N, 4, *CurDAG).getNode() != 0;
+}], VMOV_get_imm32>;
+
+// VMOV_get_imm64 xform function: convert build_vector to VMOV.i64 imm.
+def VMOV_get_imm64 : SDNodeXForm<build_vector, [{
+  return ARM::getVMOVImm(N, 8, *CurDAG);
+}]>;
+def vmovImm64 : PatLeaf<(build_vector), [{
+  return ARM::getVMOVImm(N, 8, *CurDAG).getNode() != 0;
+}], VMOV_get_imm64>;
+
+// Note: Some of the cmode bits in the following VMOV instructions need to
+// be encoded based on the immed values.
+// The {?,...} bit patterns below leave those cmode bits unspecified at
+// TableGen time; they are filled in from the immediate during encoding.
+
+def VMOVv8i8  : N1ModImm<1, 0b000, 0b1110, 0, 0, 0, 1, (outs DPR:$dst),
+                         (ins h8imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$dst, $SIMM", "",
+                         [(set DPR:$dst, (v8i8 vmovImm8:$SIMM))]>;
+def VMOVv16i8 : N1ModImm<1, 0b000, 0b1110, 0, 1, 0, 1, (outs QPR:$dst),
+                         (ins h8imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i8", "$dst, $SIMM", "",
+                         [(set QPR:$dst, (v16i8 vmovImm8:$SIMM))]>;
+
+def VMOVv4i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 0, {?}, 1, (outs DPR:$dst),
+                         (ins h16imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$dst, $SIMM", "",
+                         [(set DPR:$dst, (v4i16 vmovImm16:$SIMM))]>;
+def VMOVv8i16 : N1ModImm<1, 0b000, {1,0,?,?}, 0, 1, {?}, 1, (outs QPR:$dst),
+                         (ins h16imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i16", "$dst, $SIMM", "",
+                         [(set QPR:$dst, (v8i16 vmovImm16:$SIMM))]>;
+
+def VMOVv2i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 0, {?}, 1, (outs DPR:$dst),
+                         (ins h32imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$dst, $SIMM", "",
+                         [(set DPR:$dst, (v2i32 vmovImm32:$SIMM))]>;
+def VMOVv4i32 : N1ModImm<1, 0b000, {?,?,?,?}, 0, 1, {?}, 1, (outs QPR:$dst),
+                         (ins h32imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i32", "$dst, $SIMM", "",
+                         [(set QPR:$dst, (v4i32 vmovImm32:$SIMM))]>;
+
+def VMOVv1i64 : N1ModImm<1, 0b000, 0b1110, 0, 0, 1, 1, (outs DPR:$dst),
+                         (ins h64imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$dst, $SIMM", "",
+                         [(set DPR:$dst, (v1i64 vmovImm64:$SIMM))]>;
+def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$dst),
+                         (ins h64imm:$SIMM), IIC_VMOVImm,
+                         "vmov", "i64", "$dst, $SIMM", "",
+                         [(set QPR:$dst, (v2i64 vmovImm64:$SIMM))]>;
+
+//   VMOV     : Vector Get Lane (move scalar to ARM core register)
+// NEONvgetlanes / NEONvgetlaneu select the signed vs unsigned ("s"/"u")
+// extract variants for sub-32-bit elements; 32-bit lanes need no extension
+// and use plain extractelt.  Q-register extracts are handled by the Pat<>
+// entries below, which first EXTRACT_SUBREG the containing D register.
+
+def VGETLNs8  : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "s8", "$dst, $src[$lane]",
+                          [(set GPR:$dst, (NEONvgetlanes (v8i8 DPR:$src),
+                                           imm:$lane))]>;
+def VGETLNs16 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "s16", "$dst, $src[$lane]",
+                          [(set GPR:$dst, (NEONvgetlanes (v4i16 DPR:$src),
+                                           imm:$lane))]>;
+def VGETLNu8  : NVGetLane<{1,1,1,0,1,1,?,1}, 0b1011, {?,?},
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "u8", "$dst, $src[$lane]",
+                          [(set GPR:$dst, (NEONvgetlaneu (v8i8 DPR:$src),
+                                           imm:$lane))]>;
+def VGETLNu16 : NVGetLane<{1,1,1,0,1,0,?,1}, 0b1011, {?,1},
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "u16", "$dst, $src[$lane]",
+                          [(set GPR:$dst, (NEONvgetlaneu (v4i16 DPR:$src),
+                                           imm:$lane))]>;
+def VGETLNi32 : NVGetLane<{1,1,1,0,0,0,?,1}, 0b1011, 0b00,
+                          (outs GPR:$dst), (ins DPR:$src, nohash_imm:$lane),
+                          IIC_VMOVSI, "vmov", "32", "$dst, $src[$lane]",
+                          [(set GPR:$dst, (extractelt (v2i32 DPR:$src),
+                                           imm:$lane))]>;
+// def VGETLNf32: see FMRDH and FMRDL in ARMInstrVFP.td
+// Q-register lane extracts: map the lane index to the right D subregister
+// (DSubReg_*_reg) and the lane within it (SubReg_*_lane).
+def : Pat<(NEONvgetlanes (v16i8 QPR:$src), imm:$lane),
+          (VGETLNs8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlanes (v8i16 QPR:$src), imm:$lane),
+          (VGETLNs16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v16i8 QPR:$src), imm:$lane),
+          (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
+                           (DSubReg_i8_reg imm:$lane))),
+                     (SubReg_i8_lane imm:$lane))>;
+def : Pat<(NEONvgetlaneu (v8i16 QPR:$src), imm:$lane),
+          (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i16_reg imm:$lane))),
+                     (SubReg_i16_lane imm:$lane))>;
+def : Pat<(extractelt (v4i32 QPR:$src), imm:$lane),
+          (VGETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src,
+                             (DSubReg_i32_reg imm:$lane))),
+                     (SubReg_i32_lane imm:$lane))>;
+// Float lane extracts go through the VFP register view (DPR_VFP2/QPR_VFP2)
+// so the S subregister can be addressed directly.
+def : Pat<(extractelt (v2f32 DPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v2f32 (COPY_TO_REGCLASS (v2f32 DPR:$src1), DPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+def : Pat<(extractelt (v4f32 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG (v4f32 (COPY_TO_REGCLASS (v4f32 QPR:$src1), QPR_VFP2)),
+                          (SSubReg_f32_reg imm:$src2))>;
+//def : Pat<(extractelt (v2i64 QPR:$src1), imm:$src2),
+//          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
+          (EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
+
+
+//   VMOV     : Vector Set Lane (move ARM core register to scalar)
+// Lane inserts read-modify-write the destination register, hence the
+// "$src1 = $dst" tied-operand constraint on the whole group.
+
+let Constraints = "$src1 = $dst" in {
+def VSETLNi8  : NVSetLane<{1,1,1,0,0,1,?,0}, 0b1011, {?,?}, (outs DPR:$dst),
+                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", "8", "$dst[$lane], $src2",
+                          [(set DPR:$dst, (vector_insert (v8i8 DPR:$src1),
+                                           GPR:$src2, imm:$lane))]>;
+def VSETLNi16 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, {?,1}, (outs DPR:$dst),
+                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", "16", "$dst[$lane], $src2",
+                          [(set DPR:$dst, (vector_insert (v4i16 DPR:$src1),
+                                           GPR:$src2, imm:$lane))]>;
+def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$dst),
+                          (ins DPR:$src1, GPR:$src2, nohash_imm:$lane),
+                          IIC_VMOVISL, "vmov", "32", "$dst[$lane], $src2",
+                          [(set DPR:$dst, (insertelt (v2i32 DPR:$src1),
+                                           GPR:$src2, imm:$lane))]>;
+}
+// Q-register lane inserts: extract the D subregister containing the lane,
+// set the lane there, then insert the D register back into the Q register.
+def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
+          (v16i8 (INSERT_SUBREG QPR:$src1, 
+                  (VSETLNi8 (v8i8 (EXTRACT_SUBREG QPR:$src1,
+                                   (DSubReg_i8_reg imm:$lane))),
+                            GPR:$src2, (SubReg_i8_lane imm:$lane)),
+                  (DSubReg_i8_reg imm:$lane)))>;
+def : Pat<(vector_insert (v8i16 QPR:$src1), GPR:$src2, imm:$lane),
+          (v8i16 (INSERT_SUBREG QPR:$src1, 
+                  (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i16_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i16_lane imm:$lane)),
+                  (DSubReg_i16_reg imm:$lane)))>;
+def : Pat<(insertelt (v4i32 QPR:$src1), GPR:$src2, imm:$lane),
+          (v4i32 (INSERT_SUBREG QPR:$src1, 
+                  (VSETLNi32 (v2i32 (EXTRACT_SUBREG QPR:$src1,
+                                     (DSubReg_i32_reg imm:$lane))),
+                             GPR:$src2, (SubReg_i32_lane imm:$lane)),
+                  (DSubReg_i32_reg imm:$lane)))>;
+
+// Float lane inserts go through the VFP register view so the S subregister
+// can be written directly.
+def : Pat<(v2f32 (insertelt DPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v2f32 (COPY_TO_REGCLASS DPR:$src1, DPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
+          (INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
+                                SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
+
+//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+//          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+def : Pat<(v2f64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
+          (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
+
+// scalar_to_vector: place the scalar in element 0 of an otherwise undefined
+// vector.  Float/double scalars are inserted as subregisters; integer
+// scalars go through VSETLN lane 0, with Q-register results built by
+// inserting the resulting D register into an undefined Q register.
+def : Pat<(v2f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+def : Pat<(v2f64 (scalar_to_vector DPR:$src)),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), DPR:$src, arm_dsubreg_0)>;
+def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, arm_ssubreg_0)>;
+
+def : Pat<(v8i8 (scalar_to_vector GPR:$src)),
+          (VSETLNi8  (v8i8  (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v4i16 (scalar_to_vector GPR:$src)),
+          (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+def : Pat<(v2i32 (scalar_to_vector GPR:$src)),
+          (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0))>;
+
+def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                         (VSETLNi8 (v8i8 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         arm_dsubreg_0)>;
+def : Pat<(v8i16 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                         (VSETLNi16 (v4i16 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         arm_dsubreg_0)>;
+def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                         (VSETLNi32 (v2i32 (IMPLICIT_DEF)), GPR:$src, (i32 0)),
+                         arm_dsubreg_0)>;
+
+//   VDUP     : Vector Duplicate (from ARM core register to all elements)
+// Matches NEONvdup of an i32 GPR value; opcod1/opcod3 carry the encoding
+// bits that vary per element size and D/Q width (see the defs below).
+
+class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs DPR:$dst), (ins GPR:$src),
+          IIC_VMOVIS, "vdup", Dt, "$dst, $src",
+          [(set DPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
+class VDUPQ<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
+  : NVDup<opcod1, 0b1011, opcod3, (outs QPR:$dst), (ins GPR:$src),
+          IIC_VMOVIS, "vdup", Dt, "$dst, $src",
+          [(set QPR:$dst, (Ty (NEONvdup (i32 GPR:$src))))]>;
+
+def  VDUP8d   : VDUPD<0b11101100, 0b00, "8", v8i8>;
+def  VDUP16d  : VDUPD<0b11101000, 0b01, "16", v4i16>;
+def  VDUP32d  : VDUPD<0b11101000, 0b00, "32", v2i32>;
+def  VDUP8q   : VDUPQ<0b11101110, 0b00, "8", v16i8>;
+def  VDUP16q  : VDUPQ<0b11101010, 0b01, "16", v8i16>;
+def  VDUP32q  : VDUPQ<0b11101010, 0b00, "32", v4i32>;
+
+// f32 duplicates reuse the 32-bit encodings: the GPR bit pattern is
+// reinterpreted as a float via bitconvert in the pattern.
+def  VDUPfd   : NVDup<0b11101000, 0b1011, 0b00, (outs DPR:$dst), (ins GPR:$src),
+                      IIC_VMOVIS, "vdup", "32", "$dst, $src",
+                      [(set DPR:$dst, (v2f32 (NEONvdup
+                                              (f32 (bitconvert GPR:$src)))))]>;
+def  VDUPfq   : NVDup<0b11101010, 0b1011, 0b00, (outs QPR:$dst), (ins GPR:$src),
+                      IIC_VMOVIS, "vdup", "32", "$dst, $src",
+                      [(set QPR:$dst, (v4f32 (NEONvdup
+                                              (f32 (bitconvert GPR:$src)))))]>;
+
+//   VDUP     : Vector Duplicate Lane (from scalar to all elements)
+
+class VDUPLND<bits<2> op19_18, bits<2> op17_16,
+              string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 0, 0,
+        (outs DPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
+        OpcodeStr, Dt, "$dst, $src[$lane]", "",
+        [(set DPR:$dst, (Ty (NEONvduplane (Ty DPR:$src), imm:$lane)))]>;
+
+class VDUPLNQ<bits<2> op19_18, bits<2> op17_16, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy>
+  : N2V<0b11, 0b11, op19_18, op17_16, 0b11000, 1, 0,
+        (outs QPR:$dst), (ins DPR:$src, nohash_imm:$lane), IIC_VMOVD,
+        OpcodeStr, Dt, "$dst, $src[$lane]", "",
+        [(set QPR:$dst, (ResTy (NEONvduplane (OpTy DPR:$src), imm:$lane)))]>;
+
+// Inst{19-16} is partially specified depending on the element size.
+
+def VDUPLN8d  : VDUPLND<{?,?}, {?,1}, "vdup", "8", v8i8>;
+def VDUPLN16d : VDUPLND<{?,?}, {1,0}, "vdup", "16", v4i16>;
+def VDUPLN32d : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2i32>;
+def VDUPLNfd  : VDUPLND<{?,1}, {0,0}, "vdup", "32", v2f32>;
+def VDUPLN8q  : VDUPLNQ<{?,?}, {?,1}, "vdup", "8", v16i8, v8i8>;
+def VDUPLN16q : VDUPLNQ<{?,?}, {1,0}, "vdup", "16", v8i16, v4i16>;
+def VDUPLN32q : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4i32, v2i32>;
+def VDUPLNfq  : VDUPLNQ<{?,1}, {0,0}, "vdup", "32", v4f32, v2f32>;
+
+// Duplicating a lane of a Q register: extract the D subregister that
+// contains the requested lane (DSubReg_*_reg), then vdup from the lane's
+// position within that D register (SubReg_*_lane).
+def : Pat<(v16i8 (NEONvduplane (v16i8 QPR:$src), imm:$lane)),
+          (v16i8 (VDUPLN8q (v8i8 (EXTRACT_SUBREG QPR:$src,
+                                  (DSubReg_i8_reg imm:$lane))),
+                           (SubReg_i8_lane imm:$lane)))>;
+def : Pat<(v8i16 (NEONvduplane (v8i16 QPR:$src), imm:$lane)),
+          (v8i16 (VDUPLN16q (v4i16 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i16_reg imm:$lane))),
+                            (SubReg_i16_lane imm:$lane)))>;
+def : Pat<(v4i32 (NEONvduplane (v4i32 QPR:$src), imm:$lane)),
+          (v4i32 (VDUPLN32q (v2i32 (EXTRACT_SUBREG QPR:$src,
+                                    (DSubReg_i32_reg imm:$lane))),
+                            (SubReg_i32_lane imm:$lane)))>;
+def : Pat<(v4f32 (NEONvduplane (v4f32 QPR:$src), imm:$lane)),
+          (v4f32 (VDUPLNfq (v2f32 (EXTRACT_SUBREG QPR:$src,
+                                   (DSubReg_i32_reg imm:$lane))),
+                           (SubReg_i32_lane imm:$lane)))>;
+
+// vdup of an f32 scalar held in an S register to all elements of a D (VDUPfdf)
+// or Q (VDUPfqf) register.  The ${src:lane} print modifier emits the
+// D-register-plus-lane form of the S register in the asm string.
+def  VDUPfdf  : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 0, 0,
+                    (outs DPR:$dst), (ins SPR:$src),
+                    IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "",
+                    [(set DPR:$dst, (v2f32 (NEONvdup (f32 SPR:$src))))]>;
+
+def  VDUPfqf  : N2V<0b11, 0b11, {?,1}, {0,0}, 0b11000, 1, 0,
+                    (outs QPR:$dst), (ins SPR:$src),
+                    IIC_VMOVD, "vdup", "32", "$dst, ${src:lane}", "",
+                    [(set QPR:$dst, (v4f32 (NEONvdup (f32 SPR:$src))))]>;
+
+// There is no 64-bit-element vdup instruction.  Duplicating one 64-bit lane
+// of a Q register is done with subregister surgery instead: extract the
+// selected D half and insert it over the *other* D half of the same Q reg.
+def : Pat<(v2i64 (NEONvduplane (v2i64 QPR:$src), imm:$lane)),
+          (INSERT_SUBREG QPR:$src, 
+                         (i64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))),
+                         (DSubReg_f64_other_reg imm:$lane))>;
+def : Pat<(v2f64 (NEONvduplane (v2f64 QPR:$src), imm:$lane)),
+          (INSERT_SUBREG QPR:$src, 
+                         (f64 (EXTRACT_SUBREG QPR:$src, (DSubReg_f64_reg imm:$lane))),
+                         (DSubReg_f64_other_reg imm:$lane))>;
+
+//   VMOVN    : Vector Narrowing Move
+// The _HSD / _QHS multiclasses expand to one instruction per element size;
+// see the multiclass definitions for the generated variants.
+defm VMOVN    : N2VNInt_HSD<0b11,0b11,0b10,0b00100,0,0, IIC_VMOVD,
+                            "vmovn", "i", int_arm_neon_vmovn>;
+//   VQMOVN   : Vector Saturating Narrowing Move
+defm VQMOVNs  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,0,0, IIC_VQUNAiD,
+                            "vqmovn", "s", int_arm_neon_vqmovns>;
+defm VQMOVNu  : N2VNInt_HSD<0b11,0b11,0b10,0b00101,1,0, IIC_VQUNAiD,
+                            "vqmovn", "u", int_arm_neon_vqmovnu>;
+// Signed-to-unsigned saturating narrow; note the mnemonic is "vqmovun".
+defm VQMOVNsu : N2VNInt_HSD<0b11,0b11,0b10,0b00100,1,0, IIC_VQUNAiD,
+                            "vqmovun", "s", int_arm_neon_vqmovnsu>;
+//   VMOVL    : Vector Lengthening Move
+defm VMOVLs   : N2VLInt_QHS<0b01,0b10100,0,1, "vmovl", "s",
+                            int_arm_neon_vmovls>;
+defm VMOVLu   : N2VLInt_QHS<0b11,0b10100,0,1, "vmovl", "u",
+                            int_arm_neon_vmovlu>;
+
+// Vector Conversions.
+
+//   VCVT     : Vector Convert Between Floating-Point and Integers
+// Naming: VCVT<src>2<dst><d|q>, e.g. VCVTf2sd converts f32 -> s32 on a
+// D register; the "s32.f32"/"u32.f32" strings are the printed data types.
+def  VCVTf2sd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v2i32, v2f32, fp_to_sint>;
+def  VCVTf2ud : N2VD<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v2i32, v2f32, fp_to_uint>;
+def  VCVTs2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v2f32, v2i32, sint_to_fp>;
+def  VCVTu2fd : N2VD<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v2f32, v2i32, uint_to_fp>;
+
+// Q-register versions of the four conversions above.
+def  VCVTf2sq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                     v4i32, v4f32, fp_to_sint>;
+def  VCVTf2uq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                     v4i32, v4f32, fp_to_uint>;
+def  VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                     v4f32, v4i32, sint_to_fp>;
+def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                     v4f32, v4i32, uint_to_fp>;
+
+//   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
+// These use intrinsics (not ISD nodes) because the fixed-point fraction
+// width is an extra argument; the 'x' in the names marks the fixed-point
+// operand.  Naming otherwise follows the VCVT scheme above.
+def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v2i32, v2f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v2f32, v2i32, int_arm_neon_vcvtfxu2fp>;
+
+def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxs>;
+def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32",
+                        v4i32, v4f32, int_arm_neon_vcvtfp2fxu>;
+def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxs2fp>;
+def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
+                        v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
+
+// Vector Reverse.
+
+//   VREV64   : Vector Reverse elements within 64-bit doublewords
+
+// op19_18 encodes the element size (00=8, 01=16, 10=32); the class pattern
+// matches the NEONvrev64 node for the given vector type.
+class VREV64D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VMOVD, 
+        OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (Ty (NEONvrev64 (Ty DPR:$src))))]>;
+class VREV64Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00000, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VMOVD, 
+        OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (Ty (NEONvrev64 (Ty QPR:$src))))]>;
+
+def VREV64d8  : VREV64D<0b00, "vrev64", "8", v8i8>;
+def VREV64d16 : VREV64D<0b01, "vrev64", "16", v4i16>;
+def VREV64d32 : VREV64D<0b10, "vrev64", "32", v2i32>;
+// f32 shares the 32-bit-element encoding.
+def VREV64df  : VREV64D<0b10, "vrev64", "32", v2f32>;
+
+def VREV64q8  : VREV64Q<0b00, "vrev64", "8", v16i8>;
+def VREV64q16 : VREV64Q<0b01, "vrev64", "16", v8i16>;
+def VREV64q32 : VREV64Q<0b10, "vrev64", "32", v4i32>;
+def VREV64qf  : VREV64Q<0b10, "vrev64", "32", v4f32>;
+
+//   VREV32   : Vector Reverse elements within 32-bit words
+// Only 8- and 16-bit elements make sense here (reversing 32-bit elements
+// within a 32-bit word is the identity), hence no 32-bit defs below.
+
+class VREV32D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VMOVD, 
+        OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (Ty (NEONvrev32 (Ty DPR:$src))))]>;
+class VREV32Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00001, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VMOVD, 
+        OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (Ty (NEONvrev32 (Ty QPR:$src))))]>;
+
+def VREV32d8  : VREV32D<0b00, "vrev32", "8", v8i8>;
+def VREV32d16 : VREV32D<0b01, "vrev32", "16", v4i16>;
+
+def VREV32q8  : VREV32Q<0b00, "vrev32", "8", v16i8>;
+def VREV32q16 : VREV32Q<0b01, "vrev32", "16", v8i16>;
+
+//   VREV16   : Vector Reverse elements within 16-bit halfwords
+// Only 8-bit elements are meaningful at this granularity.
+
+class VREV16D<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 0, 0, (outs DPR:$dst),
+        (ins DPR:$src), IIC_VMOVD, 
+        OpcodeStr, Dt, "$dst, $src", "",
+        [(set DPR:$dst, (Ty (NEONvrev16 (Ty DPR:$src))))]>;
+class VREV16Q<bits<2> op19_18, string OpcodeStr, string Dt, ValueType Ty>
+  : N2V<0b11, 0b11, op19_18, 0b00, 0b00010, 1, 0, (outs QPR:$dst),
+        (ins QPR:$src), IIC_VMOVD, 
+        OpcodeStr, Dt, "$dst, $src", "",
+        [(set QPR:$dst, (Ty (NEONvrev16 (Ty QPR:$src))))]>;
+
+def VREV16d8  : VREV16D<0b00, "vrev16", "8", v8i8>;
+def VREV16q8  : VREV16Q<0b00, "vrev16", "8", v16i8>;
+
+// Other Vector Shuffles.
+
+//   VEXT     : Vector Extract
+// Extracts a contiguous run of bytes starting at $index from the
+// concatenation of $lhs and $rhs (matched via the NEONvext node).
+
+class VEXTd<string OpcodeStr, string Dt, ValueType Ty>
+  : N3V<0,1,0b11,{?,?,?,?},0,0, (outs DPR:$dst),
+        (ins DPR:$lhs, DPR:$rhs, i32imm:$index), IIC_VEXTD,
+        OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "",
+        [(set DPR:$dst, (Ty (NEONvext (Ty DPR:$lhs),
+                                      (Ty DPR:$rhs), imm:$index)))]>;
+
+class VEXTq<string OpcodeStr, string Dt, ValueType Ty>
+  : N3V<0,1,0b11,{?,?,?,?},1,0, (outs QPR:$dst),
+        (ins QPR:$lhs, QPR:$rhs, i32imm:$index), IIC_VEXTQ,
+        OpcodeStr, Dt, "$dst, $lhs, $rhs, $index", "",
+        [(set QPR:$dst, (Ty (NEONvext (Ty QPR:$lhs),
+                                      (Ty QPR:$rhs), imm:$index)))]>;
+
+def VEXTd8  : VEXTd<"vext", "8",  v8i8>;
+def VEXTd16 : VEXTd<"vext", "16", v4i16>;
+def VEXTd32 : VEXTd<"vext", "32", v2i32>;
+def VEXTdf  : VEXTd<"vext", "32", v2f32>;
+
+def VEXTq8  : VEXTq<"vext", "8",  v16i8>;
+def VEXTq16 : VEXTq<"vext", "16", v8i16>;
+def VEXTq32 : VEXTq<"vext", "32", v4i32>;
+def VEXTqf  : VEXTq<"vext", "32", v4f32>;
+
+//   VTRN     : Vector Transpose
+// VTRN/VUZP/VZIP are two-register shuffles that update both operands;
+// the N2VDShuffle/N2VQShuffle classes encode that in-place behavior.
+
+def  VTRNd8   : N2VDShuffle<0b00, 0b00001, "vtrn", "8">;
+def  VTRNd16  : N2VDShuffle<0b01, 0b00001, "vtrn", "16">;
+def  VTRNd32  : N2VDShuffle<0b10, 0b00001, "vtrn", "32">;
+
+def  VTRNq8   : N2VQShuffle<0b00, 0b00001, IIC_VPERMQ, "vtrn", "8">;
+def  VTRNq16  : N2VQShuffle<0b01, 0b00001, IIC_VPERMQ, "vtrn", "16">;
+def  VTRNq32  : N2VQShuffle<0b10, 0b00001, IIC_VPERMQ, "vtrn", "32">;
+
+//   VUZP     : Vector Unzip (Deinterleave)
+
+def  VUZPd8   : N2VDShuffle<0b00, 0b00010, "vuzp", "8">;
+def  VUZPd16  : N2VDShuffle<0b01, 0b00010, "vuzp", "16">;
+def  VUZPd32  : N2VDShuffle<0b10, 0b00010, "vuzp", "32">;
+
+def  VUZPq8   : N2VQShuffle<0b00, 0b00010, IIC_VPERMQ3, "vuzp", "8">;
+def  VUZPq16  : N2VQShuffle<0b01, 0b00010, IIC_VPERMQ3, "vuzp", "16">;
+def  VUZPq32  : N2VQShuffle<0b10, 0b00010, IIC_VPERMQ3, "vuzp", "32">;
+
+//   VZIP     : Vector Zip (Interleave)
+
+def  VZIPd8   : N2VDShuffle<0b00, 0b00011, "vzip", "8">;
+def  VZIPd16  : N2VDShuffle<0b01, 0b00011, "vzip", "16">;
+def  VZIPd32  : N2VDShuffle<0b10, 0b00011, "vzip", "32">;
+
+def  VZIPq8   : N2VQShuffle<0b00, 0b00011, IIC_VPERMQ3, "vzip", "8">;
+def  VZIPq16  : N2VQShuffle<0b01, 0b00011, IIC_VPERMQ3, "vzip", "16">;
+def  VZIPq32  : N2VQShuffle<0b10, 0b00011, IIC_VPERMQ3, "vzip", "32">;
+
+// Vector Table Lookup and Table Extension.
+
+//   VTBL     : Vector Table Lookup
+// $src holds byte indices into the table formed by the $tbl registers.
+def  VTBL1
+  : N3V<1,1,0b11,0b1000,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$src), IIC_VTB1,
+        "vtbl", "8", "$dst, \\{$tbl1\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl1 DPR:$tbl1, DPR:$src)))]>;
+// The multi-register table variants need the $tbl registers allocated as a
+// consecutive list, which hasExtraSrcRegAllocReq tells the allocator.
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBL2
+  : N3V<1,1,0b11,0b1001,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTB2,
+        "vtbl", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl2
+                               DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def  VTBL3
+  : N3V<1,1,0b11,0b1010,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTB3,
+        "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl3
+                               DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def  VTBL4
+  : N3V<1,1,0b11,0b1011,0,0, (outs DPR:$dst),
+        (ins DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTB4,
+        "vtbl", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src", "",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbl4 DPR:$tbl1, DPR:$tbl2,
+                               DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
+
+//   VTBX     : Vector Table Extension
+// Like VTBL, but lanes with out-of-range indices keep their previous value,
+// so the destination is tied to an original input ("$orig = $dst").
+def  VTBX1
+  : N3V<1,1,0b11,0b1000,1,0, (outs DPR:$dst),
+        (ins DPR:$orig, DPR:$tbl1, DPR:$src), IIC_VTBX1,
+        "vtbx", "8", "$dst, \\{$tbl1\\}, $src", "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx1
+                               DPR:$orig, DPR:$tbl1, DPR:$src)))]>;
+let hasExtraSrcRegAllocReq = 1 in {
+def  VTBX2
+  : N3V<1,1,0b11,0b1001,1,0, (outs DPR:$dst),
+        (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src), IIC_VTBX2,
+        "vtbx", "8", "$dst, \\{$tbl1, $tbl2\\}, $src", "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx2
+                               DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$src)))]>;
+def  VTBX3
+  : N3V<1,1,0b11,0b1010,1,0, (outs DPR:$dst),
+        (ins DPR:$orig, DPR:$tbl1, DPR:$tbl2, DPR:$tbl3, DPR:$src), IIC_VTBX3,
+        "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3\\}, $src", "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx3 DPR:$orig, DPR:$tbl1,
+                               DPR:$tbl2, DPR:$tbl3, DPR:$src)))]>;
+def  VTBX4
+  : N3V<1,1,0b11,0b1011,1,0, (outs DPR:$dst), (ins DPR:$orig, DPR:$tbl1,
+        DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src), IIC_VTBX4,
+        "vtbx", "8", "$dst, \\{$tbl1, $tbl2, $tbl3, $tbl4\\}, $src",
+        "$orig = $dst",
+        [(set DPR:$dst, (v8i8 (int_arm_neon_vtbx4 DPR:$orig, DPR:$tbl1,
+                               DPR:$tbl2, DPR:$tbl3, DPR:$tbl4, DPR:$src)))]>;
+} // hasExtraSrcRegAllocReq = 1
+
+//===----------------------------------------------------------------------===//
+// NEON instructions for single-precision FP math
+//===----------------------------------------------------------------------===//
+
+// These need separate instructions because they must use DPR_VFP2 register
+// class which have SPR sub-registers.
+//
+// Note: each `let neverHasSideEffects = 1 in` applies only to the
+// immediately following instruction def, not to the Pat record after it.
+
+// Vector Add Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VADDfd_sfp : N3VDs<0, 0, 0b00, 0b1101, 0, "vadd", "f32", v2f32, v2f32, fadd,1>;
+def : N3VDsPat<fadd, VADDfd_sfp>;
+
+// Vector Sub Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VSUBfd_sfp : N3VDs<0, 0, 0b10, 0b1101, 0, "vsub", "f32", v2f32, v2f32, fsub,0>;
+def : N3VDsPat<fsub, VSUBfd_sfp>;
+
+// Vector Multiply Operations used for single-precision FP
+let neverHasSideEffects = 1 in
+def VMULfd_sfp : N3VDs<1, 0, 0b00, 0b1101, 1, "vmul", "f32", v2f32, v2f32, fmul,1>;
+def : N3VDsPat<fmul, VMULfd_sfp>;
+
+// Vector Multiply-Accumulate/Subtract used for single-precision FP
+// vml[as].f32 can cause 4-8 cycle stalls in following ASIMD instructions, so
+// we want to avoid them for now. e.g., alternating vmla/vadd instructions.
+
+//let neverHasSideEffects = 1 in
+//def VMLAfd_sfp : N3VDMulOps<0, 0, 0b00, 0b1101, 1, IIC_VMACD, "vmla", "f32", v2f32,fmul,fadd>;
+//def : N3VDMulOpsPat<fmul, fadd, VMLAfd_sfp>;
+
+//let neverHasSideEffects = 1 in
+//def VMLSfd_sfp : N3VDMulOps<0, 0, 0b10, 0b1101, 1, IIC_VMACD, "vmls", "f32", v2f32,fmul,fsub>;
+//def : N3VDMulOpsPat<fmul, fsub, VMLSfd_sfp>;
+
+// Vector Absolute used for single-precision FP
+let neverHasSideEffects = 1 in
+def  VABSfd_sfp : N2VDInts<0b11, 0b11, 0b10, 0b01, 0b01110, 0,
+                           IIC_VUNAD, "vabs", "f32",
+                           v2f32, v2f32, int_arm_neon_vabs>;
+def : N2VDIntsPat<fabs, VABSfd_sfp>;
+
+// Vector Negate used for single-precision FP
+let neverHasSideEffects = 1 in
+def  VNEGf32d_sfp : N2V<0b11, 0b11, 0b10, 0b01, 0b01111, 0, 0,
+                        (outs DPR_VFP2:$dst), (ins DPR_VFP2:$src), IIC_VUNAD,
+                        "vneg", "f32", "$dst, $src", "", []>;
+def : N2VDIntsPat<fneg, VNEGf32d_sfp>;
+
+// Vector Convert between single-precision FP and integer
+// The scalar conversions are matched through the arm_ftosi/ftoui/sitof/uitof
+// nodes via the N2VDsPat patterns below.
+let neverHasSideEffects = 1 in
+def  VCVTf2sd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01110, 0, "vcvt", "s32.f32",
+                          v2i32, v2f32, fp_to_sint>;
+def : N2VDsPat<arm_ftosi, f32, v2f32, VCVTf2sd_sfp>;
+
+let neverHasSideEffects = 1 in
+def  VCVTf2ud_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01111, 0, "vcvt", "u32.f32",
+                          v2i32, v2f32, fp_to_uint>;
+def : N2VDsPat<arm_ftoui, f32, v2f32, VCVTf2ud_sfp>;
+
+let neverHasSideEffects = 1 in
+def  VCVTs2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
+                          v2f32, v2i32, sint_to_fp>;
+def : N2VDsPat<arm_sitof, f32, v2i32, VCVTs2fd_sfp>;
+
+let neverHasSideEffects = 1 in
+def  VCVTu2fd_sfp : N2VDs<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
+                          v2f32, v2i32, uint_to_fp>;
+def : N2VDsPat<arm_uitof, f32, v2i32, VCVTu2fd_sfp>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// bit_convert
+// Bitcasts between vector types of the same total width are register
+// reinterpretations, not data movement, so each pattern maps straight to
+// the source register.  D-register (64-bit) conversions first:
+def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (f64   DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (v8i8  DPR:$src)>;
+def : Pat<(f64   (bitconvert (v1i64 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (f64   DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (v2f32 DPR:$src)>;
+
+// Q-register (128-bit) conversions:
+def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
new file mode 100644
index 0000000..1dcea26
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -0,0 +1,917 @@
+//===- ARMInstrThumb.td - Thumb support for ARM ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Thumb specific DAG Nodes.
+//
+
+// Thumb call node: chains, and may consume/produce a flag operand.
+def ARMtcall : SDNode<"ARMISD::tCALL", SDT_ARMcall,
+                      [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Transform an immediate into its arithmetic negation.
+def imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32);
+}]>;
+// Transform an immediate into its bitwise complement.
+def imm_comp_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
+}]>;
+
+
+/// imm0_7 predicate - True if the 32-bit immediate is in the range [0,7].
+def imm0_7 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getZExtValue() < 8;
+}]>;
+/// imm0_7_neg - True if the negation of the immediate is in [0,7];
+/// the XForm rewrites the operand to that negation.
+def imm0_7_neg : PatLeaf<(i32 imm), [{
+  return (uint32_t)-N->getZExtValue() < 8;
+}], imm_neg_XFORM>;
+
+/// imm0_255 - True if the immediate is in the range [0,255].
+def imm0_255 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getZExtValue() < 256;
+}]>;
+/// imm0_255_comp - True if the bitwise complement of the immediate is in
+/// [0,255] (i.e. the value has all-ones upper 24 bits).
+def imm0_255_comp : PatLeaf<(i32 imm), [{
+  return ~((uint32_t)N->getZExtValue()) < 256;
+}]>;
+
+/// imm8_255 - True if the immediate is in the range [8,255].
+def imm8_255 : PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getZExtValue() >= 8 && (uint32_t)N->getZExtValue() < 256;
+}]>;
+/// imm8_255_neg - True if the negation of the immediate is in [8,255];
+/// the XForm rewrites the operand to that negation.
+def imm8_255_neg : PatLeaf<(i32 imm), [{
+  unsigned Val = -N->getZExtValue();
+  return Val >= 8 && Val < 256;
+}], imm_neg_XFORM>;
+
+// Break imm's up into two pieces: an immediate + a left shift.
+// This uses thumb_immshifted to match and thumb_immshifted_val and
+// thumb_immshifted_shamt to get the val/shift pieces.
+def thumb_immshifted : PatLeaf<(imm), [{
+  return ARM_AM::isThumbImmShiftedVal((unsigned)N->getZExtValue());
+}]>;
+
+// Extract the unshifted 8-bit value component.
+def thumb_immshifted_val : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmNonShiftedVal((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+// Extract the left-shift amount component.
+def thumb_immshifted_shamt : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getThumbImmValShift((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+// Scaled 4 immediate: printed as the immediate multiplied by 4
+// (see printThumbS4ImmOperand).
+def t_imm_s4 : Operand<i32> {
+  let PrintMethod = "printThumbS4ImmOperand";
+}
+
+// Define Thumb specific addressing modes.
+// Each ComplexPattern names the C++ Select* hook (presumably in the DAG
+// instruction selector — verify in ARMISelDAGToDAG) that matches the mode
+// and produces the MIOperandInfo pieces listed below.
+
+// t_addrmode_rr := reg + reg
+//
+def t_addrmode_rr : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeRR", []> {
+  let PrintMethod = "printThumbAddrModeRROperand";
+  let MIOperandInfo = (ops tGPR:$base, tGPR:$offsreg);
+}
+
+// t_addrmode_s4 := reg + reg
+//                  reg + imm5 * 4
+//
+def t_addrmode_s4 : Operand<i32>,
+                    ComplexPattern<i32, 3, "SelectThumbAddrModeS4", []> {
+  let PrintMethod = "printThumbAddrModeS4Operand";
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+}
+
+// t_addrmode_s2 := reg + reg
+//                  reg + imm5 * 2
+//
+def t_addrmode_s2 : Operand<i32>,
+                    ComplexPattern<i32, 3, "SelectThumbAddrModeS2", []> {
+  let PrintMethod = "printThumbAddrModeS2Operand";
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+}
+
+// t_addrmode_s1 := reg + reg
+//                  reg + imm5
+//
+def t_addrmode_s1 : Operand<i32>,
+                    ComplexPattern<i32, 3, "SelectThumbAddrModeS1", []> {
+  let PrintMethod = "printThumbAddrModeS1Operand";
+  let MIOperandInfo = (ops tGPR:$base, i32imm:$offsimm, tGPR:$offsreg);
+}
+
+// t_addrmode_sp := sp + imm8 * 4
+//
+def t_addrmode_sp : Operand<i32>,
+                    ComplexPattern<i32, 2, "SelectThumbAddrModeSP", []> {
+  let PrintMethod = "printThumbAddrModeSPOperand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+// Pseudo-instructions that bracket a call sequence; they match the
+// ARMcallseq_start/end nodes and both read and write SP.
+let Defs = [SP], Uses = [SP] in {
+def tADJCALLSTACKUP :
+PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), NoItinerary,
+           "@ tADJCALLSTACKUP $amt1",
+           [(ARMcallseq_end imm:$amt1, imm:$amt2)]>, Requires<[IsThumb1Only]>;
+
+def tADJCALLSTACKDOWN :
+PseudoInst<(outs), (ins i32imm:$amt), NoItinerary,
+           "@ tADJCALLSTACKDOWN $amt",
+           [(ARMcallseq_start imm:$amt)]>, Requires<[IsThumb1Only]>;
+}
+
+// For both thumb1 and thumb2.
+// PIC base add: emits the "$cp:" label followed by "add $dst, pc";
+// not duplicable because the label must be unique.
+let isNotDuplicable = 1 in
+def tPICADD : TIt<(outs GPR:$dst), (ins GPR:$lhs, pclabel:$cp), IIC_iALUr,
+                 "\n$cp:\n\tadd\t$dst, pc",
+                 [(set GPR:$dst, (ARMpic_add GPR:$lhs, imm:$cp))]>,
+              T1Special<{0,0,?,?}> {
+  let Inst{6-3} = 0b1111; // A8.6.6 Rm = pc
+}
+
+// PC relative add.
+def tADDrPCi : T1I<(outs tGPR:$dst), (ins t_imm_s4:$rhs), IIC_iALUi,
+                  "add\t$dst, pc, $rhs", []>,
+               T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10
+
+// ADD rd, sp, #imm8
+def tADDrSPi : T1I<(outs tGPR:$dst), (ins GPR:$sp, t_imm_s4:$rhs), IIC_iALUi,
+                  "add\t$dst, $sp, $rhs", []>,
+               T1Encoding<{1,0,1,0,1,?}>; // A6.2 & A8.6.8
+
+// ADD sp, sp, #imm7
+def tADDspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
+                  "add\t$dst, $rhs", []>,
+              T1Misc<{0,0,0,0,0,?,?}>; // A6.2.5 & A8.6.8
+
+// SUB sp, sp, #imm7
+def tSUBspi : TIt<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs), IIC_iALUi,
+                  "sub\t$dst, $rhs", []>,
+              T1Misc<{0,0,0,0,1,?,?}>; // A6.2.5 & A8.6.215
+
+// ADD rm, sp
+def tADDrSP : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+                  "add\t$dst, $rhs", []>,
+              T1Special<{0,0,?,?}> {
+  let Inst{6-3} = 0b1101; // A8.6.9 Encoding T1
+}
+
+// ADD sp, rm
+def tADDspr : TIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+                  "add\t$dst, $rhs", []>,
+              T1Special<{0,0,?,?}> {
+  // A8.6.9 Encoding T2
+  let Inst{7} = 1;
+  let Inst{2-0} = 0b101;
+}
+
+// Pseudo instruction that will expand into a tSUBspi + a copy.
+let usesCustomInserter = 1 in { // Expanded after instruction selection.
+def tSUBspi_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, t_imm_s4:$rhs),
+               NoItinerary, "@ sub\t$dst, $rhs", []>;
+
+def tADDspr_ : PseudoInst<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs),
+               NoItinerary, "@ add\t$dst, $rhs", []>;
+
+// AND pseudo; clobbers CPSR when expanded.
+let Defs = [CPSR] in
+def tANDsp : PseudoInst<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),
+             NoItinerary, "@ and\t$dst, $rhs", []>;
+} // usesCustomInserter
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions.
+//
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  // Return: "bx lr", matched from the ARMretflag node.
+  def tBX_RET : TI<(outs), (ins), IIC_Br, "bx\tlr", [(ARMretflag)]>,
+                T1Special<{1,1,0,?}> { // A6.2.3 & A8.6.25
+    let Inst{6-3} = 0b1110; // Rm = lr
+  }
+  // Alternative return instruction used by vararg functions.
+  def tBX_RET_vararg : TI<(outs), (ins tGPR:$target), IIC_Br, "bx\t$target", []>,
+                       T1Special<{1,1,0,?}>; // A6.2.3 & A8.6.25
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def tBRIND : TI<(outs), (ins GPR:$dst), IIC_Br, "mov\tpc, $dst",
+                  [(brind GPR:$dst)]>,
+               T1Special<{1,0,1,?}> {
+    // <Rd> = Inst{7:2-0} = pc
+    let Inst{2-0} = 0b111;
+  }
+}
+
+// pop with pc in the register list doubles as a return.
+// FIXME: remove when we have a way to mark an MI with these properties.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+def tPOP_RET : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+                   "pop${p}\t$wb", []>,
+               T1Misc<{1,1,0,?,?,?,?}>;
+
+// Calls clobber the caller-saved registers listed in Defs (including all
+// of D0-D7/D16-D31, CPSR and FPSCR).
+let isCall = 1,
+  Defs = [R0,  R1,  R2,  R3,  R12, LR,
+          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+          D16, D17, D18, D19, D20, D21, D22, D23,
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+  // Also used for Thumb2
+  def tBL  : TIx2<0b11110, 0b11, 1,
+                  (outs), (ins i32imm:$func, variable_ops), IIC_Br, 
+                  "bl\t${func:call}",
+                  [(ARMtcall tglobaladdr:$func)]>,
+             Requires<[IsThumb, IsNotDarwin]>;
+
+  // ARMv5T and above, also used for Thumb2
+  def tBLXi : TIx2<0b11110, 0b11, 0,
+                   (outs), (ins i32imm:$func, variable_ops), IIC_Br, 
+                   "blx\t${func:call}",
+                   [(ARMcall tglobaladdr:$func)]>,
+              Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+
+  // Also used for Thumb2
+  def tBLXr : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, 
+                  "blx\t$func",
+                  [(ARMtcall GPR:$func)]>,
+              Requires<[IsThumb, HasV5T, IsNotDarwin]>,
+              T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24;
+
+  // ARMv4T: no blx, so emulate the link with "mov lr, pc" before "bx".
+  def tBX : TIx2<{?,?,?,?,?}, {?,?}, ?,
+                  (outs), (ins tGPR:$func, variable_ops), IIC_Br, 
+                  "mov\tlr, pc\n\tbx\t$func",
+                  [(ARMcall_nolink tGPR:$func)]>,
+            Requires<[IsThumb1Only, IsNotDarwin]>;
+}
+
+// On Darwin R9 is call-clobbered.
+// Same four call forms as above, with R9 added to the Defs list.
+let isCall = 1,
+  Defs = [R0,  R1,  R2,  R3,  R9,  R12, LR,
+          D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+          D16, D17, D18, D19, D20, D21, D22, D23,
+          D24, D25, D26, D27, D28, D29, D30, D31, CPSR, FPSCR] in {
+  // Also used for Thumb2
+  def tBLr9 : TIx2<0b11110, 0b11, 1,
+                   (outs), (ins i32imm:$func, variable_ops), IIC_Br, 
+                   "bl\t${func:call}",
+                   [(ARMtcall tglobaladdr:$func)]>,
+              Requires<[IsThumb, IsDarwin]>;
+
+  // ARMv5T and above, also used for Thumb2
+  def tBLXi_r9 : TIx2<0b11110, 0b11, 0,
+                      (outs), (ins i32imm:$func, variable_ops), IIC_Br, 
+                      "blx\t${func:call}",
+                      [(ARMcall tglobaladdr:$func)]>,
+                 Requires<[IsThumb, HasV5T, IsDarwin]>;
+
+  // Also used for Thumb2
+  def tBLXr_r9 : TI<(outs), (ins GPR:$func, variable_ops), IIC_Br, 
+                    "blx\t$func",
+                    [(ARMtcall GPR:$func)]>,
+                 Requires<[IsThumb, HasV5T, IsDarwin]>,
+                 T1Special<{1,1,1,?}>; // A6.2.3 & A8.6.24
+
+  // ARMv4T
+  def tBXr9 : TIx2<{?,?,?,?,?}, {?,?}, ?,
+                   (outs), (ins tGPR:$func, variable_ops), IIC_Br, 
+                   "mov\tlr, pc\n\tbx\t$func",
+                   [(ARMcall_nolink tGPR:$func)]>,
+              Requires<[IsThumb1Only, IsDarwin]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  let isBarrier = 1 in {
+    // Unconditional branch.
+    let isPredicable = 1 in
+    def tB   : T1I<(outs), (ins brtarget:$target), IIC_Br,
+                   "b\t$target", [(br bb:$target)]>,
+               T1Encoding<{1,1,1,0,0,?}>;
+
+  // Far jump: encoded as a bl to get more range, so it clobbers LR.
+  let Defs = [LR] in
+  def tBfar : TIx2<0b11110, 0b11, 1, (outs), (ins brtarget:$target), IIC_Br, 
+                    "bl\t$target\t@ far jump",[]>;
+
+  // Jump table branch: "mov pc, $target" followed by the (aligned) table.
+  def tBR_JTr : T1JTI<(outs),
+                      (ins tGPR:$target, jtblock_operand:$jt, i32imm:$id),
+                      IIC_Br, "mov\tpc, $target\n\t.align\t2\n$jt",
+                      [(ARMbrjt tGPR:$target, tjumptable:$jt, imm:$id)]>,
+                Encoding16 {
+    let Inst{15-7} = 0b010001101;
+    let Inst{2-0} = 0b111;
+  }
+  }
+}
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+let isBranch = 1, isTerminator = 1 in
+  def tBcc : T1I<(outs), (ins brtarget:$target, pred:$cc), IIC_Br,
+                 "b$cc\t$target",
+                 [/*(ARMbrcond bb:$target, imm:$cc)*/]>,
+             T1Encoding<{1,1,0,1,?,?}>;
+
+// Compare and branch on zero / non-zero
+let isBranch = 1, isTerminator = 1 in {
+  def tCBZ  : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br,
+                  "cbz\t$cmp, $target", []>,
+              T1Misc<{0,0,?,1,?,?,?}>;
+
+  def tCBNZ : T1I<(outs), (ins tGPR:$cmp, brtarget:$target), IIC_Br,
+                  "cbnz\t$cmp, $target", []>,
+              T1Misc<{1,0,?,1,?,?,?}>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load Store Instructions.
+//
+// Each width comes in a register-offset form (with a selection pattern) and
+// an immediate-offset form with the same mnemonic (pattern-less; the complex
+// pattern on the addressing mode picks the encoding).
+
+// Word load, register/immediate offset (t_addrmode_s4 = reg + reg|imm5<<2).
+// NOTE(review): mayHaveSideEffects = 1 next to canFoldAsLoad/isReMaterializable
+// looks contradictory -- presumably a conservative default at this revision;
+// confirm intent.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def tLDR : T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, 
+               "ldr", "\t$dst, $addr",
+               [(set tGPR:$dst, (load t_addrmode_s4:$addr))]>,
+           T1LdSt<0b100>;
+def tLDRi: T1pI4<(outs tGPR:$dst), (ins t_addrmode_s4:$addr), IIC_iLoadr, 
+               "ldr", "\t$dst, $addr",
+               []>,
+           T1LdSt4Imm<{1,?,?}>;
+
+// Byte load (zero-extending).
+def tLDRB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr,
+                "ldrb", "\t$dst, $addr",
+                [(set tGPR:$dst, (zextloadi8 t_addrmode_s1:$addr))]>,
+            T1LdSt<0b110>;
+def tLDRBi: T1pI1<(outs tGPR:$dst), (ins t_addrmode_s1:$addr), IIC_iLoadr,
+                "ldrb", "\t$dst, $addr",
+                []>,
+            T1LdSt1Imm<{1,?,?}>;
+
+// Halfword load (zero-extending).
+def tLDRH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr,
+                "ldrh", "\t$dst, $addr",
+                [(set tGPR:$dst, (zextloadi16 t_addrmode_s2:$addr))]>,
+            T1LdSt<0b101>;
+def tLDRHi: T1pI2<(outs tGPR:$dst), (ins t_addrmode_s2:$addr), IIC_iLoadr,
+                "ldrh", "\t$dst, $addr",
+                []>,
+            T1LdSt2Imm<{1,?,?}>;
+
+// Sign-extending loads exist only in the register-register addressing form;
+// AddedComplexity prefers them over the ldrb/ldrh + sxt expansions below.
+let AddedComplexity = 10 in
+def tLDRSB : T1pI1<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+                 "ldrsb", "\t$dst, $addr",
+                 [(set tGPR:$dst, (sextloadi8 t_addrmode_rr:$addr))]>,
+             T1LdSt<0b011>;
+
+let AddedComplexity = 10 in
+def tLDRSH : T1pI2<(outs tGPR:$dst), (ins t_addrmode_rr:$addr), IIC_iLoadr,
+                 "ldrsh", "\t$dst, $addr",
+                 [(set tGPR:$dst, (sextloadi16 t_addrmode_rr:$addr))]>,
+             T1LdSt<0b111>;
+
+// SP-relative word load.
+let canFoldAsLoad = 1 in
+def tLDRspi : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
+                  "ldr", "\t$dst, $addr",
+                  [(set tGPR:$dst, (load t_addrmode_sp:$addr))]>,
+              T1LdStSP<{1,?,?}>;
+
+// Special instruction for restore. It cannot clobber condition register
+// when it's expanded by eliminateCallFramePseudoInstr().
+let canFoldAsLoad = 1, mayLoad = 1 in
+def tRestore : T1pIs<(outs tGPR:$dst), (ins t_addrmode_sp:$addr), IIC_iLoadi,
+                    "ldr", "\t$dst, $addr", []>,
+               T1LdStSP<{1,?,?}>;
+
+// Load tconstpool
+// FIXME: Use ldr.n to work around a Darwin assembler bug.
+// PC-relative constant-pool load; rematerializable so the loaded constant can
+// be recomputed instead of spilled.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1  in 
+def tLDRpci : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+                  "ldr", ".n\t$dst, $addr",
+                  [(set tGPR:$dst, (load (ARMWrapper tconstpool:$addr)))]>,
+              T1Encoding<{0,1,0,0,1,?}>; // A6.2 & A8.6.59
+
+// Special LDR for loads from non-pc-relative constpools.
+let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
+    mayHaveSideEffects = 1  in
+def tLDRcp  : T1pIs<(outs tGPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
+                  "ldr", "\t$dst, $addr", []>,
+              T1LdStSP<{1,?,?}>;
+
+// Word store, register/immediate offset forms (mirror tLDR/tLDRi).
+def tSTR : T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer,
+               "str", "\t$src, $addr",
+               [(store tGPR:$src, t_addrmode_s4:$addr)]>,
+           T1LdSt<0b000>;
+def tSTRi: T1pI4<(outs), (ins tGPR:$src, t_addrmode_s4:$addr), IIC_iStorer,
+               "str", "\t$src, $addr",
+               []>,
+           T1LdSt4Imm<{0,?,?}>;
+
+// Byte store (truncating).
+def tSTRB : T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer,
+                 "strb", "\t$src, $addr",
+                 [(truncstorei8 tGPR:$src, t_addrmode_s1:$addr)]>,
+            T1LdSt<0b010>;
+def tSTRBi: T1pI1<(outs), (ins tGPR:$src, t_addrmode_s1:$addr), IIC_iStorer,
+                 "strb", "\t$src, $addr",
+                 []>,
+            T1LdSt1Imm<{0,?,?}>;
+
+// Halfword store (truncating).
+def tSTRH : T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer,
+                 "strh", "\t$src, $addr",
+                 [(truncstorei16 tGPR:$src, t_addrmode_s2:$addr)]>,
+            T1LdSt<0b001>;
+def tSTRHi: T1pI2<(outs), (ins tGPR:$src, t_addrmode_s2:$addr), IIC_iStorer,
+                 "strh", "\t$src, $addr",
+                 []>,
+            T1LdSt2Imm<{0,?,?}>;
+
+// SP-relative word store.
+def tSTRspi : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
+                   "str", "\t$src, $addr",
+                   [(store tGPR:$src, t_addrmode_sp:$addr)]>,
+              T1LdStSP<{0,?,?}>;
+
+let mayStore = 1 in {
+// Special instruction for spill. It cannot clobber condition register
+// when it's expanded by eliminateCallFramePseudoInstr().
+def tSpill : T1pIs<(outs), (ins tGPR:$src, t_addrmode_sp:$addr), IIC_iStorei,
+                  "str", "\t$src, $addr", []>,
+             T1LdStSP<{0,?,?}>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// These require the base address to be written back or be one of the loaded
+// regs; the register list is carried as variable_ops after $wb.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in
+def tLDM : T1I<(outs),
+               (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+               IIC_iLoadm,
+               "ldm${addr:submode}${p}\t$addr, $wb", []>,
+           T1Encoding<{1,1,0,0,1,?}>; // A6.2 & A8.6.53
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
+def tSTM : T1I<(outs),
+               (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+               IIC_iStorem,
+               "stm${addr:submode}${p}\t$addr, $wb", []>,
+           T1Encoding<{1,1,0,0,0,?}>; // A6.2 & A8.6.189
+
+// pop/push implicitly use and update SP.
+let mayLoad = 1, Uses = [SP], Defs = [SP], hasExtraDefRegAllocReq = 1 in
+def tPOP : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+               "pop${p}\t$wb", []>,
+           T1Misc<{1,1,0,?,?,?,?}>;
+
+let mayStore = 1, Uses = [SP], Defs = [SP], hasExtraSrcRegAllocReq = 1 in
+def tPUSH : T1I<(outs), (ins pred:$p, reglist:$wb, variable_ops), IIC_Br,
+                "push${p}\t$wb", []>,
+            T1Misc<{0,1,0,?,?,?,?}>;
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+// Thumb1 two-address ALU ops (T1sIt/T1pIt) print only "$dst, $rhs": $lhs is
+// tied to $dst by the instruction format.
+
+// Add with carry register
+def tADC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+                 "adc", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (adde tGPR:$lhs, tGPR:$rhs))]>,
+           T1DataProcessing<0b0101>;
+
+// Add immediate (3-bit form: imm in [0,7]).
+def tADDi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+                   "add", "\t$dst, $lhs, $rhs",
+                   [(set tGPR:$dst, (add tGPR:$lhs, imm0_7:$rhs))]>,
+             T1General<0b01110>;
+
+// Add immediate (8-bit, two-address form: imm in [8,255]).
+def tADDi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+                   "add", "\t$dst, $rhs",
+                   [(set tGPR:$dst, (add tGPR:$lhs, imm8_255:$rhs))]>,
+             T1General<{1,1,0,?,?}>;
+
+// Add register
+let isCommutable = 1 in
+def tADDrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+                   "add", "\t$dst, $lhs, $rhs",
+                   [(set tGPR:$dst, (add tGPR:$lhs, tGPR:$rhs))]>,
+             T1General<0b01100>;
+
+// Hi-register add (operates on full GPR class, does not set flags).
+let neverHasSideEffects = 1 in
+def tADDhirr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
+                     "add", "\t$dst, $rhs", []>,
+               T1Special<{0,0,?,?}>;
+
+// And register
+let isCommutable = 1 in
+def tAND : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+                 "and", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (and tGPR:$lhs, tGPR:$rhs))]>,
+           T1DataProcessing<0b0000>;
+
+// ASR immediate
+def tASRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+                  "asr", "\t$dst, $lhs, $rhs",
+                  [(set tGPR:$dst, (sra tGPR:$lhs, (i32 imm:$rhs)))]>,
+             T1General<{0,1,0,?,?}>;
+
+// ASR register
+def tASRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+                   "asr", "\t$dst, $rhs",
+                   [(set tGPR:$dst, (sra tGPR:$lhs, tGPR:$rhs))]>,
+             T1DataProcessing<0b0100>;
+
+// BIC register (and-with-complement).
+def tBIC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+                 "bic", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (and tGPR:$lhs, (not tGPR:$rhs)))]>,
+           T1DataProcessing<0b1110>;
+
+// CMN register
+let Defs = [CPSR] in {
+//FIXME: Disable CMN, as CCodes are backwards from compare expectations
+//       Compare-to-zero still works out, just not the relationals
+//def tCMN : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+//                "cmn", "\t$lhs, $rhs",
+//                [(ARMcmp tGPR:$lhs, (ineg tGPR:$rhs))]>,
+//           T1DataProcessing<0b1011>;
+def tCMNz : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+                 "cmn", "\t$lhs, $rhs",
+                 [(ARMcmpZ tGPR:$lhs, (ineg tGPR:$rhs))]>,
+            T1DataProcessing<0b1011>;
+}
+
+// CMP immediate
+// tCMPzi8 duplicates tCMPi8 for the compare-against-zero DAG node (ARMcmpZ);
+// encodings are identical.
+let Defs = [CPSR] in {
+def tCMPi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
+                  "cmp", "\t$lhs, $rhs",
+                  [(ARMcmp tGPR:$lhs, imm0_255:$rhs)]>,
+             T1General<{1,0,1,?,?}>;
+def tCMPzi8 : T1pI<(outs), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMPi,
+                  "cmp", "\t$lhs, $rhs",
+                  [(ARMcmpZ tGPR:$lhs, imm0_255:$rhs)]>,
+              T1General<{1,0,1,?,?}>;
+}
+
+// CMP register
+// tCMPhir/tCMPzhir are the hi-register (full GPR) forms, selected manually.
+let Defs = [CPSR] in {
+def tCMPr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+                 "cmp", "\t$lhs, $rhs",
+                 [(ARMcmp tGPR:$lhs, tGPR:$rhs)]>,
+            T1DataProcessing<0b1010>;
+def tCMPzr : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+                  "cmp", "\t$lhs, $rhs",
+                  [(ARMcmpZ tGPR:$lhs, tGPR:$rhs)]>,
+             T1DataProcessing<0b1010>;
+
+def tCMPhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+                   "cmp", "\t$lhs, $rhs", []>,
+              T1Special<{0,1,?,?}>;
+def tCMPzhir : T1pI<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
+                    "cmp", "\t$lhs, $rhs", []>,
+               T1Special<{0,1,?,?}>;
+}
+
+
+// XOR register
+let isCommutable = 1 in
+def tEOR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+                 "eor", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (xor tGPR:$lhs, tGPR:$rhs))]>,
+           T1DataProcessing<0b0001>;
+
+// LSL immediate
+def tLSLri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+                  "lsl", "\t$dst, $lhs, $rhs",
+                  [(set tGPR:$dst, (shl tGPR:$lhs, (i32 imm:$rhs)))]>,
+             T1General<{0,0,0,?,?}>;
+
+// LSL register
+def tLSLrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+                   "lsl", "\t$dst, $rhs",
+                   [(set tGPR:$dst, (shl tGPR:$lhs, tGPR:$rhs))]>,
+             T1DataProcessing<0b0010>;
+
+// LSR immediate
+def tLSRri : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
+                  "lsr", "\t$dst, $lhs, $rhs",
+                  [(set tGPR:$dst, (srl tGPR:$lhs, (i32 imm:$rhs)))]>,
+             T1General<{0,0,1,?,?}>;
+
+// LSR register
+def tLSRrr : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+                   "lsr", "\t$dst, $rhs",
+                   [(set tGPR:$dst, (srl tGPR:$lhs, tGPR:$rhs))]>,
+             T1DataProcessing<0b0011>;
+
+// move immediate (8-bit, [0,255]); the flag-setting variant of MOV.
+def tMOVi8 : T1sI<(outs tGPR:$dst), (ins i32imm:$src), IIC_iMOVi,
+                  "mov", "\t$dst, $src",
+                  [(set tGPR:$dst, imm0_255:$src)]>,
+             T1General<{1,0,0,?,?}>;
+
+// TODO: A7-73: MOV(2) - mov setting flag.
+
+
+// Register-to-register moves. The gpr/tgpr suffix pairs exist so the register
+// allocator can see which operand may use a hi register (GPR) vs a lo
+// register (tGPR); all encode the same hi-register MOV except tMOVSr.
+let neverHasSideEffects = 1 in {
+// FIXME: Make this predicable.
+def tMOVr       : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+                      "mov\t$dst, $src", []>,
+                  T1Special<0b1000>;
+// Flag-setting move (lsls #0 encoding); CPSR is clobbered.
+let Defs = [CPSR] in
+def tMOVSr      : T1I<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+                       "movs\t$dst, $src", []>, Encoding16 {
+  let Inst{15-6} = 0b0000000000;
+}
+
+// FIXME: Make these predicable.
+def tMOVgpr2tgpr : T1I<(outs tGPR:$dst), (ins GPR:$src), IIC_iMOVr,
+                       "mov\t$dst, $src", []>,
+                   T1Special<{1,0,0,?}>;
+def tMOVtgpr2gpr : T1I<(outs GPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+                       "mov\t$dst, $src", []>,
+                   T1Special<{1,0,?,0}>;
+def tMOVgpr2gpr  : T1I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+                       "mov\t$dst, $src", []>,
+                   T1Special<{1,0,?,?}>;
+} // neverHasSideEffects
+
+// multiply register
+let isCommutable = 1 in
+def tMUL : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMUL32,
+                 "mul", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (mul tGPR:$lhs, tGPR:$rhs))]>,
+           T1DataProcessing<0b1101>;
+
+// move inverse register (bitwise NOT).
+def tMVN : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iMOVr,
+                "mvn", "\t$dst, $src",
+                [(set tGPR:$dst, (not tGPR:$src))]>,
+           T1DataProcessing<0b1111>;
+
+// bitwise or register
+let isCommutable = 1 in
+def tORR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs),  IIC_iALUr,
+                 "orr", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (or tGPR:$lhs, tGPR:$rhs))]>,
+           T1DataProcessing<0b1100>;
+
+// Byte-reversal instructions (ARMv6+): rev = full bswap, rev16 = bswap each
+// halfword, revsh = bswap low halfword then sign-extend.
+def tREV : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+                "rev", "\t$dst, $src",
+                [(set tGPR:$dst, (bswap tGPR:$src))]>,
+                Requires<[IsThumb1Only, HasV6]>,
+           T1Misc<{1,0,1,0,0,0,?}>;
+
+def tREV16 : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+                  "rev16", "\t$dst, $src",
+             [(set tGPR:$dst,
+                   (or (and (srl tGPR:$src, (i32 8)), 0xFF),
+                       (or (and (shl tGPR:$src, (i32 8)), 0xFF00),
+                           (or (and (srl tGPR:$src, (i32 8)), 0xFF0000),
+                               (and (shl tGPR:$src, (i32 8)), 0xFF000000)))))]>,
+                Requires<[IsThumb1Only, HasV6]>,
+             T1Misc<{1,0,1,0,0,1,?}>;
+
+def tREVSH : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+                  "revsh", "\t$dst, $src",
+                  [(set tGPR:$dst,
+                        (sext_inreg
+                          (or (srl (and tGPR:$src, 0xFF00), (i32 8)),
+                              (shl tGPR:$src, (i32 8))), i16))]>,
+                  Requires<[IsThumb1Only, HasV6]>,
+             T1Misc<{1,0,1,0,1,1,?}>;
+
+// rotate right register
+def tROR : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iMOVsr,
+                 "ror", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (rotr tGPR:$lhs, tGPR:$rhs))]>,
+           T1DataProcessing<0b0111>;
+
+// negate register: rsb $dst, $src, #0 implements integer negation.
+def tRSB : T1sI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iALUi,
+                "rsb", "\t$dst, $src, #0",
+                [(set tGPR:$dst, (ineg tGPR:$src))]>,
+           T1DataProcessing<0b1001>;
+
+// Subtract with carry register
+let Uses = [CPSR] in
+def tSBC : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+                 "sbc", "\t$dst, $rhs",
+                 [(set tGPR:$dst, (sube tGPR:$lhs, tGPR:$rhs))]>,
+           T1DataProcessing<0b0110>;
+
+// Subtract immediate. Selected as an add of the negated immediate so a
+// single DAG form covers both add and sub.
+def tSUBi3 : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+                  "sub", "\t$dst, $lhs, $rhs",
+                  [(set tGPR:$dst, (add tGPR:$lhs, imm0_7_neg:$rhs))]>,
+             T1General<0b01111>;
+
+def tSUBi8 : T1sIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iALUi,
+                   "sub", "\t$dst, $rhs",
+                   [(set tGPR:$dst, (add tGPR:$lhs, imm8_255_neg:$rhs))]>,
+             T1General<{1,1,1,?,?}>;
+
+// subtract register
+def tSUBrr : T1sI<(outs tGPR:$dst), (ins tGPR:$lhs, tGPR:$rhs), IIC_iALUr,
+                  "sub", "\t$dst, $lhs, $rhs",
+                  [(set tGPR:$dst, (sub tGPR:$lhs, tGPR:$rhs))]>,
+             T1General<0b01101>;
+
+// TODO: A7-96: STMIA - store multiple.
+
+// sign-extend byte (ARMv6+)
+def tSXTB  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+                  "sxtb", "\t$dst, $src",
+                  [(set tGPR:$dst, (sext_inreg tGPR:$src, i8))]>,
+                  Requires<[IsThumb1Only, HasV6]>,
+             T1Misc<{0,0,1,0,0,1,?}>;
+
+// sign-extend short (ARMv6+)
+def tSXTH  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+                  "sxth", "\t$dst, $src",
+                  [(set tGPR:$dst, (sext_inreg tGPR:$src, i16))]>,
+                  Requires<[IsThumb1Only, HasV6]>,
+             T1Misc<{0,0,1,0,0,0,?}>;
+
+// test: sets flags from lhs & rhs, no register result.
+let isCommutable = 1, Defs = [CPSR] in
+def tTST  : T1pI<(outs), (ins tGPR:$lhs, tGPR:$rhs), IIC_iCMPr,
+                 "tst", "\t$lhs, $rhs",
+                 [(ARMcmpZ (and tGPR:$lhs, tGPR:$rhs), 0)]>,
+            T1DataProcessing<0b1000>;
+
+// zero-extend byte (ARMv6+)
+def tUXTB  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+                  "uxtb", "\t$dst, $src",
+                  [(set tGPR:$dst, (and tGPR:$src, 0xFF))]>,
+                  Requires<[IsThumb1Only, HasV6]>,
+             T1Misc<{0,0,1,0,1,1,?}>;
+
+// zero-extend short (ARMv6+)
+def tUXTH  : T1pI<(outs tGPR:$dst), (ins tGPR:$src), IIC_iUNAr,
+                  "uxth", "\t$dst, $src",
+                  [(set tGPR:$dst, (and tGPR:$src, 0xFFFF))]>,
+                  Requires<[IsThumb1Only, HasV6]>,
+             T1Misc<{0,0,1,0,1,0,?}>;
+
+
+// Conditional move pseudo - Used to implement the Thumb SELECT_CC DAG
+// operation. Expanded after instruction selection into a branch sequence
+// (usesCustomInserter).
+let usesCustomInserter = 1 in  // Expanded after instruction selection.
+  def tMOVCCr_pseudo :
+  PseudoInst<(outs tGPR:$dst), (ins tGPR:$false, tGPR:$true, pred:$cc),
+              NoItinerary, "@ tMOVCCr $cc",
+             [/*(set tGPR:$dst, (ARMcmov tGPR:$false, tGPR:$true, imm:$cc))*/]>;
+
+
+// 16-bit movcc in IT blocks for Thumb2.
+def tMOVCCr : T1pIt<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iCMOVr,
+                    "mov", "\t$dst, $rhs", []>,
+              T1Special<{1,0,?,?}>;
+
+def tMOVCCi : T1pIt<(outs tGPR:$dst), (ins tGPR:$lhs, i32imm:$rhs), IIC_iCMOVi,
+                    "mov", "\t$dst, $rhs", []>,
+              T1General<{1,0,0,?,?}>;
+
+// tLEApcrel - Load a pc-relative address into a register without offending the
+// assembler.
+def tLEApcrel : T1I<(outs tGPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
+                    "adr$p\t$dst, #$label", []>,
+                T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10
+
+// Jump-table flavor: label is suffixed with the table id.
+def tLEApcrelJT : T1I<(outs tGPR:$dst),
+                      (ins i32imm:$label, nohash_imm:$id, pred:$p),
+                      IIC_iALUi, "adr$p\t$dst, #${label}_${id}", []>,
+                  T1Encoding<{1,0,1,0,0,?}>; // A6.2 & A8.6.10
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// Software thread-pointer read: calls the EABI helper, result lands in R0.
+let isCall = 1,
+  Defs = [R0, LR] in {
+  def tTPsoft : TIx2<0b11110, 0b11, 1, (outs), (ins), IIC_Br,
+                     "bl\t__aeabi_read_tp",
+                     [(set R0, ARMthread_pointer)]>;
+}
+
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved registers, which is exactly what we want.
+//   The current SP is passed in $val, and we reuse the reg as a scratch.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7, R12 ] in {
+  def tInt_eh_sjlj_setjmp : ThumbXI<(outs),(ins tGPR:$src, tGPR:$val),
+                              AddrModeNone, SizeSpecial, NoItinerary,
+                              "str\t$val, [$src, #8]\t@ begin eh.setjmp\n"
+                              "\tmov\t$val, pc\n"
+                              "\tadds\t$val, #9\n"
+                              "\tstr\t$val, [$src, #4]\n"
+                              "\tmovs\tr0, #0\n"
+                              "\tb\t1f\n"
+                              "\tmovs\tr0, #1\t@ end eh.setjmp\n"
+                              "1:", "",
+                   [(set R0, (ARMeh_sjlj_setjmp tGPR:$src, tGPR:$val))]>;
+}
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Add with carry-out: the plain adds already set the carry flag, so addc maps
+// onto the ordinary add instructions.
+def : T1Pat<(addc   tGPR:$lhs, imm0_7:$rhs),
+            (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255:$rhs),
+            (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, tGPR:$rhs),
+            (tADDrr tGPR:$lhs, tGPR:$rhs)>;
+
+// Subtract with carry-out; immediate subtracts are matched as addc of the
+// negated immediate (see tSUBi3/tSUBi8).
+def : T1Pat<(addc   tGPR:$lhs, imm0_7_neg:$rhs),
+            (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
+def : T1Pat<(addc   tGPR:$lhs, imm8_255_neg:$rhs),
+            (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
+def : T1Pat<(subc   tGPR:$lhs, tGPR:$rhs),
+            (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
+
+// ConstantPool, GlobalAddress
+def : T1Pat<(ARMWrapper  tglobaladdr :$dst), (tLEApcrel tglobaladdr :$dst)>;
+def : T1Pat<(ARMWrapper  tconstpool  :$dst), (tLEApcrel tconstpool  :$dst)>;
+
+// JumpTable
+def : T1Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+            (tLEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Direct calls (Darwin vs non-Darwin use different call-clobber defs).
+def : T1Pat<(ARMtcall texternalsym:$func), (tBL texternalsym:$func)>,
+      Requires<[IsThumb, IsNotDarwin]>;
+def : T1Pat<(ARMtcall texternalsym:$func), (tBLr9 texternalsym:$func)>,
+      Requires<[IsThumb, IsDarwin]>;
+
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi texternalsym:$func)>,
+      Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+def : Tv5Pat<(ARMcall texternalsym:$func), (tBLXi_r9 texternalsym:$func)>,
+      Requires<[IsThumb, HasV5T, IsDarwin]>;
+
+// Indirect calls to ARM routines
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr GPR:$dst)>,
+      Requires<[IsThumb, HasV5T, IsNotDarwin]>;
+def : Tv5Pat<(ARMcall GPR:$dst), (tBLXr_r9 GPR:$dst)>,
+      Requires<[IsThumb, HasV5T, IsDarwin]>;
+
+// zextload i1 -> zextload i8
+def : T1Pat<(zextloadi1 t_addrmode_s1:$addr),
+            (tLDRB t_addrmode_s1:$addr)>;
+
+// extload -> zextload
+def : T1Pat<(extloadi1  t_addrmode_s1:$addr),  (tLDRB t_addrmode_s1:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_s1:$addr),  (tLDRB t_addrmode_s1:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_s2:$addr),  (tLDRH t_addrmode_s2:$addr)>;
+
+// If it's impossible to use [r,r] address mode for sextload, select to
+// ldr{b|h} + sxt{b|h} instead.
+def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
+            (tSXTB (tLDRB t_addrmode_s1:$addr))>,
+      Requires<[IsThumb1Only, HasV6]>;
+def : T1Pat<(sextloadi16 t_addrmode_s2:$addr),
+            (tSXTH (tLDRH t_addrmode_s2:$addr))>,
+      Requires<[IsThumb1Only, HasV6]>;
+
+// Pre-V6 fallback: shift left then arithmetic-shift right.
+// NOTE(review): the second pattern feeds t_addrmode_s1 to tLDRH, whose input
+// is declared as t_addrmode_s2 everywhere else -- confirm this is intended
+// (the V6 pattern above uses t_addrmode_s2).
+def : T1Pat<(sextloadi8 t_addrmode_s1:$addr),
+            (tASRri (tLSLri (tLDRB t_addrmode_s1:$addr), 24), 24)>;
+def : T1Pat<(sextloadi16 t_addrmode_s1:$addr),
+            (tASRri (tLSLri (tLDRH t_addrmode_s1:$addr), 16), 16)>;
+
+// Large immediate handling.
+
+// Two piece imms: materialize as mov of the base value, shifted into place.
+def : T1Pat<(i32 thumb_immshifted:$src),
+            (tLSLri (tMOVi8 (thumb_immshifted_val imm:$src)),
+                    (thumb_immshifted_shamt imm:$src))>;
+
+// Complemented 8-bit immediates: mov then mvn.
+def : T1Pat<(i32 imm0_255_comp:$src),
+            (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+let isReMaterializable = 1 in
+def tLDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                   NoItinerary, "@ ldr.n\t$dst, $addr\n$cp:\n\tadd\t$dst, pc",
+               [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb1Only]>;
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
new file mode 100644
index 0000000..55c7aa2
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -0,0 +1,1978 @@
+//===- ARMInstrThumb2.td - Thumb2 support for ARM -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Thumb2 instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// IT block predicate field
+def it_pred : Operand<i32> {
+  let PrintMethod = "printPredicateOperand";
+}
+
+// IT block condition mask
+def it_mask : Operand<i32> {
+  let PrintMethod = "printThumbITMask";
+}
+
+// Table branch address
+def tb_addrmode : Operand<i32> {
+  let PrintMethod = "printTBAddrMode";
+}
+
+// Shifted operands. No register controlled shifts for Thumb2.
+// Note: We do not support rrx shifted operands yet.
+// Two sub-operands: the base register and a packed shift-kind/amount imm.
+def t2_so_reg : Operand<i32>,    // reg imm
+                ComplexPattern<i32, 2, "SelectT2ShifterOperandReg",
+                               [shl,srl,sra,rotr]> {
+  let PrintMethod = "printT2SOOperand";
+  let MIOperandInfo = (ops GPR, i32imm);
+}
+
+// t2_so_imm_not_XFORM - Return the complement of a t2_so_imm value
+def t2_so_imm_not_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), MVT::i32);
+}]>;
+
+// t2_so_imm_neg_XFORM - Return the negation of a t2_so_imm value
+def t2_so_imm_neg_XFORM : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(-((int)N->getZExtValue()), MVT::i32);
+}]>;
+
+// t2_so_imm - Match a 32-bit immediate operand, which is an
+// 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
+// immediate splatted into multiple bytes of the word. t2_so_imm values are
+// represented in the imm field in the same 12-bit form that they are encoded
+// into t2_so_imm instructions: the 8-bit immediate is the least significant
+// bits [bits 0-7], the 4-bit shift/splat amount is the next 4 bits [bits 8-11].
+def t2_so_imm : Operand<i32>,
+                PatLeaf<(imm), [{
+  return ARM_AM::getT2SOImmVal((uint32_t)N->getZExtValue()) != -1; 
+}]>;
+
+// t2_so_imm_not - Match an immediate that is a complement 
+// of a t2_so_imm.
+def t2_so_imm_not : Operand<i32>,
+                    PatLeaf<(imm), [{
+  return ARM_AM::getT2SOImmVal(~((uint32_t)N->getZExtValue())) != -1;
+}], t2_so_imm_not_XFORM>;
+
+// t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
+def t2_so_imm_neg : Operand<i32>,
+                    PatLeaf<(imm), [{
+  return ARM_AM::getT2SOImmVal(-((int)N->getZExtValue())) != -1;
+}], t2_so_imm_neg_XFORM>;
+
+// Break t2_so_imm's up into two pieces.  This handles immediates with up to 16
+// bits set in them.  This uses t2_so_imm2part to match and t2_so_imm2part_[12]
+// to get the first/second pieces.
+def t2_so_imm2part : Operand<i32>,
+                  PatLeaf<(imm), [{
+      return ARM_AM::isT2SOImmTwoPartVal((unsigned)N->getZExtValue());
+    }]> {
+}
+
+def t2_so_imm2part_1 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartFirst((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def t2_so_imm2part_2 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartSecond((unsigned)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+// Negated variants of the two-part splitters above.
+def t2_so_neg_imm2part : Operand<i32>, PatLeaf<(imm), [{
+      return ARM_AM::isT2SOImmTwoPartVal(-(int)N->getZExtValue());
+    }]> {
+}
+
+def t2_so_neg_imm2part_1 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartFirst(-(int)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+def t2_so_neg_imm2part_2 : SDNodeXForm<imm, [{
+  unsigned V = ARM_AM::getT2SOImmTwoPartSecond(-(int)N->getZExtValue());
+  return CurDAG->getTargetConstant(V, MVT::i32);
+}]>;
+
+/// imm1_31 predicate - True if the 32-bit immediate is in the range [1,31].
+def imm1_31 : PatLeaf<(i32 imm), [{
+  return (int32_t)N->getZExtValue() >= 1 && (int32_t)N->getZExtValue() < 32;
+}]>;
+
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
+def imm0_4095 : Operand<i32>,
+                PatLeaf<(i32 imm), [{
+  return (uint32_t)N->getZExtValue() < 4096;
+}]>;
+
+def imm0_4095_neg : PatLeaf<(i32 imm), [{ 
+ return (uint32_t)(-N->getZExtValue()) < 4096; 
+}], imm_neg_XFORM>; 
+
+// NOTE(review): '< 255' excludes -255 even though the name says 0_255, and is
+// inconsistent with imm0_4095_neg's '< 4096' above -- confirm whether this
+// off-by-one is intentional.
+def imm0_255_neg : PatLeaf<(i32 imm), [{
+  return (uint32_t)(-N->getZExtValue()) < 255;
+}], imm_neg_XFORM>; 
+
+// Define Thumb2 specific addressing modes.
+
+// t2addrmode_imm12  := reg + imm12 (positive offset only)
+def t2addrmode_imm12 : Operand<i32>,
+                       ComplexPattern<i32, 2, "SelectT2AddrModeImm12", []> {
+  let PrintMethod = "printT2AddrModeImm12Operand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_imm8  := reg - imm8
+def t2addrmode_imm8 : Operand<i32>,
+                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
+  let PrintMethod = "printT2AddrModeImm8Operand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// Bare +/- imm8 offset, used by pre/post-indexed forms (no base operand).
+def t2am_imm8_offset : Operand<i32>,
+                       ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset", []>{
+  let PrintMethod = "printT2AddrModeImm8OffsetOperand";
+}
+
+// t2addrmode_imm8s4  := reg +/- (imm8 << 2)
+def t2addrmode_imm8s4 : Operand<i32>,
+                        ComplexPattern<i32, 2, "SelectT2AddrModeImm8s4", []> {
+  let PrintMethod = "printT2AddrModeImm8s4Operand";
+  let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
+}
+
+// t2addrmode_so_reg  := reg + (reg << imm2)
+def t2addrmode_so_reg : Operand<i32>,
+                        ComplexPattern<i32, 3, "SelectT2AddrModeSoReg", []> {
+  let PrintMethod = "printT2AddrModeSoRegOperand";
+  let MIOperandInfo = (ops GPR:$base, GPR:$offsreg, i32imm:$offsimm);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Multiclass helpers...
+//
+
+/// T2I_un_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
+/// unary operation that produces a value. These are predicable and can be
+/// changed to modify CPSR.
+/// opcod goes into Inst{24-21}; Cheap/ReMat apply to the immediate form only.
+multiclass T2I_un_irs<bits<4> opcod, string opc, PatFrag opnode,
+                      bit Cheap = 0, bit ReMat = 0> {
+   // shifted imm
+   def i : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi,
+                opc, "\t$dst, $src",
+                [(set GPR:$dst, (opnode t2_so_imm:$src))]> {
+     let isAsCheapAsAMove = Cheap;
+     let isReMaterializable = ReMat;
+     let Inst{31-27} = 0b11110;
+     let Inst{25} = 0;
+     let Inst{24-21} = opcod;
+     let Inst{20} = ?; // The S bit.
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{15} = 0;
+   }
+   // register (shift fields fixed to "no shift")
+   def r : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr,
+               opc, ".w\t$dst, $src",
+                [(set GPR:$dst, (opnode GPR:$src))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = ?; // The S bit.
+     let Inst{19-16} = 0b1111; // Rn
+     let Inst{14-12} = 0b000; // imm3
+     let Inst{7-6} = 0b00; // imm2
+     let Inst{5-4} = 0b00; // type
+   }
+   // shifted register (imm3/imm2/type come from the t2_so_reg operand)
+   def s : T2I<(outs GPR:$dst), (ins t2_so_reg:$src), IIC_iMOVsi,
+               opc, ".w\t$dst, $src",
+               [(set GPR:$dst, (opnode t2_so_reg:$src))]> {
+     let Inst{31-27} = 0b11101;
+     let Inst{26-25} = 0b01;
+     let Inst{24-21} = opcod;
+     let Inst{20} = ?; // The S bit.
+     let Inst{19-16} = 0b1111; // Rn
+   }
+}
+
/// T2I_bin_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns for a
/// binary operation that produces a value. These are predicable and can be
/// changed to modify CPSR.
/// The 'wide' string (e.g. ".w") is prepended to the asm operand list of the
/// register / shifted-register forms only; see T2I_bin_w_irs below.
multiclass T2I_bin_irs<bits<4> opcod, string opc, PatFrag opnode, 
                       bit Commutable = 0, string wide =""> {
   // shifted imm
   def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
                 opc, "\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;
     let Inst{20} = ?; // The S bit.
     let Inst{15} = 0;
   }
   // register
   def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
                 opc, !strconcat(wide, "\t$dst, $lhs, $rhs"),
                 [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
     let isCommutable = Commutable;
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = ?; // The S bit.
     let Inst{14-12} = 0b000; // imm3
     let Inst{7-6} = 0b00; // imm2
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
   def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
                 opc, !strconcat(wide, "\t$dst, $lhs, $rhs"),
                 [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = ?; // The S bit.
   }
}
+
/// T2I_bin_w_irs - Same as T2I_bin_irs except these operations need
/// the ".w" suffix to indicate that they are wide (32-bit Thumb2 encodings).
multiclass T2I_bin_w_irs<bits<4> opcod, string opc, PatFrag opnode,
                         bit Commutable = 0> :
    T2I_bin_irs<opcod, opc, opnode, Commutable, ".w">;
+
/// T2I_rbin_is - Same as T2I_bin_irs except the order of operands are
/// reversed (for rsb-style ops: result = $lhs_imm OP $rhs_reg). It doesn't
/// define the 'rr' form since it's handled by its T2I_bin_irs counterpart.
multiclass T2I_rbin_is<bits<4> opcod, string opc, PatFrag opnode> {
   // shifted imm
   def ri : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs), IIC_iALUi,
                opc, ".w\t$dst, $rhs, $lhs",
                [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;
     let Inst{20} = 0; // The S bit.
     let Inst{15} = 0;
   }
   // shifted register
   def rs : T2I<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs), IIC_iALUsi,
                opc, "\t$dst, $rhs, $lhs",
                [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 0; // The S bit.
   }
}
+
/// T2I_bin_s_irs - Similar to T2I_bin_irs except it sets the 's' bit so the
/// instruction modifies the CPSR register.
/// All variants define CPSR (let Defs below) and append "s" to the mnemonic.
let Defs = [CPSR] in {
multiclass T2I_bin_s_irs<bits<4> opcod, string opc, PatFrag opnode,
                         bit Commutable = 0> {
   // shifted imm
   def ri : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
                !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs",
                [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{15} = 0;
   }
   // register
   def rr : T2I<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
                !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs",
                [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
     let isCommutable = Commutable;
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{14-12} = 0b000; // imm3
     let Inst{7-6} = 0b00; // imm2
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
   def rs : T2I<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
                !strconcat(opc, "s"), ".w\t$dst, $lhs, $rhs",
                [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
   }
}
}
+
/// T2I_bin_ii12rs - Defines a set of (op reg, {so_imm|imm0_4095|r|so_reg})
/// patterns for a binary operation that produces a value.
/// Adds an 'ri12' form (12-bit unshifted immediate, "addw"/"subw" syntax) on
/// top of the usual ri/rr/rs set; only bits 23-21 of the opcode vary, with
/// bits 25-24 distinguishing the modified-imm vs plain-imm encodings.
multiclass T2I_bin_ii12rs<bits<3> op23_21, string opc, PatFrag opnode,
                          bit Commutable = 0> {
   // shifted imm
   def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
                 opc, ".w\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24} = 1;
     let Inst{23-21} = op23_21;
     let Inst{20} = 0; // The S bit.
     let Inst{15} = 0;
   }
   // 12-bit imm
   def ri12 : T2sI<(outs GPR:$dst), (ins GPR:$lhs, imm0_4095:$rhs), IIC_iALUi,
                   !strconcat(opc, "w"), "\t$dst, $lhs, $rhs",
                   [(set GPR:$dst, (opnode GPR:$lhs, imm0_4095:$rhs))]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 1;
     let Inst{24} = 0;
     let Inst{23-21} = op23_21;
     let Inst{20} = 0; // The S bit.
     let Inst{15} = 0;
   }
   // register
   def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
                 opc, ".w\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
     let isCommutable = Commutable;
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24} = 1;
     let Inst{23-21} = op23_21;
     let Inst{20} = 0; // The S bit.
     let Inst{14-12} = 0b000; // imm3
     let Inst{7-6} = 0b00; // imm2
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
   def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
                 opc, ".w\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24} = 1;
     let Inst{23-21} = op23_21;
     let Inst{20} = 0; // The S bit.
   }
}
+
/// T2I_adde_sube_irs - Defines a set of (op reg, {so_imm|r|so_reg}) patterns
/// for a binary operation that produces a value and use and define the carry
/// bit. It's not predicable.
/// Two parallel sets are generated: the plain ri/rr/rs variants match when the
/// carry def is unused (CarryDefIsUnused), while the Sri/Srr/Srs "s" variants
/// match when it is used (CarryDefIsUsed) and additionally define CPSR.
let Uses = [CPSR] in {
multiclass T2I_adde_sube_irs<bits<4> opcod, string opc, PatFrag opnode, bit Commutable = 0> {
   // shifted imm
   def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
                 opc, "\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>,
                 Requires<[IsThumb2, CarryDefIsUnused]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;
     let Inst{20} = 0; // The S bit.
     let Inst{15} = 0;
   }
   // register
   def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
                 opc, ".w\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>,
                 Requires<[IsThumb2, CarryDefIsUnused]> {
     let isCommutable = Commutable;
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 0; // The S bit.
     let Inst{14-12} = 0b000; // imm3
     let Inst{7-6} = 0b00; // imm2
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
   def rs : T2sI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
                 opc, ".w\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>,
                 Requires<[IsThumb2, CarryDefIsUnused]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 0; // The S bit.
   }
   // Carry setting variants
   // shifted imm
   def Sri : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iALUi,
                  !strconcat(opc, "s\t$dst, $lhs, $rhs"),
                  [(set GPR:$dst, (opnode GPR:$lhs, t2_so_imm:$rhs))]>,
                  Requires<[IsThumb2, CarryDefIsUsed]> {
     let Defs = [CPSR];
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{15} = 0;
   }
   // register
   def Srr : T2XI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iALUr,
                  !strconcat(opc, "s.w\t$dst, $lhs, $rhs"),
                  [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]>,
                  Requires<[IsThumb2, CarryDefIsUsed]> {
     let Defs = [CPSR];
     let isCommutable = Commutable;
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{14-12} = 0b000; // imm3
     let Inst{7-6} = 0b00; // imm2
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
   def Srs : T2XI<(outs GPR:$dst), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iALUsi,
                  !strconcat(opc, "s.w\t$dst, $lhs, $rhs"),
                  [(set GPR:$dst, (opnode GPR:$lhs, t2_so_reg:$rhs))]>,
                  Requires<[IsThumb2, CarryDefIsUsed]> {
     let Defs = [CPSR];
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
   }
}
}
+
/// T2I_rbin_s_is - Same as T2I_rbin_is except sets 's' bit (always encodes
/// S=1 and defines CPSR; the cc_out:$s operand appears in the asm string).
let Defs = [CPSR] in {
multiclass T2I_rbin_s_is<bits<4> opcod, string opc, PatFrag opnode> {
   // shifted imm
   def ri : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_imm:$lhs, cc_out:$s),
                 IIC_iALUi,
                 !strconcat(opc, "${s}.w\t$dst, $rhs, $lhs"),
                 [(set GPR:$dst, (opnode t2_so_imm:$lhs, GPR:$rhs))]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{15} = 0;
   }
   // shifted register
   def rs : T2XI<(outs GPR:$dst), (ins GPR:$rhs, t2_so_reg:$lhs, cc_out:$s),
                 IIC_iALUsi,
                 !strconcat(opc, "${s}\t$dst, $rhs, $lhs"),
                 [(set GPR:$dst, (opnode t2_so_reg:$lhs, GPR:$rhs))]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
   }
}
}
+
/// T2I_sh_ir - Defines a set of (op reg, {so_imm|r}) patterns for a shift /
/// rotate operation that produces a value.
/// 'opcod' is the 2-bit shift type; the immediate form only matches shift
/// amounts in [1,31] (imm1_31) since a zero shift is just a move.
multiclass T2I_sh_ir<bits<2> opcod, string opc, PatFrag opnode> {
   // 5-bit imm
   def ri : T2sI<(outs GPR:$dst), (ins GPR:$lhs, i32imm:$rhs), IIC_iMOVsi,
                 opc, ".w\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, imm1_31:$rhs))]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-21} = 0b010010;
     let Inst{19-16} = 0b1111; // Rn
     let Inst{5-4} = opcod;
   }
   // register
   def rr : T2sI<(outs GPR:$dst), (ins GPR:$lhs, GPR:$rhs), IIC_iMOVsr,
                 opc, ".w\t$dst, $lhs, $rhs",
                 [(set GPR:$dst, (opnode GPR:$lhs, GPR:$rhs))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0100;
     let Inst{22-21} = opcod;
     let Inst{15-12} = 0b1111;
     let Inst{7-4} = 0b0000;
   }
}
+
/// T2I_cmp_irs - Defines a set of (op r, {so_imm|r|so_reg}) cmp / test
/// patterns. Similar to T2I_bin_irs except the instruction does not produce
/// an explicit result; it only implicitly sets CPSR (Rd is encoded as 0b1111
/// and S is always 1).
let Defs = [CPSR] in {
multiclass T2I_cmp_irs<bits<4> opcod, string opc, PatFrag opnode> {
   // shifted imm
   def ri : T2I<(outs), (ins GPR:$lhs, t2_so_imm:$rhs), IIC_iCMPi,
                opc, ".w\t$lhs, $rhs",
                [(opnode GPR:$lhs, t2_so_imm:$rhs)]> {
     let Inst{31-27} = 0b11110;
     let Inst{25} = 0;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{15} = 0;
     let Inst{11-8} = 0b1111; // Rd
   }
   // register
   def rr : T2I<(outs), (ins GPR:$lhs, GPR:$rhs), IIC_iCMPr,
                opc, ".w\t$lhs, $rhs",
                [(opnode GPR:$lhs, GPR:$rhs)]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{14-12} = 0b000; // imm3
     let Inst{11-8} = 0b1111; // Rd
     let Inst{7-6} = 0b00; // imm2
     let Inst{5-4} = 0b00; // type
   }
   // shifted register
   def rs : T2I<(outs), (ins GPR:$lhs, t2_so_reg:$rhs), IIC_iCMPsi,
                opc, ".w\t$lhs, $rhs",
                [(opnode GPR:$lhs, t2_so_reg:$rhs)]> {
     let Inst{31-27} = 0b11101;
     let Inst{26-25} = 0b01;
     let Inst{24-21} = opcod;
     let Inst{20} = 1; // The S bit.
     let Inst{11-8} = 0b1111; // Rd
   }
}
}
+
/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
/// 'signed' selects sign- vs zero-extending loads (bit 24); 'opcod' is the
/// size field (bits 22-21). Also emits a PC-relative 'pci' form that loads
/// from the constant pool and is rematerializable.
multiclass T2I_ld<bit signed, bits<2> opcod, string opc, PatFrag opnode> {
  def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), IIC_iLoadi,
                   opc, ".w\t$dst, $addr",
                   [(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]> {
    let Inst{31-27} = 0b11111;
    let Inst{26-25} = 0b00;
    let Inst{24} = signed;
    let Inst{23} = 1;
    let Inst{22-21} = opcod;
    let Inst{20} = 1; // load
  }
  def i8  : T2Ii8 <(outs GPR:$dst), (ins t2addrmode_imm8:$addr), IIC_iLoadi,
                   opc, "\t$dst, $addr",
                   [(set GPR:$dst, (opnode t2addrmode_imm8:$addr))]> {
    let Inst{31-27} = 0b11111;
    let Inst{26-25} = 0b00;
    let Inst{24} = signed;
    let Inst{23} = 0;
    let Inst{22-21} = opcod;
    let Inst{20} = 1; // load
    let Inst{11} = 1;
    // Offset: index==TRUE, wback==FALSE
    let Inst{10} = 1; // The P bit.
    let Inst{8} = 0; // The W bit.
  }
  def s   : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), IIC_iLoadr,
                   opc, ".w\t$dst, $addr",
                   [(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]> {
    let Inst{31-27} = 0b11111;
    let Inst{26-25} = 0b00;
    let Inst{24} = signed;
    let Inst{23} = 0;
    let Inst{22-21} = opcod;
    let Inst{20} = 1; // load
    let Inst{11-6} = 0b000000;
  }
  def pci : T2Ipc <(outs GPR:$dst), (ins i32imm:$addr), IIC_iLoadi,
                   opc, ".w\t$dst, $addr",
                   [(set GPR:$dst, (opnode (ARMWrapper tconstpool:$addr)))]> {
    let isReMaterializable = 1;
    let Inst{31-27} = 0b11111;
    let Inst{26-25} = 0b00;
    let Inst{24} = signed;
    let Inst{23} = ?; // add = (U == '1')
    let Inst{22-21} = opcod;
    let Inst{20} = 1; // load
    let Inst{19-16} = 0b1111; // Rn
  }
}
+
/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
/// Mirrors T2I_ld but with bit 20 cleared (store) and no PC-relative form.
multiclass T2I_st<bits<2> opcod, string opc, PatFrag opnode> {
  def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), IIC_iStorei,
                   opc, ".w\t$src, $addr",
                   [(opnode GPR:$src, t2addrmode_imm12:$addr)]> {
    let Inst{31-27} = 0b11111;
    let Inst{26-23} = 0b0001;
    let Inst{22-21} = opcod;
    let Inst{20} = 0; // !load
  }
  def i8  : T2Ii8 <(outs), (ins GPR:$src, t2addrmode_imm8:$addr), IIC_iStorei,
                   opc, "\t$src, $addr",
                   [(opnode GPR:$src, t2addrmode_imm8:$addr)]> {
    let Inst{31-27} = 0b11111;
    let Inst{26-23} = 0b0000;
    let Inst{22-21} = opcod;
    let Inst{20} = 0; // !load
    let Inst{11} = 1;
    // Offset: index==TRUE, wback==FALSE
    let Inst{10} = 1; // The P bit.
    let Inst{8} = 0; // The W bit.
  }
  def s   : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), IIC_iStorer,
                   opc, ".w\t$src, $addr",
                   [(opnode GPR:$src, t2addrmode_so_reg:$addr)]> {
    let Inst{31-27} = 0b11111;
    let Inst{26-23} = 0b0000;
    let Inst{22-21} = opcod;
    let Inst{20} = 0; // !load
    let Inst{11-6} = 0b000000;
  }
}
+
/// T2I_picld - Defines the PIC load pattern (a label is emitted before the
/// instruction via the "${addr:label}:" asm-string prefix).
class T2I_picld<string opc, PatFrag opnode> :
      T2I<(outs GPR:$dst), (ins addrmodepc:$addr), IIC_iLoadi,
          !strconcat("\n${addr:label}:\n\t", opc), "\t$dst, $addr",
          [(set GPR:$dst, (opnode addrmodepc:$addr))]>;

/// T2I_picst - Defines the PIC store pattern (same labeled-asm scheme as
/// T2I_picld).
class T2I_picst<string opc, PatFrag opnode> :
      T2I<(outs), (ins GPR:$src, addrmodepc:$addr), IIC_iStorer,
          !strconcat("\n${addr:label}:\n\t", opc), "\t$src, $addr",
          [(opnode GPR:$src, addrmodepc:$addr)]>;
+
+
/// T2I_unary_rrot - A unary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
/// In the rotated form bits 5-4 carry the rotate amount and are left unset
/// ('?') so they come from the $rot operand at encoding time.
multiclass T2I_unary_rrot<bits<3> opcod, string opc, PatFrag opnode> {
  def r     : T2I<(outs GPR:$dst), (ins GPR:$src), IIC_iUNAr,
                  opc, ".w\t$dst, $src",
                 [(set GPR:$dst, (opnode GPR:$src))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0100;
     let Inst{22-20} = opcod;
     let Inst{19-16} = 0b1111; // Rn
     let Inst{15-12} = 0b1111;
     let Inst{7} = 1;
     let Inst{5-4} = 0b00; // rotate
   }
  def r_rot : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$rot), IIC_iUNAsi,
                  opc, ".w\t$dst, $src, ror $rot",
                 [(set GPR:$dst, (opnode (rotr GPR:$src, rot_imm:$rot)))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0100;
     let Inst{22-20} = opcod;
     let Inst{19-16} = 0b1111; // Rn
     let Inst{15-12} = 0b1111;
     let Inst{7} = 1;
     let Inst{5-4} = {?,?}; // rotate
   }
}
+
/// T2I_bin_rrot - A binary operation with two forms: one whose operand is a
/// register and one whose operand is a register rotated by 8/16/24.
/// Same encoding layout as T2I_unary_rrot, but Rn is the $LHS register
/// instead of the fixed 0b1111.
multiclass T2I_bin_rrot<bits<3> opcod, string opc, PatFrag opnode> {
  def rr     : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS), IIC_iALUr,
                  opc, "\t$dst, $LHS, $RHS",
                  [(set GPR:$dst, (opnode GPR:$LHS, GPR:$RHS))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0100;
     let Inst{22-20} = opcod;
     let Inst{15-12} = 0b1111;
     let Inst{7} = 1;
     let Inst{5-4} = 0b00; // rotate
   }
  def rr_rot : T2I<(outs GPR:$dst), (ins GPR:$LHS, GPR:$RHS, i32imm:$rot),
                  IIC_iALUsr, opc, "\t$dst, $LHS, $RHS, ror $rot",
                  [(set GPR:$dst, (opnode GPR:$LHS,
                                          (rotr GPR:$RHS, rot_imm:$rot)))]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0100;
     let Inst{22-20} = opcod;
     let Inst{15-12} = 0b1111;
     let Inst{7} = 1;
     let Inst{5-4} = {?,?}; // rotate
   }
}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
// LEApcrel - Load a pc-relative address into a register without offending the
// assembler. Prints as "adr". The JT variant appends the jump-table id to the
// label ("${label}_${id}"); both have no patterns (selected manually).
def t2LEApcrel : T2XI<(outs GPR:$dst), (ins i32imm:$label, pred:$p), IIC_iALUi,
                      "adr$p.w\t$dst, #$label", []> {
  let Inst{31-27} = 0b11110;
  let Inst{25-24} = 0b10;
  // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
  let Inst{22} = 0;
  let Inst{20} = 0;
  let Inst{19-16} = 0b1111; // Rn
  let Inst{15} = 0;
}
def t2LEApcrelJT : T2XI<(outs GPR:$dst),
                        (ins i32imm:$label, nohash_imm:$id, pred:$p), IIC_iALUi,
                        "adr$p.w\t$dst, #${label}_${id}", []> {
  let Inst{31-27} = 0b11110;
  let Inst{25-24} = 0b10;
  // Inst{23:21} = '11' (add = FALSE) or '00' (add = TRUE)
  let Inst{22} = 0;
  let Inst{20} = 0;
  let Inst{19-16} = 0b1111; // Rn
  let Inst{15} = 0;
}
+
// ADD r, sp, {so_imm|i12}
// SP-relative adds; Rn is hard-wired to 0b1101 (sp). No patterns — these are
// selected explicitly during frame lowering.
def t2ADDrSPi   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
                        IIC_iALUi, "add", ".w\t$dst, $sp, $imm", []> {
  let Inst{31-27} = 0b11110;
  let Inst{25} = 0;
  let Inst{24-21} = 0b1000;
  let Inst{20} = ?; // The S bit.
  let Inst{19-16} = 0b1101; // Rn = sp
  let Inst{15} = 0;
}
// "addw": 12-bit plain-immediate form.
def t2ADDrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm), 
                       IIC_iALUi, "addw", "\t$dst, $sp, $imm", []> {
  let Inst{31-27} = 0b11110;
  let Inst{25} = 1;
  let Inst{24-21} = 0b0000;
  let Inst{20} = 0; // The S bit.
  let Inst{19-16} = 0b1101; // Rn = sp
  let Inst{15} = 0;
}

// ADD r, sp, so_reg
def t2ADDrSPs   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
                        IIC_iALUsi, "add", ".w\t$dst, $sp, $rhs", []> {
  let Inst{31-27} = 0b11101;
  let Inst{26-25} = 0b01;
  let Inst{24-21} = 0b1000;
  let Inst{20} = ?; // The S bit.
  let Inst{19-16} = 0b1101; // Rn = sp
  let Inst{15} = 0;
}
+
// SUB r, sp, {so_imm|i12}
// SP-relative subtracts, mirroring the t2ADDrSP* definitions above.
def t2SUBrSPi   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
                        IIC_iALUi, "sub", ".w\t$dst, $sp, $imm", []> {
  let Inst{31-27} = 0b11110;
  let Inst{25} = 0;
  let Inst{24-21} = 0b1101;
  let Inst{20} = ?; // The S bit.
  let Inst{19-16} = 0b1101; // Rn = sp
  let Inst{15} = 0;
}
// "subw": 12-bit plain-immediate form.
def t2SUBrSPi12 : T2I<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
                       IIC_iALUi, "subw", "\t$dst, $sp, $imm", []> {
  let Inst{31-27} = 0b11110;
  let Inst{25} = 1;
  let Inst{24-21} = 0b0101;
  let Inst{20} = 0; // The S bit.
  let Inst{19-16} = 0b1101; // Rn = sp
  let Inst{15} = 0;
}

// SUB r, sp, so_reg
def t2SUBrSPs   : T2sI<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
                       IIC_iALUsi,
                       "sub", "\t$dst, $sp, $rhs", []> {
  let Inst{31-27} = 0b11101;
  let Inst{26-25} = 0b01;
  let Inst{24-21} = 0b1101;
  let Inst{20} = ?; // The S bit.
  let Inst{19-16} = 0b1101; // Rn = sp
  let Inst{15} = 0;
}
+
// Pseudo instruction that will expand into a t2SUBrSPi + a copy.
// These exist because the real instructions write a GPR while frame code
// needs the result in SP; the custom inserter splits them post-isel.
let usesCustomInserter = 1 in { // Expanded after instruction selection.
def t2SUBrSPi_   : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_imm:$imm),
                   NoItinerary, "@ sub.w\t$dst, $sp, $imm", []>;
def t2SUBrSPi12_ : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, imm0_4095:$imm),
                   NoItinerary, "@ subw\t$dst, $sp, $imm", []>;
def t2SUBrSPs_   : PseudoInst<(outs GPR:$dst), (ins GPR:$sp, t2_so_reg:$rhs),
                   NoItinerary, "@ sub\t$dst, $sp, $rhs", []>;
} // usesCustomInserter
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
// Load
// NOTE(review): mayHaveSideEffects = 1 on a plain load looks surprising given
// canFoldAsLoad/isReMaterializable are also set — confirm the intent (likely
// to constrain hoisting/speculation) before changing it.
let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1  in 
defm t2LDR   : T2I_ld<0, 0b10, "ldr",  UnOpFrag<(load node:$Src)>>;

// Loads with zero extension
defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", UnOpFrag<(zextloadi16 node:$Src)>>;
defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", UnOpFrag<(zextloadi8  node:$Src)>>;

// Loads with sign extension
defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", UnOpFrag<(sextloadi16 node:$Src)>>;
defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", UnOpFrag<(sextloadi8  node:$Src)>>;
+
// hasExtraDefRegAllocReq: both destination registers must be allocated even
// if one of them turns out to be dead.
let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
// Load doubleword
def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs GPR:$dst1, GPR:$dst2),
                        (ins t2addrmode_imm8s4:$addr),
                        IIC_iLoadi, "ldrd", "\t$dst1, $addr", []>;
def t2LDRDpci : T2Ii8s4<?, ?, 1, (outs GPR:$dst1, GPR:$dst2),
                        (ins i32imm:$addr), IIC_iLoadi,
                       "ldrd", "\t$dst1, $addr", []> {
  let Inst{19-16} = 0b1111; // Rn
}
}
+
// zextload i1 -> zextload i8
// (an i1 load reads a full byte; the ldrb forms already zero the upper bits)
def : T2Pat<(zextloadi1 t2addrmode_imm12:$addr),
            (t2LDRBi12  t2addrmode_imm12:$addr)>;
def : T2Pat<(zextloadi1 t2addrmode_imm8:$addr),
            (t2LDRBi8   t2addrmode_imm8:$addr)>;
def : T2Pat<(zextloadi1 t2addrmode_so_reg:$addr),
            (t2LDRBs    t2addrmode_so_reg:$addr)>;
def : T2Pat<(zextloadi1 (ARMWrapper tconstpool:$addr)),
            (t2LDRBpci  tconstpool:$addr)>;

// extload -> zextload
// FIXME: Reduce the number of patterns by legalizing extload to zextload
// earlier?
def : T2Pat<(extloadi1  t2addrmode_imm12:$addr),
            (t2LDRBi12  t2addrmode_imm12:$addr)>;
def : T2Pat<(extloadi1  t2addrmode_imm8:$addr),
            (t2LDRBi8   t2addrmode_imm8:$addr)>;
def : T2Pat<(extloadi1  t2addrmode_so_reg:$addr),
            (t2LDRBs    t2addrmode_so_reg:$addr)>;
def : T2Pat<(extloadi1  (ARMWrapper tconstpool:$addr)),
            (t2LDRBpci  tconstpool:$addr)>;

def : T2Pat<(extloadi8  t2addrmode_imm12:$addr),
            (t2LDRBi12  t2addrmode_imm12:$addr)>;
def : T2Pat<(extloadi8  t2addrmode_imm8:$addr),
            (t2LDRBi8   t2addrmode_imm8:$addr)>;
def : T2Pat<(extloadi8  t2addrmode_so_reg:$addr),
            (t2LDRBs    t2addrmode_so_reg:$addr)>;
def : T2Pat<(extloadi8  (ARMWrapper tconstpool:$addr)),
            (t2LDRBpci  tconstpool:$addr)>;

def : T2Pat<(extloadi16 t2addrmode_imm12:$addr),
            (t2LDRHi12  t2addrmode_imm12:$addr)>;
def : T2Pat<(extloadi16 t2addrmode_imm8:$addr),
            (t2LDRHi8   t2addrmode_imm8:$addr)>;
def : T2Pat<(extloadi16 t2addrmode_so_reg:$addr),
            (t2LDRHs    t2addrmode_so_reg:$addr)>;
def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
            (t2LDRHpci  tconstpool:$addr)>;
+
// Indexed loads
// Pre-indexed forms take the full addressing-mode operand and tie
// $addr.base to the writeback result; post-indexed forms take base and
// offset separately and tie $base to the writeback result.
let mayLoad = 1 in {
def t2LDR_PRE  : T2Iidxldst<0, 0b10, 1, 1, (outs GPR:$dst, GPR:$base_wb),
                            (ins t2addrmode_imm8:$addr),
                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
                            "ldr", "\t$dst, $addr!", "$addr.base = $base_wb",
                            []>;

def t2LDR_POST : T2Iidxldst<0, 0b10, 1, 0, (outs GPR:$dst, GPR:$base_wb),
                            (ins GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
                          "ldr", "\t$dst, [$base], $offset", "$base = $base_wb",
                            []>;

def t2LDRB_PRE : T2Iidxldst<0, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb),
                            (ins t2addrmode_imm8:$addr),
                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
                            "ldrb", "\t$dst, $addr!", "$addr.base = $base_wb",
                            []>;
def t2LDRB_POST : T2Iidxldst<0, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb),
                            (ins GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
                         "ldrb", "\t$dst, [$base], $offset", "$base = $base_wb",
                            []>;

def t2LDRH_PRE : T2Iidxldst<0, 0b01, 1, 1, (outs GPR:$dst, GPR:$base_wb),
                            (ins t2addrmode_imm8:$addr),
                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
                            "ldrh", "\t$dst, $addr!", "$addr.base = $base_wb",
                            []>;
def t2LDRH_POST : T2Iidxldst<0, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb),
                            (ins GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
                         "ldrh", "\t$dst, [$base], $offset", "$base = $base_wb",
                            []>;

def t2LDRSB_PRE : T2Iidxldst<1, 0b00, 1, 1, (outs GPR:$dst, GPR:$base_wb),
                            (ins t2addrmode_imm8:$addr),
                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
                            "ldrsb", "\t$dst, $addr!", "$addr.base = $base_wb",
                            []>;
def t2LDRSB_POST : T2Iidxldst<1, 0b00, 1, 0, (outs GPR:$dst, GPR:$base_wb),
                            (ins GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
                        "ldrsb", "\t$dst, [$base], $offset", "$base = $base_wb",
                            []>;

def t2LDRSH_PRE : T2Iidxldst<1, 0b01, 1, 1, (outs GPR:$dst, GPR:$base_wb),
                            (ins t2addrmode_imm8:$addr),
                            AddrModeT2_i8, IndexModePre, IIC_iLoadiu,
                            "ldrsh", "\t$dst, $addr!", "$addr.base = $base_wb",
                            []>;
def t2LDRSH_POST : T2Iidxldst<1, 0b01, 1, 0, (outs GPR:$dst, GPR:$base_wb),
                            (ins GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iLoadiu,
                        "ldrsh", "\t$dst, [$base], $offset", "$base = $base_wb",
                            []>;
}
+
// Store (word, byte, halfword) via the T2I_st multiclass above.
defm t2STR   : T2I_st<0b10, "str",  BinOpFrag<(store node:$LHS, node:$RHS)>>;
defm t2STRB  : T2I_st<0b00, "strb", BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
defm t2STRH  : T2I_st<0b01, "strh", BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
+
// Store doubleword. Marked mayStore (this is a store — the previous
// "mayLoad = 1" was a copy-paste error from the t2LDRD block above) and
// hasExtraSrcRegAllocReq so both source registers are allocated even if
// one is otherwise dead, mirroring hasExtraDefRegAllocReq on t2LDRDi8.
let mayStore = 1, hasExtraSrcRegAllocReq = 1 in
def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
                       (ins GPR:$src1, GPR:$src2, t2addrmode_imm8s4:$addr),
               IIC_iStorer, "strd", "\t$src1, $addr", []>;
+
// Indexed stores
// Pre-indexed forms match pre_store / pre_truncsti* (writeback before use);
// post-indexed forms match post_store / post_truncsti*. $base_wb carries the
// updated base register and is tied to $base.
def t2STR_PRE  : T2Iidxldst<0, 0b10, 0, 1, (outs GPR:$base_wb),
                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
                         "str", "\t$src, [$base, $offset]!", "$base = $base_wb",
             [(set GPR:$base_wb,
                   (pre_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;

def t2STR_POST : T2Iidxldst<0, 0b10, 0, 0, (outs GPR:$base_wb),
                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
                          "str", "\t$src, [$base], $offset", "$base = $base_wb",
             [(set GPR:$base_wb,
                  (post_store GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;

def t2STRH_PRE  : T2Iidxldst<0, 0b01, 0, 1, (outs GPR:$base_wb),
                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
                        "strh", "\t$src, [$base, $offset]!", "$base = $base_wb",
        [(set GPR:$base_wb,
              (pre_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;

def t2STRH_POST : T2Iidxldst<0, 0b01, 0, 0, (outs GPR:$base_wb),
                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
                         "strh", "\t$src, [$base], $offset", "$base = $base_wb",
       [(set GPR:$base_wb,
             (post_truncsti16 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;

def t2STRB_PRE  : T2Iidxldst<0, 0b00, 0, 1, (outs GPR:$base_wb),
                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePre, IIC_iStoreiu,
                        "strb", "\t$src, [$base, $offset]!", "$base = $base_wb",
         [(set GPR:$base_wb,
               (pre_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;

def t2STRB_POST : T2Iidxldst<0, 0b00, 0, 0, (outs GPR:$base_wb),
                            (ins GPR:$src, GPR:$base, t2am_imm8_offset:$offset),
                            AddrModeT2_i8, IndexModePost, IIC_iStoreiu,
                         "strb", "\t$src, [$base], $offset", "$base = $base_wb",
        [(set GPR:$base_wb,
              (post_truncsti8 GPR:$src, GPR:$base, t2am_imm8_offset:$offset))]>;
+
+
+// FIXME: ldrd / strd pre / post variants
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in // reglist regs are extra defs
+def t2LDM : T2XI<(outs),
+                 (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+              IIC_iLoadm, "ldm${addr:submode}${p}${addr:wide}\t$addr, $wb", []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b00;
+  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
+  let Inst{22} = 0;
+  let Inst{21} = ?; // The W bit (base writeback).
+  let Inst{20} = 1; // Load
+}
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in // reglist regs are extra sources
+def t2STM : T2XI<(outs),
+                 (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+             IIC_iStorem, "stm${addr:submode}${p}${addr:wide}\t$addr, $wb", []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b00;
+  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
+  let Inst{22} = 0;
+  let Inst{21} = ?; // The W bit (base writeback).
+  let Inst{20} = 0; // Store
+}
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions.
+//
+
+let neverHasSideEffects = 1 in
+def t2MOVr : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVr, // register-to-register move
+                   "mov", ".w\t$dst, $src", []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = ?; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0000;
+}
+
+// AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, AddedComplexity = 1 in
+def t2MOVi : T2sI<(outs GPR:$dst), (ins t2_so_imm:$src), IIC_iMOVi, // modified-immediate move
+                   "mov", ".w\t$dst, $src",
+                   [(set GPR:$dst, t2_so_imm:$src)]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 0;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = ?; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def t2MOVi16 : T2I<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi, // 16-bit immediate move
+                   "movw", "\t$dst, $src",
+                   [(set GPR:$dst, imm0_65535:$src)]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+}
+
+let Constraints = "$src = $dst" in // movt only writes the top half; bottom half flows through
+def t2MOVTi16 : T2I<(outs GPR:$dst), (ins GPR:$src, i32imm:$imm), IIC_iMOVi,
+                    "movt", "\t$dst, $imm",
+                    [(set GPR:$dst,
+                          (or (and GPR:$src, 0xffff), lo16AllZero:$imm))]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-21} = 0b0110;
+  let Inst{20} = 0; // The S bit.
+  let Inst{15} = 0;
+}
+
+def : T2Pat<(or GPR:$src, 0xffff0000), (t2MOVTi16 GPR:$src, 0xffff)>; // or with hi-mask -> movt
+
+//===----------------------------------------------------------------------===//
+//  Extend Instructions.
+//
+
+// Sign extenders
+
+defm t2SXTB  : T2I_unary_rrot<0b100, "sxtb", // sign-extend byte
+                              UnOpFrag<(sext_inreg node:$Src, i8)>>;
+defm t2SXTH  : T2I_unary_rrot<0b000, "sxth", // sign-extend halfword
+                              UnOpFrag<(sext_inreg node:$Src, i16)>>;
+
+defm t2SXTAB : T2I_bin_rrot<0b100, "sxtab", // add + sign-extended byte
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
+defm t2SXTAH : T2I_bin_rrot<0b000, "sxtah", // add + sign-extended halfword
+                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
+
+// TODO: SXT(A){B|H}16
+
+// Zero extenders: matched as AND with the corresponding mask.
+
+let AddedComplexity = 16 in {
+defm t2UXTB   : T2I_unary_rrot<0b101, "uxtb",
+                               UnOpFrag<(and node:$Src, 0x000000FF)>>;
+defm t2UXTH   : T2I_unary_rrot<0b001, "uxth",
+                               UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
+defm t2UXTB16 : T2I_unary_rrot<0b011, "uxtb16",
+                               UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+
+def : T2Pat<(and (shl GPR:$Src, (i32 8)), 0xFF00FF),
+            (t2UXTB16r_rot GPR:$Src, 24)>;
+def : T2Pat<(and (srl GPR:$Src, (i32 8)), 0xFF00FF),
+            (t2UXTB16r_rot GPR:$Src, 8)>;
+
+defm t2UXTAB : T2I_bin_rrot<0b101, "uxtab",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
+defm t2UXTAH : T2I_bin_rrot<0b001, "uxtah",
+                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions.
+//
+
+defm t2ADD  : T2I_bin_ii12rs<0b000, "add", // trailing 1 = commutable
+                             BinOpFrag<(add  node:$LHS, node:$RHS)>, 1>;
+defm t2SUB  : T2I_bin_ii12rs<0b101, "sub",
+                             BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+
+// ADD and SUB with 's' bit set. No 12-bit immediate (T4) variants.
+defm t2ADDS : T2I_bin_s_irs <0b1000, "add", // carry-producing add
+                             BinOpFrag<(addc node:$LHS, node:$RHS)>, 1>;
+defm t2SUBS : T2I_bin_s_irs <0b1101, "sub", // carry-producing sub
+                             BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+
+defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc", // add with carry-in
+                                BinOpFrag<(adde node:$LHS, node:$RHS)>, 1>;
+defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc", // subtract with borrow-in
+                                BinOpFrag<(sube node:$LHS, node:$RHS)>>;
+
+// RSB: reverse subtract (operands swapped by the _rbin multiclasses).
+defm t2RSB  : T2I_rbin_is   <0b1110, "rsb",
+                             BinOpFrag<(sub  node:$LHS, node:$RHS)>>;
+defm t2RSBS : T2I_rbin_s_is <0b1110, "rsb",
+                             BinOpFrag<(subc node:$LHS, node:$RHS)>>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+let AddedComplexity = 1 in
+def : T2Pat<(add       GPR:$src, imm0_255_neg:$imm),
+            (t2SUBri   GPR:$src, imm0_255_neg:$imm)>;
+def : T2Pat<(add       GPR:$src, t2_so_imm_neg:$imm),
+            (t2SUBri   GPR:$src, t2_so_imm_neg:$imm)>;
+def : T2Pat<(add       GPR:$src, imm0_4095_neg:$imm),
+            (t2SUBri12 GPR:$src, imm0_4095_neg:$imm)>;
+
+
+//===----------------------------------------------------------------------===//
+//  Shift and rotate Instructions.
+//
+
+defm t2LSL  : T2I_sh_ir<0b00, "lsl", BinOpFrag<(shl  node:$LHS, node:$RHS)>>; // logical shift left
+defm t2LSR  : T2I_sh_ir<0b01, "lsr", BinOpFrag<(srl  node:$LHS, node:$RHS)>>; // logical shift right
+defm t2ASR  : T2I_sh_ir<0b10, "asr", BinOpFrag<(sra  node:$LHS, node:$RHS)>>; // arithmetic shift right
+defm t2ROR  : T2I_sh_ir<0b11, "ror", BinOpFrag<(rotr node:$LHS, node:$RHS)>>; // rotate right
+
+let Uses = [CPSR] in { // rrx shifts the carry flag into bit 31
+def t2MOVrx : T2sI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                   "rrx", "\t$dst, $src",
+                   [(set GPR:$dst, (ARMrrx GPR:$src))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = ?; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0011;
+}
+}
+
+let Defs = [CPSR] in { // flag-setting single-bit shifts
+def t2MOVsrl_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                         "lsrs.w\t$dst, $src, #1",
+                         [(set GPR:$dst, (ARMsrl_flag GPR:$src))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b01; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+def t2MOVsra_flag : T2XI<(outs GPR:$dst), (ins GPR:$src), IIC_iMOVsi,
+                         "asrs.w\t$dst, $src, #1",
+                         [(set GPR:$dst, (ARMsra_flag GPR:$src))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 1; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = 0b10; // Shift type.
+  // Shift amount = Inst{14-12:7-6} = 1.
+  let Inst{14-12} = 0b000;
+  let Inst{7-6} = 0b01;
+}
+}
+
+//===----------------------------------------------------------------------===//
+//  Bitwise Instructions.
+//
+
+defm t2AND  : T2I_bin_w_irs<0b0000, "and", // trailing 1 = commutable
+                            BinOpFrag<(and node:$LHS, node:$RHS)>, 1>;
+defm t2ORR  : T2I_bin_w_irs<0b0010, "orr",
+                            BinOpFrag<(or  node:$LHS, node:$RHS)>, 1>;
+defm t2EOR  : T2I_bin_w_irs<0b0100, "eor",
+                            BinOpFrag<(xor node:$LHS, node:$RHS)>, 1>;
+
+defm t2BIC  : T2I_bin_w_irs<0b0001, "bic", // bit clear: and-not
+                            BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+
+let Constraints = "$src = $dst" in // bfc modifies $src in place
+def t2BFC : T2I<(outs GPR:$dst), (ins GPR:$src, bf_inv_mask_imm:$imm),
+                IIC_iUNAsi, "bfc", "\t$dst, $imm",
+                [(set GPR:$dst, (and GPR:$src, bf_inv_mask_imm:$imm))]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10110;
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+// Signed bitfield extract; no isel pattern, selected elsewhere.
+def t2SBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                 IIC_iALUi, "sbfx", "\t$dst, $src, $lsb, $width", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10100;
+  let Inst{15} = 0;
+}
+
+// Unsigned bitfield extract; no isel pattern, selected elsewhere.
+def t2UBFX : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                 IIC_iALUi, "ubfx", "\t$dst, $src, $lsb, $width", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b11100;
+  let Inst{15} = 0;
+}
+
+// A8.6.18  BFI - Bitfield insert (Encoding T1)
+// Added for disassembler with the pattern field purposely left blank.
+// FIXME: Utilize this instruction in codegen.
+def t2BFI : T2I<(outs GPR:$dst), (ins GPR:$src, imm0_31:$lsb, imm0_31:$width),
+                IIC_iALUi, "bfi", "\t$dst, $src, $lsb, $width", []> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 1;
+  let Inst{24-20} = 0b10110;
+  let Inst{15} = 0;
+}
+
+defm t2ORN  : T2I_bin_irs<0b0011, "orn", BinOpFrag<(or  node:$LHS, // or with complemented operand
+                          (not node:$RHS))>>;
+
+// Prefer this over t2EORri ra, rb, -1 because mvn has a 16-bit version
+let AddedComplexity = 1 in
+defm t2MVN  : T2I_un_irs <0b0011, "mvn", UnOpFrag<(not node:$Src)>, 1, 1>; // bitwise not
+
+
+def : T2Pat<(and     GPR:$src, t2_so_imm_not:$imm), // and with ~imm -> bic imm
+            (t2BICri GPR:$src, t2_so_imm_not:$imm)>;
+
+// FIXME: Disable this pattern on Darwin to workaround an assembler bug.
+def : T2Pat<(or      GPR:$src, t2_so_imm_not:$imm), // or with ~imm -> orn imm
+            (t2ORNri GPR:$src, t2_so_imm_not:$imm)>,
+            Requires<[IsThumb2]>;
+
+def : T2Pat<(t2_so_imm_not:$src), // inverted encodable immediate -> mvn
+            (t2MVNi t2_so_imm_not:$src)>;
+
+//===----------------------------------------------------------------------===//
+//  Multiply Instructions.
+//
+let isCommutable = 1 in
+def t2MUL: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32, // 32x32 -> 32 multiply
+                "mul", "\t$dst, $a, $b",
+                [(set GPR:$dst, (mul GPR:$a, GPR:$b))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0000; // Multiply
+}
+
+def t2MLA: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, // multiply-accumulate
+		"mla", "\t$dst, $a, $b, $c",
+		[(set GPR:$dst, (add (mul GPR:$a, GPR:$b), GPR:$c))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{15-12} = {?, ?, ?, ?}; // Ra
+  let Inst{7-4} = 0b0000; // Multiply
+}
+
+def t2MLS: T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32, // multiply-subtract
+		"mls", "\t$dst, $a, $b, $c",
+                [(set GPR:$dst, (sub GPR:$c, (mul GPR:$a, GPR:$b)))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b000;
+  let Inst{15-12} = {?, ?, ?, ?}; // Ra
+  let Inst{7-4} = 0b0001; // Multiply and Subtract
+}
+
+// Extra precision multiplies with low / high results
+let neverHasSideEffects = 1 in {
+let isCommutable = 1 in {
+def t2SMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+                   "smull", "\t$ldst, $hdst, $a, $b", []> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0111;
+  let Inst{22-20} = 0b000;
+  let Inst{7-4} = 0b0000;
+}
+
+def t2UMULL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMUL64,
+                   "umull", "\t$ldst, $hdst, $a, $b", []> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0111;
+  let Inst{22-20} = 0b010;
+  let Inst{7-4} = 0b0000;
+}
+} // isCommutable
+
+// Multiply + accumulate
+def t2SMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+                  "smlal", "\t$ldst, $hdst, $a, $b", []>{
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0111;
+  let Inst{22-20} = 0b100;
+  let Inst{7-4} = 0b0000;
+}
+
+def t2UMLAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+                  "umlal", "\t$ldst, $hdst, $a, $b", []>{
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0111;
+  let Inst{22-20} = 0b110;
+  let Inst{7-4} = 0b0000;
+}
+
+def t2UMAAL : T2I<(outs GPR:$ldst, GPR:$hdst), (ins GPR:$a, GPR:$b), IIC_iMAC64,
+                  "umaal", "\t$ldst, $hdst, $a, $b", []>{
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0111;
+  let Inst{22-20} = 0b110;
+  let Inst{7-4} = 0b0110;
+}
+} // neverHasSideEffects
+
+// Most significant word multiply
+def t2SMMUL : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+                  "smmul", "\t$dst, $a, $b",
+                  [(set GPR:$dst, (mulhs GPR:$a, GPR:$b))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+def t2SMMLA : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+                  "smmla", "\t$dst, $a, $b, $c",
+                  [(set GPR:$dst, (add (mulhs GPR:$a, GPR:$b), GPR:$c))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b101;
+  let Inst{15-12} = {?, ?, ?, ?}; // Ra
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+
+def t2SMMLS : T2I <(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$c), IIC_iMAC32,
+                   "smmls", "\t$dst, $a, $b, $c",
+                   [(set GPR:$dst, (sub GPR:$c, (mulhs GPR:$a, GPR:$b)))]> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-23} = 0b0110;
+  let Inst{22-20} = 0b110;
+  let Inst{15-12} = {?, ?, ?, ?}; // Ra
+  let Inst{7-4} = 0b0000; // No Rounding (Inst{4} = 0)
+}
+
+multiclass T2I_smul<string opc, PatFrag opnode> { // signed halfword multiplies: B/T select low/high half of each operand
+  def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+              !strconcat(opc, "bb"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+                                      (sext_inreg GPR:$b, i16)))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+              !strconcat(opc, "bt"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sext_inreg GPR:$a, i16),
+                                      (sra GPR:$b, (i32 16))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+
+  def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+              !strconcat(opc, "tb"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+                                      (sext_inreg GPR:$b, i16)))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b10;
+  }
+
+  def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL32,
+              !strconcat(opc, "tt"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (opnode (sra GPR:$a, (i32 16)),
+                                      (sra GPR:$b, (i32 16))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b11;
+  }
+
+  def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16, // word x halfword, top 32 bits of product
+              !strconcat(opc, "wb"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (sra (opnode GPR:$a,
+                                    (sext_inreg GPR:$b, i16)), (i32 16)))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b), IIC_iMUL16,
+              !strconcat(opc, "wt"), "\t$dst, $a, $b",
+              [(set GPR:$dst, (sra (opnode GPR:$a,
+                                    (sra GPR:$b, (i32 16))), (i32 16)))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = 0b1111; // Ra = 0b1111 (no accumulate)
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+}
+
+
+multiclass T2I_smla<string opc, PatFrag opnode> { // accumulating variants of T2I_smul, $acc added to each product
+  def BB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "bb"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc,
+                               (opnode (sext_inreg GPR:$a, i16),
+                                       (sext_inreg GPR:$b, i16))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = {?, ?, ?, ?}; // Ra
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def BT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+             !strconcat(opc, "bt"), "\t$dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc, (opnode (sext_inreg GPR:$a, i16),
+                                                    (sra GPR:$b, (i32 16)))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = {?, ?, ?, ?}; // Ra
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+
+  def TB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "tb"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+                                                 (sext_inreg GPR:$b, i16))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = {?, ?, ?, ?}; // Ra
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b10;
+  }
+
+  def TT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "tt"), "\t$dst, $a, $b, $acc",
+             [(set GPR:$dst, (add GPR:$acc, (opnode (sra GPR:$a, (i32 16)),
+                                                    (sra GPR:$b, (i32 16)))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b001;
+    let Inst{15-12} = {?, ?, ?, ?}; // Ra
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b11;
+  }
+
+  def WB : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "wb"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                       (sext_inreg GPR:$b, i16)), (i32 16))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = {?, ?, ?, ?}; // Ra
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b00;
+  }
+
+  def WT : T2I<(outs GPR:$dst), (ins GPR:$a, GPR:$b, GPR:$acc), IIC_iMAC16,
+              !strconcat(opc, "wt"), "\t$dst, $a, $b, $acc",
+              [(set GPR:$dst, (add GPR:$acc, (sra (opnode GPR:$a,
+                                         (sra GPR:$b, (i32 16))), (i32 16))))]> {
+    let Inst{31-27} = 0b11111;
+    let Inst{26-23} = 0b0110;
+    let Inst{22-20} = 0b011;
+    let Inst{15-12} = {?, ?, ?, ?}; // Ra
+    let Inst{7-6} = 0b00;
+    let Inst{5-4} = 0b01;
+  }
+}
+
+defm t2SMUL : T2I_smul<"smul", BinOpFrag<(mul node:$LHS, node:$RHS)>>; // smulbb/bt/tb/tt/wb/wt
+defm t2SMLA : T2I_smla<"smla", BinOpFrag<(mul node:$LHS, node:$RHS)>>; // smlabb/bt/tb/tt/wb/wt
+
+// TODO: Halfword multiple accumulate long: SMLAL<x><y>
+// TODO: Dual halfword multiple: SMUAD, SMUSD, SMLAD, SMLSD, SMLALD, SMLSLD
+
+
+//===----------------------------------------------------------------------===//
+//  Misc. Arithmetic Instructions.
+//
+
+// Common encoding for clz/rbit/rev/rev16/revsh below.
+class T2I_misc<bits<2> op1, bits<2> op2, dag oops, dag iops, InstrItinClass itin,
+              string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11111;
+  let Inst{26-22} = 0b01010;
+  let Inst{21-20} = op1;
+  let Inst{15-12} = 0b1111;
+  let Inst{7-6} = 0b10;
+  let Inst{5-4} = op2;
+}
+
+def t2CLZ : T2I_misc<0b11, 0b00, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, // count leading zeros
+                    "clz", "\t$dst, $src", [(set GPR:$dst, (ctlz GPR:$src))]>;
+
+def t2RBIT : T2I_misc<0b01, 0b10, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, // reverse bit order
+                      "rbit", "\t$dst, $src",
+                      [(set GPR:$dst, (ARMrbit GPR:$src))]>;
+
+def t2REV : T2I_misc<0b01, 0b00, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, // byte-reverse word
+                   "rev", ".w\t$dst, $src", [(set GPR:$dst, (bswap GPR:$src))]>;
+
+def t2REV16 : T2I_misc<0b01, 0b01, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, // byte-reverse each halfword
+                       "rev16", ".w\t$dst, $src",
+                [(set GPR:$dst,
+                    (or (and (srl GPR:$src, (i32 8)), 0xFF),
+                        (or (and (shl GPR:$src, (i32 8)), 0xFF00),
+                            (or (and (srl GPR:$src, (i32 8)), 0xFF0000),
+                                (and (shl GPR:$src, (i32 8)), 0xFF000000)))))]>;
+
+def t2REVSH : T2I_misc<0b01, 0b11, (outs GPR:$dst), (ins GPR:$src), IIC_iUNAr, // byte-reverse low halfword, sign-extend
+                       "revsh", ".w\t$dst, $src",
+                 [(set GPR:$dst,
+                    (sext_inreg
+                      (or (srl (and GPR:$src, 0xFF00), (i32 8)),
+                          (shl GPR:$src, (i32 8))), i16))]>;
+
+def t2PKHBT : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), // pack: bottom of src1, shifted top of src2
+                  IIC_iALUsi, "pkhbt", "\t$dst, $src1, $src2, LSL $shamt",
+                  [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF),
+                                      (and (shl GPR:$src2, (i32 imm:$shamt)),
+                                           0xFFFF0000)))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 0; // BT form
+  let Inst{4} = 0;
+}
+
+// Alternate cases for PKHBT where identities eliminate some nodes.
+def : T2Pat<(or (and GPR:$src1, 0xFFFF), (and GPR:$src2, 0xFFFF0000)),
+            (t2PKHBT GPR:$src1, GPR:$src2, 0)>;
+def : T2Pat<(or (and GPR:$src1, 0xFFFF), (shl GPR:$src2, imm16_31:$shamt)),
+            (t2PKHBT GPR:$src1, GPR:$src2, imm16_31:$shamt)>;
+
+def t2PKHTB : T2I<(outs GPR:$dst), (ins GPR:$src1, GPR:$src2, i32imm:$shamt), // pack: top of src1, shifted bottom of src2
+                  IIC_iALUsi, "pkhtb", "\t$dst, $src1, $src2, ASR $shamt",
+                  [(set GPR:$dst, (or (and GPR:$src1, 0xFFFF0000),
+                                      (and (sra GPR:$src2, imm16_31:$shamt),
+                                           0xFFFF)))]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-20} = 0b01100;
+  let Inst{5} = 1; // TB form
+  let Inst{4} = 0;
+}
+
+// Alternate cases for PKHTB where identities eliminate some nodes.  Note that
+// a shift amount of 0 is *not legal* here, it is PKHBT instead.
+def : T2Pat<(or (and GPR:$src1, 0xFFFF0000), (srl GPR:$src2, (i32 16))),
+            (t2PKHTB GPR:$src1, GPR:$src2, 16)>;
+def : T2Pat<(or (and GPR:$src1, 0xFFFF0000),
+                     (and (srl GPR:$src2, imm1_15:$shamt), 0xFFFF)),
+            (t2PKHTB GPR:$src1, GPR:$src2, imm1_15:$shamt)>;
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+defm t2CMP  : T2I_cmp_irs<0b1101, "cmp", // general compare
+                          BinOpFrag<(ARMcmp node:$LHS, node:$RHS)>>;
+defm t2CMPz : T2I_cmp_irs<0b1101, "cmp", // compare-against-zero flavor
+                          BinOpFrag<(ARMcmpZ node:$LHS, node:$RHS)>>;
+
+//FIXME: Disable CMN, as CCodes are backwards from compare expectations
+//       Compare-to-zero still works out, just not the relationals
+//defm t2CMN  : T2I_cmp_irs<0b1000, "cmn",
+//                          BinOpFrag<(ARMcmp node:$LHS,(ineg node:$RHS))>>;
+defm t2CMNz : T2I_cmp_irs<0b1000, "cmn",
+                          BinOpFrag<(ARMcmpZ node:$LHS,(ineg node:$RHS))>>;
+
+//def : T2Pat<(ARMcmp  GPR:$src, t2_so_imm_neg:$imm),
+//            (t2CMNri GPR:$src, t2_so_imm_neg:$imm)>;
+
+def : T2Pat<(ARMcmpZ  GPR:$src, t2_so_imm_neg:$imm), // compare with -imm -> cmn imm
+            (t2CMNzri GPR:$src, t2_so_imm_neg:$imm)>;
+
+defm t2TST  : T2I_cmp_irs<0b0000, "tst", // flag-setting and
+                          BinOpFrag<(ARMcmpZ (and node:$LHS, node:$RHS), 0)>>;
+defm t2TEQ  : T2I_cmp_irs<0b0100, "teq", // flag-setting xor
+                          BinOpFrag<(ARMcmpZ (xor node:$LHS, node:$RHS), 0)>>;
+
+// A8.6.27  CBNZ, CBZ - Compare and branch on (non)zero.
+// Short range conditional branch. Looks awesome for loops. Need to figure
+// out how to use this one.
+
+
+// Conditional moves: predicated movs tied to $false so the untaken case
+// leaves the destination holding the false value.
+// FIXME: should be able to write a pattern for ARMcmov, but can't use
+// a two-value operand where a dag node expects two operands. :(
+def t2MOVCCr : T2I<(outs GPR:$dst), (ins GPR:$false, GPR:$true), IIC_iCMOVr,
+                   "mov", ".w\t$dst, $true",
+      [/*(set GPR:$dst, (ARMcmov GPR:$false, GPR:$true, imm:$cc, CCR:$ccr))*/]>,
+                RegConstraint<"$false = $dst"> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{14-12} = 0b000;
+  let Inst{7-4} = 0b0000;
+}
+
+def t2MOVCCi : T2I<(outs GPR:$dst), (ins GPR:$false, t2_so_imm:$true),
+                   IIC_iCMOVi, "mov", ".w\t$dst, $true",
+[/*(set GPR:$dst, (ARMcmov GPR:$false, t2_so_imm:$true, imm:$cc, CCR:$ccr))*/]>,
+                   RegConstraint<"$false = $dst"> {
+  let Inst{31-27} = 0b11110;
+  let Inst{25} = 0;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{15} = 0;
+}
+
+// Common encoding for the conditional shifted-move defs below.
+class T2I_movcc_sh<bits<2> opcod, dag oops, dag iops, InstrItinClass itin,
+                   string opc, string asm, list<dag> pattern>
+  : T2I<oops, iops, itin, opc, asm, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b01;
+  let Inst{24-21} = 0b0010;
+  let Inst{20} = 0; // The S bit.
+  let Inst{19-16} = 0b1111; // Rn
+  let Inst{5-4} = opcod; // Shift type.
+}
+def t2MOVCClsl : T2I_movcc_sh<0b00, (outs GPR:$dst),
+                             (ins GPR:$false, GPR:$true, i32imm:$rhs),
+                             IIC_iCMOVsi, "lsl", ".w\t$dst, $true, $rhs", []>,
+                 RegConstraint<"$false = $dst">;
+def t2MOVCClsr : T2I_movcc_sh<0b01, (outs GPR:$dst),
+                             (ins GPR:$false, GPR:$true, i32imm:$rhs),
+                             IIC_iCMOVsi, "lsr", ".w\t$dst, $true, $rhs", []>,
+                 RegConstraint<"$false = $dst">;
+def t2MOVCCasr : T2I_movcc_sh<0b10, (outs GPR:$dst),
+                             (ins GPR:$false, GPR:$true, i32imm:$rhs),
+                             IIC_iCMOVsi, "asr", ".w\t$dst, $true, $rhs", []>,
+                 RegConstraint<"$false = $dst">;
+def t2MOVCCror : T2I_movcc_sh<0b11, (outs GPR:$dst),
+                             (ins GPR:$false, GPR:$true, i32imm:$rhs),
+                             IIC_iCMOVsi, "ror", ".w\t$dst, $true, $rhs", []>,
+                 RegConstraint<"$false = $dst">;
+
+//===----------------------------------------------------------------------===//
+// Atomic operations intrinsics
+//
+
+// memory barriers protect the atomic sequences
+let hasSideEffects = 1 in {
+def t2Int_MemBarrierV7 : AInoP<(outs), (ins), // data memory barrier
+                        Pseudo, NoItinerary,
+                        "dmb", "",
+                        [(ARMMemBarrierV7)]>,
+                        Requires<[IsThumb2]> {
+  let Inst{31-4} = 0xF3BF8F5;
+  // FIXME: add support for options other than a full system DMB
+  let Inst{3-0} = 0b1111;
+}
+
+def t2Int_SyncBarrierV7 : AInoP<(outs), (ins), // data synchronization barrier
+                        Pseudo, NoItinerary,
+                        "dsb", "",
+                        [(ARMSyncBarrierV7)]>,
+                        Requires<[IsThumb2]> {
+  let Inst{31-4} = 0xF3BF8F4;
+  // FIXME: add support for options other than a full system DSB
+  let Inst{3-0} = 0b1111;
+}
+}
+
+// Common encoding for byte/halfword/doubleword exclusive loads.
+class T2I_ldrex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{11-8} = rt2;
+  let Inst{7-6} = 0b01;
+  let Inst{5-4} = opcod;
+  let Inst{3-0} = 0b1111;
+}
+// Common encoding for byte/halfword/doubleword exclusive stores.
+class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, SizeFlagVal sz,
+                InstrItinClass itin, string opc, string asm, string cstr,
+                list<dag> pattern, bits<4> rt2 = 0b1111>
+  : Thumb2I<oops, iops, am, sz, itin, opc, asm, cstr, pattern> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001100;
+  let Inst{11-8} = rt2;
+  let Inst{7-6} = 0b01;
+  let Inst{5-4} = opcod;
+}
+
+let mayLoad = 1 in {
+def t2LDREXB : T2I_ldrex<0b00, (outs GPR:$dest), (ins GPR:$ptr), AddrModeNone,
+                         Size4Bytes, NoItinerary, "ldrexb", "\t$dest, [$ptr]",
+                         "", []>;
+def t2LDREXH : T2I_ldrex<0b01, (outs GPR:$dest), (ins GPR:$ptr), AddrModeNone,
+                         Size4Bytes, NoItinerary, "ldrexh", "\t$dest, [$ptr]",
+                         "", []>;
+def t2LDREX  : Thumb2I<(outs GPR:$dest), (ins GPR:$ptr), AddrModeNone,
+                       Size4Bytes, NoItinerary,
+                       "ldrex", "\t$dest, [$ptr]", "",
+                      []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000101;
+  let Inst{11-8} = 0b1111;
+  let Inst{7-0} = 0b00000000; // imm8 = 0
+}
+def t2LDREXD : T2I_ldrex<0b11, (outs GPR:$dest, GPR:$dest2), (ins GPR:$ptr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "ldrexd", "\t$dest, $dest2, [$ptr]", "",
+                         [], {?, ?, ?, ?}>; // rt2 left open for the second dest
+}
+
+let mayStore = 1, Constraints = "@earlyclobber $success" in {
+def t2STREXB : T2I_strex<0b00, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "strexb", "\t$success, $src, [$ptr]", "", []>;
+def t2STREXH : T2I_strex<0b01, (outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "strexh", "\t$success, $src, [$ptr]", "", []>;
+def t2STREX  : Thumb2I<(outs GPR:$success), (ins GPR:$src, GPR:$ptr),
+                       AddrModeNone, Size4Bytes, NoItinerary,
+                       "strex", "\t$success, $src, [$ptr]", "",
+                      []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0000100;
+  let Inst{7-0} = 0b00000000; // imm8 = 0
+}
+def t2STREXD : T2I_strex<0b11, (outs GPR:$success),
+                         (ins GPR:$src, GPR:$src2, GPR:$ptr),
+                         AddrModeNone, Size4Bytes, NoItinerary,
+                         "strexd", "\t$success, $src, $src2, [$ptr]", "", [],
+                         {?, ?, ?, ?}>;
+}
+
+//===----------------------------------------------------------------------===//
+// TLS Instructions
+//
+
+// __aeabi_read_tp preserves the registers r1-r3.
+// Software thread-pointer read: calls the __aeabi_read_tp helper, which
+// returns the thread pointer in R0.  Defs lists only the registers the
+// helper may clobber (r1-r3 are preserved, per the comment above).
+let isCall = 1,
+  Defs = [R0, R12, LR, CPSR] in {
+  def t2TPsoft : T2XI<(outs), (ins), IIC_Br,
+                     "bl\t__aeabi_read_tp",
+                     [(set R0, ARMthread_pointer)]> {
+    let Inst{31-27} = 0b11110;
+    let Inst{15-14} = 0b11;
+    let Inst{12} = 1;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// SJLJ Exception handling intrinsics
+//   eh_sjlj_setjmp() is an instruction sequence to store the return
+//   address and save #0 in R0 for the non-longjmp case.
+//   Since by its nature we may be coming from some other function to get
+//   here, and we're using the stack frame for the containing function to
+//   save/restore registers, we can't keep anything live in regs across
+//   the eh_sjlj_setjmp(), else it will almost certainly have been tromped upon
+//   when we get here from a longjmp(). We force everything out of registers
+//   except for our own input by listing the relevant registers in Defs. By
+//   doing so, we also cause the prologue/epilogue code to actively preserve
+//   all of the callee-saved registers, which is exactly what we want.
+//   The current SP is passed in $val, and we reuse the reg as a scratch.
+// SJLJ setjmp pseudo-instruction; expands to the store/branch sequence in
+// the asm string.  See the block comment above for why every allocatable
+// register is listed in Defs.
+let Defs =
+  [ R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,  R8,  R9,  R10, R11, R12, LR,  D0,
+    D1,  D2,  D3,  D4,  D5,  D6,  D7,  D8,  D9,  D10, D11, D12, D13, D14, D15,
+    D16, D17, D18, D19, D20, D21, D22, D23, D24, D25, D26, D27, D28, D29, D30,
+    D31 ] in {
+  def t2Int_eh_sjlj_setjmp : Thumb2XI<(outs), (ins GPR:$src, tGPR:$val),
+                               AddrModeNone, SizeSpecial, NoItinerary,
+                               "str\t$val, [$src, #8]\t@ begin eh.setjmp\n"
+                               "\tmov\t$val, pc\n"
+                               "\tadds\t$val, #9\n"
+                               "\tstr\t$val, [$src, #4]\n"
+                               "\tmovs\tr0, #0\n"
+                               "\tb\t1f\n"
+                               "\tmovs\tr0, #1\t@ end eh.setjmp\n"
+                               "1:", "",
+                          [(set R0, (ARMeh_sjlj_setjmp GPR:$src, tGPR:$val))]>;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Control-Flow Instructions
+//
+
+// FIXME: remove when we have a way of marking an MI with these properties.
+// FIXME: $dst1 should be a def. But the extra ops must be in the end of the
+// operand list.
+// FIXME: Should pc be an implicit operand like PICADD, etc?
+// Load-multiple that pops PC, i.e. a function return.  The '?' encoding bits
+// are resolved at emission time from the addressing submode (IA vs. DB) and
+// the writeback flag.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, mayLoad = 1,
+    hasExtraDefRegAllocReq = 1 in
+  def t2LDM_RET : T2XI<(outs),
+                    (ins addrmode4:$addr, pred:$p, reglist:$wb, variable_ops),
+                    IIC_Br, "ldm${addr:submode}${p}${addr:wide}\t$addr, $wb",
+                    []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-25} = 0b00;
+  let Inst{24-23} = {?, ?}; // IA: '01', DB: '10'
+  let Inst{22} = 0;
+  let Inst{21} = ?; // The W bit.
+  let Inst{20} = 1; // Load
+}
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+// Unconditional 32-bit branch.
+let isPredicable = 1 in
+def t2B   : T2XI<(outs), (ins brtarget:$target), IIC_Br,
+                 "b.w\t$target",
+                 [(br bb:$target)]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 1;
+}
+
+let isNotDuplicable = 1, isIndirectBranch = 1 in {
+// Jump-table dispatch: a mov to pc followed by the inline jump table ($jt).
+def t2BR_JT :
+    T2JTI<(outs),
+          (ins GPR:$target, GPR:$index, jt2block_operand:$jt, i32imm:$id),
+           IIC_Br, "mov\tpc, $target\n$jt",
+          [(ARMbr2jt GPR:$target, GPR:$index, tjumptable:$jt, imm:$id)]> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0100100;
+  let Inst{19-16} = 0b1111;
+  let Inst{14-12} = 0b000;
+  let Inst{11-8} = 0b1111; // Rd = pc
+  let Inst{7-4} = 0b0000;
+}
+
+// Table-branch (byte offsets): compact jump table keyed off pc.
+// FIXME: Add a non-pc based case that can be predicated.
+def t2TBB :
+    T2JTI<(outs),
+        (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
+         IIC_Br, "tbb\t$index\n$jt", []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction)
+  let Inst{15-8} = 0b11110000;
+  let Inst{7-4} = 0b0000; // B form
+}
+
+// Table-branch (halfword offsets); only Inst{7-4} differs from t2TBB.
+def t2TBH :
+    T2JTI<(outs),
+        (ins tb_addrmode:$index, jt2block_operand:$jt, i32imm:$id),
+         IIC_Br, "tbh\t$index\n$jt", []> {
+  let Inst{31-27} = 0b11101;
+  let Inst{26-20} = 0b0001101;
+  let Inst{19-16} = 0b1111; // Rn = pc (table follows this instruction)
+  let Inst{15-8} = 0b11110000;
+  let Inst{7-4} = 0b0001; // H form
+}
+} // isNotDuplicable, isIndirectBranch
+
+} // isBranch, isTerminator, isBarrier
+
+// FIXME: should be able to write a pattern for ARMBrcond, but can't use
+// a two-value operand where a dag node expects two operands. :(
+// Conditional branch; the condition comes from the predicate operand added
+// by the T2I base class.  Inst{12} = 0 distinguishes the conditional form
+// from the unconditional t2B (which sets Inst{12} = 1).
+let isBranch = 1, isTerminator = 1 in
+def t2Bcc : T2I<(outs), (ins brtarget:$target), IIC_Br,
+                "b", ".w\t$target",
+                [/*(ARMbrcond bb:$target, imm:$cc)*/]> {
+  let Inst{31-27} = 0b11110;
+  let Inst{15-14} = 0b10;
+  let Inst{12} = 0;
+}
+
+
+// IT block
+def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
+                    AddrModeNone, Size2Bytes,  IIC_iALUx,
+                    "it$mask\t$cc", "", []> {
+  // 16-bit instruction.
+  let Inst{31-16} = 0x0000;
+  let Inst{15-8} = 0b10111111;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//
+
+// Two piece so_imms.
+// Immediates that don't fit a single Thumb2 modified immediate but split
+// into two encodable parts: emit two chained instructions, one per part.
+def : T2Pat<(or GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2ORRri (t2ORRri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(xor GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2EORri (t2EORri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+def : T2Pat<(add GPR:$LHS, t2_so_imm2part:$RHS),
+             (t2ADDri (t2ADDri GPR:$LHS, (t2_so_imm2part_1 imm:$RHS)),
+                    (t2_so_imm2part_2 imm:$RHS))>;
+// Negated two-part immediate: add becomes a pair of subtracts.
+def : T2Pat<(add GPR:$LHS, t2_so_neg_imm2part:$RHS),
+             (t2SUBri (t2SUBri GPR:$LHS, (t2_so_neg_imm2part_1 imm:$RHS)),
+                    (t2_so_neg_imm2part_2 imm:$RHS))>;
+
+// 32-bit immediate using movw + movt.
+// This is a single pseudo instruction to make it re-materializable. Remove
+// when we can do generalized remat.
+// Pseudo that materializes an arbitrary 32-bit immediate as movw (low 16
+// bits) + movt (high 16 bits), kept as one instruction so it stays
+// rematerializable (see comment above).
+let isReMaterializable = 1 in
+def t2MOVi32imm : T2Ix2<(outs GPR:$dst), (ins i32imm:$src), IIC_iMOVi,
+                   "movw", "\t$dst, ${src:lo16}\n\tmovt${p}\t$dst, ${src:hi16}",
+                     [(set GPR:$dst, (i32 imm:$src))]>;
+
+// ConstantPool, GlobalAddress, and JumpTable
+// Global addresses use a pc-relative LEA when movt is unavailable, and the
+// movw/movt pseudo when it is; constant pools always go via LEA.
+def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>,
+           Requires<[IsThumb2, DontUseMovt]>;
+def : T2Pat<(ARMWrapper  tconstpool  :$dst), (t2LEApcrel tconstpool  :$dst)>;
+def : T2Pat<(ARMWrapper  tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>,
+           Requires<[IsThumb2, UseMovt]>;
+
+def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id),
+            (t2LEApcrelJT tjumptable:$dst, imm:$id)>;
+
+// Pseudo instruction that combines ldr from constpool and add pc. This should
+// be expanded into two instructions late to allow if-conversion and
+// scheduling.
+// NOTE(review): mayHaveSideEffects = 1 alongside canFoldAsLoad looks
+// contradictory -- presumably set to keep the pseudo from being moved before
+// its label; confirm intent.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in 
+def t2LDRpci_pic : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr, pclabel:$cp),
+                   NoItinerary, "@ ldr.w\t$dst, $addr\n$cp:\n\tadd\t$dst, pc",
+               [(set GPR:$dst, (ARMpic_add (load (ARMWrapper tconstpool:$addr)),
+                                           imm:$cp))]>,
+               Requires<[IsThumb2]>;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
new file mode 100644
index 0000000..365e1e3
--- /dev/null
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -0,0 +1,568 @@
+//===- ARMInstrVFP.td - VFP support for ARM -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the ARM VFP instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// SelectionDAG node type profiles for the VFP custom nodes.  f32 here means
+// "value held in an SPR", since the int<->FP conversions operate on the
+// integer bit pattern stored in an S register.
+def SDT_FTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDT_ITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+def SDT_CMPFP0 :
+SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_VMOVDRR :
+SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
+                     SDTCisSameAs<1, 2>]>;
+
+// Custom ISD nodes produced by ARM lowering; the compare nodes produce a
+// flag result, and FMSTAT both consumes and produces one.
+def arm_ftoui  : SDNode<"ARMISD::FTOUI",  SDT_FTOI>;
+def arm_ftosi  : SDNode<"ARMISD::FTOSI",  SDT_FTOI>;
+def arm_sitof  : SDNode<"ARMISD::SITOF",  SDT_ITOF>;
+def arm_uitof  : SDNode<"ARMISD::UITOF",  SDT_ITOF>;
+def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInFlag,SDNPOutFlag]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP",  SDT_ARMCmp, [SDNPOutFlag]>;
+def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0",SDT_CMPFP0, [SDNPOutFlag]>;
+def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR",  SDT_VMOVDRR>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+
+// FP immediate operands, valid only when the constant is encodable as a
+// VFP3 8-bit immediate (getVFPf*Imm returns -1 for unencodable values).
+def vfp_f32imm : Operand<f32>,
+                 PatLeaf<(f32 fpimm), [{
+      return ARM::getVFPf32Imm(N->getValueAPF()) != -1;
+    }]> {
+  let PrintMethod = "printVFPf32ImmOperand";
+}
+
+def vfp_f64imm : Operand<f64>,
+                 PatLeaf<(f64 fpimm), [{
+      return ARM::getVFPf64Imm(N->getValueAPF()) != -1;
+    }]> {
+  let PrintMethod = "printVFPf64ImmOperand";
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Load / store Instructions.
+//
+
+// Single-register VFP load/store with addrmode5 (reg + scaled imm8 offset).
+// NOTE(review): mayHaveSideEffects = 1 combined with canFoldAsLoad looks
+// suspicious for a plain load -- confirm it is intentional.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def VLDRD : ADI5<0b1101, 0b01, (outs DPR:$dst), (ins addrmode5:$addr),
+                 IIC_fpLoad64, "vldr", ".64\t$dst, $addr",
+                 [(set DPR:$dst, (load addrmode5:$addr))]>;
+
+def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$dst), (ins addrmode5:$addr),
+                 IIC_fpLoad32, "vldr", ".32\t$dst, $addr",
+                 [(set SPR:$dst, (load addrmode5:$addr))]>;
+} // canFoldAsLoad
+
+def VSTRD  : ADI5<0b1101, 0b00, (outs), (ins DPR:$src, addrmode5:$addr),
+                 IIC_fpStore64, "vstr", ".64\t$src, $addr",
+                 [(store DPR:$src, addrmode5:$addr)]>;
+
+def VSTRS  : ASI5<0b1101, 0b00, (outs), (ins SPR:$src, addrmode5:$addr),
+                 IIC_fpStore32, "vstr", ".32\t$src, $addr",
+                 [(store SPR:$src, addrmode5:$addr)]>;
+
+//===----------------------------------------------------------------------===//
+//  Load / store multiple Instructions.
+//
+
+// VFP load/store multiple.  The register list is carried in $wb plus
+// variable_ops; Inst{20} is the load (1) / store (0) bit.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1 in {
+def VLDMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+                           variable_ops), IIC_fpLoadm,
+                  "vldm${addr:submode}${p}\t${addr:base}, $wb",
+                  []> {
+  let Inst{20} = 1;
+}
+
+def VLDMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+                           variable_ops), IIC_fpLoadm, 
+                  "vldm${addr:submode}${p}\t${addr:base}, $wb",
+                  []> {
+  let Inst{20} = 1;
+}
+} // mayLoad, hasExtraDefRegAllocReq
+
+let mayStore = 1, hasExtraSrcRegAllocReq = 1 in {
+def VSTMD : AXDI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+                           variable_ops), IIC_fpStorem,
+                 "vstm${addr:submode}${p}\t${addr:base}, $wb",
+                 []> {
+  let Inst{20} = 0;
+}
+
+def VSTMS : AXSI5<(outs), (ins addrmode5:$addr, pred:$p, reglist:$wb,
+                           variable_ops), IIC_fpStorem,
+                 "vstm${addr:submode}${p}\t${addr:base}, $wb",
+                 []> {
+  let Inst{20} = 0;
+}
+} // mayStore, hasExtraSrcRegAllocReq
+
+// FLDMX, FSTMX - mixing S/D registers for pre-armv6 cores
+
+//===----------------------------------------------------------------------===//
+// FP Binary Operations.
+//
+
+// Double- (ADbI) and single-precision (ASbI/ASbIn) binary arithmetic.  The
+// *In variants are restricted forms usable when NEON is handling FP.
+def VADDD  : ADbI<0b11100, 0b11, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+                 IIC_fpALU64, "vadd", ".f64\t$dst, $a, $b",
+                 [(set DPR:$dst, (fadd DPR:$a, DPR:$b))]>;
+
+def VADDS  : ASbIn<0b11100, 0b11, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+                  IIC_fpALU32, "vadd", ".f32\t$dst, $a, $b",
+                  [(set SPR:$dst, (fadd SPR:$a, SPR:$b))]>;
+
+// Compares write the FPSCR condition flags; vcmpe additionally raises an
+// exception on quiet NaNs.  The non-E forms exist for the disassembler only.
+// These are encoded as unary instructions.
+let Defs = [FPSCR] in {
+def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins DPR:$a, DPR:$b),
+                 IIC_fpCMP64, "vcmpe", ".f64\t$a, $b",
+                 [(arm_cmpfp DPR:$a, DPR:$b)]>;
+
+def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins DPR:$a, DPR:$b),
+                 IIC_fpCMP64, "vcmp", ".f64\t$a, $b",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0, (outs), (ins SPR:$a, SPR:$b),
+                 IIC_fpCMP32, "vcmpe", ".f32\t$a, $b",
+                 [(arm_cmpfp SPR:$a, SPR:$b)]>;
+
+def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0, (outs), (ins SPR:$a, SPR:$b),
+                 IIC_fpCMP32, "vcmp", ".f32\t$a, $b",
+                 [/* For disassembly only; pattern left blank */]>;
+}
+
+def VDIVD  : ADbI<0b11101, 0b00, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+                 IIC_fpDIV64, "vdiv", ".f64\t$dst, $a, $b",
+                 [(set DPR:$dst, (fdiv DPR:$a, DPR:$b))]>;
+
+def VDIVS  : ASbI<0b11101, 0b00, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+                 IIC_fpDIV32, "vdiv", ".f32\t$dst, $a, $b",
+                 [(set SPR:$dst, (fdiv SPR:$a, SPR:$b))]>;
+
+def VMULD  : ADbI<0b11100, 0b10, 0, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+                 IIC_fpMUL64, "vmul", ".f64\t$dst, $a, $b",
+                 [(set DPR:$dst, (fmul DPR:$a, DPR:$b))]>;
+
+def VMULS  : ASbIn<0b11100, 0b10, 0, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+                  IIC_fpMUL32, "vmul", ".f32\t$dst, $a, $b",
+                  [(set SPR:$dst, (fmul SPR:$a, SPR:$b))]>;
+
+// Negated multiply: matches fneg-of-fmul directly...
+def VNMULD  : ADbI<0b11100, 0b10, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+                  IIC_fpMUL64, "vnmul", ".f64\t$dst, $a, $b",
+                  [(set DPR:$dst, (fneg (fmul DPR:$a, DPR:$b)))]>;
+
+def VNMULS  : ASbI<0b11100, 0b10, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+                  IIC_fpMUL32, "vnmul", ".f32\t$dst, $a, $b",
+                  [(set SPR:$dst, (fneg (fmul SPR:$a, SPR:$b)))]>;
+
+// ...and the fmul-of-fneg form, which is only equivalent when rounding is
+// not sign dependent.
+// Match reassociated forms only if not sign dependent rounding.
+def : Pat<(fmul (fneg DPR:$a), DPR:$b),
+          (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+def : Pat<(fmul (fneg SPR:$a), SPR:$b),
+          (VNMULS SPR:$a, SPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
+
+
+def VSUBD  : ADbI<0b11100, 0b11, 1, 0, (outs DPR:$dst), (ins DPR:$a, DPR:$b),
+                 IIC_fpALU64, "vsub", ".f64\t$dst, $a, $b",
+                 [(set DPR:$dst, (fsub DPR:$a, DPR:$b))]>;
+
+def VSUBS  : ASbIn<0b11100, 0b11, 1, 0, (outs SPR:$dst), (ins SPR:$a, SPR:$b),
+                  IIC_fpALU32, "vsub", ".f32\t$dst, $a, $b",
+                  [(set SPR:$dst, (fsub SPR:$a, SPR:$b))]>;
+
+//===----------------------------------------------------------------------===//
+// FP Unary Operations.
+//
+
+// Absolute value.
+def VABSD  : ADuI<0b11101, 0b11, 0b0000, 0b11, 0, (outs DPR:$dst), (ins DPR:$a),
+                 IIC_fpUNA64, "vabs", ".f64\t$dst, $a",
+                 [(set DPR:$dst, (fabs DPR:$a))]>;
+
+def VABSS  : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,(outs SPR:$dst), (ins SPR:$a),
+                  IIC_fpUNA32, "vabs", ".f32\t$dst, $a",
+                  [(set SPR:$dst, (fabs SPR:$a))]>;
+
+// Compare against #0.0, setting the FPSCR flags (see the two-operand
+// compares above for the E vs. non-E distinction).
+let Defs = [FPSCR] in {
+def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins DPR:$a),
+                  IIC_fpCMP64, "vcmpe", ".f64\t$a, #0",
+                  [(arm_cmpfp0 DPR:$a)]>;
+
+def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins DPR:$a),
+                  IIC_fpCMP64, "vcmp", ".f64\t$a, #0",
+                  [/* For disassembly only; pattern left blank */]>;
+
+def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0, (outs), (ins SPR:$a),
+                  IIC_fpCMP32, "vcmpe", ".f32\t$a, #0",
+                  [(arm_cmpfp0 SPR:$a)]>;
+
+def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0, (outs), (ins SPR:$a),
+                  IIC_fpCMP32, "vcmp", ".f32\t$a, #0",
+                  [/* For disassembly only; pattern left blank */]>;
+}
+
+// f32 -> f64 extension.
+def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0, (outs DPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTDS, "vcvt", ".f64.f32\t$dst, $a",
+                 [(set DPR:$dst, (fextend SPR:$a))]>;
+
+// f64 -> f32 rounding.
+// Special case encoding: bits 11-8 is 0b1011.
+def VCVTSD : VFPAI<(outs SPR:$dst), (ins DPR:$a), VFPUnaryFrm,
+                   IIC_fpCVTSD, "vcvt", ".f32.f64\t$dst, $a",
+                   [(set SPR:$dst, (fround DPR:$a))]> {
+  let Inst{27-23} = 0b11101;
+  let Inst{21-16} = 0b110111;
+  let Inst{11-8}  = 0b1011;
+  let Inst{7-6}   = 0b11;
+  let Inst{4}     = 0;
+}
+
+// Between half-precision and single-precision.  For disassembly only.
+// vcvtb operates on the bottom half of the S register, vcvtt on the top.
+
+def VCVTBSH : ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
+                 /* FIXME */ IIC_fpCVTDS, "vcvtb", ".f32.f16\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTBHS : ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
+                 /* FIXME */ IIC_fpCVTDS, "vcvtb", ".f16.f32\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTTSH : ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
+                 /* FIXME */ IIC_fpCVTDS, "vcvtt", ".f32.f16\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]>;
+
+def VCVTTHS : ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
+                 /* FIXME */ IIC_fpCVTDS, "vcvtt", ".f16.f32\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]>;
+
+// Register-to-register FP moves; no patterns, emitted by copy lowering.
+let neverHasSideEffects = 1 in {
+def VMOVD: ADuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs DPR:$dst), (ins DPR:$a),
+                 IIC_fpUNA64, "vmov", ".f64\t$dst, $a", []>;
+
+def VMOVS: ASuI<0b11101, 0b11, 0b0000, 0b01, 0, (outs SPR:$dst), (ins SPR:$a),
+                 IIC_fpUNA32, "vmov", ".f32\t$dst, $a", []>;
+} // neverHasSideEffects
+
+// Negation.
+def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0, (outs DPR:$dst), (ins DPR:$a),
+                 IIC_fpUNA64, "vneg", ".f64\t$dst, $a",
+                 [(set DPR:$dst, (fneg DPR:$a))]>;
+
+def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,(outs SPR:$dst), (ins SPR:$a),
+                  IIC_fpUNA32, "vneg", ".f32\t$dst, $a",
+                  [(set SPR:$dst, (fneg SPR:$a))]>;
+
+// Square root.
+def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs DPR:$dst), (ins DPR:$a),
+                 IIC_fpSQRT64, "vsqrt", ".f64\t$dst, $a",
+                 [(set DPR:$dst, (fsqrt DPR:$a))]>;
+
+def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0, (outs SPR:$dst), (ins SPR:$a),
+                 IIC_fpSQRT32, "vsqrt", ".f32\t$dst, $a",
+                 [(set SPR:$dst, (fsqrt SPR:$a))]>;
+
+//===----------------------------------------------------------------------===//
+// FP <-> GPR Copies.  Int <-> FP Conversions.
+//
+
+// Bit-pattern moves between the FP and integer register files.
+def VMOVRS : AVConv2I<0b11100001, 0b1010, (outs GPR:$dst), (ins SPR:$src),
+                 IIC_VMOVSI, "vmov", "\t$dst, $src",
+                 [(set GPR:$dst, (bitconvert SPR:$src))]>;
+
+def VMOVSR : AVConv4I<0b11100000, 0b1010, (outs SPR:$dst), (ins GPR:$src),
+                 IIC_VMOVIS, "vmov", "\t$dst, $src",
+                 [(set SPR:$dst, (bitconvert GPR:$src))]>;
+
+// D register -> two GPRs.
+def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
+                      (outs GPR:$wb, GPR:$dst2), (ins DPR:$src),
+                 IIC_VMOVDI, "vmov", "\t$wb, $dst2, $src",
+                 [/* FIXME: Can't write pattern for multiple result instr*/]> {
+  let Inst{7-6} = 0b00;
+}
+
+def VMOVRRS  : AVConv3I<0b11000101, 0b1010,
+                      (outs GPR:$wb, GPR:$dst2), (ins SPR:$src1, SPR:$src2),
+                 IIC_VMOVDI, "vmov", "\t$wb, $dst2, $src1, $src2",
+                 [/* For disassembly only; pattern left blank */]> {
+  let Inst{7-6} = 0b00;
+}
+
+// FMDHR: GPR -> SPR
+// FMDLR: GPR -> SPR
+
+// Two GPRs -> D register (matches the ARMISD::VMOVDRR node).
+def VMOVDRR : AVConv5I<0b11000100, 0b1011,
+                     (outs DPR:$dst), (ins GPR:$src1, GPR:$src2),
+                IIC_VMOVID, "vmov", "\t$dst, $src1, $src2",
+                [(set DPR:$dst, (arm_fmdrr GPR:$src1, GPR:$src2))]> {
+  let Inst{7-6} = 0b00;
+}
+
+def VMOVSRR : AVConv5I<0b11000100, 0b1010,
+                     (outs SPR:$dst1, SPR:$dst2), (ins GPR:$src1, GPR:$src2),
+                IIC_VMOVID, "vmov", "\t$dst1, $dst2, $src1, $src2",
+                [/* For disassembly only; pattern left blank */]> {
+  let Inst{7-6} = 0b00;
+}
+
+// FMRDH: SPR -> GPR
+// FMRDL: SPR -> GPR
+// FMRRS: SPR -> GPR
+// FMRX : SPR system reg -> GPR
+
+// FMSRR: GPR -> SPR
+
+// FMXR: GPR -> VFP system reg
+
+
+// Int to FP:
+
+// Int -> FP conversions.  The integer source lives in an SPR (see
+// SDT_ITOF); Inst{7} selects signed (1) vs. unsigned (0) interpretation.
+def VSITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011,
+                 (outs DPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTID, "vcvt", ".f64.s32\t$dst, $a",
+                 [(set DPR:$dst, (arm_sitof SPR:$a))]> {
+  let Inst{7} = 1; // s32
+}
+
+def VSITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010,
+                 (outs SPR:$dst),(ins SPR:$a),
+                 IIC_fpCVTIS, "vcvt", ".f32.s32\t$dst, $a",
+                 [(set SPR:$dst, (arm_sitof SPR:$a))]> {
+  let Inst{7} = 1; // s32
+}
+
+def VUITOD : AVConv1I<0b11101, 0b11, 0b1000, 0b1011,
+                 (outs DPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTID, "vcvt", ".f64.u32\t$dst, $a",
+                 [(set DPR:$dst, (arm_uitof SPR:$a))]> {
+  let Inst{7} = 0; // u32
+}
+
+def VUITOS : AVConv1In<0b11101, 0b11, 0b1000, 0b1010,
+                 (outs SPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTIS, "vcvt", ".f32.u32\t$dst, $a",
+                 [(set SPR:$dst, (arm_uitof SPR:$a))]> {
+  let Inst{7} = 0; // u32
+}
+
+// FP to Int:
+// Always set Z bit in the instruction, i.e. "round towards zero" variants.
+
+// FP -> int conversions, round-towards-zero (Z bit set); the integer result
+// is produced in an SPR (see SDT_FTOI).
+def VTOSIZD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011,
+                       (outs SPR:$dst), (ins DPR:$a),
+                 IIC_fpCVTDI, "vcvt", ".s32.f64\t$dst, $a",
+                 [(set SPR:$dst, (arm_ftosi DPR:$a))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOSIZS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010,
+                        (outs SPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTSI, "vcvt", ".s32.f32\t$dst, $a",
+                 [(set SPR:$dst, (arm_ftosi SPR:$a))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOUIZD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011,
+                       (outs SPR:$dst), (ins DPR:$a),
+                 IIC_fpCVTDI, "vcvt", ".u32.f64\t$dst, $a",
+                 [(set SPR:$dst, (arm_ftoui DPR:$a))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+def VTOUIZS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010,
+                        (outs SPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTSI, "vcvt", ".u32.f32\t$dst, $a",
+                 [(set SPR:$dst, (arm_ftoui SPR:$a))]> {
+  let Inst{7} = 1; // Z bit
+}
+
+// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
+// For disassembly only.
+
+def VTOSIRD : AVConv1I<0b11101, 0b11, 0b1101, 0b1011,
+                       (outs SPR:$dst), (ins DPR:$a),
+                 IIC_fpCVTDI, "vcvtr", ".s32.f64\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOSIRS : AVConv1In<0b11101, 0b11, 0b1101, 0b1010,
+                        (outs SPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTSI, "vcvtr", ".s32.f32\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRD : AVConv1I<0b11101, 0b11, 0b1100, 0b1011,
+                       (outs SPR:$dst), (ins DPR:$a),
+                 IIC_fpCVTDI, "vcvtr", ".u32.f64\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]> {
+  let Inst{7} = 0; // Z bit
+}
+
+def VTOUIRS : AVConv1In<0b11101, 0b11, 0b1100, 0b1010,
+                        (outs SPR:$dst), (ins SPR:$a),
+                 IIC_fpCVTSI, "vcvtr", ".u32.f32\t$dst, $a",
+                 [/* For disassembly only; pattern left blank */]> {
+  let Inst{7} = 0; // Z bit
+}
+
+//===----------------------------------------------------------------------===//
+// FP FMA Operations.
+//
+
+// Multiply-accumulate family.  These are three-address at the ISA level but
+// read-modify-write on the accumulator, hence the $dstin = $dst constraint.
+// vmla:  dst += a * b
+def VMLAD : ADbI<0b11100, 0b00, 0, 0,
+                (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+                IIC_fpMAC64, "vmla", ".f64\t$dst, $a, $b",
+                [(set DPR:$dst, (fadd (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
+                 (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+                 IIC_fpMAC32, "vmla", ".f32\t$dst, $a, $b",
+                 [(set SPR:$dst, (fadd (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+                 RegConstraint<"$dstin = $dst">;
+
+// vnmls: dst = a * b - dst
+def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
+                (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+                IIC_fpMAC64, "vnmls", ".f64\t$dst, $a, $b",
+                [(set DPR:$dst, (fsub (fmul DPR:$a, DPR:$b), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
+                (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+                IIC_fpMAC32, "vnmls", ".f32\t$dst, $a, $b",
+                [(set SPR:$dst, (fsub (fmul SPR:$a, SPR:$b), SPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+// vmls:  dst -= a * b
+def VMLSD : ADbI<0b11100, 0b00, 1, 0,
+                 (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+                 IIC_fpMAC64, "vmls", ".f64\t$dst, $a, $b",
+             [(set DPR:$dst, (fadd (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
+                  (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+                  IIC_fpMAC32, "vmls", ".f32\t$dst, $a, $b",
+             [(set SPR:$dst, (fadd (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+// fsub-of-fmul also maps onto vmls, but only when VFP (not NEON) handles FP.
+def : Pat<(fsub DPR:$dstin, (fmul DPR:$a, DPR:$b)),
+          (VMLSD DPR:$dstin, DPR:$a, DPR:$b)>, Requires<[DontUseNEONForFP]>;
+def : Pat<(fsub SPR:$dstin, (fmul SPR:$a, SPR:$b)),
+          (VMLSS SPR:$dstin, SPR:$a, SPR:$b)>, Requires<[DontUseNEONForFP]>;
+
+// vnmla: dst = -(a * b) - dst
+def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
+                 (outs DPR:$dst), (ins DPR:$dstin, DPR:$a, DPR:$b),
+                 IIC_fpMAC64, "vnmla", ".f64\t$dst, $a, $b",
+             [(set DPR:$dst, (fsub (fneg (fmul DPR:$a, DPR:$b)), DPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
+                (outs SPR:$dst), (ins SPR:$dstin, SPR:$a, SPR:$b),
+                IIC_fpMAC32, "vnmla", ".f32\t$dst, $a, $b",
+             [(set SPR:$dst, (fsub (fneg (fmul SPR:$a, SPR:$b)), SPR:$dstin))]>,
+                RegConstraint<"$dstin = $dst">;
+
+//===----------------------------------------------------------------------===//
+// FP Conditional moves.
+//
+
+// Predicated moves used for FP select; $false is tied to $dst so the
+// destination holds the false value when the predicate fails.
+def VMOVDcc  : ADuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                    (outs DPR:$dst), (ins DPR:$false, DPR:$true),
+                    IIC_fpUNA64, "vmov", ".f64\t$dst, $true",
+                [/*(set DPR:$dst, (ARMcmov DPR:$false, DPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
+
+def VMOVScc  : ASuI<0b11101, 0b11, 0b0000, 0b01, 0,
+                    (outs SPR:$dst), (ins SPR:$false, SPR:$true),
+                    IIC_fpUNA32, "vmov", ".f32\t$dst, $true",
+                [/*(set SPR:$dst, (ARMcmov SPR:$false, SPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
+
+// Conditional negate variants.
+def VNEGDcc  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
+                    (outs DPR:$dst), (ins DPR:$false, DPR:$true),
+                    IIC_fpUNA64, "vneg", ".f64\t$dst, $true",
+                [/*(set DPR:$dst, (ARMcneg DPR:$false, DPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
+
+def VNEGScc  : ASuI<0b11101, 0b11, 0b0001, 0b01, 0,
+                    (outs SPR:$dst), (ins SPR:$false, SPR:$true),
+                    IIC_fpUNA32, "vneg", ".f32\t$dst, $true",
+                [/*(set SPR:$dst, (ARMcneg SPR:$false, SPR:$true, imm:$cc))*/]>,
+                    RegConstraint<"$false = $dst">;
+
+
+//===----------------------------------------------------------------------===//
+// Misc.
+//
+
+// APSR is the application level alias of CPSR. This instruction transfers
+// the FPSCR N, Z, C, V flags to APSR.
+// vmrs apsr_nzcv, fpscr: Inst{15-12} = 0b1111 encodes Rt = pc, the special
+// form that transfers the flags to APSR instead of a GPR.
+let Defs = [CPSR], Uses = [FPSCR] in
+def FMSTAT : VFPAI<(outs), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs",
+                   "\tapsr_nzcv, fpscr",
+             [(arm_fmstat)]> {
+  let Inst{27-20} = 0b11101111;
+  let Inst{19-16} = 0b0001;
+  let Inst{15-12} = 0b1111;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{4}     = 1;
+}
+
+// FPSCR <-> GPR (for disassembly only)
+
+// FPSCR -> GPR.  Inst{15-12} is left for the destination register operand
+// (contrast with FMSTAT above, which hard-wires it to APSR).
+let Uses = [FPSCR] in {
+def VMRS : VFPAI<(outs GPR:$dst), (ins), VFPMiscFrm, IIC_fpSTAT, "vmrs",
+                 "\t$dst, fpscr",
+             [/* For disassembly only; pattern left blank */]> {
+  let Inst{27-20} = 0b11101111;
+  let Inst{19-16} = 0b0001;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{4}     = 1;
+}
+}
+
+// GPR -> FPSCR.
+let Defs = [FPSCR] in {
+def VMSR : VFPAI<(outs), (ins GPR:$src), VFPMiscFrm, IIC_fpSTAT, "vmsr",
+                 "\tfpscr, $src",
+             [/* For disassembly only; pattern left blank */]> {
+  let Inst{27-20} = 0b11101110;
+  let Inst{19-16} = 0b0001;
+  let Inst{11-8}  = 0b1010;
+  let Inst{7}     = 0;
+  let Inst{4}     = 1;
+}
+}
+
+// Materialize FP immediates. VFP3 only.
+let isReMaterializable = 1 in {
+def FCONSTD : VFPAI<(outs DPR:$dst), (ins vfp_f64imm:$imm),
+                    VFPMiscFrm, IIC_VMOVImm,
+                    "vmov", ".f64\t$dst, $imm",
+                    [(set DPR:$dst, vfp_f64imm:$imm)]>, Requires<[HasVFP3]> {
+  let Inst{27-23} = 0b11101;
+  let Inst{21-20} = 0b11;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;
+  let Inst{7-4}   = 0b0000;
+}
+
+def FCONSTS : VFPAI<(outs SPR:$dst), (ins vfp_f32imm:$imm),
+                    VFPMiscFrm, IIC_VMOVImm,
+                    "vmov", ".f32\t$dst, $imm",
+                    [(set SPR:$dst, vfp_f32imm:$imm)]>, Requires<[HasVFP3]> {
+  let Inst{27-23} = 0b11101;
+  let Inst{21-20} = 0b11;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;
+  let Inst{7-4}   = 0b0000;
+}
+}
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
new file mode 100644
index 0000000..bef5a06
--- /dev/null
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -0,0 +1,323 @@
+//===-- ARMJITInfo.cpp - Implement the JIT interfaces for the ARM target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the ARM target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "ARMJITInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMRelocations.h"
+#include "ARMSubtarget.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Memory.h"
+#include <cstdlib>
+using namespace llvm;
+
+/// replaceMachineCodeForFunction - Not implemented for ARM; report a fatal
+/// error if the JIT ever requests this self-modification.
+void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  llvm_report_error("ARMJITInfo::replaceMachineCodeForFunction");
+}
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+/// Set once by getLazyResolverFunction() and read by ARMCompilationCallbackC.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+// __USER_LABEL_PREFIX__ is predefined by the compiler; the two-step macro
+// expansion stringizes its *value* so it can be spliced into the asm below.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us (we need
+// to preserve more context and manipulate the stack directly).  Instead,
+// write our own wrapper, which does things our way, so we have complete
+// control over register saving and restoring.
+extern "C" {
+#if defined(__arm__)
+  // Hand-written assembly entry point; declared here so it can be taken as a
+  // function pointer by getLazyResolverFunction/emitFunctionStub.
+  void ARMCompilationCallback();
+  asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl " ASMPREFIX "ARMCompilationCallback\n"
+    ASMPREFIX "ARMCompilationCallback:\n"
+    // Save caller saved registers since they may contain stuff
+    // for the real target function right now. We have to act as if this
+    // whole compilation callback doesn't exist as far as the caller is
+    // concerned, so we can't just preserve the callee saved regs.
+    "stmdb sp!, {r0, r1, r2, r3, lr}\n"
+#ifndef __SOFTFP__
+    "fstmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+    // The LR contains the address of the stub function on entry.
+    // pass it as the argument to the C part of the callback
+    "mov  r0, lr\n"
+    // NOTE(review): the sub/add pair around the call adjusts sp by 4 —
+    // presumably to keep the stack 8-byte aligned for the C callee; confirm
+    // against the AAPCS requirement.
+    "sub  sp, sp, #4\n"
+    // Call the C portion of the callback
+    "bl   " ASMPREFIX "ARMCompilationCallbackC\n"
+    "add  sp, sp, #4\n"
+    // Restoring the LR to the return address of the function that invoked
+    // the stub and de-allocating the stack space for it requires us to
+    // swap the two saved LR values on the stack, as they're backwards
+    // for what we need since the pop instruction has a pre-determined
+    // order for the registers.
+    //      +--------+
+    //   0  | LR     | Original return address
+    //      +--------+
+    //   1  | LR     | Stub address (start of stub)
+    // 2-5  | R3..R0 | Saved registers (we need to preserve all regs)
+    // 6-20 | D0..D7 | Saved VFP registers
+    //      +--------+
+    //
+#ifndef __SOFTFP__
+    // Restore VFP caller-saved registers.
+    "fldmfdd sp!, {d0, d1, d2, d3, d4, d5, d6, d7}\n"
+#endif
+    //
+    //      We need to exchange the values in slots 0 and 1 so we can
+    //      return to the address in slot 1 with the address in slot 0
+    //      restored to the LR.
+    "ldr  r0, [sp,#20]\n"
+    "ldr  r1, [sp,#16]\n"
+    "str  r1, [sp,#20]\n"
+    "str  r0, [sp,#16]\n"
+    // Return to the (newly modified) stub to invoke the real function.
+    // The above twiddling of the saved return addresses allows us to
+    // deallocate everything, including the LR the stub saved, all in one
+    // pop instruction.
+    "ldmia  sp!, {r0, r1, r2, r3, lr, pc}\n"
+      );
+#else  // Not an ARM host
+  // Stub for non-ARM hosts so the symbol always exists; must never be called.
+  void ARMCompilationCallback() {
+    llvm_unreachable("Cannot call ARMCompilationCallback() on a non-ARM arch!");
+  }
+#endif
+}
+
+/// ARMCompilationCallbackC - This is the target-specific function invoked
+/// by the function stub when we did not know the real target of a call.
+/// This function must locate the start of the stub or call site and pass
+/// it into the JIT compiler function.
+extern "C" void ARMCompilationCallbackC(intptr_t StubAddr) {
+  // Get the address of the compiled code for this function.
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)StubAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call. We're replacing the first two instructions of the
+  // stub with:
+  //   ldr pc, [pc,#-4]
+  //   <addr>
+  // NOTE(review): the stores below write intptr_t-sized units; each stub slot
+  // is a 4-byte word, so this assumes a 32-bit host (true for the ARM hosts
+  // this code runs on) — confirm if this is ever built for a 64-bit target.
+  if (!sys::Memory::setRangeWritable((void*)StubAddr, 8)) {
+    llvm_unreachable("ERROR: Unable to mark stub writable");
+  }
+  *(intptr_t *)StubAddr = 0xe51ff004;  // ldr pc, [pc, #-4]
+  *(intptr_t *)(StubAddr+4) = NewVal;
+  // Flush/repermission so the rewritten instructions are actually fetched.
+  if (!sys::Memory::setRangeExecutable((void*)StubAddr, 8)) {
+    llvm_unreachable("ERROR: Unable to mark stub executable");
+  }
+}
+
+/// getLazyResolverFunction - Record the JIT compiler entry point in the
+/// file-static JITCompilerFunction and hand back the assembly callback that
+/// stubs will branch to.
+TargetJITInfo::LazyResolverFn
+ARMJITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+  return ARMCompilationCallback;
+}
+
+/// emitGlobalValueIndirectSym - Emit a 4-byte little-endian word holding Ptr
+/// as an indirect-symbol slot for GV, record the Ptr -> slot mapping for
+/// later stub emission, and return the slot's address.
+void *ARMJITInfo::emitGlobalValueIndirectSym(const GlobalValue *GV, void *Ptr,
+                                             JITCodeEmitter &JCE) {
+  uint8_t Buffer[4];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)Ptr);
+  void *PtrAddr = JCE.allocIndirectGV(
+      GV, Buffer, sizeof(Buffer), /*Alignment=*/4);
+  addIndirectSymAddr(Ptr, (intptr_t)PtrAddr);
+  return PtrAddr;
+}
+
+TargetJITInfo::StubLayout ARMJITInfo::getStubLayout() {
+  // The stub contains up to 3 4-byte instructions, aligned at 4 bytes, and a
+  // 4-byte address.  See emitFunctionStub for details.
+  // 16 bytes covers the largest variants (PIC and compilation-callback
+  // stubs); the direct non-PIC stub uses only 8 of them.
+  StubLayout Result = {16, 4};
+  return Result;
+}
+
+/// emitFunctionStub - Emit one of three stub flavors:
+///  1) PIC branch to a known function: ip-relative load through a lazy-ptr
+///     slot (16 bytes);
+///  2) non-PIC branch to a known function: literal load of the address into
+///     pc (8 bytes);
+///  3) lazy-compilation stub: push lr, point lr back at the stub, and jump
+///     to ARMCompilationCallback, which later overwrites the stub's first
+///     two words (16 bytes).
+void *ARMJITInfo::emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE) {
+  void *Addr;
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)ARMCompilationCallback) {
+    // Branch to the corresponding function addr.
+    if (IsPIC) {
+      // The stub is 16-byte size and 4-aligned.
+      intptr_t LazyPtr = getIndirectSymAddr(Fn);
+      if (!LazyPtr) {
+        // In PIC mode, the function stub is loading a lazy-ptr.
+        // NOTE(review): the (GlobalValue*)F cast drops const-ness of F.
+        LazyPtr= (intptr_t)emitGlobalValueIndirectSym((GlobalValue*)F, Fn, JCE);
+        DEBUG(if (F)
+                errs() << "JIT: Indirect symbol emitted at [" << LazyPtr
+                       << "] for GV '" << F->getName() << "'\n";
+              else
+                errs() << "JIT: Stub emitted at [" << LazyPtr
+                       << "] for external function at '" << Fn << "'\n");
+      }
+      JCE.emitAlignment(4);
+      Addr = (void*)JCE.getCurrentPCValue();
+      if (!sys::Memory::setRangeWritable(Addr, 16)) {
+        llvm_unreachable("ERROR: Unable to mark stub writable");
+      }
+      // Load the pc-relative displacement of the lazy-ptr into ip, add the
+      // current pc to form its absolute address, then jump through it.
+      JCE.emitWordLE(0xe59fc004);            // ldr ip, [pc, #+4]
+      JCE.emitWordLE(0xe08fc00c);            // L_func$scv: add ip, pc, ip
+      JCE.emitWordLE(0xe59cf000);            // ldr pc, [ip]
+      JCE.emitWordLE(LazyPtr - (intptr_t(Addr)+4+8));  // func - (L_func$scv+8)
+      sys::Memory::InvalidateInstructionCache(Addr, 16);
+      if (!sys::Memory::setRangeExecutable(Addr, 16)) {
+        llvm_unreachable("ERROR: Unable to mark stub executable");
+      }
+    } else {
+      // The stub is 8-byte size and 4-aligned.
+      JCE.emitAlignment(4);
+      Addr = (void*)JCE.getCurrentPCValue();
+      if (!sys::Memory::setRangeWritable(Addr, 8)) {
+        llvm_unreachable("ERROR: Unable to mark stub writable");
+      }
+      JCE.emitWordLE(0xe51ff004);    // ldr pc, [pc, #-4]
+      JCE.emitWordLE((intptr_t)Fn);  // addr of function
+      sys::Memory::InvalidateInstructionCache(Addr, 8);
+      if (!sys::Memory::setRangeExecutable(Addr, 8)) {
+        llvm_unreachable("ERROR: Unable to mark stub executable");
+      }
+    }
+  } else {
+    // The compilation callback will overwrite the first two words of this
+    // stub with indirect branch instructions targeting the compiled code.
+    // This stub sets the return address to restart the stub, so that
+    // the new branch will be invoked when we come back.
+    //
+    // Branch and link to the compilation callback.
+    // The stub is 16-byte size and 4-byte aligned.
+    JCE.emitAlignment(4);
+    Addr = (void*)JCE.getCurrentPCValue();
+    if (!sys::Memory::setRangeWritable(Addr, 16)) {
+      llvm_unreachable("ERROR: Unable to mark stub writable");
+    }
+    // Save LR so the callback can determine which stub called it.
+    // The compilation callback is responsible for popping this prior
+    // to returning.
+    JCE.emitWordLE(0xe92d4000); // push {lr}
+    // Set the return address to go back to the start of this stub.
+    JCE.emitWordLE(0xe24fe00c); // sub lr, pc, #12
+    // Invoke the compilation callback.
+    JCE.emitWordLE(0xe51ff004); // ldr pc, [pc, #-4]
+    // The address of the compilation callback.
+    JCE.emitWordLE((intptr_t)ARMCompilationCallback);
+    sys::Memory::InvalidateInstructionCache(Addr, 16);
+    if (!sys::Memory::setRangeExecutable(Addr, 16)) {
+      llvm_unreachable("ERROR: Unable to mark stub executable");
+    }
+  }
+
+  return Addr;
+}
+
+/// resolveRelocDestAddr - Compute the value a relocation should resolve to.
+/// Absolute relocations pass the result pointer through; pc-label, constant
+/// pool and jump table kinds are resolved against the maps populated during
+/// code emission.
+intptr_t ARMJITInfo::resolveRelocDestAddr(MachineRelocation *MR) const {
+  ARM::RelocationType RT = (ARM::RelocationType)MR->getRelocationType();
+  switch (RT) {
+  default:
+    return (intptr_t)(MR->getResultPointer());
+  case ARM::reloc_arm_pic_jt:
+    // Destination address - jump table base.
+    return (intptr_t)(MR->getResultPointer()) - MR->getConstantVal();
+  case ARM::reloc_arm_jt_base:
+    // Jump table base address.
+    return getJumpTableBaseAddr(MR->getJumpTableIndex());
+  case ARM::reloc_arm_cp_entry:
+  case ARM::reloc_arm_vfp_cp_entry:
+    // Constant pool entry address.
+    return getConstantPoolEntryAddr(MR->getConstantPoolIndex());
+  case ARM::reloc_arm_machine_cp_entry: {
+    ARMConstantPoolValue *ACPV = (ARMConstantPoolValue*)MR->getConstantVal();
+    assert((!ACPV->hasModifier() && !ACPV->mustAddCurrentAddress()) &&
+           "Can't handle this machine constant pool entry yet!");
+    // Result is the distance from the referencing PC label (plus its pipeline
+    // adjustment) to the target.
+    intptr_t Addr = (intptr_t)(MR->getResultPointer());
+    Addr -= getPCLabelAddr(ACPV->getLabelId()) + ACPV->getPCAdjustment();
+    return Addr;
+  }
+  }
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+/// NOTE(review): the patches below read-modify-write instructions through
+/// intptr_t*; each instruction is a 4-byte word, so this assumes a 32-bit
+/// host — confirm if this is ever built for a 64-bit target.
+void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = resolveRelocDestAddr(MR);
+    switch ((ARM::RelocationType)MR->getRelocationType()) {
+    case ARM::reloc_arm_cp_entry:
+    case ARM::reloc_arm_vfp_cp_entry:
+    case ARM::reloc_arm_relative: {
+      // It is necessary to calculate the correct PC relative value. We
+      // subtract the base addr from the target addr to form a byte offset.
+      // The -8 accounts for the ARM pipeline: PC reads as instruction+8.
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      // If the result is positive, set bit U(23) to 1.
+      if (ResultPtr >= 0)
+        *((intptr_t*)RelocPos) |= 1 << ARMII::U_BitShift;
+      else {
+        // Otherwise, obtain the absolute value and set bit U(23) to 0.
+        *((intptr_t*)RelocPos) &= ~(1 << ARMII::U_BitShift);
+        ResultPtr = - ResultPtr;
+      }
+      // Set the immed value calculated.
+      // VFP immediate offset is multiplied by 4.
+      if (MR->getRelocationType() == ARM::reloc_arm_vfp_cp_entry)
+        ResultPtr = ResultPtr >> 2;
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      // Set register Rn to PC.
+      *((intptr_t*)RelocPos) |=
+        ARMRegisterInfo::getRegisterNumbering(ARM::PC) << ARMII::RegRnShift;
+      break;
+    }
+    case ARM::reloc_arm_pic_jt:
+    case ARM::reloc_arm_machine_cp_entry:
+    case ARM::reloc_arm_absolute: {
+      // These addresses have already been resolved.
+      *((intptr_t*)RelocPos) |= (intptr_t)ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_branch: {
+      // It is necessary to calculate the correct value of signed_immed_24
+      // field. We subtract the base addr from the target addr to form a
+      // byte offset, which must be inside the range -33554432 and +33554428.
+      // Then, we set the signed_immed_24 field of the instruction to bits
+      // [25:2] of the byte offset. More details ARM-ARM p. A4-11.
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      ResultPtr = (ResultPtr & 0x03FFFFFC) >> 2;
+      // NOTE(review): this assert can never fire — ResultPtr was just masked
+      // to 24 non-negative bits; the range check belongs before the mask.
+      assert(ResultPtr >= -33554432 && ResultPtr <= 33554428);
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      break;
+    }
+    case ARM::reloc_arm_jt_base: {
+      // JT base - (instruction addr + 8)
+      ResultPtr = ResultPtr - (intptr_t)RelocPos - 8;
+      *((intptr_t*)RelocPos) |= ResultPtr;
+      break;
+    }
+    }
+  }
+}
diff --git a/lib/Target/ARM/ARMJITInfo.h b/lib/Target/ARM/ARMJITInfo.h
new file mode 100644
index 0000000..ff332b7
--- /dev/null
+++ b/lib/Target/ARM/ARMJITInfo.h
@@ -0,0 +1,182 @@
+//===- ARMJITInfo.h - ARM implementation of the JIT interface  --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMJITINFO_H
+#define ARMJITINFO_H
+
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+
+namespace llvm {
+  class ARMTargetMachine;
+
+  // ARMJITInfo - ARM implementation of TargetJITInfo. Owns the side tables
+  // (constant pool, jump table, pc-label, indirect-symbol addresses) that the
+  // custom relocation logic in ARMJITInfo.cpp consults.
+  class ARMJITInfo : public TargetJITInfo {
+    // ConstPoolId2AddrMap - A map from constant pool ids to the corresponding
+    // CONSTPOOL_ENTRY addresses.
+    SmallVector<intptr_t, 16> ConstPoolId2AddrMap;
+
+    // JumpTableId2AddrMap - A map from inline jumptable ids to the
+    // corresponding inline jump table bases.
+    SmallVector<intptr_t, 16> JumpTableId2AddrMap;
+
+    // PCLabelMap - A map from PC labels to addresses.
+    DenseMap<unsigned, intptr_t> PCLabelMap;
+
+    // Sym2IndirectSymMap - A map from symbol (GlobalValue and ExternalSymbol)
+    // addresses to their indirect symbol addresses.
+    DenseMap<void*, intptr_t> Sym2IndirectSymMap;
+
+    // IsPIC - True if the relocation model is PIC. This is used to determine
+    // how to codegen function stubs.
+    bool IsPIC;
+
+  public:
+    // ARM stubs resolve targets directly; no GOT is used.
+    explicit ARMJITInfo() : IsPIC(false) { useGOT = false; }
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+    /// to emit an indirect symbol which contains the address of the specified
+    /// ptr.
+    virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                            JITCodeEmitter &JCE);
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on ARM.
+    virtual StubLayout getStubLayout();
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+
+    /// hasCustomConstantPool - Allows a target to specify that constant
+    /// pool address resolution is handled by the target.
+    virtual bool hasCustomConstantPool() const { return true; }
+
+    /// hasCustomJumpTables - Allows a target to specify that jumptables
+    /// are emitted by the target.
+    virtual bool hasCustomJumpTables() const { return true; }
+
+    /// allocateSeparateGVMemory - If true, globals should be placed in
+    /// separately allocated heap memory rather than in the same
+    /// code memory allocated by JITCodeEmitter.
+    virtual bool allocateSeparateGVMemory() const {
+#ifdef __APPLE__
+      return true;
+#else
+      return false;
+#endif
+    }
+
+    /// Initialize - Initialize internal stage for the function being JITted.
+    /// Resize constant pool ids to CONSTPOOL_ENTRY addresses map; resize
+    /// jump table ids to jump table bases map; remember if codegen relocation
+    /// model is PIC.
+    void Initialize(const MachineFunction &MF, bool isPIC) {
+      const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+      ConstPoolId2AddrMap.resize(AFI->getNumConstPoolEntries());
+      JumpTableId2AddrMap.resize(AFI->getNumJumpTables());
+      IsPIC = isPIC;
+    }
+
+    /// getConstantPoolEntryAddr - The ARM target puts all constant
+    /// pool entries into constant islands. This returns the address of the
+    /// constant pool entry of the specified index.
+    intptr_t getConstantPoolEntryAddr(unsigned CPI) const {
+      assert(CPI < ConstPoolId2AddrMap.size());
+      return ConstPoolId2AddrMap[CPI];
+    }
+
+    /// addConstantPoolEntryAddr - Map a Constant Pool Index to the address
+    /// where its associated value is stored. When relocations are processed,
+    /// this value will be used to resolve references to the constant.
+    void addConstantPoolEntryAddr(unsigned CPI, intptr_t Addr) {
+      assert(CPI < ConstPoolId2AddrMap.size());
+      ConstPoolId2AddrMap[CPI] = Addr;
+    }
+
+    /// getJumpTableBaseAddr - The ARM target inlines all jump tables within
+    /// the text section of the function. This returns the address of the base
+    /// of the jump table of the specified index.
+    intptr_t getJumpTableBaseAddr(unsigned JTI) const {
+      assert(JTI < JumpTableId2AddrMap.size());
+      return JumpTableId2AddrMap[JTI];
+    }
+
+    /// addJumpTableBaseAddr - Map a jump table index to the address where
+    /// the corresponding inline jump table is emitted. When relocations are
+    /// processed, this value will be used to resolve references to the
+    /// jump table.
+    void addJumpTableBaseAddr(unsigned JTI, intptr_t Addr) {
+      assert(JTI < JumpTableId2AddrMap.size());
+      JumpTableId2AddrMap[JTI] = Addr;
+    }
+
+    /// getPCLabelAddr - Retrieve the address of the PC label of the specified id.
+    intptr_t getPCLabelAddr(unsigned Id) const {
+      DenseMap<unsigned, intptr_t>::const_iterator I = PCLabelMap.find(Id);
+      assert(I != PCLabelMap.end());
+      return I->second;
+    }
+
+    /// addPCLabelAddr - Remember the address of the specified PC label.
+    void addPCLabelAddr(unsigned Id, intptr_t Addr) {
+      PCLabelMap.insert(std::make_pair(Id, Addr));
+    }
+
+    /// getIndirectSymAddr - Retrieve the address of the indirect symbol of the
+    /// specified symbol located at address. Returns 0 if the indirect symbol
+    /// has not been emitted.
+    intptr_t getIndirectSymAddr(void *Addr) const {
+      DenseMap<void*,intptr_t>::const_iterator I= Sym2IndirectSymMap.find(Addr);
+      if (I != Sym2IndirectSymMap.end())
+        return I->second;
+      return 0;
+    }
+
+    /// addIndirectSymAddr - Add a mapping from address of an emitted symbol to
+    /// its indirect symbol address.
+    void addIndirectSymAddr(void *SymAddr, intptr_t IndSymAddr) {
+      Sym2IndirectSymMap.insert(std::make_pair(SymAddr, IndSymAddr));
+    }
+
+  private:
+    /// resolveRelocDestAddr - Resolve the resulting address of the relocation
+    /// if it's not already solved. Constantpool entries must be resolved by
+    /// ARM target.
+    intptr_t resolveRelocDestAddr(MachineRelocation *MR) const;
+  };
+}
+
+#endif
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
new file mode 100644
index 0000000..b78b95b
--- /dev/null
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -0,0 +1,1622 @@
+//===-- ARMLoadStoreOptimizer.cpp - ARM load / store opt. pass ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations. This pass should be run after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "arm-ldst-opt"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+// Statistics gathered by this pass (reported with -stats).
+STATISTIC(NumLDMGened , "Number of ldm instructions generated");
+STATISTIC(NumSTMGened , "Number of stm instructions generated");
+STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
+STATISTIC(NumVSTMGened, "Number of vstm instructions generated");
+STATISTIC(NumLdStMoved, "Number of load / store instructions moved");
+STATISTIC(NumLDRDFormed,"Number of ldrd created before allocation");
+STATISTIC(NumSTRDFormed,"Number of strd created before allocation");
+STATISTIC(NumLDRD2LDM,  "Number of ldrd instructions turned back into ldm");
+STATISTIC(NumSTRD2STM,  "Number of strd instructions turned back into stm");
+STATISTIC(NumLDRD2LDR,  "Number of ldrd instructions turned back into ldr's");
+STATISTIC(NumSTRD2STR,  "Number of strd instructions turned back into str's");
+
+/// ARMLoadStoreOpt - Post-register-allocation pass that combines
+/// load / store instructions to form ldm / stm instructions.
+
+namespace {
+  // ARMLoadStoreOpt - The pass object; runs over each function after
+  // register allocation, merging neighboring loads/stores into multiples.
+  struct ARMLoadStoreOpt : public MachineFunctionPass {
+    static char ID;
+    ARMLoadStoreOpt() : MachineFunctionPass(&ID) {}
+
+    const TargetInstrInfo *TII;
+    const TargetRegisterInfo *TRI;
+    ARMFunctionInfo *AFI;
+    RegScavenger *RS;         // Used to find scratch registers for new bases.
+    bool isThumb2;            // Restricts which ldm/stm modes are legal.
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM load / store optimization pass";
+    }
+
+  private:
+    // MemOpQueueEntry - One candidate load/store: its immediate offset,
+    // its ordinal position in the block, the instruction itself, and
+    // whether it has been folded into a merged multiple.
+    // NOTE(review): the ctor takes 'int p' but Position is unsigned —
+    // harmless for non-negative positions, but worth tightening.
+    struct MemOpQueueEntry {
+      int Offset;
+      unsigned Position;
+      MachineBasicBlock::iterator MBBI;
+      bool Merged;
+      MemOpQueueEntry(int o, int p, MachineBasicBlock::iterator i)
+        : Offset(o), Position(p), MBBI(i), Merged(false) {}
+    };
+    typedef SmallVector<MemOpQueueEntry,8> MemOpQueue;
+    typedef MemOpQueue::iterator MemOpQueueIter;
+
+    bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                  int Offset, unsigned Base, bool BaseKill, int Opcode,
+                  ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
+                  DebugLoc dl, SmallVector<std::pair<unsigned, bool>, 8> &Regs);
+    void MergeOpsUpdate(MachineBasicBlock &MBB,
+                        MemOpQueue &MemOps,
+                        unsigned memOpsBegin,
+                        unsigned memOpsEnd,
+                        unsigned insertAfter,
+                        int Offset,
+                        unsigned Base,
+                        bool BaseKill,
+                        int Opcode,
+                        ARMCC::CondCodes Pred,
+                        unsigned PredReg,
+                        unsigned Scratch,
+                        DebugLoc dl,
+                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+    void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
+                      int Opcode, unsigned Size,
+                      ARMCC::CondCodes Pred, unsigned PredReg,
+                      unsigned Scratch, MemOpQueue &MemOps,
+                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+
+    void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
+    bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator &MBBI);
+    bool MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MBBI,
+                                  const TargetInstrInfo *TII,
+                                  bool &Advance,
+                                  MachineBasicBlock::iterator &I);
+    bool MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   bool &Advance,
+                                   MachineBasicBlock::iterator &I);
+    bool LoadStoreMultipleOpti(MachineBasicBlock &MBB);
+    bool MergeReturnIntoLDM(MachineBasicBlock &MBB);
+  };
+  char ARMLoadStoreOpt::ID = 0;
+}
+
+/// getLoadStoreMultipleOpcode - Map a single load/store opcode to its
+/// load/store-multiple counterpart. Note this also bumps the corresponding
+/// statistic as a side effect, so it is expected to be called exactly once
+/// per merged instruction (as MergeOps does).
+static int getLoadStoreMultipleOpcode(int Opcode) {
+  switch (Opcode) {
+  case ARM::LDR:
+    NumLDMGened++;
+    return ARM::LDM;
+  case ARM::STR:
+    NumSTMGened++;
+    return ARM::STM;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    NumLDMGened++;
+    return ARM::t2LDM;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    NumSTMGened++;
+    return ARM::t2STM;
+  case ARM::VLDRS:
+    NumVLDMGened++;
+    return ARM::VLDMS;
+  case ARM::VSTRS:
+    NumVSTMGened++;
+    return ARM::VSTMS;
+  case ARM::VLDRD:
+    NumVLDMGened++;
+    return ARM::VLDMD;
+  case ARM::VSTRD:
+    NumVSTMGened++;
+    return ARM::VSTMD;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+  return 0;  // Not reached; silences compilers unaware of llvm_unreachable.
+}
+
+/// isT2i32Load - Return true if Opc is a Thumb2 32-bit integer load.
+static bool isT2i32Load(unsigned Opc) {
+  return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
+}
+
+/// isi32Load - Return true if Opc is any (ARM or Thumb2) 32-bit integer load.
+static bool isi32Load(unsigned Opc) {
+  return Opc == ARM::LDR || isT2i32Load(Opc);
+}
+
+/// isT2i32Store - Return true if Opc is a Thumb2 32-bit integer store.
+static bool isT2i32Store(unsigned Opc) {
+  return Opc == ARM::t2STRi12 || Opc == ARM::t2STRi8;
+}
+
+/// isi32Store - Return true if Opc is any (ARM or Thumb2) 32-bit integer store.
+static bool isi32Store(unsigned Opc) {
+  return Opc == ARM::STR || isT2i32Store(Opc);
+}
+
+/// MergeOps - Create and insert a LDM or STM with Base as base register and
+/// registers in Regs as the register operands that would be loaded / stored.
+/// It returns true if the transformation is done.
+/// The addressing submode (ia/ib/da/db) is chosen from the starting Offset;
+/// a nonzero offset that fits no submode is folded into a new base register
+/// via an ADD/SUB when profitable.
+bool
+ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MBBI,
+                          int Offset, unsigned Base, bool BaseKill,
+                          int Opcode, ARMCC::CondCodes Pred,
+                          unsigned PredReg, unsigned Scratch, DebugLoc dl,
+                          SmallVector<std::pair<unsigned, bool>, 8> &Regs) {
+  // Only a single register to load / store. Don't bother.
+  unsigned NumRegs = Regs.size();
+  if (NumRegs <= 1)
+    return false;
+
+  ARM_AM::AMSubMode Mode = ARM_AM::ia;
+  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
+  if (isAM4 && Offset == 4) {
+    if (isThumb2)
+      // Thumb2 does not support ldmib / stmib.
+      return false;
+    Mode = ARM_AM::ib;
+  } else if (isAM4 && Offset == -4 * (int)NumRegs + 4) {
+    if (isThumb2)
+      // Thumb2 does not support ldmda / stmda.
+      return false;
+    Mode = ARM_AM::da;
+  } else if (isAM4 && Offset == -4 * (int)NumRegs) {
+    Mode = ARM_AM::db;
+  } else if (Offset != 0) {
+    // If starting offset isn't zero, insert a MI to materialize a new base.
+    // But only do so if it is cost effective, i.e. merging more than two
+    // loads / stores.
+    if (NumRegs <= 2)
+      return false;
+
+    unsigned NewBase;
+    if (isi32Load(Opcode))
+      // If it is a load, then just use one of the destination registers
+      // as the new base.
+      NewBase = Regs[NumRegs-1].first;
+    else {
+      // Use the scratch register as the new base.
+      NewBase = Scratch;
+      if (NewBase == 0)
+        return false;
+    }
+    int BaseOpc = !isThumb2
+      ? ARM::ADDri
+      : ((Base == ARM::SP) ? ARM::t2ADDrSPi : ARM::t2ADDri);
+    if (Offset < 0) {
+      BaseOpc = !isThumb2
+        ? ARM::SUBri
+        : ((Base == ARM::SP) ? ARM::t2SUBrSPi : ARM::t2SUBri);
+      Offset = - Offset;
+    }
+    int ImmedOffset = isThumb2
+      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
+    if (ImmedOffset == -1)
+      // FIXME: Try t2ADDri12 or t2SUBri12?
+      return false;  // Probably not worth it then.
+
+    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
+      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+      .addImm(Pred).addReg(PredReg).addReg(0);
+    Base = NewBase;
+    BaseKill = true;  // New base is always killed right after its use.
+  }
+
+  bool isDPR = Opcode == ARM::VLDRD || Opcode == ARM::VSTRD;
+  bool isDef = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
+  Opcode = getLoadStoreMultipleOpcode(Opcode);
+  // AM4 (integer ldm/stm) encodes the submode directly; AM5 (vldm/vstm)
+  // encodes submode plus a register count (doubled for D registers).
+  MachineInstrBuilder MIB = (isAM4)
+    ? BuildMI(MBB, MBBI, dl, TII->get(Opcode))
+        .addReg(Base, getKillRegState(BaseKill))
+        .addImm(ARM_AM::getAM4ModeImm(Mode)).addImm(Pred).addReg(PredReg)
+    : BuildMI(MBB, MBBI, dl, TII->get(Opcode))
+        .addReg(Base, getKillRegState(BaseKill))
+        .addImm(ARM_AM::getAM5Opc(Mode, false, isDPR ? NumRegs<<1 : NumRegs))
+        .addImm(Pred).addReg(PredReg);
+  MIB.addReg(0); // Add optional writeback (0 for now).
+  for (unsigned i = 0; i != NumRegs; ++i)
+    MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
+                     | getKillRegState(Regs[i].second));
+
+  return true;
+}
+
+// MergeOpsUpdate - call MergeOps and update MemOps and merges accordingly on
+// success. Also fixes up kill flags: a register killed by an unmerged memop
+// that the merged instruction is hoisted past must be killed by the merged
+// instruction instead.
+void ARMLoadStoreOpt::
+MergeOpsUpdate(MachineBasicBlock &MBB,
+               MemOpQueue &memOps,
+               unsigned memOpsBegin,
+               unsigned memOpsEnd,
+               unsigned insertAfter,
+               int Offset,
+               unsigned Base,
+               bool BaseKill,
+               int Opcode,
+               ARMCC::CondCodes Pred,
+               unsigned PredReg,
+               unsigned Scratch,
+               DebugLoc dl,
+               SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
+  // First calculate which of the registers should be killed by the merged
+  // instruction.
+  SmallVector<std::pair<unsigned, bool>, 8> Regs;
+  const unsigned insertPos = memOps[insertAfter].Position;
+  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
+    const MachineOperand &MO = memOps[i].MBBI->getOperand(0);
+    unsigned Reg = MO.getReg();
+    bool isKill = MO.isKill();
+
+    // If we are inserting the merged operation after an unmerged operation that
+    // uses the same register, make sure to transfer any kill flag.
+    for (unsigned j = memOpsEnd, e = memOps.size(); !isKill && j != e; ++j)
+      if (memOps[j].Position<insertPos) {
+        const MachineOperand &MOJ = memOps[j].MBBI->getOperand(0);
+        if (MOJ.getReg() == Reg && MOJ.isKill())
+          isKill = true;
+      }
+
+    Regs.push_back(std::make_pair(Reg, isKill));
+  }
+
+  // Try to do the merge. Insert right after the memop chosen by the caller.
+  MachineBasicBlock::iterator Loc = memOps[insertAfter].MBBI;
+  Loc++;
+  if (!MergeOps(MBB, Loc, Offset, Base, BaseKill, Opcode,
+                Pred, PredReg, Scratch, dl, Regs))
+    return;
+
+  // Merge succeeded, update records.
+  Merges.push_back(prior(Loc));
+  for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
+    // Remove kill flags from any unmerged memops that come before insertPos,
+    // since the kill now belongs to the merged instruction inserted later.
+    if (Regs[i-memOpsBegin].second)
+      for (unsigned j = memOpsEnd, e = memOps.size(); j != e; ++j)
+        if (memOps[j].Position<insertPos) {
+          MachineOperand &MOJ = memOps[j].MBBI->getOperand(0);
+          if (MOJ.getReg() == Regs[i-memOpsBegin].first && MOJ.isKill())
+            MOJ.setIsKill(false);
+        }
+    MBB.erase(memOps[i].MBBI);
+    memOps[i].Merged = true;
+  }
+}
+
/// MergeLDR_STR - Merge a number of load / store instructions into one or more
/// load / store multiple instructions.
///
/// Scans MemOps starting at SIndex for the longest run that can legally form
/// a multiple (consecutive offsets, register numbers in the required order),
/// emits it via MergeOpsUpdate, and recurses on the remainder of the queue.
void
ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
                          unsigned Base, int Opcode, unsigned Size,
                          ARMCC::CondCodes Pred, unsigned PredReg,
                          unsigned Scratch, MemOpQueue &MemOps,
                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
  bool isAM4 = isi32Load(Opcode) || isi32Store(Opcode);
  int Offset = MemOps[SIndex].Offset;
  // SOffset is the starting offset of the run; Offset tracks the running end.
  int SOffset = Offset;
  unsigned insertAfter = SIndex;
  MachineBasicBlock::iterator Loc = MemOps[SIndex].MBBI;
  DebugLoc dl = Loc->getDebugLoc();
  const MachineOperand &PMO = Loc->getOperand(0);
  unsigned PReg = PMO.getReg();
  // Undef operands get UINT_MAX so any real register number can follow them.
  unsigned PRegNum = PMO.isUndef() ? UINT_MAX
    : ARMRegisterInfo::getRegisterNumbering(PReg);

  for (unsigned i = SIndex+1, e = MemOps.size(); i != e; ++i) {
    int NewOffset = MemOps[i].Offset;
    const MachineOperand &MO = MemOps[i].MBBI->getOperand(0);
    unsigned Reg = MO.getReg();
    unsigned RegNum = MO.isUndef() ? UINT_MAX
      : ARMRegisterInfo::getRegisterNumbering(Reg);
    // AM4 - register numbers in ascending order.
    // AM5 - consecutive register numbers in ascending order.
    if (NewOffset == Offset + (int)Size &&
        ((isAM4 && RegNum > PRegNum) || RegNum == PRegNum+1)) {
      Offset += Size;
      PRegNum = RegNum;
    } else {
      // Can't merge this in. Try merge the earlier ones first.
      MergeOpsUpdate(MBB, MemOps, SIndex, i, insertAfter, SOffset,
                     Base, false, Opcode, Pred, PredReg, Scratch, dl, Merges);
      // Then restart the scan from the op that broke the run.
      MergeLDR_STR(MBB, i, Base, Opcode, Size, Pred, PredReg, Scratch,
                   MemOps, Merges);
      return;
    }

    // Track the latest op in program order; the multiple must be inserted
    // after it so all merged values are available.
    if (MemOps[i].Position > MemOps[insertAfter].Position)
      insertAfter = i;
  }

  // The whole remaining queue forms one run; the base is killed only if the
  // first op already killed it.
  bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
  MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                 Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
  return;
}
+
+static inline bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
+                                       unsigned Bytes, unsigned Limit,
+                                       ARMCC::CondCodes Pred, unsigned PredReg){
+  unsigned MyPredReg = 0;
+  if (!MI)
+    return false;
+  if (MI->getOpcode() != ARM::t2SUBri &&
+      MI->getOpcode() != ARM::t2SUBrSPi &&
+      MI->getOpcode() != ARM::t2SUBrSPi12 &&
+      MI->getOpcode() != ARM::tSUBspi &&
+      MI->getOpcode() != ARM::SUBri)
+    return false;
+
+  // Make sure the offset fits in 8 bits.
+  if (Bytes <= 0 || (Limit && Bytes >= Limit))
+    return false;
+
+  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
+  return (MI->getOperand(0).getReg() == Base &&
+          MI->getOperand(1).getReg() == Base &&
+          (MI->getOperand(2).getImm()*Scale) == Bytes &&
+          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
+          MyPredReg == PredReg);
+}
+
+static inline bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
+                                       unsigned Bytes, unsigned Limit,
+                                       ARMCC::CondCodes Pred, unsigned PredReg){
+  unsigned MyPredReg = 0;
+  if (!MI)
+    return false;
+  if (MI->getOpcode() != ARM::t2ADDri &&
+      MI->getOpcode() != ARM::t2ADDrSPi &&
+      MI->getOpcode() != ARM::t2ADDrSPi12 &&
+      MI->getOpcode() != ARM::tADDspi &&
+      MI->getOpcode() != ARM::ADDri)
+    return false;
+
+  if (Bytes <= 0 || (Limit && Bytes >= Limit))
+    // Make sure the offset fits in 8 bits.
+    return false;
+
+  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
+  return (MI->getOperand(0).getReg() == Base &&
+          MI->getOperand(1).getReg() == Base &&
+          (MI->getOperand(2).getImm()*Scale) == Bytes &&
+          llvm::getInstrPredicate(MI, MyPredReg) == Pred &&
+          MyPredReg == PredReg);
+}
+
+static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return 0;
+  case ARM::LDR:
+  case ARM::STR:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return 4;
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return 8;
+  case ARM::LDM:
+  case ARM::STM:
+  case ARM::t2LDM:
+  case ARM::t2STM:
+    return (MI->getNumOperands() - 5) * 4;
+  case ARM::VLDMS:
+  case ARM::VSTMS:
+  case ARM::VLDMD:
+  case ARM::VSTMD:
+    return ARM_AM::getAM5Offset(MI->getOperand(1).getImm()) * 4;
+  }
+}
+
+/// MergeBaseUpdateLSMultiple - Fold proceeding/trailing inc/dec of base
+/// register into the LDM/STM/VLDM{D|S}/VSTM{D|S} op when possible:
+///
+/// stmia rn, <ra, rb, rc>
+/// rn := rn + 4 * 3;
+/// =>
+/// stmia rn!, <ra, rb, rc>
+///
+/// rn := rn - 4 * 3;
+/// ldmia rn, <ra, rb, rc>
+/// =>
+/// ldmdb rn!, <ra, rb, rc>
+bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MBBI,
+                                               bool &Advance,
+                                               MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = MBBI;
+  unsigned Base = MI->getOperand(0).getReg();
+  unsigned Bytes = getLSMultipleTransferSize(MI);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
+  int Opcode = MI->getOpcode();
+  bool isAM4 = Opcode == ARM::LDM || Opcode == ARM::t2LDM ||
+    Opcode == ARM::STM || Opcode == ARM::t2STM;
+
+  if (isAM4) {
+    if (ARM_AM::getAM4WBFlag(MI->getOperand(1).getImm()))
+      return false;
+
+    // Can't use the updating AM4 sub-mode if the base register is also a dest
+    // register. e.g. ldmdb r0!, {r0, r1, r2}. The behavior is undefined.
+    for (unsigned i = 3, e = MI->getNumOperands(); i != e; ++i) {
+      if (MI->getOperand(i).getReg() == Base)
+        return false;
+    }
+
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MI->getOperand(1).getImm());
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::db, true));
+        MI->getOperand(4).setReg(Base);
+        MI->getOperand(4).setIsDef();
+        MBB.erase(PrevMBBI);
+        return true;
+      } else if (Mode == ARM_AM::ib &&
+                 isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(ARM_AM::da, true));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
+        MBB.erase(PrevMBBI);
+        return true;
+      }
+    }
+
+    if (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
+      if ((Mode == ARM_AM::ia || Mode == ARM_AM::ib) &&
+          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
+        if (NextMBBI == I) {
+          Advance = true;
+          ++I;
+        }
+        MBB.erase(NextMBBI);
+        return true;
+      } else if ((Mode == ARM_AM::da || Mode == ARM_AM::db) &&
+                 isMatchingDecrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM4ModeImm(Mode, true));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
+        if (NextMBBI == I) {
+          Advance = true;
+          ++I;
+        }
+        MBB.erase(NextMBBI);
+        return true;
+      }
+    }
+  } else {
+    // VLDM{D|S}, VSTM{D|S} addressing mode 5 ops.
+    if (ARM_AM::getAM5WBFlag(MI->getOperand(1).getImm()))
+      return false;
+
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MI->getOperand(1).getImm());
+    unsigned Offset = ARM_AM::getAM5Offset(MI->getOperand(1).getImm());
+    if (MBBI != MBB.begin()) {
+      MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingDecrement(PrevMBBI, Base, Bytes, 0, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::db, true, Offset));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
+        MBB.erase(PrevMBBI);
+        return true;
+      }
+    }
+
+    if (MBBI != MBB.end()) {
+      MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
+      if (Mode == ARM_AM::ia &&
+          isMatchingIncrement(NextMBBI, Base, Bytes, 0, Pred, PredReg)) {
+        MI->getOperand(1).setImm(ARM_AM::getAM5Opc(ARM_AM::ia, true, Offset));
+        MI->getOperand(4).setReg(Base);  // WB to base
+        MI->getOperand(4).setIsDef();
+        if (NextMBBI == I) {
+          Advance = true;
+          ++I;
+        }
+        MBB.erase(NextMBBI);
+      }
+      return true;
+    }
+  }
+
+  return false;
+}
+
+static unsigned getPreIndexedLoadStoreOpcode(unsigned Opc) {
+  switch (Opc) {
+  case ARM::LDR: return ARM::LDR_PRE;
+  case ARM::STR: return ARM::STR_PRE;
+  case ARM::VLDRS: return ARM::VLDMS;
+  case ARM::VLDRD: return ARM::VLDMD;
+  case ARM::VSTRS: return ARM::VSTMS;
+  case ARM::VSTRD: return ARM::VSTMD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_PRE;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_PRE;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+  return 0;
+}
+
+static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc) {
+  switch (Opc) {
+  case ARM::LDR: return ARM::LDR_POST;
+  case ARM::STR: return ARM::STR_POST;
+  case ARM::VLDRS: return ARM::VLDMS;
+  case ARM::VLDRD: return ARM::VLDMD;
+  case ARM::VSTRS: return ARM::VSTMS;
+  case ARM::VSTRD: return ARM::VSTMD;
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+    return ARM::t2LDR_POST;
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return ARM::t2STR_POST;
+  default: llvm_unreachable("Unhandled opcode!");
+  }
+  return 0;
+}
+
/// MergeBaseUpdateLoadStore - Fold proceeding/trailing inc/dec of base
/// register into the LDR/STR/FLD{D|S}/FST{D|S} op when possible:
/// Returns true iff the single load/store was replaced by a pre/post-indexed
/// form and the matching add/sub erased. Advance/I are updated when the
/// instruction the caller's iterator points at is erased.
bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MBBI,
                                               const TargetInstrInfo *TII,
                                               bool &Advance,
                                               MachineBasicBlock::iterator &I) {
  MachineInstr *MI = MBBI;
  unsigned Base = MI->getOperand(1).getReg();
  bool BaseKill = MI->getOperand(1).isKill();
  unsigned Bytes = getLSMultipleTransferSize(MI);
  int Opcode = MI->getOpcode();
  DebugLoc dl = MI->getDebugLoc();
  bool isAM5 = Opcode == ARM::VLDRD || Opcode == ARM::VLDRS ||
    Opcode == ARM::VSTRD || Opcode == ARM::VSTRS;
  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
  // Only zero-offset forms can absorb a base update.
  if (isAM2 && ARM_AM::getAM2Offset(MI->getOperand(3).getImm()) != 0)
    return false;
  else if (isAM5 && ARM_AM::getAM5Offset(MI->getOperand(2).getImm()) != 0)
    return false;
  else if (isT2i32Load(Opcode) || isT2i32Store(Opcode))
    if (MI->getOperand(2).getImm() != 0)
      return false;

  bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD;
  // Can't do the merge if the destination register is the same as the would-be
  // writeback register.
  if (isLd && MI->getOperand(0).getReg() == Base)
    return false;

  unsigned PredReg = 0;
  ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);
  bool DoMerge = false;
  ARM_AM::AddrOpc AddSub = ARM_AM::add;
  unsigned NewOpc = 0;
  // AM2 - 12 bits, thumb2 - 8 bits.
  unsigned Limit = isAM5 ? 0 : (isAM2 ? 0x1000 : 0x100);
  // A preceding add/sub becomes a pre-indexed access.
  if (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PrevMBBI = prior(MBBI);
    if (isMatchingDecrement(PrevMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    } else if (!isAM5 &&
               isMatchingIncrement(PrevMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPreIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge)
      MBB.erase(PrevMBBI);
  }

  // Otherwise a trailing add/sub becomes a post-indexed access.
  if (!DoMerge && MBBI != MBB.end()) {
    MachineBasicBlock::iterator NextMBBI = llvm::next(MBBI);
    if (!isAM5 &&
        isMatchingDecrement(NextMBBI, Base, Bytes, Limit, Pred, PredReg)) {
      DoMerge = true;
      AddSub = ARM_AM::sub;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    } else if (isMatchingIncrement(NextMBBI, Base, Bytes, Limit,Pred,PredReg)) {
      DoMerge = true;
      NewOpc = getPostIndexedLoadStoreOpcode(Opcode);
    }
    if (DoMerge) {
      // Erasing the instruction the caller iterates at: advance past it.
      if (NextMBBI == I) {
        Advance = true;
        ++I;
      }
      MBB.erase(NextMBBI);
    }
  }

  if (!DoMerge)
    return false;

  // Build the new offset/mode immediate for the chosen opcode family.
  bool isDPR = NewOpc == ARM::VLDMD || NewOpc == ARM::VSTMD;
  unsigned Offset = 0;
  if (isAM5)
    // AM5 fold uses a one-register writeback VLDM/VSTM (2 words for DPR).
    Offset = ARM_AM::getAM5Opc((AddSub == ARM_AM::sub)
                               ? ARM_AM::db
                               : ARM_AM::ia, true, (isDPR ? 2 : 1));
  else if (isAM2)
    Offset = ARM_AM::getAM2Opc(AddSub, Bytes, ARM_AM::no_shift);
  else
    // Thumb2 takes a plain signed immediate.
    Offset = AddSub == ARM_AM::sub ? -Bytes : Bytes;
  if (isLd) {
    if (isAM5)
      // VLDMS, VLDMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc))
        .addReg(Base, getKillRegState(BaseKill))
        .addImm(Offset).addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MI->getOperand(0).getReg(), RegState::Define);
    else if (isAM2)
      // LDR_PRE, LDR_POST,
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2LDR_PRE, t2LDR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), MI->getOperand(0).getReg())
        .addReg(Base, RegState::Define)
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  } else {
    MachineOperand &MO = MI->getOperand(0);
    if (isAM5)
      // VSTMS, VSTMD
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc)).addReg(Base).addImm(Offset)
        .addImm(Pred).addReg(PredReg)
        .addReg(Base, getDefRegState(true)) // WB base register
        .addReg(MO.getReg(), getKillRegState(MO.isKill()));
    else if (isAM2)
      // STR_PRE, STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addReg(0).addImm(Offset).addImm(Pred).addReg(PredReg);
    else
      // t2STR_PRE, t2STR_POST
      BuildMI(MBB, MBBI, dl, TII->get(NewOpc), Base)
        .addReg(MO.getReg(), getKillRegState(MO.isKill()))
        .addReg(Base).addImm(Offset).addImm(Pred).addReg(PredReg);
  }
  // Remove the original (now superseded) load/store.
  MBB.erase(MBBI);

  return true;
}
+
+/// isMemoryOp - Returns true if instruction is a memory operations (that this
+/// pass is capable of operating on).
+static bool isMemoryOp(const MachineInstr *MI) {
+  if (MI->hasOneMemOperand()) {
+    const MachineMemOperand *MMO = *MI->memoperands_begin();
+
+    // Don't touch volatile memory accesses - we may be changing their order.
+    if (MMO->isVolatile())
+      return false;
+
+    // Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is not.
+    if (MMO->getAlignment() < 4)
+      return false;
+  }
+
+  int Opcode = MI->getOpcode();
+  switch (Opcode) {
+  default: break;
+  case ARM::LDR:
+  case ARM::STR:
+    return MI->getOperand(1).isReg() && MI->getOperand(2).getReg() == 0;
+  case ARM::VLDRS:
+  case ARM::VSTRS:
+    return MI->getOperand(1).isReg();
+  case ARM::VLDRD:
+  case ARM::VSTRD:
+    return MI->getOperand(1).isReg();
+  case ARM::t2LDRi8:
+  case ARM::t2LDRi12:
+  case ARM::t2STRi8:
+  case ARM::t2STRi12:
+    return MI->getOperand(1).isReg();
+  }
+  return false;
+}
+
+/// AdvanceRS - Advance register scavenger to just before the earliest memory
+/// op that is being merged.
+void ARMLoadStoreOpt::AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps) {
+  MachineBasicBlock::iterator Loc = MemOps[0].MBBI;
+  unsigned Position = MemOps[0].Position;
+  for (unsigned i = 1, e = MemOps.size(); i != e; ++i) {
+    if (MemOps[i].Position < Position) {
+      Position = MemOps[i].Position;
+      Loc = MemOps[i].MBBI;
+    }
+  }
+
+  if (Loc != MBB.begin())
+    RS->forward(prior(Loc));
+}
+
+static int getMemoryOpOffset(const MachineInstr *MI) {
+  int Opcode = MI->getOpcode();
+  bool isAM2 = Opcode == ARM::LDR || Opcode == ARM::STR;
+  bool isAM3 = Opcode == ARM::LDRD || Opcode == ARM::STRD;
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  unsigned OffField = MI->getOperand(NumOperands-3).getImm();
+
+  if (Opcode == ARM::t2LDRi12 || Opcode == ARM::t2LDRi8 ||
+      Opcode == ARM::t2STRi12 || Opcode == ARM::t2STRi8 ||
+      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8)
+    return OffField;
+
+  int Offset = isAM2
+    ? ARM_AM::getAM2Offset(OffField)
+    : (isAM3 ? ARM_AM::getAM3Offset(OffField)
+             : ARM_AM::getAM5Offset(OffField) * 4);
+  if (isAM2) {
+    if (ARM_AM::getAM2Op(OffField) == ARM_AM::sub)
+      Offset = -Offset;
+  } else if (isAM3) {
+    if (ARM_AM::getAM3Op(OffField) == ARM_AM::sub)
+      Offset = -Offset;
+  } else {
+    if (ARM_AM::getAM5Op(OffField) == ARM_AM::sub)
+      Offset = -Offset;
+  }
+  return Offset;
+}
+
+static void InsertLDR_STR(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator &MBBI,
+                          int OffImm, bool isDef,
+                          DebugLoc dl, unsigned NewOpc,
+                          unsigned Reg, bool RegDeadKill, bool RegUndef,
+                          unsigned BaseReg, bool BaseKill, bool BaseUndef,
+                          unsigned OffReg, bool OffKill, bool OffUndef,
+                          ARMCC::CondCodes Pred, unsigned PredReg,
+                          const TargetInstrInfo *TII, bool isT2) {
+  int Offset = OffImm;
+  if (!isT2) {
+    if (OffImm < 0)
+      Offset = ARM_AM::getAM2Opc(ARM_AM::sub, -OffImm, ARM_AM::no_shift);
+    else
+      Offset = ARM_AM::getAM2Opc(ARM_AM::add, OffImm, ARM_AM::no_shift);
+  }
+  if (isDef) {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getDefRegState(true) | getDeadRegState(RegDeadKill))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    if (!isT2)
+      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  } else {
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+                                      TII->get(NewOpc))
+      .addReg(Reg, getKillRegState(RegDeadKill) | getUndefRegState(RegUndef))
+      .addReg(BaseReg, getKillRegState(BaseKill)|getUndefRegState(BaseUndef));
+    if (!isT2)
+      MIB.addReg(OffReg,  getKillRegState(OffKill)|getUndefRegState(OffUndef));
+    MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+  }
+}
+
/// FixInvalidRegPairOp - LDRD/STRD require an (even, odd) consecutive register
/// pair. When register allocation produced a pair that cannot be encoded,
/// rewrite the instruction as an LDM/STM (ascending regs, zero offset) or as
/// two single loads/stores. MBBI is repositioned to the first replacement
/// instruction when a rewrite happens.
bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator &MBBI) {
  MachineInstr *MI = &*MBBI;
  unsigned Opcode = MI->getOpcode();
  if (Opcode == ARM::LDRD || Opcode == ARM::STRD ||
      Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8) {
    unsigned EvenReg = MI->getOperand(0).getReg();
    unsigned OddReg  = MI->getOperand(1).getReg();
    unsigned EvenRegNum = TRI->getDwarfRegNum(EvenReg, false);
    unsigned OddRegNum  = TRI->getDwarfRegNum(OddReg, false);
    // A legal even/odd consecutive pair needs no fixing.
    if ((EvenRegNum & 1) == 0 && (EvenRegNum + 1) == OddRegNum)
      return false;

    bool isT2 = Opcode == ARM::t2LDRDi8 || Opcode == ARM::t2STRDi8;
    bool isLd = Opcode == ARM::LDRD || Opcode == ARM::t2LDRDi8;
    // For loads the regs are defs (dead flag); for stores they are uses (kill).
    bool EvenDeadKill = isLd ?
      MI->getOperand(0).isDead() : MI->getOperand(0).isKill();
    bool EvenUndef = MI->getOperand(0).isUndef();
    bool OddDeadKill  = isLd ?
      MI->getOperand(1).isDead() : MI->getOperand(1).isKill();
    bool OddUndef = MI->getOperand(1).isUndef();
    const MachineOperand &BaseOp = MI->getOperand(2);
    unsigned BaseReg = BaseOp.getReg();
    bool BaseKill = BaseOp.isKill();
    bool BaseUndef = BaseOp.isUndef();
    // Thumb2 LDRD/STRD have no offset register operand.
    unsigned OffReg = isT2 ? 0 : MI->getOperand(3).getReg();
    bool OffKill = isT2 ? false : MI->getOperand(3).isKill();
    bool OffUndef = isT2 ? false : MI->getOperand(3).isUndef();
    int OffImm = getMemoryOpOffset(MI);
    unsigned PredReg = 0;
    ARMCC::CondCodes Pred = llvm::getInstrPredicate(MI, PredReg);

    if (OddRegNum > EvenRegNum && OffReg == 0 && OffImm == 0) {
      // Ascending register numbers and no offset. It's safe to change it to a
      // ldm or stm.
      unsigned NewOpc = (isLd)
        ? (isT2 ? ARM::t2LDM : ARM::LDM)
        : (isT2 ? ARM::t2STM : ARM::STM);
      if (isLd) {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg, getDefRegState(isLd) | getDeadRegState(EvenDeadKill))
          .addReg(OddReg,  getDefRegState(isLd) | getDeadRegState(OddDeadKill));
        ++NumLDRD2LDM;
      } else {
        BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(NewOpc))
          .addReg(BaseReg, getKillRegState(BaseKill))
          .addImm(ARM_AM::getAM4ModeImm(ARM_AM::ia))
          .addImm(Pred).addReg(PredReg)
          .addReg(0)
          .addReg(EvenReg,
                  getKillRegState(EvenDeadKill) | getUndefRegState(EvenUndef))
          .addReg(OddReg,
                  getKillRegState(OddDeadKill)  | getUndefRegState(OddUndef));
        ++NumSTRD2STM;
      }
    } else {
      // Split into two instructions.
      assert((!isT2 || !OffReg) &&
             "Thumb2 ldrd / strd does not encode offset register!");
      unsigned NewOpc = (isLd)
        ? (isT2 ? (OffImm < 0 ? ARM::t2LDRi8 : ARM::t2LDRi12) : ARM::LDR)
        : (isT2 ? (OffImm < 0 ? ARM::t2STRi8 : ARM::t2STRi12) : ARM::STR);
      DebugLoc dl = MBBI->getDebugLoc();
      // If this is a load and base register is killed, it may have been
      // re-defed by the load, make sure the first load does not clobber it.
      if (isLd &&
          (BaseKill || OffKill) &&
          (TRI->regsOverlap(EvenReg, BaseReg) ||
           (OffReg && TRI->regsOverlap(EvenReg, OffReg)))) {
        // The odd load must go first so the even one (which clobbers
        // base/offset) runs last; the odd reg must not also overlap.
        assert(!TRI->regsOverlap(OddReg, BaseReg) &&
               (!OffReg || !TRI->regsOverlap(OddReg, OffReg)));
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, false,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, false,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      } else {
        if (OddReg == EvenReg && EvenDeadKill) {
          // If the two source operands are the same, the kill marker is
          // probably on the first one. e.g.
          // t2STRDi8 %R5<kill>, %R5, %R9<kill>, 0, 14, %reg0
          EvenDeadKill = false;
          OddDeadKill = true;
        }
        InsertLDR_STR(MBB, MBBI, OffImm, isLd, dl, NewOpc,
                      EvenReg, EvenDeadKill, EvenUndef,
                      BaseReg, false, BaseUndef, OffReg, false, OffUndef,
                      Pred, PredReg, TII, isT2);
        InsertLDR_STR(MBB, MBBI, OffImm+4, isLd, dl, NewOpc,
                      OddReg, OddDeadKill, OddUndef,
                      BaseReg, BaseKill, BaseUndef, OffReg, OffKill, OffUndef,
                      Pred, PredReg, TII, isT2);
      }
      if (isLd)
        ++NumLDRD2LDR;
      else
        ++NumSTRD2STR;
    }

    // Point MBBI at the last inserted replacement, then drop the original.
    MBBI = prior(MBBI);
    MBB.erase(MI);
  }
  return false;
}
+
/// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR
/// ops of the same base and incrementing offset into LDM / STM ops.
///
/// Walks the block accumulating a queue of compatible memops (same base,
/// opcode and predicate, strictly increasing offsets), and flushes the queue
/// into LDM/STM (plus base-update folds) whenever the chain is broken.
bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
  unsigned NumMerges = 0;
  unsigned NumMemOps = 0;
  MemOpQueue MemOps;
  // Current chain state: base register, opcode, access size, predicate.
  unsigned CurrBase = 0;
  int CurrOpc = -1;
  unsigned CurrSize = 0;
  ARMCC::CondCodes CurrPred = ARMCC::AL;
  unsigned CurrPredReg = 0;
  unsigned Position = 0;  // Program-order index of MBBI within the block.
  SmallVector<MachineBasicBlock::iterator,4> Merges;

  RS->enterBasicBlock(&MBB);
  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
  while (MBBI != E) {
    if (FixInvalidRegPairOp(MBB, MBBI))
      continue;

    bool Advance  = false;
    bool TryMerge = false;
    bool Clobber  = false;

    bool isMemOp = isMemoryOp(MBBI);
    if (isMemOp) {
      int Opcode = MBBI->getOpcode();
      unsigned Size = getLSMultipleTransferSize(MBBI);
      unsigned Base = MBBI->getOperand(1).getReg();
      unsigned PredReg = 0;
      ARMCC::CondCodes Pred = llvm::getInstrPredicate(MBBI, PredReg);
      int Offset = getMemoryOpOffset(MBBI);
      // Watch out for:
      // r4 := ldr [r5]
      // r5 := ldr [r5, #4]
      // r6 := ldr [r5, #8]
      //
      // The second ldr has effectively broken the chain even though it
      // looks like the later ldr(s) use the same base register. Try to
      // merge the ldr's so far, including this one. But don't try to
      // combine the following ldr(s).
      Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg());
      if (CurrBase == 0 && !Clobber) {
        // Start of a new chain.
        CurrBase = Base;
        CurrOpc  = Opcode;
        CurrSize = Size;
        CurrPred = Pred;
        CurrPredReg = PredReg;
        MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
        NumMemOps++;
        Advance = true;
      } else {
        if (Clobber) {
          TryMerge = true;
          Advance = true;
        }

        if (CurrOpc == Opcode && CurrBase == Base && CurrPred == Pred) {
          // No need to match PredReg.
          // Continue adding to the queue.
          if (Offset > MemOps.back().Offset) {
            MemOps.push_back(MemOpQueueEntry(Offset, Position, MBBI));
            NumMemOps++;
            Advance = true;
          } else {
            // Out-of-order offset: insert sorted, unless the slot is taken.
            for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end();
                 I != E; ++I) {
              if (Offset < I->Offset) {
                MemOps.insert(I, MemOpQueueEntry(Offset, Position, MBBI));
                NumMemOps++;
                Advance = true;
                break;
              } else if (Offset == I->Offset) {
                // Collision! This can't be merged!
                break;
              }
            }
          }
        }
      }
    }

    if (Advance) {
      ++Position;
      ++MBBI;
      if (MBBI == E)
        // Reach the end of the block, try merging the memory instructions.
        TryMerge = true;
    } else
      TryMerge = true;

    if (TryMerge) {
      if (NumMemOps > 1) {
        // Try to find a free register to use as a new base in case it's
        // needed. First advance to the instruction just before the start of
        // the chain.
        AdvanceRS(MBB, MemOps);
        // Find a scratch register.
        unsigned Scratch = RS->FindUnusedReg(ARM::GPRRegisterClass);
        // Process the load / store instructions.
        RS->forward(prior(MBBI));

        // Merge ops.
        Merges.clear();
        MergeLDR_STR(MBB, 0, CurrBase, CurrOpc, CurrSize,
                     CurrPred, CurrPredReg, Scratch, MemOps, Merges);

        // Try folding preceding/trailing base inc/dec into the generated
        // LDM/STM ops.
        for (unsigned i = 0, e = Merges.size(); i < e; ++i)
          if (MergeBaseUpdateLSMultiple(MBB, Merges[i], Advance, MBBI))
            ++NumMerges;
        NumMerges += Merges.size();

        // Try folding preceding/trailing base inc/dec into those load/store
        // that were not merged to form LDM/STM ops.
        for (unsigned i = 0; i != NumMemOps; ++i)
          if (!MemOps[i].Merged)
            if (MergeBaseUpdateLoadStore(MBB, MemOps[i].MBBI, TII,Advance,MBBI))
              ++NumMerges;

        // RS may be pointing to an instruction that's deleted.
        RS->skipTo(prior(MBBI));
      } else if (NumMemOps == 1) {
        // Try folding preceding/trailing base inc/dec into the single
        // load/store.
        if (MergeBaseUpdateLoadStore(MBB, MemOps[0].MBBI, TII, Advance, MBBI)) {
          ++NumMerges;
          RS->forward(prior(MBBI));
        }
      }

      // Reset chain state for the next run.
      CurrBase = 0;
      CurrOpc = -1;
      CurrSize = 0;
      CurrPred = ARMCC::AL;
      CurrPredReg = 0;
      if (NumMemOps) {
        MemOps.clear();
        NumMemOps = 0;
      }

      // If iterator hasn't been advanced and this is not a memory op, skip it.
      // It can't start a new chain anyway.
      if (!Advance && !isMemOp && MBBI != E) {
        ++Position;
        ++MBBI;
      }
    }
  }
  return NumMerges > 0;
}
+
+namespace {
+  /// OffsetCompare - Strict weak ordering that sorts memory-access
+  /// instructions by their immediate offset, largest offset first
+  /// (i.e. descending order).
+  struct OffsetCompare {
+    bool operator()(const MachineInstr *LHS, const MachineInstr *RHS) const {
+      int LOff = getMemoryOpOffset(LHS);
+      int ROff = getMemoryOpOffset(RHS);
+      // Distinct instructions in the list must have distinct offsets.
+      assert(LHS == RHS || LOff != ROff);
+      return ROff < LOff;
+    }
+  };
+}
+
+/// MergeReturnIntoLDM - If this is an exit BB, try merging the return op
+/// (bx lr) into the preceding stack restore so it directly restores the value
+/// of LR into pc.
+///   ldmfd sp!, {r7, lr}
+///   bx lr
+/// =>
+///   ldmfd sp!, {r7, pc}
+bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  if (MBB.empty()) return false;
+
+  // Look at the last instruction of the block; it must be a return, and the
+  // instruction just before it must be an LDM that pops LR.
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  if (MBBI != MBB.begin() &&
+      (MBBI->getOpcode() == ARM::BX_RET || MBBI->getOpcode() == ARM::tBX_RET)) {
+    MachineInstr *PrevMI = prior(MBBI);
+    if (PrevMI->getOpcode() == ARM::LDM || PrevMI->getOpcode() == ARM::t2LDM) {
+      // The register list trails the other operands, so the last operand is
+      // the highest register loaded; it must be LR for this rewrite.
+      MachineOperand &MO = PrevMI->getOperand(PrevMI->getNumOperands()-1);
+      if (MO.getReg() != ARM::LR)
+        return false;
+      // Switch the LDM to its return form, load PC instead of LR, and delete
+      // the now-redundant bx lr.
+      unsigned NewOpc = isThumb2 ? ARM::t2LDM_RET : ARM::LDM_RET;
+      PrevMI->setDesc(TII->get(NewOpc));
+      MO.setReg(ARM::PC);
+      MBB.erase(MBBI);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// runOnMachineFunction - Pass entry point: run load/store-multiple merging
+/// and return merging over every basic block of the function.
+bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  // Cache target hooks and per-function state used by the merging routines.
+  const TargetMachine &TM = Fn.getTarget();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  // The scavenger lives for the duration of this run only; freed below.
+  RS = new RegScavenger();
+  isThumb2 = AFI->isThumb2Function();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= LoadStoreMultipleOpti(MBB);
+    Modified |= MergeReturnIntoLDM(MBB);
+  }
+
+  delete RS;
+  return Modified;
+}
+
+
+/// ARMPreAllocLoadStoreOpt - Pre-register-allocation pass that moves
+/// loads / stores from consecutive locations close together to make it more
+/// likely they will be combined later.
+
+namespace {
+  struct ARMPreAllocLoadStoreOpt : public MachineFunctionPass{
+    static char ID;
+    ARMPreAllocLoadStoreOpt() : MachineFunctionPass(&ID) {}
+
+    const TargetData *TD;           // Target data layout information.
+    const TargetInstrInfo *TII;     // Target instruction information.
+    const TargetRegisterInfo *TRI;  // Target register information.
+    const ARMSubtarget *STI;        // Subtarget feature queries (e.g. V5TE).
+    MachineRegisterInfo *MRI;       // Used to set register-allocation hints.
+    MachineFunction *MF;            // The function currently being processed.
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "ARM pre- register allocation load / store optimization pass";
+    }
+
+  private:
+    // CanFormLdStDWord - Check whether two loads / stores can be combined
+    // into a single LDRD / STRD, filling in the operands of the new
+    // instruction on success.
+    bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
+                          unsigned &NewOpc, unsigned &EvenReg,
+                          unsigned &OddReg, unsigned &BaseReg,
+                          unsigned &OffReg, int &Offset,
+                          unsigned &PredReg, ARMCC::CondCodes &Pred,
+                          bool &isT2);
+    // RescheduleOps - Move the given loads / stores (all off the same base
+    // register) next to each other when safe and profitable.
+    bool RescheduleOps(MachineBasicBlock *MBB,
+                       SmallVector<MachineInstr*, 4> &Ops,
+                       unsigned Base, bool isLd,
+                       DenseMap<MachineInstr*, unsigned> &MI2LocMap);
+    // RescheduleLoadStoreInstrs - Scan one block and reschedule its loads
+    // and stores grouped by base register.
+    bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+  };
+  char ARMPreAllocLoadStoreOpt::ID = 0;
+}
+
+/// runOnMachineFunction - Pass entry point: cache target hooks, then
+/// reschedule loads / stores in every basic block of the function.
+bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  TD  = TM.getTargetData();
+  TII = TM.getInstrInfo();
+  TRI = TM.getRegisterInfo();
+  STI = &TM.getSubtarget<ARMSubtarget>();
+  MRI = &Fn.getRegInfo();
+  MF  = &Fn;
+
+  bool Changed = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), MFE = Fn.end(); MFI != MFE;
+       ++MFI)
+    Changed |= RescheduleLoadStoreInstrs(MFI);
+
+  return Changed;
+}
+
+/// IsSafeAndProfitableToMove - Determine whether the memory operations in
+/// MemOps may be moved next to each other (scanning the instructions strictly
+/// between I and E) without changing behavior, and whether the move is
+/// unlikely to raise register pressure too much.  MemRegs holds the data
+/// registers of the memory ops themselves.
+static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
+                                      MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator E,
+                                      SmallPtrSet<MachineInstr*, 4> &MemOps,
+                                      SmallSet<unsigned, 4> &MemRegs,
+                                      const TargetRegisterInfo *TRI) {
+  // Are there stores / loads / calls between them?
+  // FIXME: This is overly conservative. We should make use of alias information
+  // some day.
+  SmallSet<unsigned, 4> AddedRegPressure;
+  while (++I != E) {
+    // Instructions that are part of the group being moved are fine to cross.
+    if (MemOps.count(&*I))
+      continue;
+    const TargetInstrDesc &TID = I->getDesc();
+    if (TID.isCall() || TID.isTerminator() || TID.hasUnmodeledSideEffects())
+      return false;
+    // Loads must not be moved across intervening stores.
+    if (isLd && TID.mayStore())
+      return false;
+    if (!isLd) {
+      if (TID.mayLoad())
+        return false;
+      // It's not safe to move the first 'str' down.
+      // str r1, [r0]
+      // strh r5, [r0]
+      // str r4, [r0, #+4]
+      if (TID.mayStore())
+        return false;
+    }
+    for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
+      MachineOperand &MO = I->getOperand(j);
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      // Crossing a redefinition of the base register is unsafe.
+      if (MO.isDef() && TRI->regsOverlap(Reg, Base))
+        return false;
+      // Any other register touched in between may stay live longer after
+      // the move; track it as added pressure.
+      if (Reg != Base && !MemRegs.count(Reg))
+        AddedRegPressure.insert(Reg);
+    }
+  }
+
+  // Estimate register pressure increase due to the transformation.
+  if (MemRegs.size() <= 4)
+    // Ok if we are moving small number of instructions.
+    return true;
+  return AddedRegPressure.size() <= MemRegs.size() * 2;
+}
+
+/// CanFormLdStDWord - Check whether the two loads / stores Op0 and Op1 can be
+/// combined into a single LDRD / STRD (or their Thumb2 forms).  On success
+/// fills in the opcode and operand values for the combined instruction and
+/// returns true.
+bool
+ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
+                                          DebugLoc &dl,
+                                          unsigned &NewOpc, unsigned &EvenReg,
+                                          unsigned &OddReg, unsigned &BaseReg,
+                                          unsigned &OffReg, int &Offset,
+                                          unsigned &PredReg,
+                                          ARMCC::CondCodes &Pred,
+                                          bool &isT2) {
+  // Make sure we're allowed to generate LDRD/STRD.
+  if (!STI->hasV5TEOps())
+    return false;
+
+  // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
+  unsigned Scale = 1;
+  unsigned Opcode = Op0->getOpcode();
+  if (Opcode == ARM::LDR)
+    NewOpc = ARM::LDRD;
+  else if (Opcode == ARM::STR)
+    NewOpc = ARM::STRD;
+  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+    NewOpc = ARM::t2LDRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else if (Opcode == ARM::t2STRi8 || Opcode == ARM::t2STRi12) {
+    NewOpc = ARM::t2STRDi8;
+    Scale = 4;
+    isT2 = true;
+  } else
+    return false;
+
+  // Make sure the offset registers match.
+  if (!isT2 &&
+      (Op0->getOperand(2).getReg() != Op1->getOperand(2).getReg()))
+      return false;
+
+  // Make sure the base address satisfies i64 ld / st alignment requirement.
+  if (!Op0->hasOneMemOperand() ||
+      !(*Op0->memoperands_begin())->getValue() ||
+      (*Op0->memoperands_begin())->isVolatile())
+    return false;
+
+  unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+  Function *Func = MF->getFunction();
+  unsigned ReqAlign = STI->hasV6Ops()
+    ? TD->getPrefTypeAlignment(Type::getInt64Ty(Func->getContext()))
+    : 8;  // Pre-v6 need 8-byte align
+  if (Align < ReqAlign)
+    return false;
+
+  // Then make sure the immediate offset fits.
+  int OffImm = getMemoryOpOffset(Op0);
+  if (isT2) {
+    if (OffImm < 0) {
+      if (OffImm < -255)
+        // Can't fall back to t2LDRi8 / t2STRi8.
+        return false;
+    } else {
+      int Limit = (1 << 8) * Scale;
+      if (OffImm >= Limit || (OffImm & (Scale-1)))
+        return false;
+    }
+    Offset = OffImm;
+  } else {
+    // ARM mode: encode sign and magnitude in addrmode3 form.
+    ARM_AM::AddrOpc AddSub = ARM_AM::add;
+    if (OffImm < 0) {
+      AddSub = ARM_AM::sub;
+      OffImm = - OffImm;
+    }
+    int Limit = (1 << 8) * Scale;
+    if (OffImm >= Limit || (OffImm & (Scale-1)))
+      return false;
+    Offset = ARM_AM::getAM3Opc(AddSub, OffImm);
+  }
+  // LDRD/STRD need two distinct data registers.
+  EvenReg = Op0->getOperand(0).getReg();
+  OddReg  = Op1->getOperand(0).getReg();
+  if (EvenReg == OddReg)
+    return false;
+  BaseReg = Op0->getOperand(1).getReg();
+  if (!isT2)
+    OffReg = Op0->getOperand(2).getReg();
+  Pred = llvm::getInstrPredicate(Op0, PredReg);
+  dl = Op0->getDebugLoc();
+  return true;
+}
+
+/// RescheduleOps - Try to move runs of loads (or stores) off the same base
+/// register next to each other so later passes can combine them, forming an
+/// LDRD / STRD directly when exactly two ops qualify.  Returns true if any
+/// instruction was moved or rewritten.
+bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
+                                 SmallVector<MachineInstr*, 4> &Ops,
+                                 unsigned Base, bool isLd,
+                                 DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
+  bool RetVal = false;
+
+  // Sort by offset (in reverse order).
+  std::sort(Ops.begin(), Ops.end(), OffsetCompare());
+
+  // The loads / stores of the same base are in order. Scan them from first to
+  // last and check for the following:
+  // 1. Any def of base.
+  // 2. Any gaps.
+  while (Ops.size() > 1) {
+    unsigned FirstLoc = ~0U;
+    unsigned LastLoc = 0;
+    MachineInstr *FirstOp = 0;
+    MachineInstr *LastOp = 0;
+    int LastOffset = 0;
+    unsigned LastOpcode = 0;
+    unsigned LastBytes = 0;
+    unsigned NumMove = 0;
+    // Scan from the lowest offset (back of the sorted vector) upward for a
+    // run of same-opcode ops at consecutive offsets.
+    for (int i = Ops.size() - 1; i >= 0; --i) {
+      MachineInstr *Op = Ops[i];
+      unsigned Loc = MI2LocMap[Op];
+      if (Loc <= FirstLoc) {
+        FirstLoc = Loc;
+        FirstOp = Op;
+      }
+      if (Loc >= LastLoc) {
+        LastLoc = Loc;
+        LastOp = Op;
+      }
+
+      unsigned Opcode = Op->getOpcode();
+      if (LastOpcode && Opcode != LastOpcode)
+        break;
+
+      // Stop at a gap or a size change between adjacent accesses.
+      int Offset = getMemoryOpOffset(Op);
+      unsigned Bytes = getLSMultipleTransferSize(Op);
+      if (LastBytes) {
+        if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
+          break;
+      }
+      LastOffset = Offset;
+      LastBytes = Bytes;
+      LastOpcode = Opcode;
+      if (++NumMove == 8) // FIXME: Tune this limit.
+        break;
+    }
+
+    if (NumMove <= 1)
+      Ops.pop_back();
+    else {
+      SmallPtrSet<MachineInstr*, 4> MemOps;
+      SmallSet<unsigned, 4> MemRegs;
+      for (int i = NumMove-1; i >= 0; --i) {
+        MemOps.insert(Ops[i]);
+        MemRegs.insert(Ops[i]->getOperand(0).getReg());
+      }
+
+      // Be conservative, if the instructions are too far apart, don't
+      // move them. We want to limit the increase of register pressure.
+      bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
+      if (DoMove)
+        DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
+                                           MemOps, MemRegs, TRI);
+      if (!DoMove) {
+        // Give up on this run; drop the candidates and retry with the rest.
+        for (unsigned i = 0; i != NumMove; ++i)
+          Ops.pop_back();
+      } else {
+        // This is the new location for the loads / stores.
+        MachineBasicBlock::iterator InsertPos = isLd ? FirstOp : LastOp;
+        while (InsertPos != MBB->end() && MemOps.count(InsertPos))
+          ++InsertPos;
+
+        // If we are moving a pair of loads / stores, see if it makes sense
+        // to try to allocate a pair of registers that can form register pairs.
+        MachineInstr *Op0 = Ops.back();
+        MachineInstr *Op1 = Ops[Ops.size()-2];
+        unsigned EvenReg = 0, OddReg = 0;
+        unsigned BaseReg = 0, OffReg = 0, PredReg = 0;
+        ARMCC::CondCodes Pred = ARMCC::AL;
+        bool isT2 = false;
+        unsigned NewOpc = 0;
+        int Offset = 0;
+        DebugLoc dl;
+        if (NumMove == 2 && CanFormLdStDWord(Op0, Op1, dl, NewOpc,
+                                             EvenReg, OddReg, BaseReg, OffReg,
+                                             Offset, PredReg, Pred, isT2)) {
+          Ops.pop_back();
+          Ops.pop_back();
+
+          // Form the pair instruction.
+          if (isLd) {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+                                              dl, TII->get(NewOpc))
+              .addReg(EvenReg, RegState::Define)
+              .addReg(OddReg, RegState::Define)
+              .addReg(BaseReg);
+            if (!isT2)
+              MIB.addReg(OffReg);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            ++NumLDRDFormed;
+          } else {
+            MachineInstrBuilder MIB = BuildMI(*MBB, InsertPos,
+                                              dl, TII->get(NewOpc))
+              .addReg(EvenReg)
+              .addReg(OddReg)
+              .addReg(BaseReg);
+            if (!isT2)
+              MIB.addReg(OffReg);
+            MIB.addImm(Offset).addImm(Pred).addReg(PredReg);
+            ++NumSTRDFormed;
+          }
+          MBB->erase(Op0);
+          MBB->erase(Op1);
+
+          // Add register allocation hints to form register pairs.
+          MRI->setRegAllocationHint(EvenReg, ARMRI::RegPairEven, OddReg);
+          MRI->setRegAllocationHint(OddReg,  ARMRI::RegPairOdd, EvenReg);
+        } else {
+          // Otherwise just move the loads / stores next to each other.
+          for (unsigned i = 0; i != NumMove; ++i) {
+            MachineInstr *Op = Ops.back();
+            Ops.pop_back();
+            MBB->splice(InsertPos, MBB, Op);
+          }
+        }
+
+        NumLdStMoved += NumMove;
+        RetVal = true;
+      }
+    }
+  }
+
+  return RetVal;
+}
+
+/// RescheduleLoadStoreInstrs - Scan a basic block in regions bounded by
+/// calls / terminators, collecting loads and stores grouped by base
+/// register, then invoke RescheduleOps on each group of two or more.
+bool
+ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
+  bool RetVal = false;
+
+  DenseMap<MachineInstr*, unsigned> MI2LocMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2LdsMap;
+  DenseMap<unsigned, SmallVector<MachineInstr*, 4> > Base2StsMap;
+  SmallVector<unsigned, 4> LdBases;
+  SmallVector<unsigned, 4> StBases;
+
+  unsigned Loc = 0;
+  MachineBasicBlock::iterator MBBI = MBB->begin();
+  MachineBasicBlock::iterator E = MBB->end();
+  while (MBBI != E) {
+    for (; MBBI != E; ++MBBI) {
+      MachineInstr *MI = MBBI;
+      const TargetInstrDesc &TID = MI->getDesc();
+      if (TID.isCall() || TID.isTerminator()) {
+        // Stop at barriers.
+        ++MBBI;
+        break;
+      }
+
+      // Assign each instruction a location number so distances can be
+      // compared later.
+      MI2LocMap[MI] = Loc++;
+      if (!isMemoryOp(MI))
+        continue;
+      // Only unpredicated memory ops are considered.
+      unsigned PredReg = 0;
+      if (llvm::getInstrPredicate(MI, PredReg) != ARMCC::AL)
+        continue;
+
+      int Opc = MI->getOpcode();
+      bool isLd = isi32Load(Opc) || Opc == ARM::VLDRS || Opc == ARM::VLDRD;
+      unsigned Base = MI->getOperand(1).getReg();
+      int Offset = getMemoryOpOffset(MI);
+
+      bool StopHere = false;
+      if (isLd) {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2LdsMap.find(Base);
+        if (BI != Base2LdsMap.end()) {
+          // A repeated base+offset ends the current region (see below).
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(MI);
+        } else {
+          SmallVector<MachineInstr*, 4> MIs;
+          MIs.push_back(MI);
+          Base2LdsMap[Base] = MIs;
+          LdBases.push_back(Base);
+        }
+      } else {
+        DenseMap<unsigned, SmallVector<MachineInstr*, 4> >::iterator BI =
+          Base2StsMap.find(Base);
+        if (BI != Base2StsMap.end()) {
+          for (unsigned i = 0, e = BI->second.size(); i != e; ++i) {
+            if (Offset == getMemoryOpOffset(BI->second[i])) {
+              StopHere = true;
+              break;
+            }
+          }
+          if (!StopHere)
+            BI->second.push_back(MI);
+        } else {
+          SmallVector<MachineInstr*, 4> MIs;
+          MIs.push_back(MI);
+          Base2StsMap[Base] = MIs;
+          StBases.push_back(Base);
+        }
+      }
+
+      if (StopHere) {
+        // Found a duplicate (a base+offset combination that's seen earlier).
+        // Backtrack.
+        --Loc;
+        break;
+      }
+    }
+
+    // Re-schedule loads.
+    for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
+      unsigned Base = LdBases[i];
+      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
+      if (Lds.size() > 1)
+        RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
+    }
+
+    // Re-schedule stores.
+    for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
+      unsigned Base = StBases[i];
+      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
+      if (Sts.size() > 1)
+        RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
+    }
+
+    // Reset the per-region state before scanning the next region.
+    if (MBBI != E) {
+      Base2LdsMap.clear();
+      Base2StsMap.clear();
+      LdBases.clear();
+      StBases.clear();
+    }
+  }
+
+  return RetVal;
+}
+
+
+/// createARMLoadStoreOptimizationPass - Factory for the load / store
+/// optimization passes: the pre-register-allocation rescheduling pass when
+/// PreAlloc is true, the post-allocation merging pass otherwise.
+FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
+  return PreAlloc ? static_cast<FunctionPass*>(new ARMPreAllocLoadStoreOpt())
+                  : static_cast<FunctionPass*>(new ARMLoadStoreOpt());
+}
diff --git a/lib/Target/ARM/ARMMCAsmInfo.cpp b/lib/Target/ARM/ARMMCAsmInfo.cpp
new file mode 100644
index 0000000..ccd6add
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.cpp
@@ -0,0 +1,70 @@
+//===-- ARMMCAsmInfo.cpp - ARM asm properties -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the ARMMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCAsmInfo.h"
+using namespace llvm;
+
+// arm_asm_table - Pairs mapping inline-asm register constraint strings
+// (e.g. "{r0}") to the names the assembler expects; the list is terminated
+// by a null pair.  Installed as AsmTransCBE below.
+static const char *const arm_asm_table[] = {
+  "{r0}", "r0",
+  "{r1}", "r1",
+  "{r2}", "r2",
+  "{r3}", "r3",
+  "{r4}", "r4",
+  "{r5}", "r5",
+  "{r6}", "r6",
+  "{r7}", "r7",
+  "{r8}", "r8",
+  "{r9}", "r9",
+  "{r10}", "r10",
+  "{r11}", "r11",
+  "{r12}", "r12",
+  "{r13}", "r13",
+  "{r14}", "r14",
+  "{lr}", "lr",
+  "{sp}", "sp",
+  "{ip}", "ip",
+  "{fp}", "fp",
+  "{sl}", "sl",
+  "{memory}", "memory",
+  "{cc}", "cc",
+  0,0
+};
+
+// ARMMCAsmInfoDarwin - Assembly syntax properties for ARM Darwin targets.
+ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin() {
+  AsmTransCBE = arm_asm_table;
+  Data64bitsDirective = 0;  // No 64-bit data directive.
+  CommentString = "@";      // '@' starts a comment in ARM assembly.
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::SjLj;  // Darwin/ARM uses SjLj EH.
+  AbsoluteEHSectionOffsets = false;
+}
+
+// ARMELFMCAsmInfo - Assembly syntax properties for ARM ELF targets.
+ARMELFMCAsmInfo::ARMELFMCAsmInfo() {
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  Data64bitsDirective = 0;  // No 64-bit data directive.
+  CommentString = "@";      // '@' starts a comment in ARM assembly.
+
+  HasLEB128 = true;
+  AbsoluteDebugSectionOffsets = true;
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  HasLCOMMDirective = true;
+
+  DwarfRequiresFrameSection = false;
+
+  SupportsDebugInformation = true;
+}
diff --git a/lib/Target/ARM/ARMMCAsmInfo.h b/lib/Target/ARM/ARMMCAsmInfo.h
new file mode 100644
index 0000000..90f7822
--- /dev/null
+++ b/lib/Target/ARM/ARMMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- ARMMCAsmInfo.h - ARM asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the ARMMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ARMTARGETASMINFO_H
+#define LLVM_ARMTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+  /// ARMMCAsmInfoDarwin - Assembly-syntax properties for ARM Darwin targets.
+  struct ARMMCAsmInfoDarwin : public MCAsmInfoDarwin {
+    explicit ARMMCAsmInfoDarwin();
+  };
+
+  /// ARMELFMCAsmInfo - Assembly-syntax properties for ARM ELF targets.
+  struct ARMELFMCAsmInfo : public MCAsmInfo {
+    explicit ARMELFMCAsmInfo();
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMMachineFunctionInfo.h b/lib/Target/ARM/ARMMachineFunctionInfo.h
new file mode 100644
index 0000000..c998ede
--- /dev/null
+++ b/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -0,0 +1,229 @@
+//====- ARMMachineFuctionInfo.h - ARM machine function info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares ARM-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMMACHINEFUNCTIONINFO_H
+#define ARMMACHINEFUNCTIONINFO_H
+
+#include "ARMSubtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+
+/// ARMFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private ARM target-specific information for each MachineFunction.
+class ARMFunctionInfo : public MachineFunctionInfo {
+
+  /// isThumb - True if this function is compiled under Thumb mode.
+  /// Used to initialize Align, so must precede it.
+  bool isThumb;
+
+  /// hasThumb2 - True if the target architecture supports Thumb2. Do not use
+  /// to determine if function is compiled under Thumb mode, for that use
+  /// 'isThumb'.
+  bool hasThumb2;
+
+  /// VarArgsRegSaveSize - Size of the register save area for vararg functions.
+  ///
+  unsigned VarArgsRegSaveSize;
+
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// processFunctionBeforeCalleeSavedScan().
+  bool HasStackFrame;
+
+  /// LRSpilledForFarJump - True if the LR register has been spilled to
+  /// enable a far jump.
+  bool LRSpilledForFarJump;
+
+  /// FramePtrSpillOffset - If HasStackFrame, this records the frame pointer
+  /// spill stack offset.
+  unsigned FramePtrSpillOffset;
+
+  /// GPRCS1Offset, GPRCS2Offset, DPRCSOffset - Starting offset of callee saved
+  /// register spills areas. For Mac OS X:
+  ///
+  /// GPR callee-saved (1) : r4, r5, r6, r7, lr
+  /// --------------------------------------------
+  /// GPR callee-saved (2) : r8, r10, r11
+  /// --------------------------------------------
+  /// DPR callee-saved : d8 - d15
+  unsigned GPRCS1Offset;
+  unsigned GPRCS2Offset;
+  unsigned DPRCSOffset;
+
+  /// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
+  /// areas.
+  unsigned GPRCS1Size;
+  unsigned GPRCS2Size;
+  unsigned DPRCSSize;
+
+  /// GPRCS1Frames, GPRCS2Frames, DPRCSFrames - Keeps track of frame indices
+  /// which belong to these spill areas.
+  BitVector GPRCS1Frames;
+  BitVector GPRCS2Frames;
+  BitVector DPRCSFrames;
+
+  /// SpilledCSRegs - A BitVector mask of all spilled callee-saved registers.
+  ///
+  BitVector SpilledCSRegs;
+
+  /// JumpTableUId - Unique id for jumptables.
+  ///
+  unsigned JumpTableUId;
+
+  /// ConstPoolEntryUId - Unique id counter for constant pool entries.
+  unsigned ConstPoolEntryUId;
+
+public:
+  ARMFunctionInfo() :
+    isThumb(false),
+    hasThumb2(false),
+    VarArgsRegSaveSize(0), HasStackFrame(false),
+    LRSpilledForFarJump(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(0), GPRCS2Frames(0), DPRCSFrames(0),
+    JumpTableUId(0), ConstPoolEntryUId(0) {}
+
+  explicit ARMFunctionInfo(MachineFunction &MF) :
+    isThumb(MF.getTarget().getSubtarget<ARMSubtarget>().isThumb()),
+    hasThumb2(MF.getTarget().getSubtarget<ARMSubtarget>().hasThumb2()),
+    VarArgsRegSaveSize(0), HasStackFrame(false),
+    LRSpilledForFarJump(false),
+    FramePtrSpillOffset(0), GPRCS1Offset(0), GPRCS2Offset(0), DPRCSOffset(0),
+    GPRCS1Size(0), GPRCS2Size(0), DPRCSSize(0),
+    GPRCS1Frames(32), GPRCS2Frames(32), DPRCSFrames(32),
+    SpilledCSRegs(MF.getTarget().getRegisterInfo()->getNumRegs()),
+    JumpTableUId(0), ConstPoolEntryUId(0) {}
+
+  bool isThumbFunction() const { return isThumb; }
+  bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
+  bool isThumb2Function() const { return isThumb && hasThumb2; }
+
+  unsigned getVarArgsRegSaveSize() const { return VarArgsRegSaveSize; }
+  void setVarArgsRegSaveSize(unsigned s) { VarArgsRegSaveSize = s; }
+
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
+
+  bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
+  void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
+
+  unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
+  void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Offset() const { return GPRCS1Offset; }
+  unsigned getGPRCalleeSavedArea2Offset() const { return GPRCS2Offset; }
+  unsigned getDPRCalleeSavedAreaOffset()  const { return DPRCSOffset; }
+
+  void setGPRCalleeSavedArea1Offset(unsigned o) { GPRCS1Offset = o; }
+  void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
+  void setDPRCalleeSavedAreaOffset(unsigned o)  { DPRCSOffset = o; }
+
+  unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
+  unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
+  unsigned getDPRCalleeSavedAreaSize()  const { return DPRCSSize; }
+
+  void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
+  void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
+  void setDPRCalleeSavedAreaSize(unsigned s)  { DPRCSSize = s; }
+
+  // Out-of-range frame indices are simply not in the area.
+  bool isGPRCalleeSavedArea1Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS1Frames.size())
+      return false;
+    return GPRCS1Frames[fi];
+  }
+  bool isGPRCalleeSavedArea2Frame(int fi) const {
+    if (fi < 0 || fi >= (int)GPRCS2Frames.size())
+      return false;
+    return GPRCS2Frames[fi];
+  }
+  bool isDPRCalleeSavedAreaFrame(int fi) const {
+    if (fi < 0 || fi >= (int)DPRCSFrames.size())
+      return false;
+    return DPRCSFrames[fi];
+  }
+
+  // The add*Frame methods grow the BitVector geometrically (doubling, or
+  // directly to fi+1 if that is still too small) before setting the bit.
+  void addGPRCalleeSavedArea1Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS1Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS1Frames.resize(Size);
+      }
+      GPRCS1Frames[fi] = true;
+    }
+  }
+  void addGPRCalleeSavedArea2Frame(int fi) {
+    if (fi >= 0) {
+      int Size = GPRCS2Frames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        GPRCS2Frames.resize(Size);
+      }
+      GPRCS2Frames[fi] = true;
+    }
+  }
+  void addDPRCalleeSavedAreaFrame(int fi) {
+    if (fi >= 0) {
+      int Size = DPRCSFrames.size();
+      if (fi >= Size) {
+        Size *= 2;
+        if (fi >= Size)
+          Size = fi+1;
+        DPRCSFrames.resize(Size);
+      }
+      DPRCSFrames[fi] = true;
+    }
+  }
+
+  void setCSRegisterIsSpilled(unsigned Reg) {
+    SpilledCSRegs.set(Reg);
+  }
+
+  bool isCSRegisterSpilled(unsigned Reg) const {
+    return SpilledCSRegs[Reg];
+  }
+
+  const BitVector &getSpilledCSRegisters() const {
+    return SpilledCSRegs;
+  }
+
+  unsigned createJumpTableUId() {
+    return JumpTableUId++;
+  }
+
+  unsigned getNumJumpTables() const {
+    return JumpTableUId;
+  }
+
+  void initConstPoolEntryUId(unsigned UId) {
+    ConstPoolEntryUId = UId;
+  }
+
+  unsigned getNumConstPoolEntries() const {
+    return ConstPoolEntryUId;
+  }
+
+  unsigned createConstPoolEntryUId() {
+    return ConstPoolEntryUId++;
+  }
+};
+} // End llvm namespace
+
+#endif // ARMMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/ARM/ARMPerfectShuffle.h b/lib/Target/ARM/ARMPerfectShuffle.h
new file mode 100644
index 0000000..5ff7c38
--- /dev/null
+++ b/lib/Target/ARM/ARMPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- ARMPerfectShuffle.h - NEON Perfect Shuffle Table ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle using NEON instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 242 entries have cost 1
+// 1447 entries have cost 2
+// 3602 entries have cost 3
+// 1237 entries have cost 4
+// 2 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  135053414U,	// <0,0,0,0>: Cost 1 vdup0 LHS
+  1543503974U,	// <0,0,0,1>: Cost 2 vext2 <0,0,0,0>, LHS
+  2618572962U,	// <0,0,0,2>: Cost 3 vext2 <0,2,0,0>, <0,2,0,0>
+  2568054923U,	// <0,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1476398390U,	// <0,0,0,4>: Cost 2 vext1 <0,0,0,0>, RHS
+  2550140624U,	// <0,0,0,5>: Cost 3 vext1 <0,0,0,0>, <5,1,7,3>
+  2550141434U,	// <0,0,0,6>: Cost 3 vext1 <0,0,0,0>, <6,2,7,3>
+  2591945711U,	// <0,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U,	// <0,0,0,u>: Cost 1 vdup0 LHS
+  2886516736U,	// <0,0,1,0>: Cost 3 vzipl LHS, <0,0,0,0>
+  1812775014U,	// <0,0,1,1>: Cost 2 vzipl LHS, LHS
+  1618133094U,	// <0,0,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2625209292U,	// <0,0,1,3>: Cost 3 vext2 <1,3,0,0>, <1,3,0,0>
+  2886558034U,	// <0,0,1,4>: Cost 3 vzipl LHS, <0,4,1,5>
+  2617246864U,	// <0,0,1,5>: Cost 3 vext2 <0,0,0,0>, <1,5,3,7>
+  3659723031U,	// <0,0,1,6>: Cost 4 vext1 <6,0,0,1>, <6,0,0,1>
+  2591953904U,	// <0,0,1,7>: Cost 3 vext1 <7,0,0,1>, <7,0,0,1>
+  1812775581U,	// <0,0,1,u>: Cost 2 vzipl LHS, LHS
+  3020734464U,	// <0,0,2,0>: Cost 3 vtrnl LHS, <0,0,0,0>
+  3020734474U,	// <0,0,2,1>: Cost 3 vtrnl LHS, <0,0,1,1>
+  1946992742U,	// <0,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2631181989U,	// <0,0,2,3>: Cost 3 vext2 <2,3,0,0>, <2,3,0,0>
+  3020734668U,	// <0,0,2,4>: Cost 3 vtrnl LHS, <0,2,4,6>
+  3826550569U,	// <0,0,2,5>: Cost 4 vuzpl <0,2,0,2>, <2,4,5,6>
+  2617247674U,	// <0,0,2,6>: Cost 3 vext2 <0,0,0,0>, <2,6,3,7>
+  2591962097U,	// <0,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1946992796U,	// <0,0,2,u>: Cost 2 vtrnl LHS, LHS
+  2635163787U,	// <0,0,3,0>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2686419196U,	// <0,0,3,1>: Cost 3 vext3 <0,3,1,0>, <0,3,1,0>
+  2686492933U,	// <0,0,3,2>: Cost 3 vext3 <0,3,2,0>, <0,3,2,0>
+  2617248156U,	// <0,0,3,3>: Cost 3 vext2 <0,0,0,0>, <3,3,3,3>
+  2617248258U,	// <0,0,3,4>: Cost 3 vext2 <0,0,0,0>, <3,4,5,6>
+  3826551298U,	// <0,0,3,5>: Cost 4 vuzpl <0,2,0,2>, <3,4,5,6>
+  3690990200U,	// <0,0,3,6>: Cost 4 vext2 <0,0,0,0>, <3,6,0,7>
+  3713551042U,	// <0,0,3,7>: Cost 4 vext2 <3,7,0,0>, <3,7,0,0>
+  2635163787U,	// <0,0,3,u>: Cost 3 vext2 <3,0,0,0>, <3,0,0,0>
+  2617248658U,	// <0,0,4,0>: Cost 3 vext2 <0,0,0,0>, <4,0,5,1>
+  2888450150U,	// <0,0,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021570150U,	// <0,0,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  3641829519U,	// <0,0,4,3>: Cost 4 vext1 <3,0,0,4>, <3,0,0,4>
+  3021570252U,	// <0,0,4,4>: Cost 3 vtrnl <0,2,4,6>, <0,2,4,6>
+  1543507254U,	// <0,0,4,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752810294U,	// <0,0,4,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  3786998152U,	// <0,0,4,7>: Cost 4 vext3 <4,7,5,0>, <0,4,7,5>
+  1543507497U,	// <0,0,4,u>: Cost 2 vext2 <0,0,0,0>, RHS
+  2684354972U,	// <0,0,5,0>: Cost 3 vext3 <0,0,0,0>, <0,5,0,7>
+  2617249488U,	// <0,0,5,1>: Cost 3 vext2 <0,0,0,0>, <5,1,7,3>
+  3765617070U,	// <0,0,5,2>: Cost 4 vext3 <1,2,3,0>, <0,5,2,7>
+  3635865780U,	// <0,0,5,3>: Cost 4 vext1 <2,0,0,5>, <3,0,4,5>
+  2617249734U,	// <0,0,5,4>: Cost 3 vext2 <0,0,0,0>, <5,4,7,6>
+  2617249796U,	// <0,0,5,5>: Cost 3 vext2 <0,0,0,0>, <5,5,5,5>
+  2718712274U,	// <0,0,5,6>: Cost 3 vext3 <5,6,7,0>, <0,5,6,7>
+  2617249960U,	// <0,0,5,7>: Cost 3 vext2 <0,0,0,0>, <5,7,5,7>
+  2720039396U,	// <0,0,5,u>: Cost 3 vext3 <5,u,7,0>, <0,5,u,7>
+  2684355053U,	// <0,0,6,0>: Cost 3 vext3 <0,0,0,0>, <0,6,0,7>
+  3963609190U,	// <0,0,6,1>: Cost 4 vzipl <0,6,2,7>, LHS
+  2617250298U,	// <0,0,6,2>: Cost 3 vext2 <0,0,0,0>, <6,2,7,3>
+  3796435464U,	// <0,0,6,3>: Cost 4 vext3 <6,3,7,0>, <0,6,3,7>
+  3659762998U,	// <0,0,6,4>: Cost 4 vext1 <6,0,0,6>, RHS
+  3659763810U,	// <0,0,6,5>: Cost 4 vext1 <6,0,0,6>, <5,6,7,0>
+  2617250616U,	// <0,0,6,6>: Cost 3 vext2 <0,0,0,0>, <6,6,6,6>
+  2657727309U,	// <0,0,6,7>: Cost 3 vext2 <6,7,0,0>, <6,7,0,0>
+  2658390942U,	// <0,0,6,u>: Cost 3 vext2 <6,u,0,0>, <6,u,0,0>
+  2659054575U,	// <0,0,7,0>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  3635880854U,	// <0,0,7,1>: Cost 4 vext1 <2,0,0,7>, <1,2,3,0>
+  3635881401U,	// <0,0,7,2>: Cost 4 vext1 <2,0,0,7>, <2,0,0,7>
+  3734787298U,	// <0,0,7,3>: Cost 4 vext2 <7,3,0,0>, <7,3,0,0>
+  2617251174U,	// <0,0,7,4>: Cost 3 vext2 <0,0,0,0>, <7,4,5,6>
+  3659772002U,	// <0,0,7,5>: Cost 4 vext1 <6,0,0,7>, <5,6,7,0>
+  3659772189U,	// <0,0,7,6>: Cost 4 vext1 <6,0,0,7>, <6,0,0,7>
+  2617251436U,	// <0,0,7,7>: Cost 3 vext2 <0,0,0,0>, <7,7,7,7>
+  2659054575U,	// <0,0,7,u>: Cost 3 vext2 <7,0,0,0>, <7,0,0,0>
+  135053414U,	// <0,0,u,0>: Cost 1 vdup0 LHS
+  1817419878U,	// <0,0,u,1>: Cost 2 vzipl LHS, LHS
+  1947435110U,	// <0,0,u,2>: Cost 2 vtrnl LHS, LHS
+  2568120467U,	// <0,0,u,3>: Cost 3 vext1 <3,0,0,u>, <3,0,0,u>
+  1476463926U,	// <0,0,u,4>: Cost 2 vext1 <0,0,0,u>, RHS
+  1543510170U,	// <0,0,u,5>: Cost 2 vext2 <0,0,0,0>, RHS
+  2752813210U,	// <0,0,u,6>: Cost 3 vuzpl <0,2,0,2>, RHS
+  2592011255U,	// <0,0,u,7>: Cost 3 vext1 <7,0,0,u>, <7,0,0,u>
+  135053414U,	// <0,0,u,u>: Cost 1 vdup0 LHS
+  2618581002U,	// <0,1,0,0>: Cost 3 vext2 <0,2,0,1>, <0,0,1,1>
+  1557446758U,	// <0,1,0,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2618581155U,	// <0,1,0,2>: Cost 3 vext2 <0,2,0,1>, <0,2,0,1>
+  2690548468U,	// <0,1,0,3>: Cost 3 vext3 <1,0,3,0>, <1,0,3,0>
+  2626543954U,	// <0,1,0,4>: Cost 3 vext2 <1,5,0,1>, <0,4,1,5>
+  4094985216U,	// <0,1,0,5>: Cost 4 vtrnl <0,2,0,2>, <1,3,5,7>
+  2592019278U,	// <0,1,0,6>: Cost 3 vext1 <7,0,1,0>, <6,7,0,1>
+  2592019448U,	// <0,1,0,7>: Cost 3 vext1 <7,0,1,0>, <7,0,1,0>
+  1557447325U,	// <0,1,0,u>: Cost 2 vext2 <2,3,0,1>, LHS
+  1476476938U,	// <0,1,1,0>: Cost 2 vext1 <0,0,1,1>, <0,0,1,1>
+  2886517556U,	// <0,1,1,1>: Cost 3 vzipl LHS, <1,1,1,1>
+  2886517654U,	// <0,1,1,2>: Cost 3 vzipl LHS, <1,2,3,0>
+  2886517720U,	// <0,1,1,3>: Cost 3 vzipl LHS, <1,3,1,3>
+  1476480310U,	// <0,1,1,4>: Cost 2 vext1 <0,0,1,1>, RHS
+  2886558864U,	// <0,1,1,5>: Cost 3 vzipl LHS, <1,5,3,7>
+  2550223354U,	// <0,1,1,6>: Cost 3 vext1 <0,0,1,1>, <6,2,7,3>
+  2550223856U,	// <0,1,1,7>: Cost 3 vext1 <0,0,1,1>, <7,0,0,1>
+  1476482862U,	// <0,1,1,u>: Cost 2 vext1 <0,0,1,1>, LHS
+  1494401126U,	// <0,1,2,0>: Cost 2 vext1 <3,0,1,2>, LHS
+  3020735284U,	// <0,1,2,1>: Cost 3 vtrnl LHS, <1,1,1,1>
+  2562172349U,	// <0,1,2,2>: Cost 3 vext1 <2,0,1,2>, <2,0,1,2>
+  835584U,	// <0,1,2,3>: Cost 0 copy LHS
+  1494404406U,	// <0,1,2,4>: Cost 2 vext1 <3,0,1,2>, RHS
+  3020735488U,	// <0,1,2,5>: Cost 3 vtrnl LHS, <1,3,5,7>
+  2631190458U,	// <0,1,2,6>: Cost 3 vext2 <2,3,0,1>, <2,6,3,7>
+  1518294010U,	// <0,1,2,7>: Cost 2 vext1 <7,0,1,2>, <7,0,1,2>
+  835584U,	// <0,1,2,u>: Cost 0 copy LHS
+  2692318156U,	// <0,1,3,0>: Cost 3 vext3 <1,3,0,0>, <1,3,0,0>
+  2691875800U,	// <0,1,3,1>: Cost 3 vext3 <1,2,3,0>, <1,3,1,3>
+  2691875806U,	// <0,1,3,2>: Cost 3 vext3 <1,2,3,0>, <1,3,2,0>
+  2692539367U,	// <0,1,3,3>: Cost 3 vext3 <1,3,3,0>, <1,3,3,0>
+  2562182454U,	// <0,1,3,4>: Cost 3 vext1 <2,0,1,3>, RHS
+  2691875840U,	// <0,1,3,5>: Cost 3 vext3 <1,2,3,0>, <1,3,5,7>
+  2692760578U,	// <0,1,3,6>: Cost 3 vext3 <1,3,6,0>, <1,3,6,0>
+  2639817411U,	// <0,1,3,7>: Cost 3 vext2 <3,7,0,1>, <3,7,0,1>
+  2691875863U,	// <0,1,3,u>: Cost 3 vext3 <1,2,3,0>, <1,3,u,3>
+  2568159334U,	// <0,1,4,0>: Cost 3 vext1 <3,0,1,4>, LHS
+  4095312692U,	// <0,1,4,1>: Cost 4 vtrnl <0,2,4,6>, <1,1,1,1>
+  2568160934U,	// <0,1,4,2>: Cost 3 vext1 <3,0,1,4>, <2,3,0,1>
+  2568161432U,	// <0,1,4,3>: Cost 3 vext1 <3,0,1,4>, <3,0,1,4>
+  2568162614U,	// <0,1,4,4>: Cost 3 vext1 <3,0,1,4>, RHS
+  1557450038U,	// <0,1,4,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754235702U,	// <0,1,4,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  2592052220U,	// <0,1,4,7>: Cost 3 vext1 <7,0,1,4>, <7,0,1,4>
+  1557450281U,	// <0,1,4,u>: Cost 2 vext2 <2,3,0,1>, RHS
+  3765617775U,	// <0,1,5,0>: Cost 4 vext3 <1,2,3,0>, <1,5,0,1>
+  2647781007U,	// <0,1,5,1>: Cost 3 vext2 <5,1,0,1>, <5,1,0,1>
+  3704934138U,	// <0,1,5,2>: Cost 4 vext2 <2,3,0,1>, <5,2,3,0>
+  2691875984U,	// <0,1,5,3>: Cost 3 vext3 <1,2,3,0>, <1,5,3,7>
+  2657734598U,	// <0,1,5,4>: Cost 3 vext2 <6,7,0,1>, <5,4,7,6>
+  2650435539U,	// <0,1,5,5>: Cost 3 vext2 <5,5,0,1>, <5,5,0,1>
+  2651099172U,	// <0,1,5,6>: Cost 3 vext2 <5,6,0,1>, <5,6,0,1>
+  2651762805U,	// <0,1,5,7>: Cost 3 vext2 <5,7,0,1>, <5,7,0,1>
+  2691876029U,	// <0,1,5,u>: Cost 3 vext3 <1,2,3,0>, <1,5,u,7>
+  2592063590U,	// <0,1,6,0>: Cost 3 vext1 <7,0,1,6>, LHS
+  3765617871U,	// <0,1,6,1>: Cost 4 vext3 <1,2,3,0>, <1,6,1,7>
+  2654417337U,	// <0,1,6,2>: Cost 3 vext2 <6,2,0,1>, <6,2,0,1>
+  3765617889U,	// <0,1,6,3>: Cost 4 vext3 <1,2,3,0>, <1,6,3,7>
+  2592066870U,	// <0,1,6,4>: Cost 3 vext1 <7,0,1,6>, RHS
+  3765617907U,	// <0,1,6,5>: Cost 4 vext3 <1,2,3,0>, <1,6,5,7>
+  2657071869U,	// <0,1,6,6>: Cost 3 vext2 <6,6,0,1>, <6,6,0,1>
+  1583993678U,	// <0,1,6,7>: Cost 2 vext2 <6,7,0,1>, <6,7,0,1>
+  1584657311U,	// <0,1,6,u>: Cost 2 vext2 <6,u,0,1>, <6,u,0,1>
+  2657735672U,	// <0,1,7,0>: Cost 3 vext2 <6,7,0,1>, <7,0,1,0>
+  2657735808U,	// <0,1,7,1>: Cost 3 vext2 <6,7,0,1>, <7,1,7,1>
+  2631193772U,	// <0,1,7,2>: Cost 3 vext2 <2,3,0,1>, <7,2,3,0>
+  2661053667U,	// <0,1,7,3>: Cost 3 vext2 <7,3,0,1>, <7,3,0,1>
+  2657736038U,	// <0,1,7,4>: Cost 3 vext2 <6,7,0,1>, <7,4,5,6>
+  3721524621U,	// <0,1,7,5>: Cost 4 vext2 <5,1,0,1>, <7,5,1,0>
+  2657736158U,	// <0,1,7,6>: Cost 3 vext2 <6,7,0,1>, <7,6,1,0>
+  2657736300U,	// <0,1,7,7>: Cost 3 vext2 <6,7,0,1>, <7,7,7,7>
+  2657736322U,	// <0,1,7,u>: Cost 3 vext2 <6,7,0,1>, <7,u,1,2>
+  1494450278U,	// <0,1,u,0>: Cost 2 vext1 <3,0,1,u>, LHS
+  1557452590U,	// <0,1,u,1>: Cost 2 vext2 <2,3,0,1>, LHS
+  2754238254U,	// <0,1,u,2>: Cost 3 vuzpl <0,4,1,5>, LHS
+  835584U,	// <0,1,u,3>: Cost 0 copy LHS
+  1494453558U,	// <0,1,u,4>: Cost 2 vext1 <3,0,1,u>, RHS
+  1557452954U,	// <0,1,u,5>: Cost 2 vext2 <2,3,0,1>, RHS
+  2754238618U,	// <0,1,u,6>: Cost 3 vuzpl <0,4,1,5>, RHS
+  1518343168U,	// <0,1,u,7>: Cost 2 vext1 <7,0,1,u>, <7,0,1,u>
+  835584U,	// <0,1,u,u>: Cost 0 copy LHS
+  2752299008U,	// <0,2,0,0>: Cost 3 vuzpl LHS, <0,0,0,0>
+  1544847462U,	// <0,2,0,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678557286U,	// <0,2,0,2>: Cost 2 vuzpl LHS, LHS
+  2696521165U,	// <0,2,0,3>: Cost 3 vext3 <2,0,3,0>, <2,0,3,0>
+  2752340172U,	// <0,2,0,4>: Cost 3 vuzpl LHS, <0,2,4,6>
+  2691876326U,	// <0,2,0,5>: Cost 3 vext3 <1,2,3,0>, <2,0,5,7>
+  2618589695U,	// <0,2,0,6>: Cost 3 vext2 <0,2,0,2>, <0,6,2,7>
+  2592093185U,	// <0,2,0,7>: Cost 3 vext1 <7,0,2,0>, <7,0,2,0>
+  1678557340U,	// <0,2,0,u>: Cost 2 vuzpl LHS, LHS
+  2618589942U,	// <0,2,1,0>: Cost 3 vext2 <0,2,0,2>, <1,0,3,2>
+  2752299828U,	// <0,2,1,1>: Cost 3 vuzpl LHS, <1,1,1,1>
+  2886518376U,	// <0,2,1,2>: Cost 3 vzipl LHS, <2,2,2,2>
+  2752299766U,	// <0,2,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  2550295862U,	// <0,2,1,4>: Cost 3 vext1 <0,0,2,1>, RHS
+  2752340992U,	// <0,2,1,5>: Cost 3 vuzpl LHS, <1,3,5,7>
+  2886559674U,	// <0,2,1,6>: Cost 3 vzipl LHS, <2,6,3,7>
+  3934208106U,	// <0,2,1,7>: Cost 4 vuzpr <7,0,1,2>, <0,1,2,7>
+  2752340771U,	// <0,2,1,u>: Cost 3 vuzpl LHS, <1,0,u,2>
+  1476558868U,	// <0,2,2,0>: Cost 2 vext1 <0,0,2,2>, <0,0,2,2>
+  2226628029U,	// <0,2,2,1>: Cost 3 vrev <2,0,1,2>
+  2752300648U,	// <0,2,2,2>: Cost 3 vuzpl LHS, <2,2,2,2>
+  3020736114U,	// <0,2,2,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476562230U,	// <0,2,2,4>: Cost 2 vext1 <0,0,2,2>, RHS
+  2550304464U,	// <0,2,2,5>: Cost 3 vext1 <0,0,2,2>, <5,1,7,3>
+  2618591162U,	// <0,2,2,6>: Cost 3 vext2 <0,2,0,2>, <2,6,3,7>
+  2550305777U,	// <0,2,2,7>: Cost 3 vext1 <0,0,2,2>, <7,0,0,2>
+  1476564782U,	// <0,2,2,u>: Cost 2 vext1 <0,0,2,2>, LHS
+  2618591382U,	// <0,2,3,0>: Cost 3 vext2 <0,2,0,2>, <3,0,1,2>
+  2752301206U,	// <0,2,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  3826043121U,	// <0,2,3,2>: Cost 4 vuzpl LHS, <3,1,2,3>
+  2752301468U,	// <0,2,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618591746U,	// <0,2,3,4>: Cost 3 vext2 <0,2,0,2>, <3,4,5,6>
+  2752301570U,	// <0,2,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  3830688102U,	// <0,2,3,6>: Cost 4 vuzpl LHS, <3,2,6,3>
+  2698807012U,	// <0,2,3,7>: Cost 3 vext3 <2,3,7,0>, <2,3,7,0>
+  2752301269U,	// <0,2,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562261094U,	// <0,2,4,0>: Cost 3 vext1 <2,0,2,4>, LHS
+  4095313828U,	// <0,2,4,1>: Cost 4 vtrnl <0,2,4,6>, <2,6,1,3>
+  2226718152U,	// <0,2,4,2>: Cost 3 vrev <2,0,2,4>
+  2568235169U,	// <0,2,4,3>: Cost 3 vext1 <3,0,2,4>, <3,0,2,4>
+  2562264374U,	// <0,2,4,4>: Cost 3 vext1 <2,0,2,4>, RHS
+  1544850742U,	// <0,2,4,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678560566U,	// <0,2,4,6>: Cost 2 vuzpl LHS, RHS
+  2592125957U,	// <0,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1678560584U,	// <0,2,4,u>: Cost 2 vuzpl LHS, RHS
+  2691876686U,	// <0,2,5,0>: Cost 3 vext3 <1,2,3,0>, <2,5,0,7>
+  2618592976U,	// <0,2,5,1>: Cost 3 vext2 <0,2,0,2>, <5,1,7,3>
+  3765618528U,	// <0,2,5,2>: Cost 4 vext3 <1,2,3,0>, <2,5,2,7>
+  3765618536U,	// <0,2,5,3>: Cost 4 vext3 <1,2,3,0>, <2,5,3,6>
+  2618593222U,	// <0,2,5,4>: Cost 3 vext2 <0,2,0,2>, <5,4,7,6>
+  2752303108U,	// <0,2,5,5>: Cost 3 vuzpl LHS, <5,5,5,5>
+  2618593378U,	// <0,2,5,6>: Cost 3 vext2 <0,2,0,2>, <5,6,7,0>
+  2824785206U,	// <0,2,5,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2824785207U,	// <0,2,5,u>: Cost 3 vuzpr <1,0,3,2>, RHS
+  2752303950U,	// <0,2,6,0>: Cost 3 vuzpl LHS, <6,7,0,1>
+  3830690081U,	// <0,2,6,1>: Cost 4 vuzpl LHS, <6,0,1,2>
+  2618593786U,	// <0,2,6,2>: Cost 3 vext2 <0,2,0,2>, <6,2,7,3>
+  2691876794U,	// <0,2,6,3>: Cost 3 vext3 <1,2,3,0>, <2,6,3,7>
+  2752303990U,	// <0,2,6,4>: Cost 3 vuzpl LHS, <6,7,4,5>
+  3830690445U,	// <0,2,6,5>: Cost 4 vuzpl LHS, <6,4,5,6>
+  2752303928U,	// <0,2,6,6>: Cost 3 vuzpl LHS, <6,6,6,6>
+  2657743695U,	// <0,2,6,7>: Cost 3 vext2 <6,7,0,2>, <6,7,0,2>
+  2691876839U,	// <0,2,6,u>: Cost 3 vext3 <1,2,3,0>, <2,6,u,7>
+  2659070961U,	// <0,2,7,0>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  2659734594U,	// <0,2,7,1>: Cost 3 vext2 <7,1,0,2>, <7,1,0,2>
+  3734140051U,	// <0,2,7,2>: Cost 4 vext2 <7,2,0,2>, <7,2,0,2>
+  2701166596U,	// <0,2,7,3>: Cost 3 vext3 <2,7,3,0>, <2,7,3,0>
+  2662389094U,	// <0,2,7,4>: Cost 3 vext2 <7,5,0,2>, <7,4,5,6>
+  2662389126U,	// <0,2,7,5>: Cost 3 vext2 <7,5,0,2>, <7,5,0,2>
+  3736794583U,	// <0,2,7,6>: Cost 4 vext2 <7,6,0,2>, <7,6,0,2>
+  2752304748U,	// <0,2,7,7>: Cost 3 vuzpl LHS, <7,7,7,7>
+  2659070961U,	// <0,2,7,u>: Cost 3 vext2 <7,0,0,2>, <7,0,0,2>
+  1476608026U,	// <0,2,u,0>: Cost 2 vext1 <0,0,2,u>, <0,0,2,u>
+  1544853294U,	// <0,2,u,1>: Cost 2 vext2 <0,2,0,2>, LHS
+  1678563118U,	// <0,2,u,2>: Cost 2 vuzpl LHS, LHS
+  3021178482U,	// <0,2,u,3>: Cost 3 vtrnl LHS, <2,2,3,3>
+  1476611382U,	// <0,2,u,4>: Cost 2 vext1 <0,0,2,u>, RHS
+  1544853658U,	// <0,2,u,5>: Cost 2 vext2 <0,2,0,2>, RHS
+  1678563482U,	// <0,2,u,6>: Cost 2 vuzpl LHS, RHS
+  2824785449U,	// <0,2,u,7>: Cost 3 vuzpr <1,0,3,2>, RHS
+  1678563172U,	// <0,2,u,u>: Cost 2 vuzpl LHS, LHS
+  2556329984U,	// <0,3,0,0>: Cost 3 vext1 <1,0,3,0>, <0,0,0,0>
+  2686421142U,	// <0,3,0,1>: Cost 3 vext3 <0,3,1,0>, <3,0,1,2>
+  2562303437U,	// <0,3,0,2>: Cost 3 vext1 <2,0,3,0>, <2,0,3,0>
+  4094986652U,	// <0,3,0,3>: Cost 4 vtrnl <0,2,0,2>, <3,3,3,3>
+  2556333366U,	// <0,3,0,4>: Cost 3 vext1 <1,0,3,0>, RHS
+  4094986754U,	// <0,3,0,5>: Cost 4 vtrnl <0,2,0,2>, <3,4,5,6>
+  3798796488U,	// <0,3,0,6>: Cost 4 vext3 <6,7,3,0>, <3,0,6,7>
+  3776530634U,	// <0,3,0,7>: Cost 4 vext3 <3,0,7,0>, <3,0,7,0>
+  2556335918U,	// <0,3,0,u>: Cost 3 vext1 <1,0,3,0>, LHS
+  2886518934U,	// <0,3,1,0>: Cost 3 vzipl LHS, <3,0,1,2>
+  2556338933U,	// <0,3,1,1>: Cost 3 vext1 <1,0,3,1>, <1,0,3,1>
+  2691877105U,	// <0,3,1,2>: Cost 3 vext3 <1,2,3,0>, <3,1,2,3>
+  2886519196U,	// <0,3,1,3>: Cost 3 vzipl LHS, <3,3,3,3>
+  2886519298U,	// <0,3,1,4>: Cost 3 vzipl LHS, <3,4,5,6>
+  4095740418U,	// <0,3,1,5>: Cost 4 vtrnl <0,3,1,4>, <3,4,5,6>
+  3659944242U,	// <0,3,1,6>: Cost 4 vext1 <6,0,3,1>, <6,0,3,1>
+  3769600286U,	// <0,3,1,7>: Cost 4 vext3 <1,u,3,0>, <3,1,7,3>
+  2886519582U,	// <0,3,1,u>: Cost 3 vzipl LHS, <3,u,1,2>
+  1482604646U,	// <0,3,2,0>: Cost 2 vext1 <1,0,3,2>, LHS
+  1482605302U,	// <0,3,2,1>: Cost 2 vext1 <1,0,3,2>, <1,0,3,2>
+  2556348008U,	// <0,3,2,2>: Cost 3 vext1 <1,0,3,2>, <2,2,2,2>
+  3020736924U,	// <0,3,2,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482607926U,	// <0,3,2,4>: Cost 2 vext1 <1,0,3,2>, RHS
+  3020737026U,	// <0,3,2,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598154746U,	// <0,3,2,6>: Cost 3 vext1 <u,0,3,2>, <6,2,7,3>
+  2598155258U,	// <0,3,2,7>: Cost 3 vext1 <u,0,3,2>, <7,0,1,2>
+  1482610478U,	// <0,3,2,u>: Cost 2 vext1 <1,0,3,2>, LHS
+  3692341398U,	// <0,3,3,0>: Cost 4 vext2 <0,2,0,3>, <3,0,1,2>
+  2635851999U,	// <0,3,3,1>: Cost 3 vext2 <3,1,0,3>, <3,1,0,3>
+  3636069840U,	// <0,3,3,2>: Cost 4 vext1 <2,0,3,3>, <2,0,3,3>
+  2691877276U,	// <0,3,3,3>: Cost 3 vext3 <1,2,3,0>, <3,3,3,3>
+  3961522690U,	// <0,3,3,4>: Cost 4 vzipl <0,3,1,4>, <3,4,5,6>
+  3826797058U,	// <0,3,3,5>: Cost 4 vuzpl <0,2,3,5>, <3,4,5,6>
+  3703622282U,	// <0,3,3,6>: Cost 4 vext2 <2,1,0,3>, <3,6,2,7>
+  3769600452U,	// <0,3,3,7>: Cost 4 vext3 <1,u,3,0>, <3,3,7,7>
+  2640497430U,	// <0,3,3,u>: Cost 3 vext2 <3,u,0,3>, <3,u,0,3>
+  3962194070U,	// <0,3,4,0>: Cost 4 vzipl <0,4,1,5>, <3,0,1,2>
+  2232617112U,	// <0,3,4,1>: Cost 3 vrev <3,0,1,4>
+  2232690849U,	// <0,3,4,2>: Cost 3 vrev <3,0,2,4>
+  4095314332U,	// <0,3,4,3>: Cost 4 vtrnl <0,2,4,6>, <3,3,3,3>
+  3962194434U,	// <0,3,4,4>: Cost 4 vzipl <0,4,1,5>, <3,4,5,6>
+  2691877378U,	// <0,3,4,5>: Cost 3 vext3 <1,2,3,0>, <3,4,5,6>
+  3826765110U,	// <0,3,4,6>: Cost 4 vuzpl <0,2,3,1>, RHS
+  3665941518U,	// <0,3,4,7>: Cost 4 vext1 <7,0,3,4>, <7,0,3,4>
+  2691877405U,	// <0,3,4,u>: Cost 3 vext3 <1,2,3,0>, <3,4,u,6>
+  3630112870U,	// <0,3,5,0>: Cost 4 vext1 <1,0,3,5>, LHS
+  3630113526U,	// <0,3,5,1>: Cost 4 vext1 <1,0,3,5>, <1,0,3,2>
+  4035199734U,	// <0,3,5,2>: Cost 4 vzipr <1,4,0,5>, <1,0,3,2>
+  3769600578U,	// <0,3,5,3>: Cost 4 vext3 <1,u,3,0>, <3,5,3,7>
+  2232846516U,	// <0,3,5,4>: Cost 3 vrev <3,0,4,5>
+  3779037780U,	// <0,3,5,5>: Cost 4 vext3 <3,4,5,0>, <3,5,5,7>
+  2718714461U,	// <0,3,5,6>: Cost 3 vext3 <5,6,7,0>, <3,5,6,7>
+  2706106975U,	// <0,3,5,7>: Cost 3 vext3 <3,5,7,0>, <3,5,7,0>
+  2233141464U,	// <0,3,5,u>: Cost 3 vrev <3,0,u,5>
+  2691877496U,	// <0,3,6,0>: Cost 3 vext3 <1,2,3,0>, <3,6,0,7>
+  3727511914U,	// <0,3,6,1>: Cost 4 vext2 <6,1,0,3>, <6,1,0,3>
+  3765619338U,	// <0,3,6,2>: Cost 4 vext3 <1,2,3,0>, <3,6,2,7>
+  3765619347U,	// <0,3,6,3>: Cost 4 vext3 <1,2,3,0>, <3,6,3,7>
+  3765987996U,	// <0,3,6,4>: Cost 4 vext3 <1,2,u,0>, <3,6,4,7>
+  3306670270U,	// <0,3,6,5>: Cost 4 vrev <3,0,5,6>
+  3792456365U,	// <0,3,6,6>: Cost 4 vext3 <5,6,7,0>, <3,6,6,6>
+  2706770608U,	// <0,3,6,7>: Cost 3 vext3 <3,6,7,0>, <3,6,7,0>
+  2706844345U,	// <0,3,6,u>: Cost 3 vext3 <3,6,u,0>, <3,6,u,0>
+  3769600707U,	// <0,3,7,0>: Cost 4 vext3 <1,u,3,0>, <3,7,0,1>
+  2659742787U,	// <0,3,7,1>: Cost 3 vext2 <7,1,0,3>, <7,1,0,3>
+  3636102612U,	// <0,3,7,2>: Cost 4 vext1 <2,0,3,7>, <2,0,3,7>
+  3769600740U,	// <0,3,7,3>: Cost 4 vext3 <1,u,3,0>, <3,7,3,7>
+  3769600747U,	// <0,3,7,4>: Cost 4 vext3 <1,u,3,0>, <3,7,4,5>
+  3769600758U,	// <0,3,7,5>: Cost 4 vext3 <1,u,3,0>, <3,7,5,7>
+  3659993400U,	// <0,3,7,6>: Cost 4 vext1 <6,0,3,7>, <6,0,3,7>
+  3781176065U,	// <0,3,7,7>: Cost 4 vext3 <3,7,7,0>, <3,7,7,0>
+  2664388218U,	// <0,3,7,u>: Cost 3 vext2 <7,u,0,3>, <7,u,0,3>
+  1482653798U,	// <0,3,u,0>: Cost 2 vext1 <1,0,3,u>, LHS
+  1482654460U,	// <0,3,u,1>: Cost 2 vext1 <1,0,3,u>, <1,0,3,u>
+  2556397160U,	// <0,3,u,2>: Cost 3 vext1 <1,0,3,u>, <2,2,2,2>
+  3021179292U,	// <0,3,u,3>: Cost 3 vtrnl LHS, <3,3,3,3>
+  1482657078U,	// <0,3,u,4>: Cost 2 vext1 <1,0,3,u>, RHS
+  3021179394U,	// <0,3,u,5>: Cost 3 vtrnl LHS, <3,4,5,6>
+  2598203898U,	// <0,3,u,6>: Cost 3 vext1 <u,0,3,u>, <6,2,7,3>
+  2708097874U,	// <0,3,u,7>: Cost 3 vext3 <3,u,7,0>, <3,u,7,0>
+  1482659630U,	// <0,3,u,u>: Cost 2 vext1 <1,0,3,u>, LHS
+  2617278468U,	// <0,4,0,0>: Cost 3 vext2 <0,0,0,4>, <0,0,0,4>
+  2618605670U,	// <0,4,0,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2618605734U,	// <0,4,0,2>: Cost 3 vext2 <0,2,0,4>, <0,2,0,4>
+  3642091695U,	// <0,4,0,3>: Cost 4 vext1 <3,0,4,0>, <3,0,4,0>
+  2753134796U,	// <0,4,0,4>: Cost 3 vuzpl <0,2,4,6>, <0,2,4,6>
+  2718714770U,	// <0,4,0,5>: Cost 3 vext3 <5,6,7,0>, <4,0,5,1>
+  3021245750U,	// <0,4,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  3665982483U,	// <0,4,0,7>: Cost 4 vext1 <7,0,4,0>, <7,0,4,0>
+  3021245768U,	// <0,4,0,u>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2568355942U,	// <0,4,1,0>: Cost 3 vext1 <3,0,4,1>, LHS
+  3692348212U,	// <0,4,1,1>: Cost 4 vext2 <0,2,0,4>, <1,1,1,1>
+  3692348310U,	// <0,4,1,2>: Cost 4 vext2 <0,2,0,4>, <1,2,3,0>
+  2568358064U,	// <0,4,1,3>: Cost 3 vext1 <3,0,4,1>, <3,0,4,1>
+  2568359222U,	// <0,4,1,4>: Cost 3 vext1 <3,0,4,1>, RHS
+  1812778294U,	// <0,4,1,5>: Cost 2 vzipl LHS, RHS
+  3022671158U,	// <0,4,1,6>: Cost 3 vtrnl <0,4,1,5>, RHS
+  2592248852U,	// <0,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1812778537U,	// <0,4,1,u>: Cost 2 vzipl LHS, RHS
+  2568364134U,	// <0,4,2,0>: Cost 3 vext1 <3,0,4,2>, LHS
+  2238573423U,	// <0,4,2,1>: Cost 3 vrev <4,0,1,2>
+  3692349032U,	// <0,4,2,2>: Cost 4 vext2 <0,2,0,4>, <2,2,2,2>
+  2631214761U,	// <0,4,2,3>: Cost 3 vext2 <2,3,0,4>, <2,3,0,4>
+  2568367414U,	// <0,4,2,4>: Cost 3 vext1 <3,0,4,2>, RHS
+  2887028022U,	// <0,4,2,5>: Cost 3 vzipl <0,2,0,2>, RHS
+  1946996022U,	// <0,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U,	// <0,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1946996040U,	// <0,4,2,u>: Cost 2 vtrnl LHS, RHS
+  3692349590U,	// <0,4,3,0>: Cost 4 vext2 <0,2,0,4>, <3,0,1,2>
+  3826878614U,	// <0,4,3,1>: Cost 4 vuzpl <0,2,4,6>, <3,0,1,2>
+  3826878625U,	// <0,4,3,2>: Cost 4 vuzpl <0,2,4,6>, <3,0,2,4>
+  3692349852U,	// <0,4,3,3>: Cost 4 vext2 <0,2,0,4>, <3,3,3,3>
+  3692349954U,	// <0,4,3,4>: Cost 4 vext2 <0,2,0,4>, <3,4,5,6>
+  3826878978U,	// <0,4,3,5>: Cost 4 vuzpl <0,2,4,6>, <3,4,5,6>
+  4095200566U,	// <0,4,3,6>: Cost 4 vtrnl <0,2,3,1>, RHS
+  3713583814U,	// <0,4,3,7>: Cost 4 vext2 <3,7,0,4>, <3,7,0,4>
+  3692350238U,	// <0,4,3,u>: Cost 4 vext2 <0,2,0,4>, <3,u,1,2>
+  2550464552U,	// <0,4,4,0>: Cost 3 vext1 <0,0,4,4>, <0,0,4,4>
+  3962194914U,	// <0,4,4,1>: Cost 4 vzipl <0,4,1,5>, <4,1,5,0>
+  3693677631U,	// <0,4,4,2>: Cost 4 vext2 <0,4,0,4>, <4,2,6,3>
+  3642124467U,	// <0,4,4,3>: Cost 4 vext1 <3,0,4,4>, <3,0,4,4>
+  2718715088U,	// <0,4,4,4>: Cost 3 vext3 <5,6,7,0>, <4,4,4,4>
+  2618608950U,	// <0,4,4,5>: Cost 3 vext2 <0,2,0,4>, RHS
+  2753137974U,	// <0,4,4,6>: Cost 3 vuzpl <0,2,4,6>, RHS
+  3666015255U,	// <0,4,4,7>: Cost 4 vext1 <7,0,4,4>, <7,0,4,4>
+  2618609193U,	// <0,4,4,u>: Cost 3 vext2 <0,2,0,4>, RHS
+  2568388710U,	// <0,4,5,0>: Cost 3 vext1 <3,0,4,5>, LHS
+  2568389526U,	// <0,4,5,1>: Cost 3 vext1 <3,0,4,5>, <1,2,3,0>
+  3636159963U,	// <0,4,5,2>: Cost 4 vext1 <2,0,4,5>, <2,0,4,5>
+  2568390836U,	// <0,4,5,3>: Cost 3 vext1 <3,0,4,5>, <3,0,4,5>
+  2568391990U,	// <0,4,5,4>: Cost 3 vext1 <3,0,4,5>, RHS
+  2718715180U,	// <0,4,5,5>: Cost 3 vext3 <5,6,7,0>, <4,5,5,6>
+  1618136374U,	// <0,4,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592281624U,	// <0,4,5,7>: Cost 3 vext1 <7,0,4,5>, <7,0,4,5>
+  1618136392U,	// <0,4,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2550480938U,	// <0,4,6,0>: Cost 3 vext1 <0,0,4,6>, <0,0,4,6>
+  3826880801U,	// <0,4,6,1>: Cost 4 vuzpl <0,2,4,6>, <6,0,1,2>
+  2562426332U,	// <0,4,6,2>: Cost 3 vext1 <2,0,4,6>, <2,0,4,6>
+  3786190181U,	// <0,4,6,3>: Cost 4 vext3 <4,6,3,0>, <4,6,3,0>
+  2718715252U,	// <0,4,6,4>: Cost 3 vext3 <5,6,7,0>, <4,6,4,6>
+  3826881165U,	// <0,4,6,5>: Cost 4 vuzpl <0,2,4,6>, <6,4,5,6>
+  2712669568U,	// <0,4,6,6>: Cost 3 vext3 <4,6,6,0>, <4,6,6,0>
+  2657760081U,	// <0,4,6,7>: Cost 3 vext2 <6,7,0,4>, <6,7,0,4>
+  2718715284U,	// <0,4,6,u>: Cost 3 vext3 <5,6,7,0>, <4,6,u,2>
+  3654090854U,	// <0,4,7,0>: Cost 4 vext1 <5,0,4,7>, LHS
+  3934229326U,	// <0,4,7,1>: Cost 4 vuzpr <7,0,1,4>, <6,7,0,1>
+  3734156437U,	// <0,4,7,2>: Cost 4 vext2 <7,2,0,4>, <7,2,0,4>
+  3734820070U,	// <0,4,7,3>: Cost 4 vext2 <7,3,0,4>, <7,3,0,4>
+  3654094134U,	// <0,4,7,4>: Cost 4 vext1 <5,0,4,7>, RHS
+  2713259464U,	// <0,4,7,5>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2713333201U,	// <0,4,7,6>: Cost 3 vext3 <4,7,6,0>, <4,7,6,0>
+  3654095866U,	// <0,4,7,7>: Cost 4 vext1 <5,0,4,7>, <7,0,1,2>
+  2713259464U,	// <0,4,7,u>: Cost 3 vext3 <4,7,5,0>, <4,7,5,0>
+  2568413286U,	// <0,4,u,0>: Cost 3 vext1 <3,0,4,u>, LHS
+  2618611502U,	// <0,4,u,1>: Cost 3 vext2 <0,2,0,4>, LHS
+  2753140526U,	// <0,4,u,2>: Cost 3 vuzpl <0,2,4,6>, LHS
+  2568415415U,	// <0,4,u,3>: Cost 3 vext1 <3,0,4,u>, <3,0,4,u>
+  2568416566U,	// <0,4,u,4>: Cost 3 vext1 <3,0,4,u>, RHS
+  1817423158U,	// <0,4,u,5>: Cost 2 vzipl LHS, RHS
+  1947438390U,	// <0,4,u,6>: Cost 2 vtrnl LHS, RHS
+  2592306203U,	// <0,4,u,7>: Cost 3 vext1 <7,0,4,u>, <7,0,4,u>
+  1947438408U,	// <0,4,u,u>: Cost 2 vtrnl LHS, RHS
+  3630219264U,	// <0,5,0,0>: Cost 4 vext1 <1,0,5,0>, <0,0,0,0>
+  2625912934U,	// <0,5,0,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  3692355748U,	// <0,5,0,2>: Cost 4 vext2 <0,2,0,5>, <0,2,0,2>
+  3693019384U,	// <0,5,0,3>: Cost 4 vext2 <0,3,0,5>, <0,3,0,5>
+  3630222646U,	// <0,5,0,4>: Cost 4 vext1 <1,0,5,0>, RHS
+  3699655062U,	// <0,5,0,5>: Cost 4 vext2 <1,4,0,5>, <0,5,0,1>
+  2718715508U,	// <0,5,0,6>: Cost 3 vext3 <5,6,7,0>, <5,0,6,1>
+  3087011126U,	// <0,5,0,7>: Cost 3 vtrnr <0,0,0,0>, RHS
+  2625913501U,	// <0,5,0,u>: Cost 3 vext2 <1,4,0,5>, LHS
+  1500659814U,	// <0,5,1,0>: Cost 2 vext1 <4,0,5,1>, LHS
+  2886520528U,	// <0,5,1,1>: Cost 3 vzipl LHS, <5,1,7,3>
+  2574403176U,	// <0,5,1,2>: Cost 3 vext1 <4,0,5,1>, <2,2,2,2>
+  2574403734U,	// <0,5,1,3>: Cost 3 vext1 <4,0,5,1>, <3,0,1,2>
+  1500662674U,	// <0,5,1,4>: Cost 2 vext1 <4,0,5,1>, <4,0,5,1>
+  2886520836U,	// <0,5,1,5>: Cost 3 vzipl LHS, <5,5,5,5>
+  2886520930U,	// <0,5,1,6>: Cost 3 vzipl LHS, <5,6,7,0>
+  2718715600U,	// <0,5,1,7>: Cost 3 vext3 <5,6,7,0>, <5,1,7,3>
+  1500665646U,	// <0,5,1,u>: Cost 2 vext1 <4,0,5,1>, LHS
+  2556493926U,	// <0,5,2,0>: Cost 3 vext1 <1,0,5,2>, LHS
+  2244546120U,	// <0,5,2,1>: Cost 3 vrev <5,0,1,2>
+  3692357256U,	// <0,5,2,2>: Cost 4 vext2 <0,2,0,5>, <2,2,5,7>
+  2568439994U,	// <0,5,2,3>: Cost 3 vext1 <3,0,5,2>, <3,0,5,2>
+  2556497206U,	// <0,5,2,4>: Cost 3 vext1 <1,0,5,2>, RHS
+  3020738564U,	// <0,5,2,5>: Cost 3 vtrnl LHS, <5,5,5,5>
+  4027877161U,	// <0,5,2,6>: Cost 4 vzipr <0,2,0,2>, <2,4,5,6>
+  3093220662U,	// <0,5,2,7>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3093220663U,	// <0,5,2,u>: Cost 3 vtrnr <1,0,3,2>, RHS
+  3699656854U,	// <0,5,3,0>: Cost 4 vext2 <1,4,0,5>, <3,0,1,2>
+  3699656927U,	// <0,5,3,1>: Cost 4 vext2 <1,4,0,5>, <3,1,0,3>
+  3699657006U,	// <0,5,3,2>: Cost 4 vext2 <1,4,0,5>, <3,2,0,1>
+  3699657116U,	// <0,5,3,3>: Cost 4 vext2 <1,4,0,5>, <3,3,3,3>
+  2637859284U,	// <0,5,3,4>: Cost 3 vext2 <3,4,0,5>, <3,4,0,5>
+  3790319453U,	// <0,5,3,5>: Cost 4 vext3 <5,3,5,0>, <5,3,5,0>
+  3699657354U,	// <0,5,3,6>: Cost 4 vext2 <1,4,0,5>, <3,6,2,7>
+  2716725103U,	// <0,5,3,7>: Cost 3 vext3 <5,3,7,0>, <5,3,7,0>
+  2716798840U,	// <0,5,3,u>: Cost 3 vext3 <5,3,u,0>, <5,3,u,0>
+  2661747602U,	// <0,5,4,0>: Cost 3 vext2 <7,4,0,5>, <4,0,5,1>
+  3630252810U,	// <0,5,4,1>: Cost 4 vext1 <1,0,5,4>, <1,0,5,4>
+  3636225507U,	// <0,5,4,2>: Cost 4 vext1 <2,0,5,4>, <2,0,5,4>
+  3716910172U,	// <0,5,4,3>: Cost 4 vext2 <4,3,0,5>, <4,3,0,5>
+  3962195892U,	// <0,5,4,4>: Cost 4 vzipl <0,4,1,5>, <5,4,5,6>
+  2625916214U,	// <0,5,4,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  3718901071U,	// <0,5,4,6>: Cost 4 vext2 <4,6,0,5>, <4,6,0,5>
+  2718715846U,	// <0,5,4,7>: Cost 3 vext3 <5,6,7,0>, <5,4,7,6>
+  2625916457U,	// <0,5,4,u>: Cost 3 vext2 <1,4,0,5>, RHS
+  3791278034U,	// <0,5,5,0>: Cost 4 vext3 <5,5,0,0>, <5,5,0,0>
+  3791351771U,	// <0,5,5,1>: Cost 4 vext3 <5,5,1,0>, <5,5,1,0>
+  3318386260U,	// <0,5,5,2>: Cost 4 vrev <5,0,2,5>
+  3791499245U,	// <0,5,5,3>: Cost 4 vext3 <5,5,3,0>, <5,5,3,0>
+  3318533734U,	// <0,5,5,4>: Cost 4 vrev <5,0,4,5>
+  2718715908U,	// <0,5,5,5>: Cost 3 vext3 <5,6,7,0>, <5,5,5,5>
+  2657767522U,	// <0,5,5,6>: Cost 3 vext2 <6,7,0,5>, <5,6,7,0>
+  2718715928U,	// <0,5,5,7>: Cost 3 vext3 <5,6,7,0>, <5,5,7,7>
+  2718715937U,	// <0,5,5,u>: Cost 3 vext3 <5,6,7,0>, <5,5,u,7>
+  2592358502U,	// <0,5,6,0>: Cost 3 vext1 <7,0,5,6>, LHS
+  3792015404U,	// <0,5,6,1>: Cost 4 vext3 <5,6,1,0>, <5,6,1,0>
+  3731509754U,	// <0,5,6,2>: Cost 4 vext2 <6,7,0,5>, <6,2,7,3>
+  3785748546U,	// <0,5,6,3>: Cost 4 vext3 <4,5,6,0>, <5,6,3,4>
+  2592361782U,	// <0,5,6,4>: Cost 3 vext1 <7,0,5,6>, RHS
+  2592362594U,	// <0,5,6,5>: Cost 3 vext1 <7,0,5,6>, <5,6,7,0>
+  3785748576U,	// <0,5,6,6>: Cost 4 vext3 <4,5,6,0>, <5,6,6,7>
+  1644974178U,	// <0,5,6,7>: Cost 2 vext3 <5,6,7,0>, <5,6,7,0>
+  1645047915U,	// <0,5,6,u>: Cost 2 vext3 <5,6,u,0>, <5,6,u,0>
+  2562506854U,	// <0,5,7,0>: Cost 3 vext1 <2,0,5,7>, LHS
+  2562507670U,	// <0,5,7,1>: Cost 3 vext1 <2,0,5,7>, <1,2,3,0>
+  2562508262U,	// <0,5,7,2>: Cost 3 vext1 <2,0,5,7>, <2,0,5,7>
+  3636250774U,	// <0,5,7,3>: Cost 4 vext1 <2,0,5,7>, <3,0,1,2>
+  2562510134U,	// <0,5,7,4>: Cost 3 vext1 <2,0,5,7>, RHS
+  2718716072U,	// <0,5,7,5>: Cost 3 vext3 <5,6,7,0>, <5,7,5,7>
+  2718716074U,	// <0,5,7,6>: Cost 3 vext3 <5,6,7,0>, <5,7,6,0>
+  2719379635U,	// <0,5,7,7>: Cost 3 vext3 <5,7,7,0>, <5,7,7,0>
+  2562512686U,	// <0,5,7,u>: Cost 3 vext1 <2,0,5,7>, LHS
+  1500717158U,	// <0,5,u,0>: Cost 2 vext1 <4,0,5,u>, LHS
+  2625918766U,	// <0,5,u,1>: Cost 3 vext2 <1,4,0,5>, LHS
+  2719674583U,	// <0,5,u,2>: Cost 3 vext3 <5,u,2,0>, <5,u,2,0>
+  2568489152U,	// <0,5,u,3>: Cost 3 vext1 <3,0,5,u>, <3,0,5,u>
+  1500720025U,	// <0,5,u,4>: Cost 2 vext1 <4,0,5,u>, <4,0,5,u>
+  2625919130U,	// <0,5,u,5>: Cost 3 vext2 <1,4,0,5>, RHS
+  2586407243U,	// <0,5,u,6>: Cost 3 vext1 <6,0,5,u>, <6,0,5,u>
+  1646301444U,	// <0,5,u,7>: Cost 2 vext3 <5,u,7,0>, <5,u,7,0>
+  1646375181U,	// <0,5,u,u>: Cost 2 vext3 <5,u,u,0>, <5,u,u,0>
+  2586411110U,	// <0,6,0,0>: Cost 3 vext1 <6,0,6,0>, LHS
+  2619949158U,	// <0,6,0,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2619949220U,	// <0,6,0,2>: Cost 3 vext2 <0,4,0,6>, <0,2,0,2>
+  3785748789U,	// <0,6,0,3>: Cost 4 vext3 <4,5,6,0>, <6,0,3,4>
+  2619949386U,	// <0,6,0,4>: Cost 3 vext2 <0,4,0,6>, <0,4,0,6>
+  2586415202U,	// <0,6,0,5>: Cost 3 vext1 <6,0,6,0>, <5,6,7,0>
+  2586415436U,	// <0,6,0,6>: Cost 3 vext1 <6,0,6,0>, <6,0,6,0>
+  2952793398U,	// <0,6,0,7>: Cost 3 vzipr <0,0,0,0>, RHS
+  2619949725U,	// <0,6,0,u>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562531430U,	// <0,6,1,0>: Cost 3 vext1 <2,0,6,1>, LHS
+  3693691700U,	// <0,6,1,1>: Cost 4 vext2 <0,4,0,6>, <1,1,1,1>
+  2886521338U,	// <0,6,1,2>: Cost 3 vzipl LHS, <6,2,7,3>
+  3693691864U,	// <0,6,1,3>: Cost 4 vext2 <0,4,0,6>, <1,3,1,3>
+  2562534710U,	// <0,6,1,4>: Cost 3 vext1 <2,0,6,1>, RHS
+  2580450932U,	// <0,6,1,5>: Cost 3 vext1 <5,0,6,1>, <5,0,6,1>
+  2886521656U,	// <0,6,1,6>: Cost 3 vzipl LHS, <6,6,6,6>
+  2966736182U,	// <0,6,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  2966736183U,	// <0,6,1,u>: Cost 3 vzipr <2,3,0,1>, RHS
+  1500741734U,	// <0,6,2,0>: Cost 2 vext1 <4,0,6,2>, LHS
+  2250518817U,	// <0,6,2,1>: Cost 3 vrev <6,0,1,2>
+  2574485096U,	// <0,6,2,2>: Cost 3 vext1 <4,0,6,2>, <2,2,2,2>
+  2631894694U,	// <0,6,2,3>: Cost 3 vext2 <2,4,0,6>, <2,3,0,1>
+  1500744604U,	// <0,6,2,4>: Cost 2 vext1 <4,0,6,2>, <4,0,6,2>
+  2574487248U,	// <0,6,2,5>: Cost 3 vext1 <4,0,6,2>, <5,1,7,3>
+  3020739384U,	// <0,6,2,6>: Cost 3 vtrnl LHS, <6,6,6,6>
+  2954136886U,	// <0,6,2,7>: Cost 3 vzipr <0,2,0,2>, RHS
+  1500747566U,	// <0,6,2,u>: Cost 2 vext1 <4,0,6,2>, LHS
+  3693693078U,	// <0,6,3,0>: Cost 4 vext2 <0,4,0,6>, <3,0,1,2>
+  3705637136U,	// <0,6,3,1>: Cost 4 vext2 <2,4,0,6>, <3,1,5,7>
+  3705637192U,	// <0,6,3,2>: Cost 4 vext2 <2,4,0,6>, <3,2,3,0>
+  3693693340U,	// <0,6,3,3>: Cost 4 vext2 <0,4,0,6>, <3,3,3,3>
+  2637867477U,	// <0,6,3,4>: Cost 3 vext2 <3,4,0,6>, <3,4,0,6>
+  3705637424U,	// <0,6,3,5>: Cost 4 vext2 <2,4,0,6>, <3,5,1,7>
+  3666154056U,	// <0,6,3,6>: Cost 4 vext1 <7,0,6,3>, <6,3,7,0>
+  2722697800U,	// <0,6,3,7>: Cost 3 vext3 <6,3,7,0>, <6,3,7,0>
+  2722771537U,	// <0,6,3,u>: Cost 3 vext3 <6,3,u,0>, <6,3,u,0>
+  2562556006U,	// <0,6,4,0>: Cost 3 vext1 <2,0,6,4>, LHS
+  4095316257U,	// <0,6,4,1>: Cost 4 vtrnl <0,2,4,6>, <6,0,1,2>
+  2562557420U,	// <0,6,4,2>: Cost 3 vext1 <2,0,6,4>, <2,0,6,4>
+  3636299926U,	// <0,6,4,3>: Cost 4 vext1 <2,0,6,4>, <3,0,1,2>
+  2562559286U,	// <0,6,4,4>: Cost 3 vext1 <2,0,6,4>, RHS
+  2619952438U,	// <0,6,4,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2723287696U,	// <0,6,4,6>: Cost 3 vext3 <6,4,6,0>, <6,4,6,0>
+  4027895094U,	// <0,6,4,7>: Cost 4 vzipr <0,2,0,4>, RHS
+  2619952681U,	// <0,6,4,u>: Cost 3 vext2 <0,4,0,6>, RHS
+  2718716594U,	// <0,6,5,0>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3648250774U,	// <0,6,5,1>: Cost 4 vext1 <4,0,6,5>, <1,2,3,0>
+  3792458436U,	// <0,6,5,2>: Cost 4 vext3 <5,6,7,0>, <6,5,2,7>
+  3705638767U,	// <0,6,5,3>: Cost 5 vext2 <2,4,0,6>, <5,3,7,0>
+  3648252831U,	// <0,6,5,4>: Cost 4 vext1 <4,0,6,5>, <4,0,6,5>
+  3797619416U,	// <0,6,5,5>: Cost 4 vext3 <6,5,5,0>, <6,5,5,0>
+  3792458472U,	// <0,6,5,6>: Cost 4 vext3 <5,6,7,0>, <6,5,6,7>
+  4035202358U,	// <0,6,5,7>: Cost 4 vzipr <1,4,0,5>, RHS
+  2718716594U,	// <0,6,5,u>: Cost 3 vext3 <5,6,7,0>, <6,5,0,7>
+  3786412796U,	// <0,6,6,0>: Cost 4 vext3 <4,6,6,0>, <6,6,0,0>
+  3792458504U,	// <0,6,6,1>: Cost 4 vext3 <5,6,7,0>, <6,6,1,3>
+  3728200126U,	// <0,6,6,2>: Cost 4 vext2 <6,2,0,6>, <6,2,0,6>
+  3798135575U,	// <0,6,6,3>: Cost 4 vext3 <6,6,3,0>, <6,6,3,0>
+  3786412836U,	// <0,6,6,4>: Cost 4 vext3 <4,6,6,0>, <6,6,4,4>
+  3792458543U,	// <0,6,6,5>: Cost 4 vext3 <5,6,7,0>, <6,6,5,6>
+  2718716728U,	// <0,6,6,6>: Cost 3 vext3 <5,6,7,0>, <6,6,6,6>
+  2718716738U,	// <0,6,6,7>: Cost 3 vext3 <5,6,7,0>, <6,6,7,7>
+  2718716747U,	// <0,6,6,u>: Cost 3 vext3 <5,6,7,0>, <6,6,u,7>
+  2718716750U,	// <0,6,7,0>: Cost 3 vext3 <5,6,7,0>, <6,7,0,1>
+  2724909910U,	// <0,6,7,1>: Cost 3 vext3 <6,7,1,0>, <6,7,1,0>
+  3636323823U,	// <0,6,7,2>: Cost 4 vext1 <2,0,6,7>, <2,0,6,7>
+  2725057384U,	// <0,6,7,3>: Cost 3 vext3 <6,7,3,0>, <6,7,3,0>
+  2718716790U,	// <0,6,7,4>: Cost 3 vext3 <5,6,7,0>, <6,7,4,5>
+  2718716800U,	// <0,6,7,5>: Cost 3 vext3 <5,6,7,0>, <6,7,5,6>
+  3792458629U,	// <0,6,7,6>: Cost 4 vext3 <5,6,7,0>, <6,7,6,2>
+  2725352332U,	// <0,6,7,7>: Cost 3 vext3 <6,7,7,0>, <6,7,7,0>
+  2718716822U,	// <0,6,7,u>: Cost 3 vext3 <5,6,7,0>, <6,7,u,1>
+  1500790886U,	// <0,6,u,0>: Cost 2 vext1 <4,0,6,u>, LHS
+  2619954990U,	// <0,6,u,1>: Cost 3 vext2 <0,4,0,6>, LHS
+  2562590192U,	// <0,6,u,2>: Cost 3 vext1 <2,0,6,u>, <2,0,6,u>
+  2725721017U,	// <0,6,u,3>: Cost 3 vext3 <6,u,3,0>, <6,u,3,0>
+  1500793762U,	// <0,6,u,4>: Cost 2 vext1 <4,0,6,u>, <4,0,6,u>
+  2619955354U,	// <0,6,u,5>: Cost 3 vext2 <0,4,0,6>, RHS
+  2725942228U,	// <0,6,u,6>: Cost 3 vext3 <6,u,6,0>, <6,u,6,0>
+  2954186038U,	// <0,6,u,7>: Cost 3 vzipr <0,2,0,u>, RHS
+  1500796718U,	// <0,6,u,u>: Cost 2 vext1 <4,0,6,u>, LHS
+  2256401391U,	// <0,7,0,0>: Cost 3 vrev <7,0,0,0>
+  2632564838U,	// <0,7,0,1>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256548865U,	// <0,7,0,2>: Cost 3 vrev <7,0,2,0>
+  3700998396U,	// <0,7,0,3>: Cost 4 vext2 <1,6,0,7>, <0,3,1,0>
+  2718716952U,	// <0,7,0,4>: Cost 3 vext3 <5,6,7,0>, <7,0,4,5>
+  2718716962U,	// <0,7,0,5>: Cost 3 vext3 <5,6,7,0>, <7,0,5,6>
+  2621284845U,	// <0,7,0,6>: Cost 3 vext2 <0,6,0,7>, <0,6,0,7>
+  3904685542U,	// <0,7,0,7>: Cost 4 vuzpr <2,0,5,7>, <2,0,5,7>
+  2632565405U,	// <0,7,0,u>: Cost 3 vext2 <2,5,0,7>, LHS
+  2256409584U,	// <0,7,1,0>: Cost 3 vrev <7,0,0,1>
+  3706307380U,	// <0,7,1,1>: Cost 4 vext2 <2,5,0,7>, <1,1,1,1>
+  2632565654U,	// <0,7,1,2>: Cost 3 vext2 <2,5,0,7>, <1,2,3,0>
+  3769603168U,	// <0,7,1,3>: Cost 4 vext3 <1,u,3,0>, <7,1,3,5>
+  2256704532U,	// <0,7,1,4>: Cost 3 vrev <7,0,4,1>
+  3769603184U,	// <0,7,1,5>: Cost 4 vext3 <1,u,3,0>, <7,1,5,3>
+  3700999366U,	// <0,7,1,6>: Cost 4 vext2 <1,6,0,7>, <1,6,0,7>
+  2886522476U,	// <0,7,1,7>: Cost 3 vzipl LHS, <7,7,7,7>
+  2256999480U,	// <0,7,1,u>: Cost 3 vrev <7,0,u,1>
+  2586501222U,	// <0,7,2,0>: Cost 3 vext1 <6,0,7,2>, LHS
+  1182749690U,	// <0,7,2,1>: Cost 2 vrev <7,0,1,2>
+  3636356595U,	// <0,7,2,2>: Cost 4 vext1 <2,0,7,2>, <2,0,7,2>
+  2727711916U,	// <0,7,2,3>: Cost 3 vext3 <7,2,3,0>, <7,2,3,0>
+  2586504502U,	// <0,7,2,4>: Cost 3 vext1 <6,0,7,2>, RHS
+  2632566606U,	// <0,7,2,5>: Cost 3 vext2 <2,5,0,7>, <2,5,0,7>
+  2586505559U,	// <0,7,2,6>: Cost 3 vext1 <6,0,7,2>, <6,0,7,2>
+  3020740204U,	// <0,7,2,7>: Cost 3 vtrnl LHS, <7,7,7,7>
+  1183265849U,	// <0,7,2,u>: Cost 2 vrev <7,0,u,2>
+  3701000342U,	// <0,7,3,0>: Cost 4 vext2 <1,6,0,7>, <3,0,1,2>
+  3706308849U,	// <0,7,3,1>: Cost 4 vext2 <2,5,0,7>, <3,1,2,3>
+  3330315268U,	// <0,7,3,2>: Cost 4 vrev <7,0,2,3>
+  3706309020U,	// <0,7,3,3>: Cost 4 vext2 <2,5,0,7>, <3,3,3,3>
+  3706309122U,	// <0,7,3,4>: Cost 4 vext2 <2,5,0,7>, <3,4,5,6>
+  3712281127U,	// <0,7,3,5>: Cost 4 vext2 <3,5,0,7>, <3,5,0,7>
+  2639202936U,	// <0,7,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  3802412321U,	// <0,7,3,7>: Cost 4 vext3 <7,3,7,0>, <7,3,7,0>
+  2640530202U,	// <0,7,3,u>: Cost 3 vext2 <3,u,0,7>, <3,u,0,7>
+  3654287462U,	// <0,7,4,0>: Cost 4 vext1 <5,0,7,4>, LHS
+  2256507900U,	// <0,7,4,1>: Cost 3 vrev <7,0,1,4>
+  2256581637U,	// <0,7,4,2>: Cost 3 vrev <7,0,2,4>
+  3660262008U,	// <0,7,4,3>: Cost 4 vext1 <6,0,7,4>, <3,6,0,7>
+  3786413405U,	// <0,7,4,4>: Cost 4 vext3 <4,6,6,0>, <7,4,4,6>
+  2632568118U,	// <0,7,4,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  3718917457U,	// <0,7,4,6>: Cost 4 vext2 <4,6,0,7>, <4,6,0,7>
+  3787003255U,	// <0,7,4,7>: Cost 4 vext3 <4,7,5,0>, <7,4,7,5>
+  2632568361U,	// <0,7,4,u>: Cost 3 vext2 <2,5,0,7>, RHS
+  3706310268U,	// <0,7,5,0>: Cost 4 vext2 <2,5,0,7>, <5,0,7,0>
+  3792459156U,	// <0,7,5,1>: Cost 4 vext3 <5,6,7,0>, <7,5,1,7>
+  3330331654U,	// <0,7,5,2>: Cost 4 vrev <7,0,2,5>
+  3722899255U,	// <0,7,5,3>: Cost 4 vext2 <5,3,0,7>, <5,3,0,7>
+  2256737304U,	// <0,7,5,4>: Cost 3 vrev <7,0,4,5>
+  3724226521U,	// <0,7,5,5>: Cost 4 vext2 <5,5,0,7>, <5,5,0,7>
+  2718717377U,	// <0,7,5,6>: Cost 3 vext3 <5,6,7,0>, <7,5,6,7>
+  2729997763U,	// <0,7,5,7>: Cost 3 vext3 <7,5,7,0>, <7,5,7,0>
+  2720044499U,	// <0,7,5,u>: Cost 3 vext3 <5,u,7,0>, <7,5,u,7>
+  3712946517U,	// <0,7,6,0>: Cost 4 vext2 <3,6,0,7>, <6,0,7,0>
+  2256524286U,	// <0,7,6,1>: Cost 3 vrev <7,0,1,6>
+  3792459246U,	// <0,7,6,2>: Cost 4 vext3 <5,6,7,0>, <7,6,2,7>
+  3796440567U,	// <0,7,6,3>: Cost 4 vext3 <6,3,7,0>, <7,6,3,7>
+  3654307126U,	// <0,7,6,4>: Cost 4 vext1 <5,0,7,6>, RHS
+  2656457394U,	// <0,7,6,5>: Cost 3 vext2 <6,5,0,7>, <6,5,0,7>
+  3792459281U,	// <0,7,6,6>: Cost 4 vext3 <5,6,7,0>, <7,6,6,6>
+  2730661396U,	// <0,7,6,7>: Cost 3 vext3 <7,6,7,0>, <7,6,7,0>
+  2658448293U,	// <0,7,6,u>: Cost 3 vext2 <6,u,0,7>, <6,u,0,7>
+  3787003431U,	// <0,7,7,0>: Cost 4 vext3 <4,7,5,0>, <7,7,0,1>
+  3654312854U,	// <0,7,7,1>: Cost 4 vext1 <5,0,7,7>, <1,2,3,0>
+  3654313446U,	// <0,7,7,2>: Cost 4 vext1 <5,0,7,7>, <2,0,5,7>
+  3804771905U,	// <0,7,7,3>: Cost 4 vext3 <7,7,3,0>, <7,7,3,0>
+  3654315318U,	// <0,7,7,4>: Cost 4 vext1 <5,0,7,7>, RHS
+  3654315651U,	// <0,7,7,5>: Cost 4 vext1 <5,0,7,7>, <5,0,7,7>
+  3660288348U,	// <0,7,7,6>: Cost 4 vext1 <6,0,7,7>, <6,0,7,7>
+  2718717548U,	// <0,7,7,7>: Cost 3 vext3 <5,6,7,0>, <7,7,7,7>
+  2664420990U,	// <0,7,7,u>: Cost 3 vext2 <7,u,0,7>, <7,u,0,7>
+  2256466935U,	// <0,7,u,0>: Cost 3 vrev <7,0,0,u>
+  1182798848U,	// <0,7,u,1>: Cost 2 vrev <7,0,1,u>
+  2256614409U,	// <0,7,u,2>: Cost 3 vrev <7,0,2,u>
+  2731693714U,	// <0,7,u,3>: Cost 3 vext3 <7,u,3,0>, <7,u,3,0>
+  2256761883U,	// <0,7,u,4>: Cost 3 vrev <7,0,4,u>
+  2632571034U,	// <0,7,u,5>: Cost 3 vext2 <2,5,0,7>, RHS
+  2669066421U,	// <0,7,u,6>: Cost 3 vext2 <u,6,0,7>, <u,6,0,7>
+  2731988662U,	// <0,7,u,7>: Cost 3 vext3 <7,u,7,0>, <7,u,7,0>
+  1183315007U,	// <0,7,u,u>: Cost 2 vrev <7,0,u,u>
+  135053414U,	// <0,u,0,0>: Cost 1 vdup0 LHS
+  1544896614U,	// <0,u,0,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1678999654U,	// <0,u,0,2>: Cost 2 vuzpl LHS, LHS
+  2691880677U,	// <0,u,0,3>: Cost 3 vext3 <1,2,3,0>, <u,0,3,2>
+  1476988214U,	// <0,u,0,4>: Cost 2 vext1 <0,0,u,0>, RHS
+  2718791419U,	// <0,u,0,5>: Cost 3 vext3 <5,6,u,0>, <u,0,5,6>
+  3021248666U,	// <0,u,0,6>: Cost 3 vtrnl <0,2,0,2>, RHS
+  2592535607U,	// <0,u,0,7>: Cost 3 vext1 <7,0,u,0>, <7,0,u,0>
+  135053414U,	// <0,u,0,u>: Cost 1 vdup0 LHS
+  1476993097U,	// <0,u,1,0>: Cost 2 vext1 <0,0,u,1>, <0,0,u,1>
+  1812780846U,	// <0,u,1,1>: Cost 2 vzipl LHS, LHS
+  1618138926U,	// <0,u,1,2>: Cost 2 vext3 <1,2,3,0>, LHS
+  2752742134U,	// <0,u,1,3>: Cost 3 vuzpl LHS, <1,0,3,2>
+  1476996406U,	// <0,u,1,4>: Cost 2 vext1 <0,0,u,1>, RHS
+  1812781210U,	// <0,u,1,5>: Cost 2 vzipl LHS, RHS
+  2887006416U,	// <0,u,1,6>: Cost 3 vzipl LHS, <u,6,3,7>
+  2966736200U,	// <0,u,1,7>: Cost 3 vzipr <2,3,0,1>, RHS
+  1812781413U,	// <0,u,1,u>: Cost 2 vzipl LHS, LHS
+  1482973286U,	// <0,u,2,0>: Cost 2 vext1 <1,0,u,2>, LHS
+  1482973987U,	// <0,u,2,1>: Cost 2 vext1 <1,0,u,2>, <1,0,u,2>
+  1946998574U,	// <0,u,2,2>: Cost 2 vtrnl LHS, LHS
+  835584U,	// <0,u,2,3>: Cost 0 copy LHS
+  1482976566U,	// <0,u,2,4>: Cost 2 vext1 <1,0,u,2>, RHS
+  3020781631U,	// <0,u,2,5>: Cost 3 vtrnl LHS, <u,4,5,6>
+  1946998938U,	// <0,u,2,6>: Cost 2 vtrnl LHS, RHS
+  1518810169U,	// <0,u,2,7>: Cost 2 vext1 <7,0,u,2>, <7,0,u,2>
+  835584U,	// <0,u,2,u>: Cost 0 copy LHS
+  2618640534U,	// <0,u,3,0>: Cost 3 vext2 <0,2,0,u>, <3,0,1,2>
+  2752743574U,	// <0,u,3,1>: Cost 3 vuzpl LHS, <3,0,1,2>
+  2636556597U,	// <0,u,3,2>: Cost 3 vext2 <3,2,0,u>, <3,2,0,u>
+  2752743836U,	// <0,u,3,3>: Cost 3 vuzpl LHS, <3,3,3,3>
+  2618640898U,	// <0,u,3,4>: Cost 3 vext2 <0,2,0,u>, <3,4,5,6>
+  2752743938U,	// <0,u,3,5>: Cost 3 vuzpl LHS, <3,4,5,6>
+  2639202936U,	// <0,u,3,6>: Cost 3 vext2 <3,6,0,7>, <3,6,0,7>
+  2639874762U,	// <0,u,3,7>: Cost 3 vext2 <3,7,0,u>, <3,7,0,u>
+  2752743637U,	// <0,u,3,u>: Cost 3 vuzpl LHS, <3,0,u,2>
+  2562703462U,	// <0,u,4,0>: Cost 3 vext1 <2,0,u,4>, LHS
+  2888455982U,	// <0,u,4,1>: Cost 3 vzipl <0,4,1,5>, LHS
+  3021575982U,	// <0,u,4,2>: Cost 3 vtrnl <0,2,4,6>, LHS
+  2568677591U,	// <0,u,4,3>: Cost 3 vext1 <3,0,u,4>, <3,0,u,4>
+  2562706742U,	// <0,u,4,4>: Cost 3 vext1 <2,0,u,4>, RHS
+  1544899894U,	// <0,u,4,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679002934U,	// <0,u,4,6>: Cost 2 vuzpl LHS, RHS
+  2718718033U,	// <0,u,4,7>: Cost 3 vext3 <5,6,7,0>, <u,4,7,6>
+  1679002952U,	// <0,u,4,u>: Cost 2 vuzpl LHS, RHS
+  2568683622U,	// <0,u,5,0>: Cost 3 vext1 <3,0,u,5>, LHS
+  2568684438U,	// <0,u,5,1>: Cost 3 vext1 <3,0,u,5>, <1,2,3,0>
+  3765622902U,	// <0,u,5,2>: Cost 4 vext3 <1,2,3,0>, <u,5,2,7>
+  2691881087U,	// <0,u,5,3>: Cost 3 vext3 <1,2,3,0>, <u,5,3,7>
+  2568686902U,	// <0,u,5,4>: Cost 3 vext1 <3,0,u,5>, RHS
+  2650492890U,	// <0,u,5,5>: Cost 3 vext2 <5,5,0,u>, <5,5,0,u>
+  1618139290U,	// <0,u,5,6>: Cost 2 vext3 <1,2,3,0>, RHS
+  2824834358U,	// <0,u,5,7>: Cost 3 vuzpr <1,0,3,u>, RHS
+  1618139308U,	// <0,u,5,u>: Cost 2 vext3 <1,2,3,0>, RHS
+  2592579686U,	// <0,u,6,0>: Cost 3 vext1 <7,0,u,6>, LHS
+  2262496983U,	// <0,u,6,1>: Cost 3 vrev <u,0,1,6>
+  2654474688U,	// <0,u,6,2>: Cost 3 vext2 <6,2,0,u>, <6,2,0,u>
+  2691881168U,	// <0,u,6,3>: Cost 3 vext3 <1,2,3,0>, <u,6,3,7>
+  2592582966U,	// <0,u,6,4>: Cost 3 vext1 <7,0,u,6>, RHS
+  2656465587U,	// <0,u,6,5>: Cost 3 vext2 <6,5,0,u>, <6,5,0,u>
+  2657129220U,	// <0,u,6,6>: Cost 3 vext2 <6,6,0,u>, <6,6,0,u>
+  1584051029U,	// <0,u,6,7>: Cost 2 vext2 <6,7,0,u>, <6,7,0,u>
+  1584714662U,	// <0,u,6,u>: Cost 2 vext2 <6,u,0,u>, <6,u,0,u>
+  2562728038U,	// <0,u,7,0>: Cost 3 vext1 <2,0,u,7>, LHS
+  2562728854U,	// <0,u,7,1>: Cost 3 vext1 <2,0,u,7>, <1,2,3,0>
+  2562729473U,	// <0,u,7,2>: Cost 3 vext1 <2,0,u,7>, <2,0,u,7>
+  2661111018U,	// <0,u,7,3>: Cost 3 vext2 <7,3,0,u>, <7,3,0,u>
+  2562731318U,	// <0,u,7,4>: Cost 3 vext1 <2,0,u,7>, RHS
+  2718718258U,	// <0,u,7,5>: Cost 3 vext3 <5,6,7,0>, <u,7,5,6>
+  2586620261U,	// <0,u,7,6>: Cost 3 vext1 <6,0,u,7>, <6,0,u,7>
+  2657793644U,	// <0,u,7,7>: Cost 3 vext2 <6,7,0,u>, <7,7,7,7>
+  2562733870U,	// <0,u,7,u>: Cost 3 vext1 <2,0,u,7>, LHS
+  135053414U,	// <0,u,u,0>: Cost 1 vdup0 LHS
+  1544902446U,	// <0,u,u,1>: Cost 2 vext2 <0,2,0,u>, LHS
+  1679005486U,	// <0,u,u,2>: Cost 2 vuzpl LHS, LHS
+  835584U,	// <0,u,u,3>: Cost 0 copy LHS
+  1483025718U,	// <0,u,u,4>: Cost 2 vext1 <1,0,u,u>, RHS
+  1544902810U,	// <0,u,u,5>: Cost 2 vext2 <0,2,0,u>, RHS
+  1679005850U,	// <0,u,u,6>: Cost 2 vuzpl LHS, RHS
+  1518859327U,	// <0,u,u,7>: Cost 2 vext1 <7,0,u,u>, <7,0,u,u>
+  835584U,	// <0,u,u,u>: Cost 0 copy LHS
+  2689744896U,	// <1,0,0,0>: Cost 3 vext3 <0,u,1,1>, <0,0,0,0>
+  1610694666U,	// <1,0,0,1>: Cost 2 vext3 <0,0,1,1>, <0,0,1,1>
+  2689744916U,	// <1,0,0,2>: Cost 3 vext3 <0,u,1,1>, <0,0,2,2>
+  2619310332U,	// <1,0,0,3>: Cost 3 vext2 <0,3,1,0>, <0,3,1,0>
+  2684657701U,	// <1,0,0,4>: Cost 3 vext3 <0,0,4,1>, <0,0,4,1>
+  2620637598U,	// <1,0,0,5>: Cost 3 vext2 <0,5,1,0>, <0,5,1,0>
+  3708977654U,	// <1,0,0,6>: Cost 4 vext2 <3,0,1,0>, <0,6,1,7>
+  3666351168U,	// <1,0,0,7>: Cost 4 vext1 <7,1,0,0>, <7,1,0,0>
+  1611210825U,	// <1,0,0,u>: Cost 2 vext3 <0,0,u,1>, <0,0,u,1>
+  2556780646U,	// <1,0,1,0>: Cost 3 vext1 <1,1,0,1>, LHS
+  2556781355U,	// <1,0,1,1>: Cost 3 vext1 <1,1,0,1>, <1,1,0,1>
+  1616003174U,	// <1,0,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  3693052888U,	// <1,0,1,3>: Cost 4 vext2 <0,3,1,0>, <1,3,1,3>
+  2556783926U,	// <1,0,1,4>: Cost 3 vext1 <1,1,0,1>, RHS
+  2580672143U,	// <1,0,1,5>: Cost 3 vext1 <5,1,0,1>, <5,1,0,1>
+  2724839566U,	// <1,0,1,6>: Cost 3 vext3 <6,7,0,1>, <0,1,6,7>
+  3654415354U,	// <1,0,1,7>: Cost 4 vext1 <5,1,0,1>, <7,0,1,2>
+  1616003228U,	// <1,0,1,u>: Cost 2 vext3 <0,u,1,1>, LHS
+  2685690019U,	// <1,0,2,0>: Cost 3 vext3 <0,2,0,1>, <0,2,0,1>
+  2685763756U,	// <1,0,2,1>: Cost 3 vext3 <0,2,1,1>, <0,2,1,1>
+  2698297524U,	// <1,0,2,2>: Cost 3 vext3 <2,3,0,1>, <0,2,2,0>
+  2685911230U,	// <1,0,2,3>: Cost 3 vext3 <0,2,3,1>, <0,2,3,1>
+  2689745100U,	// <1,0,2,4>: Cost 3 vext3 <0,u,1,1>, <0,2,4,6>
+  3764814038U,	// <1,0,2,5>: Cost 4 vext3 <1,1,1,1>, <0,2,5,7>
+  2724839640U,	// <1,0,2,6>: Cost 3 vext3 <6,7,0,1>, <0,2,6,0>
+  2592625658U,	// <1,0,2,7>: Cost 3 vext1 <7,1,0,2>, <7,0,1,2>
+  2686279915U,	// <1,0,2,u>: Cost 3 vext3 <0,2,u,1>, <0,2,u,1>
+  3087843328U,	// <1,0,3,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  3087843338U,	// <1,0,3,1>: Cost 3 vtrnr LHS, <0,0,1,1>
+  67944550U,	// <1,0,3,2>: Cost 1 vrev LHS
+  2568743135U,	// <1,0,3,3>: Cost 3 vext1 <3,1,0,3>, <3,1,0,3>
+  2562772278U,	// <1,0,3,4>: Cost 3 vext1 <2,1,0,3>, RHS
+  4099850454U,	// <1,0,3,5>: Cost 4 vtrnl <1,0,3,2>, <0,2,5,7>
+  3704998538U,	// <1,0,3,6>: Cost 4 vext2 <2,3,1,0>, <3,6,2,7>
+  2592633923U,	// <1,0,3,7>: Cost 3 vext1 <7,1,0,3>, <7,1,0,3>
+  68386972U,	// <1,0,3,u>: Cost 1 vrev LHS
+  2620640146U,	// <1,0,4,0>: Cost 3 vext2 <0,5,1,0>, <4,0,5,1>
+  2689745234U,	// <1,0,4,1>: Cost 3 vext3 <0,u,1,1>, <0,4,1,5>
+  2689745244U,	// <1,0,4,2>: Cost 3 vext3 <0,u,1,1>, <0,4,2,6>
+  3760980320U,	// <1,0,4,3>: Cost 4 vext3 <0,4,3,1>, <0,4,3,1>
+  3761054057U,	// <1,0,4,4>: Cost 4 vext3 <0,4,4,1>, <0,4,4,1>
+  2619313462U,	// <1,0,4,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  3761201531U,	// <1,0,4,6>: Cost 4 vext3 <0,4,6,1>, <0,4,6,1>
+  3666383940U,	// <1,0,4,7>: Cost 4 vext1 <7,1,0,4>, <7,1,0,4>
+  2619313705U,	// <1,0,4,u>: Cost 3 vext2 <0,3,1,0>, RHS
+  4029300736U,	// <1,0,5,0>: Cost 4 vzipr <0,4,1,5>, <0,0,0,0>
+  2895249510U,	// <1,0,5,1>: Cost 3 vzipl <1,5,3,7>, LHS
+  3028287590U,	// <1,0,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3642501345U,	// <1,0,5,3>: Cost 4 vext1 <3,1,0,5>, <3,1,0,5>
+  2215592058U,	// <1,0,5,4>: Cost 3 vrev <0,1,4,5>
+  3724242907U,	// <1,0,5,5>: Cost 4 vext2 <5,5,1,0>, <5,5,1,0>
+  3724906540U,	// <1,0,5,6>: Cost 4 vext2 <5,6,1,0>, <5,6,1,0>
+  3911118134U,	// <1,0,5,7>: Cost 4 vuzpr <3,1,3,0>, RHS
+  3028287644U,	// <1,0,5,u>: Cost 3 vtrnl <1,3,5,7>, LHS
+  3762086375U,	// <1,0,6,0>: Cost 4 vext3 <0,6,0,1>, <0,6,0,1>
+  2698297846U,	// <1,0,6,1>: Cost 3 vext3 <2,3,0,1>, <0,6,1,7>
+  3760022015U,	// <1,0,6,2>: Cost 4 vext3 <0,2,u,1>, <0,6,2,7>
+  3642509538U,	// <1,0,6,3>: Cost 4 vext1 <3,1,0,6>, <3,1,0,6>
+  3762381323U,	// <1,0,6,4>: Cost 4 vext3 <0,6,4,1>, <0,6,4,1>
+  3730215604U,	// <1,0,6,5>: Cost 4 vext2 <6,5,1,0>, <6,5,1,0>
+  3730879237U,	// <1,0,6,6>: Cost 4 vext2 <6,6,1,0>, <6,6,1,0>
+  2657801046U,	// <1,0,6,7>: Cost 3 vext2 <6,7,1,0>, <6,7,1,0>
+  2658464679U,	// <1,0,6,u>: Cost 3 vext2 <6,u,1,0>, <6,u,1,0>
+  2659128312U,	// <1,0,7,0>: Cost 3 vext2 <7,0,1,0>, <7,0,1,0>
+  4047898278U,	// <1,0,7,1>: Cost 4 vzipr <3,5,1,7>, <2,3,0,1>
+  2215460970U,	// <1,0,7,2>: Cost 3 vrev <0,1,2,7>
+  3734861035U,	// <1,0,7,3>: Cost 4 vext2 <7,3,1,0>, <7,3,1,0>
+  3731543398U,	// <1,0,7,4>: Cost 4 vext2 <6,7,1,0>, <7,4,5,6>
+  3736188301U,	// <1,0,7,5>: Cost 4 vext2 <7,5,1,0>, <7,5,1,0>
+  2663110110U,	// <1,0,7,6>: Cost 3 vext2 <7,6,1,0>, <7,6,1,0>
+  3731543660U,	// <1,0,7,7>: Cost 4 vext2 <6,7,1,0>, <7,7,7,7>
+  2664437376U,	// <1,0,7,u>: Cost 3 vext2 <7,u,1,0>, <7,u,1,0>
+  3087884288U,	// <1,0,u,0>: Cost 3 vtrnr LHS, <0,0,0,0>
+  1616003730U,	// <1,0,u,1>: Cost 2 vext3 <0,u,1,1>, <0,u,1,1>
+  67985515U,	// <1,0,u,2>: Cost 1 vrev LHS
+  2689893028U,	// <1,0,u,3>: Cost 3 vext3 <0,u,3,1>, <0,u,3,1>
+  2689745586U,	// <1,0,u,4>: Cost 3 vext3 <0,u,1,1>, <0,u,4,6>
+  2619316378U,	// <1,0,u,5>: Cost 3 vext2 <0,3,1,0>, RHS
+  2669082807U,	// <1,0,u,6>: Cost 3 vext2 <u,6,1,0>, <u,6,1,0>
+  2592674888U,	// <1,0,u,7>: Cost 3 vext1 <7,1,0,u>, <7,1,0,u>
+  68427937U,	// <1,0,u,u>: Cost 1 vrev LHS
+  1543585802U,	// <1,1,0,0>: Cost 2 vext2 <0,0,1,1>, <0,0,1,1>
+  1548894310U,	// <1,1,0,1>: Cost 2 vext2 <0,u,1,1>, LHS
+  2618654892U,	// <1,1,0,2>: Cost 3 vext2 <0,2,1,1>, <0,2,1,1>
+  2689745654U,	// <1,1,0,3>: Cost 3 vext3 <0,u,1,1>, <1,0,3,2>
+  2622636370U,	// <1,1,0,4>: Cost 3 vext2 <0,u,1,1>, <0,4,1,5>
+  2620645791U,	// <1,1,0,5>: Cost 3 vext2 <0,5,1,1>, <0,5,1,1>
+  3696378367U,	// <1,1,0,6>: Cost 4 vext2 <0,u,1,1>, <0,6,2,7>
+  3666424905U,	// <1,1,0,7>: Cost 4 vext1 <7,1,1,0>, <7,1,1,0>
+  1548894866U,	// <1,1,0,u>: Cost 2 vext2 <0,u,1,1>, <0,u,1,1>
+  1483112550U,	// <1,1,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U,	// <1,1,1,1>: Cost 1 vdup1 LHS
+  2622636950U,	// <1,1,1,2>: Cost 3 vext2 <0,u,1,1>, <1,2,3,0>
+  2622637016U,	// <1,1,1,3>: Cost 3 vext2 <0,u,1,1>, <1,3,1,3>
+  1483115830U,	// <1,1,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2622637200U,	// <1,1,1,5>: Cost 3 vext2 <0,u,1,1>, <1,5,3,7>
+  2622637263U,	// <1,1,1,6>: Cost 3 vext2 <0,u,1,1>, <1,6,1,7>
+  2592691274U,	// <1,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U,	// <1,1,1,u>: Cost 1 vdup1 LHS
+  2550890588U,	// <1,1,2,0>: Cost 3 vext1 <0,1,1,2>, <0,1,1,2>
+  2617329183U,	// <1,1,2,1>: Cost 3 vext2 <0,0,1,1>, <2,1,3,1>
+  2622637672U,	// <1,1,2,2>: Cost 3 vext2 <0,u,1,1>, <2,2,2,2>
+  2622637734U,	// <1,1,2,3>: Cost 3 vext2 <0,u,1,1>, <2,3,0,1>
+  2550893878U,	// <1,1,2,4>: Cost 3 vext1 <0,1,1,2>, RHS
+  3696379744U,	// <1,1,2,5>: Cost 4 vext2 <0,u,1,1>, <2,5,2,7>
+  2622638010U,	// <1,1,2,6>: Cost 3 vext2 <0,u,1,1>, <2,6,3,7>
+  3804554170U,	// <1,1,2,7>: Cost 4 vext3 <7,7,0,1>, <1,2,7,0>
+  2622638139U,	// <1,1,2,u>: Cost 3 vext2 <0,u,1,1>, <2,u,0,1>
+  2622638230U,	// <1,1,3,0>: Cost 3 vext2 <0,u,1,1>, <3,0,1,2>
+  3087844148U,	// <1,1,3,1>: Cost 3 vtrnr LHS, <1,1,1,1>
+  4161585244U,	// <1,1,3,2>: Cost 4 vtrnr LHS, <0,1,1,2>
+  2014101606U,	// <1,1,3,3>: Cost 2 vtrnr LHS, LHS
+  2622638594U,	// <1,1,3,4>: Cost 3 vext2 <0,u,1,1>, <3,4,5,6>
+  2689745920U,	// <1,1,3,5>: Cost 3 vext3 <0,u,1,1>, <1,3,5,7>
+  3763487753U,	// <1,1,3,6>: Cost 4 vext3 <0,u,1,1>, <1,3,6,7>
+  2592707660U,	// <1,1,3,7>: Cost 3 vext1 <7,1,1,3>, <7,1,1,3>
+  2014101611U,	// <1,1,3,u>: Cost 2 vtrnr LHS, LHS
+  2556878950U,	// <1,1,4,0>: Cost 3 vext1 <1,1,1,4>, LHS
+  2221335351U,	// <1,1,4,1>: Cost 3 vrev <1,1,1,4>
+  3696380988U,	// <1,1,4,2>: Cost 4 vext2 <0,u,1,1>, <4,2,6,0>
+  3763487805U,	// <1,1,4,3>: Cost 4 vext3 <0,u,1,1>, <1,4,3,5>
+  2556882230U,	// <1,1,4,4>: Cost 3 vext1 <1,1,1,4>, RHS
+  1548897590U,	// <1,1,4,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2758184246U,	// <1,1,4,6>: Cost 3 vuzpl <1,1,1,1>, RHS
+  3666457677U,	// <1,1,4,7>: Cost 4 vext1 <7,1,1,4>, <7,1,1,4>
+  1548897833U,	// <1,1,4,u>: Cost 2 vext2 <0,u,1,1>, RHS
+  2693653615U,	// <1,1,5,0>: Cost 3 vext3 <1,5,0,1>, <1,5,0,1>
+  2617331408U,	// <1,1,5,1>: Cost 3 vext2 <0,0,1,1>, <5,1,7,3>
+  4029302934U,	// <1,1,5,2>: Cost 4 vzipr <0,4,1,5>, <3,0,1,2>
+  2689746064U,	// <1,1,5,3>: Cost 3 vext3 <0,u,1,1>, <1,5,3,7>
+  2221564755U,	// <1,1,5,4>: Cost 3 vrev <1,1,4,5>
+  2955559250U,	// <1,1,5,5>: Cost 3 vzipr <0,4,1,5>, <0,4,1,5>
+  2617331810U,	// <1,1,5,6>: Cost 3 vext2 <0,0,1,1>, <5,6,7,0>
+  2825293110U,	// <1,1,5,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  2689746109U,	// <1,1,5,u>: Cost 3 vext3 <0,u,1,1>, <1,5,u,7>
+  3696382241U,	// <1,1,6,0>: Cost 4 vext2 <0,u,1,1>, <6,0,1,2>
+  2689746127U,	// <1,1,6,1>: Cost 3 vext3 <0,u,1,1>, <1,6,1,7>
+  2617332218U,	// <1,1,6,2>: Cost 3 vext2 <0,0,1,1>, <6,2,7,3>
+  3763487969U,	// <1,1,6,3>: Cost 4 vext3 <0,u,1,1>, <1,6,3,7>
+  3696382605U,	// <1,1,6,4>: Cost 4 vext2 <0,u,1,1>, <6,4,5,6>
+  4029309266U,	// <1,1,6,5>: Cost 4 vzipr <0,4,1,6>, <0,4,1,5>
+  2617332536U,	// <1,1,6,6>: Cost 3 vext2 <0,0,1,1>, <6,6,6,6>
+  2724840702U,	// <1,1,6,7>: Cost 3 vext3 <6,7,0,1>, <1,6,7,0>
+  2725504263U,	// <1,1,6,u>: Cost 3 vext3 <6,u,0,1>, <1,6,u,0>
+  2617332720U,	// <1,1,7,0>: Cost 3 vext2 <0,0,1,1>, <7,0,0,1>
+  2659800138U,	// <1,1,7,1>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  3691074717U,	// <1,1,7,2>: Cost 4 vext2 <0,0,1,1>, <7,2,1,3>
+  4167811174U,	// <1,1,7,3>: Cost 4 vtrnr <1,1,5,7>, LHS
+  2617333094U,	// <1,1,7,4>: Cost 3 vext2 <0,0,1,1>, <7,4,5,6>
+  3295396702U,	// <1,1,7,5>: Cost 4 vrev <1,1,5,7>
+  3803891014U,	// <1,1,7,6>: Cost 4 vext3 <7,6,0,1>, <1,7,6,0>
+  2617333356U,	// <1,1,7,7>: Cost 3 vext2 <0,0,1,1>, <7,7,7,7>
+  2659800138U,	// <1,1,7,u>: Cost 3 vext2 <7,1,1,1>, <7,1,1,1>
+  1483112550U,	// <1,1,u,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U,	// <1,1,u,1>: Cost 1 vdup1 LHS
+  2622642056U,	// <1,1,u,2>: Cost 3 vext2 <0,u,1,1>, <u,2,3,3>
+  2014142566U,	// <1,1,u,3>: Cost 2 vtrnr LHS, LHS
+  1483115830U,	// <1,1,u,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  1548900506U,	// <1,1,u,5>: Cost 2 vext2 <0,u,1,1>, RHS
+  2622642384U,	// <1,1,u,6>: Cost 3 vext2 <0,u,1,1>, <u,6,3,7>
+  2825293353U,	// <1,1,u,7>: Cost 3 vuzpr <1,1,1,1>, RHS
+  202162278U,	// <1,1,u,u>: Cost 1 vdup1 LHS
+  2635251712U,	// <1,2,0,0>: Cost 3 vext2 <3,0,1,2>, <0,0,0,0>
+  1561509990U,	// <1,2,0,1>: Cost 2 vext2 <3,0,1,2>, LHS
+  2618663085U,	// <1,2,0,2>: Cost 3 vext2 <0,2,1,2>, <0,2,1,2>
+  2696529358U,	// <1,2,0,3>: Cost 3 vext3 <2,0,3,1>, <2,0,3,1>
+  2635252050U,	// <1,2,0,4>: Cost 3 vext2 <3,0,1,2>, <0,4,1,5>
+  3769533926U,	// <1,2,0,5>: Cost 4 vext3 <1,u,2,1>, <2,0,5,7>
+  2621317617U,	// <1,2,0,6>: Cost 3 vext2 <0,6,1,2>, <0,6,1,2>
+  2659140170U,	// <1,2,0,7>: Cost 3 vext2 <7,0,1,2>, <0,7,2,1>
+  1561510557U,	// <1,2,0,u>: Cost 2 vext2 <3,0,1,2>, LHS
+  2623308516U,	// <1,2,1,0>: Cost 3 vext2 <1,0,1,2>, <1,0,1,2>
+  2635252532U,	// <1,2,1,1>: Cost 3 vext2 <3,0,1,2>, <1,1,1,1>
+  2631271318U,	// <1,2,1,2>: Cost 3 vext2 <2,3,1,2>, <1,2,3,0>
+  2958180454U,	// <1,2,1,3>: Cost 3 vzipr <0,u,1,1>, LHS
+  2550959414U,	// <1,2,1,4>: Cost 3 vext1 <0,1,2,1>, RHS
+  2635252880U,	// <1,2,1,5>: Cost 3 vext2 <3,0,1,2>, <1,5,3,7>
+  2635252952U,	// <1,2,1,6>: Cost 3 vext2 <3,0,1,2>, <1,6,2,7>
+  3732882731U,	// <1,2,1,7>: Cost 4 vext2 <7,0,1,2>, <1,7,3,0>
+  2958180459U,	// <1,2,1,u>: Cost 3 vzipr <0,u,1,1>, LHS
+  2629281213U,	// <1,2,2,0>: Cost 3 vext2 <2,0,1,2>, <2,0,1,2>
+  2635253280U,	// <1,2,2,1>: Cost 3 vext2 <3,0,1,2>, <2,1,3,2>
+  2618664552U,	// <1,2,2,2>: Cost 3 vext2 <0,2,1,2>, <2,2,2,2>
+  2689746546U,	// <1,2,2,3>: Cost 3 vext3 <0,u,1,1>, <2,2,3,3>
+  3764815485U,	// <1,2,2,4>: Cost 4 vext3 <1,1,1,1>, <2,2,4,5>
+  3760023176U,	// <1,2,2,5>: Cost 4 vext3 <0,2,u,1>, <2,2,5,7>
+  2635253690U,	// <1,2,2,6>: Cost 3 vext2 <3,0,1,2>, <2,6,3,7>
+  2659141610U,	// <1,2,2,7>: Cost 3 vext2 <7,0,1,2>, <2,7,0,1>
+  2689746591U,	// <1,2,2,u>: Cost 3 vext3 <0,u,1,1>, <2,2,u,3>
+  403488870U,	// <1,2,3,0>: Cost 1 vext1 LHS, LHS
+  1477231350U,	// <1,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477232232U,	// <1,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477233052U,	// <1,2,3,3>: Cost 2 vext1 LHS, <3,3,3,3>
+  403492150U,	// <1,2,3,4>: Cost 1 vext1 LHS, RHS
+  1525010128U,	// <1,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1525010938U,	// <1,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525011450U,	// <1,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403494702U,	// <1,2,3,u>: Cost 1 vext1 LHS, LHS
+  2641226607U,	// <1,2,4,0>: Cost 3 vext2 <4,0,1,2>, <4,0,1,2>
+  3624723446U,	// <1,2,4,1>: Cost 4 vext1 <0,1,2,4>, <1,3,4,6>
+  3301123609U,	// <1,2,4,2>: Cost 4 vrev <2,1,2,4>
+  2598759198U,	// <1,2,4,3>: Cost 3 vext1 <u,1,2,4>, <3,u,1,2>
+  2659142864U,	// <1,2,4,4>: Cost 3 vext2 <7,0,1,2>, <4,4,4,4>
+  1561513270U,	// <1,2,4,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  2659143028U,	// <1,2,4,6>: Cost 3 vext2 <7,0,1,2>, <4,6,4,6>
+  2659143112U,	// <1,2,4,7>: Cost 3 vext2 <7,0,1,2>, <4,7,5,0>
+  1561513513U,	// <1,2,4,u>: Cost 2 vext2 <3,0,1,2>, RHS
+  2550988902U,	// <1,2,5,0>: Cost 3 vext1 <0,1,2,5>, LHS
+  2550989824U,	// <1,2,5,1>: Cost 3 vext1 <0,1,2,5>, <1,3,5,7>
+  3624732264U,	// <1,2,5,2>: Cost 4 vext1 <0,1,2,5>, <2,2,2,2>
+  2955559014U,	// <1,2,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2550992182U,	// <1,2,5,4>: Cost 3 vext1 <0,1,2,5>, RHS
+  2659143684U,	// <1,2,5,5>: Cost 3 vext2 <7,0,1,2>, <5,5,5,5>
+  2659143778U,	// <1,2,5,6>: Cost 3 vext2 <7,0,1,2>, <5,6,7,0>
+  2659143848U,	// <1,2,5,7>: Cost 3 vext2 <7,0,1,2>, <5,7,5,7>
+  2550994734U,	// <1,2,5,u>: Cost 3 vext1 <0,1,2,5>, LHS
+  2700289945U,	// <1,2,6,0>: Cost 3 vext3 <2,6,0,1>, <2,6,0,1>
+  2635256232U,	// <1,2,6,1>: Cost 3 vext2 <3,0,1,2>, <6,1,7,2>
+  2659144186U,	// <1,2,6,2>: Cost 3 vext2 <7,0,1,2>, <6,2,7,3>
+  2689746874U,	// <1,2,6,3>: Cost 3 vext3 <0,u,1,1>, <2,6,3,7>
+  3763488705U,	// <1,2,6,4>: Cost 4 vext3 <0,u,1,1>, <2,6,4,5>
+  3763488716U,	// <1,2,6,5>: Cost 4 vext3 <0,u,1,1>, <2,6,5,7>
+  2659144504U,	// <1,2,6,6>: Cost 3 vext2 <7,0,1,2>, <6,6,6,6>
+  2657817432U,	// <1,2,6,7>: Cost 3 vext2 <6,7,1,2>, <6,7,1,2>
+  2689746919U,	// <1,2,6,u>: Cost 3 vext3 <0,u,1,1>, <2,6,u,7>
+  1585402874U,	// <1,2,7,0>: Cost 2 vext2 <7,0,1,2>, <7,0,1,2>
+  2659144770U,	// <1,2,7,1>: Cost 3 vext2 <7,0,1,2>, <7,1,0,2>
+  3708998858U,	// <1,2,7,2>: Cost 4 vext2 <3,0,1,2>, <7,2,6,3>
+  2635257059U,	// <1,2,7,3>: Cost 3 vext2 <3,0,1,2>, <7,3,0,1>
+  2659145062U,	// <1,2,7,4>: Cost 3 vext2 <7,0,1,2>, <7,4,5,6>
+  3732886916U,	// <1,2,7,5>: Cost 4 vext2 <7,0,1,2>, <7,5,0,0>
+  3732886998U,	// <1,2,7,6>: Cost 4 vext2 <7,0,1,2>, <7,6,0,1>
+  2659145255U,	// <1,2,7,7>: Cost 3 vext2 <7,0,1,2>, <7,7,0,1>
+  1590711938U,	// <1,2,7,u>: Cost 2 vext2 <7,u,1,2>, <7,u,1,2>
+  403529835U,	// <1,2,u,0>: Cost 1 vext1 LHS, LHS
+  1477272310U,	// <1,2,u,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1477273192U,	// <1,2,u,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1477273750U,	// <1,2,u,3>: Cost 2 vext1 LHS, <3,0,1,2>
+  403533110U,	// <1,2,u,4>: Cost 1 vext1 LHS, RHS
+  1561516186U,	// <1,2,u,5>: Cost 2 vext2 <3,0,1,2>, RHS
+  1525051898U,	// <1,2,u,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1525052410U,	// <1,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  403535662U,	// <1,2,u,u>: Cost 1 vext1 LHS, LHS
+  2819407872U,	// <1,3,0,0>: Cost 3 vuzpr LHS, <0,0,0,0>
+  1551564902U,	// <1,3,0,1>: Cost 2 vext2 <1,3,1,3>, LHS
+  2819408630U,	// <1,3,0,2>: Cost 3 vuzpr LHS, <1,0,3,2>
+  2619334911U,	// <1,3,0,3>: Cost 3 vext2 <0,3,1,3>, <0,3,1,3>
+  2625306962U,	// <1,3,0,4>: Cost 3 vext2 <1,3,1,3>, <0,4,1,5>
+  3832725879U,	// <1,3,0,5>: Cost 4 vuzpl <1,2,3,0>, <0,4,5,6>
+  3699048959U,	// <1,3,0,6>: Cost 4 vext2 <1,3,1,3>, <0,6,2,7>
+  3776538827U,	// <1,3,0,7>: Cost 4 vext3 <3,0,7,1>, <3,0,7,1>
+  1551565469U,	// <1,3,0,u>: Cost 2 vext2 <1,3,1,3>, LHS
+  2618671862U,	// <1,3,1,0>: Cost 3 vext2 <0,2,1,3>, <1,0,3,2>
+  2819408692U,	// <1,3,1,1>: Cost 3 vuzpr LHS, <1,1,1,1>
+  2624643975U,	// <1,3,1,2>: Cost 3 vext2 <1,2,1,3>, <1,2,1,3>
+  1745666150U,	// <1,3,1,3>: Cost 2 vuzpr LHS, LHS
+  2557005110U,	// <1,3,1,4>: Cost 3 vext1 <1,1,3,1>, RHS
+  2625307792U,	// <1,3,1,5>: Cost 3 vext2 <1,3,1,3>, <1,5,3,7>
+  3698386127U,	// <1,3,1,6>: Cost 4 vext2 <1,2,1,3>, <1,6,1,7>
+  2592838748U,	// <1,3,1,7>: Cost 3 vext1 <7,1,3,1>, <7,1,3,1>
+  1745666155U,	// <1,3,1,u>: Cost 2 vuzpr LHS, LHS
+  2819408790U,	// <1,3,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2625308193U,	// <1,3,2,1>: Cost 3 vext2 <1,3,1,3>, <2,1,3,3>
+  2819408036U,	// <1,3,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819851890U,	// <1,3,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819408794U,	// <1,3,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  3893149890U,	// <1,3,2,5>: Cost 4 vuzpr LHS, <0,2,3,5>
+  2819408076U,	// <1,3,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  3772041583U,	// <1,3,2,7>: Cost 4 vext3 <2,3,0,1>, <3,2,7,3>
+  2819408042U,	// <1,3,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  1483276390U,	// <1,3,3,0>: Cost 2 vext1 <1,1,3,3>, LHS
+  1483277128U,	// <1,3,3,1>: Cost 2 vext1 <1,1,3,3>, <1,1,3,3>
+  2557019752U,	// <1,3,3,2>: Cost 3 vext1 <1,1,3,3>, <2,2,2,2>
+  2819408856U,	// <1,3,3,3>: Cost 3 vuzpr LHS, <1,3,1,3>
+  1483279670U,	// <1,3,3,4>: Cost 2 vext1 <1,1,3,3>, RHS
+  2819409614U,	// <1,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2598826490U,	// <1,3,3,6>: Cost 3 vext1 <u,1,3,3>, <6,2,7,3>
+  3087844352U,	// <1,3,3,7>: Cost 3 vtrnr LHS, <1,3,5,7>
+  1483282222U,	// <1,3,3,u>: Cost 2 vext1 <1,1,3,3>, LHS
+  2568970342U,	// <1,3,4,0>: Cost 3 vext1 <3,1,3,4>, LHS
+  2568971224U,	// <1,3,4,1>: Cost 3 vext1 <3,1,3,4>, <1,3,1,3>
+  3832761290U,	// <1,3,4,2>: Cost 4 vuzpl <1,2,3,4>, <4,1,2,3>
+  2233428219U,	// <1,3,4,3>: Cost 3 vrev <3,1,3,4>
+  2568973622U,	// <1,3,4,4>: Cost 3 vext1 <3,1,3,4>, RHS
+  1551568182U,	// <1,3,4,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410434U,	// <1,3,4,6>: Cost 3 vuzpr LHS, <3,4,5,6>
+  3666605151U,	// <1,3,4,7>: Cost 4 vext1 <7,1,3,4>, <7,1,3,4>
+  1551568425U,	// <1,3,4,u>: Cost 2 vext2 <1,3,1,3>, RHS
+  2563006566U,	// <1,3,5,0>: Cost 3 vext1 <2,1,3,5>, LHS
+  2568979456U,	// <1,3,5,1>: Cost 3 vext1 <3,1,3,5>, <1,3,5,7>
+  2563008035U,	// <1,3,5,2>: Cost 3 vext1 <2,1,3,5>, <2,1,3,5>
+  2233436412U,	// <1,3,5,3>: Cost 3 vrev <3,1,3,5>
+  2563009846U,	// <1,3,5,4>: Cost 3 vext1 <2,1,3,5>, RHS
+  2867187716U,	// <1,3,5,5>: Cost 3 vuzpr LHS, <5,5,5,5>
+  2655834214U,	// <1,3,5,6>: Cost 3 vext2 <6,4,1,3>, <5,6,7,4>
+  1745669430U,	// <1,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1745669431U,	// <1,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2867187810U,	// <1,3,6,0>: Cost 3 vuzpr LHS, <5,6,7,0>
+  3699052931U,	// <1,3,6,1>: Cost 4 vext2 <1,3,1,3>, <6,1,3,1>
+  2654507460U,	// <1,3,6,2>: Cost 3 vext2 <6,2,1,3>, <6,2,1,3>
+  3766291091U,	// <1,3,6,3>: Cost 4 vext3 <1,3,3,1>, <3,6,3,7>
+  2655834726U,	// <1,3,6,4>: Cost 3 vext2 <6,4,1,3>, <6,4,1,3>
+  3923384562U,	// <1,3,6,5>: Cost 4 vuzpr <5,1,7,3>, <u,6,7,5>
+  2657161992U,	// <1,3,6,6>: Cost 3 vext2 <6,6,1,3>, <6,6,1,3>
+  2819852218U,	// <1,3,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819852219U,	// <1,3,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  2706926275U,	// <1,3,7,0>: Cost 3 vext3 <3,7,0,1>, <3,7,0,1>
+  2659816524U,	// <1,3,7,1>: Cost 3 vext2 <7,1,1,3>, <7,1,1,3>
+  3636766245U,	// <1,3,7,2>: Cost 4 vext1 <2,1,3,7>, <2,1,3,7>
+  2867187903U,	// <1,3,7,3>: Cost 3 vuzpr LHS, <5,7,u,3>
+  2625312102U,	// <1,3,7,4>: Cost 3 vext2 <1,3,1,3>, <7,4,5,6>
+  2867188598U,	// <1,3,7,5>: Cost 3 vuzpr LHS, <6,7,4,5>
+  3728250344U,	// <1,3,7,6>: Cost 4 vext2 <6,2,1,3>, <7,6,2,1>
+  2867187880U,	// <1,3,7,7>: Cost 3 vuzpr LHS, <5,7,5,7>
+  2707516171U,	// <1,3,7,u>: Cost 3 vext3 <3,7,u,1>, <3,7,u,1>
+  1483317350U,	// <1,3,u,0>: Cost 2 vext1 <1,1,3,u>, LHS
+  1483318093U,	// <1,3,u,1>: Cost 2 vext1 <1,1,3,u>, <1,1,3,u>
+  2819410718U,	// <1,3,u,2>: Cost 3 vuzpr LHS, <3,u,1,2>
+  1745666717U,	// <1,3,u,3>: Cost 2 vuzpr LHS, LHS
+  1483320630U,	// <1,3,u,4>: Cost 2 vext1 <1,1,3,u>, RHS
+  1551571098U,	// <1,3,u,5>: Cost 2 vext2 <1,3,1,3>, RHS
+  2819410758U,	// <1,3,u,6>: Cost 3 vuzpr LHS, <3,u,5,6>
+  1745669673U,	// <1,3,u,7>: Cost 2 vuzpr LHS, RHS
+  1745666722U,	// <1,3,u,u>: Cost 2 vuzpr LHS, LHS
+  2617352205U,	// <1,4,0,0>: Cost 3 vext2 <0,0,1,4>, <0,0,1,4>
+  2619342950U,	// <1,4,0,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  3692421295U,	// <1,4,0,2>: Cost 4 vext2 <0,2,1,4>, <0,2,1,4>
+  2619343104U,	// <1,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2617352530U,	// <1,4,0,4>: Cost 3 vext2 <0,0,1,4>, <0,4,1,5>
+  1634880402U,	// <1,4,0,5>: Cost 2 vext3 <4,0,5,1>, <4,0,5,1>
+  2713930652U,	// <1,4,0,6>: Cost 3 vext3 <4,u,5,1>, <4,0,6,2>
+  3732898396U,	// <1,4,0,7>: Cost 4 vext2 <7,0,1,4>, <0,7,4,1>
+  1635101613U,	// <1,4,0,u>: Cost 2 vext3 <4,0,u,1>, <4,0,u,1>
+  3693085430U,	// <1,4,1,0>: Cost 4 vext2 <0,3,1,4>, <1,0,3,2>
+  2623988535U,	// <1,4,1,1>: Cost 3 vext2 <1,1,1,4>, <1,1,1,4>
+  3693085590U,	// <1,4,1,2>: Cost 4 vext2 <0,3,1,4>, <1,2,3,0>
+  3692422134U,	// <1,4,1,3>: Cost 4 vext2 <0,2,1,4>, <1,3,4,6>
+  3693085726U,	// <1,4,1,4>: Cost 4 vext2 <0,3,1,4>, <1,4,0,1>
+  2892401974U,	// <1,4,1,5>: Cost 3 vzipl <1,1,1,1>, RHS
+  3026619702U,	// <1,4,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  3800206324U,	// <1,4,1,7>: Cost 4 vext3 <7,0,4,1>, <4,1,7,0>
+  2892402217U,	// <1,4,1,u>: Cost 3 vzipl <1,1,1,1>, RHS
+  3966978927U,	// <1,4,2,0>: Cost 4 vzipl <1,2,3,4>, <4,0,1,2>
+  3966979018U,	// <1,4,2,1>: Cost 4 vzipl <1,2,3,4>, <4,1,2,3>
+  3693086312U,	// <1,4,2,2>: Cost 4 vext2 <0,3,1,4>, <2,2,2,2>
+  2635269798U,	// <1,4,2,3>: Cost 3 vext2 <3,0,1,4>, <2,3,0,1>
+  3966979280U,	// <1,4,2,4>: Cost 4 vzipl <1,2,3,4>, <4,4,4,4>
+  2893204790U,	// <1,4,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  3693086650U,	// <1,4,2,6>: Cost 4 vext2 <0,3,1,4>, <2,6,3,7>
+  3666662502U,	// <1,4,2,7>: Cost 4 vext1 <7,1,4,2>, <7,1,4,2>
+  2893205033U,	// <1,4,2,u>: Cost 3 vzipl <1,2,3,0>, RHS
+  2563063910U,	// <1,4,3,0>: Cost 3 vext1 <2,1,4,3>, LHS
+  2563064730U,	// <1,4,3,1>: Cost 3 vext1 <2,1,4,3>, <1,2,3,4>
+  2563065386U,	// <1,4,3,2>: Cost 3 vext1 <2,1,4,3>, <2,1,4,3>
+  3693087132U,	// <1,4,3,3>: Cost 4 vext2 <0,3,1,4>, <3,3,3,3>
+  2619345410U,	// <1,4,3,4>: Cost 3 vext2 <0,3,1,4>, <3,4,5,6>
+  3087843666U,	// <1,4,3,5>: Cost 3 vtrnr LHS, <0,4,1,5>
+  3087843676U,	// <1,4,3,6>: Cost 3 vtrnr LHS, <0,4,2,6>
+  3666670695U,	// <1,4,3,7>: Cost 4 vext1 <7,1,4,3>, <7,1,4,3>
+  3087843669U,	// <1,4,3,u>: Cost 3 vtrnr LHS, <0,4,1,u>
+  2620672914U,	// <1,4,4,0>: Cost 3 vext2 <0,5,1,4>, <4,0,5,1>
+  3630842706U,	// <1,4,4,1>: Cost 4 vext1 <1,1,4,4>, <1,1,4,4>
+  3313069003U,	// <1,4,4,2>: Cost 4 vrev <4,1,2,4>
+  3642788100U,	// <1,4,4,3>: Cost 4 vext1 <3,1,4,4>, <3,1,4,4>
+  2713930960U,	// <1,4,4,4>: Cost 3 vext3 <4,u,5,1>, <4,4,4,4>
+  2619346230U,	// <1,4,4,5>: Cost 3 vext2 <0,3,1,4>, RHS
+  2713930980U,	// <1,4,4,6>: Cost 3 vext3 <4,u,5,1>, <4,4,6,6>
+  3736882642U,	// <1,4,4,7>: Cost 4 vext2 <7,6,1,4>, <4,7,6,1>
+  2619346473U,	// <1,4,4,u>: Cost 3 vext2 <0,3,1,4>, RHS
+  2557108326U,	// <1,4,5,0>: Cost 3 vext1 <1,1,4,5>, LHS
+  2557109075U,	// <1,4,5,1>: Cost 3 vext1 <1,1,4,5>, <1,1,4,5>
+  2598913774U,	// <1,4,5,2>: Cost 3 vext1 <u,1,4,5>, <2,3,u,1>
+  3630852246U,	// <1,4,5,3>: Cost 4 vext1 <1,1,4,5>, <3,0,1,2>
+  2557111606U,	// <1,4,5,4>: Cost 3 vext1 <1,1,4,5>, RHS
+  2895252790U,	// <1,4,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616006454U,	// <1,4,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  3899059510U,	// <1,4,5,7>: Cost 4 vuzpr <1,1,1,4>, RHS
+  1616006472U,	// <1,4,5,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2557116518U,	// <1,4,6,0>: Cost 3 vext1 <1,1,4,6>, LHS
+  2557117236U,	// <1,4,6,1>: Cost 3 vext1 <1,1,4,6>, <1,1,1,1>
+  3630859880U,	// <1,4,6,2>: Cost 4 vext1 <1,1,4,6>, <2,2,2,2>
+  2569062550U,	// <1,4,6,3>: Cost 3 vext1 <3,1,4,6>, <3,0,1,2>
+  2557119798U,	// <1,4,6,4>: Cost 3 vext1 <1,1,4,6>, RHS
+  3763490174U,	// <1,4,6,5>: Cost 4 vext3 <0,u,1,1>, <4,6,5,7>
+  3763490183U,	// <1,4,6,6>: Cost 4 vext3 <0,u,1,1>, <4,6,6,7>
+  2712751498U,	// <1,4,6,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  2557122350U,	// <1,4,6,u>: Cost 3 vext1 <1,1,4,6>, LHS
+  2659161084U,	// <1,4,7,0>: Cost 3 vext2 <7,0,1,4>, <7,0,1,4>
+  3732903040U,	// <1,4,7,1>: Cost 4 vext2 <7,0,1,4>, <7,1,7,1>
+  3734230174U,	// <1,4,7,2>: Cost 4 vext2 <7,2,1,4>, <7,2,1,4>
+  3734893807U,	// <1,4,7,3>: Cost 4 vext2 <7,3,1,4>, <7,3,1,4>
+  3660729654U,	// <1,4,7,4>: Cost 4 vext1 <6,1,4,7>, RHS
+  3786493384U,	// <1,4,7,5>: Cost 4 vext3 <4,6,7,1>, <4,7,5,0>
+  2713341394U,	// <1,4,7,6>: Cost 3 vext3 <4,7,6,1>, <4,7,6,1>
+  3660731386U,	// <1,4,7,7>: Cost 4 vext1 <6,1,4,7>, <7,0,1,2>
+  2664470148U,	// <1,4,7,u>: Cost 3 vext2 <7,u,1,4>, <7,u,1,4>
+  2557132902U,	// <1,4,u,0>: Cost 3 vext1 <1,1,4,u>, LHS
+  2619348782U,	// <1,4,u,1>: Cost 3 vext2 <0,3,1,4>, LHS
+  2563106351U,	// <1,4,u,2>: Cost 3 vext1 <2,1,4,u>, <2,1,4,u>
+  2713783816U,	// <1,4,u,3>: Cost 3 vext3 <4,u,3,1>, <4,u,3,1>
+  2622666815U,	// <1,4,u,4>: Cost 3 vext2 <0,u,1,4>, <u,4,5,6>
+  1640189466U,	// <1,4,u,5>: Cost 2 vext3 <4,u,5,1>, <4,u,5,1>
+  1616006697U,	// <1,4,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  2712751498U,	// <1,4,u,7>: Cost 3 vext3 <4,6,7,1>, <4,6,7,1>
+  1616006715U,	// <1,4,u,u>: Cost 2 vext3 <0,u,1,1>, RHS
+  2620014592U,	// <1,5,0,0>: Cost 3 vext2 <0,4,1,5>, <0,0,0,0>
+  1546272870U,	// <1,5,0,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2618687664U,	// <1,5,0,2>: Cost 3 vext2 <0,2,1,5>, <0,2,1,5>
+  3693093120U,	// <1,5,0,3>: Cost 4 vext2 <0,3,1,5>, <0,3,1,4>
+  1546273106U,	// <1,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2620678563U,	// <1,5,0,5>: Cost 3 vext2 <0,5,1,5>, <0,5,1,5>
+  2714668660U,	// <1,5,0,6>: Cost 3 vext3 <5,0,6,1>, <5,0,6,1>
+  3772042877U,	// <1,5,0,7>: Cost 4 vext3 <2,3,0,1>, <5,0,7,1>
+  1546273437U,	// <1,5,0,u>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620015350U,	// <1,5,1,0>: Cost 3 vext2 <0,4,1,5>, <1,0,3,2>
+  2620015412U,	// <1,5,1,1>: Cost 3 vext2 <0,4,1,5>, <1,1,1,1>
+  2620015510U,	// <1,5,1,2>: Cost 3 vext2 <0,4,1,5>, <1,2,3,0>
+  2618688512U,	// <1,5,1,3>: Cost 3 vext2 <0,2,1,5>, <1,3,5,7>
+  2620015677U,	// <1,5,1,4>: Cost 3 vext2 <0,4,1,5>, <1,4,3,5>
+  2620015727U,	// <1,5,1,5>: Cost 3 vext2 <0,4,1,5>, <1,5,0,1>
+  2620015859U,	// <1,5,1,6>: Cost 3 vext2 <0,4,1,5>, <1,6,5,7>
+  3093728566U,	// <1,5,1,7>: Cost 3 vtrnr <1,1,1,1>, RHS
+  2620015981U,	// <1,5,1,u>: Cost 3 vext2 <0,4,1,5>, <1,u,1,3>
+  3692430816U,	// <1,5,2,0>: Cost 4 vext2 <0,2,1,5>, <2,0,5,1>
+  2620016163U,	// <1,5,2,1>: Cost 3 vext2 <0,4,1,5>, <2,1,3,5>
+  2620016232U,	// <1,5,2,2>: Cost 3 vext2 <0,4,1,5>, <2,2,2,2>
+  2620016294U,	// <1,5,2,3>: Cost 3 vext2 <0,4,1,5>, <2,3,0,1>
+  3693758221U,	// <1,5,2,4>: Cost 4 vext2 <0,4,1,5>, <2,4,2,5>
+  3692431209U,	// <1,5,2,5>: Cost 4 vext2 <0,2,1,5>, <2,5,3,7>
+  2620016570U,	// <1,5,2,6>: Cost 3 vext2 <0,4,1,5>, <2,6,3,7>
+  4173598006U,	// <1,5,2,7>: Cost 4 vtrnr <2,1,3,2>, RHS
+  2620016699U,	// <1,5,2,u>: Cost 3 vext2 <0,4,1,5>, <2,u,0,1>
+  2620016790U,	// <1,5,3,0>: Cost 3 vext2 <0,4,1,5>, <3,0,1,2>
+  2569110672U,	// <1,5,3,1>: Cost 3 vext1 <3,1,5,3>, <1,5,3,7>
+  3693758785U,	// <1,5,3,2>: Cost 4 vext2 <0,4,1,5>, <3,2,2,2>
+  2620017052U,	// <1,5,3,3>: Cost 3 vext2 <0,4,1,5>, <3,3,3,3>
+  2620017154U,	// <1,5,3,4>: Cost 3 vext2 <0,4,1,5>, <3,4,5,6>
+  3135623172U,	// <1,5,3,5>: Cost 3 vtrnr LHS, <5,5,5,5>
+  4161587048U,	// <1,5,3,6>: Cost 4 vtrnr LHS, <2,5,3,6>
+  2014104886U,	// <1,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2014104887U,	// <1,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2620017554U,	// <1,5,4,0>: Cost 3 vext2 <0,4,1,5>, <4,0,5,1>
+  2620017634U,	// <1,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  3693759551U,	// <1,5,4,2>: Cost 4 vext2 <0,4,1,5>, <4,2,6,3>
+  3642861837U,	// <1,5,4,3>: Cost 4 vext1 <3,1,5,4>, <3,1,5,4>
+  2575092710U,	// <1,5,4,4>: Cost 3 vext1 <4,1,5,4>, <4,1,5,4>
+  1546276150U,	// <1,5,4,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2759855414U,	// <1,5,4,6>: Cost 3 vuzpl <1,3,5,7>, RHS
+  2713931718U,	// <1,5,4,7>: Cost 3 vext3 <4,u,5,1>, <5,4,7,6>
+  1546276393U,	// <1,5,4,u>: Cost 2 vext2 <0,4,1,5>, RHS
+  2557182054U,	// <1,5,5,0>: Cost 3 vext1 <1,1,5,5>, LHS
+  2557182812U,	// <1,5,5,1>: Cost 3 vext1 <1,1,5,5>, <1,1,5,5>
+  3630925347U,	// <1,5,5,2>: Cost 4 vext1 <1,1,5,5>, <2,1,3,5>
+  4029301675U,	// <1,5,5,3>: Cost 4 vzipr <0,4,1,5>, <1,2,5,3>
+  2557185334U,	// <1,5,5,4>: Cost 3 vext1 <1,1,5,5>, RHS
+  2713931780U,	// <1,5,5,5>: Cost 3 vext3 <4,u,5,1>, <5,5,5,5>
+  2667794530U,	// <1,5,5,6>: Cost 3 vext2 <u,4,1,5>, <5,6,7,0>
+  2713931800U,	// <1,5,5,7>: Cost 3 vext3 <4,u,5,1>, <5,5,7,7>
+  2557187886U,	// <1,5,5,u>: Cost 3 vext1 <1,1,5,5>, LHS
+  2718208036U,	// <1,5,6,0>: Cost 3 vext3 <5,6,0,1>, <5,6,0,1>
+  2620019115U,	// <1,5,6,1>: Cost 3 vext2 <0,4,1,5>, <6,1,7,5>
+  2667794938U,	// <1,5,6,2>: Cost 3 vext2 <u,4,1,5>, <6,2,7,3>
+  3787673666U,	// <1,5,6,3>: Cost 4 vext3 <4,u,5,1>, <5,6,3,4>
+  3693761165U,	// <1,5,6,4>: Cost 4 vext2 <0,4,1,5>, <6,4,5,6>
+  3319279297U,	// <1,5,6,5>: Cost 4 vrev <5,1,5,6>
+  2667795256U,	// <1,5,6,6>: Cost 3 vext2 <u,4,1,5>, <6,6,6,6>
+  2713931874U,	// <1,5,6,7>: Cost 3 vext3 <4,u,5,1>, <5,6,7,0>
+  2713931883U,	// <1,5,6,u>: Cost 3 vext3 <4,u,5,1>, <5,6,u,0>
+  2557198438U,	// <1,5,7,0>: Cost 3 vext1 <1,1,5,7>, LHS
+  2557199156U,	// <1,5,7,1>: Cost 3 vext1 <1,1,5,7>, <1,1,1,1>
+  2569143974U,	// <1,5,7,2>: Cost 3 vext1 <3,1,5,7>, <2,3,0,1>
+  2569144592U,	// <1,5,7,3>: Cost 3 vext1 <3,1,5,7>, <3,1,5,7>
+  2557201718U,	// <1,5,7,4>: Cost 3 vext1 <1,1,5,7>, RHS
+  2713931944U,	// <1,5,7,5>: Cost 3 vext3 <4,u,5,1>, <5,7,5,7>
+  3787673770U,	// <1,5,7,6>: Cost 4 vext3 <4,u,5,1>, <5,7,6,0>
+  2719387828U,	// <1,5,7,7>: Cost 3 vext3 <5,7,7,1>, <5,7,7,1>
+  2557204270U,	// <1,5,7,u>: Cost 3 vext1 <1,1,5,7>, LHS
+  2620020435U,	// <1,5,u,0>: Cost 3 vext2 <0,4,1,5>, <u,0,1,2>
+  1546278702U,	// <1,5,u,1>: Cost 2 vext2 <0,4,1,5>, LHS
+  2620020616U,	// <1,5,u,2>: Cost 3 vext2 <0,4,1,5>, <u,2,3,3>
+  2620020668U,	// <1,5,u,3>: Cost 3 vext2 <0,4,1,5>, <u,3,0,1>
+  1594054682U,	// <1,5,u,4>: Cost 2 vext2 <u,4,1,5>, <u,4,1,5>
+  1546279066U,	// <1,5,u,5>: Cost 2 vext2 <0,4,1,5>, RHS
+  2620020944U,	// <1,5,u,6>: Cost 3 vext2 <0,4,1,5>, <u,6,3,7>
+  2014145846U,	// <1,5,u,7>: Cost 2 vtrnr LHS, RHS
+  2014145847U,	// <1,5,u,u>: Cost 2 vtrnr LHS, RHS
+  3692437504U,	// <1,6,0,0>: Cost 4 vext2 <0,2,1,6>, <0,0,0,0>
+  2618695782U,	// <1,6,0,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  2618695857U,	// <1,6,0,2>: Cost 3 vext2 <0,2,1,6>, <0,2,1,6>
+  3794161970U,	// <1,6,0,3>: Cost 4 vext3 <6,0,3,1>, <6,0,3,1>
+  2620023122U,	// <1,6,0,4>: Cost 3 vext2 <0,4,1,6>, <0,4,1,5>
+  2620686756U,	// <1,6,0,5>: Cost 3 vext2 <0,5,1,6>, <0,5,1,6>
+  2621350389U,	// <1,6,0,6>: Cost 3 vext2 <0,6,1,6>, <0,6,1,6>
+  4028599606U,	// <1,6,0,7>: Cost 4 vzipr <0,3,1,0>, RHS
+  2618696349U,	// <1,6,0,u>: Cost 3 vext2 <0,2,1,6>, LHS
+  3692438262U,	// <1,6,1,0>: Cost 4 vext2 <0,2,1,6>, <1,0,3,2>
+  2625995572U,	// <1,6,1,1>: Cost 3 vext2 <1,4,1,6>, <1,1,1,1>
+  3692438422U,	// <1,6,1,2>: Cost 4 vext2 <0,2,1,6>, <1,2,3,0>
+  3692438488U,	// <1,6,1,3>: Cost 4 vext2 <0,2,1,6>, <1,3,1,3>
+  2625995820U,	// <1,6,1,4>: Cost 3 vext2 <1,4,1,6>, <1,4,1,6>
+  3692438672U,	// <1,6,1,5>: Cost 4 vext2 <0,2,1,6>, <1,5,3,7>
+  3692438720U,	// <1,6,1,6>: Cost 4 vext2 <0,2,1,6>, <1,6,0,1>
+  2958183734U,	// <1,6,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  2958183735U,	// <1,6,1,u>: Cost 3 vzipr <0,u,1,1>, RHS
+  2721526201U,	// <1,6,2,0>: Cost 3 vext3 <6,2,0,1>, <6,2,0,1>
+  3692439097U,	// <1,6,2,1>: Cost 4 vext2 <0,2,1,6>, <2,1,6,0>
+  3692439144U,	// <1,6,2,2>: Cost 4 vext2 <0,2,1,6>, <2,2,2,2>
+  3692439206U,	// <1,6,2,3>: Cost 4 vext2 <0,2,1,6>, <2,3,0,1>
+  3636948278U,	// <1,6,2,4>: Cost 4 vext1 <2,1,6,2>, RHS
+  3787674092U,	// <1,6,2,5>: Cost 4 vext3 <4,u,5,1>, <6,2,5,7>
+  2618697658U,	// <1,6,2,6>: Cost 3 vext2 <0,2,1,6>, <2,6,3,7>
+  2970799414U,	// <1,6,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2970799415U,	// <1,6,2,u>: Cost 3 vzipr <3,0,1,2>, RHS
+  2563211366U,	// <1,6,3,0>: Cost 3 vext1 <2,1,6,3>, LHS
+  3699738854U,	// <1,6,3,1>: Cost 4 vext2 <1,4,1,6>, <3,1,1,1>
+  2563212860U,	// <1,6,3,2>: Cost 3 vext1 <2,1,6,3>, <2,1,6,3>
+  3692439964U,	// <1,6,3,3>: Cost 4 vext2 <0,2,1,6>, <3,3,3,3>
+  2563214646U,	// <1,6,3,4>: Cost 3 vext1 <2,1,6,3>, RHS
+  4191820018U,	// <1,6,3,5>: Cost 4 vtrnr <5,1,7,3>, <u,6,7,5>
+  2587103648U,	// <1,6,3,6>: Cost 3 vext1 <6,1,6,3>, <6,1,6,3>
+  3087845306U,	// <1,6,3,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  3087845307U,	// <1,6,3,u>: Cost 3 vtrnr LHS, <2,6,3,u>
+  3693767570U,	// <1,6,4,0>: Cost 4 vext2 <0,4,1,6>, <4,0,5,1>
+  3693767650U,	// <1,6,4,1>: Cost 4 vext2 <0,4,1,6>, <4,1,5,0>
+  3636962877U,	// <1,6,4,2>: Cost 4 vext1 <2,1,6,4>, <2,1,6,4>
+  3325088134U,	// <1,6,4,3>: Cost 4 vrev <6,1,3,4>
+  3693767898U,	// <1,6,4,4>: Cost 4 vext2 <0,4,1,6>, <4,4,5,5>
+  2618699062U,	// <1,6,4,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  3833670966U,	// <1,6,4,6>: Cost 4 vuzpl <1,3,6,7>, RHS
+  4028632374U,	// <1,6,4,7>: Cost 4 vzipr <0,3,1,4>, RHS
+  2618699305U,	// <1,6,4,u>: Cost 3 vext2 <0,2,1,6>, RHS
+  3693768264U,	// <1,6,5,0>: Cost 4 vext2 <0,4,1,6>, <5,0,1,2>
+  3630998373U,	// <1,6,5,1>: Cost 4 vext1 <1,1,6,5>, <1,1,6,5>
+  3636971070U,	// <1,6,5,2>: Cost 4 vext1 <2,1,6,5>, <2,1,6,5>
+  3642943767U,	// <1,6,5,3>: Cost 4 vext1 <3,1,6,5>, <3,1,6,5>
+  3693768628U,	// <1,6,5,4>: Cost 4 vext2 <0,4,1,6>, <5,4,5,6>
+  3732918276U,	// <1,6,5,5>: Cost 4 vext2 <7,0,1,6>, <5,5,5,5>
+  2620690530U,	// <1,6,5,6>: Cost 3 vext2 <0,5,1,6>, <5,6,7,0>
+  2955562294U,	// <1,6,5,7>: Cost 3 vzipr <0,4,1,5>, RHS
+  2955562295U,	// <1,6,5,u>: Cost 3 vzipr <0,4,1,5>, RHS
+  2724180733U,	// <1,6,6,0>: Cost 3 vext3 <6,6,0,1>, <6,6,0,1>
+  3631006566U,	// <1,6,6,1>: Cost 4 vext1 <1,1,6,6>, <1,1,6,6>
+  3631007674U,	// <1,6,6,2>: Cost 4 vext1 <1,1,6,6>, <2,6,3,7>
+  3692442184U,	// <1,6,6,3>: Cost 4 vext2 <0,2,1,6>, <6,3,7,0>
+  3631009078U,	// <1,6,6,4>: Cost 4 vext1 <1,1,6,6>, RHS
+  3787674416U,	// <1,6,6,5>: Cost 4 vext3 <4,u,5,1>, <6,6,5,7>
+  2713932600U,	// <1,6,6,6>: Cost 3 vext3 <4,u,5,1>, <6,6,6,6>
+  2713932610U,	// <1,6,6,7>: Cost 3 vext3 <4,u,5,1>, <6,6,7,7>
+  2713932619U,	// <1,6,6,u>: Cost 3 vext3 <4,u,5,1>, <6,6,u,7>
+  1651102542U,	// <1,6,7,0>: Cost 2 vext3 <6,7,0,1>, <6,7,0,1>
+  2724918103U,	// <1,6,7,1>: Cost 3 vext3 <6,7,1,1>, <6,7,1,1>
+  2698302306U,	// <1,6,7,2>: Cost 3 vext3 <2,3,0,1>, <6,7,2,3>
+  3642960153U,	// <1,6,7,3>: Cost 4 vext1 <3,1,6,7>, <3,1,6,7>
+  2713932662U,	// <1,6,7,4>: Cost 3 vext3 <4,u,5,1>, <6,7,4,5>
+  2725213051U,	// <1,6,7,5>: Cost 3 vext3 <6,7,5,1>, <6,7,5,1>
+  2724844426U,	// <1,6,7,6>: Cost 3 vext3 <6,7,0,1>, <6,7,6,7>
+  4035956022U,	// <1,6,7,7>: Cost 4 vzipr <1,5,1,7>, RHS
+  1651692438U,	// <1,6,7,u>: Cost 2 vext3 <6,7,u,1>, <6,7,u,1>
+  1651766175U,	// <1,6,u,0>: Cost 2 vext3 <6,u,0,1>, <6,u,0,1>
+  2618701614U,	// <1,6,u,1>: Cost 3 vext2 <0,2,1,6>, LHS
+  3135663508U,	// <1,6,u,2>: Cost 3 vtrnr LHS, <4,6,u,2>
+  3692443580U,	// <1,6,u,3>: Cost 4 vext2 <0,2,1,6>, <u,3,0,1>
+  2713932743U,	// <1,6,u,4>: Cost 3 vext3 <4,u,5,1>, <6,u,4,5>
+  2618701978U,	// <1,6,u,5>: Cost 3 vext2 <0,2,1,6>, RHS
+  2622683344U,	// <1,6,u,6>: Cost 3 vext2 <0,u,1,6>, <u,6,3,7>
+  3087886266U,	// <1,6,u,7>: Cost 3 vtrnr LHS, <2,6,3,7>
+  1652356071U,	// <1,6,u,u>: Cost 2 vext3 <6,u,u,1>, <6,u,u,1>
+  2726171632U,	// <1,7,0,0>: Cost 3 vext3 <7,0,0,1>, <7,0,0,1>
+  2626666598U,	// <1,7,0,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  3695100067U,	// <1,7,0,2>: Cost 4 vext2 <0,6,1,7>, <0,2,0,1>
+  3707044102U,	// <1,7,0,3>: Cost 4 vext2 <2,6,1,7>, <0,3,2,1>
+  2726466580U,	// <1,7,0,4>: Cost 3 vext3 <7,0,4,1>, <7,0,4,1>
+  3654921933U,	// <1,7,0,5>: Cost 4 vext1 <5,1,7,0>, <5,1,7,0>
+  2621358582U,	// <1,7,0,6>: Cost 3 vext2 <0,6,1,7>, <0,6,1,7>
+  2622022215U,	// <1,7,0,7>: Cost 3 vext2 <0,7,1,7>, <0,7,1,7>
+  2626667165U,	// <1,7,0,u>: Cost 3 vext2 <1,5,1,7>, LHS
+  2593128550U,	// <1,7,1,0>: Cost 3 vext1 <7,1,7,1>, LHS
+  2626667316U,	// <1,7,1,1>: Cost 3 vext2 <1,5,1,7>, <1,1,1,1>
+  3700409238U,	// <1,7,1,2>: Cost 4 vext2 <1,5,1,7>, <1,2,3,0>
+  2257294428U,	// <1,7,1,3>: Cost 3 vrev <7,1,3,1>
+  2593131830U,	// <1,7,1,4>: Cost 3 vext1 <7,1,7,1>, RHS
+  2626667646U,	// <1,7,1,5>: Cost 3 vext2 <1,5,1,7>, <1,5,1,7>
+  2627331279U,	// <1,7,1,6>: Cost 3 vext2 <1,6,1,7>, <1,6,1,7>
+  2593133696U,	// <1,7,1,7>: Cost 3 vext1 <7,1,7,1>, <7,1,7,1>
+  2628658545U,	// <1,7,1,u>: Cost 3 vext2 <1,u,1,7>, <1,u,1,7>
+  2587164774U,	// <1,7,2,0>: Cost 3 vext1 <6,1,7,2>, LHS
+  3701073445U,	// <1,7,2,1>: Cost 4 vext2 <1,6,1,7>, <2,1,3,7>
+  3700409960U,	// <1,7,2,2>: Cost 4 vext2 <1,5,1,7>, <2,2,2,2>
+  2638612134U,	// <1,7,2,3>: Cost 3 vext2 <3,5,1,7>, <2,3,0,1>
+  2587168054U,	// <1,7,2,4>: Cost 3 vext1 <6,1,7,2>, RHS
+  3706382167U,	// <1,7,2,5>: Cost 4 vext2 <2,5,1,7>, <2,5,1,7>
+  2587169192U,	// <1,7,2,6>: Cost 3 vext1 <6,1,7,2>, <6,1,7,2>
+  3660911610U,	// <1,7,2,7>: Cost 4 vext1 <6,1,7,2>, <7,0,1,2>
+  2587170606U,	// <1,7,2,u>: Cost 3 vext1 <6,1,7,2>, LHS
+  1507459174U,	// <1,7,3,0>: Cost 2 vext1 <5,1,7,3>, LHS
+  2569257984U,	// <1,7,3,1>: Cost 3 vext1 <3,1,7,3>, <1,3,5,7>
+  2581202536U,	// <1,7,3,2>: Cost 3 vext1 <5,1,7,3>, <2,2,2,2>
+  2569259294U,	// <1,7,3,3>: Cost 3 vext1 <3,1,7,3>, <3,1,7,3>
+  1507462454U,	// <1,7,3,4>: Cost 2 vext1 <5,1,7,3>, RHS
+  1507462864U,	// <1,7,3,5>: Cost 2 vext1 <5,1,7,3>, <5,1,7,3>
+  2581205498U,	// <1,7,3,6>: Cost 3 vext1 <5,1,7,3>, <6,2,7,3>
+  2581206010U,	// <1,7,3,7>: Cost 3 vext1 <5,1,7,3>, <7,0,1,2>
+  1507465006U,	// <1,7,3,u>: Cost 2 vext1 <5,1,7,3>, LHS
+  2728826164U,	// <1,7,4,0>: Cost 3 vext3 <7,4,0,1>, <7,4,0,1>
+  3654951732U,	// <1,7,4,1>: Cost 4 vext1 <5,1,7,4>, <1,1,1,1>
+  3330987094U,	// <1,7,4,2>: Cost 4 vrev <7,1,2,4>
+  3331060831U,	// <1,7,4,3>: Cost 4 vrev <7,1,3,4>
+  3787674971U,	// <1,7,4,4>: Cost 4 vext3 <4,u,5,1>, <7,4,4,4>
+  2626669878U,	// <1,7,4,5>: Cost 3 vext2 <1,5,1,7>, RHS
+  3785979241U,	// <1,7,4,6>: Cost 4 vext3 <4,6,0,1>, <7,4,6,0>
+  3787085176U,	// <1,7,4,7>: Cost 4 vext3 <4,7,6,1>, <7,4,7,6>
+  2626670121U,	// <1,7,4,u>: Cost 3 vext2 <1,5,1,7>, RHS
+  2569273446U,	// <1,7,5,0>: Cost 3 vext1 <3,1,7,5>, LHS
+  2569274368U,	// <1,7,5,1>: Cost 3 vext1 <3,1,7,5>, <1,3,5,7>
+  3643016808U,	// <1,7,5,2>: Cost 4 vext1 <3,1,7,5>, <2,2,2,2>
+  2569275680U,	// <1,7,5,3>: Cost 3 vext1 <3,1,7,5>, <3,1,7,5>
+  2569276726U,	// <1,7,5,4>: Cost 3 vext1 <3,1,7,5>, RHS
+  4102034790U,	// <1,7,5,5>: Cost 4 vtrnl <1,3,5,7>, <7,4,5,6>
+  2651222067U,	// <1,7,5,6>: Cost 3 vext2 <5,6,1,7>, <5,6,1,7>
+  3899378998U,	// <1,7,5,7>: Cost 4 vuzpr <1,1,5,7>, RHS
+  2569279278U,	// <1,7,5,u>: Cost 3 vext1 <3,1,7,5>, LHS
+  2730153430U,	// <1,7,6,0>: Cost 3 vext3 <7,6,0,1>, <7,6,0,1>
+  2724845022U,	// <1,7,6,1>: Cost 3 vext3 <6,7,0,1>, <7,6,1,0>
+  3643025338U,	// <1,7,6,2>: Cost 4 vext1 <3,1,7,6>, <2,6,3,7>
+  3643025697U,	// <1,7,6,3>: Cost 4 vext1 <3,1,7,6>, <3,1,7,6>
+  3643026742U,	// <1,7,6,4>: Cost 4 vext1 <3,1,7,6>, RHS
+  3654971091U,	// <1,7,6,5>: Cost 4 vext1 <5,1,7,6>, <5,1,7,6>
+  3787675153U,	// <1,7,6,6>: Cost 4 vext3 <4,u,5,1>, <7,6,6,6>
+  2724845076U,	// <1,7,6,7>: Cost 3 vext3 <6,7,0,1>, <7,6,7,0>
+  2725508637U,	// <1,7,6,u>: Cost 3 vext3 <6,u,0,1>, <7,6,u,0>
+  2730817063U,	// <1,7,7,0>: Cost 3 vext3 <7,7,0,1>, <7,7,0,1>
+  3631088436U,	// <1,7,7,1>: Cost 4 vext1 <1,1,7,7>, <1,1,1,1>
+  3660949158U,	// <1,7,7,2>: Cost 4 vext1 <6,1,7,7>, <2,3,0,1>
+  3801904705U,	// <1,7,7,3>: Cost 4 vext3 <7,3,0,1>, <7,7,3,0>
+  3631090998U,	// <1,7,7,4>: Cost 4 vext1 <1,1,7,7>, RHS
+  2662503828U,	// <1,7,7,5>: Cost 3 vext2 <7,5,1,7>, <7,5,1,7>
+  3660951981U,	// <1,7,7,6>: Cost 4 vext1 <6,1,7,7>, <6,1,7,7>
+  2713933420U,	// <1,7,7,7>: Cost 3 vext3 <4,u,5,1>, <7,7,7,7>
+  2731406959U,	// <1,7,7,u>: Cost 3 vext3 <7,7,u,1>, <7,7,u,1>
+  1507500134U,	// <1,7,u,0>: Cost 2 vext1 <5,1,7,u>, LHS
+  2626672430U,	// <1,7,u,1>: Cost 3 vext2 <1,5,1,7>, LHS
+  2581243496U,	// <1,7,u,2>: Cost 3 vext1 <5,1,7,u>, <2,2,2,2>
+  2569300259U,	// <1,7,u,3>: Cost 3 vext1 <3,1,7,u>, <3,1,7,u>
+  1507503414U,	// <1,7,u,4>: Cost 2 vext1 <5,1,7,u>, RHS
+  1507503829U,	// <1,7,u,5>: Cost 2 vext1 <5,1,7,u>, <5,1,7,u>
+  2581246458U,	// <1,7,u,6>: Cost 3 vext1 <5,1,7,u>, <6,2,7,3>
+  2581246970U,	// <1,7,u,7>: Cost 3 vext1 <5,1,7,u>, <7,0,1,2>
+  1507505966U,	// <1,7,u,u>: Cost 2 vext1 <5,1,7,u>, LHS
+  1543643153U,	// <1,u,0,0>: Cost 2 vext2 <0,0,1,u>, <0,0,1,u>
+  1546297446U,	// <1,u,0,1>: Cost 2 vext2 <0,4,1,u>, LHS
+  2819448852U,	// <1,u,0,2>: Cost 3 vuzpr LHS, <0,0,2,2>
+  2619375876U,	// <1,u,0,3>: Cost 3 vext2 <0,3,1,u>, <0,3,1,u>
+  1546297685U,	// <1,u,0,4>: Cost 2 vext2 <0,4,1,u>, <0,4,1,u>
+  1658771190U,	// <1,u,0,5>: Cost 2 vext3 <u,0,5,1>, <u,0,5,1>
+  2736789248U,	// <1,u,0,6>: Cost 3 vext3 <u,7,0,1>, <u,0,6,2>
+  2659189376U,	// <1,u,0,7>: Cost 3 vext2 <7,0,1,u>, <0,7,u,1>
+  1546298013U,	// <1,u,0,u>: Cost 2 vext2 <0,4,1,u>, LHS
+  1483112550U,	// <1,u,1,0>: Cost 2 vext1 <1,1,1,1>, LHS
+  202162278U,	// <1,u,1,1>: Cost 1 vdup1 LHS
+  1616009006U,	// <1,u,1,2>: Cost 2 vext3 <0,u,1,1>, LHS
+  1745707110U,	// <1,u,1,3>: Cost 2 vuzpr LHS, LHS
+  1483115830U,	// <1,u,1,4>: Cost 2 vext1 <1,1,1,1>, RHS
+  2620040336U,	// <1,u,1,5>: Cost 3 vext2 <0,4,1,u>, <1,5,3,7>
+  3026622618U,	// <1,u,1,6>: Cost 3 vtrnl <1,1,1,1>, RHS
+  2958183752U,	// <1,u,1,7>: Cost 3 vzipr <0,u,1,1>, RHS
+  202162278U,	// <1,u,1,u>: Cost 1 vdup1 LHS
+  2819449750U,	// <1,u,2,0>: Cost 3 vuzpr LHS, <1,2,3,0>
+  2893207342U,	// <1,u,2,1>: Cost 3 vzipl <1,2,3,0>, LHS
+  2819448996U,	// <1,u,2,2>: Cost 3 vuzpr LHS, <0,2,0,2>
+  2819450482U,	// <1,u,2,3>: Cost 3 vuzpr LHS, <2,2,3,3>
+  2819449754U,	// <1,u,2,4>: Cost 3 vuzpr LHS, <1,2,3,4>
+  2893207706U,	// <1,u,2,5>: Cost 3 vzipl <1,2,3,0>, RHS
+  2819449036U,	// <1,u,2,6>: Cost 3 vuzpr LHS, <0,2,4,6>
+  2970799432U,	// <1,u,2,7>: Cost 3 vzipr <3,0,1,2>, RHS
+  2819449002U,	// <1,u,2,u>: Cost 3 vuzpr LHS, <0,2,0,u>
+  403931292U,	// <1,u,3,0>: Cost 1 vext1 LHS, LHS
+  1477673718U,	// <1,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  115726126U,	// <1,u,3,2>: Cost 1 vrev LHS
+  2014102173U,	// <1,u,3,3>: Cost 2 vtrnr LHS, LHS
+  403934518U,	// <1,u,3,4>: Cost 1 vext1 LHS, RHS
+  1507536601U,	// <1,u,3,5>: Cost 2 vext1 <5,1,u,3>, <5,1,u,3>
+  1525453306U,	// <1,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  2014105129U,	// <1,u,3,7>: Cost 2 vtrnr LHS, RHS
+  403937070U,	// <1,u,3,u>: Cost 1 vext1 LHS, LHS
+  2620042157U,	// <1,u,4,0>: Cost 3 vext2 <0,4,1,u>, <4,0,u,1>
+  2620042237U,	// <1,u,4,1>: Cost 3 vext2 <0,4,1,u>, <4,1,u,0>
+  2263217967U,	// <1,u,4,2>: Cost 3 vrev <u,1,2,4>
+  2569341224U,	// <1,u,4,3>: Cost 3 vext1 <3,1,u,4>, <3,1,u,4>
+  2569342262U,	// <1,u,4,4>: Cost 3 vext1 <3,1,u,4>, RHS
+  1546300726U,	// <1,u,4,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  2819449180U,	// <1,u,4,6>: Cost 3 vuzpr LHS, <0,4,2,6>
+  2724845649U,	// <1,u,4,7>: Cost 3 vext3 <6,7,0,1>, <u,4,7,6>
+  1546300969U,	// <1,u,4,u>: Cost 2 vext2 <0,4,1,u>, RHS
+  2551431270U,	// <1,u,5,0>: Cost 3 vext1 <0,1,u,5>, LHS
+  2551432192U,	// <1,u,5,1>: Cost 3 vext1 <0,1,u,5>, <1,3,5,7>
+  3028293422U,	// <1,u,5,2>: Cost 3 vtrnl <1,3,5,7>, LHS
+  2955559068U,	// <1,u,5,3>: Cost 3 vzipr <0,4,1,5>, LHS
+  2551434550U,	// <1,u,5,4>: Cost 3 vext1 <0,1,u,5>, RHS
+  2895255706U,	// <1,u,5,5>: Cost 3 vzipl <1,5,3,7>, RHS
+  1616009370U,	// <1,u,5,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710390U,	// <1,u,5,7>: Cost 2 vuzpr LHS, RHS
+  1745710391U,	// <1,u,5,u>: Cost 2 vuzpr LHS, RHS
+  2653221159U,	// <1,u,6,0>: Cost 3 vext2 <6,0,1,u>, <6,0,1,u>
+  2725509303U,	// <1,u,6,1>: Cost 3 vext3 <6,u,0,1>, <u,6,1,0>
+  2659193338U,	// <1,u,6,2>: Cost 3 vext2 <7,0,1,u>, <6,2,7,3>
+  2689751248U,	// <1,u,6,3>: Cost 3 vext3 <0,u,1,1>, <u,6,3,7>
+  2867228774U,	// <1,u,6,4>: Cost 3 vuzpr LHS, <5,6,7,4>
+  3764820194U,	// <1,u,6,5>: Cost 4 vext3 <1,1,1,1>, <u,6,5,7>
+  2657202957U,	// <1,u,6,6>: Cost 3 vext2 <6,6,1,u>, <6,6,1,u>
+  2819450810U,	// <1,u,6,7>: Cost 3 vuzpr LHS, <2,6,3,7>
+  2819450811U,	// <1,u,6,u>: Cost 3 vuzpr LHS, <2,6,3,u>
+  1585452032U,	// <1,u,7,0>: Cost 2 vext2 <7,0,1,u>, <7,0,1,u>
+  2557420340U,	// <1,u,7,1>: Cost 3 vext1 <1,1,u,7>, <1,1,1,1>
+  2569365158U,	// <1,u,7,2>: Cost 3 vext1 <3,1,u,7>, <2,3,0,1>
+  2569365803U,	// <1,u,7,3>: Cost 3 vext1 <3,1,u,7>, <3,1,u,7>
+  2557422902U,	// <1,u,7,4>: Cost 3 vext1 <1,1,u,7>, RHS
+  2662512021U,	// <1,u,7,5>: Cost 3 vext2 <7,5,1,u>, <7,5,1,u>
+  2724845884U,	// <1,u,7,6>: Cost 3 vext3 <6,7,0,1>, <u,7,6,7>
+  2659194476U,	// <1,u,7,7>: Cost 3 vext2 <7,0,1,u>, <7,7,7,7>
+  1590761096U,	// <1,u,7,u>: Cost 2 vext2 <7,u,1,u>, <7,u,1,u>
+  403972257U,	// <1,u,u,0>: Cost 1 vext1 LHS, LHS
+  202162278U,	// <1,u,u,1>: Cost 1 vdup1 LHS
+  115767091U,	// <1,u,u,2>: Cost 1 vrev LHS
+  1745707677U,	// <1,u,u,3>: Cost 2 vuzpr LHS, LHS
+  403975478U,	// <1,u,u,4>: Cost 1 vext1 LHS, RHS
+  1546303642U,	// <1,u,u,5>: Cost 2 vext2 <0,4,1,u>, RHS
+  1616009613U,	// <1,u,u,6>: Cost 2 vext3 <0,u,1,1>, RHS
+  1745710633U,	// <1,u,u,7>: Cost 2 vuzpr LHS, RHS
+  403978030U,	// <1,u,u,u>: Cost 1 vext1 LHS, LHS
+  2551463936U,	// <2,0,0,0>: Cost 3 vext1 <0,2,0,0>, <0,0,0,0>
+  2685698058U,	// <2,0,0,1>: Cost 3 vext3 <0,2,0,2>, <0,0,1,1>
+  1610776596U,	// <2,0,0,2>: Cost 2 vext3 <0,0,2,2>, <0,0,2,2>
+  2619384069U,	// <2,0,0,3>: Cost 3 vext2 <0,3,2,0>, <0,3,2,0>
+  2551467318U,	// <2,0,0,4>: Cost 3 vext1 <0,2,0,0>, RHS
+  3899836596U,	// <2,0,0,5>: Cost 4 vuzpr <1,2,3,0>, <3,0,4,5>
+  2621374968U,	// <2,0,0,6>: Cost 3 vext2 <0,6,2,0>, <0,6,2,0>
+  4168271334U,	// <2,0,0,7>: Cost 4 vtrnr <1,2,3,0>, <2,0,5,7>
+  1611219018U,	// <2,0,0,u>: Cost 2 vext3 <0,0,u,2>, <0,0,u,2>
+  2551472138U,	// <2,0,1,0>: Cost 3 vext1 <0,2,0,1>, <0,0,1,1>
+  2690564186U,	// <2,0,1,1>: Cost 3 vext3 <1,0,3,2>, <0,1,1,0>
+  1611956326U,	// <2,0,1,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2826092646U,	// <2,0,1,3>: Cost 3 vuzpr <1,2,3,0>, LHS
+  2551475510U,	// <2,0,1,4>: Cost 3 vext1 <0,2,0,1>, RHS
+  3692463248U,	// <2,0,1,5>: Cost 4 vext2 <0,2,2,0>, <1,5,3,7>
+  2587308473U,	// <2,0,1,6>: Cost 3 vext1 <6,2,0,1>, <6,2,0,1>
+  3661050874U,	// <2,0,1,7>: Cost 4 vext1 <6,2,0,1>, <7,0,1,2>
+  1611956380U,	// <2,0,1,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  1477738598U,	// <2,0,2,0>: Cost 2 vext1 <0,2,0,2>, LHS
+  2551481078U,	// <2,0,2,1>: Cost 3 vext1 <0,2,0,2>, <1,0,3,2>
+  2551481796U,	// <2,0,2,2>: Cost 3 vext1 <0,2,0,2>, <2,0,2,0>
+  2551482518U,	// <2,0,2,3>: Cost 3 vext1 <0,2,0,2>, <3,0,1,2>
+  1477741878U,	// <2,0,2,4>: Cost 2 vext1 <0,2,0,2>, RHS
+  2551484112U,	// <2,0,2,5>: Cost 3 vext1 <0,2,0,2>, <5,1,7,3>
+  2551484759U,	// <2,0,2,6>: Cost 3 vext1 <0,2,0,2>, <6,0,7,2>
+  2551485434U,	// <2,0,2,7>: Cost 3 vext1 <0,2,0,2>, <7,0,1,2>
+  1477744430U,	// <2,0,2,u>: Cost 2 vext1 <0,2,0,2>, LHS
+  2953625600U,	// <2,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2953627302U,	// <2,0,3,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  2953625764U,	// <2,0,3,2>: Cost 3 vzipr LHS, <0,2,0,2>
+  4027369695U,	// <2,0,3,3>: Cost 4 vzipr LHS, <3,1,0,3>
+  3625233718U,	// <2,0,3,4>: Cost 4 vext1 <0,2,0,3>, RHS
+  3899836110U,	// <2,0,3,5>: Cost 4 vuzpr <1,2,3,0>, <2,3,4,5>
+  4032012618U,	// <2,0,3,6>: Cost 4 vzipr LHS, <0,4,0,6>
+  3899835392U,	// <2,0,3,7>: Cost 4 vuzpr <1,2,3,0>, <1,3,5,7>
+  2953625770U,	// <2,0,3,u>: Cost 3 vzipr LHS, <0,2,0,u>
+  2551496806U,	// <2,0,4,0>: Cost 3 vext1 <0,2,0,4>, LHS
+  2685698386U,	// <2,0,4,1>: Cost 3 vext3 <0,2,0,2>, <0,4,1,5>
+  2685698396U,	// <2,0,4,2>: Cost 3 vext3 <0,2,0,2>, <0,4,2,6>
+  3625240726U,	// <2,0,4,3>: Cost 4 vext1 <0,2,0,4>, <3,0,1,2>
+  2551500086U,	// <2,0,4,4>: Cost 3 vext1 <0,2,0,4>, RHS
+  2618723638U,	// <2,0,4,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765409590U,	// <2,0,4,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  3799990664U,	// <2,0,4,7>: Cost 4 vext3 <7,0,1,2>, <0,4,7,5>
+  2685698450U,	// <2,0,4,u>: Cost 3 vext3 <0,2,0,2>, <0,4,u,6>
+  3625246822U,	// <2,0,5,0>: Cost 4 vext1 <0,2,0,5>, LHS
+  3289776304U,	// <2,0,5,1>: Cost 4 vrev <0,2,1,5>
+  2690564526U,	// <2,0,5,2>: Cost 3 vext3 <1,0,3,2>, <0,5,2,7>
+  3289923778U,	// <2,0,5,3>: Cost 4 vrev <0,2,3,5>
+  2216255691U,	// <2,0,5,4>: Cost 3 vrev <0,2,4,5>
+  3726307332U,	// <2,0,5,5>: Cost 4 vext2 <5,u,2,0>, <5,5,5,5>
+  3726307426U,	// <2,0,5,6>: Cost 4 vext2 <5,u,2,0>, <5,6,7,0>
+  2826095926U,	// <2,0,5,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  2216550639U,	// <2,0,5,u>: Cost 3 vrev <0,2,u,5>
+  4162420736U,	// <2,0,6,0>: Cost 4 vtrnr <0,2,4,6>, <0,0,0,0>
+  2901885030U,	// <2,0,6,1>: Cost 3 vzipl <2,6,3,7>, LHS
+  2685698559U,	// <2,0,6,2>: Cost 3 vext3 <0,2,0,2>, <0,6,2,7>
+  3643173171U,	// <2,0,6,3>: Cost 4 vext1 <3,2,0,6>, <3,2,0,6>
+  2216263884U,	// <2,0,6,4>: Cost 3 vrev <0,2,4,6>
+  3730289341U,	// <2,0,6,5>: Cost 4 vext2 <6,5,2,0>, <6,5,2,0>
+  3726308152U,	// <2,0,6,6>: Cost 4 vext2 <5,u,2,0>, <6,6,6,6>
+  3899836346U,	// <2,0,6,7>: Cost 4 vuzpr <1,2,3,0>, <2,6,3,7>
+  2216558832U,	// <2,0,6,u>: Cost 3 vrev <0,2,u,6>
+  2659202049U,	// <2,0,7,0>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  3726308437U,	// <2,0,7,1>: Cost 4 vext2 <5,u,2,0>, <7,1,2,3>
+  2726249034U,	// <2,0,7,2>: Cost 3 vext3 <7,0,1,2>, <0,7,2,1>
+  3734934772U,	// <2,0,7,3>: Cost 4 vext2 <7,3,2,0>, <7,3,2,0>
+  3726308710U,	// <2,0,7,4>: Cost 4 vext2 <5,u,2,0>, <7,4,5,6>
+  3726308814U,	// <2,0,7,5>: Cost 4 vext2 <5,u,2,0>, <7,5,u,2>
+  3736925671U,	// <2,0,7,6>: Cost 4 vext2 <7,6,2,0>, <7,6,2,0>
+  3726308972U,	// <2,0,7,7>: Cost 4 vext2 <5,u,2,0>, <7,7,7,7>
+  2659202049U,	// <2,0,7,u>: Cost 3 vext2 <7,0,2,0>, <7,0,2,0>
+  1477787750U,	// <2,0,u,0>: Cost 2 vext1 <0,2,0,u>, LHS
+  2953668262U,	// <2,0,u,1>: Cost 3 vzipr LHS, <2,3,0,1>
+  1611956893U,	// <2,0,u,2>: Cost 2 vext3 <0,2,0,2>, LHS
+  2551531670U,	// <2,0,u,3>: Cost 3 vext1 <0,2,0,u>, <3,0,1,2>
+  1477791030U,	// <2,0,u,4>: Cost 2 vext1 <0,2,0,u>, RHS
+  2618726554U,	// <2,0,u,5>: Cost 3 vext2 <0,2,2,0>, RHS
+  2765412506U,	// <2,0,u,6>: Cost 3 vuzpl <2,3,0,1>, RHS
+  2826096169U,	// <2,0,u,7>: Cost 3 vuzpr <1,2,3,0>, RHS
+  1611956947U,	// <2,0,u,u>: Cost 2 vext3 <0,2,0,2>, LHS
+  2569453670U,	// <2,1,0,0>: Cost 3 vext1 <3,2,1,0>, LHS
+  2619392102U,	// <2,1,0,1>: Cost 3 vext2 <0,3,2,1>, LHS
+  3759440619U,	// <2,1,0,2>: Cost 4 vext3 <0,2,0,2>, <1,0,2,0>
+  1616823030U,	// <2,1,0,3>: Cost 2 vext3 <1,0,3,2>, <1,0,3,2>
+  2569456950U,	// <2,1,0,4>: Cost 3 vext1 <3,2,1,0>, RHS
+  2690712328U,	// <2,1,0,5>: Cost 3 vext3 <1,0,5,2>, <1,0,5,2>
+  3661115841U,	// <2,1,0,6>: Cost 4 vext1 <6,2,1,0>, <6,2,1,0>
+  2622046794U,	// <2,1,0,7>: Cost 3 vext2 <0,7,2,1>, <0,7,2,1>
+  1617191715U,	// <2,1,0,u>: Cost 2 vext3 <1,0,u,2>, <1,0,u,2>
+  2551545958U,	// <2,1,1,0>: Cost 3 vext1 <0,2,1,1>, LHS
+  2685698868U,	// <2,1,1,1>: Cost 3 vext3 <0,2,0,2>, <1,1,1,1>
+  2628682646U,	// <2,1,1,2>: Cost 3 vext2 <1,u,2,1>, <1,2,3,0>
+  2685698888U,	// <2,1,1,3>: Cost 3 vext3 <0,2,0,2>, <1,1,3,3>
+  2551549238U,	// <2,1,1,4>: Cost 3 vext1 <0,2,1,1>, RHS
+  3693134992U,	// <2,1,1,5>: Cost 4 vext2 <0,3,2,1>, <1,5,3,7>
+  3661124034U,	// <2,1,1,6>: Cost 4 vext1 <6,2,1,1>, <6,2,1,1>
+  3625292794U,	// <2,1,1,7>: Cost 4 vext1 <0,2,1,1>, <7,0,1,2>
+  2685698933U,	// <2,1,1,u>: Cost 3 vext3 <0,2,0,2>, <1,1,u,3>
+  2551554150U,	// <2,1,2,0>: Cost 3 vext1 <0,2,1,2>, LHS
+  3893649571U,	// <2,1,2,1>: Cost 4 vuzpr <0,2,0,1>, <0,2,0,1>
+  2551555688U,	// <2,1,2,2>: Cost 3 vext1 <0,2,1,2>, <2,2,2,2>
+  2685698966U,	// <2,1,2,3>: Cost 3 vext3 <0,2,0,2>, <1,2,3,0>
+  2551557430U,	// <2,1,2,4>: Cost 3 vext1 <0,2,1,2>, RHS
+  3763422123U,	// <2,1,2,5>: Cost 4 vext3 <0,u,0,2>, <1,2,5,3>
+  3693135802U,	// <2,1,2,6>: Cost 4 vext2 <0,3,2,1>, <2,6,3,7>
+  2726249402U,	// <2,1,2,7>: Cost 3 vext3 <7,0,1,2>, <1,2,7,0>
+  2685699011U,	// <2,1,2,u>: Cost 3 vext3 <0,2,0,2>, <1,2,u,0>
+  2551562342U,	// <2,1,3,0>: Cost 3 vext1 <0,2,1,3>, LHS
+  2953625610U,	// <2,1,3,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953627798U,	// <2,1,3,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  2953626584U,	// <2,1,3,3>: Cost 3 vzipr LHS, <1,3,1,3>
+  2551565622U,	// <2,1,3,4>: Cost 3 vext1 <0,2,1,3>, RHS
+  2953625938U,	// <2,1,3,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U,	// <2,1,3,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  4032013519U,	// <2,1,3,7>: Cost 4 vzipr LHS, <1,6,1,7>
+  2953625617U,	// <2,1,3,u>: Cost 3 vzipr LHS, <0,0,1,u>
+  2690565154U,	// <2,1,4,0>: Cost 3 vext3 <1,0,3,2>, <1,4,0,5>
+  3625313270U,	// <2,1,4,1>: Cost 4 vext1 <0,2,1,4>, <1,3,4,6>
+  3771532340U,	// <2,1,4,2>: Cost 4 vext3 <2,2,2,2>, <1,4,2,5>
+  1148404634U,	// <2,1,4,3>: Cost 2 vrev <1,2,3,4>
+  3625315638U,	// <2,1,4,4>: Cost 4 vext1 <0,2,1,4>, RHS
+  2619395382U,	// <2,1,4,5>: Cost 3 vext2 <0,3,2,1>, RHS
+  3837242678U,	// <2,1,4,6>: Cost 4 vuzpl <2,0,1,2>, RHS
+  3799991394U,	// <2,1,4,7>: Cost 4 vext3 <7,0,1,2>, <1,4,7,6>
+  1148773319U,	// <2,1,4,u>: Cost 2 vrev <1,2,u,4>
+  2551578726U,	// <2,1,5,0>: Cost 3 vext1 <0,2,1,5>, LHS
+  2551579648U,	// <2,1,5,1>: Cost 3 vext1 <0,2,1,5>, <1,3,5,7>
+  3625321952U,	// <2,1,5,2>: Cost 4 vext1 <0,2,1,5>, <2,0,5,1>
+  2685699216U,	// <2,1,5,3>: Cost 3 vext3 <0,2,0,2>, <1,5,3,7>
+  2551582006U,	// <2,1,5,4>: Cost 3 vext1 <0,2,1,5>, RHS
+  3740913668U,	// <2,1,5,5>: Cost 4 vext2 <u,3,2,1>, <5,5,5,5>
+  3661156806U,	// <2,1,5,6>: Cost 4 vext1 <6,2,1,5>, <6,2,1,5>
+  3893652790U,	// <2,1,5,7>: Cost 4 vuzpr <0,2,0,1>, RHS
+  2685699261U,	// <2,1,5,u>: Cost 3 vext3 <0,2,0,2>, <1,5,u,7>
+  2551586918U,	// <2,1,6,0>: Cost 3 vext1 <0,2,1,6>, LHS
+  3625329398U,	// <2,1,6,1>: Cost 4 vext1 <0,2,1,6>, <1,0,3,2>
+  2551588794U,	// <2,1,6,2>: Cost 3 vext1 <0,2,1,6>, <2,6,3,7>
+  3088679014U,	// <2,1,6,3>: Cost 3 vtrnr <0,2,4,6>, LHS
+  2551590198U,	// <2,1,6,4>: Cost 3 vext1 <0,2,1,6>, RHS
+  4029382994U,	// <2,1,6,5>: Cost 4 vzipr <0,4,2,6>, <0,4,1,5>
+  3625333560U,	// <2,1,6,6>: Cost 4 vext1 <0,2,1,6>, <6,6,6,6>
+  3731624800U,	// <2,1,6,7>: Cost 4 vext2 <6,7,2,1>, <6,7,2,1>
+  2551592750U,	// <2,1,6,u>: Cost 3 vext1 <0,2,1,6>, LHS
+  2622051322U,	// <2,1,7,0>: Cost 3 vext2 <0,7,2,1>, <7,0,1,2>
+  3733615699U,	// <2,1,7,1>: Cost 4 vext2 <7,1,2,1>, <7,1,2,1>
+  3795125538U,	// <2,1,7,2>: Cost 4 vext3 <6,1,7,2>, <1,7,2,0>
+  2222171037U,	// <2,1,7,3>: Cost 3 vrev <1,2,3,7>
+  3740915046U,	// <2,1,7,4>: Cost 4 vext2 <u,3,2,1>, <7,4,5,6>
+  3296060335U,	// <2,1,7,5>: Cost 4 vrev <1,2,5,7>
+  3736933864U,	// <2,1,7,6>: Cost 4 vext2 <7,6,2,1>, <7,6,2,1>
+  3805300055U,	// <2,1,7,7>: Cost 4 vext3 <7,u,1,2>, <1,7,7,u>
+  2669827714U,	// <2,1,7,u>: Cost 3 vext2 <u,7,2,1>, <7,u,1,2>
+  2551603302U,	// <2,1,u,0>: Cost 3 vext1 <0,2,1,u>, LHS
+  2953666570U,	// <2,1,u,1>: Cost 3 vzipr LHS, <0,0,1,1>
+  2953668758U,	// <2,1,u,2>: Cost 3 vzipr LHS, <3,0,1,2>
+  1148437406U,	// <2,1,u,3>: Cost 2 vrev <1,2,3,u>
+  2551606582U,	// <2,1,u,4>: Cost 3 vext1 <0,2,1,u>, RHS
+  2953666898U,	// <2,1,u,5>: Cost 3 vzipr LHS, <0,4,1,5>
+  2587398596U,	// <2,1,u,6>: Cost 3 vext1 <6,2,1,3>, <6,2,1,3>
+  2669828370U,	// <2,1,u,7>: Cost 3 vext2 <u,7,2,1>, <u,7,2,1>
+  1148806091U,	// <2,1,u,u>: Cost 2 vrev <1,2,u,u>
+  1543667732U,	// <2,2,0,0>: Cost 2 vext2 <0,0,2,2>, <0,0,2,2>
+  1548976230U,	// <2,2,0,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  2685699524U,	// <2,2,0,2>: Cost 3 vext3 <0,2,0,2>, <2,0,2,0>
+  2685699535U,	// <2,2,0,3>: Cost 3 vext3 <0,2,0,2>, <2,0,3,2>
+  2551614774U,	// <2,2,0,4>: Cost 3 vext1 <0,2,2,0>, RHS
+  3704422830U,	// <2,2,0,5>: Cost 4 vext2 <2,2,2,2>, <0,5,2,7>
+  3893657642U,	// <2,2,0,6>: Cost 4 vuzpr <0,2,0,2>, <0,0,4,6>
+  3770574323U,	// <2,2,0,7>: Cost 4 vext3 <2,0,7,2>, <2,0,7,2>
+  1548976796U,	// <2,2,0,u>: Cost 2 vext2 <0,u,2,2>, <0,u,2,2>
+  2622718710U,	// <2,2,1,0>: Cost 3 vext2 <0,u,2,2>, <1,0,3,2>
+  2622718772U,	// <2,2,1,1>: Cost 3 vext2 <0,u,2,2>, <1,1,1,1>
+  2622718870U,	// <2,2,1,2>: Cost 3 vext2 <0,u,2,2>, <1,2,3,0>
+  2819915878U,	// <2,2,1,3>: Cost 3 vuzpr <0,2,0,2>, LHS
+  3625364790U,	// <2,2,1,4>: Cost 4 vext1 <0,2,2,1>, RHS
+  2622719120U,	// <2,2,1,5>: Cost 3 vext2 <0,u,2,2>, <1,5,3,7>
+  3760031292U,	// <2,2,1,6>: Cost 4 vext3 <0,2,u,2>, <2,1,6,3>
+  3667170468U,	// <2,2,1,7>: Cost 4 vext1 <7,2,2,1>, <7,2,2,1>
+  2819915883U,	// <2,2,1,u>: Cost 3 vuzpr <0,2,0,2>, LHS
+  1489829990U,	// <2,2,2,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  2563572470U,	// <2,2,2,1>: Cost 3 vext1 <2,2,2,2>, <1,0,3,2>
+  269271142U,	// <2,2,2,2>: Cost 1 vdup2 LHS
+  2685699698U,	// <2,2,2,3>: Cost 3 vext3 <0,2,0,2>, <2,2,3,3>
+  1489833270U,	// <2,2,2,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  2685699720U,	// <2,2,2,5>: Cost 3 vext3 <0,2,0,2>, <2,2,5,7>
+  2622719930U,	// <2,2,2,6>: Cost 3 vext2 <0,u,2,2>, <2,6,3,7>
+  2593436837U,	// <2,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U,	// <2,2,2,u>: Cost 1 vdup2 LHS
+  2685699750U,	// <2,2,3,0>: Cost 3 vext3 <0,2,0,2>, <2,3,0,1>
+  2690565806U,	// <2,2,3,1>: Cost 3 vext3 <1,0,3,2>, <2,3,1,0>
+  2953627240U,	// <2,2,3,2>: Cost 3 vzipr LHS, <2,2,2,2>
+  1879883878U,	// <2,2,3,3>: Cost 2 vzipr LHS, LHS
+  2685699790U,	// <2,2,3,4>: Cost 3 vext3 <0,2,0,2>, <2,3,4,5>
+  3893659342U,	// <2,2,3,5>: Cost 4 vuzpr <0,2,0,2>, <2,3,4,5>
+  2958270812U,	// <2,2,3,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2593445030U,	// <2,2,3,7>: Cost 3 vext1 <7,2,2,3>, <7,2,2,3>
+  1879883883U,	// <2,2,3,u>: Cost 2 vzipr LHS, LHS
+  2551644262U,	// <2,2,4,0>: Cost 3 vext1 <0,2,2,4>, LHS
+  3625386742U,	// <2,2,4,1>: Cost 4 vext1 <0,2,2,4>, <1,0,3,2>
+  2551645902U,	// <2,2,4,2>: Cost 3 vext1 <0,2,2,4>, <2,3,4,5>
+  3759441686U,	// <2,2,4,3>: Cost 4 vext3 <0,2,0,2>, <2,4,3,5>
+  2551647542U,	// <2,2,4,4>: Cost 3 vext1 <0,2,2,4>, RHS
+  1548979510U,	// <2,2,4,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2764901686U,	// <2,2,4,6>: Cost 3 vuzpl <2,2,2,2>, RHS
+  3667195047U,	// <2,2,4,7>: Cost 4 vext1 <7,2,2,4>, <7,2,2,4>
+  1548979753U,	// <2,2,4,u>: Cost 2 vext2 <0,u,2,2>, RHS
+  3696463432U,	// <2,2,5,0>: Cost 4 vext2 <0,u,2,2>, <5,0,1,2>
+  2617413328U,	// <2,2,5,1>: Cost 3 vext2 <0,0,2,2>, <5,1,7,3>
+  2685699936U,	// <2,2,5,2>: Cost 3 vext3 <0,2,0,2>, <2,5,2,7>
+  4027383910U,	// <2,2,5,3>: Cost 4 vzipr <0,1,2,5>, LHS
+  2228201085U,	// <2,2,5,4>: Cost 3 vrev <2,2,4,5>
+  2617413636U,	// <2,2,5,5>: Cost 3 vext2 <0,0,2,2>, <5,5,5,5>
+  2617413730U,	// <2,2,5,6>: Cost 3 vext2 <0,0,2,2>, <5,6,7,0>
+  2819919158U,	// <2,2,5,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  2819919159U,	// <2,2,5,u>: Cost 3 vuzpr <0,2,0,2>, RHS
+  3625402554U,	// <2,2,6,0>: Cost 4 vext1 <0,2,2,6>, <0,2,2,6>
+  3760031652U,	// <2,2,6,1>: Cost 4 vext3 <0,2,u,2>, <2,6,1,3>
+  2617414138U,	// <2,2,6,2>: Cost 3 vext2 <0,0,2,2>, <6,2,7,3>
+  2685700026U,	// <2,2,6,3>: Cost 3 vext3 <0,2,0,2>, <2,6,3,7>
+  3625405750U,	// <2,2,6,4>: Cost 4 vext1 <0,2,2,6>, RHS
+  3760031692U,	// <2,2,6,5>: Cost 4 vext3 <0,2,u,2>, <2,6,5,7>
+  3088679116U,	// <2,2,6,6>: Cost 3 vtrnr <0,2,4,6>, <0,2,4,6>
+  2657891169U,	// <2,2,6,7>: Cost 3 vext2 <6,7,2,2>, <6,7,2,2>
+  2685700071U,	// <2,2,6,u>: Cost 3 vext3 <0,2,0,2>, <2,6,u,7>
+  2726250474U,	// <2,2,7,0>: Cost 3 vext3 <7,0,1,2>, <2,7,0,1>
+  3704427616U,	// <2,2,7,1>: Cost 4 vext2 <2,2,2,2>, <7,1,3,5>
+  2660545701U,	// <2,2,7,2>: Cost 3 vext2 <7,2,2,2>, <7,2,2,2>
+  4030718054U,	// <2,2,7,3>: Cost 4 vzipr <0,6,2,7>, LHS
+  2617415014U,	// <2,2,7,4>: Cost 3 vext2 <0,0,2,2>, <7,4,5,6>
+  3302033032U,	// <2,2,7,5>: Cost 4 vrev <2,2,5,7>
+  3661246929U,	// <2,2,7,6>: Cost 4 vext1 <6,2,2,7>, <6,2,2,7>
+  2617415276U,	// <2,2,7,7>: Cost 3 vext2 <0,0,2,2>, <7,7,7,7>
+  2731558962U,	// <2,2,7,u>: Cost 3 vext3 <7,u,1,2>, <2,7,u,1>
+  1489829990U,	// <2,2,u,0>: Cost 2 vext1 <2,2,2,2>, LHS
+  1548982062U,	// <2,2,u,1>: Cost 2 vext2 <0,u,2,2>, LHS
+  269271142U,	// <2,2,u,2>: Cost 1 vdup2 LHS
+  1879924838U,	// <2,2,u,3>: Cost 2 vzipr LHS, LHS
+  1489833270U,	// <2,2,u,4>: Cost 2 vext1 <2,2,2,2>, RHS
+  1548982426U,	// <2,2,u,5>: Cost 2 vext2 <0,u,2,2>, RHS
+  2953666908U,	// <2,2,u,6>: Cost 3 vzipr LHS, <0,4,2,6>
+  2819919401U,	// <2,2,u,7>: Cost 3 vuzpr <0,2,0,2>, RHS
+  269271142U,	// <2,2,u,u>: Cost 1 vdup2 LHS
+  1544339456U,	// <2,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470597734U,	// <2,3,0,1>: Cost 1 vext2 LHS, LHS
+  1548984484U,	// <2,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2619408648U,	// <2,3,0,3>: Cost 3 vext2 <0,3,2,3>, <0,3,2,3>
+  1548984658U,	// <2,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665857454U,	// <2,3,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  2622726655U,	// <2,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2593494188U,	// <2,3,0,7>: Cost 3 vext1 <7,2,3,0>, <7,2,3,0>
+  470598301U,	// <2,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544340214U,	// <2,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544340276U,	// <2,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544340374U,	// <2,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1548985304U,	// <2,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2551696694U,	// <2,3,1,4>: Cost 3 vext1 <0,2,3,1>, RHS
+  1548985488U,	// <2,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2622727375U,	// <2,3,1,6>: Cost 3 vext2 LHS, <1,6,1,7>
+  2665858347U,	// <2,3,1,7>: Cost 3 vext2 LHS, <1,7,3,0>
+  1548985709U,	// <2,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  2622727613U,	// <2,3,2,0>: Cost 3 vext2 LHS, <2,0,1,2>
+  2622727711U,	// <2,3,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  1544341096U,	// <2,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544341158U,	// <2,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  2622727958U,	// <2,3,2,4>: Cost 3 vext2 LHS, <2,4,3,5>
+  2622728032U,	// <2,3,2,5>: Cost 3 vext2 LHS, <2,5,2,7>
+  1548986298U,	// <2,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2665859050U,	// <2,3,2,7>: Cost 3 vext2 LHS, <2,7,0,1>
+  1548986427U,	// <2,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1548986518U,	// <2,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2622728415U,	// <2,3,3,1>: Cost 3 vext2 LHS, <3,1,0,3>
+  1489913458U,	// <2,3,3,2>: Cost 2 vext1 <2,2,3,3>, <2,2,3,3>
+  1544341916U,	// <2,3,3,3>: Cost 2 vext2 LHS, <3,3,3,3>
+  1548986882U,	// <2,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2665859632U,	// <2,3,3,5>: Cost 3 vext2 LHS, <3,5,1,7>
+  2234304870U,	// <2,3,3,6>: Cost 3 vrev <3,2,6,3>
+  2958271632U,	// <2,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  1548987166U,	// <2,3,3,u>: Cost 2 vext2 LHS, <3,u,1,2>
+  1483948134U,	// <2,3,4,0>: Cost 2 vext1 <1,2,3,4>, LHS
+  1483948954U,	// <2,3,4,1>: Cost 2 vext1 <1,2,3,4>, <1,2,3,4>
+  2622729276U,	// <2,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2557692054U,	// <2,3,4,3>: Cost 3 vext1 <1,2,3,4>, <3,0,1,2>
+  1483951414U,	// <2,3,4,4>: Cost 2 vext1 <1,2,3,4>, RHS
+  470601014U,	// <2,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592118644U,	// <2,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2593526960U,	// <2,3,4,7>: Cost 3 vext1 <7,2,3,4>, <7,2,3,4>
+  470601257U,	// <2,3,4,u>: Cost 1 vext2 LHS, RHS
+  2551726182U,	// <2,3,5,0>: Cost 3 vext1 <0,2,3,5>, LHS
+  1592118992U,	// <2,3,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2665860862U,	// <2,3,5,2>: Cost 3 vext2 LHS, <5,2,3,4>
+  2551728642U,	// <2,3,5,3>: Cost 3 vext1 <0,2,3,5>, <3,4,5,6>
+  1592119238U,	// <2,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592119300U,	// <2,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592119394U,	// <2,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1592119464U,	// <2,3,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1592119545U,	// <2,3,5,u>: Cost 2 vext2 LHS, <5,u,5,7>
+  2622730529U,	// <2,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2557707164U,	// <2,3,6,1>: Cost 3 vext1 <1,2,3,6>, <1,2,3,6>
+  1592119802U,	// <2,3,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2665861682U,	// <2,3,6,3>: Cost 3 vext2 LHS, <6,3,4,5>
+  2622730893U,	// <2,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  2665861810U,	// <2,3,6,5>: Cost 3 vext2 LHS, <6,5,0,7>
+  1592120120U,	// <2,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592120142U,	// <2,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1592120223U,	// <2,3,6,u>: Cost 2 vext2 LHS, <6,u,0,1>
+  1592120314U,	// <2,3,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659890261U,	// <2,3,7,1>: Cost 3 vext2 <7,1,2,3>, <7,1,2,3>
+  2660553894U,	// <2,3,7,2>: Cost 3 vext2 <7,2,2,3>, <7,2,2,3>
+  2665862371U,	// <2,3,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592120678U,	// <2,3,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665862534U,	// <2,3,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2665862614U,	// <2,3,7,6>: Cost 3 vext2 LHS, <7,6,0,1>
+  1592120940U,	// <2,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592120962U,	// <2,3,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1548990163U,	// <2,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  470603566U,	// <2,3,u,1>: Cost 1 vext2 LHS, LHS
+  1548990341U,	// <2,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  1548990396U,	// <2,3,u,3>: Cost 2 vext2 LHS, <u,3,0,1>
+  1548990527U,	// <2,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  470603930U,	// <2,3,u,5>: Cost 1 vext2 LHS, RHS
+  1548990672U,	// <2,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1592121600U,	// <2,3,u,7>: Cost 2 vext2 LHS, <u,7,0,1>
+  470604133U,	// <2,3,u,u>: Cost 1 vext2 LHS, LHS
+  2617425942U,	// <2,4,0,0>: Cost 3 vext2 <0,0,2,4>, <0,0,2,4>
+  2618753126U,	// <2,4,0,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2618753208U,	// <2,4,0,2>: Cost 3 vext2 <0,2,2,4>, <0,2,2,4>
+  2619416841U,	// <2,4,0,3>: Cost 3 vext2 <0,3,2,4>, <0,3,2,4>
+  2587593628U,	// <2,4,0,4>: Cost 3 vext1 <6,2,4,0>, <4,0,6,2>
+  2712832914U,	// <2,4,0,5>: Cost 3 vext3 <4,6,u,2>, <4,0,5,1>
+  1634962332U,	// <2,4,0,6>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  3799993252U,	// <2,4,0,7>: Cost 4 vext3 <7,0,1,2>, <4,0,7,1>
+  1634962332U,	// <2,4,0,u>: Cost 2 vext3 <4,0,6,2>, <4,0,6,2>
+  2619417334U,	// <2,4,1,0>: Cost 3 vext2 <0,3,2,4>, <1,0,3,2>
+  3692495668U,	// <2,4,1,1>: Cost 4 vext2 <0,2,2,4>, <1,1,1,1>
+  2625389466U,	// <2,4,1,2>: Cost 3 vext2 <1,3,2,4>, <1,2,3,4>
+  2826125414U,	// <2,4,1,3>: Cost 3 vuzpr <1,2,3,4>, LHS
+  3699794995U,	// <2,4,1,4>: Cost 4 vext2 <1,4,2,4>, <1,4,2,4>
+  3692496016U,	// <2,4,1,5>: Cost 4 vext2 <0,2,2,4>, <1,5,3,7>
+  3763424238U,	// <2,4,1,6>: Cost 4 vext3 <0,u,0,2>, <4,1,6,3>
+  3667317942U,	// <2,4,1,7>: Cost 4 vext1 <7,2,4,1>, <7,2,4,1>
+  2826125419U,	// <2,4,1,u>: Cost 3 vuzpr <1,2,3,4>, LHS
+  2629371336U,	// <2,4,2,0>: Cost 3 vext2 <2,0,2,4>, <2,0,2,4>
+  3699131946U,	// <2,4,2,1>: Cost 4 vext2 <1,3,2,4>, <2,1,4,3>
+  2630698602U,	// <2,4,2,2>: Cost 3 vext2 <2,2,2,4>, <2,2,2,4>
+  2618754766U,	// <2,4,2,3>: Cost 3 vext2 <0,2,2,4>, <2,3,4,5>
+  2826126234U,	// <2,4,2,4>: Cost 3 vuzpr <1,2,3,4>, <1,2,3,4>
+  2899119414U,	// <2,4,2,5>: Cost 3 vzipl <2,2,2,2>, RHS
+  3033337142U,	// <2,4,2,6>: Cost 3 vtrnl <2,2,2,2>, RHS
+  3800214597U,	// <2,4,2,7>: Cost 4 vext3 <7,0,4,2>, <4,2,7,0>
+  2899119657U,	// <2,4,2,u>: Cost 3 vzipl <2,2,2,2>, RHS
+  2635344033U,	// <2,4,3,0>: Cost 3 vext2 <3,0,2,4>, <3,0,2,4>
+  4032012325U,	// <2,4,3,1>: Cost 4 vzipr LHS, <0,0,4,1>
+  3692497228U,	// <2,4,3,2>: Cost 4 vext2 <0,2,2,4>, <3,2,3,4>
+  3692497308U,	// <2,4,3,3>: Cost 4 vext2 <0,2,2,4>, <3,3,3,3>
+  3001404624U,	// <2,4,3,4>: Cost 3 vzipr LHS, <4,4,4,4>
+  2953627342U,	// <2,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2953625804U,	// <2,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3899868160U,	// <2,4,3,7>: Cost 4 vuzpr <1,2,3,4>, <1,3,5,7>
+  2953625806U,	// <2,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  2710916266U,	// <2,4,4,0>: Cost 3 vext3 <4,4,0,2>, <4,4,0,2>
+  3899869648U,	// <2,4,4,1>: Cost 4 vuzpr <1,2,3,4>, <3,4,0,1>
+  3899869658U,	// <2,4,4,2>: Cost 4 vuzpr <1,2,3,4>, <3,4,1,2>
+  3899868930U,	// <2,4,4,3>: Cost 4 vuzpr <1,2,3,4>, <2,4,1,3>
+  2712833232U,	// <2,4,4,4>: Cost 3 vext3 <4,6,u,2>, <4,4,4,4>
+  2618756406U,	// <2,4,4,5>: Cost 3 vext2 <0,2,2,4>, RHS
+  2765737270U,	// <2,4,4,6>: Cost 3 vuzpl <2,3,4,5>, RHS
+  4168304426U,	// <2,4,4,7>: Cost 4 vtrnr <1,2,3,4>, <2,4,5,7>
+  2618756649U,	// <2,4,4,u>: Cost 3 vext2 <0,2,2,4>, RHS
+  2551800011U,	// <2,4,5,0>: Cost 3 vext1 <0,2,4,5>, <0,2,4,5>
+  2569716470U,	// <2,4,5,1>: Cost 3 vext1 <3,2,4,5>, <1,0,3,2>
+  2563745405U,	// <2,4,5,2>: Cost 3 vext1 <2,2,4,5>, <2,2,4,5>
+  2569718102U,	// <2,4,5,3>: Cost 3 vext1 <3,2,4,5>, <3,2,4,5>
+  2551803190U,	// <2,4,5,4>: Cost 3 vext1 <0,2,4,5>, RHS
+  3625545732U,	// <2,4,5,5>: Cost 4 vext1 <0,2,4,5>, <5,5,5,5>
+  1611959606U,	// <2,4,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128694U,	// <2,4,5,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959624U,	// <2,4,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478066278U,	// <2,4,6,0>: Cost 2 vext1 <0,2,4,6>, LHS
+  2551808758U,	// <2,4,6,1>: Cost 3 vext1 <0,2,4,6>, <1,0,3,2>
+  2551809516U,	// <2,4,6,2>: Cost 3 vext1 <0,2,4,6>, <2,0,6,4>
+  2551810198U,	// <2,4,6,3>: Cost 3 vext1 <0,2,4,6>, <3,0,1,2>
+  1478069558U,	// <2,4,6,4>: Cost 2 vext1 <0,2,4,6>, RHS
+  2901888310U,	// <2,4,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  2551812920U,	// <2,4,6,6>: Cost 3 vext1 <0,2,4,6>, <6,6,6,6>
+  2726251914U,	// <2,4,6,7>: Cost 3 vext3 <7,0,1,2>, <4,6,7,1>
+  1478072110U,	// <2,4,6,u>: Cost 2 vext1 <0,2,4,6>, LHS
+  2659234821U,	// <2,4,7,0>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  3786722726U,	// <2,4,7,1>: Cost 4 vext3 <4,7,1,2>, <4,7,1,2>
+  3734303911U,	// <2,4,7,2>: Cost 4 vext2 <7,2,2,4>, <7,2,2,4>
+  3734967544U,	// <2,4,7,3>: Cost 4 vext2 <7,3,2,4>, <7,3,2,4>
+  3727005030U,	// <2,4,7,4>: Cost 4 vext2 <6,0,2,4>, <7,4,5,6>
+  2726251976U,	// <2,4,7,5>: Cost 3 vext3 <7,0,1,2>, <4,7,5,0>
+  2726251986U,	// <2,4,7,6>: Cost 3 vext3 <7,0,1,2>, <4,7,6,1>
+  3727005292U,	// <2,4,7,7>: Cost 4 vext2 <6,0,2,4>, <7,7,7,7>
+  2659234821U,	// <2,4,7,u>: Cost 3 vext2 <7,0,2,4>, <7,0,2,4>
+  1478082662U,	// <2,4,u,0>: Cost 2 vext1 <0,2,4,u>, LHS
+  2618758958U,	// <2,4,u,1>: Cost 3 vext2 <0,2,2,4>, LHS
+  2551826024U,	// <2,4,u,2>: Cost 3 vext1 <0,2,4,u>, <2,2,2,2>
+  2551826582U,	// <2,4,u,3>: Cost 3 vext1 <0,2,4,u>, <3,0,1,2>
+  1478085942U,	// <2,4,u,4>: Cost 2 vext1 <0,2,4,u>, RHS
+  2953668302U,	// <2,4,u,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  1611959849U,	// <2,4,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  2826128937U,	// <2,4,u,7>: Cost 3 vuzpr <1,2,3,4>, RHS
+  1611959867U,	// <2,4,u,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  3691839488U,	// <2,5,0,0>: Cost 4 vext2 <0,1,2,5>, <0,0,0,0>
+  2618097766U,	// <2,5,0,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620088484U,	// <2,5,0,2>: Cost 3 vext2 <0,4,2,5>, <0,2,0,2>
+  2619425034U,	// <2,5,0,3>: Cost 3 vext2 <0,3,2,5>, <0,3,2,5>
+  2620088667U,	// <2,5,0,4>: Cost 3 vext2 <0,4,2,5>, <0,4,2,5>
+  2620752300U,	// <2,5,0,5>: Cost 3 vext2 <0,5,2,5>, <0,5,2,5>
+  3693830655U,	// <2,5,0,6>: Cost 4 vext2 <0,4,2,5>, <0,6,2,7>
+  3094531382U,	// <2,5,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  2618098333U,	// <2,5,0,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  3691840246U,	// <2,5,1,0>: Cost 4 vext2 <0,1,2,5>, <1,0,3,2>
+  3691840308U,	// <2,5,1,1>: Cost 4 vext2 <0,1,2,5>, <1,1,1,1>
+  2626061206U,	// <2,5,1,2>: Cost 3 vext2 <1,4,2,5>, <1,2,3,0>
+  2618098688U,	// <2,5,1,3>: Cost 3 vext2 <0,1,2,5>, <1,3,5,7>
+  2626061364U,	// <2,5,1,4>: Cost 3 vext2 <1,4,2,5>, <1,4,2,5>
+  3691840656U,	// <2,5,1,5>: Cost 4 vext2 <0,1,2,5>, <1,5,3,7>
+  3789082310U,	// <2,5,1,6>: Cost 4 vext3 <5,1,6,2>, <5,1,6,2>
+  2712833744U,	// <2,5,1,7>: Cost 3 vext3 <4,6,u,2>, <5,1,7,3>
+  2628715896U,	// <2,5,1,u>: Cost 3 vext2 <1,u,2,5>, <1,u,2,5>
+  3693831613U,	// <2,5,2,0>: Cost 4 vext2 <0,4,2,5>, <2,0,1,2>
+  4026698642U,	// <2,5,2,1>: Cost 4 vzipr <0,0,2,2>, <4,0,5,1>
+  2632033896U,	// <2,5,2,2>: Cost 3 vext2 <2,4,2,5>, <2,2,2,2>
+  3691841190U,	// <2,5,2,3>: Cost 4 vext2 <0,1,2,5>, <2,3,0,1>
+  2632034061U,	// <2,5,2,4>: Cost 3 vext2 <2,4,2,5>, <2,4,2,5>
+  3691841352U,	// <2,5,2,5>: Cost 4 vext2 <0,1,2,5>, <2,5,0,1>
+  3691841466U,	// <2,5,2,6>: Cost 4 vext2 <0,1,2,5>, <2,6,3,7>
+  3088354614U,	// <2,5,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  3088354615U,	// <2,5,2,u>: Cost 3 vtrnr <0,2,0,2>, RHS
+  2557829222U,	// <2,5,3,0>: Cost 3 vext1 <1,2,5,3>, LHS
+  2557830059U,	// <2,5,3,1>: Cost 3 vext1 <1,2,5,3>, <1,2,5,3>
+  2575746766U,	// <2,5,3,2>: Cost 3 vext1 <4,2,5,3>, <2,3,4,5>
+  3691841948U,	// <2,5,3,3>: Cost 4 vext2 <0,1,2,5>, <3,3,3,3>
+  2619427330U,	// <2,5,3,4>: Cost 3 vext2 <0,3,2,5>, <3,4,5,6>
+  2581720847U,	// <2,5,3,5>: Cost 3 vext1 <5,2,5,3>, <5,2,5,3>
+  2953628162U,	// <2,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953626624U,	// <2,5,3,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2953626625U,	// <2,5,3,u>: Cost 3 vzipr LHS, <1,3,5,u>
+  2569781350U,	// <2,5,4,0>: Cost 3 vext1 <3,2,5,4>, LHS
+  3631580076U,	// <2,5,4,1>: Cost 4 vext1 <1,2,5,4>, <1,2,5,4>
+  2569782990U,	// <2,5,4,2>: Cost 3 vext1 <3,2,5,4>, <2,3,4,5>
+  2569783646U,	// <2,5,4,3>: Cost 3 vext1 <3,2,5,4>, <3,2,5,4>
+  2569784630U,	// <2,5,4,4>: Cost 3 vext1 <3,2,5,4>, RHS
+  2618101046U,	// <2,5,4,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  3893905922U,	// <2,5,4,6>: Cost 4 vuzpr <0,2,3,5>, <3,4,5,6>
+  3094564150U,	// <2,5,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  2618101289U,	// <2,5,4,u>: Cost 3 vext2 <0,1,2,5>, RHS
+  2551873638U,	// <2,5,5,0>: Cost 3 vext1 <0,2,5,5>, LHS
+  3637560320U,	// <2,5,5,1>: Cost 4 vext1 <2,2,5,5>, <1,3,5,7>
+  3637560966U,	// <2,5,5,2>: Cost 4 vext1 <2,2,5,5>, <2,2,5,5>
+  3723030343U,	// <2,5,5,3>: Cost 4 vext2 <5,3,2,5>, <5,3,2,5>
+  2551876918U,	// <2,5,5,4>: Cost 3 vext1 <0,2,5,5>, RHS
+  2712834052U,	// <2,5,5,5>: Cost 3 vext3 <4,6,u,2>, <5,5,5,5>
+  4028713474U,	// <2,5,5,6>: Cost 4 vzipr <0,3,2,5>, <3,4,5,6>
+  2712834072U,	// <2,5,5,7>: Cost 3 vext3 <4,6,u,2>, <5,5,7,7>
+  2712834081U,	// <2,5,5,u>: Cost 3 vext3 <4,6,u,2>, <5,5,u,7>
+  2575769702U,	// <2,5,6,0>: Cost 3 vext1 <4,2,5,6>, LHS
+  3631596462U,	// <2,5,6,1>: Cost 4 vext1 <1,2,5,6>, <1,2,5,6>
+  2655924730U,	// <2,5,6,2>: Cost 3 vext2 <6,4,2,5>, <6,2,7,3>
+  3643541856U,	// <2,5,6,3>: Cost 4 vext1 <3,2,5,6>, <3,2,5,6>
+  2655924849U,	// <2,5,6,4>: Cost 3 vext2 <6,4,2,5>, <6,4,2,5>
+  3787755607U,	// <2,5,6,5>: Cost 4 vext3 <4,u,6,2>, <5,6,5,7>
+  4029385218U,	// <2,5,6,6>: Cost 4 vzipr <0,4,2,6>, <3,4,5,6>
+  3088682294U,	// <2,5,6,7>: Cost 3 vtrnr <0,2,4,6>, RHS
+  3088682295U,	// <2,5,6,u>: Cost 3 vtrnr <0,2,4,6>, RHS
+  2563833958U,	// <2,5,7,0>: Cost 3 vext1 <2,2,5,7>, LHS
+  2551890678U,	// <2,5,7,1>: Cost 3 vext1 <0,2,5,7>, <1,0,3,2>
+  2563835528U,	// <2,5,7,2>: Cost 3 vext1 <2,2,5,7>, <2,2,5,7>
+  3637577878U,	// <2,5,7,3>: Cost 4 vext1 <2,2,5,7>, <3,0,1,2>
+  2563837238U,	// <2,5,7,4>: Cost 3 vext1 <2,2,5,7>, RHS
+  2712834216U,	// <2,5,7,5>: Cost 3 vext3 <4,6,u,2>, <5,7,5,7>
+  2712834220U,	// <2,5,7,6>: Cost 3 vext3 <4,6,u,2>, <5,7,6,2>
+  4174449974U,	// <2,5,7,7>: Cost 4 vtrnr <2,2,5,7>, RHS
+  2563839790U,	// <2,5,7,u>: Cost 3 vext1 <2,2,5,7>, LHS
+  2563842150U,	// <2,5,u,0>: Cost 3 vext1 <2,2,5,u>, LHS
+  2618103598U,	// <2,5,u,1>: Cost 3 vext2 <0,1,2,5>, LHS
+  2563843721U,	// <2,5,u,2>: Cost 3 vext1 <2,2,5,u>, <2,2,5,u>
+  2569816418U,	// <2,5,u,3>: Cost 3 vext1 <3,2,5,u>, <3,2,5,u>
+  2622748735U,	// <2,5,u,4>: Cost 3 vext2 <0,u,2,5>, <u,4,5,6>
+  2618103962U,	// <2,5,u,5>: Cost 3 vext2 <0,1,2,5>, RHS
+  2953669122U,	// <2,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2953667584U,	// <2,5,u,7>: Cost 3 vzipr LHS, <1,3,5,7>
+  2618104165U,	// <2,5,u,u>: Cost 3 vext2 <0,1,2,5>, LHS
+  2620096512U,	// <2,6,0,0>: Cost 3 vext2 <0,4,2,6>, <0,0,0,0>
+  1546354790U,	// <2,6,0,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620096676U,	// <2,6,0,2>: Cost 3 vext2 <0,4,2,6>, <0,2,0,2>
+  3693838588U,	// <2,6,0,3>: Cost 4 vext2 <0,4,2,6>, <0,3,1,0>
+  1546355036U,	// <2,6,0,4>: Cost 2 vext2 <0,4,2,6>, <0,4,2,6>
+  3694502317U,	// <2,6,0,5>: Cost 4 vext2 <0,5,2,6>, <0,5,2,6>
+  2551911246U,	// <2,6,0,6>: Cost 3 vext1 <0,2,6,0>, <6,7,0,1>
+  2720723287U,	// <2,6,0,7>: Cost 3 vext3 <6,0,7,2>, <6,0,7,2>
+  1546355357U,	// <2,6,0,u>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620097270U,	// <2,6,1,0>: Cost 3 vext2 <0,4,2,6>, <1,0,3,2>
+  2620097332U,	// <2,6,1,1>: Cost 3 vext2 <0,4,2,6>, <1,1,1,1>
+  2620097430U,	// <2,6,1,2>: Cost 3 vext2 <0,4,2,6>, <1,2,3,0>
+  2820243558U,	// <2,6,1,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2620097598U,	// <2,6,1,4>: Cost 3 vext2 <0,4,2,6>, <1,4,3,6>
+  2620097680U,	// <2,6,1,5>: Cost 3 vext2 <0,4,2,6>, <1,5,3,7>
+  3693839585U,	// <2,6,1,6>: Cost 4 vext2 <0,4,2,6>, <1,6,3,7>
+  2721386920U,	// <2,6,1,7>: Cost 3 vext3 <6,1,7,2>, <6,1,7,2>
+  2820243563U,	// <2,6,1,u>: Cost 3 vuzpr <0,2,4,6>, LHS
+  2714014137U,	// <2,6,2,0>: Cost 3 vext3 <4,u,6,2>, <6,2,0,1>
+  2712834500U,	// <2,6,2,1>: Cost 3 vext3 <4,6,u,2>, <6,2,1,3>
+  2620098152U,	// <2,6,2,2>: Cost 3 vext2 <0,4,2,6>, <2,2,2,2>
+  2620098214U,	// <2,6,2,3>: Cost 3 vext2 <0,4,2,6>, <2,3,0,1>
+  2632042254U,	// <2,6,2,4>: Cost 3 vext2 <2,4,2,6>, <2,4,2,6>
+  2712834540U,	// <2,6,2,5>: Cost 3 vext3 <4,6,u,2>, <6,2,5,7>
+  2820243660U,	// <2,6,2,6>: Cost 3 vuzpr <0,2,4,6>, <0,2,4,6>
+  2958265654U,	// <2,6,2,7>: Cost 3 vzipr <0,u,2,2>, RHS
+  2620098619U,	// <2,6,2,u>: Cost 3 vext2 <0,4,2,6>, <2,u,0,1>
+  2620098710U,	// <2,6,3,0>: Cost 3 vext2 <0,4,2,6>, <3,0,1,2>
+  3893986982U,	// <2,6,3,1>: Cost 4 vuzpr <0,2,4,6>, <2,3,0,1>
+  2569848762U,	// <2,6,3,2>: Cost 3 vext1 <3,2,6,3>, <2,6,3,7>
+  2620098972U,	// <2,6,3,3>: Cost 3 vext2 <0,4,2,6>, <3,3,3,3>
+  2620099074U,	// <2,6,3,4>: Cost 3 vext2 <0,4,2,6>, <3,4,5,6>
+  3893987022U,	// <2,6,3,5>: Cost 4 vuzpr <0,2,4,6>, <2,3,4,5>
+  3001404644U,	// <2,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1879887158U,	// <2,6,3,7>: Cost 2 vzipr LHS, RHS
+  1879887159U,	// <2,6,3,u>: Cost 2 vzipr LHS, RHS
+  2620099484U,	// <2,6,4,0>: Cost 3 vext2 <0,4,2,6>, <4,0,6,2>
+  2620099566U,	// <2,6,4,1>: Cost 3 vext2 <0,4,2,6>, <4,1,6,3>
+  2620099644U,	// <2,6,4,2>: Cost 3 vext2 <0,4,2,6>, <4,2,6,0>
+  3643599207U,	// <2,6,4,3>: Cost 4 vext1 <3,2,6,4>, <3,2,6,4>
+  2575830080U,	// <2,6,4,4>: Cost 3 vext1 <4,2,6,4>, <4,2,6,4>
+  1546358070U,	// <2,6,4,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2667875700U,	// <2,6,4,6>: Cost 3 vext2 <u,4,2,6>, <4,6,4,6>
+  4028042550U,	// <2,6,4,7>: Cost 4 vzipr <0,2,2,4>, RHS
+  1546358313U,	// <2,6,4,u>: Cost 2 vext2 <0,4,2,6>, RHS
+  3693841992U,	// <2,6,5,0>: Cost 4 vext2 <0,4,2,6>, <5,0,1,2>
+  2667876048U,	// <2,6,5,1>: Cost 3 vext2 <u,4,2,6>, <5,1,7,3>
+  2712834756U,	// <2,6,5,2>: Cost 3 vext3 <4,6,u,2>, <6,5,2,7>
+  3643607400U,	// <2,6,5,3>: Cost 4 vext1 <3,2,6,5>, <3,2,6,5>
+  2252091873U,	// <2,6,5,4>: Cost 3 vrev <6,2,4,5>
+  2667876356U,	// <2,6,5,5>: Cost 3 vext2 <u,4,2,6>, <5,5,5,5>
+  2667876450U,	// <2,6,5,6>: Cost 3 vext2 <u,4,2,6>, <5,6,7,0>
+  2820246838U,	// <2,6,5,7>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2820246839U,	// <2,6,5,u>: Cost 3 vuzpr <0,2,4,6>, RHS
+  2563899494U,	// <2,6,6,0>: Cost 3 vext1 <2,2,6,6>, LHS
+  3893988683U,	// <2,6,6,1>: Cost 4 vuzpr <0,2,4,6>, <4,6,0,1>
+  2563901072U,	// <2,6,6,2>: Cost 3 vext1 <2,2,6,6>, <2,2,6,6>
+  3893987236U,	// <2,6,6,3>: Cost 4 vuzpr <0,2,4,6>, <2,6,1,3>
+  2563902774U,	// <2,6,6,4>: Cost 3 vext1 <2,2,6,6>, RHS
+  3893988723U,	// <2,6,6,5>: Cost 4 vuzpr <0,2,4,6>, <4,6,4,5>
+  2712834872U,	// <2,6,6,6>: Cost 3 vext3 <4,6,u,2>, <6,6,6,6>
+  2955644214U,	// <2,6,6,7>: Cost 3 vzipr <0,4,2,6>, RHS
+  2955644215U,	// <2,6,6,u>: Cost 3 vzipr <0,4,2,6>, RHS
+  2712834894U,	// <2,6,7,0>: Cost 3 vext3 <4,6,u,2>, <6,7,0,1>
+  2724926296U,	// <2,6,7,1>: Cost 3 vext3 <6,7,1,2>, <6,7,1,2>
+  2725000033U,	// <2,6,7,2>: Cost 3 vext3 <6,7,2,2>, <6,7,2,2>
+  2702365544U,	// <2,6,7,3>: Cost 3 vext3 <3,0,1,2>, <6,7,3,0>
+  2712834934U,	// <2,6,7,4>: Cost 3 vext3 <4,6,u,2>, <6,7,4,5>
+  3776107393U,	// <2,6,7,5>: Cost 4 vext3 <3,0,1,2>, <6,7,5,7>
+  2725294981U,	// <2,6,7,6>: Cost 3 vext3 <6,7,6,2>, <6,7,6,2>
+  2726253452U,	// <2,6,7,7>: Cost 3 vext3 <7,0,1,2>, <6,7,7,0>
+  2712834966U,	// <2,6,7,u>: Cost 3 vext3 <4,6,u,2>, <6,7,u,1>
+  2620102355U,	// <2,6,u,0>: Cost 3 vext2 <0,4,2,6>, <u,0,1,2>
+  1546360622U,	// <2,6,u,1>: Cost 2 vext2 <0,4,2,6>, LHS
+  2620102536U,	// <2,6,u,2>: Cost 3 vext2 <0,4,2,6>, <u,2,3,3>
+  2820244125U,	// <2,6,u,3>: Cost 3 vuzpr <0,2,4,6>, LHS
+  1594136612U,	// <2,6,u,4>: Cost 2 vext2 <u,4,2,6>, <u,4,2,6>
+  1546360986U,	// <2,6,u,5>: Cost 2 vext2 <0,4,2,6>, RHS
+  2620102864U,	// <2,6,u,6>: Cost 3 vext2 <0,4,2,6>, <u,6,3,7>
+  1879928118U,	// <2,6,u,7>: Cost 2 vzipr LHS, RHS
+  1879928119U,	// <2,6,u,u>: Cost 2 vzipr LHS, RHS
+  2726179825U,	// <2,7,0,0>: Cost 3 vext3 <7,0,0,2>, <7,0,0,2>
+  1652511738U,	// <2,7,0,1>: Cost 2 vext3 <7,0,1,2>, <7,0,1,2>
+  2621431972U,	// <2,7,0,2>: Cost 3 vext2 <0,6,2,7>, <0,2,0,2>
+  2257949868U,	// <2,7,0,3>: Cost 3 vrev <7,2,3,0>
+  2726474773U,	// <2,7,0,4>: Cost 3 vext3 <7,0,4,2>, <7,0,4,2>
+  2620768686U,	// <2,7,0,5>: Cost 3 vext2 <0,5,2,7>, <0,5,2,7>
+  2621432319U,	// <2,7,0,6>: Cost 3 vext2 <0,6,2,7>, <0,6,2,7>
+  2599760953U,	// <2,7,0,7>: Cost 3 vext1 <u,2,7,0>, <7,0,u,2>
+  1653027897U,	// <2,7,0,u>: Cost 2 vext3 <7,0,u,2>, <7,0,u,2>
+  2639348470U,	// <2,7,1,0>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695174452U,	// <2,7,1,1>: Cost 4 vext2 <0,6,2,7>, <1,1,1,1>
+  3695174550U,	// <2,7,1,2>: Cost 4 vext2 <0,6,2,7>, <1,2,3,0>
+  3694511104U,	// <2,7,1,3>: Cost 4 vext2 <0,5,2,7>, <1,3,5,7>
+  3713090594U,	// <2,7,1,4>: Cost 4 vext2 <3,6,2,7>, <1,4,0,5>
+  3693184144U,	// <2,7,1,5>: Cost 4 vext2 <0,3,2,7>, <1,5,3,7>
+  2627405016U,	// <2,7,1,6>: Cost 3 vext2 <1,6,2,7>, <1,6,2,7>
+  3799995519U,	// <2,7,1,7>: Cost 4 vext3 <7,0,1,2>, <7,1,7,0>
+  2639348470U,	// <2,7,1,u>: Cost 3 vext2 <3,6,2,7>, <1,0,3,2>
+  3695175101U,	// <2,7,2,0>: Cost 4 vext2 <0,6,2,7>, <2,0,1,2>
+  3643655168U,	// <2,7,2,1>: Cost 4 vext1 <3,2,7,2>, <1,3,5,7>
+  2257892517U,	// <2,7,2,2>: Cost 3 vrev <7,2,2,2>
+  3695175334U,	// <2,7,2,3>: Cost 4 vext2 <0,6,2,7>, <2,3,0,1>
+  3695175465U,	// <2,7,2,4>: Cost 4 vext2 <0,6,2,7>, <2,4,5,6>
+  2632714080U,	// <2,7,2,5>: Cost 3 vext2 <2,5,2,7>, <2,5,2,7>
+  2633377713U,	// <2,7,2,6>: Cost 3 vext2 <2,6,2,7>, <2,6,2,7>
+  3695175658U,	// <2,7,2,7>: Cost 4 vext2 <0,6,2,7>, <2,7,0,1>
+  2634704979U,	// <2,7,2,u>: Cost 3 vext2 <2,u,2,7>, <2,u,2,7>
+  1514094694U,	// <2,7,3,0>: Cost 2 vext1 <6,2,7,3>, LHS
+  2569921680U,	// <2,7,3,1>: Cost 3 vext1 <3,2,7,3>, <1,5,3,7>
+  2587838056U,	// <2,7,3,2>: Cost 3 vext1 <6,2,7,3>, <2,2,2,2>
+  2569922927U,	// <2,7,3,3>: Cost 3 vext1 <3,2,7,3>, <3,2,7,3>
+  1514097974U,	// <2,7,3,4>: Cost 2 vext1 <6,2,7,3>, RHS
+  2581868321U,	// <2,7,3,5>: Cost 3 vext1 <5,2,7,3>, <5,2,7,3>
+  1514099194U,	// <2,7,3,6>: Cost 2 vext1 <6,2,7,3>, <6,2,7,3>
+  2587841530U,	// <2,7,3,7>: Cost 3 vext1 <6,2,7,3>, <7,0,1,2>
+  1514100526U,	// <2,7,3,u>: Cost 2 vext1 <6,2,7,3>, LHS
+  2708706617U,	// <2,7,4,0>: Cost 3 vext3 <4,0,6,2>, <7,4,0,6>
+  3649643418U,	// <2,7,4,1>: Cost 4 vext1 <4,2,7,4>, <1,2,3,4>
+  3649644330U,	// <2,7,4,2>: Cost 4 vext1 <4,2,7,4>, <2,4,5,7>
+  2257982640U,	// <2,7,4,3>: Cost 3 vrev <7,2,3,4>
+  3649645641U,	// <2,7,4,4>: Cost 4 vext1 <4,2,7,4>, <4,2,7,4>
+  2621435190U,	// <2,7,4,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  2712835441U,	// <2,7,4,6>: Cost 3 vext3 <4,6,u,2>, <7,4,6,u>
+  3799995762U,	// <2,7,4,7>: Cost 4 vext3 <7,0,1,2>, <7,4,7,0>
+  2621435433U,	// <2,7,4,u>: Cost 3 vext2 <0,6,2,7>, RHS
+  2729497990U,	// <2,7,5,0>: Cost 3 vext3 <7,5,0,2>, <7,5,0,2>
+  3643679744U,	// <2,7,5,1>: Cost 4 vext1 <3,2,7,5>, <1,3,5,7>
+  3637708424U,	// <2,7,5,2>: Cost 4 vext1 <2,2,7,5>, <2,2,5,7>
+  3643681137U,	// <2,7,5,3>: Cost 4 vext1 <3,2,7,5>, <3,2,7,5>
+  2599800118U,	// <2,7,5,4>: Cost 3 vext1 <u,2,7,5>, RHS
+  3786577334U,	// <2,7,5,5>: Cost 4 vext3 <4,6,u,2>, <7,5,5,5>
+  3786577345U,	// <2,7,5,6>: Cost 4 vext3 <4,6,u,2>, <7,5,6,7>
+  2599802214U,	// <2,7,5,7>: Cost 3 vext1 <u,2,7,5>, <7,4,5,6>
+  2599802670U,	// <2,7,5,u>: Cost 3 vext1 <u,2,7,5>, LHS
+  2581889126U,	// <2,7,6,0>: Cost 3 vext1 <5,2,7,6>, LHS
+  3643687936U,	// <2,7,6,1>: Cost 4 vext1 <3,2,7,6>, <1,3,5,7>
+  2663240186U,	// <2,7,6,2>: Cost 3 vext2 <7,6,2,7>, <6,2,7,3>
+  3643689330U,	// <2,7,6,3>: Cost 4 vext1 <3,2,7,6>, <3,2,7,6>
+  2581892406U,	// <2,7,6,4>: Cost 3 vext1 <5,2,7,6>, RHS
+  2581892900U,	// <2,7,6,5>: Cost 3 vext1 <5,2,7,6>, <5,2,7,6>
+  2587865597U,	// <2,7,6,6>: Cost 3 vext1 <6,2,7,6>, <6,2,7,6>
+  3786577428U,	// <2,7,6,7>: Cost 4 vext3 <4,6,u,2>, <7,6,7,0>
+  2581894958U,	// <2,7,6,u>: Cost 3 vext1 <5,2,7,6>, LHS
+  2726254119U,	// <2,7,7,0>: Cost 3 vext3 <7,0,1,2>, <7,7,0,1>
+  3804640817U,	// <2,7,7,1>: Cost 4 vext3 <7,7,1,2>, <7,7,1,2>
+  3637724826U,	// <2,7,7,2>: Cost 4 vext1 <2,2,7,7>, <2,2,7,7>
+  3734992123U,	// <2,7,7,3>: Cost 4 vext2 <7,3,2,7>, <7,3,2,7>
+  2552040758U,	// <2,7,7,4>: Cost 3 vext1 <0,2,7,7>, RHS
+  3799995992U,	// <2,7,7,5>: Cost 4 vext3 <7,0,1,2>, <7,7,5,5>
+  2663241198U,	// <2,7,7,6>: Cost 3 vext2 <7,6,2,7>, <7,6,2,7>
+  2712835692U,	// <2,7,7,7>: Cost 3 vext3 <4,6,u,2>, <7,7,7,7>
+  2731562607U,	// <2,7,7,u>: Cost 3 vext3 <7,u,1,2>, <7,7,u,1>
+  1514135654U,	// <2,7,u,0>: Cost 2 vext1 <6,2,7,u>, LHS
+  1657820802U,	// <2,7,u,1>: Cost 2 vext3 <7,u,1,2>, <7,u,1,2>
+  2587879016U,	// <2,7,u,2>: Cost 3 vext1 <6,2,7,u>, <2,2,2,2>
+  2569963892U,	// <2,7,u,3>: Cost 3 vext1 <3,2,7,u>, <3,2,7,u>
+  1514138934U,	// <2,7,u,4>: Cost 2 vext1 <6,2,7,u>, RHS
+  2621438106U,	// <2,7,u,5>: Cost 3 vext2 <0,6,2,7>, RHS
+  1514140159U,	// <2,7,u,6>: Cost 2 vext1 <6,2,7,u>, <6,2,7,u>
+  2587882490U,	// <2,7,u,7>: Cost 3 vext1 <6,2,7,u>, <7,0,1,2>
+  1514141486U,	// <2,7,u,u>: Cost 2 vext1 <6,2,7,u>, LHS
+  1544380416U,	// <2,u,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  470638699U,	// <2,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544380580U,	// <2,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1658631909U,	// <2,u,0,3>: Cost 2 vext3 <u,0,3,2>, <u,0,3,2>
+  1544380754U,	// <2,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2665898414U,	// <2,u,0,5>: Cost 3 vext2 LHS, <0,5,2,7>
+  1658853120U,	// <2,u,0,6>: Cost 2 vext3 <u,0,6,2>, <u,0,6,2>
+  3094531625U,	// <2,u,0,7>: Cost 3 vtrnr <1,2,3,0>, RHS
+  470639261U,	// <2,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544381174U,	// <2,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544381236U,	// <2,u,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544381334U,	// <2,u,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544381400U,	// <2,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618123325U,	// <2,u,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544381584U,	// <2,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618123489U,	// <2,u,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2726254427U,	// <2,u,1,7>: Cost 3 vext3 <7,0,1,2>, <u,1,7,3>
+  1544381823U,	// <2,u,1,u>: Cost 2 vext2 LHS, <1,u,3,3>
+  1478328422U,	// <2,u,2,0>: Cost 2 vext1 <0,2,u,2>, LHS
+  2618123807U,	// <2,u,2,1>: Cost 3 vext2 LHS, <2,1,3,1>
+  269271142U,	// <2,u,2,2>: Cost 1 vdup2 LHS
+  1544382118U,	// <2,u,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1478331702U,	// <2,u,2,4>: Cost 2 vext1 <0,2,u,2>, RHS
+  2618124136U,	// <2,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544382394U,	// <2,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  3088354857U,	// <2,u,2,7>: Cost 3 vtrnr <0,2,0,2>, RHS
+  269271142U,	// <2,u,2,u>: Cost 1 vdup2 LHS
+  1544382614U,	// <2,u,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  2953627374U,	// <2,u,3,1>: Cost 3 vzipr LHS, <2,3,u,1>
+  1490282143U,	// <2,u,3,2>: Cost 2 vext1 <2,2,u,3>, <2,2,u,3>
+  1879883932U,	// <2,u,3,3>: Cost 2 vzipr LHS, LHS
+  1544382978U,	// <2,u,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2953627378U,	// <2,u,3,5>: Cost 3 vzipr LHS, <2,3,u,5>
+  1514172931U,	// <2,u,3,6>: Cost 2 vext1 <6,2,u,3>, <6,2,u,3>
+  1879887176U,	// <2,u,3,7>: Cost 2 vzipr LHS, RHS
+  1879883937U,	// <2,u,3,u>: Cost 2 vzipr LHS, LHS
+  1484316774U,	// <2,u,4,0>: Cost 2 vext1 <1,2,u,4>, LHS
+  1484317639U,	// <2,u,4,1>: Cost 2 vext1 <1,2,u,4>, <1,2,u,4>
+  2552088270U,	// <2,u,4,2>: Cost 3 vext1 <0,2,u,4>, <2,3,4,5>
+  1190213513U,	// <2,u,4,3>: Cost 2 vrev <u,2,3,4>
+  1484320054U,	// <2,u,4,4>: Cost 2 vext1 <1,2,u,4>, RHS
+  470641974U,	// <2,u,4,5>: Cost 1 vext2 LHS, RHS
+  1592159604U,	// <2,u,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  3094564393U,	// <2,u,4,7>: Cost 3 vtrnr <1,2,3,4>, RHS
+  470642217U,	// <2,u,4,u>: Cost 1 vext2 LHS, RHS
+  2552094959U,	// <2,u,5,0>: Cost 3 vext1 <0,2,u,5>, <0,2,u,5>
+  1592159952U,	// <2,u,5,1>: Cost 2 vext2 LHS, <5,1,7,3>
+  2564040353U,	// <2,u,5,2>: Cost 3 vext1 <2,2,u,5>, <2,2,u,5>
+  2690275455U,	// <2,u,5,3>: Cost 3 vext3 <0,u,u,2>, <u,5,3,7>
+  1592160198U,	// <2,u,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592160260U,	// <2,u,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1611962522U,	// <2,u,5,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1592160424U,	// <2,u,5,7>: Cost 2 vext2 LHS, <5,7,5,7>
+  1611962540U,	// <2,u,5,u>: Cost 2 vext3 <0,2,0,2>, RHS
+  1478361190U,	// <2,u,6,0>: Cost 2 vext1 <0,2,u,6>, LHS
+  2552103670U,	// <2,u,6,1>: Cost 3 vext1 <0,2,u,6>, <1,0,3,2>
+  1592160762U,	// <2,u,6,2>: Cost 2 vext2 LHS, <6,2,7,3>
+  2685704400U,	// <2,u,6,3>: Cost 3 vext3 <0,2,0,2>, <u,6,3,7>
+  1478364470U,	// <2,u,6,4>: Cost 2 vext1 <0,2,u,6>, RHS
+  2901891226U,	// <2,u,6,5>: Cost 3 vzipl <2,6,3,7>, RHS
+  1592161080U,	// <2,u,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592161102U,	// <2,u,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1478367022U,	// <2,u,6,u>: Cost 2 vext1 <0,2,u,6>, LHS
+  1592161274U,	// <2,u,7,0>: Cost 2 vext2 LHS, <7,0,1,2>
+  2659931226U,	// <2,u,7,1>: Cost 3 vext2 <7,1,2,u>, <7,1,2,u>
+  2564056739U,	// <2,u,7,2>: Cost 3 vext1 <2,2,u,7>, <2,2,u,7>
+  2665903331U,	// <2,u,7,3>: Cost 3 vext2 LHS, <7,3,0,1>
+  1592161638U,	// <2,u,7,4>: Cost 2 vext2 LHS, <7,4,5,6>
+  2665903494U,	// <2,u,7,5>: Cost 3 vext2 LHS, <7,5,0,2>
+  2587947527U,	// <2,u,7,6>: Cost 3 vext1 <6,2,u,7>, <6,2,u,7>
+  1592161900U,	// <2,u,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1592161922U,	// <2,u,7,u>: Cost 2 vext2 LHS, <7,u,1,2>
+  1478377574U,	// <2,u,u,0>: Cost 2 vext1 <0,2,u,u>, LHS
+  470644526U,	// <2,u,u,1>: Cost 1 vext2 LHS, LHS
+  269271142U,	// <2,u,u,2>: Cost 1 vdup2 LHS
+  1879924892U,	// <2,u,u,3>: Cost 2 vzipr LHS, LHS
+  1478380854U,	// <2,u,u,4>: Cost 2 vext1 <0,2,u,u>, RHS
+  470644890U,	// <2,u,u,5>: Cost 1 vext2 LHS, RHS
+  1611962765U,	// <2,u,u,6>: Cost 2 vext3 <0,2,0,2>, RHS
+  1879928136U,	// <2,u,u,7>: Cost 2 vzipr LHS, RHS
+  470645093U,	// <2,u,u,u>: Cost 1 vext2 LHS, LHS
+  1611448320U,	// <3,0,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611890698U,	// <3,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611890708U,	// <3,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  3763576860U,	// <3,0,0,3>: Cost 4 vext3 LHS, <0,0,3,1>
+  2689835045U,	// <3,0,0,4>: Cost 3 vext3 LHS, <0,0,4,1>
+  3698508206U,	// <3,0,0,5>: Cost 4 vext2 <1,2,3,0>, <0,5,2,7>
+  3763576887U,	// <3,0,0,6>: Cost 4 vext3 LHS, <0,0,6,1>
+  3667678434U,	// <3,0,0,7>: Cost 4 vext1 <7,3,0,0>, <7,3,0,0>
+  1616093258U,	// <3,0,0,u>: Cost 2 vext3 LHS, <0,0,u,2>
+  1490337894U,	// <3,0,1,0>: Cost 2 vext1 <2,3,0,1>, LHS
+  2685632602U,	// <3,0,1,1>: Cost 3 vext3 LHS, <0,1,1,0>
+  537706598U,	// <3,0,1,2>: Cost 1 vext3 LHS, LHS
+  2624766936U,	// <3,0,1,3>: Cost 3 vext2 <1,2,3,0>, <1,3,1,3>
+  1490341174U,	// <3,0,1,4>: Cost 2 vext1 <2,3,0,1>, RHS
+  2624767120U,	// <3,0,1,5>: Cost 3 vext2 <1,2,3,0>, <1,5,3,7>
+  2732966030U,	// <3,0,1,6>: Cost 3 vext3 LHS, <0,1,6,7>
+  2593944803U,	// <3,0,1,7>: Cost 3 vext1 <7,3,0,1>, <7,3,0,1>
+  537706652U,	// <3,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U,	// <3,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685632684U,	// <3,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  2685632692U,	// <3,0,2,2>: Cost 3 vext3 LHS, <0,2,2,0>
+  2685632702U,	// <3,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611890892U,	// <3,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2732966102U,	// <3,0,2,5>: Cost 3 vext3 LHS, <0,2,5,7>
+  2624767930U,	// <3,0,2,6>: Cost 3 vext2 <1,2,3,0>, <2,6,3,7>
+  2685632744U,	// <3,0,2,7>: Cost 3 vext3 LHS, <0,2,7,7>
+  1611890924U,	// <3,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2624768150U,	// <3,0,3,0>: Cost 3 vext2 <1,2,3,0>, <3,0,1,2>
+  2685632764U,	// <3,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  2685632774U,	// <3,0,3,2>: Cost 3 vext3 LHS, <0,3,2,1>
+  2624768412U,	// <3,0,3,3>: Cost 3 vext2 <1,2,3,0>, <3,3,3,3>
+  2624768514U,	// <3,0,3,4>: Cost 3 vext2 <1,2,3,0>, <3,4,5,6>
+  3702491714U,	// <3,0,3,5>: Cost 4 vext2 <1,u,3,0>, <3,5,3,7>
+  2624768632U,	// <3,0,3,6>: Cost 3 vext2 <1,2,3,0>, <3,6,0,7>
+  3702491843U,	// <3,0,3,7>: Cost 4 vext2 <1,u,3,0>, <3,7,0,1>
+  2686959934U,	// <3,0,3,u>: Cost 3 vext3 <0,3,u,3>, <0,3,u,3>
+  2689835336U,	// <3,0,4,0>: Cost 3 vext3 LHS, <0,4,0,4>
+  1611891026U,	// <3,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611891036U,	// <3,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3763577184U,	// <3,0,4,3>: Cost 4 vext3 LHS, <0,4,3,1>
+  2689835374U,	// <3,0,4,4>: Cost 3 vext3 LHS, <0,4,4,6>
+  1551027510U,	// <3,0,4,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2666573172U,	// <3,0,4,6>: Cost 3 vext2 <u,2,3,0>, <4,6,4,6>
+  3667711206U,	// <3,0,4,7>: Cost 4 vext1 <7,3,0,4>, <7,3,0,4>
+  1616093586U,	// <3,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2685190556U,	// <3,0,5,0>: Cost 3 vext3 LHS, <0,5,0,7>
+  2666573520U,	// <3,0,5,1>: Cost 3 vext2 <u,2,3,0>, <5,1,7,3>
+  3040886886U,	// <3,0,5,2>: Cost 3 vtrnl <3,4,5,6>, LHS
+  3625912834U,	// <3,0,5,3>: Cost 4 vext1 <0,3,0,5>, <3,4,5,6>
+  2666573766U,	// <3,0,5,4>: Cost 3 vext2 <u,2,3,0>, <5,4,7,6>
+  2666573828U,	// <3,0,5,5>: Cost 3 vext2 <u,2,3,0>, <5,5,5,5>
+  2732966354U,	// <3,0,5,6>: Cost 3 vext3 LHS, <0,5,6,7>
+  2666573992U,	// <3,0,5,7>: Cost 3 vext2 <u,2,3,0>, <5,7,5,7>
+  3040886940U,	// <3,0,5,u>: Cost 3 vtrnl <3,4,5,6>, LHS
+  2685190637U,	// <3,0,6,0>: Cost 3 vext3 LHS, <0,6,0,7>
+  2732966390U,	// <3,0,6,1>: Cost 3 vext3 LHS, <0,6,1,7>
+  2689835519U,	// <3,0,6,2>: Cost 3 vext3 LHS, <0,6,2,7>
+  3667724438U,	// <3,0,6,3>: Cost 4 vext1 <7,3,0,6>, <3,0,1,2>
+  3763577355U,	// <3,0,6,4>: Cost 4 vext3 LHS, <0,6,4,1>
+  3806708243U,	// <3,0,6,5>: Cost 4 vext3 LHS, <0,6,5,0>
+  2666574648U,	// <3,0,6,6>: Cost 3 vext2 <u,2,3,0>, <6,6,6,6>
+  2657948520U,	// <3,0,6,7>: Cost 3 vext2 <6,7,3,0>, <6,7,3,0>
+  2689835573U,	// <3,0,6,u>: Cost 3 vext3 LHS, <0,6,u,7>
+  2666574842U,	// <3,0,7,0>: Cost 3 vext2 <u,2,3,0>, <7,0,1,2>
+  2685633095U,	// <3,0,7,1>: Cost 3 vext3 LHS, <0,7,1,7>
+  2660603052U,	// <3,0,7,2>: Cost 3 vext2 <7,2,3,0>, <7,2,3,0>
+  3643844997U,	// <3,0,7,3>: Cost 4 vext1 <3,3,0,7>, <3,3,0,7>
+  2666575206U,	// <3,0,7,4>: Cost 3 vext2 <u,2,3,0>, <7,4,5,6>
+  3655790391U,	// <3,0,7,5>: Cost 4 vext1 <5,3,0,7>, <5,3,0,7>
+  3731690968U,	// <3,0,7,6>: Cost 4 vext2 <6,7,3,0>, <7,6,0,3>
+  2666575468U,	// <3,0,7,7>: Cost 3 vext2 <u,2,3,0>, <7,7,7,7>
+  2664584850U,	// <3,0,7,u>: Cost 3 vext2 <7,u,3,0>, <7,u,3,0>
+  1616093834U,	// <3,0,u,0>: Cost 2 vext3 LHS, <0,u,0,2>
+  1611891346U,	// <3,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537707165U,	// <3,0,u,2>: Cost 1 vext3 LHS, LHS
+  2689835684U,	// <3,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1616093874U,	// <3,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551030426U,	// <3,0,u,5>: Cost 2 vext2 <1,2,3,0>, RHS
+  2624772304U,	// <3,0,u,6>: Cost 3 vext2 <1,2,3,0>, <u,6,3,7>
+  2594002154U,	// <3,0,u,7>: Cost 3 vext1 <7,3,0,u>, <7,3,0,u>
+  537707219U,	// <3,0,u,u>: Cost 1 vext3 LHS, LHS
+  2552201318U,	// <3,1,0,0>: Cost 3 vext1 <0,3,1,0>, LHS
+  2618802278U,	// <3,1,0,1>: Cost 3 vext2 <0,2,3,1>, LHS
+  2618802366U,	// <3,1,0,2>: Cost 3 vext2 <0,2,3,1>, <0,2,3,1>
+  1611449078U,	// <3,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2552204598U,	// <3,1,0,4>: Cost 3 vext1 <0,3,1,0>, RHS
+  2732966663U,	// <3,1,0,5>: Cost 3 vext3 LHS, <1,0,5,1>
+  3906258396U,	// <3,1,0,6>: Cost 4 vuzpr <2,3,0,1>, <2,0,4,6>
+  3667752171U,	// <3,1,0,7>: Cost 4 vext1 <7,3,1,0>, <7,3,1,0>
+  1611891491U,	// <3,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  2689835819U,	// <3,1,1,0>: Cost 3 vext3 LHS, <1,1,0,1>
+  1611449140U,	// <3,1,1,1>: Cost 2 vext3 LHS, <1,1,1,1>
+  2624775063U,	// <3,1,1,2>: Cost 3 vext2 <1,2,3,1>, <1,2,3,1>
+  1611891528U,	// <3,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  2689835859U,	// <3,1,1,4>: Cost 3 vext3 LHS, <1,1,4,5>
+  2689835868U,	// <3,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  3763577701U,	// <3,1,1,6>: Cost 4 vext3 LHS, <1,1,6,5>
+  3765273452U,	// <3,1,1,7>: Cost 4 vext3 <1,1,7,3>, <1,1,7,3>
+  1611891573U,	// <3,1,1,u>: Cost 2 vext3 LHS, <1,1,u,3>
+  2629420494U,	// <3,1,2,0>: Cost 3 vext2 <2,0,3,1>, <2,0,3,1>
+  2689835911U,	// <3,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2564163248U,	// <3,1,2,2>: Cost 3 vext1 <2,3,1,2>, <2,3,1,2>
+  1611449238U,	// <3,1,2,3>: Cost 2 vext3 LHS, <1,2,3,0>
+  2564164918U,	// <3,1,2,4>: Cost 3 vext1 <2,3,1,2>, RHS
+  2689835947U,	// <3,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  3692545978U,	// <3,1,2,6>: Cost 4 vext2 <0,2,3,1>, <2,6,3,7>
+  2732966842U,	// <3,1,2,7>: Cost 3 vext3 LHS, <1,2,7,0>
+  1611891651U,	// <3,1,2,u>: Cost 2 vext3 LHS, <1,2,u,0>
+  1484456038U,	// <3,1,3,0>: Cost 2 vext1 <1,3,1,3>, LHS
+  1611891672U,	// <3,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685633502U,	// <3,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2685633512U,	// <3,1,3,3>: Cost 3 vext3 LHS, <1,3,3,1>
+  1484459318U,	// <3,1,3,4>: Cost 2 vext1 <1,3,1,3>, RHS
+  1611891712U,	// <3,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2689836041U,	// <3,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2733409294U,	// <3,1,3,7>: Cost 3 vext3 LHS, <1,3,7,3>
+  1611891735U,	// <3,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  2552234086U,	// <3,1,4,0>: Cost 3 vext1 <0,3,1,4>, LHS
+  2732966955U,	// <3,1,4,1>: Cost 3 vext3 LHS, <1,4,1,5>
+  2732966964U,	// <3,1,4,2>: Cost 3 vext3 LHS, <1,4,2,5>
+  2685633597U,	// <3,1,4,3>: Cost 3 vext3 LHS, <1,4,3,5>
+  2552237366U,	// <3,1,4,4>: Cost 3 vext1 <0,3,1,4>, RHS
+  2618805558U,	// <3,1,4,5>: Cost 3 vext2 <0,2,3,1>, RHS
+  2769472822U,	// <3,1,4,6>: Cost 3 vuzpl <3,0,1,2>, RHS
+  3667784943U,	// <3,1,4,7>: Cost 4 vext1 <7,3,1,4>, <7,3,1,4>
+  2685633642U,	// <3,1,4,u>: Cost 3 vext3 LHS, <1,4,u,5>
+  2689836143U,	// <3,1,5,0>: Cost 3 vext3 LHS, <1,5,0,1>
+  2564187280U,	// <3,1,5,1>: Cost 3 vext1 <2,3,1,5>, <1,5,3,7>
+  2564187827U,	// <3,1,5,2>: Cost 3 vext1 <2,3,1,5>, <2,3,1,5>
+  1611891856U,	// <3,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  2689836183U,	// <3,1,5,4>: Cost 3 vext3 LHS, <1,5,4,5>
+  3759375522U,	// <3,1,5,5>: Cost 4 vext3 LHS, <1,5,5,7>
+  3720417378U,	// <3,1,5,6>: Cost 4 vext2 <4,u,3,1>, <5,6,7,0>
+  2832518454U,	// <3,1,5,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611891901U,	// <3,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  3763578048U,	// <3,1,6,0>: Cost 4 vext3 LHS, <1,6,0,1>
+  2689836239U,	// <3,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2732967128U,	// <3,1,6,2>: Cost 3 vext3 LHS, <1,6,2,7>
+  2685633761U,	// <3,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  3763578088U,	// <3,1,6,4>: Cost 4 vext3 LHS, <1,6,4,5>
+  2689836275U,	// <3,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  3763578108U,	// <3,1,6,6>: Cost 4 vext3 LHS, <1,6,6,7>
+  2732967166U,	// <3,1,6,7>: Cost 3 vext3 LHS, <1,6,7,0>
+  2685633806U,	// <3,1,6,u>: Cost 3 vext3 LHS, <1,6,u,7>
+  3631972454U,	// <3,1,7,0>: Cost 4 vext1 <1,3,1,7>, LHS
+  2659947612U,	// <3,1,7,1>: Cost 3 vext2 <7,1,3,1>, <7,1,3,1>
+  4036102294U,	// <3,1,7,2>: Cost 4 vzipr <1,5,3,7>, <3,0,1,2>
+  3095396454U,	// <3,1,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  3631975734U,	// <3,1,7,4>: Cost 4 vext1 <1,3,1,7>, RHS
+  2222982144U,	// <3,1,7,5>: Cost 3 vrev <1,3,5,7>
+  3296797705U,	// <3,1,7,6>: Cost 4 vrev <1,3,6,7>
+  3720418924U,	// <3,1,7,7>: Cost 4 vext2 <4,u,3,1>, <7,7,7,7>
+  3095396459U,	// <3,1,7,u>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1484496998U,	// <3,1,u,0>: Cost 2 vext1 <1,3,1,u>, LHS
+  1611892077U,	// <3,1,u,1>: Cost 2 vext3 LHS, <1,u,1,3>
+  2685633907U,	// <3,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  1611892092U,	// <3,1,u,3>: Cost 2 vext3 LHS, <1,u,3,0>
+  1484500278U,	// <3,1,u,4>: Cost 2 vext1 <1,3,1,u>, RHS
+  1611892117U,	// <3,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685633950U,	// <3,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  2832518697U,	// <3,1,u,7>: Cost 3 vuzpr <2,3,0,1>, RHS
+  1611892140U,	// <3,1,u,u>: Cost 2 vext3 LHS, <1,u,u,3>
+  2623455232U,	// <3,2,0,0>: Cost 3 vext2 <1,0,3,2>, <0,0,0,0>
+  1549713510U,	// <3,2,0,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  2689836484U,	// <3,2,0,2>: Cost 3 vext3 LHS, <2,0,2,0>
+  2685633997U,	// <3,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2623455570U,	// <3,2,0,4>: Cost 3 vext2 <1,0,3,2>, <0,4,1,5>
+  2732967398U,	// <3,2,0,5>: Cost 3 vext3 LHS, <2,0,5,7>
+  2689836524U,	// <3,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2229044964U,	// <3,2,0,7>: Cost 3 vrev <2,3,7,0>
+  1549714077U,	// <3,2,0,u>: Cost 2 vext2 <1,0,3,2>, LHS
+  1549714166U,	// <3,2,1,0>: Cost 2 vext2 <1,0,3,2>, <1,0,3,2>
+  2623456052U,	// <3,2,1,1>: Cost 3 vext2 <1,0,3,2>, <1,1,1,1>
+  2623456150U,	// <3,2,1,2>: Cost 3 vext2 <1,0,3,2>, <1,2,3,0>
+  2685634079U,	// <3,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2552286518U,	// <3,2,1,4>: Cost 3 vext1 <0,3,2,1>, RHS
+  2623456400U,	// <3,2,1,5>: Cost 3 vext2 <1,0,3,2>, <1,5,3,7>
+  2689836604U,	// <3,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  3667834101U,	// <3,2,1,7>: Cost 4 vext1 <7,3,2,1>, <7,3,2,1>
+  1155385070U,	// <3,2,1,u>: Cost 2 vrev <2,3,u,1>
+  2689836629U,	// <3,2,2,0>: Cost 3 vext3 LHS, <2,2,0,1>
+  2689836640U,	// <3,2,2,1>: Cost 3 vext3 LHS, <2,2,1,3>
+  1611449960U,	// <3,2,2,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892338U,	// <3,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  2689836669U,	// <3,2,2,4>: Cost 3 vext3 LHS, <2,2,4,5>
+  2689836680U,	// <3,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2689836688U,	// <3,2,2,6>: Cost 3 vext3 LHS, <2,2,6,6>
+  3763578518U,	// <3,2,2,7>: Cost 4 vext3 LHS, <2,2,7,3>
+  1611892383U,	// <3,2,2,u>: Cost 2 vext3 LHS, <2,2,u,3>
+  1611450022U,	// <3,2,3,0>: Cost 2 vext3 LHS, <2,3,0,1>
+  2685191854U,	// <3,2,3,1>: Cost 3 vext3 LHS, <2,3,1,0>
+  2685191865U,	// <3,2,3,2>: Cost 3 vext3 LHS, <2,3,2,2>
+  2685191875U,	// <3,2,3,3>: Cost 3 vext3 LHS, <2,3,3,3>
+  1611450062U,	// <3,2,3,4>: Cost 2 vext3 LHS, <2,3,4,5>
+  2732967635U,	// <3,2,3,5>: Cost 3 vext3 LHS, <2,3,5,1>
+  2732967645U,	// <3,2,3,6>: Cost 3 vext3 LHS, <2,3,6,2>
+  2732967652U,	// <3,2,3,7>: Cost 3 vext3 LHS, <2,3,7,0>
+  1611450094U,	// <3,2,3,u>: Cost 2 vext3 LHS, <2,3,u,1>
+  2558279782U,	// <3,2,4,0>: Cost 3 vext1 <1,3,2,4>, LHS
+  2558280602U,	// <3,2,4,1>: Cost 3 vext1 <1,3,2,4>, <1,2,3,4>
+  2732967692U,	// <3,2,4,2>: Cost 3 vext3 LHS, <2,4,2,4>
+  2685634326U,	// <3,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2558283062U,	// <3,2,4,4>: Cost 3 vext1 <1,3,2,4>, RHS
+  1549716790U,	// <3,2,4,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689836844U,	// <3,2,4,6>: Cost 3 vext3 LHS, <2,4,6,0>
+  2229077736U,	// <3,2,4,7>: Cost 3 vrev <2,3,7,4>
+  1549717033U,	// <3,2,4,u>: Cost 2 vext2 <1,0,3,2>, RHS
+  2552316006U,	// <3,2,5,0>: Cost 3 vext1 <0,3,2,5>, LHS
+  2228643507U,	// <3,2,5,1>: Cost 3 vrev <2,3,1,5>
+  2689836896U,	// <3,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685634408U,	// <3,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1155122894U,	// <3,2,5,4>: Cost 2 vrev <2,3,4,5>
+  2665263108U,	// <3,2,5,5>: Cost 3 vext2 <u,0,3,2>, <5,5,5,5>
+  2689836932U,	// <3,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2665263272U,	// <3,2,5,7>: Cost 3 vext2 <u,0,3,2>, <5,7,5,7>
+  1155417842U,	// <3,2,5,u>: Cost 2 vrev <2,3,u,5>
+  2689836953U,	// <3,2,6,0>: Cost 3 vext3 LHS, <2,6,0,1>
+  2689836964U,	// <3,2,6,1>: Cost 3 vext3 LHS, <2,6,1,3>
+  2689836976U,	// <3,2,6,2>: Cost 3 vext3 LHS, <2,6,2,6>
+  1611892666U,	// <3,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  2689836993U,	// <3,2,6,4>: Cost 3 vext3 LHS, <2,6,4,5>
+  2689837004U,	// <3,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689837013U,	// <3,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2665263950U,	// <3,2,6,7>: Cost 3 vext2 <u,0,3,2>, <6,7,0,1>
+  1611892711U,	// <3,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  2665264122U,	// <3,2,7,0>: Cost 3 vext2 <u,0,3,2>, <7,0,1,2>
+  2623460419U,	// <3,2,7,1>: Cost 3 vext2 <1,0,3,2>, <7,1,0,3>
+  4169138340U,	// <3,2,7,2>: Cost 4 vtrnr <1,3,5,7>, <0,2,0,2>
+  2962358374U,	// <3,2,7,3>: Cost 3 vzipr <1,5,3,7>, LHS
+  2665264486U,	// <3,2,7,4>: Cost 3 vext2 <u,0,3,2>, <7,4,5,6>
+  2228954841U,	// <3,2,7,5>: Cost 3 vrev <2,3,5,7>
+  2229028578U,	// <3,2,7,6>: Cost 3 vrev <2,3,6,7>
+  2665264748U,	// <3,2,7,7>: Cost 3 vext2 <u,0,3,2>, <7,7,7,7>
+  2962358379U,	// <3,2,7,u>: Cost 3 vzipr <1,5,3,7>, LHS
+  1611892795U,	// <3,2,u,0>: Cost 2 vext3 LHS, <2,u,0,1>
+  1549719342U,	// <3,2,u,1>: Cost 2 vext2 <1,0,3,2>, LHS
+  1611449960U,	// <3,2,u,2>: Cost 2 vext3 LHS, <2,2,2,2>
+  1611892824U,	// <3,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  1611892835U,	// <3,2,u,4>: Cost 2 vext3 LHS, <2,u,4,5>
+  1549719706U,	// <3,2,u,5>: Cost 2 vext2 <1,0,3,2>, RHS
+  2689837168U,	// <3,2,u,6>: Cost 3 vext3 LHS, <2,u,6,0>
+  2665265408U,	// <3,2,u,7>: Cost 3 vext2 <u,0,3,2>, <u,7,0,1>
+  1611892867U,	// <3,2,u,u>: Cost 2 vext3 LHS, <2,u,u,1>
+  2685192331U,	// <3,3,0,0>: Cost 3 vext3 LHS, <3,0,0,0>
+  1611450518U,	// <3,3,0,1>: Cost 2 vext3 LHS, <3,0,1,2>
+  2685634717U,	// <3,3,0,2>: Cost 3 vext3 LHS, <3,0,2,0>
+  2564294806U,	// <3,3,0,3>: Cost 3 vext1 <2,3,3,0>, <3,0,1,2>
+  2685634736U,	// <3,3,0,4>: Cost 3 vext3 LHS, <3,0,4,1>
+  2732968122U,	// <3,3,0,5>: Cost 3 vext3 LHS, <3,0,5,2>
+  3763579075U,	// <3,3,0,6>: Cost 4 vext3 LHS, <3,0,6,2>
+  4034053264U,	// <3,3,0,7>: Cost 4 vzipr <1,2,3,0>, <1,5,3,7>
+  1611450581U,	// <3,3,0,u>: Cost 2 vext3 LHS, <3,0,u,2>
+  2685192415U,	// <3,3,1,0>: Cost 3 vext3 LHS, <3,1,0,3>
+  1550385992U,	// <3,3,1,1>: Cost 2 vext2 <1,1,3,3>, <1,1,3,3>
+  2685192433U,	// <3,3,1,2>: Cost 3 vext3 LHS, <3,1,2,3>
+  2685634808U,	// <3,3,1,3>: Cost 3 vext3 LHS, <3,1,3,1>
+  2558332214U,	// <3,3,1,4>: Cost 3 vext1 <1,3,3,1>, RHS
+  2685634828U,	// <3,3,1,5>: Cost 3 vext3 LHS, <3,1,5,3>
+  3759376661U,	// <3,3,1,6>: Cost 4 vext3 LHS, <3,1,6,3>
+  2703477022U,	// <3,3,1,7>: Cost 3 vext3 <3,1,7,3>, <3,1,7,3>
+  1555031423U,	// <3,3,1,u>: Cost 2 vext2 <1,u,3,3>, <1,u,3,3>
+  2564309094U,	// <3,3,2,0>: Cost 3 vext1 <2,3,3,2>, LHS
+  2630100513U,	// <3,3,2,1>: Cost 3 vext2 <2,1,3,3>, <2,1,3,3>
+  1557022322U,	// <3,3,2,2>: Cost 2 vext2 <2,2,3,3>, <2,2,3,3>
+  2685192520U,	// <3,3,2,3>: Cost 3 vext3 LHS, <3,2,3,0>
+  2564312374U,	// <3,3,2,4>: Cost 3 vext1 <2,3,3,2>, RHS
+  2732968286U,	// <3,3,2,5>: Cost 3 vext3 LHS, <3,2,5,4>
+  2685634918U,	// <3,3,2,6>: Cost 3 vext3 LHS, <3,2,6,3>
+  2704140655U,	// <3,3,2,7>: Cost 3 vext3 <3,2,7,3>, <3,2,7,3>
+  1561004120U,	// <3,3,2,u>: Cost 2 vext2 <2,u,3,3>, <2,u,3,3>
+  1496547430U,	// <3,3,3,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  2624129256U,	// <3,3,3,1>: Cost 3 vext2 <1,1,3,3>, <3,1,1,3>
+  2630764866U,	// <3,3,3,2>: Cost 3 vext2 <2,2,3,3>, <3,2,2,3>
+  336380006U,	// <3,3,3,3>: Cost 1 vdup3 LHS
+  1496550710U,	// <3,3,3,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  2732968368U,	// <3,3,3,5>: Cost 3 vext3 LHS, <3,3,5,5>
+  2624129683U,	// <3,3,3,6>: Cost 3 vext2 <1,1,3,3>, <3,6,3,7>
+  2594182400U,	// <3,3,3,7>: Cost 3 vext1 <7,3,3,3>, <7,3,3,3>
+  336380006U,	// <3,3,3,u>: Cost 1 vdup3 LHS
+  2558353510U,	// <3,3,4,0>: Cost 3 vext1 <1,3,3,4>, LHS
+  2558354411U,	// <3,3,4,1>: Cost 3 vext1 <1,3,3,4>, <1,3,3,4>
+  2564327108U,	// <3,3,4,2>: Cost 3 vext1 <2,3,3,4>, <2,3,3,4>
+  2564327938U,	// <3,3,4,3>: Cost 3 vext1 <2,3,3,4>, <3,4,5,6>
+  2960343962U,	// <3,3,4,4>: Cost 3 vzipr <1,2,3,4>, <1,2,3,4>
+  1611893250U,	// <3,3,4,5>: Cost 2 vext3 LHS, <3,4,5,6>
+  2771619126U,	// <3,3,4,6>: Cost 3 vuzpl <3,3,3,3>, RHS
+  4034086032U,	// <3,3,4,7>: Cost 4 vzipr <1,2,3,4>, <1,5,3,7>
+  1611893277U,	// <3,3,4,u>: Cost 2 vext3 LHS, <3,4,u,6>
+  2558361702U,	// <3,3,5,0>: Cost 3 vext1 <1,3,3,5>, LHS
+  2558362604U,	// <3,3,5,1>: Cost 3 vext1 <1,3,3,5>, <1,3,3,5>
+  2558363342U,	// <3,3,5,2>: Cost 3 vext1 <1,3,3,5>, <2,3,4,5>
+  2732968512U,	// <3,3,5,3>: Cost 3 vext3 LHS, <3,5,3,5>
+  2558364982U,	// <3,3,5,4>: Cost 3 vext1 <1,3,3,5>, RHS
+  3101279950U,	// <3,3,5,5>: Cost 3 vtrnr <2,3,4,5>, <2,3,4,5>
+  2665934946U,	// <3,3,5,6>: Cost 3 vext2 <u,1,3,3>, <5,6,7,0>
+  2826636598U,	// <3,3,5,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2826636599U,	// <3,3,5,u>: Cost 3 vuzpr <1,3,1,3>, RHS
+  2732968568U,	// <3,3,6,0>: Cost 3 vext3 LHS, <3,6,0,7>
+  3763579521U,	// <3,3,6,1>: Cost 4 vext3 LHS, <3,6,1,7>
+  2732968586U,	// <3,3,6,2>: Cost 3 vext3 LHS, <3,6,2,7>
+  2732968595U,	// <3,3,6,3>: Cost 3 vext3 LHS, <3,6,3,7>
+  2732968604U,	// <3,3,6,4>: Cost 3 vext3 LHS, <3,6,4,7>
+  3763579557U,	// <3,3,6,5>: Cost 4 vext3 LHS, <3,6,5,7>
+  2732968621U,	// <3,3,6,6>: Cost 3 vext3 LHS, <3,6,6,6>
+  2657973099U,	// <3,3,6,7>: Cost 3 vext2 <6,7,3,3>, <6,7,3,3>
+  2658636732U,	// <3,3,6,u>: Cost 3 vext2 <6,u,3,3>, <6,u,3,3>
+  2558378086U,	// <3,3,7,0>: Cost 3 vext1 <1,3,3,7>, LHS
+  2558378990U,	// <3,3,7,1>: Cost 3 vext1 <1,3,3,7>, <1,3,3,7>
+  2564351687U,	// <3,3,7,2>: Cost 3 vext1 <2,3,3,7>, <2,3,3,7>
+  2661291264U,	// <3,3,7,3>: Cost 3 vext2 <7,3,3,3>, <7,3,3,3>
+  2558381366U,	// <3,3,7,4>: Cost 3 vext1 <1,3,3,7>, RHS
+  2732968694U,	// <3,3,7,5>: Cost 3 vext3 LHS, <3,7,5,7>
+  3781126907U,	// <3,3,7,6>: Cost 4 vext3 <3,7,6,3>, <3,7,6,3>
+  3095397376U,	// <3,3,7,7>: Cost 3 vtrnr <1,3,5,7>, <1,3,5,7>
+  2558383918U,	// <3,3,7,u>: Cost 3 vext1 <1,3,3,7>, LHS
+  1496547430U,	// <3,3,u,0>: Cost 2 vext1 <3,3,3,3>, LHS
+  1611893534U,	// <3,3,u,1>: Cost 2 vext3 LHS, <3,u,1,2>
+  1592858504U,	// <3,3,u,2>: Cost 2 vext2 <u,2,3,3>, <u,2,3,3>
+  336380006U,	// <3,3,u,3>: Cost 1 vdup3 LHS
+  1496550710U,	// <3,3,u,4>: Cost 2 vext1 <3,3,3,3>, RHS
+  1611893574U,	// <3,3,u,5>: Cost 2 vext3 LHS, <3,u,5,6>
+  2690280268U,	// <3,3,u,6>: Cost 3 vext3 LHS, <3,u,6,3>
+  2826636841U,	// <3,3,u,7>: Cost 3 vuzpr <1,3,1,3>, RHS
+  336380006U,	// <3,3,u,u>: Cost 1 vdup3 LHS
+  2624798720U,	// <3,4,0,0>: Cost 3 vext2 <1,2,3,4>, <0,0,0,0>
+  1551056998U,	// <3,4,0,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624798884U,	// <3,4,0,2>: Cost 3 vext2 <1,2,3,4>, <0,2,0,2>
+  3693232384U,	// <3,4,0,3>: Cost 4 vext2 <0,3,3,4>, <0,3,1,4>
+  2624799058U,	// <3,4,0,4>: Cost 3 vext2 <1,2,3,4>, <0,4,1,5>
+  1659227026U,	// <3,4,0,5>: Cost 2 vext3 LHS, <4,0,5,1>
+  1659227036U,	// <3,4,0,6>: Cost 2 vext3 LHS, <4,0,6,2>
+  3667973382U,	// <3,4,0,7>: Cost 4 vext1 <7,3,4,0>, <7,3,4,0>
+  1551057565U,	// <3,4,0,u>: Cost 2 vext2 <1,2,3,4>, LHS
+  2624799478U,	// <3,4,1,0>: Cost 3 vext2 <1,2,3,4>, <1,0,3,2>
+  2624799540U,	// <3,4,1,1>: Cost 3 vext2 <1,2,3,4>, <1,1,1,1>
+  1551057818U,	// <3,4,1,2>: Cost 2 vext2 <1,2,3,4>, <1,2,3,4>
+  2624799704U,	// <3,4,1,3>: Cost 3 vext2 <1,2,3,4>, <1,3,1,3>
+  2564377910U,	// <3,4,1,4>: Cost 3 vext1 <2,3,4,1>, RHS
+  2689838050U,	// <3,4,1,5>: Cost 3 vext3 LHS, <4,1,5,0>
+  2689838062U,	// <3,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2628117807U,	// <3,4,1,7>: Cost 3 vext2 <1,7,3,4>, <1,7,3,4>
+  1555039616U,	// <3,4,1,u>: Cost 2 vext2 <1,u,3,4>, <1,u,3,4>
+  3626180710U,	// <3,4,2,0>: Cost 4 vext1 <0,3,4,2>, LHS
+  2624800298U,	// <3,4,2,1>: Cost 3 vext2 <1,2,3,4>, <2,1,4,3>
+  2624800360U,	// <3,4,2,2>: Cost 3 vext2 <1,2,3,4>, <2,2,2,2>
+  2624800422U,	// <3,4,2,3>: Cost 3 vext2 <1,2,3,4>, <2,3,0,1>
+  2624800514U,	// <3,4,2,4>: Cost 3 vext2 <1,2,3,4>, <2,4,1,3>
+  2709965878U,	// <3,4,2,5>: Cost 3 vext3 <4,2,5,3>, <4,2,5,3>
+  2689838140U,	// <3,4,2,6>: Cost 3 vext3 LHS, <4,2,6,0>
+  2634090504U,	// <3,4,2,7>: Cost 3 vext2 <2,7,3,4>, <2,7,3,4>
+  2689838158U,	// <3,4,2,u>: Cost 3 vext3 LHS, <4,2,u,0>
+  2624800918U,	// <3,4,3,0>: Cost 3 vext2 <1,2,3,4>, <3,0,1,2>
+  2636081403U,	// <3,4,3,1>: Cost 3 vext2 <3,1,3,4>, <3,1,3,4>
+  2636745036U,	// <3,4,3,2>: Cost 3 vext2 <3,2,3,4>, <3,2,3,4>
+  2624801180U,	// <3,4,3,3>: Cost 3 vext2 <1,2,3,4>, <3,3,3,3>
+  2624801232U,	// <3,4,3,4>: Cost 3 vext2 <1,2,3,4>, <3,4,0,1>
+  2905836854U,	// <3,4,3,5>: Cost 3 vzipl <3,3,3,3>, RHS
+  3040054582U,	// <3,4,3,6>: Cost 3 vtrnl <3,3,3,3>, RHS
+  3702524611U,	// <3,4,3,7>: Cost 4 vext2 <1,u,3,4>, <3,7,0,1>
+  2624801566U,	// <3,4,3,u>: Cost 3 vext2 <1,2,3,4>, <3,u,1,2>
+  2564399206U,	// <3,4,4,0>: Cost 3 vext1 <2,3,4,4>, LHS
+  2564400026U,	// <3,4,4,1>: Cost 3 vext1 <2,3,4,4>, <1,2,3,4>
+  2564400845U,	// <3,4,4,2>: Cost 3 vext1 <2,3,4,4>, <2,3,4,4>
+  2570373542U,	// <3,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  1659227344U,	// <3,4,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1551060278U,	// <3,4,4,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  1659227364U,	// <3,4,4,6>: Cost 2 vext3 LHS, <4,4,6,6>
+  3668006154U,	// <3,4,4,7>: Cost 4 vext1 <7,3,4,4>, <7,3,4,4>
+  1551060521U,	// <3,4,4,u>: Cost 2 vext2 <1,2,3,4>, RHS
+  1490665574U,	// <3,4,5,0>: Cost 2 vext1 <2,3,4,5>, LHS
+  2689838341U,	// <3,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1490667214U,	// <3,4,5,2>: Cost 2 vext1 <2,3,4,5>, <2,3,4,5>
+  2564409494U,	// <3,4,5,3>: Cost 3 vext1 <2,3,4,5>, <3,0,1,2>
+  1490668854U,	// <3,4,5,4>: Cost 2 vext1 <2,3,4,5>, RHS
+  2689838381U,	// <3,4,5,5>: Cost 3 vext3 LHS, <4,5,5,7>
+  537709878U,	// <3,4,5,6>: Cost 1 vext3 LHS, RHS
+  2594272523U,	// <3,4,5,7>: Cost 3 vext1 <7,3,4,5>, <7,3,4,5>
+  537709896U,	// <3,4,5,u>: Cost 1 vext3 LHS, RHS
+  2689838411U,	// <3,4,6,0>: Cost 3 vext3 LHS, <4,6,0,1>
+  2558444534U,	// <3,4,6,1>: Cost 3 vext1 <1,3,4,6>, <1,3,4,6>
+  2666607098U,	// <3,4,6,2>: Cost 3 vext2 <u,2,3,4>, <6,2,7,3>
+  2558446082U,	// <3,4,6,3>: Cost 3 vext1 <1,3,4,6>, <3,4,5,6>
+  1659227508U,	// <3,4,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2689838462U,	// <3,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  2689838471U,	// <3,4,6,6>: Cost 3 vext3 LHS, <4,6,6,7>
+  2657981292U,	// <3,4,6,7>: Cost 3 vext2 <6,7,3,4>, <6,7,3,4>
+  1659227540U,	// <3,4,6,u>: Cost 2 vext3 LHS, <4,6,u,2>
+  2666607610U,	// <3,4,7,0>: Cost 3 vext2 <u,2,3,4>, <7,0,1,2>
+  3702527072U,	// <3,4,7,1>: Cost 4 vext2 <1,u,3,4>, <7,1,3,5>
+  2660635824U,	// <3,4,7,2>: Cost 3 vext2 <7,2,3,4>, <7,2,3,4>
+  3644139945U,	// <3,4,7,3>: Cost 4 vext1 <3,3,4,7>, <3,3,4,7>
+  2666607974U,	// <3,4,7,4>: Cost 3 vext2 <u,2,3,4>, <7,4,5,6>
+  2732969416U,	// <3,4,7,5>: Cost 3 vext3 LHS, <4,7,5,0>
+  2732969425U,	// <3,4,7,6>: Cost 3 vext3 LHS, <4,7,6,0>
+  2666608236U,	// <3,4,7,7>: Cost 3 vext2 <u,2,3,4>, <7,7,7,7>
+  2664617622U,	// <3,4,7,u>: Cost 3 vext2 <7,u,3,4>, <7,u,3,4>
+  1490690150U,	// <3,4,u,0>: Cost 2 vext1 <2,3,4,u>, LHS
+  1551062830U,	// <3,4,u,1>: Cost 2 vext2 <1,2,3,4>, LHS
+  1490691793U,	// <3,4,u,2>: Cost 2 vext1 <2,3,4,u>, <2,3,4,u>
+  2624804796U,	// <3,4,u,3>: Cost 3 vext2 <1,2,3,4>, <u,3,0,1>
+  1490693430U,	// <3,4,u,4>: Cost 2 vext1 <2,3,4,u>, RHS
+  1551063194U,	// <3,4,u,5>: Cost 2 vext2 <1,2,3,4>, RHS
+  537710121U,	// <3,4,u,6>: Cost 1 vext3 LHS, RHS
+  2594297102U,	// <3,4,u,7>: Cost 3 vext1 <7,3,4,u>, <7,3,4,u>
+  537710139U,	// <3,4,u,u>: Cost 1 vext3 LHS, RHS
+  3692576768U,	// <3,5,0,0>: Cost 4 vext2 <0,2,3,5>, <0,0,0,0>
+  2618835046U,	// <3,5,0,1>: Cost 3 vext2 <0,2,3,5>, LHS
+  2618835138U,	// <3,5,0,2>: Cost 3 vext2 <0,2,3,5>, <0,2,3,5>
+  3692577024U,	// <3,5,0,3>: Cost 4 vext2 <0,2,3,5>, <0,3,1,4>
+  2689838690U,	// <3,5,0,4>: Cost 3 vext3 LHS, <5,0,4,1>
+  2732969579U,	// <3,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2732969588U,	// <3,5,0,6>: Cost 3 vext3 LHS, <5,0,6,1>
+  2246963055U,	// <3,5,0,7>: Cost 3 vrev <5,3,7,0>
+  2618835613U,	// <3,5,0,u>: Cost 3 vext2 <0,2,3,5>, LHS
+  2594308198U,	// <3,5,1,0>: Cost 3 vext1 <7,3,5,1>, LHS
+  3692577588U,	// <3,5,1,1>: Cost 4 vext2 <0,2,3,5>, <1,1,1,1>
+  2624807835U,	// <3,5,1,2>: Cost 3 vext2 <1,2,3,5>, <1,2,3,5>
+  2625471468U,	// <3,5,1,3>: Cost 3 vext2 <1,3,3,5>, <1,3,3,5>
+  2626135101U,	// <3,5,1,4>: Cost 3 vext2 <1,4,3,5>, <1,4,3,5>
+  2594311888U,	// <3,5,1,5>: Cost 3 vext1 <7,3,5,1>, <5,1,7,3>
+  3699877107U,	// <3,5,1,6>: Cost 4 vext2 <1,4,3,5>, <1,6,5,7>
+  1641680592U,	// <3,5,1,7>: Cost 2 vext3 <5,1,7,3>, <5,1,7,3>
+  1641754329U,	// <3,5,1,u>: Cost 2 vext3 <5,1,u,3>, <5,1,u,3>
+  3692578274U,	// <3,5,2,0>: Cost 4 vext2 <0,2,3,5>, <2,0,5,3>
+  2630116899U,	// <3,5,2,1>: Cost 3 vext2 <2,1,3,5>, <2,1,3,5>
+  3692578408U,	// <3,5,2,2>: Cost 4 vext2 <0,2,3,5>, <2,2,2,2>
+  2625472206U,	// <3,5,2,3>: Cost 3 vext2 <1,3,3,5>, <2,3,4,5>
+  2632107798U,	// <3,5,2,4>: Cost 3 vext2 <2,4,3,5>, <2,4,3,5>
+  2715938575U,	// <3,5,2,5>: Cost 3 vext3 <5,2,5,3>, <5,2,5,3>
+  3692578746U,	// <3,5,2,6>: Cost 4 vext2 <0,2,3,5>, <2,6,3,7>
+  2716086049U,	// <3,5,2,7>: Cost 3 vext3 <5,2,7,3>, <5,2,7,3>
+  2634762330U,	// <3,5,2,u>: Cost 3 vext2 <2,u,3,5>, <2,u,3,5>
+  3692578966U,	// <3,5,3,0>: Cost 4 vext2 <0,2,3,5>, <3,0,1,2>
+  2636089596U,	// <3,5,3,1>: Cost 3 vext2 <3,1,3,5>, <3,1,3,5>
+  3699214668U,	// <3,5,3,2>: Cost 4 vext2 <1,3,3,5>, <3,2,3,4>
+  2638080412U,	// <3,5,3,3>: Cost 3 vext2 <3,4,3,5>, <3,3,3,3>
+  2618837506U,	// <3,5,3,4>: Cost 3 vext2 <0,2,3,5>, <3,4,5,6>
+  2832844494U,	// <3,5,3,5>: Cost 3 vuzpr <2,3,4,5>, <2,3,4,5>
+  4033415682U,	// <3,5,3,6>: Cost 4 vzipr <1,1,3,3>, <3,4,5,6>
+  3095072054U,	// <3,5,3,7>: Cost 3 vtrnr <1,3,1,3>, RHS
+  3095072055U,	// <3,5,3,u>: Cost 3 vtrnr <1,3,1,3>, RHS
+  2600304742U,	// <3,5,4,0>: Cost 3 vext1 <u,3,5,4>, LHS
+  3763580815U,	// <3,5,4,1>: Cost 4 vext3 LHS, <5,4,1,5>
+  2564474582U,	// <3,5,4,2>: Cost 3 vext1 <2,3,5,4>, <2,3,5,4>
+  3699879044U,	// <3,5,4,3>: Cost 4 vext2 <1,4,3,5>, <4,3,5,0>
+  2600308022U,	// <3,5,4,4>: Cost 3 vext1 <u,3,5,4>, RHS
+  2618838326U,	// <3,5,4,5>: Cost 3 vext2 <0,2,3,5>, RHS
+  2772454710U,	// <3,5,4,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1659228102U,	// <3,5,4,7>: Cost 2 vext3 LHS, <5,4,7,6>
+  1659228111U,	// <3,5,4,u>: Cost 2 vext3 LHS, <5,4,u,6>
+  2570453094U,	// <3,5,5,0>: Cost 3 vext1 <3,3,5,5>, LHS
+  2624810704U,	// <3,5,5,1>: Cost 3 vext2 <1,2,3,5>, <5,1,7,3>
+  2570454734U,	// <3,5,5,2>: Cost 3 vext1 <3,3,5,5>, <2,3,4,5>
+  2570455472U,	// <3,5,5,3>: Cost 3 vext1 <3,3,5,5>, <3,3,5,5>
+  2570456374U,	// <3,5,5,4>: Cost 3 vext1 <3,3,5,5>, RHS
+  1659228164U,	// <3,5,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  2732969998U,	// <3,5,5,6>: Cost 3 vext3 LHS, <5,5,6,6>
+  1659228184U,	// <3,5,5,7>: Cost 2 vext3 LHS, <5,5,7,7>
+  1659228193U,	// <3,5,5,u>: Cost 2 vext3 LHS, <5,5,u,7>
+  2732970020U,	// <3,5,6,0>: Cost 3 vext3 LHS, <5,6,0,1>
+  2732970035U,	// <3,5,6,1>: Cost 3 vext3 LHS, <5,6,1,7>
+  2564490968U,	// <3,5,6,2>: Cost 3 vext1 <2,3,5,6>, <2,3,5,6>
+  2732970050U,	// <3,5,6,3>: Cost 3 vext3 LHS, <5,6,3,4>
+  2732970060U,	// <3,5,6,4>: Cost 3 vext3 LHS, <5,6,4,5>
+  2732970071U,	// <3,5,6,5>: Cost 3 vext3 LHS, <5,6,5,7>
+  2732970080U,	// <3,5,6,6>: Cost 3 vext3 LHS, <5,6,6,7>
+  1659228258U,	// <3,5,6,7>: Cost 2 vext3 LHS, <5,6,7,0>
+  1659228267U,	// <3,5,6,u>: Cost 2 vext3 LHS, <5,6,u,0>
+  1484783718U,	// <3,5,7,0>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484784640U,	// <3,5,7,1>: Cost 2 vext1 <1,3,5,7>, <1,3,5,7>
+  2558527080U,	// <3,5,7,2>: Cost 3 vext1 <1,3,5,7>, <2,2,2,2>
+  2558527638U,	// <3,5,7,3>: Cost 3 vext1 <1,3,5,7>, <3,0,1,2>
+  1484786998U,	// <3,5,7,4>: Cost 2 vext1 <1,3,5,7>, RHS
+  1659228328U,	// <3,5,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2732970154U,	// <3,5,7,6>: Cost 3 vext3 LHS, <5,7,6,0>
+  2558531180U,	// <3,5,7,7>: Cost 3 vext1 <1,3,5,7>, <7,7,7,7>
+  1484789550U,	// <3,5,7,u>: Cost 2 vext1 <1,3,5,7>, LHS
+  1484791910U,	// <3,5,u,0>: Cost 2 vext1 <1,3,5,u>, LHS
+  1484792833U,	// <3,5,u,1>: Cost 2 vext1 <1,3,5,u>, <1,3,5,u>
+  2558535272U,	// <3,5,u,2>: Cost 3 vext1 <1,3,5,u>, <2,2,2,2>
+  2558535830U,	// <3,5,u,3>: Cost 3 vext1 <1,3,5,u>, <3,0,1,2>
+  1484795190U,	// <3,5,u,4>: Cost 2 vext1 <1,3,5,u>, RHS
+  1659228409U,	// <3,5,u,5>: Cost 2 vext3 LHS, <5,u,5,7>
+  2772457626U,	// <3,5,u,6>: Cost 3 vuzpl <3,4,5,6>, RHS
+  1646326023U,	// <3,5,u,7>: Cost 2 vext3 <5,u,7,3>, <5,u,7,3>
+  1484797742U,	// <3,5,u,u>: Cost 2 vext1 <1,3,5,u>, LHS
+  2558541926U,	// <3,6,0,0>: Cost 3 vext1 <1,3,6,0>, LHS
+  2689839393U,	// <3,6,0,1>: Cost 3 vext3 LHS, <6,0,1,2>
+  2689839404U,	// <3,6,0,2>: Cost 3 vext3 LHS, <6,0,2,4>
+  3706519808U,	// <3,6,0,3>: Cost 4 vext2 <2,5,3,6>, <0,3,1,4>
+  2689839420U,	// <3,6,0,4>: Cost 3 vext3 LHS, <6,0,4,2>
+  2732970314U,	// <3,6,0,5>: Cost 3 vext3 LHS, <6,0,5,7>
+  2732970316U,	// <3,6,0,6>: Cost 3 vext3 LHS, <6,0,6,0>
+  2960313654U,	// <3,6,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  2689839456U,	// <3,6,0,u>: Cost 3 vext3 LHS, <6,0,u,2>
+  3763581290U,	// <3,6,1,0>: Cost 4 vext3 LHS, <6,1,0,3>
+  3763581297U,	// <3,6,1,1>: Cost 4 vext3 LHS, <6,1,1,1>
+  2624816028U,	// <3,6,1,2>: Cost 3 vext2 <1,2,3,6>, <1,2,3,6>
+  3763581315U,	// <3,6,1,3>: Cost 4 vext3 LHS, <6,1,3,1>
+  2626143294U,	// <3,6,1,4>: Cost 3 vext2 <1,4,3,6>, <1,4,3,6>
+  3763581335U,	// <3,6,1,5>: Cost 4 vext3 LHS, <6,1,5,3>
+  2721321376U,	// <3,6,1,6>: Cost 3 vext3 <6,1,6,3>, <6,1,6,3>
+  2721395113U,	// <3,6,1,7>: Cost 3 vext3 <6,1,7,3>, <6,1,7,3>
+  2628797826U,	// <3,6,1,u>: Cost 3 vext2 <1,u,3,6>, <1,u,3,6>
+  2594390118U,	// <3,6,2,0>: Cost 3 vext1 <7,3,6,2>, LHS
+  2721616324U,	// <3,6,2,1>: Cost 3 vext3 <6,2,1,3>, <6,2,1,3>
+  2630788725U,	// <3,6,2,2>: Cost 3 vext2 <2,2,3,6>, <2,2,3,6>
+  3763581395U,	// <3,6,2,3>: Cost 4 vext3 LHS, <6,2,3,0>
+  2632115991U,	// <3,6,2,4>: Cost 3 vext2 <2,4,3,6>, <2,4,3,6>
+  2632779624U,	// <3,6,2,5>: Cost 3 vext2 <2,5,3,6>, <2,5,3,6>
+  2594394618U,	// <3,6,2,6>: Cost 3 vext1 <7,3,6,2>, <6,2,7,3>
+  1648316922U,	// <3,6,2,7>: Cost 2 vext3 <6,2,7,3>, <6,2,7,3>
+  1648390659U,	// <3,6,2,u>: Cost 2 vext3 <6,2,u,3>, <6,2,u,3>
+  3693914262U,	// <3,6,3,0>: Cost 4 vext2 <0,4,3,6>, <3,0,1,2>
+  3638281176U,	// <3,6,3,1>: Cost 4 vext1 <2,3,6,3>, <1,3,1,3>
+  3696568678U,	// <3,6,3,2>: Cost 4 vext2 <0,u,3,6>, <3,2,6,3>
+  2638088604U,	// <3,6,3,3>: Cost 3 vext2 <3,4,3,6>, <3,3,3,3>
+  2632780290U,	// <3,6,3,4>: Cost 3 vext2 <2,5,3,6>, <3,4,5,6>
+  3712494145U,	// <3,6,3,5>: Cost 4 vext2 <3,5,3,6>, <3,5,3,6>
+  3698559612U,	// <3,6,3,6>: Cost 4 vext2 <1,2,3,6>, <3,6,1,2>
+  2959674678U,	// <3,6,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  2959674679U,	// <3,6,3,u>: Cost 3 vzipr <1,1,3,3>, RHS
+  3763581536U,	// <3,6,4,0>: Cost 4 vext3 LHS, <6,4,0,6>
+  2722943590U,	// <3,6,4,1>: Cost 3 vext3 <6,4,1,3>, <6,4,1,3>
+  2732970609U,	// <3,6,4,2>: Cost 3 vext3 LHS, <6,4,2,5>
+  3698560147U,	// <3,6,4,3>: Cost 4 vext2 <1,2,3,6>, <4,3,6,6>
+  2732970628U,	// <3,6,4,4>: Cost 3 vext3 LHS, <6,4,4,6>
+  2689839757U,	// <3,6,4,5>: Cost 3 vext3 LHS, <6,4,5,6>
+  2732970640U,	// <3,6,4,6>: Cost 3 vext3 LHS, <6,4,6,0>
+  2960346422U,	// <3,6,4,7>: Cost 3 vzipr <1,2,3,4>, RHS
+  2689839784U,	// <3,6,4,u>: Cost 3 vext3 LHS, <6,4,u,6>
+  2576498790U,	// <3,6,5,0>: Cost 3 vext1 <4,3,6,5>, LHS
+  3650241270U,	// <3,6,5,1>: Cost 4 vext1 <4,3,6,5>, <1,0,3,2>
+  2732970692U,	// <3,6,5,2>: Cost 3 vext3 LHS, <6,5,2,7>
+  2576501250U,	// <3,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  2576501906U,	// <3,6,5,4>: Cost 3 vext1 <4,3,6,5>, <4,3,6,5>
+  3650244622U,	// <3,6,5,5>: Cost 4 vext1 <4,3,6,5>, <5,5,6,6>
+  4114633528U,	// <3,6,5,6>: Cost 4 vtrnl <3,4,5,6>, <6,6,6,6>
+  2732970735U,	// <3,6,5,7>: Cost 3 vext3 LHS, <6,5,7,5>
+  2576504622U,	// <3,6,5,u>: Cost 3 vext1 <4,3,6,5>, LHS
+  2732970749U,	// <3,6,6,0>: Cost 3 vext3 LHS, <6,6,0,1>
+  2724270856U,	// <3,6,6,1>: Cost 3 vext3 <6,6,1,3>, <6,6,1,3>
+  2624819706U,	// <3,6,6,2>: Cost 3 vext2 <1,2,3,6>, <6,2,7,3>
+  3656223234U,	// <3,6,6,3>: Cost 4 vext1 <5,3,6,6>, <3,4,5,6>
+  2732970788U,	// <3,6,6,4>: Cost 3 vext3 LHS, <6,6,4,4>
+  2732970800U,	// <3,6,6,5>: Cost 3 vext3 LHS, <6,6,5,7>
+  1659228984U,	// <3,6,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659228994U,	// <3,6,6,7>: Cost 2 vext3 LHS, <6,6,7,7>
+  1659229003U,	// <3,6,6,u>: Cost 2 vext3 LHS, <6,6,u,7>
+  1659229006U,	// <3,6,7,0>: Cost 2 vext3 LHS, <6,7,0,1>
+  2558600201U,	// <3,6,7,1>: Cost 3 vext1 <1,3,6,7>, <1,3,6,7>
+  2558601146U,	// <3,6,7,2>: Cost 3 vext1 <1,3,6,7>, <2,6,3,7>
+  2725081963U,	// <3,6,7,3>: Cost 3 vext3 <6,7,3,3>, <6,7,3,3>
+  1659229046U,	// <3,6,7,4>: Cost 2 vext3 LHS, <6,7,4,5>
+  2715423611U,	// <3,6,7,5>: Cost 3 vext3 <5,1,7,3>, <6,7,5,1>
+  2722059141U,	// <3,6,7,6>: Cost 3 vext3 <6,2,7,3>, <6,7,6,2>
+  2962361654U,	// <3,6,7,7>: Cost 3 vzipr <1,5,3,7>, RHS
+  1659229078U,	// <3,6,7,u>: Cost 2 vext3 LHS, <6,7,u,1>
+  1659229087U,	// <3,6,u,0>: Cost 2 vext3 LHS, <6,u,0,1>
+  2689840041U,	// <3,6,u,1>: Cost 3 vext3 LHS, <6,u,1,2>
+  2558609339U,	// <3,6,u,2>: Cost 3 vext1 <1,3,6,u>, <2,6,3,u>
+  2576525853U,	// <3,6,u,3>: Cost 3 vext1 <4,3,6,u>, <3,4,u,6>
+  1659229127U,	// <3,6,u,4>: Cost 2 vext3 LHS, <6,u,4,5>
+  2689840081U,	// <3,6,u,5>: Cost 3 vext3 LHS, <6,u,5,6>
+  1659228984U,	// <3,6,u,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1652298720U,	// <3,6,u,7>: Cost 2 vext3 <6,u,7,3>, <6,u,7,3>
+  1659229159U,	// <3,6,u,u>: Cost 2 vext3 LHS, <6,u,u,1>
+  2626813952U,	// <3,7,0,0>: Cost 3 vext2 <1,5,3,7>, <0,0,0,0>
+  1553072230U,	// <3,7,0,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814116U,	// <3,7,0,2>: Cost 3 vext2 <1,5,3,7>, <0,2,0,2>
+  3700556028U,	// <3,7,0,3>: Cost 4 vext2 <1,5,3,7>, <0,3,1,0>
+  2626814290U,	// <3,7,0,4>: Cost 3 vext2 <1,5,3,7>, <0,4,1,5>
+  2582507375U,	// <3,7,0,5>: Cost 3 vext1 <5,3,7,0>, <5,3,7,0>
+  2588480072U,	// <3,7,0,6>: Cost 3 vext1 <6,3,7,0>, <6,3,7,0>
+  2732971055U,	// <3,7,0,7>: Cost 3 vext3 LHS, <7,0,7,1>
+  1553072797U,	// <3,7,0,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626814710U,	// <3,7,1,0>: Cost 3 vext2 <1,5,3,7>, <1,0,3,2>
+  2626814772U,	// <3,7,1,1>: Cost 3 vext2 <1,5,3,7>, <1,1,1,1>
+  2626814870U,	// <3,7,1,2>: Cost 3 vext2 <1,5,3,7>, <1,2,3,0>
+  2625487854U,	// <3,7,1,3>: Cost 3 vext2 <1,3,3,7>, <1,3,3,7>
+  2582514998U,	// <3,7,1,4>: Cost 3 vext1 <5,3,7,1>, RHS
+  1553073296U,	// <3,7,1,5>: Cost 2 vext2 <1,5,3,7>, <1,5,3,7>
+  2627478753U,	// <3,7,1,6>: Cost 3 vext2 <1,6,3,7>, <1,6,3,7>
+  2727367810U,	// <3,7,1,7>: Cost 3 vext3 <7,1,7,3>, <7,1,7,3>
+  1555064195U,	// <3,7,1,u>: Cost 2 vext2 <1,u,3,7>, <1,u,3,7>
+  2588491878U,	// <3,7,2,0>: Cost 3 vext1 <6,3,7,2>, LHS
+  3700557318U,	// <3,7,2,1>: Cost 4 vext2 <1,5,3,7>, <2,1,0,3>
+  2626815592U,	// <3,7,2,2>: Cost 3 vext2 <1,5,3,7>, <2,2,2,2>
+  2626815654U,	// <3,7,2,3>: Cost 3 vext2 <1,5,3,7>, <2,3,0,1>
+  2588495158U,	// <3,7,2,4>: Cost 3 vext1 <6,3,7,2>, RHS
+  2632787817U,	// <3,7,2,5>: Cost 3 vext2 <2,5,3,7>, <2,5,3,7>
+  1559709626U,	// <3,7,2,6>: Cost 2 vext2 <2,6,3,7>, <2,6,3,7>
+  2728031443U,	// <3,7,2,7>: Cost 3 vext3 <7,2,7,3>, <7,2,7,3>
+  1561036892U,	// <3,7,2,u>: Cost 2 vext2 <2,u,3,7>, <2,u,3,7>
+  2626816150U,	// <3,7,3,0>: Cost 3 vext2 <1,5,3,7>, <3,0,1,2>
+  2626816268U,	// <3,7,3,1>: Cost 3 vext2 <1,5,3,7>, <3,1,5,3>
+  2633451878U,	// <3,7,3,2>: Cost 3 vext2 <2,6,3,7>, <3,2,6,3>
+  2626816412U,	// <3,7,3,3>: Cost 3 vext2 <1,5,3,7>, <3,3,3,3>
+  2626816514U,	// <3,7,3,4>: Cost 3 vext2 <1,5,3,7>, <3,4,5,6>
+  2638760514U,	// <3,7,3,5>: Cost 3 vext2 <3,5,3,7>, <3,5,3,7>
+  2639424147U,	// <3,7,3,6>: Cost 3 vext2 <3,6,3,7>, <3,6,3,7>
+  2826961920U,	// <3,7,3,7>: Cost 3 vuzpr <1,3,5,7>, <1,3,5,7>
+  2626816798U,	// <3,7,3,u>: Cost 3 vext2 <1,5,3,7>, <3,u,1,2>
+  2582536294U,	// <3,7,4,0>: Cost 3 vext1 <5,3,7,4>, LHS
+  2582537360U,	// <3,7,4,1>: Cost 3 vext1 <5,3,7,4>, <1,5,3,7>
+  2588510138U,	// <3,7,4,2>: Cost 3 vext1 <6,3,7,4>, <2,6,3,7>
+  3700558996U,	// <3,7,4,3>: Cost 4 vext2 <1,5,3,7>, <4,3,6,7>
+  2582539574U,	// <3,7,4,4>: Cost 3 vext1 <5,3,7,4>, RHS
+  1553075510U,	// <3,7,4,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  2588512844U,	// <3,7,4,6>: Cost 3 vext1 <6,3,7,4>, <6,3,7,4>
+  2564625766U,	// <3,7,4,7>: Cost 3 vext1 <2,3,7,4>, <7,4,5,6>
+  1553075753U,	// <3,7,4,u>: Cost 2 vext2 <1,5,3,7>, RHS
+  2732971398U,	// <3,7,5,0>: Cost 3 vext3 LHS, <7,5,0,2>
+  2626817744U,	// <3,7,5,1>: Cost 3 vext2 <1,5,3,7>, <5,1,7,3>
+  3700559649U,	// <3,7,5,2>: Cost 4 vext2 <1,5,3,7>, <5,2,7,3>
+  2626817903U,	// <3,7,5,3>: Cost 3 vext2 <1,5,3,7>, <5,3,7,0>
+  2258728203U,	// <3,7,5,4>: Cost 3 vrev <7,3,4,5>
+  2732971446U,	// <3,7,5,5>: Cost 3 vext3 LHS, <7,5,5,5>
+  2732971457U,	// <3,7,5,6>: Cost 3 vext3 LHS, <7,5,6,7>
+  2826964278U,	// <3,7,5,7>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2826964279U,	// <3,7,5,u>: Cost 3 vuzpr <1,3,5,7>, RHS
+  2732971478U,	// <3,7,6,0>: Cost 3 vext3 LHS, <7,6,0,1>
+  2732971486U,	// <3,7,6,1>: Cost 3 vext3 LHS, <7,6,1,0>
+  2633454074U,	// <3,7,6,2>: Cost 3 vext2 <2,6,3,7>, <6,2,7,3>
+  2633454152U,	// <3,7,6,3>: Cost 3 vext2 <2,6,3,7>, <6,3,7,0>
+  2732971518U,	// <3,7,6,4>: Cost 3 vext3 LHS, <7,6,4,5>
+  2732971526U,	// <3,7,6,5>: Cost 3 vext3 LHS, <7,6,5,4>
+  2732971537U,	// <3,7,6,6>: Cost 3 vext3 LHS, <7,6,6,6>
+  2732971540U,	// <3,7,6,7>: Cost 3 vext3 LHS, <7,6,7,0>
+  2726041124U,	// <3,7,6,u>: Cost 3 vext3 <6,u,7,3>, <7,6,u,7>
+  2570616934U,	// <3,7,7,0>: Cost 3 vext1 <3,3,7,7>, LHS
+  2570617856U,	// <3,7,7,1>: Cost 3 vext1 <3,3,7,7>, <1,3,5,7>
+  2564646635U,	// <3,7,7,2>: Cost 3 vext1 <2,3,7,7>, <2,3,7,7>
+  2570619332U,	// <3,7,7,3>: Cost 3 vext1 <3,3,7,7>, <3,3,7,7>
+  2570620214U,	// <3,7,7,4>: Cost 3 vext1 <3,3,7,7>, RHS
+  2582564726U,	// <3,7,7,5>: Cost 3 vext1 <5,3,7,7>, <5,3,7,7>
+  2588537423U,	// <3,7,7,6>: Cost 3 vext1 <6,3,7,7>, <6,3,7,7>
+  1659229804U,	// <3,7,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1659229804U,	// <3,7,7,u>: Cost 2 vext3 LHS, <7,7,7,7>
+  2626819795U,	// <3,7,u,0>: Cost 3 vext2 <1,5,3,7>, <u,0,1,2>
+  1553078062U,	// <3,7,u,1>: Cost 2 vext2 <1,5,3,7>, LHS
+  2626819973U,	// <3,7,u,2>: Cost 3 vext2 <1,5,3,7>, <u,2,3,0>
+  2826961565U,	// <3,7,u,3>: Cost 3 vuzpr <1,3,5,7>, LHS
+  2626820159U,	// <3,7,u,4>: Cost 3 vext2 <1,5,3,7>, <u,4,5,6>
+  1553078426U,	// <3,7,u,5>: Cost 2 vext2 <1,5,3,7>, RHS
+  1595545808U,	// <3,7,u,6>: Cost 2 vext2 <u,6,3,7>, <u,6,3,7>
+  1659229804U,	// <3,7,u,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1553078629U,	// <3,7,u,u>: Cost 2 vext2 <1,5,3,7>, LHS
+  1611448320U,	// <3,u,0,0>: Cost 2 vext3 LHS, <0,0,0,0>
+  1611896531U,	// <3,u,0,1>: Cost 2 vext3 LHS, <u,0,1,2>
+  1659672284U,	// <3,u,0,2>: Cost 2 vext3 LHS, <u,0,2,2>
+  1616099045U,	// <3,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  2685638381U,	// <3,u,0,4>: Cost 3 vext3 LHS, <u,0,4,1>
+  1663874806U,	// <3,u,0,5>: Cost 2 vext3 LHS, <u,0,5,1>
+  1663874816U,	// <3,u,0,6>: Cost 2 vext3 LHS, <u,0,6,2>
+  2960313672U,	// <3,u,0,7>: Cost 3 vzipr <1,2,3,0>, RHS
+  1611896594U,	// <3,u,0,u>: Cost 2 vext3 LHS, <u,0,u,2>
+  1549763324U,	// <3,u,1,0>: Cost 2 vext2 <1,0,3,u>, <1,0,3,u>
+  1550426957U,	// <3,u,1,1>: Cost 2 vext2 <1,1,3,u>, <1,1,3,u>
+  537712430U,	// <3,u,1,2>: Cost 1 vext3 LHS, LHS
+  1616541495U,	// <3,u,1,3>: Cost 2 vext3 LHS, <u,1,3,3>
+  1490930998U,	// <3,u,1,4>: Cost 2 vext1 <2,3,u,1>, RHS
+  1553081489U,	// <3,u,1,5>: Cost 2 vext2 <1,5,3,u>, <1,5,3,u>
+  2627486946U,	// <3,u,1,6>: Cost 3 vext2 <1,6,3,u>, <1,6,3,u>
+  1659230043U,	// <3,u,1,7>: Cost 2 vext3 LHS, <u,1,7,3>
+  537712484U,	// <3,u,1,u>: Cost 1 vext3 LHS, LHS
+  1611890852U,	// <3,u,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2624833102U,	// <3,u,2,1>: Cost 3 vext2 <1,2,3,u>, <2,1,u,3>
+  1557063287U,	// <3,u,2,2>: Cost 2 vext2 <2,2,3,u>, <2,2,3,u>
+  1616099205U,	// <3,u,2,3>: Cost 2 vext3 LHS, <u,2,3,0>
+  1611890892U,	// <3,u,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2689841054U,	// <3,u,2,5>: Cost 3 vext3 LHS, <u,2,5,7>
+  1559717819U,	// <3,u,2,6>: Cost 2 vext2 <2,6,3,u>, <2,6,3,u>
+  1659230124U,	// <3,u,2,7>: Cost 2 vext3 LHS, <u,2,7,3>
+  1616541618U,	// <3,u,2,u>: Cost 2 vext3 LHS, <u,2,u,0>
+  1611896764U,	// <3,u,3,0>: Cost 2 vext3 LHS, <u,3,0,1>
+  1484973079U,	// <3,u,3,1>: Cost 2 vext1 <1,3,u,3>, <1,3,u,3>
+  2685638607U,	// <3,u,3,2>: Cost 3 vext3 LHS, <u,3,2,2>
+  336380006U,	// <3,u,3,3>: Cost 1 vdup3 LHS
+  1611896804U,	// <3,u,3,4>: Cost 2 vext3 LHS, <u,3,4,5>
+  1616541679U,	// <3,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  2690283512U,	// <3,u,3,6>: Cost 3 vext3 LHS, <u,3,6,7>
+  2959674696U,	// <3,u,3,7>: Cost 3 vzipr <1,1,3,3>, RHS
+  336380006U,	// <3,u,3,u>: Cost 1 vdup3 LHS
+  2558722150U,	// <3,u,4,0>: Cost 3 vext1 <1,3,u,4>, LHS
+  1659672602U,	// <3,u,4,1>: Cost 2 vext3 LHS, <u,4,1,5>
+  1659672612U,	// <3,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  2689841196U,	// <3,u,4,3>: Cost 3 vext3 LHS, <u,4,3,5>
+  1659227344U,	// <3,u,4,4>: Cost 2 vext3 LHS, <4,4,4,4>
+  1611896895U,	// <3,u,4,5>: Cost 2 vext3 LHS, <u,4,5,6>
+  1663875144U,	// <3,u,4,6>: Cost 2 vext3 LHS, <u,4,6,6>
+  1659230289U,	// <3,u,4,7>: Cost 2 vext3 LHS, <u,4,7,6>
+  1611896922U,	// <3,u,4,u>: Cost 2 vext3 LHS, <u,4,u,6>
+  1490960486U,	// <3,u,5,0>: Cost 2 vext1 <2,3,u,5>, LHS
+  2689841261U,	// <3,u,5,1>: Cost 3 vext3 LHS, <u,5,1,7>
+  1490962162U,	// <3,u,5,2>: Cost 2 vext1 <2,3,u,5>, <2,3,u,5>
+  1616541823U,	// <3,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1490963766U,	// <3,u,5,4>: Cost 2 vext1 <2,3,u,5>, RHS
+  1659228164U,	// <3,u,5,5>: Cost 2 vext3 LHS, <5,5,5,5>
+  537712794U,	// <3,u,5,6>: Cost 1 vext3 LHS, RHS
+  1659230371U,	// <3,u,5,7>: Cost 2 vext3 LHS, <u,5,7,7>
+  537712812U,	// <3,u,5,u>: Cost 1 vext3 LHS, RHS
+  2689841327U,	// <3,u,6,0>: Cost 3 vext3 LHS, <u,6,0,1>
+  2558739482U,	// <3,u,6,1>: Cost 3 vext1 <1,3,u,6>, <1,3,u,6>
+  2689841351U,	// <3,u,6,2>: Cost 3 vext3 LHS, <u,6,2,7>
+  1616099536U,	// <3,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1659227508U,	// <3,u,6,4>: Cost 2 vext3 LHS, <4,6,4,6>
+  2690283746U,	// <3,u,6,5>: Cost 3 vext3 LHS, <u,6,5,7>
+  1659228984U,	// <3,u,6,6>: Cost 2 vext3 LHS, <6,6,6,6>
+  1659230445U,	// <3,u,6,7>: Cost 2 vext3 LHS, <u,6,7,0>
+  1616099581U,	// <3,u,6,u>: Cost 2 vext3 LHS, <u,6,u,7>
+  1485004902U,	// <3,u,7,0>: Cost 2 vext1 <1,3,u,7>, LHS
+  1485005851U,	// <3,u,7,1>: Cost 2 vext1 <1,3,u,7>, <1,3,u,7>
+  2558748264U,	// <3,u,7,2>: Cost 3 vext1 <1,3,u,7>, <2,2,2,2>
+  3095397021U,	// <3,u,7,3>: Cost 3 vtrnr <1,3,5,7>, LHS
+  1485008182U,	// <3,u,7,4>: Cost 2 vext1 <1,3,u,7>, RHS
+  1659228328U,	// <3,u,7,5>: Cost 2 vext3 LHS, <5,7,5,7>
+  2722060599U,	// <3,u,7,6>: Cost 3 vext3 <6,2,7,3>, <u,7,6,2>
+  1659229804U,	// <3,u,7,7>: Cost 2 vext3 LHS, <7,7,7,7>
+  1485010734U,	// <3,u,7,u>: Cost 2 vext1 <1,3,u,7>, LHS
+  1616099665U,	// <3,u,u,0>: Cost 2 vext3 LHS, <u,u,0,1>
+  1611897179U,	// <3,u,u,1>: Cost 2 vext3 LHS, <u,u,1,2>
+  537712997U,	// <3,u,u,2>: Cost 1 vext3 LHS, LHS
+  336380006U,	// <3,u,u,3>: Cost 1 vdup3 LHS
+  1616099705U,	// <3,u,u,4>: Cost 2 vext3 LHS, <u,u,4,5>
+  1611897219U,	// <3,u,u,5>: Cost 2 vext3 LHS, <u,u,5,6>
+  537713037U,	// <3,u,u,6>: Cost 1 vext3 LHS, RHS
+  1659230607U,	// <3,u,u,7>: Cost 2 vext3 LHS, <u,u,7,0>
+  537713051U,	// <3,u,u,u>: Cost 1 vext3 LHS, LHS
+  2691907584U,	// <4,0,0,0>: Cost 3 vext3 <1,2,3,4>, <0,0,0,0>
+  2691907594U,	// <4,0,0,1>: Cost 3 vext3 <1,2,3,4>, <0,0,1,1>
+  2691907604U,	// <4,0,0,2>: Cost 3 vext3 <1,2,3,4>, <0,0,2,2>
+  3709862144U,	// <4,0,0,3>: Cost 4 vext2 <3,1,4,0>, <0,3,1,4>
+  2684682280U,	// <4,0,0,4>: Cost 3 vext3 <0,0,4,4>, <0,0,4,4>
+  3694600633U,	// <4,0,0,5>: Cost 4 vext2 <0,5,4,0>, <0,5,4,0>
+  3291431290U,	// <4,0,0,6>: Cost 4 vrev <0,4,6,0>
+  3668342067U,	// <4,0,0,7>: Cost 4 vext1 <7,4,0,0>, <7,4,0,0>
+  2691907657U,	// <4,0,0,u>: Cost 3 vext3 <1,2,3,4>, <0,0,u,1>
+  2570715238U,	// <4,0,1,0>: Cost 3 vext1 <3,4,0,1>, LHS
+  2570716058U,	// <4,0,1,1>: Cost 3 vext1 <3,4,0,1>, <1,2,3,4>
+  1618165862U,	// <4,0,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570717648U,	// <4,0,1,3>: Cost 3 vext1 <3,4,0,1>, <3,4,0,1>
+  2570718518U,	// <4,0,1,4>: Cost 3 vext1 <3,4,0,1>, RHS
+  2594607206U,	// <4,0,1,5>: Cost 3 vext1 <7,4,0,1>, <5,6,7,4>
+  3662377563U,	// <4,0,1,6>: Cost 4 vext1 <6,4,0,1>, <6,4,0,1>
+  2594608436U,	// <4,0,1,7>: Cost 3 vext1 <7,4,0,1>, <7,4,0,1>
+  1618165916U,	// <4,0,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2685714598U,	// <4,0,2,0>: Cost 3 vext3 <0,2,0,4>, <0,2,0,4>
+  3759530159U,	// <4,0,2,1>: Cost 4 vext3 <0,2,1,4>, <0,2,1,4>
+  2685862072U,	// <4,0,2,2>: Cost 3 vext3 <0,2,2,4>, <0,2,2,4>
+  2631476937U,	// <4,0,2,3>: Cost 3 vext2 <2,3,4,0>, <2,3,4,0>
+  2685714636U,	// <4,0,2,4>: Cost 3 vext3 <0,2,0,4>, <0,2,4,6>
+  3765649622U,	// <4,0,2,5>: Cost 4 vext3 <1,2,3,4>, <0,2,5,7>
+  2686157020U,	// <4,0,2,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  3668358453U,	// <4,0,2,7>: Cost 4 vext1 <7,4,0,2>, <7,4,0,2>
+  2686304494U,	// <4,0,2,u>: Cost 3 vext3 <0,2,u,4>, <0,2,u,4>
+  3632529510U,	// <4,0,3,0>: Cost 4 vext1 <1,4,0,3>, LHS
+  2686451968U,	// <4,0,3,1>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2686525705U,	// <4,0,3,2>: Cost 3 vext3 <0,3,2,4>, <0,3,2,4>
+  3760341266U,	// <4,0,3,3>: Cost 4 vext3 <0,3,3,4>, <0,3,3,4>
+  3632532790U,	// <4,0,3,4>: Cost 4 vext1 <1,4,0,3>, RHS
+  3913254606U,	// <4,0,3,5>: Cost 4 vuzpr <3,4,5,0>, <2,3,4,5>
+  3705219740U,	// <4,0,3,6>: Cost 4 vext2 <2,3,4,0>, <3,6,4,7>
+  3713845990U,	// <4,0,3,7>: Cost 4 vext2 <3,7,4,0>, <3,7,4,0>
+  2686451968U,	// <4,0,3,u>: Cost 3 vext3 <0,3,1,4>, <0,3,1,4>
+  2552823910U,	// <4,0,4,0>: Cost 3 vext1 <0,4,0,4>, LHS
+  2691907922U,	// <4,0,4,1>: Cost 3 vext3 <1,2,3,4>, <0,4,1,5>
+  2691907932U,	// <4,0,4,2>: Cost 3 vext3 <1,2,3,4>, <0,4,2,6>
+  3626567830U,	// <4,0,4,3>: Cost 4 vext1 <0,4,0,4>, <3,0,1,2>
+  2552827190U,	// <4,0,4,4>: Cost 3 vext1 <0,4,0,4>, RHS
+  2631478582U,	// <4,0,4,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  3626570017U,	// <4,0,4,6>: Cost 4 vext1 <0,4,0,4>, <6,0,1,2>
+  3668374839U,	// <4,0,4,7>: Cost 4 vext1 <7,4,0,4>, <7,4,0,4>
+  2552829742U,	// <4,0,4,u>: Cost 3 vext1 <0,4,0,4>, LHS
+  2558804070U,	// <4,0,5,0>: Cost 3 vext1 <1,4,0,5>, LHS
+  1839644774U,	// <4,0,5,1>: Cost 2 vzipl RHS, LHS
+  2913386660U,	// <4,0,5,2>: Cost 3 vzipl RHS, <0,2,0,2>
+  2570750420U,	// <4,0,5,3>: Cost 3 vext1 <3,4,0,5>, <3,4,0,5>
+  2558807350U,	// <4,0,5,4>: Cost 3 vext1 <1,4,0,5>, RHS
+  3987128750U,	// <4,0,5,5>: Cost 4 vzipl RHS, <0,5,2,7>
+  3987128822U,	// <4,0,5,6>: Cost 4 vzipl RHS, <0,6,1,7>
+  2594641208U,	// <4,0,5,7>: Cost 3 vext1 <7,4,0,5>, <7,4,0,5>
+  1839645341U,	// <4,0,5,u>: Cost 2 vzipl RHS, LHS
+  2552840294U,	// <4,0,6,0>: Cost 3 vext1 <0,4,0,6>, LHS
+  3047604234U,	// <4,0,6,1>: Cost 3 vtrnl RHS, <0,0,1,1>
+  1973862502U,	// <4,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2570758613U,	// <4,0,6,3>: Cost 3 vext1 <3,4,0,6>, <3,4,0,6>
+  2552843574U,	// <4,0,6,4>: Cost 3 vext1 <0,4,0,6>, RHS
+  2217664887U,	// <4,0,6,5>: Cost 3 vrev <0,4,5,6>
+  3662418528U,	// <4,0,6,6>: Cost 4 vext1 <6,4,0,6>, <6,4,0,6>
+  2658022257U,	// <4,0,6,7>: Cost 3 vext2 <6,7,4,0>, <6,7,4,0>
+  1973862556U,	// <4,0,6,u>: Cost 2 vtrnl RHS, LHS
+  3731764218U,	// <4,0,7,0>: Cost 4 vext2 <6,7,4,0>, <7,0,1,2>
+  3988324454U,	// <4,0,7,1>: Cost 4 vzipl <4,7,5,0>, LHS
+  4122034278U,	// <4,0,7,2>: Cost 4 vtrnl <4,6,7,1>, LHS
+  3735082246U,	// <4,0,7,3>: Cost 4 vext2 <7,3,4,0>, <7,3,4,0>
+  3731764536U,	// <4,0,7,4>: Cost 4 vext2 <6,7,4,0>, <7,4,0,5>
+  3937145718U,	// <4,0,7,5>: Cost 4 vuzpr <7,4,5,0>, <6,7,4,5>
+  3737073145U,	// <4,0,7,6>: Cost 4 vext2 <7,6,4,0>, <7,6,4,0>
+  3731764844U,	// <4,0,7,7>: Cost 4 vext2 <6,7,4,0>, <7,7,7,7>
+  4122034332U,	// <4,0,7,u>: Cost 4 vtrnl <4,6,7,1>, LHS
+  2552856678U,	// <4,0,u,0>: Cost 3 vext1 <0,4,0,u>, LHS
+  1841635430U,	// <4,0,u,1>: Cost 2 vzipl RHS, LHS
+  1618166429U,	// <4,0,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2570774999U,	// <4,0,u,3>: Cost 3 vext1 <3,4,0,u>, <3,4,0,u>
+  2552859958U,	// <4,0,u,4>: Cost 3 vext1 <0,4,0,u>, RHS
+  2631481498U,	// <4,0,u,5>: Cost 3 vext2 <2,3,4,0>, RHS
+  2686157020U,	// <4,0,u,6>: Cost 3 vext3 <0,2,6,4>, <0,2,6,4>
+  2594665787U,	// <4,0,u,7>: Cost 3 vext1 <7,4,0,u>, <7,4,0,u>
+  1618166483U,	// <4,0,u,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2617548837U,	// <4,1,0,0>: Cost 3 vext2 <0,0,4,1>, <0,0,4,1>
+  2622857318U,	// <4,1,0,1>: Cost 3 vext2 <0,u,4,1>, LHS
+  3693281484U,	// <4,1,0,2>: Cost 4 vext2 <0,3,4,1>, <0,2,4,6>
+  2691908342U,	// <4,1,0,3>: Cost 3 vext3 <1,2,3,4>, <1,0,3,2>
+  2622857554U,	// <4,1,0,4>: Cost 3 vext2 <0,u,4,1>, <0,4,1,5>
+  3764470538U,	// <4,1,0,5>: Cost 4 vext3 <1,0,5,4>, <1,0,5,4>
+  3695272459U,	// <4,1,0,6>: Cost 4 vext2 <0,6,4,1>, <0,6,4,1>
+  3733094980U,	// <4,1,0,7>: Cost 4 vext2 <7,0,4,1>, <0,7,1,4>
+  2622857885U,	// <4,1,0,u>: Cost 3 vext2 <0,u,4,1>, LHS
+  3696599798U,	// <4,1,1,0>: Cost 4 vext2 <0,u,4,1>, <1,0,3,2>
+  2691097399U,	// <4,1,1,1>: Cost 3 vext3 <1,1,1,4>, <1,1,1,4>
+  2631484314U,	// <4,1,1,2>: Cost 3 vext2 <2,3,4,1>, <1,2,3,4>
+  2691908424U,	// <4,1,1,3>: Cost 3 vext3 <1,2,3,4>, <1,1,3,3>
+  3696600125U,	// <4,1,1,4>: Cost 4 vext2 <0,u,4,1>, <1,4,3,5>
+  3696600175U,	// <4,1,1,5>: Cost 4 vext2 <0,u,4,1>, <1,5,0,1>
+  3696600307U,	// <4,1,1,6>: Cost 4 vext2 <0,u,4,1>, <1,6,5,7>
+  3668423997U,	// <4,1,1,7>: Cost 4 vext1 <7,4,1,1>, <7,4,1,1>
+  2691908469U,	// <4,1,1,u>: Cost 3 vext3 <1,2,3,4>, <1,1,u,3>
+  2570797158U,	// <4,1,2,0>: Cost 3 vext1 <3,4,1,2>, LHS
+  2570797978U,	// <4,1,2,1>: Cost 3 vext1 <3,4,1,2>, <1,2,3,4>
+  3696600680U,	// <4,1,2,2>: Cost 4 vext2 <0,u,4,1>, <2,2,2,2>
+  1618166682U,	// <4,1,2,3>: Cost 2 vext3 <1,2,3,4>, <1,2,3,4>
+  2570800438U,	// <4,1,2,4>: Cost 3 vext1 <3,4,1,2>, RHS
+  3765650347U,	// <4,1,2,5>: Cost 4 vext3 <1,2,3,4>, <1,2,5,3>
+  3696601018U,	// <4,1,2,6>: Cost 4 vext2 <0,u,4,1>, <2,6,3,7>
+  3668432190U,	// <4,1,2,7>: Cost 4 vext1 <7,4,1,2>, <7,4,1,2>
+  1618535367U,	// <4,1,2,u>: Cost 2 vext3 <1,2,u,4>, <1,2,u,4>
+  2564833382U,	// <4,1,3,0>: Cost 3 vext1 <2,4,1,3>, LHS
+  2691908568U,	// <4,1,3,1>: Cost 3 vext3 <1,2,3,4>, <1,3,1,3>
+  2691908578U,	// <4,1,3,2>: Cost 3 vext3 <1,2,3,4>, <1,3,2,4>
+  2692572139U,	// <4,1,3,3>: Cost 3 vext3 <1,3,3,4>, <1,3,3,4>
+  2564836662U,	// <4,1,3,4>: Cost 3 vext1 <2,4,1,3>, RHS
+  2691908608U,	// <4,1,3,5>: Cost 3 vext3 <1,2,3,4>, <1,3,5,7>
+  2588725862U,	// <4,1,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  3662468090U,	// <4,1,3,7>: Cost 4 vext1 <6,4,1,3>, <7,0,1,2>
+  2691908631U,	// <4,1,3,u>: Cost 3 vext3 <1,2,3,4>, <1,3,u,3>
+  3760194590U,	// <4,1,4,0>: Cost 4 vext3 <0,3,1,4>, <1,4,0,1>
+  3693947874U,	// <4,1,4,1>: Cost 4 vext2 <0,4,4,1>, <4,1,5,0>
+  3765650484U,	// <4,1,4,2>: Cost 4 vext3 <1,2,3,4>, <1,4,2,5>
+  3113877606U,	// <4,1,4,3>: Cost 3 vtrnr <4,4,4,4>, LHS
+  3760194630U,	// <4,1,4,4>: Cost 4 vext3 <0,3,1,4>, <1,4,4,5>
+  2622860598U,	// <4,1,4,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  3297436759U,	// <4,1,4,6>: Cost 4 vrev <1,4,6,4>
+  3800007772U,	// <4,1,4,7>: Cost 4 vext3 <7,0,1,4>, <1,4,7,0>
+  2622860841U,	// <4,1,4,u>: Cost 3 vext2 <0,u,4,1>, RHS
+  1479164006U,	// <4,1,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552906486U,	// <4,1,5,1>: Cost 3 vext1 <0,4,1,5>, <1,0,3,2>
+  2552907299U,	// <4,1,5,2>: Cost 3 vext1 <0,4,1,5>, <2,1,3,5>
+  2552907926U,	// <4,1,5,3>: Cost 3 vext1 <0,4,1,5>, <3,0,1,2>
+  1479167286U,	// <4,1,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  2913387664U,	// <4,1,5,5>: Cost 3 vzipl RHS, <1,5,3,7>
+  2600686074U,	// <4,1,5,6>: Cost 3 vext1 <u,4,1,5>, <6,2,7,3>
+  2600686586U,	// <4,1,5,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479169838U,	// <4,1,5,u>: Cost 2 vext1 <0,4,1,5>, LHS
+  2552914022U,	// <4,1,6,0>: Cost 3 vext1 <0,4,1,6>, LHS
+  2558886708U,	// <4,1,6,1>: Cost 3 vext1 <1,4,1,6>, <1,1,1,1>
+  4028205206U,	// <4,1,6,2>: Cost 4 vzipr <0,2,4,6>, <3,0,1,2>
+  3089858662U,	// <4,1,6,3>: Cost 3 vtrnr <0,4,2,6>, LHS
+  2552917302U,	// <4,1,6,4>: Cost 3 vext1 <0,4,1,6>, RHS
+  2223637584U,	// <4,1,6,5>: Cost 3 vrev <1,4,5,6>
+  4121347081U,	// <4,1,6,6>: Cost 4 vtrnl RHS, <1,3,6,7>
+  3721155406U,	// <4,1,6,7>: Cost 4 vext2 <5,0,4,1>, <6,7,0,1>
+  2552919854U,	// <4,1,6,u>: Cost 3 vext1 <0,4,1,6>, LHS
+  2659357716U,	// <4,1,7,0>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  3733763173U,	// <4,1,7,1>: Cost 4 vext2 <7,1,4,1>, <7,1,4,1>
+  3734426806U,	// <4,1,7,2>: Cost 4 vext2 <7,2,4,1>, <7,2,4,1>
+  2695226671U,	// <4,1,7,3>: Cost 3 vext3 <1,7,3,4>, <1,7,3,4>
+  3721155942U,	// <4,1,7,4>: Cost 4 vext2 <5,0,4,1>, <7,4,5,6>
+  3721155976U,	// <4,1,7,5>: Cost 4 vext2 <5,0,4,1>, <7,5,0,4>
+  3662500458U,	// <4,1,7,6>: Cost 4 vext1 <6,4,1,7>, <6,4,1,7>
+  3721156204U,	// <4,1,7,7>: Cost 4 vext2 <5,0,4,1>, <7,7,7,7>
+  2659357716U,	// <4,1,7,u>: Cost 3 vext2 <7,0,4,1>, <7,0,4,1>
+  1479188582U,	// <4,1,u,0>: Cost 2 vext1 <0,4,1,u>, LHS
+  2552931062U,	// <4,1,u,1>: Cost 3 vext1 <0,4,1,u>, <1,0,3,2>
+  2552931944U,	// <4,1,u,2>: Cost 3 vext1 <0,4,1,u>, <2,2,2,2>
+  1622148480U,	// <4,1,u,3>: Cost 2 vext3 <1,u,3,4>, <1,u,3,4>
+  1479191862U,	// <4,1,u,4>: Cost 2 vext1 <0,4,1,u>, RHS
+  2622863514U,	// <4,1,u,5>: Cost 3 vext2 <0,u,4,1>, RHS
+  2588725862U,	// <4,1,u,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2600686586U,	// <4,1,u,7>: Cost 3 vext1 <u,4,1,5>, <7,0,1,2>
+  1479194414U,	// <4,1,u,u>: Cost 2 vext1 <0,4,1,u>, LHS
+  2617557030U,	// <4,2,0,0>: Cost 3 vext2 <0,0,4,2>, <0,0,4,2>
+  2622865510U,	// <4,2,0,1>: Cost 3 vext2 <0,u,4,2>, LHS
+  2622865612U,	// <4,2,0,2>: Cost 3 vext2 <0,u,4,2>, <0,2,4,6>
+  3693289753U,	// <4,2,0,3>: Cost 4 vext2 <0,3,4,2>, <0,3,4,2>
+  2635473244U,	// <4,2,0,4>: Cost 3 vext2 <3,0,4,2>, <0,4,2,6>
+  3765650918U,	// <4,2,0,5>: Cost 4 vext3 <1,2,3,4>, <2,0,5,7>
+  2696775148U,	// <4,2,0,6>: Cost 3 vext3 <2,0,6,4>, <2,0,6,4>
+  3695944285U,	// <4,2,0,7>: Cost 4 vext2 <0,7,4,2>, <0,7,4,2>
+  2622866077U,	// <4,2,0,u>: Cost 3 vext2 <0,u,4,2>, LHS
+  3696607990U,	// <4,2,1,0>: Cost 4 vext2 <0,u,4,2>, <1,0,3,2>
+  3696608052U,	// <4,2,1,1>: Cost 4 vext2 <0,u,4,2>, <1,1,1,1>
+  3696608150U,	// <4,2,1,2>: Cost 4 vext2 <0,u,4,2>, <1,2,3,0>
+  3895574630U,	// <4,2,1,3>: Cost 4 vuzpr <0,4,u,2>, LHS
+  2691909162U,	// <4,2,1,4>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608400U,	// <4,2,1,5>: Cost 4 vext2 <0,u,4,2>, <1,5,3,7>
+  3760784956U,	// <4,2,1,6>: Cost 4 vext3 <0,4,0,4>, <2,1,6,3>
+  3773908549U,	// <4,2,1,7>: Cost 5 vext3 <2,5,7,4>, <2,1,7,3>
+  2691909162U,	// <4,2,1,u>: Cost 3 vext3 <1,2,3,4>, <2,1,4,3>
+  3696608748U,	// <4,2,2,0>: Cost 4 vext2 <0,u,4,2>, <2,0,6,4>
+  3696608828U,	// <4,2,2,1>: Cost 4 vext2 <0,u,4,2>, <2,1,6,3>
+  2691909224U,	// <4,2,2,2>: Cost 3 vext3 <1,2,3,4>, <2,2,2,2>
+  2691909234U,	// <4,2,2,3>: Cost 3 vext3 <1,2,3,4>, <2,2,3,3>
+  3759605368U,	// <4,2,2,4>: Cost 4 vext3 <0,2,2,4>, <2,2,4,0>
+  3696609156U,	// <4,2,2,5>: Cost 4 vext2 <0,u,4,2>, <2,5,6,7>
+  3760785040U,	// <4,2,2,6>: Cost 4 vext3 <0,4,0,4>, <2,2,6,6>
+  3668505927U,	// <4,2,2,7>: Cost 4 vext1 <7,4,2,2>, <7,4,2,2>
+  2691909279U,	// <4,2,2,u>: Cost 3 vext3 <1,2,3,4>, <2,2,u,3>
+  2691909286U,	// <4,2,3,0>: Cost 3 vext3 <1,2,3,4>, <2,3,0,1>
+  3764840111U,	// <4,2,3,1>: Cost 4 vext3 <1,1,1,4>, <2,3,1,1>
+  3765651129U,	// <4,2,3,2>: Cost 4 vext3 <1,2,3,4>, <2,3,2,2>
+  2698544836U,	// <4,2,3,3>: Cost 3 vext3 <2,3,3,4>, <2,3,3,4>
+  2685863630U,	// <4,2,3,4>: Cost 3 vext3 <0,2,2,4>, <2,3,4,5>
+  2698692310U,	// <4,2,3,5>: Cost 3 vext3 <2,3,5,4>, <2,3,5,4>
+  3772507871U,	// <4,2,3,6>: Cost 4 vext3 <2,3,6,4>, <2,3,6,4>
+  2698839784U,	// <4,2,3,7>: Cost 3 vext3 <2,3,7,4>, <2,3,7,4>
+  2691909358U,	// <4,2,3,u>: Cost 3 vext3 <1,2,3,4>, <2,3,u,1>
+  2564915302U,	// <4,2,4,0>: Cost 3 vext1 <2,4,2,4>, LHS
+  2564916122U,	// <4,2,4,1>: Cost 3 vext1 <2,4,2,4>, <1,2,3,4>
+  2564917004U,	// <4,2,4,2>: Cost 3 vext1 <2,4,2,4>, <2,4,2,4>
+  2699208469U,	// <4,2,4,3>: Cost 3 vext3 <2,4,3,4>, <2,4,3,4>
+  2564918582U,	// <4,2,4,4>: Cost 3 vext1 <2,4,2,4>, RHS
+  2622868790U,	// <4,2,4,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229667632U,	// <4,2,4,6>: Cost 3 vrev <2,4,6,4>
+  3800082229U,	// <4,2,4,7>: Cost 4 vext3 <7,0,2,4>, <2,4,7,0>
+  2622869033U,	// <4,2,4,u>: Cost 3 vext2 <0,u,4,2>, RHS
+  2552979558U,	// <4,2,5,0>: Cost 3 vext1 <0,4,2,5>, LHS
+  2558952342U,	// <4,2,5,1>: Cost 3 vext1 <1,4,2,5>, <1,2,3,0>
+  2564925032U,	// <4,2,5,2>: Cost 3 vext1 <2,4,2,5>, <2,2,2,2>
+  2967060582U,	// <4,2,5,3>: Cost 3 vzipr <2,3,4,5>, LHS
+  2552982838U,	// <4,2,5,4>: Cost 3 vext1 <0,4,2,5>, RHS
+  3987130190U,	// <4,2,5,5>: Cost 4 vzipl RHS, <2,5,0,7>
+  2913388474U,	// <4,2,5,6>: Cost 3 vzipl RHS, <2,6,3,7>
+  3895577910U,	// <4,2,5,7>: Cost 4 vuzpr <0,4,u,2>, RHS
+  2552985390U,	// <4,2,5,u>: Cost 3 vext1 <0,4,2,5>, LHS
+  1479245926U,	// <4,2,6,0>: Cost 2 vext1 <0,4,2,6>, LHS
+  2552988406U,	// <4,2,6,1>: Cost 3 vext1 <0,4,2,6>, <1,0,3,2>
+  2552989288U,	// <4,2,6,2>: Cost 3 vext1 <0,4,2,6>, <2,2,2,2>
+  2954461286U,	// <4,2,6,3>: Cost 3 vzipr <0,2,4,6>, LHS
+  1479249206U,	// <4,2,6,4>: Cost 2 vext1 <0,4,2,6>, RHS
+  2229610281U,	// <4,2,6,5>: Cost 3 vrev <2,4,5,6>
+  2600767994U,	// <4,2,6,6>: Cost 3 vext1 <u,4,2,6>, <6,2,7,3>
+  2600768506U,	// <4,2,6,7>: Cost 3 vext1 <u,4,2,6>, <7,0,1,2>
+  1479251758U,	// <4,2,6,u>: Cost 2 vext1 <0,4,2,6>, LHS
+  2659365909U,	// <4,2,7,0>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  3733771366U,	// <4,2,7,1>: Cost 4 vext2 <7,1,4,2>, <7,1,4,2>
+  3734434999U,	// <4,2,7,2>: Cost 4 vext2 <7,2,4,2>, <7,2,4,2>
+  2701199368U,	// <4,2,7,3>: Cost 3 vext3 <2,7,3,4>, <2,7,3,4>
+  4175774618U,	// <4,2,7,4>: Cost 4 vtrnr <2,4,5,7>, <1,2,3,4>
+  3303360298U,	// <4,2,7,5>: Cost 4 vrev <2,4,5,7>
+  3727136217U,	// <4,2,7,6>: Cost 4 vext2 <6,0,4,2>, <7,6,0,4>
+  3727136364U,	// <4,2,7,7>: Cost 4 vext2 <6,0,4,2>, <7,7,7,7>
+  2659365909U,	// <4,2,7,u>: Cost 3 vext2 <7,0,4,2>, <7,0,4,2>
+  1479262310U,	// <4,2,u,0>: Cost 2 vext1 <0,4,2,u>, LHS
+  2553004790U,	// <4,2,u,1>: Cost 3 vext1 <0,4,2,u>, <1,0,3,2>
+  2553005672U,	// <4,2,u,2>: Cost 3 vext1 <0,4,2,u>, <2,2,2,2>
+  2954477670U,	// <4,2,u,3>: Cost 3 vzipr <0,2,4,u>, LHS
+  1479265590U,	// <4,2,u,4>: Cost 2 vext1 <0,4,2,u>, RHS
+  2622871706U,	// <4,2,u,5>: Cost 3 vext2 <0,u,4,2>, RHS
+  2229700404U,	// <4,2,u,6>: Cost 3 vrev <2,4,6,u>
+  2600784890U,	// <4,2,u,7>: Cost 3 vext1 <u,4,2,u>, <7,0,1,2>
+  1479268142U,	// <4,2,u,u>: Cost 2 vext1 <0,4,2,u>, LHS
+  3765651595U,	// <4,3,0,0>: Cost 4 vext3 <1,2,3,4>, <3,0,0,0>
+  2691909782U,	// <4,3,0,1>: Cost 3 vext3 <1,2,3,4>, <3,0,1,2>
+  2702452897U,	// <4,3,0,2>: Cost 3 vext3 <3,0,2,4>, <3,0,2,4>
+  3693297946U,	// <4,3,0,3>: Cost 4 vext2 <0,3,4,3>, <0,3,4,3>
+  3760711856U,	// <4,3,0,4>: Cost 4 vext3 <0,3,u,4>, <3,0,4,1>
+  2235533820U,	// <4,3,0,5>: Cost 3 vrev <3,4,5,0>
+  3309349381U,	// <4,3,0,6>: Cost 4 vrev <3,4,6,0>
+  3668563278U,	// <4,3,0,7>: Cost 4 vext1 <7,4,3,0>, <7,4,3,0>
+  2691909845U,	// <4,3,0,u>: Cost 3 vext3 <1,2,3,4>, <3,0,u,2>
+  2235173328U,	// <4,3,1,0>: Cost 3 vrev <3,4,0,1>
+  3764840678U,	// <4,3,1,1>: Cost 4 vext3 <1,1,1,4>, <3,1,1,1>
+  2630173594U,	// <4,3,1,2>: Cost 3 vext2 <2,1,4,3>, <1,2,3,4>
+  2703190267U,	// <4,3,1,3>: Cost 3 vext3 <3,1,3,4>, <3,1,3,4>
+  3760195840U,	// <4,3,1,4>: Cost 4 vext3 <0,3,1,4>, <3,1,4,0>
+  3765651724U,	// <4,3,1,5>: Cost 4 vext3 <1,2,3,4>, <3,1,5,3>
+  3309357574U,	// <4,3,1,6>: Cost 4 vrev <3,4,6,1>
+  3769633054U,	// <4,3,1,7>: Cost 4 vext3 <1,u,3,4>, <3,1,7,3>
+  2703558952U,	// <4,3,1,u>: Cost 3 vext3 <3,1,u,4>, <3,1,u,4>
+  3626770534U,	// <4,3,2,0>: Cost 4 vext1 <0,4,3,2>, LHS
+  2630174250U,	// <4,3,2,1>: Cost 3 vext2 <2,1,4,3>, <2,1,4,3>
+  3765651777U,	// <4,3,2,2>: Cost 4 vext3 <1,2,3,4>, <3,2,2,2>
+  2703853900U,	// <4,3,2,3>: Cost 3 vext3 <3,2,3,4>, <3,2,3,4>
+  3626773814U,	// <4,3,2,4>: Cost 4 vext1 <0,4,3,2>, RHS
+  2704001374U,	// <4,3,2,5>: Cost 3 vext3 <3,2,5,4>, <3,2,5,4>
+  3765651814U,	// <4,3,2,6>: Cost 4 vext3 <1,2,3,4>, <3,2,6,3>
+  3769633135U,	// <4,3,2,7>: Cost 4 vext3 <1,u,3,4>, <3,2,7,3>
+  2634819681U,	// <4,3,2,u>: Cost 3 vext2 <2,u,4,3>, <2,u,4,3>
+  3765651839U,	// <4,3,3,0>: Cost 4 vext3 <1,2,3,4>, <3,3,0,1>
+  3765651848U,	// <4,3,3,1>: Cost 4 vext3 <1,2,3,4>, <3,3,1,1>
+  3710552404U,	// <4,3,3,2>: Cost 4 vext2 <3,2,4,3>, <3,2,4,3>
+  2691910044U,	// <4,3,3,3>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2704591270U,	// <4,3,3,4>: Cost 3 vext3 <3,3,4,4>, <3,3,4,4>
+  3769633202U,	// <4,3,3,5>: Cost 4 vext3 <1,u,3,4>, <3,3,5,7>
+  3703917212U,	// <4,3,3,6>: Cost 4 vext2 <2,1,4,3>, <3,6,4,7>
+  3769633220U,	// <4,3,3,7>: Cost 4 vext3 <1,u,3,4>, <3,3,7,7>
+  2691910044U,	// <4,3,3,u>: Cost 3 vext3 <1,2,3,4>, <3,3,3,3>
+  2691910096U,	// <4,3,4,0>: Cost 3 vext3 <1,2,3,4>, <3,4,0,1>
+  2691910106U,	// <4,3,4,1>: Cost 3 vext3 <1,2,3,4>, <3,4,1,2>
+  2564990741U,	// <4,3,4,2>: Cost 3 vext1 <2,4,3,4>, <2,4,3,4>
+  3765651946U,	// <4,3,4,3>: Cost 4 vext3 <1,2,3,4>, <3,4,3,0>
+  2691910136U,	// <4,3,4,4>: Cost 3 vext3 <1,2,3,4>, <3,4,4,5>
+  2686454274U,	// <4,3,4,5>: Cost 3 vext3 <0,3,1,4>, <3,4,5,6>
+  2235640329U,	// <4,3,4,6>: Cost 3 vrev <3,4,6,4>
+  3801483792U,	// <4,3,4,7>: Cost 4 vext3 <7,2,3,4>, <3,4,7,2>
+  2691910168U,	// <4,3,4,u>: Cost 3 vext3 <1,2,3,4>, <3,4,u,1>
+  2559025254U,	// <4,3,5,0>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559026237U,	// <4,3,5,1>: Cost 3 vext1 <1,4,3,5>, <1,4,3,5>
+  2564998862U,	// <4,3,5,2>: Cost 3 vext1 <2,4,3,5>, <2,3,4,5>
+  2570971548U,	// <4,3,5,3>: Cost 3 vext1 <3,4,3,5>, <3,3,3,3>
+  2559028534U,	// <4,3,5,4>: Cost 3 vext1 <1,4,3,5>, RHS
+  4163519477U,	// <4,3,5,5>: Cost 4 vtrnr <0,4,1,5>, <1,3,4,5>
+  3309390346U,	// <4,3,5,6>: Cost 4 vrev <3,4,6,5>
+  2706139747U,	// <4,3,5,7>: Cost 3 vext3 <3,5,7,4>, <3,5,7,4>
+  2559031086U,	// <4,3,5,u>: Cost 3 vext1 <1,4,3,5>, LHS
+  2559033446U,	// <4,3,6,0>: Cost 3 vext1 <1,4,3,6>, LHS
+  2559034430U,	// <4,3,6,1>: Cost 3 vext1 <1,4,3,6>, <1,4,3,6>
+  2565007127U,	// <4,3,6,2>: Cost 3 vext1 <2,4,3,6>, <2,4,3,6>
+  2570979740U,	// <4,3,6,3>: Cost 3 vext1 <3,4,3,6>, <3,3,3,3>
+  2559036726U,	// <4,3,6,4>: Cost 3 vext1 <1,4,3,6>, RHS
+  1161841154U,	// <4,3,6,5>: Cost 2 vrev <3,4,5,6>
+  4028203932U,	// <4,3,6,6>: Cost 4 vzipr <0,2,4,6>, <1,2,3,6>
+  2706803380U,	// <4,3,6,7>: Cost 3 vext3 <3,6,7,4>, <3,6,7,4>
+  1162062365U,	// <4,3,6,u>: Cost 2 vrev <3,4,u,6>
+  3769633475U,	// <4,3,7,0>: Cost 4 vext3 <1,u,3,4>, <3,7,0,1>
+  3769633488U,	// <4,3,7,1>: Cost 4 vext3 <1,u,3,4>, <3,7,1,5>
+  3638757144U,	// <4,3,7,2>: Cost 4 vext1 <2,4,3,7>, <2,4,3,7>
+  3769633508U,	// <4,3,7,3>: Cost 4 vext3 <1,u,3,4>, <3,7,3,7>
+  3769633515U,	// <4,3,7,4>: Cost 4 vext3 <1,u,3,4>, <3,7,4,5>
+  3769633526U,	// <4,3,7,5>: Cost 4 vext3 <1,u,3,4>, <3,7,5,7>
+  3662647932U,	// <4,3,7,6>: Cost 4 vext1 <6,4,3,7>, <6,4,3,7>
+  3781208837U,	// <4,3,7,7>: Cost 4 vext3 <3,7,7,4>, <3,7,7,4>
+  3769633547U,	// <4,3,7,u>: Cost 4 vext3 <1,u,3,4>, <3,7,u,1>
+  2559049830U,	// <4,3,u,0>: Cost 3 vext1 <1,4,3,u>, LHS
+  2691910430U,	// <4,3,u,1>: Cost 3 vext3 <1,2,3,4>, <3,u,1,2>
+  2565023513U,	// <4,3,u,2>: Cost 3 vext1 <2,4,3,u>, <2,4,3,u>
+  2707835698U,	// <4,3,u,3>: Cost 3 vext3 <3,u,3,4>, <3,u,3,4>
+  2559053110U,	// <4,3,u,4>: Cost 3 vext1 <1,4,3,u>, RHS
+  1161857540U,	// <4,3,u,5>: Cost 2 vrev <3,4,5,u>
+  2235673101U,	// <4,3,u,6>: Cost 3 vrev <3,4,6,u>
+  2708130646U,	// <4,3,u,7>: Cost 3 vext3 <3,u,7,4>, <3,u,7,4>
+  1162078751U,	// <4,3,u,u>: Cost 2 vrev <3,4,u,u>
+  2617573416U,	// <4,4,0,0>: Cost 3 vext2 <0,0,4,4>, <0,0,4,4>
+  1570373734U,	// <4,4,0,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779676774U,	// <4,4,0,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  3760196480U,	// <4,4,0,3>: Cost 4 vext3 <0,3,1,4>, <4,0,3,1>
+  2576977100U,	// <4,4,0,4>: Cost 3 vext1 <4,4,4,0>, <4,4,4,0>
+  2718747538U,	// <4,4,0,5>: Cost 3 vext3 <5,6,7,4>, <4,0,5,1>
+  2718747548U,	// <4,4,0,6>: Cost 3 vext3 <5,6,7,4>, <4,0,6,2>
+  3668637015U,	// <4,4,0,7>: Cost 4 vext1 <7,4,4,0>, <7,4,4,0>
+  1570374301U,	// <4,4,0,u>: Cost 2 vext2 <4,4,4,4>, LHS
+  2644116214U,	// <4,4,1,0>: Cost 3 vext2 <4,4,4,4>, <1,0,3,2>
+  2644116276U,	// <4,4,1,1>: Cost 3 vext2 <4,4,4,4>, <1,1,1,1>
+  2691910602U,	// <4,4,1,2>: Cost 3 vext3 <1,2,3,4>, <4,1,2,3>
+  2644116440U,	// <4,4,1,3>: Cost 3 vext2 <4,4,4,4>, <1,3,1,3>
+  2711227356U,	// <4,4,1,4>: Cost 3 vext3 <4,4,4,4>, <4,1,4,3>
+  2709310438U,	// <4,4,1,5>: Cost 3 vext3 <4,1,5,4>, <4,1,5,4>
+  3765652462U,	// <4,4,1,6>: Cost 4 vext3 <1,2,3,4>, <4,1,6,3>
+  3768970231U,	// <4,4,1,7>: Cost 4 vext3 <1,7,3,4>, <4,1,7,3>
+  2695891968U,	// <4,4,1,u>: Cost 3 vext3 <1,u,3,4>, <4,1,u,3>
+  3703260634U,	// <4,4,2,0>: Cost 4 vext2 <2,0,4,4>, <2,0,4,4>
+  3765652499U,	// <4,4,2,1>: Cost 4 vext3 <1,2,3,4>, <4,2,1,4>
+  2644117096U,	// <4,4,2,2>: Cost 3 vext2 <4,4,4,4>, <2,2,2,2>
+  2631509709U,	// <4,4,2,3>: Cost 3 vext2 <2,3,4,4>, <2,3,4,4>
+  2644117269U,	// <4,4,2,4>: Cost 3 vext2 <4,4,4,4>, <2,4,3,4>
+  3705251698U,	// <4,4,2,5>: Cost 4 vext2 <2,3,4,4>, <2,5,4,7>
+  2710047808U,	// <4,4,2,6>: Cost 3 vext3 <4,2,6,4>, <4,2,6,4>
+  3783863369U,	// <4,4,2,7>: Cost 4 vext3 <4,2,7,4>, <4,2,7,4>
+  2634827874U,	// <4,4,2,u>: Cost 3 vext2 <2,u,4,4>, <2,u,4,4>
+  2644117654U,	// <4,4,3,0>: Cost 3 vext2 <4,4,4,4>, <3,0,1,2>
+  3638797210U,	// <4,4,3,1>: Cost 4 vext1 <2,4,4,3>, <1,2,3,4>
+  3638798082U,	// <4,4,3,2>: Cost 4 vext1 <2,4,4,3>, <2,4,1,3>
+  2637482406U,	// <4,4,3,3>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  2638146039U,	// <4,4,3,4>: Cost 3 vext2 <3,4,4,4>, <3,4,4,4>
+  3913287374U,	// <4,4,3,5>: Cost 4 vuzpr <3,4,5,4>, <2,3,4,5>
+  3765652625U,	// <4,4,3,6>: Cost 4 vext3 <1,2,3,4>, <4,3,6,4>
+  3713878762U,	// <4,4,3,7>: Cost 4 vext2 <3,7,4,4>, <3,7,4,4>
+  2637482406U,	// <4,4,3,u>: Cost 3 vext2 <3,3,4,4>, <3,3,4,4>
+  1503264870U,	// <4,4,4,0>: Cost 2 vext1 <4,4,4,4>, LHS
+  2577007514U,	// <4,4,4,1>: Cost 3 vext1 <4,4,4,4>, <1,2,3,4>
+  2577008232U,	// <4,4,4,2>: Cost 3 vext1 <4,4,4,4>, <2,2,2,2>
+  2571037175U,	// <4,4,4,3>: Cost 3 vext1 <3,4,4,4>, <3,4,4,4>
+  161926454U,	// <4,4,4,4>: Cost 1 vdup0 RHS
+  1570377014U,	// <4,4,4,5>: Cost 2 vext2 <4,4,4,4>, RHS
+  2779680054U,	// <4,4,4,6>: Cost 3 vuzpl <4,6,4,6>, RHS
+  2594927963U,	// <4,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U,	// <4,4,4,u>: Cost 1 vdup0 RHS
+  2571042918U,	// <4,4,5,0>: Cost 3 vext1 <3,4,4,5>, LHS
+  2571043738U,	// <4,4,5,1>: Cost 3 vext1 <3,4,4,5>, <1,2,3,4>
+  3638814495U,	// <4,4,5,2>: Cost 4 vext1 <2,4,4,5>, <2,4,4,5>
+  2571045368U,	// <4,4,5,3>: Cost 3 vext1 <3,4,4,5>, <3,4,4,5>
+  2571046198U,	// <4,4,5,4>: Cost 3 vext1 <3,4,4,5>, RHS
+  1839648054U,	// <4,4,5,5>: Cost 2 vzipl RHS, RHS
+  1618169142U,	// <4,4,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594936156U,	// <4,4,5,7>: Cost 3 vext1 <7,4,4,5>, <7,4,4,5>
+  1618169160U,	// <4,4,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  2553135206U,	// <4,4,6,0>: Cost 3 vext1 <0,4,4,6>, LHS
+  3626877686U,	// <4,4,6,1>: Cost 4 vext1 <0,4,4,6>, <1,0,3,2>
+  2565080782U,	// <4,4,6,2>: Cost 3 vext1 <2,4,4,6>, <2,3,4,5>
+  2571053561U,	// <4,4,6,3>: Cost 3 vext1 <3,4,4,6>, <3,4,4,6>
+  2553138486U,	// <4,4,6,4>: Cost 3 vext1 <0,4,4,6>, RHS
+  2241555675U,	// <4,4,6,5>: Cost 3 vrev <4,4,5,6>
+  1973865782U,	// <4,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2658055029U,	// <4,4,6,7>: Cost 3 vext2 <6,7,4,4>, <6,7,4,4>
+  1973865800U,	// <4,4,6,u>: Cost 2 vtrnl RHS, RHS
+  2644120570U,	// <4,4,7,0>: Cost 3 vext2 <4,4,4,4>, <7,0,1,2>
+  3638829978U,	// <4,4,7,1>: Cost 4 vext1 <2,4,4,7>, <1,2,3,4>
+  3638830881U,	// <4,4,7,2>: Cost 4 vext1 <2,4,4,7>, <2,4,4,7>
+  3735115018U,	// <4,4,7,3>: Cost 4 vext2 <7,3,4,4>, <7,3,4,4>
+  2662036827U,	// <4,4,7,4>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  2713292236U,	// <4,4,7,5>: Cost 3 vext3 <4,7,5,4>, <4,7,5,4>
+  2713365973U,	// <4,4,7,6>: Cost 3 vext3 <4,7,6,4>, <4,7,6,4>
+  2644121196U,	// <4,4,7,7>: Cost 3 vext2 <4,4,4,4>, <7,7,7,7>
+  2662036827U,	// <4,4,7,u>: Cost 3 vext2 <7,4,4,4>, <7,4,4,4>
+  1503297638U,	// <4,4,u,0>: Cost 2 vext1 <4,4,4,u>, LHS
+  1570379566U,	// <4,4,u,1>: Cost 2 vext2 <4,4,4,4>, LHS
+  2779682606U,	// <4,4,u,2>: Cost 3 vuzpl <4,6,4,6>, LHS
+  2571069947U,	// <4,4,u,3>: Cost 3 vext1 <3,4,4,u>, <3,4,4,u>
+  161926454U,	// <4,4,u,4>: Cost 1 vdup0 RHS
+  1841638710U,	// <4,4,u,5>: Cost 2 vzipl RHS, RHS
+  1618169385U,	// <4,4,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  2594960735U,	// <4,4,u,7>: Cost 3 vext1 <7,4,4,u>, <7,4,4,u>
+  161926454U,	// <4,4,u,u>: Cost 1 vdup0 RHS
+  2631516160U,	// <4,5,0,0>: Cost 3 vext2 <2,3,4,5>, <0,0,0,0>
+  1557774438U,	// <4,5,0,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2618908875U,	// <4,5,0,2>: Cost 3 vext2 <0,2,4,5>, <0,2,4,5>
+  2571078140U,	// <4,5,0,3>: Cost 3 vext1 <3,4,5,0>, <3,4,5,0>
+  2626871634U,	// <4,5,0,4>: Cost 3 vext2 <1,5,4,5>, <0,4,1,5>
+  3705258414U,	// <4,5,0,5>: Cost 4 vext2 <2,3,4,5>, <0,5,2,7>
+  2594968438U,	// <4,5,0,6>: Cost 3 vext1 <7,4,5,0>, <6,7,4,5>
+  2594968928U,	// <4,5,0,7>: Cost 3 vext1 <7,4,5,0>, <7,4,5,0>
+  1557775005U,	// <4,5,0,u>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631516918U,	// <4,5,1,0>: Cost 3 vext2 <2,3,4,5>, <1,0,3,2>
+  2624217939U,	// <4,5,1,1>: Cost 3 vext2 <1,1,4,5>, <1,1,4,5>
+  2631517078U,	// <4,5,1,2>: Cost 3 vext2 <2,3,4,5>, <1,2,3,0>
+  2821341286U,	// <4,5,1,3>: Cost 3 vuzpr <0,4,1,5>, LHS
+  3895086054U,	// <4,5,1,4>: Cost 4 vuzpr <0,4,1,5>, <4,1,5,4>
+  2626872471U,	// <4,5,1,5>: Cost 3 vext2 <1,5,4,5>, <1,5,4,5>
+  3895083131U,	// <4,5,1,6>: Cost 4 vuzpr <0,4,1,5>, <0,1,4,6>
+  2718748368U,	// <4,5,1,7>: Cost 3 vext3 <5,6,7,4>, <5,1,7,3>
+  2821341291U,	// <4,5,1,u>: Cost 3 vuzpr <0,4,1,5>, LHS
+  2571092070U,	// <4,5,2,0>: Cost 3 vext1 <3,4,5,2>, LHS
+  3699287585U,	// <4,5,2,1>: Cost 4 vext2 <1,3,4,5>, <2,1,3,3>
+  2630854269U,	// <4,5,2,2>: Cost 3 vext2 <2,2,4,5>, <2,2,4,5>
+  1557776078U,	// <4,5,2,3>: Cost 2 vext2 <2,3,4,5>, <2,3,4,5>
+  2631517974U,	// <4,5,2,4>: Cost 3 vext2 <2,3,4,5>, <2,4,3,5>
+  3692652384U,	// <4,5,2,5>: Cost 4 vext2 <0,2,4,5>, <2,5,2,7>
+  2631518138U,	// <4,5,2,6>: Cost 3 vext2 <2,3,4,5>, <2,6,3,7>
+  4164013366U,	// <4,5,2,7>: Cost 4 vtrnr <0,4,u,2>, RHS
+  1561094243U,	// <4,5,2,u>: Cost 2 vext2 <2,u,4,5>, <2,u,4,5>
+  2631518358U,	// <4,5,3,0>: Cost 3 vext2 <2,3,4,5>, <3,0,1,2>
+  3895084710U,	// <4,5,3,1>: Cost 4 vuzpr <0,4,1,5>, <2,3,0,1>
+  2631518540U,	// <4,5,3,2>: Cost 3 vext2 <2,3,4,5>, <3,2,3,4>
+  2631518620U,	// <4,5,3,3>: Cost 3 vext2 <2,3,4,5>, <3,3,3,3>
+  2631518716U,	// <4,5,3,4>: Cost 3 vext2 <2,3,4,5>, <3,4,5,0>
+  2631518784U,	// <4,5,3,5>: Cost 3 vext2 <2,3,4,5>, <3,5,3,5>
+  2658060980U,	// <4,5,3,6>: Cost 3 vext2 <6,7,4,5>, <3,6,7,4>
+  2640145131U,	// <4,5,3,7>: Cost 3 vext2 <3,7,4,5>, <3,7,4,5>
+  2631519006U,	// <4,5,3,u>: Cost 3 vext2 <2,3,4,5>, <3,u,1,2>
+  2571108454U,	// <4,5,4,0>: Cost 3 vext1 <3,4,5,4>, LHS
+  3632907342U,	// <4,5,4,1>: Cost 4 vext1 <1,4,5,4>, <1,4,5,4>
+  2571110094U,	// <4,5,4,2>: Cost 3 vext1 <3,4,5,4>, <2,3,4,5>
+  2571110912U,	// <4,5,4,3>: Cost 3 vext1 <3,4,5,4>, <3,4,5,4>
+  2571111734U,	// <4,5,4,4>: Cost 3 vext1 <3,4,5,4>, RHS
+  1557777718U,	// <4,5,4,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2645454195U,	// <4,5,4,6>: Cost 3 vext2 <4,6,4,5>, <4,6,4,5>
+  2718748614U,	// <4,5,4,7>: Cost 3 vext3 <5,6,7,4>, <5,4,7,6>
+  1557777961U,	// <4,5,4,u>: Cost 2 vext2 <2,3,4,5>, RHS
+  1503346790U,	// <4,5,5,0>: Cost 2 vext1 <4,4,5,5>, LHS
+  2913398480U,	// <4,5,5,1>: Cost 3 vzipl RHS, <5,1,7,3>
+  2631519998U,	// <4,5,5,2>: Cost 3 vext2 <2,3,4,5>, <5,2,3,4>
+  2577090710U,	// <4,5,5,3>: Cost 3 vext1 <4,4,5,5>, <3,0,1,2>
+  1503349978U,	// <4,5,5,4>: Cost 2 vext1 <4,4,5,5>, <4,4,5,5>
+  2631520260U,	// <4,5,5,5>: Cost 3 vext2 <2,3,4,5>, <5,5,5,5>
+  2913390690U,	// <4,5,5,6>: Cost 3 vzipl RHS, <5,6,7,0>
+  2821344566U,	// <4,5,5,7>: Cost 3 vuzpr <0,4,1,5>, RHS
+  1503352622U,	// <4,5,5,u>: Cost 2 vext1 <4,4,5,5>, LHS
+  1497383014U,	// <4,5,6,0>: Cost 2 vext1 <3,4,5,6>, LHS
+  2559181904U,	// <4,5,6,1>: Cost 3 vext1 <1,4,5,6>, <1,4,5,6>
+  2565154601U,	// <4,5,6,2>: Cost 3 vext1 <2,4,5,6>, <2,4,5,6>
+  1497385474U,	// <4,5,6,3>: Cost 2 vext1 <3,4,5,6>, <3,4,5,6>
+  1497386294U,	// <4,5,6,4>: Cost 2 vext1 <3,4,5,6>, RHS
+  3047608324U,	// <4,5,6,5>: Cost 3 vtrnl RHS, <5,5,5,5>
+  2571129656U,	// <4,5,6,6>: Cost 3 vext1 <3,4,5,6>, <6,6,6,6>
+  27705344U,	// <4,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,6,u>: Cost 0 copy RHS
+  2565161062U,	// <4,5,7,0>: Cost 3 vext1 <2,4,5,7>, LHS
+  2565161882U,	// <4,5,7,1>: Cost 3 vext1 <2,4,5,7>, <1,2,3,4>
+  2565162794U,	// <4,5,7,2>: Cost 3 vext1 <2,4,5,7>, <2,4,5,7>
+  2661381387U,	// <4,5,7,3>: Cost 3 vext2 <7,3,4,5>, <7,3,4,5>
+  2565164342U,	// <4,5,7,4>: Cost 3 vext1 <2,4,5,7>, RHS
+  2718748840U,	// <4,5,7,5>: Cost 3 vext3 <5,6,7,4>, <5,7,5,7>
+  2718748846U,	// <4,5,7,6>: Cost 3 vext3 <5,6,7,4>, <5,7,6,4>
+  2719412407U,	// <4,5,7,7>: Cost 3 vext3 <5,7,7,4>, <5,7,7,4>
+  2565166894U,	// <4,5,7,u>: Cost 3 vext1 <2,4,5,7>, LHS
+  1497399398U,	// <4,5,u,0>: Cost 2 vext1 <3,4,5,u>, LHS
+  1557780270U,	// <4,5,u,1>: Cost 2 vext2 <2,3,4,5>, LHS
+  2631522181U,	// <4,5,u,2>: Cost 3 vext2 <2,3,4,5>, <u,2,3,0>
+  1497401860U,	// <4,5,u,3>: Cost 2 vext1 <3,4,5,u>, <3,4,5,u>
+  1497402678U,	// <4,5,u,4>: Cost 2 vext1 <3,4,5,u>, RHS
+  1557780634U,	// <4,5,u,5>: Cost 2 vext2 <2,3,4,5>, RHS
+  2631522512U,	// <4,5,u,6>: Cost 3 vext2 <2,3,4,5>, <u,6,3,7>
+  27705344U,	// <4,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,u,u>: Cost 0 copy RHS
+  2618916864U,	// <4,6,0,0>: Cost 3 vext2 <0,2,4,6>, <0,0,0,0>
+  1545175142U,	// <4,6,0,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1545175244U,	// <4,6,0,2>: Cost 2 vext2 <0,2,4,6>, <0,2,4,6>
+  3692658940U,	// <4,6,0,3>: Cost 4 vext2 <0,2,4,6>, <0,3,1,0>
+  2618917202U,	// <4,6,0,4>: Cost 3 vext2 <0,2,4,6>, <0,4,1,5>
+  3852910806U,	// <4,6,0,5>: Cost 4 vuzpl RHS, <0,2,5,7>
+  2253525648U,	// <4,6,0,6>: Cost 3 vrev <6,4,6,0>
+  4040764726U,	// <4,6,0,7>: Cost 4 vzipr <2,3,4,0>, RHS
+  1545175709U,	// <4,6,0,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  2618917622U,	// <4,6,1,0>: Cost 3 vext2 <0,2,4,6>, <1,0,3,2>
+  2618917684U,	// <4,6,1,1>: Cost 3 vext2 <0,2,4,6>, <1,1,1,1>
+  2618917782U,	// <4,6,1,2>: Cost 3 vext2 <0,2,4,6>, <1,2,3,0>
+  2618917848U,	// <4,6,1,3>: Cost 3 vext2 <0,2,4,6>, <1,3,1,3>
+  3692659773U,	// <4,6,1,4>: Cost 4 vext2 <0,2,4,6>, <1,4,3,5>
+  2618918032U,	// <4,6,1,5>: Cost 3 vext2 <0,2,4,6>, <1,5,3,7>
+  3692659937U,	// <4,6,1,6>: Cost 4 vext2 <0,2,4,6>, <1,6,3,7>
+  4032146742U,	// <4,6,1,7>: Cost 4 vzipr <0,u,4,1>, RHS
+  2618918253U,	// <4,6,1,u>: Cost 3 vext2 <0,2,4,6>, <1,u,1,3>
+  2618918380U,	// <4,6,2,0>: Cost 3 vext2 <0,2,4,6>, <2,0,6,4>
+  2618918460U,	// <4,6,2,1>: Cost 3 vext2 <0,2,4,6>, <2,1,6,3>
+  2618918504U,	// <4,6,2,2>: Cost 3 vext2 <0,2,4,6>, <2,2,2,2>
+  2618918566U,	// <4,6,2,3>: Cost 3 vext2 <0,2,4,6>, <2,3,0,1>
+  2618918679U,	// <4,6,2,4>: Cost 3 vext2 <0,2,4,6>, <2,4,3,6>
+  2618918788U,	// <4,6,2,5>: Cost 3 vext2 <0,2,4,6>, <2,5,6,7>
+  2618918842U,	// <4,6,2,6>: Cost 3 vext2 <0,2,4,6>, <2,6,3,7>
+  2718749178U,	// <4,6,2,7>: Cost 3 vext3 <5,6,7,4>, <6,2,7,3>
+  2618918971U,	// <4,6,2,u>: Cost 3 vext2 <0,2,4,6>, <2,u,0,1>
+  2618919062U,	// <4,6,3,0>: Cost 3 vext2 <0,2,4,6>, <3,0,1,2>
+  2636171526U,	// <4,6,3,1>: Cost 3 vext2 <3,1,4,6>, <3,1,4,6>
+  3692661057U,	// <4,6,3,2>: Cost 4 vext2 <0,2,4,6>, <3,2,2,2>
+  2618919324U,	// <4,6,3,3>: Cost 3 vext2 <0,2,4,6>, <3,3,3,3>
+  2618919426U,	// <4,6,3,4>: Cost 3 vext2 <0,2,4,6>, <3,4,5,6>
+  2638826058U,	// <4,6,3,5>: Cost 3 vext2 <3,5,4,6>, <3,5,4,6>
+  3913303030U,	// <4,6,3,6>: Cost 4 vuzpr <3,4,5,6>, <1,3,4,6>
+  2722730572U,	// <4,6,3,7>: Cost 3 vext3 <6,3,7,4>, <6,3,7,4>
+  2618919710U,	// <4,6,3,u>: Cost 3 vext2 <0,2,4,6>, <3,u,1,2>
+  2565210214U,	// <4,6,4,0>: Cost 3 vext1 <2,4,6,4>, LHS
+  2718749286U,	// <4,6,4,1>: Cost 3 vext3 <5,6,7,4>, <6,4,1,3>
+  2565211952U,	// <4,6,4,2>: Cost 3 vext1 <2,4,6,4>, <2,4,6,4>
+  2571184649U,	// <4,6,4,3>: Cost 3 vext1 <3,4,6,4>, <3,4,6,4>
+  2565213494U,	// <4,6,4,4>: Cost 3 vext1 <2,4,6,4>, RHS
+  1545178422U,	// <4,6,4,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705430326U,	// <4,6,4,6>: Cost 2 vuzpl RHS, RHS
+  2595075437U,	// <4,6,4,7>: Cost 3 vext1 <7,4,6,4>, <7,4,6,4>
+  1545178665U,	// <4,6,4,u>: Cost 2 vext2 <0,2,4,6>, RHS
+  2565218406U,	// <4,6,5,0>: Cost 3 vext1 <2,4,6,5>, LHS
+  2645462736U,	// <4,6,5,1>: Cost 3 vext2 <4,6,4,6>, <5,1,7,3>
+  2913399290U,	// <4,6,5,2>: Cost 3 vzipl RHS, <6,2,7,3>
+  3913305394U,	// <4,6,5,3>: Cost 4 vuzpr <3,4,5,6>, <4,5,6,3>
+  2645462982U,	// <4,6,5,4>: Cost 3 vext2 <4,6,4,6>, <5,4,7,6>
+  2779172868U,	// <4,6,5,5>: Cost 3 vuzpl RHS, <5,5,5,5>
+  2913391416U,	// <4,6,5,6>: Cost 3 vzipl RHS, <6,6,6,6>
+  2821426486U,	// <4,6,5,7>: Cost 3 vuzpr <0,4,2,6>, RHS
+  2821426487U,	// <4,6,5,u>: Cost 3 vuzpr <0,4,2,6>, RHS
+  1503428710U,	// <4,6,6,0>: Cost 2 vext1 <4,4,6,6>, LHS
+  2577171190U,	// <4,6,6,1>: Cost 3 vext1 <4,4,6,6>, <1,0,3,2>
+  2645463546U,	// <4,6,6,2>: Cost 3 vext2 <4,6,4,6>, <6,2,7,3>
+  2577172630U,	// <4,6,6,3>: Cost 3 vext1 <4,4,6,6>, <3,0,1,2>
+  1503431908U,	// <4,6,6,4>: Cost 2 vext1 <4,4,6,6>, <4,4,6,6>
+  2253501069U,	// <4,6,6,5>: Cost 3 vrev <6,4,5,6>
+  2618921784U,	// <4,6,6,6>: Cost 3 vext2 <0,2,4,6>, <6,6,6,6>
+  2954464566U,	// <4,6,6,7>: Cost 3 vzipr <0,2,4,6>, RHS
+  1503434542U,	// <4,6,6,u>: Cost 2 vext1 <4,4,6,6>, LHS
+  2645464058U,	// <4,6,7,0>: Cost 3 vext2 <4,6,4,6>, <7,0,1,2>
+  2779173882U,	// <4,6,7,1>: Cost 3 vuzpl RHS, <7,0,1,2>
+  3638978355U,	// <4,6,7,2>: Cost 4 vext1 <2,4,6,7>, <2,4,6,7>
+  2725090156U,	// <4,6,7,3>: Cost 3 vext3 <6,7,3,4>, <6,7,3,4>
+  2645464422U,	// <4,6,7,4>: Cost 3 vext2 <4,6,4,6>, <7,4,5,6>
+  2779174246U,	// <4,6,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  3852915914U,	// <4,6,7,6>: Cost 4 vuzpl RHS, <7,2,6,3>
+  2779174508U,	// <4,6,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2779173945U,	// <4,6,7,u>: Cost 3 vuzpl RHS, <7,0,u,2>
+  1503445094U,	// <4,6,u,0>: Cost 2 vext1 <4,4,6,u>, LHS
+  1545180974U,	// <4,6,u,1>: Cost 2 vext2 <0,2,4,6>, LHS
+  1705432878U,	// <4,6,u,2>: Cost 2 vuzpl RHS, LHS
+  2618922940U,	// <4,6,u,3>: Cost 3 vext2 <0,2,4,6>, <u,3,0,1>
+  1503448294U,	// <4,6,u,4>: Cost 2 vext1 <4,4,6,u>, <4,4,6,u>
+  1545181338U,	// <4,6,u,5>: Cost 2 vext2 <0,2,4,6>, RHS
+  1705433242U,	// <4,6,u,6>: Cost 2 vuzpl RHS, RHS
+  2954480950U,	// <4,6,u,7>: Cost 3 vzipr <0,2,4,u>, RHS
+  1545181541U,	// <4,6,u,u>: Cost 2 vext2 <0,2,4,6>, LHS
+  3706601472U,	// <4,7,0,0>: Cost 4 vext2 <2,5,4,7>, <0,0,0,0>
+  2632859750U,	// <4,7,0,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2726343685U,	// <4,7,0,2>: Cost 3 vext3 <7,0,2,4>, <7,0,2,4>
+  3701293312U,	// <4,7,0,3>: Cost 4 vext2 <1,6,4,7>, <0,3,1,4>
+  3706601810U,	// <4,7,0,4>: Cost 4 vext2 <2,5,4,7>, <0,4,1,5>
+  2259424608U,	// <4,7,0,5>: Cost 3 vrev <7,4,5,0>
+  3695321617U,	// <4,7,0,6>: Cost 4 vext2 <0,6,4,7>, <0,6,4,7>
+  3800454194U,	// <4,7,0,7>: Cost 4 vext3 <7,0,7,4>, <7,0,7,4>
+  2632860317U,	// <4,7,0,u>: Cost 3 vext2 <2,5,4,7>, LHS
+  2259064116U,	// <4,7,1,0>: Cost 3 vrev <7,4,0,1>
+  3700630324U,	// <4,7,1,1>: Cost 4 vext2 <1,5,4,7>, <1,1,1,1>
+  2632860570U,	// <4,7,1,2>: Cost 3 vext2 <2,5,4,7>, <1,2,3,4>
+  3769635936U,	// <4,7,1,3>: Cost 4 vext3 <1,u,3,4>, <7,1,3,5>
+  3656920374U,	// <4,7,1,4>: Cost 4 vext1 <5,4,7,1>, RHS
+  3700630681U,	// <4,7,1,5>: Cost 4 vext2 <1,5,4,7>, <1,5,4,7>
+  3701294314U,	// <4,7,1,6>: Cost 4 vext2 <1,6,4,7>, <1,6,4,7>
+  3793818754U,	// <4,7,1,7>: Cost 4 vext3 <5,u,7,4>, <7,1,7,3>
+  2259654012U,	// <4,7,1,u>: Cost 3 vrev <7,4,u,1>
+  3656925286U,	// <4,7,2,0>: Cost 4 vext1 <5,4,7,2>, LHS
+  3706603050U,	// <4,7,2,1>: Cost 4 vext2 <2,5,4,7>, <2,1,4,3>
+  3706603112U,	// <4,7,2,2>: Cost 4 vext2 <2,5,4,7>, <2,2,2,2>
+  2727744688U,	// <4,7,2,3>: Cost 3 vext3 <7,2,3,4>, <7,2,3,4>
+  3705939745U,	// <4,7,2,4>: Cost 4 vext2 <2,4,4,7>, <2,4,4,7>
+  2632861554U,	// <4,7,2,5>: Cost 3 vext2 <2,5,4,7>, <2,5,4,7>
+  3706603450U,	// <4,7,2,6>: Cost 4 vext2 <2,5,4,7>, <2,6,3,7>
+  3792491731U,	// <4,7,2,7>: Cost 4 vext3 <5,6,7,4>, <7,2,7,3>
+  2634852453U,	// <4,7,2,u>: Cost 3 vext2 <2,u,4,7>, <2,u,4,7>
+  3706603670U,	// <4,7,3,0>: Cost 4 vext2 <2,5,4,7>, <3,0,1,2>
+  3662906266U,	// <4,7,3,1>: Cost 4 vext1 <6,4,7,3>, <1,2,3,4>
+  3725183326U,	// <4,7,3,2>: Cost 4 vext2 <5,6,4,7>, <3,2,5,4>
+  3706603932U,	// <4,7,3,3>: Cost 4 vext2 <2,5,4,7>, <3,3,3,3>
+  3701295618U,	// <4,7,3,4>: Cost 4 vext2 <1,6,4,7>, <3,4,5,6>
+  2638834251U,	// <4,7,3,5>: Cost 3 vext2 <3,5,4,7>, <3,5,4,7>
+  2639497884U,	// <4,7,3,6>: Cost 3 vext2 <3,6,4,7>, <3,6,4,7>
+  3802445093U,	// <4,7,3,7>: Cost 4 vext3 <7,3,7,4>, <7,3,7,4>
+  2640825150U,	// <4,7,3,u>: Cost 3 vext2 <3,u,4,7>, <3,u,4,7>
+  2718750004U,	// <4,7,4,0>: Cost 3 vext3 <5,6,7,4>, <7,4,0,1>
+  3706604490U,	// <4,7,4,1>: Cost 4 vext2 <2,5,4,7>, <4,1,2,3>
+  3656943474U,	// <4,7,4,2>: Cost 4 vext1 <5,4,7,4>, <2,5,4,7>
+  3779884371U,	// <4,7,4,3>: Cost 4 vext3 <3,5,7,4>, <7,4,3,5>
+  2259383643U,	// <4,7,4,4>: Cost 3 vrev <7,4,4,4>
+  2632863030U,	// <4,7,4,5>: Cost 3 vext2 <2,5,4,7>, RHS
+  2259531117U,	// <4,7,4,6>: Cost 3 vrev <7,4,6,4>
+  3907340074U,	// <4,7,4,7>: Cost 4 vuzpr <2,4,5,7>, <2,4,5,7>
+  2632863273U,	// <4,7,4,u>: Cost 3 vext2 <2,5,4,7>, RHS
+  2913391610U,	// <4,7,5,0>: Cost 3 vzipl RHS, <7,0,1,2>
+  3645006848U,	// <4,7,5,1>: Cost 4 vext1 <3,4,7,5>, <1,3,5,7>
+  2589181646U,	// <4,7,5,2>: Cost 3 vext1 <6,4,7,5>, <2,3,4,5>
+  3645008403U,	// <4,7,5,3>: Cost 4 vext1 <3,4,7,5>, <3,4,7,5>
+  2913391974U,	// <4,7,5,4>: Cost 3 vzipl RHS, <7,4,5,6>
+  2583211973U,	// <4,7,5,5>: Cost 3 vext1 <5,4,7,5>, <5,4,7,5>
+  2589184670U,	// <4,7,5,6>: Cost 3 vext1 <6,4,7,5>, <6,4,7,5>
+  2913392236U,	// <4,7,5,7>: Cost 3 vzipl RHS, <7,7,7,7>
+  2913392258U,	// <4,7,5,u>: Cost 3 vzipl RHS, <7,u,1,2>
+  1509474406U,	// <4,7,6,0>: Cost 2 vext1 <5,4,7,6>, LHS
+  3047609338U,	// <4,7,6,1>: Cost 3 vtrnl RHS, <7,0,1,2>
+  2583217768U,	// <4,7,6,2>: Cost 3 vext1 <5,4,7,6>, <2,2,2,2>
+  2583218326U,	// <4,7,6,3>: Cost 3 vext1 <5,4,7,6>, <3,0,1,2>
+  1509477686U,	// <4,7,6,4>: Cost 2 vext1 <5,4,7,6>, RHS
+  1509478342U,	// <4,7,6,5>: Cost 2 vext1 <5,4,7,6>, <5,4,7,6>
+  2583220730U,	// <4,7,6,6>: Cost 3 vext1 <5,4,7,6>, <6,2,7,3>
+  3047609964U,	// <4,7,6,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509480238U,	// <4,7,6,u>: Cost 2 vext1 <5,4,7,6>, LHS
+  3650994278U,	// <4,7,7,0>: Cost 4 vext1 <4,4,7,7>, LHS
+  3650995098U,	// <4,7,7,1>: Cost 4 vext1 <4,4,7,7>, <1,2,3,4>
+  3650996010U,	// <4,7,7,2>: Cost 4 vext1 <4,4,7,7>, <2,4,5,7>
+  3804804677U,	// <4,7,7,3>: Cost 4 vext3 <7,7,3,4>, <7,7,3,4>
+  3650997486U,	// <4,7,7,4>: Cost 4 vext1 <4,4,7,7>, <4,4,7,7>
+  2662725039U,	// <4,7,7,5>: Cost 3 vext2 <7,5,4,7>, <7,5,4,7>
+  3662942880U,	// <4,7,7,6>: Cost 4 vext1 <6,4,7,7>, <6,4,7,7>
+  2718750316U,	// <4,7,7,7>: Cost 3 vext3 <5,6,7,4>, <7,7,7,7>
+  2664715938U,	// <4,7,7,u>: Cost 3 vext2 <7,u,4,7>, <7,u,4,7>
+  1509490790U,	// <4,7,u,0>: Cost 2 vext1 <5,4,7,u>, LHS
+  2632865582U,	// <4,7,u,1>: Cost 3 vext2 <2,5,4,7>, LHS
+  2583234152U,	// <4,7,u,2>: Cost 3 vext1 <5,4,7,u>, <2,2,2,2>
+  2583234710U,	// <4,7,u,3>: Cost 3 vext1 <5,4,7,u>, <3,0,1,2>
+  1509494070U,	// <4,7,u,4>: Cost 2 vext1 <5,4,7,u>, RHS
+  1509494728U,	// <4,7,u,5>: Cost 2 vext1 <5,4,7,u>, <5,4,7,u>
+  2583237114U,	// <4,7,u,6>: Cost 3 vext1 <5,4,7,u>, <6,2,7,3>
+  3047757420U,	// <4,7,u,7>: Cost 3 vtrnl RHS, <7,7,7,7>
+  1509496622U,	// <4,7,u,u>: Cost 2 vext1 <5,4,7,u>, LHS
+  2618933248U,	// <4,u,0,0>: Cost 3 vext2 <0,2,4,u>, <0,0,0,0>
+  1545191526U,	// <4,u,0,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1545191630U,	// <4,u,0,2>: Cost 2 vext2 <0,2,4,u>, <0,2,4,u>
+  2691913445U,	// <4,u,0,3>: Cost 3 vext3 <1,2,3,4>, <u,0,3,2>
+  2618933586U,	// <4,u,0,4>: Cost 3 vext2 <0,2,4,u>, <0,4,1,5>
+  2265397305U,	// <4,u,0,5>: Cost 3 vrev <u,4,5,0>
+  2595189625U,	// <4,u,0,6>: Cost 3 vext1 <7,4,u,0>, <6,7,4,u>
+  2595190139U,	// <4,u,0,7>: Cost 3 vext1 <7,4,u,0>, <7,4,u,0>
+  1545192093U,	// <4,u,0,u>: Cost 2 vext2 <0,2,4,u>, LHS
+  2618934006U,	// <4,u,1,0>: Cost 3 vext2 <0,2,4,u>, <1,0,3,2>
+  2618934068U,	// <4,u,1,1>: Cost 3 vext2 <0,2,4,u>, <1,1,1,1>
+  1618171694U,	// <4,u,1,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  2618934232U,	// <4,u,1,3>: Cost 3 vext2 <0,2,4,u>, <1,3,1,3>
+  2695894848U,	// <4,u,1,4>: Cost 3 vext3 <1,u,3,4>, <u,1,4,3>
+  2618934416U,	// <4,u,1,5>: Cost 3 vext2 <0,2,4,u>, <1,5,3,7>
+  3692676321U,	// <4,u,1,6>: Cost 4 vext2 <0,2,4,u>, <1,6,3,7>
+  2718750555U,	// <4,u,1,7>: Cost 3 vext3 <5,6,7,4>, <u,1,7,3>
+  1618171748U,	// <4,u,1,u>: Cost 2 vext3 <1,2,3,4>, LHS
+  2553397350U,	// <4,u,2,0>: Cost 3 vext1 <0,4,u,2>, LHS
+  2630215215U,	// <4,u,2,1>: Cost 3 vext2 <2,1,4,u>, <2,1,4,u>
+  2618934888U,	// <4,u,2,2>: Cost 3 vext2 <0,2,4,u>, <2,2,2,2>
+  1557800657U,	// <4,u,2,3>: Cost 2 vext2 <2,3,4,u>, <2,3,4,u>
+  2618935065U,	// <4,u,2,4>: Cost 3 vext2 <0,2,4,u>, <2,4,3,u>
+  2733864859U,	// <4,u,2,5>: Cost 3 vext3 <u,2,5,4>, <u,2,5,4>
+  2618935226U,	// <4,u,2,6>: Cost 3 vext2 <0,2,4,u>, <2,6,3,7>
+  2718750636U,	// <4,u,2,7>: Cost 3 vext3 <5,6,7,4>, <u,2,7,3>
+  1561118822U,	// <4,u,2,u>: Cost 2 vext2 <2,u,4,u>, <2,u,4,u>
+  2618935446U,	// <4,u,3,0>: Cost 3 vext2 <0,2,4,u>, <3,0,1,2>
+  2779318422U,	// <4,u,3,1>: Cost 3 vuzpl RHS, <3,0,1,2>
+  2636851545U,	// <4,u,3,2>: Cost 3 vext2 <3,2,4,u>, <3,2,4,u>
+  2618935708U,	// <4,u,3,3>: Cost 3 vext2 <0,2,4,u>, <3,3,3,3>
+  2618935810U,	// <4,u,3,4>: Cost 3 vext2 <0,2,4,u>, <3,4,5,6>
+  2691913711U,	// <4,u,3,5>: Cost 3 vext3 <1,2,3,4>, <u,3,5,7>
+  2588725862U,	// <4,u,3,6>: Cost 3 vext1 <6,4,1,3>, <6,4,1,3>
+  2640169710U,	// <4,u,3,7>: Cost 3 vext2 <3,7,4,u>, <3,7,4,u>
+  2618936094U,	// <4,u,3,u>: Cost 3 vext2 <0,2,4,u>, <3,u,1,2>
+  1503559782U,	// <4,u,4,0>: Cost 2 vext1 <4,4,u,4>, LHS
+  2692282391U,	// <4,u,4,1>: Cost 3 vext3 <1,2,u,4>, <u,4,1,2>
+  2565359426U,	// <4,u,4,2>: Cost 3 vext1 <2,4,u,4>, <2,4,u,4>
+  2571332123U,	// <4,u,4,3>: Cost 3 vext1 <3,4,u,4>, <3,4,u,4>
+  161926454U,	// <4,u,4,4>: Cost 1 vdup0 RHS
+  1545194806U,	// <4,u,4,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1705577782U,	// <4,u,4,6>: Cost 2 vuzpl RHS, RHS
+  2718750801U,	// <4,u,4,7>: Cost 3 vext3 <5,6,7,4>, <u,4,7,6>
+  161926454U,	// <4,u,4,u>: Cost 1 vdup0 RHS
+  1479164006U,	// <4,u,5,0>: Cost 2 vext1 <0,4,1,5>, LHS
+  1839650606U,	// <4,u,5,1>: Cost 2 vzipl RHS, LHS
+  2565367502U,	// <4,u,5,2>: Cost 3 vext1 <2,4,u,5>, <2,3,4,5>
+  3089777309U,	// <4,u,5,3>: Cost 3 vtrnr <0,4,1,5>, LHS
+  1479167286U,	// <4,u,5,4>: Cost 2 vext1 <0,4,1,5>, RHS
+  1839650970U,	// <4,u,5,5>: Cost 2 vzipl RHS, RHS
+  1618172058U,	// <4,u,5,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  3089780265U,	// <4,u,5,7>: Cost 3 vtrnr <0,4,1,5>, RHS
+  1618172076U,	// <4,u,5,u>: Cost 2 vext3 <1,2,3,4>, RHS
+  1479688294U,	// <4,u,6,0>: Cost 2 vext1 <0,4,u,6>, LHS
+  2553430774U,	// <4,u,6,1>: Cost 3 vext1 <0,4,u,6>, <1,0,3,2>
+  1973868334U,	// <4,u,6,2>: Cost 2 vtrnl RHS, LHS
+  1497606685U,	// <4,u,6,3>: Cost 2 vext1 <3,4,u,6>, <3,4,u,6>
+  1479691574U,	// <4,u,6,4>: Cost 2 vext1 <0,4,u,6>, RHS
+  1509552079U,	// <4,u,6,5>: Cost 2 vext1 <5,4,u,6>, <5,4,u,6>
+  1973868698U,	// <4,u,6,6>: Cost 2 vtrnl RHS, RHS
+  27705344U,	// <4,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,6,u>: Cost 0 copy RHS
+  2565382246U,	// <4,u,7,0>: Cost 3 vext1 <2,4,u,7>, LHS
+  2565383066U,	// <4,u,7,1>: Cost 3 vext1 <2,4,u,7>, <1,2,3,4>
+  2565384005U,	// <4,u,7,2>: Cost 3 vext1 <2,4,u,7>, <2,4,u,7>
+  2661405966U,	// <4,u,7,3>: Cost 3 vext2 <7,3,4,u>, <7,3,4,u>
+  2565385526U,	// <4,u,7,4>: Cost 3 vext1 <2,4,u,7>, RHS
+  2779321702U,	// <4,u,7,5>: Cost 3 vuzpl RHS, <7,4,5,6>
+  2589274793U,	// <4,u,7,6>: Cost 3 vext1 <6,4,u,7>, <6,4,u,7>
+  2779321964U,	// <4,u,7,7>: Cost 3 vuzpl RHS, <7,7,7,7>
+  2565388078U,	// <4,u,7,u>: Cost 3 vext1 <2,4,u,7>, LHS
+  1479704678U,	// <4,u,u,0>: Cost 2 vext1 <0,4,u,u>, LHS
+  1545197358U,	// <4,u,u,1>: Cost 2 vext2 <0,2,4,u>, LHS
+  1618172261U,	// <4,u,u,2>: Cost 2 vext3 <1,2,3,4>, LHS
+  1497623071U,	// <4,u,u,3>: Cost 2 vext1 <3,4,u,u>, <3,4,u,u>
+  161926454U,	// <4,u,u,4>: Cost 1 vdup0 RHS
+  1545197722U,	// <4,u,u,5>: Cost 2 vext2 <0,2,4,u>, RHS
+  1618172301U,	// <4,u,u,6>: Cost 2 vext3 <1,2,3,4>, RHS
+  27705344U,	// <4,u,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,u,u>: Cost 0 copy RHS
+  2687123456U,	// <5,0,0,0>: Cost 3 vext3 <0,4,1,5>, <0,0,0,0>
+  2687123466U,	// <5,0,0,1>: Cost 3 vext3 <0,4,1,5>, <0,0,1,1>
+  2687123476U,	// <5,0,0,2>: Cost 3 vext3 <0,4,1,5>, <0,0,2,2>
+  3710599434U,	// <5,0,0,3>: Cost 4 vext2 <3,2,5,0>, <0,3,2,5>
+  2642166098U,	// <5,0,0,4>: Cost 3 vext2 <4,1,5,0>, <0,4,1,5>
+  3657060306U,	// <5,0,0,5>: Cost 4 vext1 <5,5,0,0>, <5,5,0,0>
+  3292094923U,	// <5,0,0,6>: Cost 4 vrev <0,5,6,0>
+  3669005700U,	// <5,0,0,7>: Cost 4 vext1 <7,5,0,0>, <7,5,0,0>
+  2687123530U,	// <5,0,0,u>: Cost 3 vext3 <0,4,1,5>, <0,0,u,2>
+  2559434854U,	// <5,0,1,0>: Cost 3 vext1 <1,5,0,1>, LHS
+  2559435887U,	// <5,0,1,1>: Cost 3 vext1 <1,5,0,1>, <1,5,0,1>
+  1613381734U,	// <5,0,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  3698656256U,	// <5,0,1,3>: Cost 4 vext2 <1,2,5,0>, <1,3,5,7>
+  2559438134U,	// <5,0,1,4>: Cost 3 vext1 <1,5,0,1>, RHS
+  2583326675U,	// <5,0,1,5>: Cost 3 vext1 <5,5,0,1>, <5,5,0,1>
+  3715908851U,	// <5,0,1,6>: Cost 4 vext2 <4,1,5,0>, <1,6,5,7>
+  3657069562U,	// <5,0,1,7>: Cost 4 vext1 <5,5,0,1>, <7,0,1,2>
+  1613381788U,	// <5,0,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2686017700U,	// <5,0,2,0>: Cost 3 vext3 <0,2,4,5>, <0,2,0,2>
+  2685796528U,	// <5,0,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2698625208U,	// <5,0,2,2>: Cost 3 vext3 <2,3,4,5>, <0,2,2,4>
+  2685944002U,	// <5,0,2,3>: Cost 3 vext3 <0,2,3,5>, <0,2,3,5>
+  2686017739U,	// <5,0,2,4>: Cost 3 vext3 <0,2,4,5>, <0,2,4,5>
+  2686091476U,	// <5,0,2,5>: Cost 3 vext3 <0,2,5,5>, <0,2,5,5>
+  2725167324U,	// <5,0,2,6>: Cost 3 vext3 <6,7,4,5>, <0,2,6,4>
+  2595280230U,	// <5,0,2,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  2686312687U,	// <5,0,2,u>: Cost 3 vext3 <0,2,u,5>, <0,2,u,5>
+  3760128248U,	// <5,0,3,0>: Cost 4 vext3 <0,3,0,5>, <0,3,0,5>
+  3759685888U,	// <5,0,3,1>: Cost 4 vext3 <0,2,3,5>, <0,3,1,4>
+  2686533898U,	// <5,0,3,2>: Cost 3 vext3 <0,3,2,5>, <0,3,2,5>
+  3760349459U,	// <5,0,3,3>: Cost 4 vext3 <0,3,3,5>, <0,3,3,5>
+  2638187004U,	// <5,0,3,4>: Cost 3 vext2 <3,4,5,0>, <3,4,5,0>
+  3776348452U,	// <5,0,3,5>: Cost 4 vext3 <3,0,4,5>, <0,3,5,4>
+  3713256094U,	// <5,0,3,6>: Cost 4 vext2 <3,6,5,0>, <3,6,5,0>
+  3914064896U,	// <5,0,3,7>: Cost 4 vuzpr <3,5,7,0>, <1,3,5,7>
+  2686976320U,	// <5,0,3,u>: Cost 3 vext3 <0,3,u,5>, <0,3,u,5>
+  2559459430U,	// <5,0,4,0>: Cost 3 vext1 <1,5,0,4>, LHS
+  1613381970U,	// <5,0,4,1>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  2687123804U,	// <5,0,4,2>: Cost 3 vext3 <0,4,1,5>, <0,4,2,6>
+  3761013092U,	// <5,0,4,3>: Cost 4 vext3 <0,4,3,5>, <0,4,3,5>
+  2559462710U,	// <5,0,4,4>: Cost 3 vext1 <1,5,0,4>, RHS
+  2638187830U,	// <5,0,4,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  3761234303U,	// <5,0,4,6>: Cost 4 vext3 <0,4,6,5>, <0,4,6,5>
+  2646150600U,	// <5,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1613381970U,	// <5,0,4,u>: Cost 2 vext3 <0,4,1,5>, <0,4,1,5>
+  3766763926U,	// <5,0,5,0>: Cost 4 vext3 <1,4,0,5>, <0,5,0,1>
+  2919268454U,	// <5,0,5,1>: Cost 3 vzipl <5,5,5,5>, LHS
+  3053486182U,	// <5,0,5,2>: Cost 3 vtrnl <5,5,5,5>, LHS
+  3723210589U,	// <5,0,5,3>: Cost 4 vext2 <5,3,5,0>, <5,3,5,0>
+  3766763966U,	// <5,0,5,4>: Cost 4 vext3 <1,4,0,5>, <0,5,4,5>
+  2650796031U,	// <5,0,5,5>: Cost 3 vext2 <5,5,5,0>, <5,5,5,0>
+  3719893090U,	// <5,0,5,6>: Cost 4 vext2 <4,7,5,0>, <5,6,7,0>
+  3914067254U,	// <5,0,5,7>: Cost 4 vuzpr <3,5,7,0>, RHS
+  2919269021U,	// <5,0,5,u>: Cost 3 vzipl <5,5,5,5>, LHS
+  4047519744U,	// <5,0,6,0>: Cost 4 vzipr <3,4,5,6>, <0,0,0,0>
+  2920038502U,	// <5,0,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  3759759871U,	// <5,0,6,2>: Cost 4 vext3 <0,2,4,5>, <0,6,2,7>
+  3645164070U,	// <5,0,6,3>: Cost 4 vext1 <3,5,0,6>, <3,5,0,6>
+  3762414095U,	// <5,0,6,4>: Cost 4 vext3 <0,6,4,5>, <0,6,4,5>
+  3993780690U,	// <5,0,6,5>: Cost 4 vzipl <5,6,7,0>, <0,5,6,7>
+  3719893816U,	// <5,0,6,6>: Cost 4 vext2 <4,7,5,0>, <6,6,6,6>
+  2662077302U,	// <5,0,6,7>: Cost 3 vext2 <7,4,5,0>, <6,7,4,5>
+  2920039069U,	// <5,0,6,u>: Cost 3 vzipl <5,6,7,0>, LHS
+  2565455974U,	// <5,0,7,0>: Cost 3 vext1 <2,5,0,7>, LHS
+  2565456790U,	// <5,0,7,1>: Cost 3 vext1 <2,5,0,7>, <1,2,3,0>
+  2565457742U,	// <5,0,7,2>: Cost 3 vext1 <2,5,0,7>, <2,5,0,7>
+  3639199894U,	// <5,0,7,3>: Cost 4 vext1 <2,5,0,7>, <3,0,1,2>
+  2565459254U,	// <5,0,7,4>: Cost 3 vext1 <2,5,0,7>, RHS
+  2589347938U,	// <5,0,7,5>: Cost 3 vext1 <6,5,0,7>, <5,6,7,0>
+  2589348530U,	// <5,0,7,6>: Cost 3 vext1 <6,5,0,7>, <6,5,0,7>
+  4188456422U,	// <5,0,7,7>: Cost 4 vtrnr RHS, <2,0,5,7>
+  2565461806U,	// <5,0,7,u>: Cost 3 vext1 <2,5,0,7>, LHS
+  2687124106U,	// <5,0,u,0>: Cost 3 vext3 <0,4,1,5>, <0,u,0,2>
+  1616036502U,	// <5,0,u,1>: Cost 2 vext3 <0,u,1,5>, <0,u,1,5>
+  1613382301U,	// <5,0,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  2689925800U,	// <5,0,u,3>: Cost 3 vext3 <0,u,3,5>, <0,u,3,5>
+  2687124146U,	// <5,0,u,4>: Cost 3 vext3 <0,4,1,5>, <0,u,4,6>
+  2638190746U,	// <5,0,u,5>: Cost 3 vext2 <3,4,5,0>, RHS
+  2589356723U,	// <5,0,u,6>: Cost 3 vext1 <6,5,0,u>, <6,5,0,u>
+  2595280230U,	// <5,0,u,7>: Cost 3 vext1 <7,5,0,2>, <7,4,5,6>
+  1613382355U,	// <5,0,u,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2646818816U,	// <5,1,0,0>: Cost 3 vext2 <4,u,5,1>, <0,0,0,0>
+  1573077094U,	// <5,1,0,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2646818980U,	// <5,1,0,2>: Cost 3 vext2 <4,u,5,1>, <0,2,0,2>
+  2687124214U,	// <5,1,0,3>: Cost 3 vext3 <0,4,1,5>, <1,0,3,2>
+  2641510738U,	// <5,1,0,4>: Cost 3 vext2 <4,0,5,1>, <0,4,1,5>
+  2641510814U,	// <5,1,0,5>: Cost 3 vext2 <4,0,5,1>, <0,5,1,0>
+  3720561142U,	// <5,1,0,6>: Cost 4 vext2 <4,u,5,1>, <0,6,1,7>
+  3298141357U,	// <5,1,0,7>: Cost 4 vrev <1,5,7,0>
+  1573077661U,	// <5,1,0,u>: Cost 2 vext2 <4,u,5,1>, LHS
+  2223891567U,	// <5,1,1,0>: Cost 3 vrev <1,5,0,1>
+  2687124276U,	// <5,1,1,1>: Cost 3 vext3 <0,4,1,5>, <1,1,1,1>
+  2646819734U,	// <5,1,1,2>: Cost 3 vext2 <4,u,5,1>, <1,2,3,0>
+  2687124296U,	// <5,1,1,3>: Cost 3 vext3 <0,4,1,5>, <1,1,3,3>
+  2691326803U,	// <5,1,1,4>: Cost 3 vext3 <1,1,4,5>, <1,1,4,5>
+  2691400540U,	// <5,1,1,5>: Cost 3 vext3 <1,1,5,5>, <1,1,5,5>
+  3765216101U,	// <5,1,1,6>: Cost 4 vext3 <1,1,6,5>, <1,1,6,5>
+  3765289838U,	// <5,1,1,7>: Cost 4 vext3 <1,1,7,5>, <1,1,7,5>
+  2687124341U,	// <5,1,1,u>: Cost 3 vext3 <0,4,1,5>, <1,1,u,3>
+  3297641584U,	// <5,1,2,0>: Cost 4 vrev <1,5,0,2>
+  3763520391U,	// <5,1,2,1>: Cost 4 vext3 <0,u,1,5>, <1,2,1,3>
+  2646820456U,	// <5,1,2,2>: Cost 3 vext2 <4,u,5,1>, <2,2,2,2>
+  2687124374U,	// <5,1,2,3>: Cost 3 vext3 <0,4,1,5>, <1,2,3,0>
+  2691990436U,	// <5,1,2,4>: Cost 3 vext3 <1,2,4,5>, <1,2,4,5>
+  2687124395U,	// <5,1,2,5>: Cost 3 vext3 <0,4,1,5>, <1,2,5,3>
+  2646820794U,	// <5,1,2,6>: Cost 3 vext2 <4,u,5,1>, <2,6,3,7>
+  3808199610U,	// <5,1,2,7>: Cost 4 vext3 <u,3,4,5>, <1,2,7,0>
+  2687124419U,	// <5,1,2,u>: Cost 3 vext3 <0,4,1,5>, <1,2,u,0>
+  2577440870U,	// <5,1,3,0>: Cost 3 vext1 <4,5,1,3>, LHS
+  2687124440U,	// <5,1,3,1>: Cost 3 vext3 <0,4,1,5>, <1,3,1,3>
+  3759686627U,	// <5,1,3,2>: Cost 4 vext3 <0,2,3,5>, <1,3,2,5>
+  2692580332U,	// <5,1,3,3>: Cost 3 vext3 <1,3,3,5>, <1,3,3,5>
+  2687124469U,	// <5,1,3,4>: Cost 3 vext3 <0,4,1,5>, <1,3,4,5>
+  2685207552U,	// <5,1,3,5>: Cost 3 vext3 <0,1,2,5>, <1,3,5,7>
+  3760866313U,	// <5,1,3,6>: Cost 4 vext3 <0,4,1,5>, <1,3,6,7>
+  2692875280U,	// <5,1,3,7>: Cost 3 vext3 <1,3,7,5>, <1,3,7,5>
+  2687124503U,	// <5,1,3,u>: Cost 3 vext3 <0,4,1,5>, <1,3,u,3>
+  1567771538U,	// <5,1,4,0>: Cost 2 vext2 <4,0,5,1>, <4,0,5,1>
+  2693096491U,	// <5,1,4,1>: Cost 3 vext3 <1,4,1,5>, <1,4,1,5>
+  2693170228U,	// <5,1,4,2>: Cost 3 vext3 <1,4,2,5>, <1,4,2,5>
+  2687124541U,	// <5,1,4,3>: Cost 3 vext3 <0,4,1,5>, <1,4,3,5>
+  2646822096U,	// <5,1,4,4>: Cost 3 vext2 <4,u,5,1>, <4,4,4,4>
+  1573080374U,	// <5,1,4,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646822260U,	// <5,1,4,6>: Cost 3 vext2 <4,u,5,1>, <4,6,4,6>
+  3298174129U,	// <5,1,4,7>: Cost 4 vrev <1,5,7,4>
+  1573080602U,	// <5,1,4,u>: Cost 2 vext2 <4,u,5,1>, <4,u,5,1>
+  2687124591U,	// <5,1,5,0>: Cost 3 vext3 <0,4,1,5>, <1,5,0,1>
+  2646822543U,	// <5,1,5,1>: Cost 3 vext2 <4,u,5,1>, <5,1,0,1>
+  3760866433U,	// <5,1,5,2>: Cost 4 vext3 <0,4,1,5>, <1,5,2,1>
+  2687124624U,	// <5,1,5,3>: Cost 3 vext3 <0,4,1,5>, <1,5,3,7>
+  2687124631U,	// <5,1,5,4>: Cost 3 vext3 <0,4,1,5>, <1,5,4,5>
+  2646822916U,	// <5,1,5,5>: Cost 3 vext2 <4,u,5,1>, <5,5,5,5>
+  2646823010U,	// <5,1,5,6>: Cost 3 vext2 <4,u,5,1>, <5,6,7,0>
+  2646823080U,	// <5,1,5,7>: Cost 3 vext2 <4,u,5,1>, <5,7,5,7>
+  2687124663U,	// <5,1,5,u>: Cost 3 vext3 <0,4,1,5>, <1,5,u,1>
+  2553577574U,	// <5,1,6,0>: Cost 3 vext1 <0,5,1,6>, LHS
+  3763520719U,	// <5,1,6,1>: Cost 4 vext3 <0,u,1,5>, <1,6,1,7>
+  2646823418U,	// <5,1,6,2>: Cost 3 vext2 <4,u,5,1>, <6,2,7,3>
+  3760866529U,	// <5,1,6,3>: Cost 4 vext3 <0,4,1,5>, <1,6,3,7>
+  2553580854U,	// <5,1,6,4>: Cost 3 vext1 <0,5,1,6>, RHS
+  2687124723U,	// <5,1,6,5>: Cost 3 vext3 <0,4,1,5>, <1,6,5,7>
+  2646823736U,	// <5,1,6,6>: Cost 3 vext2 <4,u,5,1>, <6,6,6,6>
+  2646823758U,	// <5,1,6,7>: Cost 3 vext2 <4,u,5,1>, <6,7,0,1>
+  2646823839U,	// <5,1,6,u>: Cost 3 vext2 <4,u,5,1>, <6,u,0,1>
+  2559557734U,	// <5,1,7,0>: Cost 3 vext1 <1,5,1,7>, LHS
+  2559558452U,	// <5,1,7,1>: Cost 3 vext1 <1,5,1,7>, <1,1,1,1>
+  2571503270U,	// <5,1,7,2>: Cost 3 vext1 <3,5,1,7>, <2,3,0,1>
+  2040971366U,	// <5,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2559561014U,	// <5,1,7,4>: Cost 3 vext1 <1,5,1,7>, RHS
+  2595393232U,	// <5,1,7,5>: Cost 3 vext1 <7,5,1,7>, <5,1,7,3>
+  4188455035U,	// <5,1,7,6>: Cost 4 vtrnr RHS, <0,1,4,6>
+  2646824556U,	// <5,1,7,7>: Cost 3 vext2 <4,u,5,1>, <7,7,7,7>
+  2040971371U,	// <5,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1591662326U,	// <5,1,u,0>: Cost 2 vext2 <u,0,5,1>, <u,0,5,1>
+  1573082926U,	// <5,1,u,1>: Cost 2 vext2 <4,u,5,1>, LHS
+  2695824760U,	// <5,1,u,2>: Cost 3 vext3 <1,u,2,5>, <1,u,2,5>
+  2040979558U,	// <5,1,u,3>: Cost 2 vtrnr RHS, LHS
+  2687124874U,	// <5,1,u,4>: Cost 3 vext3 <0,4,1,5>, <1,u,4,5>
+  1573083290U,	// <5,1,u,5>: Cost 2 vext2 <4,u,5,1>, RHS
+  2646825168U,	// <5,1,u,6>: Cost 3 vext2 <4,u,5,1>, <u,6,3,7>
+  2646825216U,	// <5,1,u,7>: Cost 3 vext2 <4,u,5,1>, <u,7,0,1>
+  2040979563U,	// <5,1,u,u>: Cost 2 vtrnr RHS, LHS
+  3702652928U,	// <5,2,0,0>: Cost 4 vext2 <1,u,5,2>, <0,0,0,0>
+  2628911206U,	// <5,2,0,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2641518756U,	// <5,2,0,2>: Cost 3 vext2 <4,0,5,2>, <0,2,0,2>
+  3759760847U,	// <5,2,0,3>: Cost 4 vext3 <0,2,4,5>, <2,0,3,2>
+  3760866775U,	// <5,2,0,4>: Cost 4 vext3 <0,4,1,5>, <2,0,4,1>
+  3759539680U,	// <5,2,0,5>: Cost 4 vext3 <0,2,1,5>, <2,0,5,1>
+  3760866796U,	// <5,2,0,6>: Cost 4 vext3 <0,4,1,5>, <2,0,6,4>
+  3304114054U,	// <5,2,0,7>: Cost 4 vrev <2,5,7,0>
+  2628911773U,	// <5,2,0,u>: Cost 3 vext2 <1,u,5,2>, LHS
+  2623603464U,	// <5,2,1,0>: Cost 3 vext2 <1,0,5,2>, <1,0,5,2>
+  3698008921U,	// <5,2,1,1>: Cost 4 vext2 <1,1,5,2>, <1,1,5,2>
+  3633325603U,	// <5,2,1,2>: Cost 4 vext1 <1,5,2,1>, <2,1,3,5>
+  2687125027U,	// <5,2,1,3>: Cost 3 vext3 <0,4,1,5>, <2,1,3,5>
+  3633327414U,	// <5,2,1,4>: Cost 4 vext1 <1,5,2,1>, RHS
+  3759539760U,	// <5,2,1,5>: Cost 4 vext3 <0,2,1,5>, <2,1,5,0>
+  3760866876U,	// <5,2,1,6>: Cost 4 vext3 <0,4,1,5>, <2,1,6,3>
+  3304122247U,	// <5,2,1,7>: Cost 4 vrev <2,5,7,1>
+  2687125072U,	// <5,2,1,u>: Cost 3 vext3 <0,4,1,5>, <2,1,u,5>
+  3633332326U,	// <5,2,2,0>: Cost 4 vext1 <1,5,2,2>, LHS
+  3759760992U,	// <5,2,2,1>: Cost 4 vext3 <0,2,4,5>, <2,2,1,3>
+  2687125096U,	// <5,2,2,2>: Cost 3 vext3 <0,4,1,5>, <2,2,2,2>
+  2687125106U,	// <5,2,2,3>: Cost 3 vext3 <0,4,1,5>, <2,2,3,3>
+  2697963133U,	// <5,2,2,4>: Cost 3 vext3 <2,2,4,5>, <2,2,4,5>
+  3759466120U,	// <5,2,2,5>: Cost 4 vext3 <0,2,0,5>, <2,2,5,7>
+  3760866960U,	// <5,2,2,6>: Cost 4 vext3 <0,4,1,5>, <2,2,6,6>
+  3771926168U,	// <5,2,2,7>: Cost 4 vext3 <2,2,7,5>, <2,2,7,5>
+  2687125151U,	// <5,2,2,u>: Cost 3 vext3 <0,4,1,5>, <2,2,u,3>
+  2687125158U,	// <5,2,3,0>: Cost 3 vext3 <0,4,1,5>, <2,3,0,1>
+  2698405555U,	// <5,2,3,1>: Cost 3 vext3 <2,3,1,5>, <2,3,1,5>
+  2577516238U,	// <5,2,3,2>: Cost 3 vext1 <4,5,2,3>, <2,3,4,5>
+  3759687365U,	// <5,2,3,3>: Cost 4 vext3 <0,2,3,5>, <2,3,3,5>
+  1624884942U,	// <5,2,3,4>: Cost 2 vext3 <2,3,4,5>, <2,3,4,5>
+  2698700503U,	// <5,2,3,5>: Cost 3 vext3 <2,3,5,5>, <2,3,5,5>
+  3772368608U,	// <5,2,3,6>: Cost 4 vext3 <2,3,4,5>, <2,3,6,5>
+  3702655716U,	// <5,2,3,7>: Cost 4 vext2 <1,u,5,2>, <3,7,3,7>
+  1625179890U,	// <5,2,3,u>: Cost 2 vext3 <2,3,u,5>, <2,3,u,5>
+  2641521555U,	// <5,2,4,0>: Cost 3 vext2 <4,0,5,2>, <4,0,5,2>
+  3772368642U,	// <5,2,4,1>: Cost 4 vext3 <2,3,4,5>, <2,4,1,3>
+  2699142925U,	// <5,2,4,2>: Cost 3 vext3 <2,4,2,5>, <2,4,2,5>
+  2698626838U,	// <5,2,4,3>: Cost 3 vext3 <2,3,4,5>, <2,4,3,5>
+  2698626848U,	// <5,2,4,4>: Cost 3 vext3 <2,3,4,5>, <2,4,4,6>
+  2628914486U,	// <5,2,4,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2645503353U,	// <5,2,4,6>: Cost 3 vext2 <4,6,5,2>, <4,6,5,2>
+  3304146826U,	// <5,2,4,7>: Cost 4 vrev <2,5,7,4>
+  2628914729U,	// <5,2,4,u>: Cost 3 vext2 <1,u,5,2>, RHS
+  2553643110U,	// <5,2,5,0>: Cost 3 vext1 <0,5,2,5>, LHS
+  3758950227U,	// <5,2,5,1>: Cost 4 vext3 <0,1,2,5>, <2,5,1,3>
+  3759761248U,	// <5,2,5,2>: Cost 4 vext3 <0,2,4,5>, <2,5,2,7>
+  2982396006U,	// <5,2,5,3>: Cost 3 vzipr <4,u,5,5>, LHS
+  2553646390U,	// <5,2,5,4>: Cost 3 vext1 <0,5,2,5>, RHS
+  2553647108U,	// <5,2,5,5>: Cost 3 vext1 <0,5,2,5>, <5,5,5,5>
+  3760867204U,	// <5,2,5,6>: Cost 4 vext3 <0,4,1,5>, <2,5,6,7>
+  3702657141U,	// <5,2,5,7>: Cost 4 vext2 <1,u,5,2>, <5,7,0,1>
+  2982396011U,	// <5,2,5,u>: Cost 3 vzipr <4,u,5,5>, LHS
+  3627393126U,	// <5,2,6,0>: Cost 4 vext1 <0,5,2,6>, LHS
+  3760867236U,	// <5,2,6,1>: Cost 4 vext3 <0,4,1,5>, <2,6,1,3>
+  2645504506U,	// <5,2,6,2>: Cost 3 vext2 <4,6,5,2>, <6,2,7,3>
+  2687125434U,	// <5,2,6,3>: Cost 3 vext3 <0,4,1,5>, <2,6,3,7>
+  2700617665U,	// <5,2,6,4>: Cost 3 vext3 <2,6,4,5>, <2,6,4,5>
+  3760867276U,	// <5,2,6,5>: Cost 4 vext3 <0,4,1,5>, <2,6,5,7>
+  3763521493U,	// <5,2,6,6>: Cost 4 vext3 <0,u,1,5>, <2,6,6,7>
+  3719246670U,	// <5,2,6,7>: Cost 4 vext2 <4,6,5,2>, <6,7,0,1>
+  2687125479U,	// <5,2,6,u>: Cost 3 vext3 <0,4,1,5>, <2,6,u,7>
+  2565603430U,	// <5,2,7,0>: Cost 3 vext1 <2,5,2,7>, LHS
+  2553660150U,	// <5,2,7,1>: Cost 3 vext1 <0,5,2,7>, <1,0,3,2>
+  2565605216U,	// <5,2,7,2>: Cost 3 vext1 <2,5,2,7>, <2,5,2,7>
+  2961178726U,	// <5,2,7,3>: Cost 3 vzipr <1,3,5,7>, LHS
+  2565606710U,	// <5,2,7,4>: Cost 3 vext1 <2,5,2,7>, RHS
+  4034920552U,	// <5,2,7,5>: Cost 4 vzipr <1,3,5,7>, <0,1,2,5>
+  3114713292U,	// <5,2,7,6>: Cost 3 vtrnr RHS, <0,2,4,6>
+  3702658668U,	// <5,2,7,7>: Cost 4 vext2 <1,u,5,2>, <7,7,7,7>
+  2961178731U,	// <5,2,7,u>: Cost 3 vzipr <1,3,5,7>, LHS
+  2687125563U,	// <5,2,u,0>: Cost 3 vext3 <0,4,1,5>, <2,u,0,1>
+  2628917038U,	// <5,2,u,1>: Cost 3 vext2 <1,u,5,2>, LHS
+  2565613409U,	// <5,2,u,2>: Cost 3 vext1 <2,5,2,u>, <2,5,2,u>
+  2687125592U,	// <5,2,u,3>: Cost 3 vext3 <0,4,1,5>, <2,u,3,3>
+  1628203107U,	// <5,2,u,4>: Cost 2 vext3 <2,u,4,5>, <2,u,4,5>
+  2628917402U,	// <5,2,u,5>: Cost 3 vext2 <1,u,5,2>, RHS
+  2702092405U,	// <5,2,u,6>: Cost 3 vext3 <2,u,6,5>, <2,u,6,5>
+  3304179598U,	// <5,2,u,7>: Cost 4 vrev <2,5,7,u>
+  1628498055U,	// <5,2,u,u>: Cost 2 vext3 <2,u,u,5>, <2,u,u,5>
+  3760867467U,	// <5,3,0,0>: Cost 4 vext3 <0,4,1,5>, <3,0,0,0>
+  2687125654U,	// <5,3,0,1>: Cost 3 vext3 <0,4,1,5>, <3,0,1,2>
+  3759761565U,	// <5,3,0,2>: Cost 4 vext3 <0,2,4,5>, <3,0,2,0>
+  3633391766U,	// <5,3,0,3>: Cost 4 vext1 <1,5,3,0>, <3,0,1,2>
+  2687125680U,	// <5,3,0,4>: Cost 3 vext3 <0,4,1,5>, <3,0,4,1>
+  3760277690U,	// <5,3,0,5>: Cost 4 vext3 <0,3,2,5>, <3,0,5,2>
+  3310013014U,	// <5,3,0,6>: Cost 4 vrev <3,5,6,0>
+  2236344927U,	// <5,3,0,7>: Cost 3 vrev <3,5,7,0>
+  2687125717U,	// <5,3,0,u>: Cost 3 vext3 <0,4,1,5>, <3,0,u,2>
+  3760867551U,	// <5,3,1,0>: Cost 4 vext3 <0,4,1,5>, <3,1,0,3>
+  3760867558U,	// <5,3,1,1>: Cost 4 vext3 <0,4,1,5>, <3,1,1,1>
+  2624938923U,	// <5,3,1,2>: Cost 3 vext2 <1,2,5,3>, <1,2,5,3>
+  2703198460U,	// <5,3,1,3>: Cost 3 vext3 <3,1,3,5>, <3,1,3,5>
+  3760867587U,	// <5,3,1,4>: Cost 4 vext3 <0,4,1,5>, <3,1,4,3>
+  2636219536U,	// <5,3,1,5>: Cost 3 vext2 <3,1,5,3>, <1,5,3,7>
+  3698681075U,	// <5,3,1,6>: Cost 4 vext2 <1,2,5,3>, <1,6,5,7>
+  2703493408U,	// <5,3,1,7>: Cost 3 vext3 <3,1,7,5>, <3,1,7,5>
+  2628920721U,	// <5,3,1,u>: Cost 3 vext2 <1,u,5,3>, <1,u,5,3>
+  3766765870U,	// <5,3,2,0>: Cost 4 vext3 <1,4,0,5>, <3,2,0,1>
+  3698681379U,	// <5,3,2,1>: Cost 4 vext2 <1,2,5,3>, <2,1,3,5>
+  3760867649U,	// <5,3,2,2>: Cost 4 vext3 <0,4,1,5>, <3,2,2,2>
+  2698627404U,	// <5,3,2,3>: Cost 3 vext3 <2,3,4,5>, <3,2,3,4>
+  2703935830U,	// <5,3,2,4>: Cost 3 vext3 <3,2,4,5>, <3,2,4,5>
+  2698627422U,	// <5,3,2,5>: Cost 3 vext3 <2,3,4,5>, <3,2,5,4>
+  3760867686U,	// <5,3,2,6>: Cost 4 vext3 <0,4,1,5>, <3,2,6,3>
+  3769788783U,	// <5,3,2,7>: Cost 4 vext3 <1,u,5,5>, <3,2,7,3>
+  2701945209U,	// <5,3,2,u>: Cost 3 vext3 <2,u,4,5>, <3,2,u,4>
+  3760867711U,	// <5,3,3,0>: Cost 4 vext3 <0,4,1,5>, <3,3,0,1>
+  2636220684U,	// <5,3,3,1>: Cost 3 vext2 <3,1,5,3>, <3,1,5,3>
+  3772369298U,	// <5,3,3,2>: Cost 4 vext3 <2,3,4,5>, <3,3,2,2>
+  2687125916U,	// <5,3,3,3>: Cost 3 vext3 <0,4,1,5>, <3,3,3,3>
+  2704599463U,	// <5,3,3,4>: Cost 3 vext3 <3,3,4,5>, <3,3,4,5>
+  2704673200U,	// <5,3,3,5>: Cost 3 vext3 <3,3,5,5>, <3,3,5,5>
+  3709962935U,	// <5,3,3,6>: Cost 4 vext2 <3,1,5,3>, <3,6,7,7>
+  3772369346U,	// <5,3,3,7>: Cost 4 vext3 <2,3,4,5>, <3,3,7,5>
+  2704894411U,	// <5,3,3,u>: Cost 3 vext3 <3,3,u,5>, <3,3,u,5>
+  2704968148U,	// <5,3,4,0>: Cost 3 vext3 <3,4,0,5>, <3,4,0,5>
+  3698682850U,	// <5,3,4,1>: Cost 4 vext2 <1,2,5,3>, <4,1,5,0>
+  2642857014U,	// <5,3,4,2>: Cost 3 vext2 <4,2,5,3>, <4,2,5,3>
+  2705189359U,	// <5,3,4,3>: Cost 3 vext3 <3,4,3,5>, <3,4,3,5>
+  2705263096U,	// <5,3,4,4>: Cost 3 vext3 <3,4,4,5>, <3,4,4,5>
+  2685946370U,	// <5,3,4,5>: Cost 3 vext3 <0,2,3,5>, <3,4,5,6>
+  3779152394U,	// <5,3,4,6>: Cost 4 vext3 <3,4,6,5>, <3,4,6,5>
+  2236377699U,	// <5,3,4,7>: Cost 3 vrev <3,5,7,4>
+  2687126045U,	// <5,3,4,u>: Cost 3 vext3 <0,4,1,5>, <3,4,u,6>
+  2571632742U,	// <5,3,5,0>: Cost 3 vext1 <3,5,3,5>, LHS
+  2559689870U,	// <5,3,5,1>: Cost 3 vext1 <1,5,3,5>, <1,5,3,5>
+  2571634382U,	// <5,3,5,2>: Cost 3 vext1 <3,5,3,5>, <2,3,4,5>
+  2571635264U,	// <5,3,5,3>: Cost 3 vext1 <3,5,3,5>, <3,5,3,5>
+  2571636022U,	// <5,3,5,4>: Cost 3 vext1 <3,5,3,5>, RHS
+  2559692804U,	// <5,3,5,5>: Cost 3 vext1 <1,5,3,5>, <5,5,5,5>
+  3720581218U,	// <5,3,5,6>: Cost 4 vext2 <4,u,5,3>, <5,6,7,0>
+  2236385892U,	// <5,3,5,7>: Cost 3 vrev <3,5,7,5>
+  2571638574U,	// <5,3,5,u>: Cost 3 vext1 <3,5,3,5>, LHS
+  2565668966U,	// <5,3,6,0>: Cost 3 vext1 <2,5,3,6>, LHS
+  3633439887U,	// <5,3,6,1>: Cost 4 vext1 <1,5,3,6>, <1,5,3,6>
+  2565670760U,	// <5,3,6,2>: Cost 3 vext1 <2,5,3,6>, <2,5,3,6>
+  2565671426U,	// <5,3,6,3>: Cost 3 vext1 <2,5,3,6>, <3,4,5,6>
+  2565672246U,	// <5,3,6,4>: Cost 3 vext1 <2,5,3,6>, RHS
+  3639414630U,	// <5,3,6,5>: Cost 4 vext1 <2,5,3,6>, <5,3,6,0>
+  4047521640U,	// <5,3,6,6>: Cost 4 vzipr <3,4,5,6>, <2,5,3,6>
+  2725169844U,	// <5,3,6,7>: Cost 3 vext3 <6,7,4,5>, <3,6,7,4>
+  2565674798U,	// <5,3,6,u>: Cost 3 vext1 <2,5,3,6>, LHS
+  1485963366U,	// <5,3,7,0>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485964432U,	// <5,3,7,1>: Cost 2 vext1 <1,5,3,7>, <1,5,3,7>
+  2559706728U,	// <5,3,7,2>: Cost 3 vext1 <1,5,3,7>, <2,2,2,2>
+  2559707286U,	// <5,3,7,3>: Cost 3 vext1 <1,5,3,7>, <3,0,1,2>
+  1485966646U,	// <5,3,7,4>: Cost 2 vext1 <1,5,3,7>, RHS
+  2559708880U,	// <5,3,7,5>: Cost 3 vext1 <1,5,3,7>, <5,1,7,3>
+  2601513466U,	// <5,3,7,6>: Cost 3 vext1 <u,5,3,7>, <6,2,7,3>
+  3114714112U,	// <5,3,7,7>: Cost 3 vtrnr RHS, <1,3,5,7>
+  1485969198U,	// <5,3,7,u>: Cost 2 vext1 <1,5,3,7>, LHS
+  1485971558U,	// <5,3,u,0>: Cost 2 vext1 <1,5,3,u>, LHS
+  1485972625U,	// <5,3,u,1>: Cost 2 vext1 <1,5,3,u>, <1,5,3,u>
+  2559714920U,	// <5,3,u,2>: Cost 3 vext1 <1,5,3,u>, <2,2,2,2>
+  2559715478U,	// <5,3,u,3>: Cost 3 vext1 <1,5,3,u>, <3,0,1,2>
+  1485974838U,	// <5,3,u,4>: Cost 2 vext1 <1,5,3,u>, RHS
+  2687126342U,	// <5,3,u,5>: Cost 3 vext3 <0,4,1,5>, <3,u,5,6>
+  2601521658U,	// <5,3,u,6>: Cost 3 vext1 <u,5,3,u>, <6,2,7,3>
+  2236410471U,	// <5,3,u,7>: Cost 3 vrev <3,5,7,u>
+  1485977390U,	// <5,3,u,u>: Cost 2 vext1 <1,5,3,u>, LHS
+  3627491430U,	// <5,4,0,0>: Cost 4 vext1 <0,5,4,0>, LHS
+  2636890214U,	// <5,4,0,1>: Cost 3 vext2 <3,2,5,4>, LHS
+  3703333028U,	// <5,4,0,2>: Cost 4 vext2 <2,0,5,4>, <0,2,0,2>
+  3782249348U,	// <5,4,0,3>: Cost 4 vext3 <4,0,3,5>, <4,0,3,5>
+  2642198866U,	// <5,4,0,4>: Cost 3 vext2 <4,1,5,4>, <0,4,1,5>
+  2687126418U,	// <5,4,0,5>: Cost 3 vext3 <0,4,1,5>, <4,0,5,1>
+  2242243887U,	// <5,4,0,6>: Cost 3 vrev <4,5,6,0>
+  3316059448U,	// <5,4,0,7>: Cost 4 vrev <4,5,7,0>
+  2636890781U,	// <5,4,0,u>: Cost 3 vext2 <3,2,5,4>, LHS
+  2241809658U,	// <5,4,1,0>: Cost 3 vrev <4,5,0,1>
+  3698025307U,	// <5,4,1,1>: Cost 4 vext2 <1,1,5,4>, <1,1,5,4>
+  3698688940U,	// <5,4,1,2>: Cost 4 vext2 <1,2,5,4>, <1,2,5,4>
+  3698689024U,	// <5,4,1,3>: Cost 4 vext2 <1,2,5,4>, <1,3,5,7>
+  3700016206U,	// <5,4,1,4>: Cost 4 vext2 <1,4,5,4>, <1,4,5,4>
+  2687126498U,	// <5,4,1,5>: Cost 3 vext3 <0,4,1,5>, <4,1,5,0>
+  3760868336U,	// <5,4,1,6>: Cost 4 vext3 <0,4,1,5>, <4,1,6,5>
+  3316067641U,	// <5,4,1,7>: Cost 4 vrev <4,5,7,1>
+  2242399554U,	// <5,4,1,u>: Cost 3 vrev <4,5,u,1>
+  3703334371U,	// <5,4,2,0>: Cost 4 vext2 <2,0,5,4>, <2,0,5,4>
+  3703998004U,	// <5,4,2,1>: Cost 4 vext2 <2,1,5,4>, <2,1,5,4>
+  3704661637U,	// <5,4,2,2>: Cost 4 vext2 <2,2,5,4>, <2,2,5,4>
+  2636891854U,	// <5,4,2,3>: Cost 3 vext2 <3,2,5,4>, <2,3,4,5>
+  3705988903U,	// <5,4,2,4>: Cost 4 vext2 <2,4,5,4>, <2,4,5,4>
+  2698628150U,	// <5,4,2,5>: Cost 3 vext3 <2,3,4,5>, <4,2,5,3>
+  3760868415U,	// <5,4,2,6>: Cost 4 vext3 <0,4,1,5>, <4,2,6,3>
+  3783871562U,	// <5,4,2,7>: Cost 4 vext3 <4,2,7,5>, <4,2,7,5>
+  2666752099U,	// <5,4,2,u>: Cost 3 vext2 <u,2,5,4>, <2,u,4,5>
+  3639459942U,	// <5,4,3,0>: Cost 4 vext1 <2,5,4,3>, LHS
+  3709970701U,	// <5,4,3,1>: Cost 4 vext2 <3,1,5,4>, <3,1,5,4>
+  2636892510U,	// <5,4,3,2>: Cost 3 vext2 <3,2,5,4>, <3,2,5,4>
+  3710634396U,	// <5,4,3,3>: Cost 4 vext2 <3,2,5,4>, <3,3,3,3>
+  2638219776U,	// <5,4,3,4>: Cost 3 vext2 <3,4,5,4>, <3,4,5,4>
+  3766987908U,	// <5,4,3,5>: Cost 4 vext3 <1,4,3,5>, <4,3,5,0>
+  2710719634U,	// <5,4,3,6>: Cost 3 vext3 <4,3,6,5>, <4,3,6,5>
+  3914097664U,	// <5,4,3,7>: Cost 4 vuzpr <3,5,7,4>, <1,3,5,7>
+  2640874308U,	// <5,4,3,u>: Cost 3 vext2 <3,u,5,4>, <3,u,5,4>
+  2583642214U,	// <5,4,4,0>: Cost 3 vext1 <5,5,4,4>, LHS
+  2642201574U,	// <5,4,4,1>: Cost 3 vext2 <4,1,5,4>, <4,1,5,4>
+  3710635062U,	// <5,4,4,2>: Cost 4 vext2 <3,2,5,4>, <4,2,5,3>
+  3717270664U,	// <5,4,4,3>: Cost 4 vext2 <4,3,5,4>, <4,3,5,4>
+  2713963728U,	// <5,4,4,4>: Cost 3 vext3 <4,u,5,5>, <4,4,4,4>
+  1637567706U,	// <5,4,4,5>: Cost 2 vext3 <4,4,5,5>, <4,4,5,5>
+  2242276659U,	// <5,4,4,6>: Cost 3 vrev <4,5,6,4>
+  2646183372U,	// <5,4,4,7>: Cost 3 vext2 <4,7,5,4>, <4,7,5,4>
+  1637788917U,	// <5,4,4,u>: Cost 2 vext3 <4,4,u,5>, <4,4,u,5>
+  2559762534U,	// <5,4,5,0>: Cost 3 vext1 <1,5,4,5>, LHS
+  2559763607U,	// <5,4,5,1>: Cost 3 vext1 <1,5,4,5>, <1,5,4,5>
+  2698628366U,	// <5,4,5,2>: Cost 3 vext3 <2,3,4,5>, <4,5,2,3>
+  3633506454U,	// <5,4,5,3>: Cost 4 vext1 <1,5,4,5>, <3,0,1,2>
+  2559765814U,	// <5,4,5,4>: Cost 3 vext1 <1,5,4,5>, RHS
+  2583654395U,	// <5,4,5,5>: Cost 3 vext1 <5,5,4,5>, <5,5,4,5>
+  1613385014U,	// <5,4,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  3901639990U,	// <5,4,5,7>: Cost 4 vuzpr <1,5,0,4>, RHS
+  1613385032U,	// <5,4,5,u>: Cost 2 vext3 <0,4,1,5>, RHS
+  2559770726U,	// <5,4,6,0>: Cost 3 vext1 <1,5,4,6>, LHS
+  2559771648U,	// <5,4,6,1>: Cost 3 vext1 <1,5,4,6>, <1,3,5,7>
+  3633514088U,	// <5,4,6,2>: Cost 4 vext1 <1,5,4,6>, <2,2,2,2>
+  2571717122U,	// <5,4,6,3>: Cost 3 vext1 <3,5,4,6>, <3,4,5,6>
+  2559774006U,	// <5,4,6,4>: Cost 3 vext1 <1,5,4,6>, RHS
+  2712636796U,	// <5,4,6,5>: Cost 3 vext3 <4,6,5,5>, <4,6,5,5>
+  3760868743U,	// <5,4,6,6>: Cost 4 vext3 <0,4,1,5>, <4,6,6,7>
+  2712784270U,	// <5,4,6,7>: Cost 3 vext3 <4,6,7,5>, <4,6,7,5>
+  2559776558U,	// <5,4,6,u>: Cost 3 vext1 <1,5,4,6>, LHS
+  2565750886U,	// <5,4,7,0>: Cost 3 vext1 <2,5,4,7>, LHS
+  2565751706U,	// <5,4,7,1>: Cost 3 vext1 <2,5,4,7>, <1,2,3,4>
+  2565752690U,	// <5,4,7,2>: Cost 3 vext1 <2,5,4,7>, <2,5,4,7>
+  2571725387U,	// <5,4,7,3>: Cost 3 vext1 <3,5,4,7>, <3,5,4,7>
+  2565754166U,	// <5,4,7,4>: Cost 3 vext1 <2,5,4,7>, RHS
+  3114713426U,	// <5,4,7,5>: Cost 3 vtrnr RHS, <0,4,1,5>
+  94817590U,	// <5,4,7,6>: Cost 1 vrev RHS
+  2595616175U,	// <5,4,7,7>: Cost 3 vext1 <7,5,4,7>, <7,5,4,7>
+  94965064U,	// <5,4,7,u>: Cost 1 vrev RHS
+  2559787110U,	// <5,4,u,0>: Cost 3 vext1 <1,5,4,u>, LHS
+  2559788186U,	// <5,4,u,1>: Cost 3 vext1 <1,5,4,u>, <1,5,4,u>
+  2242014483U,	// <5,4,u,2>: Cost 3 vrev <4,5,2,u>
+  2667419628U,	// <5,4,u,3>: Cost 3 vext2 <u,3,5,4>, <u,3,5,4>
+  2559790390U,	// <5,4,u,4>: Cost 3 vext1 <1,5,4,u>, RHS
+  1640222238U,	// <5,4,u,5>: Cost 2 vext3 <4,u,5,5>, <4,u,5,5>
+  94825783U,	// <5,4,u,6>: Cost 1 vrev RHS
+  2714111536U,	// <5,4,u,7>: Cost 3 vext3 <4,u,7,5>, <4,u,7,5>
+  94973257U,	// <5,4,u,u>: Cost 1 vrev RHS
+  2646851584U,	// <5,5,0,0>: Cost 3 vext2 <4,u,5,5>, <0,0,0,0>
+  1573109862U,	// <5,5,0,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646851748U,	// <5,5,0,2>: Cost 3 vext2 <4,u,5,5>, <0,2,0,2>
+  3760279130U,	// <5,5,0,3>: Cost 4 vext3 <0,3,2,5>, <5,0,3,2>
+  2687127138U,	// <5,5,0,4>: Cost 3 vext3 <0,4,1,5>, <5,0,4,1>
+  2248142847U,	// <5,5,0,5>: Cost 3 vrev <5,5,5,0>
+  3720593910U,	// <5,5,0,6>: Cost 4 vext2 <4,u,5,5>, <0,6,1,7>
+  4182502710U,	// <5,5,0,7>: Cost 4 vtrnr <3,5,7,0>, RHS
+  1573110429U,	// <5,5,0,u>: Cost 2 vext2 <4,u,5,5>, LHS
+  2646852342U,	// <5,5,1,0>: Cost 3 vext2 <4,u,5,5>, <1,0,3,2>
+  2624291676U,	// <5,5,1,1>: Cost 3 vext2 <1,1,5,5>, <1,1,5,5>
+  2646852502U,	// <5,5,1,2>: Cost 3 vext2 <4,u,5,5>, <1,2,3,0>
+  2646852568U,	// <5,5,1,3>: Cost 3 vext2 <4,u,5,5>, <1,3,1,3>
+  2715217591U,	// <5,5,1,4>: Cost 3 vext3 <5,1,4,5>, <5,1,4,5>
+  2628936848U,	// <5,5,1,5>: Cost 3 vext2 <1,u,5,5>, <1,5,3,7>
+  3698033907U,	// <5,5,1,6>: Cost 4 vext2 <1,1,5,5>, <1,6,5,7>
+  2713964240U,	// <5,5,1,7>: Cost 3 vext3 <4,u,5,5>, <5,1,7,3>
+  2628937107U,	// <5,5,1,u>: Cost 3 vext2 <1,u,5,5>, <1,u,5,5>
+  3645497446U,	// <5,5,2,0>: Cost 4 vext1 <3,5,5,2>, LHS
+  3760869099U,	// <5,5,2,1>: Cost 4 vext3 <0,4,1,5>, <5,2,1,3>
+  2646853224U,	// <5,5,2,2>: Cost 3 vext2 <4,u,5,5>, <2,2,2,2>
+  2698628862U,	// <5,5,2,3>: Cost 3 vext3 <2,3,4,5>, <5,2,3,4>
+  3772370694U,	// <5,5,2,4>: Cost 4 vext3 <2,3,4,5>, <5,2,4,3>
+  2713964303U,	// <5,5,2,5>: Cost 3 vext3 <4,u,5,5>, <5,2,5,3>
+  2646853562U,	// <5,5,2,6>: Cost 3 vext2 <4,u,5,5>, <2,6,3,7>
+  4038198272U,	// <5,5,2,7>: Cost 4 vzipr <1,u,5,2>, <1,3,5,7>
+  2701946667U,	// <5,5,2,u>: Cost 3 vext3 <2,u,4,5>, <5,2,u,4>
+  2646853782U,	// <5,5,3,0>: Cost 3 vext2 <4,u,5,5>, <3,0,1,2>
+  3698034922U,	// <5,5,3,1>: Cost 4 vext2 <1,1,5,5>, <3,1,1,5>
+  3702679919U,	// <5,5,3,2>: Cost 4 vext2 <1,u,5,5>, <3,2,7,3>
+  2637564336U,	// <5,5,3,3>: Cost 3 vext2 <3,3,5,5>, <3,3,5,5>
+  2646854146U,	// <5,5,3,4>: Cost 3 vext2 <4,u,5,5>, <3,4,5,6>
+  2638891602U,	// <5,5,3,5>: Cost 3 vext2 <3,5,5,5>, <3,5,5,5>
+  3702680247U,	// <5,5,3,6>: Cost 4 vext2 <1,u,5,5>, <3,6,7,7>
+  3702680259U,	// <5,5,3,7>: Cost 4 vext2 <1,u,5,5>, <3,7,0,1>
+  2646854430U,	// <5,5,3,u>: Cost 3 vext2 <4,u,5,5>, <3,u,1,2>
+  2646854546U,	// <5,5,4,0>: Cost 3 vext2 <4,u,5,5>, <4,0,5,1>
+  2642209767U,	// <5,5,4,1>: Cost 3 vext2 <4,1,5,5>, <4,1,5,5>
+  3711306806U,	// <5,5,4,2>: Cost 4 vext2 <3,3,5,5>, <4,2,5,3>
+  3645516369U,	// <5,5,4,3>: Cost 4 vext1 <3,5,5,4>, <3,5,5,4>
+  1570458842U,	// <5,5,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1573113142U,	// <5,5,4,5>: Cost 2 vext2 <4,u,5,5>, RHS
+  2645527932U,	// <5,5,4,6>: Cost 3 vext2 <4,6,5,5>, <4,6,5,5>
+  2713964486U,	// <5,5,4,7>: Cost 3 vext3 <4,u,5,5>, <5,4,7,6>
+  1573113374U,	// <5,5,4,u>: Cost 2 vext2 <4,u,5,5>, <4,u,5,5>
+  1509982310U,	// <5,5,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2646855376U,	// <5,5,5,1>: Cost 3 vext2 <4,u,5,5>, <5,1,7,3>
+  2583725672U,	// <5,5,5,2>: Cost 3 vext1 <5,5,5,5>, <2,2,2,2>
+  2583726230U,	// <5,5,5,3>: Cost 3 vext1 <5,5,5,5>, <3,0,1,2>
+  1509985590U,	// <5,5,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U,	// <5,5,5,5>: Cost 1 vdup1 RHS
+  2646855778U,	// <5,5,5,6>: Cost 3 vext2 <4,u,5,5>, <5,6,7,0>
+  2646855848U,	// <5,5,5,7>: Cost 3 vext2 <4,u,5,5>, <5,7,5,7>
+  229035318U,	// <5,5,5,u>: Cost 1 vdup1 RHS
+  2577760358U,	// <5,5,6,0>: Cost 3 vext1 <4,5,5,6>, LHS
+  3633587361U,	// <5,5,6,1>: Cost 4 vext1 <1,5,5,6>, <1,5,5,6>
+  2646856186U,	// <5,5,6,2>: Cost 3 vext2 <4,u,5,5>, <6,2,7,3>
+  3633588738U,	// <5,5,6,3>: Cost 4 vext1 <1,5,5,6>, <3,4,5,6>
+  2718535756U,	// <5,5,6,4>: Cost 3 vext3 <5,6,4,5>, <5,6,4,5>
+  2644202223U,	// <5,5,6,5>: Cost 3 vext2 <4,4,5,5>, <6,5,7,5>
+  2973780482U,	// <5,5,6,6>: Cost 3 vzipr <3,4,5,6>, <3,4,5,6>
+  2646856526U,	// <5,5,6,7>: Cost 3 vext2 <4,u,5,5>, <6,7,0,1>
+  2646856607U,	// <5,5,6,u>: Cost 3 vext2 <4,u,5,5>, <6,u,0,1>
+  2571796582U,	// <5,5,7,0>: Cost 3 vext1 <3,5,5,7>, LHS
+  3633595392U,	// <5,5,7,1>: Cost 4 vext1 <1,5,5,7>, <1,3,5,7>
+  2571798222U,	// <5,5,7,2>: Cost 3 vext1 <3,5,5,7>, <2,3,4,5>
+  2571799124U,	// <5,5,7,3>: Cost 3 vext1 <3,5,5,7>, <3,5,5,7>
+  2571799862U,	// <5,5,7,4>: Cost 3 vext1 <3,5,5,7>, RHS
+  3114717188U,	// <5,5,7,5>: Cost 3 vtrnr RHS, <5,5,5,5>
+  4034923010U,	// <5,5,7,6>: Cost 4 vzipr <1,3,5,7>, <3,4,5,6>
+  2040974646U,	// <5,5,7,7>: Cost 2 vtrnr RHS, RHS
+  2040974647U,	// <5,5,7,u>: Cost 2 vtrnr RHS, RHS
+  1509982310U,	// <5,5,u,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  1573115694U,	// <5,5,u,1>: Cost 2 vext2 <4,u,5,5>, LHS
+  2571806414U,	// <5,5,u,2>: Cost 3 vext1 <3,5,5,u>, <2,3,4,5>
+  2571807317U,	// <5,5,u,3>: Cost 3 vext1 <3,5,5,u>, <3,5,5,u>
+  1509985590U,	// <5,5,u,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U,	// <5,5,u,5>: Cost 1 vdup1 RHS
+  2646857936U,	// <5,5,u,6>: Cost 3 vext2 <4,u,5,5>, <u,6,3,7>
+  2040982838U,	// <5,5,u,7>: Cost 2 vtrnr RHS, RHS
+  229035318U,	// <5,5,u,u>: Cost 1 vdup1 RHS
+  2638233600U,	// <5,6,0,0>: Cost 3 vext2 <3,4,5,6>, <0,0,0,0>
+  1564491878U,	// <5,6,0,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  2632261796U,	// <5,6,0,2>: Cost 3 vext2 <2,4,5,6>, <0,2,0,2>
+  2638233856U,	// <5,6,0,3>: Cost 3 vext2 <3,4,5,6>, <0,3,1,4>
+  2638233938U,	// <5,6,0,4>: Cost 3 vext2 <3,4,5,6>, <0,4,1,5>
+  3706003885U,	// <5,6,0,5>: Cost 4 vext2 <2,4,5,6>, <0,5,2,6>
+  3706003967U,	// <5,6,0,6>: Cost 4 vext2 <2,4,5,6>, <0,6,2,7>
+  4047473974U,	// <5,6,0,7>: Cost 4 vzipr <3,4,5,0>, RHS
+  1564492445U,	// <5,6,0,u>: Cost 2 vext2 <3,4,5,6>, LHS
+  2638234358U,	// <5,6,1,0>: Cost 3 vext2 <3,4,5,6>, <1,0,3,2>
+  2638234420U,	// <5,6,1,1>: Cost 3 vext2 <3,4,5,6>, <1,1,1,1>
+  2638234518U,	// <5,6,1,2>: Cost 3 vext2 <3,4,5,6>, <1,2,3,0>
+  2638234584U,	// <5,6,1,3>: Cost 3 vext2 <3,4,5,6>, <1,3,1,3>
+  2626290768U,	// <5,6,1,4>: Cost 3 vext2 <1,4,5,6>, <1,4,5,6>
+  2638234768U,	// <5,6,1,5>: Cost 3 vext2 <3,4,5,6>, <1,5,3,7>
+  3700032719U,	// <5,6,1,6>: Cost 4 vext2 <1,4,5,6>, <1,6,1,7>
+  2982366518U,	// <5,6,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  2628945300U,	// <5,6,1,u>: Cost 3 vext2 <1,u,5,6>, <1,u,5,6>
+  3706004925U,	// <5,6,2,0>: Cost 4 vext2 <2,4,5,6>, <2,0,1,2>
+  3711976966U,	// <5,6,2,1>: Cost 4 vext2 <3,4,5,6>, <2,1,0,3>
+  2638235240U,	// <5,6,2,2>: Cost 3 vext2 <3,4,5,6>, <2,2,2,2>
+  2638235302U,	// <5,6,2,3>: Cost 3 vext2 <3,4,5,6>, <2,3,0,1>
+  2632263465U,	// <5,6,2,4>: Cost 3 vext2 <2,4,5,6>, <2,4,5,6>
+  2638235496U,	// <5,6,2,5>: Cost 3 vext2 <3,4,5,6>, <2,5,3,6>
+  2638235578U,	// <5,6,2,6>: Cost 3 vext2 <3,4,5,6>, <2,6,3,7>
+  2713965050U,	// <5,6,2,7>: Cost 3 vext3 <4,u,5,5>, <6,2,7,3>
+  2634917997U,	// <5,6,2,u>: Cost 3 vext2 <2,u,5,6>, <2,u,5,6>
+  2638235798U,	// <5,6,3,0>: Cost 3 vext2 <3,4,5,6>, <3,0,1,2>
+  3711977695U,	// <5,6,3,1>: Cost 4 vext2 <3,4,5,6>, <3,1,0,3>
+  3710650720U,	// <5,6,3,2>: Cost 4 vext2 <3,2,5,6>, <3,2,5,6>
+  2638236060U,	// <5,6,3,3>: Cost 3 vext2 <3,4,5,6>, <3,3,3,3>
+  1564494338U,	// <5,6,3,4>: Cost 2 vext2 <3,4,5,6>, <3,4,5,6>
+  2638236234U,	// <5,6,3,5>: Cost 3 vext2 <3,4,5,6>, <3,5,4,6>
+  3711978104U,	// <5,6,3,6>: Cost 4 vext2 <3,4,5,6>, <3,6,0,7>
+  4034227510U,	// <5,6,3,7>: Cost 4 vzipr <1,2,5,3>, RHS
+  1567148870U,	// <5,6,3,u>: Cost 2 vext2 <3,u,5,6>, <3,u,5,6>
+  2577817702U,	// <5,6,4,0>: Cost 3 vext1 <4,5,6,4>, LHS
+  3700034544U,	// <5,6,4,1>: Cost 4 vext2 <1,4,5,6>, <4,1,6,5>
+  2723033713U,	// <5,6,4,2>: Cost 3 vext3 <6,4,2,5>, <6,4,2,5>
+  2638236818U,	// <5,6,4,3>: Cost 3 vext2 <3,4,5,6>, <4,3,6,5>
+  2644208859U,	// <5,6,4,4>: Cost 3 vext2 <4,4,5,6>, <4,4,5,6>
+  1564495158U,	// <5,6,4,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  2645536125U,	// <5,6,4,6>: Cost 3 vext2 <4,6,5,6>, <4,6,5,6>
+  2723402398U,	// <5,6,4,7>: Cost 3 vext3 <6,4,7,5>, <6,4,7,5>
+  1564495401U,	// <5,6,4,u>: Cost 2 vext2 <3,4,5,6>, RHS
+  2577825894U,	// <5,6,5,0>: Cost 3 vext1 <4,5,6,5>, LHS
+  2662125264U,	// <5,6,5,1>: Cost 3 vext2 <7,4,5,6>, <5,1,7,3>
+  3775836867U,	// <5,6,5,2>: Cost 4 vext3 <2,u,6,5>, <6,5,2,6>
+  3711979343U,	// <5,6,5,3>: Cost 4 vext2 <3,4,5,6>, <5,3,3,4>
+  2650181556U,	// <5,6,5,4>: Cost 3 vext2 <5,4,5,6>, <5,4,5,6>
+  2662125572U,	// <5,6,5,5>: Cost 3 vext2 <7,4,5,6>, <5,5,5,5>
+  2638237732U,	// <5,6,5,6>: Cost 3 vext2 <3,4,5,6>, <5,6,0,1>
+  2982399286U,	// <5,6,5,7>: Cost 3 vzipr <4,u,5,5>, RHS
+  2982399287U,	// <5,6,5,u>: Cost 3 vzipr <4,u,5,5>, RHS
+  2583806054U,	// <5,6,6,0>: Cost 3 vext1 <5,5,6,6>, LHS
+  3711979910U,	// <5,6,6,1>: Cost 4 vext2 <3,4,5,6>, <6,1,3,4>
+  2662126074U,	// <5,6,6,2>: Cost 3 vext2 <7,4,5,6>, <6,2,7,3>
+  2583808514U,	// <5,6,6,3>: Cost 3 vext1 <5,5,6,6>, <3,4,5,6>
+  2583809334U,	// <5,6,6,4>: Cost 3 vext1 <5,5,6,6>, RHS
+  2583810062U,	// <5,6,6,5>: Cost 3 vext1 <5,5,6,6>, <5,5,6,6>
+  2638238520U,	// <5,6,6,6>: Cost 3 vext2 <3,4,5,6>, <6,6,6,6>
+  2973781302U,	// <5,6,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2973781303U,	// <5,6,6,u>: Cost 3 vzipr <3,4,5,6>, RHS
+  430358630U,	// <5,6,7,0>: Cost 1 vext1 RHS, LHS
+  1504101110U,	// <5,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1504101992U,	// <5,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504102550U,	// <5,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430361910U,	// <5,6,7,4>: Cost 1 vext1 RHS, RHS
+  1504104390U,	// <5,6,7,5>: Cost 2 vext1 RHS, <5,4,7,6>
+  1504105272U,	// <5,6,7,6>: Cost 2 vext1 RHS, <6,6,6,6>
+  1504106092U,	// <5,6,7,7>: Cost 2 vext1 RHS, <7,7,7,7>
+  430364462U,	// <5,6,7,u>: Cost 1 vext1 RHS, LHS
+  430366822U,	// <5,6,u,0>: Cost 1 vext1 RHS, LHS
+  1564497710U,	// <5,6,u,1>: Cost 2 vext2 <3,4,5,6>, LHS
+  1504110184U,	// <5,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1504110742U,	// <5,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  430370103U,	// <5,6,u,4>: Cost 1 vext1 RHS, RHS
+  1564498074U,	// <5,6,u,5>: Cost 2 vext2 <3,4,5,6>, RHS
+  1504113146U,	// <5,6,u,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1504113658U,	// <5,6,u,7>: Cost 2 vext1 RHS, <7,0,1,2>
+  430372654U,	// <5,6,u,u>: Cost 1 vext1 RHS, LHS
+  2625634304U,	// <5,7,0,0>: Cost 3 vext2 <1,3,5,7>, <0,0,0,0>
+  1551892582U,	// <5,7,0,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625634468U,	// <5,7,0,2>: Cost 3 vext2 <1,3,5,7>, <0,2,0,2>
+  2571889247U,	// <5,7,0,3>: Cost 3 vext1 <3,5,7,0>, <3,5,7,0>
+  2625634642U,	// <5,7,0,4>: Cost 3 vext2 <1,3,5,7>, <0,4,1,5>
+  2595778728U,	// <5,7,0,5>: Cost 3 vext1 <7,5,7,0>, <5,7,5,7>
+  3699376639U,	// <5,7,0,6>: Cost 4 vext2 <1,3,5,7>, <0,6,2,7>
+  2260235715U,	// <5,7,0,7>: Cost 3 vrev <7,5,7,0>
+  1551893149U,	// <5,7,0,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625635062U,	// <5,7,1,0>: Cost 3 vext2 <1,3,5,7>, <1,0,3,2>
+  2624308020U,	// <5,7,1,1>: Cost 3 vext2 <1,1,5,7>, <1,1,1,1>
+  2625635222U,	// <5,7,1,2>: Cost 3 vext2 <1,3,5,7>, <1,2,3,0>
+  1551893504U,	// <5,7,1,3>: Cost 2 vext2 <1,3,5,7>, <1,3,5,7>
+  2571898166U,	// <5,7,1,4>: Cost 3 vext1 <3,5,7,1>, RHS
+  2625635472U,	// <5,7,1,5>: Cost 3 vext2 <1,3,5,7>, <1,5,3,7>
+  2627626227U,	// <5,7,1,6>: Cost 3 vext2 <1,6,5,7>, <1,6,5,7>
+  3702031684U,	// <5,7,1,7>: Cost 4 vext2 <1,7,5,7>, <1,7,5,7>
+  1555211669U,	// <5,7,1,u>: Cost 2 vext2 <1,u,5,7>, <1,u,5,7>
+  2629617126U,	// <5,7,2,0>: Cost 3 vext2 <2,0,5,7>, <2,0,5,7>
+  3699377670U,	// <5,7,2,1>: Cost 4 vext2 <1,3,5,7>, <2,1,0,3>
+  2625635944U,	// <5,7,2,2>: Cost 3 vext2 <1,3,5,7>, <2,2,2,2>
+  2625636006U,	// <5,7,2,3>: Cost 3 vext2 <1,3,5,7>, <2,3,0,1>
+  2632271658U,	// <5,7,2,4>: Cost 3 vext2 <2,4,5,7>, <2,4,5,7>
+  2625636201U,	// <5,7,2,5>: Cost 3 vext2 <1,3,5,7>, <2,5,3,7>
+  2625636282U,	// <5,7,2,6>: Cost 3 vext2 <1,3,5,7>, <2,6,3,7>
+  3708004381U,	// <5,7,2,7>: Cost 4 vext2 <2,7,5,7>, <2,7,5,7>
+  2625636411U,	// <5,7,2,u>: Cost 3 vext2 <1,3,5,7>, <2,u,0,1>
+  2625636502U,	// <5,7,3,0>: Cost 3 vext2 <1,3,5,7>, <3,0,1,2>
+  2625636604U,	// <5,7,3,1>: Cost 3 vext2 <1,3,5,7>, <3,1,3,5>
+  3699378478U,	// <5,7,3,2>: Cost 4 vext2 <1,3,5,7>, <3,2,0,1>
+  2625636764U,	// <5,7,3,3>: Cost 3 vext2 <1,3,5,7>, <3,3,3,3>
+  2625636866U,	// <5,7,3,4>: Cost 3 vext2 <1,3,5,7>, <3,4,5,6>
+  2625636959U,	// <5,7,3,5>: Cost 3 vext2 <1,3,5,7>, <3,5,7,0>
+  3699378808U,	// <5,7,3,6>: Cost 4 vext2 <1,3,5,7>, <3,6,0,7>
+  2640235254U,	// <5,7,3,7>: Cost 3 vext2 <3,7,5,7>, <3,7,5,7>
+  2625637150U,	// <5,7,3,u>: Cost 3 vext2 <1,3,5,7>, <3,u,1,2>
+  2571919462U,	// <5,7,4,0>: Cost 3 vext1 <3,5,7,4>, LHS
+  2571920384U,	// <5,7,4,1>: Cost 3 vext1 <3,5,7,4>, <1,3,5,7>
+  3699379260U,	// <5,7,4,2>: Cost 4 vext2 <1,3,5,7>, <4,2,6,0>
+  2571922019U,	// <5,7,4,3>: Cost 3 vext1 <3,5,7,4>, <3,5,7,4>
+  2571922742U,	// <5,7,4,4>: Cost 3 vext1 <3,5,7,4>, RHS
+  1551895862U,	// <5,7,4,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2846277980U,	// <5,7,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646207951U,	// <5,7,4,7>: Cost 3 vext2 <4,7,5,7>, <4,7,5,7>
+  1551896105U,	// <5,7,4,u>: Cost 2 vext2 <1,3,5,7>, RHS
+  2583871590U,	// <5,7,5,0>: Cost 3 vext1 <5,5,7,5>, LHS
+  2652180176U,	// <5,7,5,1>: Cost 3 vext2 <5,7,5,7>, <5,1,7,3>
+  2625638177U,	// <5,7,5,2>: Cost 3 vext2 <1,3,5,7>, <5,2,7,3>
+  2625638262U,	// <5,7,5,3>: Cost 3 vext2 <1,3,5,7>, <5,3,7,7>
+  2583874870U,	// <5,7,5,4>: Cost 3 vext1 <5,5,7,5>, RHS
+  2846281732U,	// <5,7,5,5>: Cost 3 vuzpr RHS, <5,5,5,5>
+  2651517015U,	// <5,7,5,6>: Cost 3 vext2 <5,6,5,7>, <5,6,5,7>
+  1772539190U,	// <5,7,5,7>: Cost 2 vuzpr RHS, RHS
+  1772539191U,	// <5,7,5,u>: Cost 2 vuzpr RHS, RHS
+  2846281826U,	// <5,7,6,0>: Cost 3 vuzpr RHS, <5,6,7,0>
+  3699380615U,	// <5,7,6,1>: Cost 4 vext2 <1,3,5,7>, <6,1,3,5>
+  2846281108U,	// <5,7,6,2>: Cost 3 vuzpr RHS, <4,6,u,2>
+  2589854210U,	// <5,7,6,3>: Cost 3 vext1 <6,5,7,6>, <3,4,5,6>
+  2846281830U,	// <5,7,6,4>: Cost 3 vuzpr RHS, <5,6,7,4>
+  2725467658U,	// <5,7,6,5>: Cost 3 vext3 <6,7,u,5>, <7,6,5,u>
+  2846281076U,	// <5,7,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2846279610U,	// <5,7,6,7>: Cost 3 vuzpr RHS, <2,6,3,7>
+  2846279611U,	// <5,7,6,u>: Cost 3 vuzpr RHS, <2,6,3,u>
+  1510146150U,	// <5,7,7,0>: Cost 2 vext1 <5,5,7,7>, LHS
+  2846282574U,	// <5,7,7,1>: Cost 3 vuzpr RHS, <6,7,0,1>
+  2583889512U,	// <5,7,7,2>: Cost 3 vext1 <5,5,7,7>, <2,2,2,2>
+  2846281919U,	// <5,7,7,3>: Cost 3 vuzpr RHS, <5,7,u,3>
+  1510149430U,	// <5,7,7,4>: Cost 2 vext1 <5,5,7,7>, RHS
+  1510150168U,	// <5,7,7,5>: Cost 2 vext1 <5,5,7,7>, <5,5,7,7>
+  2583892474U,	// <5,7,7,6>: Cost 3 vext1 <5,5,7,7>, <6,2,7,3>
+  2625640044U,	// <5,7,7,7>: Cost 3 vext2 <1,3,5,7>, <7,7,7,7>
+  1510151982U,	// <5,7,7,u>: Cost 2 vext1 <5,5,7,7>, LHS
+  1510154342U,	// <5,7,u,0>: Cost 2 vext1 <5,5,7,u>, LHS
+  1551898414U,	// <5,7,u,1>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625640325U,	// <5,7,u,2>: Cost 3 vext2 <1,3,5,7>, <u,2,3,0>
+  1772536477U,	// <5,7,u,3>: Cost 2 vuzpr RHS, LHS
+  1510157622U,	// <5,7,u,4>: Cost 2 vext1 <5,5,7,u>, RHS
+  1551898778U,	// <5,7,u,5>: Cost 2 vext2 <1,3,5,7>, RHS
+  2625640656U,	// <5,7,u,6>: Cost 3 vext2 <1,3,5,7>, <u,6,3,7>
+  1772539433U,	// <5,7,u,7>: Cost 2 vuzpr RHS, RHS
+  1551898981U,	// <5,7,u,u>: Cost 2 vext2 <1,3,5,7>, LHS
+  2625642496U,	// <5,u,0,0>: Cost 3 vext2 <1,3,5,u>, <0,0,0,0>
+  1551900774U,	// <5,u,0,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625642660U,	// <5,u,0,2>: Cost 3 vext2 <1,3,5,u>, <0,2,0,2>
+  2698630885U,	// <5,u,0,3>: Cost 3 vext3 <2,3,4,5>, <u,0,3,2>
+  2687129325U,	// <5,u,0,4>: Cost 3 vext3 <0,4,1,5>, <u,0,4,1>
+  2689783542U,	// <5,u,0,5>: Cost 3 vext3 <0,u,1,5>, <u,0,5,1>
+  2266134675U,	// <5,u,0,6>: Cost 3 vrev <u,5,6,0>
+  2595853772U,	// <5,u,0,7>: Cost 3 vext1 <7,5,u,0>, <7,5,u,0>
+  1551901341U,	// <5,u,0,u>: Cost 2 vext2 <1,3,5,u>, LHS
+  2625643254U,	// <5,u,1,0>: Cost 3 vext2 <1,3,5,u>, <1,0,3,2>
+  2625643316U,	// <5,u,1,1>: Cost 3 vext2 <1,3,5,u>, <1,1,1,1>
+  1613387566U,	// <5,u,1,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1551901697U,	// <5,u,1,3>: Cost 2 vext2 <1,3,5,u>, <1,3,5,u>
+  2626307154U,	// <5,u,1,4>: Cost 3 vext2 <1,4,5,u>, <1,4,5,u>
+  2689783622U,	// <5,u,1,5>: Cost 3 vext3 <0,u,1,5>, <u,1,5,0>
+  2627634420U,	// <5,u,1,6>: Cost 3 vext2 <1,6,5,u>, <1,6,5,u>
+  2982366536U,	// <5,u,1,7>: Cost 3 vzipr <4,u,5,1>, RHS
+  1613387620U,	// <5,u,1,u>: Cost 2 vext3 <0,4,1,5>, LHS
+  2846286742U,	// <5,u,2,0>: Cost 3 vuzpr RHS, <1,2,3,0>
+  2685796528U,	// <5,u,2,1>: Cost 3 vext3 <0,2,1,5>, <0,2,1,5>
+  2625644136U,	// <5,u,2,2>: Cost 3 vext2 <1,3,5,u>, <2,2,2,2>
+  2687129480U,	// <5,u,2,3>: Cost 3 vext3 <0,4,1,5>, <u,2,3,3>
+  2632279851U,	// <5,u,2,4>: Cost 3 vext2 <2,4,5,u>, <2,4,5,u>
+  2625644394U,	// <5,u,2,5>: Cost 3 vext2 <1,3,5,u>, <2,5,3,u>
+  2625644474U,	// <5,u,2,6>: Cost 3 vext2 <1,3,5,u>, <2,6,3,7>
+  2713966508U,	// <5,u,2,7>: Cost 3 vext3 <4,u,5,5>, <u,2,7,3>
+  2625644603U,	// <5,u,2,u>: Cost 3 vext2 <1,3,5,u>, <2,u,0,1>
+  2687129532U,	// <5,u,3,0>: Cost 3 vext3 <0,4,1,5>, <u,3,0,1>
+  2636261649U,	// <5,u,3,1>: Cost 3 vext2 <3,1,5,u>, <3,1,5,u>
+  2636925282U,	// <5,u,3,2>: Cost 3 vext2 <3,2,5,u>, <3,2,5,u>
+  2625644956U,	// <5,u,3,3>: Cost 3 vext2 <1,3,5,u>, <3,3,3,3>
+  1564510724U,	// <5,u,3,4>: Cost 2 vext2 <3,4,5,u>, <3,4,5,u>
+  2625645160U,	// <5,u,3,5>: Cost 3 vext2 <1,3,5,u>, <3,5,u,0>
+  2734610422U,	// <5,u,3,6>: Cost 3 vext3 <u,3,6,5>, <u,3,6,5>
+  2640243447U,	// <5,u,3,7>: Cost 3 vext2 <3,7,5,u>, <3,7,5,u>
+  1567165256U,	// <5,u,3,u>: Cost 2 vext2 <3,u,5,u>, <3,u,5,u>
+  1567828889U,	// <5,u,4,0>: Cost 2 vext2 <4,0,5,u>, <4,0,5,u>
+  1661163546U,	// <5,u,4,1>: Cost 2 vext3 <u,4,1,5>, <u,4,1,5>
+  2734463012U,	// <5,u,4,2>: Cost 3 vext3 <u,3,4,5>, <u,4,2,6>
+  2698631212U,	// <5,u,4,3>: Cost 3 vext3 <2,3,4,5>, <u,4,3,5>
+  1570458842U,	// <5,u,4,4>: Cost 2 vext2 <4,4,5,5>, <4,4,5,5>
+  1551904054U,	// <5,u,4,5>: Cost 2 vext2 <1,3,5,u>, RHS
+  2846286172U,	// <5,u,4,6>: Cost 3 vuzpr RHS, <0,4,2,6>
+  2646216144U,	// <5,u,4,7>: Cost 3 vext2 <4,7,5,u>, <4,7,5,u>
+  1551904297U,	// <5,u,4,u>: Cost 2 vext2 <1,3,5,u>, RHS
+  1509982310U,	// <5,u,5,0>: Cost 2 vext1 <5,5,5,5>, LHS
+  2560058555U,	// <5,u,5,1>: Cost 3 vext1 <1,5,u,5>, <1,5,u,5>
+  2698926194U,	// <5,u,5,2>: Cost 3 vext3 <2,3,u,5>, <u,5,2,3>
+  2698631295U,	// <5,u,5,3>: Cost 3 vext3 <2,3,4,5>, <u,5,3,7>
+  1509985590U,	// <5,u,5,4>: Cost 2 vext1 <5,5,5,5>, RHS
+  229035318U,	// <5,u,5,5>: Cost 1 vdup1 RHS
+  1613387930U,	// <5,u,5,6>: Cost 2 vext3 <0,4,1,5>, RHS
+  1772547382U,	// <5,u,5,7>: Cost 2 vuzpr RHS, RHS
+  229035318U,	// <5,u,5,u>: Cost 1 vdup1 RHS
+  2566037606U,	// <5,u,6,0>: Cost 3 vext1 <2,5,u,6>, LHS
+  2920044334U,	// <5,u,6,1>: Cost 3 vzipl <5,6,7,0>, LHS
+  2566039445U,	// <5,u,6,2>: Cost 3 vext1 <2,5,u,6>, <2,5,u,6>
+  2687129808U,	// <5,u,6,3>: Cost 3 vext3 <0,4,1,5>, <u,6,3,7>
+  2566040886U,	// <5,u,6,4>: Cost 3 vext1 <2,5,u,6>, RHS
+  2920044698U,	// <5,u,6,5>: Cost 3 vzipl <5,6,7,0>, RHS
+  2846289268U,	// <5,u,6,6>: Cost 3 vuzpr RHS, <4,6,4,6>
+  2973781320U,	// <5,u,6,7>: Cost 3 vzipr <3,4,5,6>, RHS
+  2687129853U,	// <5,u,6,u>: Cost 3 vext3 <0,4,1,5>, <u,6,u,7>
+  430506086U,	// <5,u,7,0>: Cost 1 vext1 RHS, LHS
+  1486333117U,	// <5,u,7,1>: Cost 2 vext1 <1,5,u,7>, <1,5,u,7>
+  1504249448U,	// <5,u,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  2040971933U,	// <5,u,7,3>: Cost 2 vtrnr RHS, LHS
+  430509384U,	// <5,u,7,4>: Cost 1 vext1 RHS, RHS
+  1504251600U,	// <5,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  118708378U,	// <5,u,7,6>: Cost 1 vrev RHS
+  2040974889U,	// <5,u,7,7>: Cost 2 vtrnr RHS, RHS
+  430511918U,	// <5,u,7,u>: Cost 1 vext1 RHS, LHS
+  430514278U,	// <5,u,u,0>: Cost 1 vext1 RHS, LHS
+  1551906606U,	// <5,u,u,1>: Cost 2 vext2 <1,3,5,u>, LHS
+  1613388133U,	// <5,u,u,2>: Cost 2 vext3 <0,4,1,5>, LHS
+  1772544669U,	// <5,u,u,3>: Cost 2 vuzpr RHS, LHS
+  430517577U,	// <5,u,u,4>: Cost 1 vext1 RHS, RHS
+  229035318U,	// <5,u,u,5>: Cost 1 vdup1 RHS
+  118716571U,	// <5,u,u,6>: Cost 1 vrev RHS
+  1772547625U,	// <5,u,u,7>: Cost 2 vuzpr RHS, RHS
+  430520110U,	// <5,u,u,u>: Cost 1 vext1 RHS, LHS
+  2686025728U,	// <6,0,0,0>: Cost 3 vext3 <0,2,4,6>, <0,0,0,0>
+  2686025738U,	// <6,0,0,1>: Cost 3 vext3 <0,2,4,6>, <0,0,1,1>
+  2686025748U,	// <6,0,0,2>: Cost 3 vext3 <0,2,4,6>, <0,0,2,2>
+  3779084320U,	// <6,0,0,3>: Cost 4 vext3 <3,4,5,6>, <0,0,3,5>
+  2642903388U,	// <6,0,0,4>: Cost 3 vext2 <4,2,6,0>, <0,4,2,6>
+  3657723939U,	// <6,0,0,5>: Cost 4 vext1 <5,6,0,0>, <5,6,0,0>
+  3926676514U,	// <6,0,0,6>: Cost 4 vuzpr <5,6,7,0>, <7,0,5,6>
+  3926675786U,	// <6,0,0,7>: Cost 4 vuzpr <5,6,7,0>, <6,0,5,7>
+  2686025802U,	// <6,0,0,u>: Cost 3 vext3 <0,2,4,6>, <0,0,u,2>
+  2566070374U,	// <6,0,1,0>: Cost 3 vext1 <2,6,0,1>, LHS
+  3759767642U,	// <6,0,1,1>: Cost 4 vext3 <0,2,4,6>, <0,1,1,0>
+  1612284006U,	// <6,0,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2583988738U,	// <6,0,1,3>: Cost 3 vext1 <5,6,0,1>, <3,4,5,6>
+  2566073654U,	// <6,0,1,4>: Cost 3 vext1 <2,6,0,1>, RHS
+  2583990308U,	// <6,0,1,5>: Cost 3 vext1 <5,6,0,1>, <5,6,0,1>
+  2589963005U,	// <6,0,1,6>: Cost 3 vext1 <6,6,0,1>, <6,6,0,1>
+  2595935702U,	// <6,0,1,7>: Cost 3 vext1 <7,6,0,1>, <7,6,0,1>
+  1612284060U,	// <6,0,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2686025892U,	// <6,0,2,0>: Cost 3 vext3 <0,2,4,6>, <0,2,0,2>
+  2685804721U,	// <6,0,2,1>: Cost 3 vext3 <0,2,1,6>, <0,2,1,6>
+  3759620282U,	// <6,0,2,2>: Cost 4 vext3 <0,2,2,6>, <0,2,2,6>
+  2705342658U,	// <6,0,2,3>: Cost 3 vext3 <3,4,5,6>, <0,2,3,5>
+  1612284108U,	// <6,0,2,4>: Cost 2 vext3 <0,2,4,6>, <0,2,4,6>
+  3706029956U,	// <6,0,2,5>: Cost 4 vext2 <2,4,6,0>, <2,5,6,7>
+  2686173406U,	// <6,0,2,6>: Cost 3 vext3 <0,2,6,6>, <0,2,6,6>
+  3651769338U,	// <6,0,2,7>: Cost 4 vext1 <4,6,0,2>, <7,0,1,2>
+  1612579056U,	// <6,0,2,u>: Cost 2 vext3 <0,2,u,6>, <0,2,u,6>
+  3706030230U,	// <6,0,3,0>: Cost 4 vext2 <2,4,6,0>, <3,0,1,2>
+  2705342720U,	// <6,0,3,1>: Cost 3 vext3 <3,4,5,6>, <0,3,1,4>
+  2705342730U,	// <6,0,3,2>: Cost 3 vext3 <3,4,5,6>, <0,3,2,5>
+  3706030492U,	// <6,0,3,3>: Cost 4 vext2 <2,4,6,0>, <3,3,3,3>
+  2644896258U,	// <6,0,3,4>: Cost 3 vext2 <4,5,6,0>, <3,4,5,6>
+  3718638154U,	// <6,0,3,5>: Cost 4 vext2 <4,5,6,0>, <3,5,4,6>
+  3729918619U,	// <6,0,3,6>: Cost 4 vext2 <6,4,6,0>, <3,6,4,6>
+  3926672384U,	// <6,0,3,7>: Cost 4 vuzpr <5,6,7,0>, <1,3,5,7>
+  2705342784U,	// <6,0,3,u>: Cost 3 vext3 <3,4,5,6>, <0,3,u,5>
+  2687058250U,	// <6,0,4,0>: Cost 3 vext3 <0,4,0,6>, <0,4,0,6>
+  2686026066U,	// <6,0,4,1>: Cost 3 vext3 <0,2,4,6>, <0,4,1,5>
+  1613463900U,	// <6,0,4,2>: Cost 2 vext3 <0,4,2,6>, <0,4,2,6>
+  3761021285U,	// <6,0,4,3>: Cost 4 vext3 <0,4,3,6>, <0,4,3,6>
+  2687353198U,	// <6,0,4,4>: Cost 3 vext3 <0,4,4,6>, <0,4,4,6>
+  2632289590U,	// <6,0,4,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2645560704U,	// <6,0,4,6>: Cost 3 vext2 <4,6,6,0>, <4,6,6,0>
+  2646224337U,	// <6,0,4,7>: Cost 3 vext2 <4,7,6,0>, <4,7,6,0>
+  1613906322U,	// <6,0,4,u>: Cost 2 vext3 <0,4,u,6>, <0,4,u,6>
+  3651788902U,	// <6,0,5,0>: Cost 4 vext1 <4,6,0,5>, LHS
+  2687795620U,	// <6,0,5,1>: Cost 3 vext3 <0,5,1,6>, <0,5,1,6>
+  3761611181U,	// <6,0,5,2>: Cost 4 vext3 <0,5,2,6>, <0,5,2,6>
+  3723284326U,	// <6,0,5,3>: Cost 4 vext2 <5,3,6,0>, <5,3,6,0>
+  2646224838U,	// <6,0,5,4>: Cost 3 vext2 <4,7,6,0>, <5,4,7,6>
+  3718639630U,	// <6,0,5,5>: Cost 4 vext2 <4,5,6,0>, <5,5,6,6>
+  2652196962U,	// <6,0,5,6>: Cost 3 vext2 <5,7,6,0>, <5,6,7,0>
+  2852932918U,	// <6,0,5,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852932919U,	// <6,0,5,u>: Cost 3 vuzpr <5,6,7,0>, RHS
+  2852933730U,	// <6,0,6,0>: Cost 3 vuzpr <5,6,7,0>, <5,6,7,0>
+  2925985894U,	// <6,0,6,1>: Cost 3 vzipl <6,6,6,6>, LHS
+  3060203622U,	// <6,0,6,2>: Cost 3 vtrnl <6,6,6,6>, LHS
+  3718640178U,	// <6,0,6,3>: Cost 4 vext2 <4,5,6,0>, <6,3,4,5>
+  2656178832U,	// <6,0,6,4>: Cost 3 vext2 <6,4,6,0>, <6,4,6,0>
+  3725939378U,	// <6,0,6,5>: Cost 4 vext2 <5,7,6,0>, <6,5,0,7>
+  2657506098U,	// <6,0,6,6>: Cost 3 vext2 <6,6,6,0>, <6,6,6,0>
+  2619020110U,	// <6,0,6,7>: Cost 3 vext2 <0,2,6,0>, <6,7,0,1>
+  2925986461U,	// <6,0,6,u>: Cost 3 vzipl <6,6,6,6>, LHS
+  2572091494U,	// <6,0,7,0>: Cost 3 vext1 <3,6,0,7>, LHS
+  2572092310U,	// <6,0,7,1>: Cost 3 vext1 <3,6,0,7>, <1,2,3,0>
+  2980495524U,	// <6,0,7,2>: Cost 3 vzipr RHS, <0,2,0,2>
+  2572094072U,	// <6,0,7,3>: Cost 3 vext1 <3,6,0,7>, <3,6,0,7>
+  2572094774U,	// <6,0,7,4>: Cost 3 vext1 <3,6,0,7>, RHS
+  4054238242U,	// <6,0,7,5>: Cost 4 vzipr RHS, <1,4,0,5>
+  3645837653U,	// <6,0,7,6>: Cost 4 vext1 <3,6,0,7>, <6,0,7,0>
+  4054239054U,	// <6,0,7,7>: Cost 4 vzipr RHS, <2,5,0,7>
+  2572097326U,	// <6,0,7,u>: Cost 3 vext1 <3,6,0,7>, LHS
+  2686026378U,	// <6,0,u,0>: Cost 3 vext3 <0,2,4,6>, <0,u,0,2>
+  2686026386U,	// <6,0,u,1>: Cost 3 vext3 <0,2,4,6>, <0,u,1,1>
+  1612284573U,	// <6,0,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  2705343144U,	// <6,0,u,3>: Cost 3 vext3 <3,4,5,6>, <0,u,3,5>
+  1616265906U,	// <6,0,u,4>: Cost 2 vext3 <0,u,4,6>, <0,u,4,6>
+  2632292506U,	// <6,0,u,5>: Cost 3 vext2 <2,4,6,0>, RHS
+  2590020356U,	// <6,0,u,6>: Cost 3 vext1 <6,6,0,u>, <6,6,0,u>
+  2852933161U,	// <6,0,u,7>: Cost 3 vuzpr <5,6,7,0>, RHS
+  1612284627U,	// <6,0,u,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  2595995750U,	// <6,1,0,0>: Cost 3 vext1 <7,6,1,0>, LHS
+  2646229094U,	// <6,1,0,1>: Cost 3 vext2 <4,7,6,1>, LHS
+  3694092492U,	// <6,1,0,2>: Cost 4 vext2 <0,4,6,1>, <0,2,4,6>
+  2686026486U,	// <6,1,0,3>: Cost 3 vext3 <0,2,4,6>, <1,0,3,2>
+  2595999030U,	// <6,1,0,4>: Cost 3 vext1 <7,6,1,0>, RHS
+  3767730952U,	// <6,1,0,5>: Cost 4 vext3 <1,5,4,6>, <1,0,5,2>
+  2596000590U,	// <6,1,0,6>: Cost 3 vext1 <7,6,1,0>, <6,7,0,1>
+  2596001246U,	// <6,1,0,7>: Cost 3 vext1 <7,6,1,0>, <7,6,1,0>
+  2686026531U,	// <6,1,0,u>: Cost 3 vext3 <0,2,4,6>, <1,0,u,2>
+  3763602219U,	// <6,1,1,0>: Cost 4 vext3 <0,u,2,6>, <1,1,0,1>
+  2686026548U,	// <6,1,1,1>: Cost 3 vext3 <0,2,4,6>, <1,1,1,1>
+  3764929346U,	// <6,1,1,2>: Cost 4 vext3 <1,1,2,6>, <1,1,2,6>
+  2686026568U,	// <6,1,1,3>: Cost 3 vext3 <0,2,4,6>, <1,1,3,3>
+  2691334996U,	// <6,1,1,4>: Cost 3 vext3 <1,1,4,6>, <1,1,4,6>
+  3760874332U,	// <6,1,1,5>: Cost 4 vext3 <0,4,1,6>, <1,1,5,5>
+  3765224294U,	// <6,1,1,6>: Cost 4 vext3 <1,1,6,6>, <1,1,6,6>
+  3669751263U,	// <6,1,1,7>: Cost 4 vext1 <7,6,1,1>, <7,6,1,1>
+  2686026613U,	// <6,1,1,u>: Cost 3 vext3 <0,2,4,6>, <1,1,u,3>
+  2554208358U,	// <6,1,2,0>: Cost 3 vext1 <0,6,1,2>, LHS
+  3763602311U,	// <6,1,2,1>: Cost 4 vext3 <0,u,2,6>, <1,2,1,3>
+  3639895971U,	// <6,1,2,2>: Cost 4 vext1 <2,6,1,2>, <2,6,1,2>
+  2686026646U,	// <6,1,2,3>: Cost 3 vext3 <0,2,4,6>, <1,2,3,0>
+  2554211638U,	// <6,1,2,4>: Cost 3 vext1 <0,6,1,2>, RHS
+  3760874411U,	// <6,1,2,5>: Cost 4 vext3 <0,4,1,6>, <1,2,5,3>
+  2554212858U,	// <6,1,2,6>: Cost 3 vext1 <0,6,1,2>, <6,2,7,3>
+  3802973114U,	// <6,1,2,7>: Cost 4 vext3 <7,4,5,6>, <1,2,7,0>
+  2686026691U,	// <6,1,2,u>: Cost 3 vext3 <0,2,4,6>, <1,2,u,0>
+  2566160486U,	// <6,1,3,0>: Cost 3 vext1 <2,6,1,3>, LHS
+  2686026712U,	// <6,1,3,1>: Cost 3 vext3 <0,2,4,6>, <1,3,1,3>
+  2686026724U,	// <6,1,3,2>: Cost 3 vext3 <0,2,4,6>, <1,3,2,6>
+  3759768552U,	// <6,1,3,3>: Cost 4 vext3 <0,2,4,6>, <1,3,3,1>
+  2692662262U,	// <6,1,3,4>: Cost 3 vext3 <1,3,4,6>, <1,3,4,6>
+  2686026752U,	// <6,1,3,5>: Cost 3 vext3 <0,2,4,6>, <1,3,5,7>
+  2590053128U,	// <6,1,3,6>: Cost 3 vext1 <6,6,1,3>, <6,6,1,3>
+  3663795194U,	// <6,1,3,7>: Cost 4 vext1 <6,6,1,3>, <7,0,1,2>
+  2686026775U,	// <6,1,3,u>: Cost 3 vext3 <0,2,4,6>, <1,3,u,3>
+  2641587099U,	// <6,1,4,0>: Cost 3 vext2 <4,0,6,1>, <4,0,6,1>
+  2693104684U,	// <6,1,4,1>: Cost 3 vext3 <1,4,1,6>, <1,4,1,6>
+  3639912357U,	// <6,1,4,2>: Cost 4 vext1 <2,6,1,4>, <2,6,1,4>
+  2687206462U,	// <6,1,4,3>: Cost 3 vext3 <0,4,2,6>, <1,4,3,6>
+  3633941814U,	// <6,1,4,4>: Cost 4 vext1 <1,6,1,4>, RHS
+  2693399632U,	// <6,1,4,5>: Cost 3 vext3 <1,4,5,6>, <1,4,5,6>
+  3765077075U,	// <6,1,4,6>: Cost 4 vext3 <1,1,4,6>, <1,4,6,0>
+  2646232530U,	// <6,1,4,7>: Cost 3 vext2 <4,7,6,1>, <4,7,6,1>
+  2687206507U,	// <6,1,4,u>: Cost 3 vext3 <0,4,2,6>, <1,4,u,6>
+  2647559796U,	// <6,1,5,0>: Cost 3 vext2 <5,0,6,1>, <5,0,6,1>
+  3765077118U,	// <6,1,5,1>: Cost 4 vext3 <1,1,4,6>, <1,5,1,7>
+  3767583878U,	// <6,1,5,2>: Cost 4 vext3 <1,5,2,6>, <1,5,2,6>
+  2686026896U,	// <6,1,5,3>: Cost 3 vext3 <0,2,4,6>, <1,5,3,7>
+  2693989528U,	// <6,1,5,4>: Cost 3 vext3 <1,5,4,6>, <1,5,4,6>
+  3767805089U,	// <6,1,5,5>: Cost 4 vext3 <1,5,5,6>, <1,5,5,6>
+  2652868706U,	// <6,1,5,6>: Cost 3 vext2 <5,u,6,1>, <5,6,7,0>
+  3908250934U,	// <6,1,5,7>: Cost 4 vuzpr <2,6,0,1>, RHS
+  2686026941U,	// <6,1,5,u>: Cost 3 vext3 <0,2,4,6>, <1,5,u,7>
+  2554241126U,	// <6,1,6,0>: Cost 3 vext1 <0,6,1,6>, LHS
+  3763602639U,	// <6,1,6,1>: Cost 4 vext3 <0,u,2,6>, <1,6,1,7>
+  3759547607U,	// <6,1,6,2>: Cost 4 vext3 <0,2,1,6>, <1,6,2,6>
+  3115221094U,	// <6,1,6,3>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2554244406U,	// <6,1,6,4>: Cost 3 vext1 <0,6,1,6>, RHS
+  3760874739U,	// <6,1,6,5>: Cost 4 vext3 <0,4,1,6>, <1,6,5,7>
+  2554245944U,	// <6,1,6,6>: Cost 3 vext1 <0,6,1,6>, <6,6,6,6>
+  3719975758U,	// <6,1,6,7>: Cost 4 vext2 <4,7,6,1>, <6,7,0,1>
+  3115221099U,	// <6,1,6,u>: Cost 3 vtrnr <4,6,4,6>, LHS
+  2560221286U,	// <6,1,7,0>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560222415U,	// <6,1,7,1>: Cost 3 vext1 <1,6,1,7>, <1,6,1,7>
+  2980497558U,	// <6,1,7,2>: Cost 3 vzipr RHS, <3,0,1,2>
+  3103211622U,	// <6,1,7,3>: Cost 3 vtrnr <2,6,3,7>, LHS
+  2560224566U,	// <6,1,7,4>: Cost 3 vext1 <1,6,1,7>, RHS
+  2980495698U,	// <6,1,7,5>: Cost 3 vzipr RHS, <0,4,1,5>
+  3633967526U,	// <6,1,7,6>: Cost 4 vext1 <1,6,1,7>, <6,1,7,0>
+  4054237686U,	// <6,1,7,7>: Cost 4 vzipr RHS, <0,6,1,7>
+  2560227118U,	// <6,1,7,u>: Cost 3 vext1 <1,6,1,7>, LHS
+  2560229478U,	// <6,1,u,0>: Cost 3 vext1 <1,6,1,u>, LHS
+  2686027117U,	// <6,1,u,1>: Cost 3 vext3 <0,2,4,6>, <1,u,1,3>
+  2686027129U,	// <6,1,u,2>: Cost 3 vext3 <0,2,4,6>, <1,u,2,6>
+  2686027132U,	// <6,1,u,3>: Cost 3 vext3 <0,2,4,6>, <1,u,3,0>
+  2687206795U,	// <6,1,u,4>: Cost 3 vext3 <0,4,2,6>, <1,u,4,6>
+  2686027157U,	// <6,1,u,5>: Cost 3 vext3 <0,2,4,6>, <1,u,5,7>
+  2590094093U,	// <6,1,u,6>: Cost 3 vext1 <6,6,1,u>, <6,6,1,u>
+  2596066790U,	// <6,1,u,7>: Cost 3 vext1 <7,6,1,u>, <7,6,1,u>
+  2686027177U,	// <6,1,u,u>: Cost 3 vext3 <0,2,4,6>, <1,u,u,0>
+  2646900736U,	// <6,2,0,0>: Cost 3 vext2 <4,u,6,2>, <0,0,0,0>
+  1573159014U,	// <6,2,0,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2646900900U,	// <6,2,0,2>: Cost 3 vext2 <4,u,6,2>, <0,2,0,2>
+  3759769037U,	// <6,2,0,3>: Cost 4 vext3 <0,2,4,6>, <2,0,3,0>
+  2641592668U,	// <6,2,0,4>: Cost 3 vext2 <4,0,6,2>, <0,4,2,6>
+  3779085794U,	// <6,2,0,5>: Cost 4 vext3 <3,4,5,6>, <2,0,5,3>
+  2686027244U,	// <6,2,0,6>: Cost 3 vext3 <0,2,4,6>, <2,0,6,4>
+  3669816807U,	// <6,2,0,7>: Cost 4 vext1 <7,6,2,0>, <7,6,2,0>
+  1573159581U,	// <6,2,0,u>: Cost 2 vext2 <4,u,6,2>, LHS
+  2230527897U,	// <6,2,1,0>: Cost 3 vrev <2,6,0,1>
+  2646901556U,	// <6,2,1,1>: Cost 3 vext2 <4,u,6,2>, <1,1,1,1>
+  2646901654U,	// <6,2,1,2>: Cost 3 vext2 <4,u,6,2>, <1,2,3,0>
+  2847047782U,	// <6,2,1,3>: Cost 3 vuzpr <4,6,u,2>, LHS
+  3771049517U,	// <6,2,1,4>: Cost 4 vext3 <2,1,4,6>, <2,1,4,6>
+  2646901904U,	// <6,2,1,5>: Cost 3 vext2 <4,u,6,2>, <1,5,3,7>
+  2686027324U,	// <6,2,1,6>: Cost 3 vext3 <0,2,4,6>, <2,1,6,3>
+  3669825000U,	// <6,2,1,7>: Cost 4 vext1 <7,6,2,1>, <7,6,2,1>
+  2231117793U,	// <6,2,1,u>: Cost 3 vrev <2,6,u,1>
+  3763603029U,	// <6,2,2,0>: Cost 4 vext3 <0,u,2,6>, <2,2,0,1>
+  3759769184U,	// <6,2,2,1>: Cost 4 vext3 <0,2,4,6>, <2,2,1,3>
+  2686027368U,	// <6,2,2,2>: Cost 3 vext3 <0,2,4,6>, <2,2,2,2>
+  2686027378U,	// <6,2,2,3>: Cost 3 vext3 <0,2,4,6>, <2,2,3,3>
+  2697971326U,	// <6,2,2,4>: Cost 3 vext3 <2,2,4,6>, <2,2,4,6>
+  3759769224U,	// <6,2,2,5>: Cost 4 vext3 <0,2,4,6>, <2,2,5,7>
+  2698118800U,	// <6,2,2,6>: Cost 3 vext3 <2,2,6,6>, <2,2,6,6>
+  3920794092U,	// <6,2,2,7>: Cost 4 vuzpr <4,6,u,2>, <6,2,5,7>
+  2686027423U,	// <6,2,2,u>: Cost 3 vext3 <0,2,4,6>, <2,2,u,3>
+  2686027430U,	// <6,2,3,0>: Cost 3 vext3 <0,2,4,6>, <2,3,0,1>
+  3759769262U,	// <6,2,3,1>: Cost 4 vext3 <0,2,4,6>, <2,3,1,0>
+  2698487485U,	// <6,2,3,2>: Cost 3 vext3 <2,3,2,6>, <2,3,2,6>
+  2705344196U,	// <6,2,3,3>: Cost 3 vext3 <3,4,5,6>, <2,3,3,4>
+  2686027470U,	// <6,2,3,4>: Cost 3 vext3 <0,2,4,6>, <2,3,4,5>
+  2698708696U,	// <6,2,3,5>: Cost 3 vext3 <2,3,5,6>, <2,3,5,6>
+  2724660961U,	// <6,2,3,6>: Cost 3 vext3 <6,6,6,6>, <2,3,6,6>
+  2729232104U,	// <6,2,3,7>: Cost 3 vext3 <7,4,5,6>, <2,3,7,4>
+  2686027502U,	// <6,2,3,u>: Cost 3 vext3 <0,2,4,6>, <2,3,u,1>
+  1567853468U,	// <6,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  3759769351U,	// <6,2,4,1>: Cost 4 vext3 <0,2,4,6>, <2,4,1,u>
+  2699151118U,	// <6,2,4,2>: Cost 3 vext3 <2,4,2,6>, <2,4,2,6>
+  2686027543U,	// <6,2,4,3>: Cost 3 vext3 <0,2,4,6>, <2,4,3,6>
+  2699298592U,	// <6,2,4,4>: Cost 3 vext3 <2,4,4,6>, <2,4,4,6>
+  1573162294U,	// <6,2,4,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686027564U,	// <6,2,4,6>: Cost 3 vext3 <0,2,4,6>, <2,4,6,0>
+  3719982547U,	// <6,2,4,7>: Cost 4 vext2 <4,7,6,2>, <4,7,6,2>
+  1573162532U,	// <6,2,4,u>: Cost 2 vext2 <4,u,6,2>, <4,u,6,2>
+  3779086154U,	// <6,2,5,0>: Cost 4 vext3 <3,4,5,6>, <2,5,0,3>
+  2646904528U,	// <6,2,5,1>: Cost 3 vext2 <4,u,6,2>, <5,1,7,3>
+  3759769440U,	// <6,2,5,2>: Cost 4 vext3 <0,2,4,6>, <2,5,2,7>
+  2699888488U,	// <6,2,5,3>: Cost 3 vext3 <2,5,3,6>, <2,5,3,6>
+  2230855617U,	// <6,2,5,4>: Cost 3 vrev <2,6,4,5>
+  2646904836U,	// <6,2,5,5>: Cost 3 vext2 <4,u,6,2>, <5,5,5,5>
+  2646904930U,	// <6,2,5,6>: Cost 3 vext2 <4,u,6,2>, <5,6,7,0>
+  2847051062U,	// <6,2,5,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  2700257173U,	// <6,2,5,u>: Cost 3 vext3 <2,5,u,6>, <2,5,u,6>
+  2687207321U,	// <6,2,6,0>: Cost 3 vext3 <0,4,2,6>, <2,6,0,1>
+  2686027684U,	// <6,2,6,1>: Cost 3 vext3 <0,2,4,6>, <2,6,1,3>
+  2566260656U,	// <6,2,6,2>: Cost 3 vext1 <2,6,2,6>, <2,6,2,6>
+  2685806522U,	// <6,2,6,3>: Cost 3 vext3 <0,2,1,6>, <2,6,3,7>
+  2687207361U,	// <6,2,6,4>: Cost 3 vext3 <0,4,2,6>, <2,6,4,5>
+  2686027724U,	// <6,2,6,5>: Cost 3 vext3 <0,2,4,6>, <2,6,5,7>
+  2646905656U,	// <6,2,6,6>: Cost 3 vext2 <4,u,6,2>, <6,6,6,6>
+  2646905678U,	// <6,2,6,7>: Cost 3 vext2 <4,u,6,2>, <6,7,0,1>
+  2686027751U,	// <6,2,6,u>: Cost 3 vext3 <0,2,4,6>, <2,6,u,7>
+  2554323046U,	// <6,2,7,0>: Cost 3 vext1 <0,6,2,7>, LHS
+  2572239606U,	// <6,2,7,1>: Cost 3 vext1 <3,6,2,7>, <1,0,3,2>
+  2566268849U,	// <6,2,7,2>: Cost 3 vext1 <2,6,2,7>, <2,6,2,7>
+  1906753638U,	// <6,2,7,3>: Cost 2 vzipr RHS, LHS
+  2554326326U,	// <6,2,7,4>: Cost 3 vext1 <0,6,2,7>, RHS
+  3304687564U,	// <6,2,7,5>: Cost 4 vrev <2,6,5,7>
+  2980495708U,	// <6,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2646906476U,	// <6,2,7,7>: Cost 3 vext2 <4,u,6,2>, <7,7,7,7>
+  1906753643U,	// <6,2,7,u>: Cost 2 vzipr RHS, LHS
+  1591744256U,	// <6,2,u,0>: Cost 2 vext2 <u,0,6,2>, <u,0,6,2>
+  1573164846U,	// <6,2,u,1>: Cost 2 vext2 <4,u,6,2>, LHS
+  2701805650U,	// <6,2,u,2>: Cost 3 vext3 <2,u,2,6>, <2,u,2,6>
+  1906761830U,	// <6,2,u,3>: Cost 2 vzipr RHS, LHS
+  2686027875U,	// <6,2,u,4>: Cost 3 vext3 <0,2,4,6>, <2,u,4,5>
+  1573165210U,	// <6,2,u,5>: Cost 2 vext2 <4,u,6,2>, RHS
+  2686322800U,	// <6,2,u,6>: Cost 3 vext3 <0,2,u,6>, <2,u,6,0>
+  2847051305U,	// <6,2,u,7>: Cost 3 vuzpr <4,6,u,2>, RHS
+  1906761835U,	// <6,2,u,u>: Cost 2 vzipr RHS, LHS
+  3759769739U,	// <6,3,0,0>: Cost 4 vext3 <0,2,4,6>, <3,0,0,0>
+  2686027926U,	// <6,3,0,1>: Cost 3 vext3 <0,2,4,6>, <3,0,1,2>
+  2686027937U,	// <6,3,0,2>: Cost 3 vext3 <0,2,4,6>, <3,0,2,4>
+  3640027286U,	// <6,3,0,3>: Cost 4 vext1 <2,6,3,0>, <3,0,1,2>
+  2687207601U,	// <6,3,0,4>: Cost 3 vext3 <0,4,2,6>, <3,0,4,2>
+  2705344698U,	// <6,3,0,5>: Cost 3 vext3 <3,4,5,6>, <3,0,5,2>
+  3663917847U,	// <6,3,0,6>: Cost 4 vext1 <6,6,3,0>, <6,6,3,0>
+  2237008560U,	// <6,3,0,7>: Cost 3 vrev <3,6,7,0>
+  2686027989U,	// <6,3,0,u>: Cost 3 vext3 <0,2,4,6>, <3,0,u,2>
+  3759769823U,	// <6,3,1,0>: Cost 4 vext3 <0,2,4,6>, <3,1,0,3>
+  3759769830U,	// <6,3,1,1>: Cost 4 vext3 <0,2,4,6>, <3,1,1,1>
+  3759769841U,	// <6,3,1,2>: Cost 4 vext3 <0,2,4,6>, <3,1,2,3>
+  3759769848U,	// <6,3,1,3>: Cost 4 vext3 <0,2,4,6>, <3,1,3,1>
+  2703280390U,	// <6,3,1,4>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3759769868U,	// <6,3,1,5>: Cost 4 vext3 <0,2,4,6>, <3,1,5,3>
+  3704063194U,	// <6,3,1,6>: Cost 4 vext2 <2,1,6,3>, <1,6,3,0>
+  3767732510U,	// <6,3,1,7>: Cost 4 vext3 <1,5,4,6>, <3,1,7,3>
+  2703280390U,	// <6,3,1,u>: Cost 3 vext3 <3,1,4,6>, <3,1,4,6>
+  3704063468U,	// <6,3,2,0>: Cost 4 vext2 <2,1,6,3>, <2,0,6,4>
+  2630321724U,	// <6,3,2,1>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769921U,	// <6,3,2,2>: Cost 4 vext3 <0,2,4,6>, <3,2,2,2>
+  3759769928U,	// <6,3,2,3>: Cost 4 vext3 <0,2,4,6>, <3,2,3,0>
+  3704063767U,	// <6,3,2,4>: Cost 4 vext2 <2,1,6,3>, <2,4,3,6>
+  3704063876U,	// <6,3,2,5>: Cost 4 vext2 <2,1,6,3>, <2,5,6,7>
+  2636957626U,	// <6,3,2,6>: Cost 3 vext2 <3,2,6,3>, <2,6,3,7>
+  3777907058U,	// <6,3,2,7>: Cost 4 vext3 <3,2,7,6>, <3,2,7,6>
+  2630321724U,	// <6,3,2,u>: Cost 3 vext2 <2,1,6,3>, <2,1,6,3>
+  3759769983U,	// <6,3,3,0>: Cost 4 vext3 <0,2,4,6>, <3,3,0,1>
+  3710036245U,	// <6,3,3,1>: Cost 4 vext2 <3,1,6,3>, <3,1,6,3>
+  2636958054U,	// <6,3,3,2>: Cost 3 vext2 <3,2,6,3>, <3,2,6,3>
+  2686028188U,	// <6,3,3,3>: Cost 3 vext3 <0,2,4,6>, <3,3,3,3>
+  2704607656U,	// <6,3,3,4>: Cost 3 vext3 <3,3,4,6>, <3,3,4,6>
+  3773041072U,	// <6,3,3,5>: Cost 4 vext3 <2,4,4,6>, <3,3,5,5>
+  3711363731U,	// <6,3,3,6>: Cost 4 vext2 <3,3,6,3>, <3,6,3,7>
+  3767732676U,	// <6,3,3,7>: Cost 4 vext3 <1,5,4,6>, <3,3,7,7>
+  2707999179U,	// <6,3,3,u>: Cost 3 vext3 <3,u,5,6>, <3,3,u,5>
+  2584232038U,	// <6,3,4,0>: Cost 3 vext1 <5,6,3,4>, LHS
+  2642267118U,	// <6,3,4,1>: Cost 3 vext2 <4,1,6,3>, <4,1,6,3>
+  2642930751U,	// <6,3,4,2>: Cost 3 vext2 <4,2,6,3>, <4,2,6,3>
+  2705197552U,	// <6,3,4,3>: Cost 3 vext3 <3,4,3,6>, <3,4,3,6>
+  2584235318U,	// <6,3,4,4>: Cost 3 vext1 <5,6,3,4>, RHS
+  1631603202U,	// <6,3,4,5>: Cost 2 vext3 <3,4,5,6>, <3,4,5,6>
+  2654211444U,	// <6,3,4,6>: Cost 3 vext2 <6,1,6,3>, <4,6,4,6>
+  2237041332U,	// <6,3,4,7>: Cost 3 vrev <3,6,7,4>
+  1631824413U,	// <6,3,4,u>: Cost 2 vext3 <3,4,u,6>, <3,4,u,6>
+  3640066150U,	// <6,3,5,0>: Cost 4 vext1 <2,6,3,5>, LHS
+  3772746288U,	// <6,3,5,1>: Cost 4 vext3 <2,4,0,6>, <3,5,1,7>
+  3640067790U,	// <6,3,5,2>: Cost 4 vext1 <2,6,3,5>, <2,3,4,5>
+  3773041216U,	// <6,3,5,3>: Cost 4 vext3 <2,4,4,6>, <3,5,3,5>
+  2705934922U,	// <6,3,5,4>: Cost 3 vext3 <3,5,4,6>, <3,5,4,6>
+  3773041236U,	// <6,3,5,5>: Cost 4 vext3 <2,4,4,6>, <3,5,5,7>
+  3779086940U,	// <6,3,5,6>: Cost 4 vext3 <3,4,5,6>, <3,5,6,6>
+  3767732831U,	// <6,3,5,7>: Cost 4 vext3 <1,5,4,6>, <3,5,7,0>
+  2706229870U,	// <6,3,5,u>: Cost 3 vext3 <3,5,u,6>, <3,5,u,6>
+  2602164326U,	// <6,3,6,0>: Cost 3 vext1 <u,6,3,6>, LHS
+  2654212512U,	// <6,3,6,1>: Cost 3 vext2 <6,1,6,3>, <6,1,6,3>
+  2566334393U,	// <6,3,6,2>: Cost 3 vext1 <2,6,3,6>, <2,6,3,6>
+  3704066588U,	// <6,3,6,3>: Cost 4 vext2 <2,1,6,3>, <6,3,2,1>
+  2602167524U,	// <6,3,6,4>: Cost 3 vext1 <u,6,3,6>, <4,4,6,6>
+  3710702321U,	// <6,3,6,5>: Cost 4 vext2 <3,2,6,3>, <6,5,7,7>
+  2724661933U,	// <6,3,6,6>: Cost 3 vext3 <6,6,6,6>, <3,6,6,6>
+  3710702465U,	// <6,3,6,7>: Cost 4 vext2 <3,2,6,3>, <6,7,5,7>
+  2602170158U,	// <6,3,6,u>: Cost 3 vext1 <u,6,3,6>, LHS
+  1492598886U,	// <6,3,7,0>: Cost 2 vext1 <2,6,3,7>, LHS
+  2560369889U,	// <6,3,7,1>: Cost 3 vext1 <1,6,3,7>, <1,6,3,7>
+  1492600762U,	// <6,3,7,2>: Cost 2 vext1 <2,6,3,7>, <2,6,3,7>
+  2566342806U,	// <6,3,7,3>: Cost 3 vext1 <2,6,3,7>, <3,0,1,2>
+  1492602166U,	// <6,3,7,4>: Cost 2 vext1 <2,6,3,7>, RHS
+  2602176208U,	// <6,3,7,5>: Cost 3 vext1 <u,6,3,7>, <5,1,7,3>
+  2566345210U,	// <6,3,7,6>: Cost 3 vext1 <2,6,3,7>, <6,2,7,3>
+  2980496528U,	// <6,3,7,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492604718U,	// <6,3,7,u>: Cost 2 vext1 <2,6,3,7>, LHS
+  1492607078U,	// <6,3,u,0>: Cost 2 vext1 <2,6,3,u>, LHS
+  2686028574U,	// <6,3,u,1>: Cost 3 vext3 <0,2,4,6>, <3,u,1,2>
+  1492608955U,	// <6,3,u,2>: Cost 2 vext1 <2,6,3,u>, <2,6,3,u>
+  2566350998U,	// <6,3,u,3>: Cost 3 vext1 <2,6,3,u>, <3,0,1,2>
+  1492610358U,	// <6,3,u,4>: Cost 2 vext1 <2,6,3,u>, RHS
+  1634257734U,	// <6,3,u,5>: Cost 2 vext3 <3,u,5,6>, <3,u,5,6>
+  2566353489U,	// <6,3,u,6>: Cost 3 vext1 <2,6,3,u>, <6,3,u,0>
+  2980504720U,	// <6,3,u,7>: Cost 3 vzipr RHS, <1,5,3,7>
+  1492612910U,	// <6,3,u,u>: Cost 2 vext1 <2,6,3,u>, LHS
+  3703406592U,	// <6,4,0,0>: Cost 4 vext2 <2,0,6,4>, <0,0,0,0>
+  2629664870U,	// <6,4,0,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2629664972U,	// <6,4,0,2>: Cost 3 vext2 <2,0,6,4>, <0,2,4,6>
+  3779087232U,	// <6,4,0,3>: Cost 4 vext3 <3,4,5,6>, <4,0,3,1>
+  2642936156U,	// <6,4,0,4>: Cost 3 vext2 <4,2,6,4>, <0,4,2,6>
+  2712570770U,	// <6,4,0,5>: Cost 3 vext3 <4,6,4,6>, <4,0,5,1>
+  2687208348U,	// <6,4,0,6>: Cost 3 vext3 <0,4,2,6>, <4,0,6,2>
+  3316723081U,	// <6,4,0,7>: Cost 4 vrev <4,6,7,0>
+  2629665437U,	// <6,4,0,u>: Cost 3 vext2 <2,0,6,4>, LHS
+  2242473291U,	// <6,4,1,0>: Cost 3 vrev <4,6,0,1>
+  3700089652U,	// <6,4,1,1>: Cost 4 vext2 <1,4,6,4>, <1,1,1,1>
+  3703407510U,	// <6,4,1,2>: Cost 4 vext2 <2,0,6,4>, <1,2,3,0>
+  2852962406U,	// <6,4,1,3>: Cost 3 vuzpr <5,6,7,4>, LHS
+  3628166454U,	// <6,4,1,4>: Cost 4 vext1 <0,6,4,1>, RHS
+  3760876514U,	// <6,4,1,5>: Cost 4 vext3 <0,4,1,6>, <4,1,5,0>
+  2687208430U,	// <6,4,1,6>: Cost 3 vext3 <0,4,2,6>, <4,1,6,3>
+  3316731274U,	// <6,4,1,7>: Cost 4 vrev <4,6,7,1>
+  2243063187U,	// <6,4,1,u>: Cost 3 vrev <4,6,u,1>
+  2629666284U,	// <6,4,2,0>: Cost 3 vext2 <2,0,6,4>, <2,0,6,4>
+  3703408188U,	// <6,4,2,1>: Cost 4 vext2 <2,0,6,4>, <2,1,6,3>
+  3703408232U,	// <6,4,2,2>: Cost 4 vext2 <2,0,6,4>, <2,2,2,2>
+  3703408294U,	// <6,4,2,3>: Cost 4 vext2 <2,0,6,4>, <2,3,0,1>
+  2632320816U,	// <6,4,2,4>: Cost 3 vext2 <2,4,6,4>, <2,4,6,4>
+  2923384118U,	// <6,4,2,5>: Cost 3 vzipl <6,2,7,3>, RHS
+  2687208508U,	// <6,4,2,6>: Cost 3 vext3 <0,4,2,6>, <4,2,6,0>
+  3760950341U,	// <6,4,2,7>: Cost 4 vext3 <0,4,2,6>, <4,2,7,0>
+  2634975348U,	// <6,4,2,u>: Cost 3 vext2 <2,u,6,4>, <2,u,6,4>
+  3703408790U,	// <6,4,3,0>: Cost 4 vext2 <2,0,6,4>, <3,0,1,2>
+  3316305238U,	// <6,4,3,1>: Cost 4 vrev <4,6,1,3>
+  3703408947U,	// <6,4,3,2>: Cost 4 vext2 <2,0,6,4>, <3,2,0,6>
+  3703409052U,	// <6,4,3,3>: Cost 4 vext2 <2,0,6,4>, <3,3,3,3>
+  2644929026U,	// <6,4,3,4>: Cost 3 vext2 <4,5,6,4>, <3,4,5,6>
+  3718670922U,	// <6,4,3,5>: Cost 4 vext2 <4,5,6,4>, <3,5,4,6>
+  2705345682U,	// <6,4,3,6>: Cost 3 vext3 <3,4,5,6>, <4,3,6,5>
+  3926705152U,	// <6,4,3,7>: Cost 4 vuzpr <5,6,7,4>, <1,3,5,7>
+  2668817222U,	// <6,4,3,u>: Cost 3 vext2 <u,5,6,4>, <3,u,5,6>
+  2590277734U,	// <6,4,4,0>: Cost 3 vext1 <6,6,4,4>, LHS
+  3716017135U,	// <6,4,4,1>: Cost 4 vext2 <4,1,6,4>, <4,1,6,4>
+  2642938944U,	// <6,4,4,2>: Cost 3 vext2 <4,2,6,4>, <4,2,6,4>
+  3717344401U,	// <6,4,4,3>: Cost 4 vext2 <4,3,6,4>, <4,3,6,4>
+  2712571088U,	// <6,4,4,4>: Cost 3 vext3 <4,6,4,6>, <4,4,4,4>
+  2629668150U,	// <6,4,4,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1637649636U,	// <6,4,4,6>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2646257109U,	// <6,4,4,7>: Cost 3 vext2 <4,7,6,4>, <4,7,6,4>
+  1637649636U,	// <6,4,4,u>: Cost 2 vext3 <4,4,6,6>, <4,4,6,6>
+  2566398054U,	// <6,4,5,0>: Cost 3 vext1 <2,6,4,5>, LHS
+  3760876805U,	// <6,4,5,1>: Cost 4 vext3 <0,4,1,6>, <4,5,1,3>
+  2566399937U,	// <6,4,5,2>: Cost 3 vext1 <2,6,4,5>, <2,6,4,5>
+  2584316418U,	// <6,4,5,3>: Cost 3 vext1 <5,6,4,5>, <3,4,5,6>
+  2566401334U,	// <6,4,5,4>: Cost 3 vext1 <2,6,4,5>, RHS
+  2584318028U,	// <6,4,5,5>: Cost 3 vext1 <5,6,4,5>, <5,6,4,5>
+  1612287286U,	// <6,4,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965686U,	// <6,4,5,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287304U,	// <6,4,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504608358U,	// <6,4,6,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2578350838U,	// <6,4,6,1>: Cost 3 vext1 <4,6,4,6>, <1,0,3,2>
+  2578351720U,	// <6,4,6,2>: Cost 3 vext1 <4,6,4,6>, <2,2,2,2>
+  2578352278U,	// <6,4,6,3>: Cost 3 vext1 <4,6,4,6>, <3,0,1,2>
+  1504611638U,	// <6,4,6,4>: Cost 2 vext1 <4,6,4,6>, RHS
+  2578353872U,	// <6,4,6,5>: Cost 3 vext1 <4,6,4,6>, <5,1,7,3>
+  2578354682U,	// <6,4,6,6>: Cost 3 vext1 <4,6,4,6>, <6,2,7,3>
+  2578355194U,	// <6,4,6,7>: Cost 3 vext1 <4,6,4,6>, <7,0,1,2>
+  1504614190U,	// <6,4,6,u>: Cost 2 vext1 <4,6,4,6>, LHS
+  2572386406U,	// <6,4,7,0>: Cost 3 vext1 <3,6,4,7>, LHS
+  2572387226U,	// <6,4,7,1>: Cost 3 vext1 <3,6,4,7>, <1,2,3,4>
+  3640157902U,	// <6,4,7,2>: Cost 4 vext1 <2,6,4,7>, <2,3,4,5>
+  2572389020U,	// <6,4,7,3>: Cost 3 vext1 <3,6,4,7>, <3,6,4,7>
+  2572389686U,	// <6,4,7,4>: Cost 3 vext1 <3,6,4,7>, RHS
+  2980497102U,	// <6,4,7,5>: Cost 3 vzipr RHS, <2,3,4,5>
+  2980495564U,	// <6,4,7,6>: Cost 3 vzipr RHS, <0,2,4,6>
+  4054239090U,	// <6,4,7,7>: Cost 4 vzipr RHS, <2,5,4,7>
+  2572392238U,	// <6,4,7,u>: Cost 3 vext1 <3,6,4,7>, LHS
+  1504608358U,	// <6,4,u,0>: Cost 2 vext1 <4,6,4,6>, LHS
+  2629670702U,	// <6,4,u,1>: Cost 3 vext2 <2,0,6,4>, LHS
+  2566424516U,	// <6,4,u,2>: Cost 3 vext1 <2,6,4,u>, <2,6,4,u>
+  2584340994U,	// <6,4,u,3>: Cost 3 vext1 <5,6,4,u>, <3,4,5,6>
+  1640156694U,	// <6,4,u,4>: Cost 2 vext3 <4,u,4,6>, <4,u,4,6>
+  2629671066U,	// <6,4,u,5>: Cost 3 vext2 <2,0,6,4>, RHS
+  1612287529U,	// <6,4,u,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  2852965929U,	// <6,4,u,7>: Cost 3 vuzpr <5,6,7,4>, RHS
+  1612287547U,	// <6,4,u,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  3708723200U,	// <6,5,0,0>: Cost 4 vext2 <2,u,6,5>, <0,0,0,0>
+  2634981478U,	// <6,5,0,1>: Cost 3 vext2 <2,u,6,5>, LHS
+  3694125260U,	// <6,5,0,2>: Cost 4 vext2 <0,4,6,5>, <0,2,4,6>
+  3779087962U,	// <6,5,0,3>: Cost 4 vext3 <3,4,5,6>, <5,0,3,2>
+  3760877154U,	// <6,5,0,4>: Cost 4 vext3 <0,4,1,6>, <5,0,4,1>
+  4195110916U,	// <6,5,0,5>: Cost 4 vtrnr <5,6,7,0>, <5,5,5,5>
+  3696779775U,	// <6,5,0,6>: Cost 4 vext2 <0,u,6,5>, <0,6,2,7>
+  1175212130U,	// <6,5,0,7>: Cost 2 vrev <5,6,7,0>
+  1175285867U,	// <6,5,0,u>: Cost 2 vrev <5,6,u,0>
+  2248445988U,	// <6,5,1,0>: Cost 3 vrev <5,6,0,1>
+  3698107237U,	// <6,5,1,1>: Cost 4 vext2 <1,1,6,5>, <1,1,6,5>
+  3708724118U,	// <6,5,1,2>: Cost 4 vext2 <2,u,6,5>, <1,2,3,0>
+  3908575334U,	// <6,5,1,3>: Cost 4 vuzpr <2,6,4,5>, LHS
+  3716023376U,	// <6,5,1,4>: Cost 4 vext2 <4,1,6,5>, <1,4,5,6>
+  3708724368U,	// <6,5,1,5>: Cost 4 vext2 <2,u,6,5>, <1,5,3,7>
+  3767733960U,	// <6,5,1,6>: Cost 4 vext3 <1,5,4,6>, <5,1,6,4>
+  2712571600U,	// <6,5,1,7>: Cost 3 vext3 <4,6,4,6>, <5,1,7,3>
+  2712571609U,	// <6,5,1,u>: Cost 3 vext3 <4,6,4,6>, <5,1,u,3>
+  2578391142U,	// <6,5,2,0>: Cost 3 vext1 <4,6,5,2>, LHS
+  3704079934U,	// <6,5,2,1>: Cost 4 vext2 <2,1,6,5>, <2,1,6,5>
+  3708724840U,	// <6,5,2,2>: Cost 4 vext2 <2,u,6,5>, <2,2,2,2>
+  3705407182U,	// <6,5,2,3>: Cost 4 vext2 <2,3,6,5>, <2,3,4,5>
+  2578394422U,	// <6,5,2,4>: Cost 3 vext1 <4,6,5,2>, RHS
+  3717351272U,	// <6,5,2,5>: Cost 4 vext2 <4,3,6,5>, <2,5,3,6>
+  2634983354U,	// <6,5,2,6>: Cost 3 vext2 <2,u,6,5>, <2,6,3,7>
+  3115486518U,	// <6,5,2,7>: Cost 3 vtrnr <4,6,u,2>, RHS
+  2634983541U,	// <6,5,2,u>: Cost 3 vext2 <2,u,6,5>, <2,u,6,5>
+  3708725398U,	// <6,5,3,0>: Cost 4 vext2 <2,u,6,5>, <3,0,1,2>
+  3710052631U,	// <6,5,3,1>: Cost 4 vext2 <3,1,6,5>, <3,1,6,5>
+  3708725606U,	// <6,5,3,2>: Cost 4 vext2 <2,u,6,5>, <3,2,6,3>
+  3708725660U,	// <6,5,3,3>: Cost 4 vext2 <2,u,6,5>, <3,3,3,3>
+  2643610114U,	// <6,5,3,4>: Cost 3 vext2 <4,3,6,5>, <3,4,5,6>
+  3717352010U,	// <6,5,3,5>: Cost 4 vext2 <4,3,6,5>, <3,5,4,6>
+  3773632358U,	// <6,5,3,6>: Cost 4 vext3 <2,5,3,6>, <5,3,6,0>
+  2248978533U,	// <6,5,3,7>: Cost 3 vrev <5,6,7,3>
+  2249052270U,	// <6,5,3,u>: Cost 3 vrev <5,6,u,3>
+  2596323430U,	// <6,5,4,0>: Cost 3 vext1 <7,6,5,4>, LHS
+  3716025328U,	// <6,5,4,1>: Cost 4 vext2 <4,1,6,5>, <4,1,6,5>
+  3716688961U,	// <6,5,4,2>: Cost 4 vext2 <4,2,6,5>, <4,2,6,5>
+  2643610770U,	// <6,5,4,3>: Cost 3 vext2 <4,3,6,5>, <4,3,6,5>
+  2596326710U,	// <6,5,4,4>: Cost 3 vext1 <7,6,5,4>, RHS
+  2634984758U,	// <6,5,4,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  3767734199U,	// <6,5,4,6>: Cost 4 vext3 <1,5,4,6>, <5,4,6,0>
+  1643696070U,	// <6,5,4,7>: Cost 2 vext3 <5,4,7,6>, <5,4,7,6>
+  1643769807U,	// <6,5,4,u>: Cost 2 vext3 <5,4,u,6>, <5,4,u,6>
+  2578415718U,	// <6,5,5,0>: Cost 3 vext1 <4,6,5,5>, LHS
+  3652158198U,	// <6,5,5,1>: Cost 4 vext1 <4,6,5,5>, <1,0,3,2>
+  3652159080U,	// <6,5,5,2>: Cost 4 vext1 <4,6,5,5>, <2,2,2,2>
+  3652159638U,	// <6,5,5,3>: Cost 4 vext1 <4,6,5,5>, <3,0,1,2>
+  2578418998U,	// <6,5,5,4>: Cost 3 vext1 <4,6,5,5>, RHS
+  2712571908U,	// <6,5,5,5>: Cost 3 vext3 <4,6,4,6>, <5,5,5,5>
+  2718027790U,	// <6,5,5,6>: Cost 3 vext3 <5,5,6,6>, <5,5,6,6>
+  2712571928U,	// <6,5,5,7>: Cost 3 vext3 <4,6,4,6>, <5,5,7,7>
+  2712571937U,	// <6,5,5,u>: Cost 3 vext3 <4,6,4,6>, <5,5,u,7>
+  2705346596U,	// <6,5,6,0>: Cost 3 vext3 <3,4,5,6>, <5,6,0,1>
+  3767144496U,	// <6,5,6,1>: Cost 4 vext3 <1,4,5,6>, <5,6,1,4>
+  3773116473U,	// <6,5,6,2>: Cost 4 vext3 <2,4,5,6>, <5,6,2,4>
+  2705346626U,	// <6,5,6,3>: Cost 3 vext3 <3,4,5,6>, <5,6,3,4>
+  2705346636U,	// <6,5,6,4>: Cost 3 vext3 <3,4,5,6>, <5,6,4,5>
+  3908577217U,	// <6,5,6,5>: Cost 4 vuzpr <2,6,4,5>, <2,6,4,5>
+  2578428728U,	// <6,5,6,6>: Cost 3 vext1 <4,6,5,6>, <6,6,6,6>
+  2712572002U,	// <6,5,6,7>: Cost 3 vext3 <4,6,4,6>, <5,6,7,0>
+  2705346668U,	// <6,5,6,u>: Cost 3 vext3 <3,4,5,6>, <5,6,u,1>
+  2560516198U,	// <6,5,7,0>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560517363U,	// <6,5,7,1>: Cost 3 vext1 <1,6,5,7>, <1,6,5,7>
+  2566490060U,	// <6,5,7,2>: Cost 3 vext1 <2,6,5,7>, <2,6,5,7>
+  3634260118U,	// <6,5,7,3>: Cost 4 vext1 <1,6,5,7>, <3,0,1,2>
+  2560519478U,	// <6,5,7,4>: Cost 3 vext1 <1,6,5,7>, RHS
+  2980498650U,	// <6,5,7,5>: Cost 3 vzipr RHS, <4,4,5,5>
+  2980497922U,	// <6,5,7,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  3103214902U,	// <6,5,7,7>: Cost 3 vtrnr <2,6,3,7>, RHS
+  2560522030U,	// <6,5,7,u>: Cost 3 vext1 <1,6,5,7>, LHS
+  2560524390U,	// <6,5,u,0>: Cost 3 vext1 <1,6,5,u>, LHS
+  2560525556U,	// <6,5,u,1>: Cost 3 vext1 <1,6,5,u>, <1,6,5,u>
+  2566498253U,	// <6,5,u,2>: Cost 3 vext1 <2,6,5,u>, <2,6,5,u>
+  2646931439U,	// <6,5,u,3>: Cost 3 vext2 <4,u,6,5>, <u,3,5,7>
+  2560527670U,	// <6,5,u,4>: Cost 3 vext1 <1,6,5,u>, RHS
+  2634987674U,	// <6,5,u,5>: Cost 3 vext2 <2,u,6,5>, RHS
+  2980506114U,	// <6,5,u,6>: Cost 3 vzipr RHS, <3,4,5,6>
+  1175277674U,	// <6,5,u,7>: Cost 2 vrev <5,6,7,u>
+  1175351411U,	// <6,5,u,u>: Cost 2 vrev <5,6,u,u>
+  2578448486U,	// <6,6,0,0>: Cost 3 vext1 <4,6,6,0>, LHS
+  1573191782U,	// <6,6,0,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2686030124U,	// <6,6,0,2>: Cost 3 vext3 <0,2,4,6>, <6,0,2,4>
+  3779088690U,	// <6,6,0,3>: Cost 4 vext3 <3,4,5,6>, <6,0,3,1>
+  2687209788U,	// <6,6,0,4>: Cost 3 vext3 <0,4,2,6>, <6,0,4,2>
+  3652194000U,	// <6,6,0,5>: Cost 4 vext1 <4,6,6,0>, <5,1,7,3>
+  2254852914U,	// <6,6,0,6>: Cost 3 vrev <6,6,6,0>
+  4041575734U,	// <6,6,0,7>: Cost 4 vzipr <2,4,6,0>, RHS
+  1573192349U,	// <6,6,0,u>: Cost 2 vext2 <4,u,6,6>, LHS
+  2646934262U,	// <6,6,1,0>: Cost 3 vext2 <4,u,6,6>, <1,0,3,2>
+  2646934324U,	// <6,6,1,1>: Cost 3 vext2 <4,u,6,6>, <1,1,1,1>
+  2646934422U,	// <6,6,1,2>: Cost 3 vext2 <4,u,6,6>, <1,2,3,0>
+  2846785638U,	// <6,6,1,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3760951694U,	// <6,6,1,4>: Cost 4 vext3 <0,4,2,6>, <6,1,4,3>
+  2646934672U,	// <6,6,1,5>: Cost 3 vext2 <4,u,6,6>, <1,5,3,7>
+  2712572320U,	// <6,6,1,6>: Cost 3 vext3 <4,6,4,6>, <6,1,6,3>
+  3775549865U,	// <6,6,1,7>: Cost 4 vext3 <2,u,2,6>, <6,1,7,3>
+  2846785643U,	// <6,6,1,u>: Cost 3 vuzpr <4,6,4,6>, LHS
+  3759772094U,	// <6,6,2,0>: Cost 4 vext3 <0,2,4,6>, <6,2,0,6>
+  3704751676U,	// <6,6,2,1>: Cost 4 vext2 <2,2,6,6>, <2,1,6,3>
+  2631009936U,	// <6,6,2,2>: Cost 3 vext2 <2,2,6,6>, <2,2,6,6>
+  2646935206U,	// <6,6,2,3>: Cost 3 vext2 <4,u,6,6>, <2,3,0,1>
+  3759772127U,	// <6,6,2,4>: Cost 4 vext3 <0,2,4,6>, <6,2,4,3>
+  3704752004U,	// <6,6,2,5>: Cost 4 vext2 <2,2,6,6>, <2,5,6,7>
+  2646935482U,	// <6,6,2,6>: Cost 3 vext2 <4,u,6,6>, <2,6,3,7>
+  2712572410U,	// <6,6,2,7>: Cost 3 vext3 <4,6,4,6>, <6,2,7,3>
+  2712572419U,	// <6,6,2,u>: Cost 3 vext3 <4,6,4,6>, <6,2,u,3>
+  2646935702U,	// <6,6,3,0>: Cost 3 vext2 <4,u,6,6>, <3,0,1,2>
+  3777024534U,	// <6,6,3,1>: Cost 4 vext3 <3,1,4,6>, <6,3,1,4>
+  3704752453U,	// <6,6,3,2>: Cost 4 vext2 <2,2,6,6>, <3,2,2,6>
+  2646935964U,	// <6,6,3,3>: Cost 3 vext2 <4,u,6,6>, <3,3,3,3>
+  2705347122U,	// <6,6,3,4>: Cost 3 vext3 <3,4,5,6>, <6,3,4,5>
+  3779678778U,	// <6,6,3,5>: Cost 4 vext3 <3,5,4,6>, <6,3,5,4>
+  2657553069U,	// <6,6,3,6>: Cost 3 vext2 <6,6,6,6>, <3,6,6,6>
+  4039609654U,	// <6,6,3,7>: Cost 4 vzipr <2,1,6,3>, RHS
+  2708001366U,	// <6,6,3,u>: Cost 3 vext3 <3,u,5,6>, <6,3,u,5>
+  2578481254U,	// <6,6,4,0>: Cost 3 vext1 <4,6,6,4>, LHS
+  3652223734U,	// <6,6,4,1>: Cost 4 vext1 <4,6,6,4>, <1,0,3,2>
+  3760951922U,	// <6,6,4,2>: Cost 4 vext3 <0,4,2,6>, <6,4,2,6>
+  3779089019U,	// <6,6,4,3>: Cost 4 vext3 <3,4,5,6>, <6,4,3,6>
+  1570540772U,	// <6,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1573195062U,	// <6,6,4,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  2712572560U,	// <6,6,4,6>: Cost 3 vext3 <4,6,4,6>, <6,4,6,0>
+  2723410591U,	// <6,6,4,7>: Cost 3 vext3 <6,4,7,6>, <6,4,7,6>
+  1573195304U,	// <6,6,4,u>: Cost 2 vext2 <4,u,6,6>, <4,u,6,6>
+  3640287334U,	// <6,6,5,0>: Cost 4 vext1 <2,6,6,5>, LHS
+  2646937296U,	// <6,6,5,1>: Cost 3 vext2 <4,u,6,6>, <5,1,7,3>
+  3640289235U,	// <6,6,5,2>: Cost 4 vext1 <2,6,6,5>, <2,6,6,5>
+  3720679279U,	// <6,6,5,3>: Cost 4 vext2 <4,u,6,6>, <5,3,7,0>
+  2646937542U,	// <6,6,5,4>: Cost 3 vext2 <4,u,6,6>, <5,4,7,6>
+  2646937604U,	// <6,6,5,5>: Cost 3 vext2 <4,u,6,6>, <5,5,5,5>
+  2646937698U,	// <6,6,5,6>: Cost 3 vext2 <4,u,6,6>, <5,6,7,0>
+  2846788918U,	// <6,6,5,7>: Cost 3 vuzpr <4,6,4,6>, RHS
+  2846788919U,	// <6,6,5,u>: Cost 3 vuzpr <4,6,4,6>, RHS
+  1516699750U,	// <6,6,6,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  2590442230U,	// <6,6,6,1>: Cost 3 vext1 <6,6,6,6>, <1,0,3,2>
+  2646938106U,	// <6,6,6,2>: Cost 3 vext2 <4,u,6,6>, <6,2,7,3>
+  2590443670U,	// <6,6,6,3>: Cost 3 vext1 <6,6,6,6>, <3,0,1,2>
+  1516703030U,	// <6,6,6,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  2590445264U,	// <6,6,6,5>: Cost 3 vext1 <6,6,6,6>, <5,1,7,3>
+  296144182U,	// <6,6,6,6>: Cost 1 vdup2 RHS
+  2712572738U,	// <6,6,6,7>: Cost 3 vext3 <4,6,4,6>, <6,6,7,7>
+  296144182U,	// <6,6,6,u>: Cost 1 vdup2 RHS
+  2566561894U,	// <6,6,7,0>: Cost 3 vext1 <2,6,6,7>, LHS
+  3634332924U,	// <6,6,7,1>: Cost 4 vext1 <1,6,6,7>, <1,6,6,7>
+  2566563797U,	// <6,6,7,2>: Cost 3 vext1 <2,6,6,7>, <2,6,6,7>
+  2584480258U,	// <6,6,7,3>: Cost 3 vext1 <5,6,6,7>, <3,4,5,6>
+  2566565174U,	// <6,6,7,4>: Cost 3 vext1 <2,6,6,7>, RHS
+  2717438846U,	// <6,6,7,5>: Cost 3 vext3 <5,4,7,6>, <6,7,5,4>
+  2980500280U,	// <6,6,7,6>: Cost 3 vzipr RHS, <6,6,6,6>
+  1906756918U,	// <6,6,7,7>: Cost 2 vzipr RHS, RHS
+  1906756919U,	// <6,6,7,u>: Cost 2 vzipr RHS, RHS
+  1516699750U,	// <6,6,u,0>: Cost 2 vext1 <6,6,6,6>, LHS
+  1573197614U,	// <6,6,u,1>: Cost 2 vext2 <4,u,6,6>, LHS
+  2566571990U,	// <6,6,u,2>: Cost 3 vext1 <2,6,6,u>, <2,6,6,u>
+  2846786205U,	// <6,6,u,3>: Cost 3 vuzpr <4,6,4,6>, LHS
+  1516703030U,	// <6,6,u,4>: Cost 2 vext1 <6,6,6,6>, RHS
+  1573197978U,	// <6,6,u,5>: Cost 2 vext2 <4,u,6,6>, RHS
+  296144182U,	// <6,6,u,6>: Cost 1 vdup2 RHS
+  1906765110U,	// <6,6,u,7>: Cost 2 vzipr RHS, RHS
+  296144182U,	// <6,6,u,u>: Cost 1 vdup2 RHS
+  1571209216U,	// <6,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497467494U,	// <6,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571209380U,	// <6,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2644951292U,	// <6,7,0,3>: Cost 3 vext2 RHS, <0,3,1,0>
+  1571209554U,	// <6,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510756450U,	// <6,7,0,5>: Cost 2 vext1 <5,6,7,0>, <5,6,7,0>
+  2644951542U,	// <6,7,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  2584499194U,	// <6,7,0,7>: Cost 3 vext1 <5,6,7,0>, <7,0,1,2>
+  497468061U,	// <6,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571209974U,	// <6,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571210036U,	// <6,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571210134U,	// <6,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1571210200U,	// <6,7,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2644952098U,	// <6,7,1,4>: Cost 3 vext2 RHS, <1,4,0,5>
+  1571210384U,	// <6,7,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644952271U,	// <6,7,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2578535418U,	// <6,7,1,7>: Cost 3 vext1 <4,6,7,1>, <7,0,1,2>
+  1571210605U,	// <6,7,1,u>: Cost 2 vext2 RHS, <1,u,1,3>
+  2644952509U,	// <6,7,2,0>: Cost 3 vext2 RHS, <2,0,1,2>
+  2644952582U,	// <6,7,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571210856U,	// <6,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571210918U,	// <6,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2644952828U,	// <6,7,2,4>: Cost 3 vext2 RHS, <2,4,0,6>
+  2633009028U,	// <6,7,2,5>: Cost 3 vext2 <2,5,6,7>, <2,5,6,7>
+  1571211194U,	// <6,7,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2668840938U,	// <6,7,2,7>: Cost 3 vext2 RHS, <2,7,0,1>
+  1571211323U,	// <6,7,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571211414U,	// <6,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644953311U,	// <6,7,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2644953390U,	// <6,7,3,2>: Cost 3 vext2 RHS, <3,2,0,1>
+  1571211676U,	// <6,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571211778U,	// <6,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2644953648U,	// <6,7,3,5>: Cost 3 vext2 RHS, <3,5,1,7>
+  2644953720U,	// <6,7,3,6>: Cost 3 vext2 RHS, <3,6,0,7>
+  2644953795U,	// <6,7,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571212062U,	// <6,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1573202834U,	// <6,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644954058U,	// <6,7,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  2644954166U,	// <6,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2644954258U,	// <6,7,4,3>: Cost 3 vext2 RHS, <4,3,6,5>
+  1571212496U,	// <6,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497470774U,	// <6,7,4,5>: Cost 1 vext2 RHS, RHS
+  1573203316U,	// <6,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2646281688U,	// <6,7,4,7>: Cost 3 vext2 <4,7,6,7>, <4,7,6,7>
+  497471017U,	// <6,7,4,u>: Cost 1 vext2 RHS, RHS
+  2644954696U,	// <6,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1573203664U,	// <6,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2644954878U,	// <6,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2644954991U,	// <6,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571213254U,	// <6,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571213316U,	// <6,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571213410U,	// <6,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1573204136U,	// <6,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1573204217U,	// <6,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  2644955425U,	// <6,7,6,0>: Cost 3 vext2 RHS, <6,0,1,2>
+  2644955561U,	// <6,7,6,1>: Cost 3 vext2 RHS, <6,1,7,3>
+  1573204474U,	// <6,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2644955698U,	// <6,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  2644955789U,	// <6,7,6,4>: Cost 3 vext2 RHS, <6,4,5,6>
+  2644955889U,	// <6,7,6,5>: Cost 3 vext2 RHS, <6,5,7,7>
+  1571214136U,	// <6,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571214158U,	// <6,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1573204895U,	// <6,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1573204986U,	// <6,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2572608656U,	// <6,7,7,1>: Cost 3 vext1 <3,6,7,7>, <1,5,3,7>
+  2644956362U,	// <6,7,7,2>: Cost 3 vext2 RHS, <7,2,6,3>
+  2572610231U,	// <6,7,7,3>: Cost 3 vext1 <3,6,7,7>, <3,6,7,7>
+  1573205350U,	// <6,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  2646947220U,	// <6,7,7,5>: Cost 3 vext2 RHS, <7,5,1,7>
+  1516786498U,	// <6,7,7,6>: Cost 2 vext1 <6,6,7,7>, <6,6,7,7>
+  1571214956U,	// <6,7,7,7>: Cost 2 vext2 RHS, <7,7,7,7>
+  1573205634U,	// <6,7,7,u>: Cost 2 vext2 RHS, <7,u,1,2>
+  1571215059U,	// <6,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497473326U,	// <6,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571215237U,	// <6,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571215292U,	// <6,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571215423U,	// <6,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497473690U,	// <6,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571215568U,	// <6,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  1573206272U,	// <6,7,u,7>: Cost 2 vext2 RHS, <u,7,0,1>
+  497473893U,	// <6,7,u,u>: Cost 1 vext2 RHS, LHS
+  1571217408U,	// <6,u,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497475686U,	// <6,u,0,1>: Cost 1 vext2 RHS, LHS
+  1571217572U,	// <6,u,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2689865445U,	// <6,u,0,3>: Cost 3 vext3 <0,u,2,6>, <u,0,3,2>
+  1571217746U,	// <6,u,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1510830187U,	// <6,u,0,5>: Cost 2 vext1 <5,6,u,0>, <5,6,u,0>
+  2644959734U,	// <6,u,0,6>: Cost 3 vext2 RHS, <0,6,1,7>
+  1193130221U,	// <6,u,0,7>: Cost 2 vrev <u,6,7,0>
+  497476253U,	// <6,u,0,u>: Cost 1 vext2 RHS, LHS
+  1571218166U,	// <6,u,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571218228U,	// <6,u,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1612289838U,	// <6,u,1,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571218392U,	// <6,u,1,3>: Cost 2 vext2 RHS, <1,3,1,3>
+  2566663478U,	// <6,u,1,4>: Cost 3 vext1 <2,6,u,1>, RHS
+  1571218576U,	// <6,u,1,5>: Cost 2 vext2 RHS, <1,5,3,7>
+  2644960463U,	// <6,u,1,6>: Cost 3 vext2 RHS, <1,6,1,7>
+  2717439835U,	// <6,u,1,7>: Cost 3 vext3 <5,4,7,6>, <u,1,7,3>
+  1612289892U,	// <6,u,1,u>: Cost 2 vext3 <0,2,4,6>, LHS
+  1504870502U,	// <6,u,2,0>: Cost 2 vext1 <4,6,u,2>, LHS
+  2644960774U,	// <6,u,2,1>: Cost 3 vext2 RHS, <2,1,0,3>
+  1571219048U,	// <6,u,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571219110U,	// <6,u,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  1504873782U,	// <6,u,2,4>: Cost 2 vext1 <4,6,u,2>, RHS
+  2633017221U,	// <6,u,2,5>: Cost 3 vext2 <2,5,6,u>, <2,5,6,u>
+  1571219386U,	// <6,u,2,6>: Cost 2 vext2 RHS, <2,6,3,7>
+  2712573868U,	// <6,u,2,7>: Cost 3 vext3 <4,6,4,6>, <u,2,7,3>
+  1571219515U,	// <6,u,2,u>: Cost 2 vext2 RHS, <2,u,0,1>
+  1571219606U,	// <6,u,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2644961503U,	// <6,u,3,1>: Cost 3 vext2 RHS, <3,1,0,3>
+  2566678499U,	// <6,u,3,2>: Cost 3 vext1 <2,6,u,3>, <2,6,u,3>
+  1571219868U,	// <6,u,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571219970U,	// <6,u,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  2689865711U,	// <6,u,3,5>: Cost 3 vext3 <0,u,2,6>, <u,3,5,7>
+  2708002806U,	// <6,u,3,6>: Cost 3 vext3 <3,u,5,6>, <u,3,6,5>
+  2644961987U,	// <6,u,3,7>: Cost 3 vext2 RHS, <3,7,0,1>
+  1571220254U,	// <6,u,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571220370U,	// <6,u,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2644962250U,	// <6,u,4,1>: Cost 3 vext2 RHS, <4,1,2,3>
+  1661245476U,	// <6,u,4,2>: Cost 2 vext3 <u,4,2,6>, <u,4,2,6>
+  2686031917U,	// <6,u,4,3>: Cost 3 vext3 <0,2,4,6>, <u,4,3,6>
+  1571220688U,	// <6,u,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497478967U,	// <6,u,4,5>: Cost 1 vext2 RHS, RHS
+  1571220852U,	// <6,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1661614161U,	// <6,u,4,7>: Cost 2 vext3 <u,4,7,6>, <u,4,7,6>
+  497479209U,	// <6,u,4,u>: Cost 1 vext2 RHS, RHS
+  2566692966U,	// <6,u,5,0>: Cost 3 vext1 <2,6,u,5>, LHS
+  1571221200U,	// <6,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2566694885U,	// <6,u,5,2>: Cost 3 vext1 <2,6,u,5>, <2,6,u,5>
+  2689865855U,	// <6,u,5,3>: Cost 3 vext3 <0,u,2,6>, <u,5,3,7>
+  1571221446U,	// <6,u,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571221508U,	// <6,u,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1612290202U,	// <6,u,5,6>: Cost 2 vext3 <0,2,4,6>, RHS
+  1571221672U,	// <6,u,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1612290220U,	// <6,u,5,u>: Cost 2 vext3 <0,2,4,6>, RHS
+  1504903270U,	// <6,u,6,0>: Cost 2 vext1 <4,6,u,6>, LHS
+  2644963752U,	// <6,u,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571222010U,	// <6,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2686032080U,	// <6,u,6,3>: Cost 3 vext3 <0,2,4,6>, <u,6,3,7>
+  1504906550U,	// <6,u,6,4>: Cost 2 vext1 <4,6,u,6>, RHS
+  2644964079U,	// <6,u,6,5>: Cost 3 vext2 RHS, <6,5,7,5>
+  296144182U,	// <6,u,6,6>: Cost 1 vdup2 RHS
+  1571222350U,	// <6,u,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  296144182U,	// <6,u,6,u>: Cost 1 vdup2 RHS
+  1492967526U,	// <6,u,7,0>: Cost 2 vext1 <2,6,u,7>, LHS
+  2560738574U,	// <6,u,7,1>: Cost 3 vext1 <1,6,u,7>, <1,6,u,7>
+  1492969447U,	// <6,u,7,2>: Cost 2 vext1 <2,6,u,7>, <2,6,u,7>
+  1906753692U,	// <6,u,7,3>: Cost 2 vzipr RHS, LHS
+  1492970806U,	// <6,u,7,4>: Cost 2 vext1 <2,6,u,7>, RHS
+  2980495761U,	// <6,u,7,5>: Cost 3 vzipr RHS, <0,4,u,5>
+  1516860235U,	// <6,u,7,6>: Cost 2 vext1 <6,6,u,7>, <6,6,u,7>
+  1906756936U,	// <6,u,7,7>: Cost 2 vzipr RHS, RHS
+  1492973358U,	// <6,u,7,u>: Cost 2 vext1 <2,6,u,7>, LHS
+  1492975718U,	// <6,u,u,0>: Cost 2 vext1 <2,6,u,u>, LHS
+  497481518U,	// <6,u,u,1>: Cost 1 vext2 RHS, LHS
+  1612290405U,	// <6,u,u,2>: Cost 2 vext3 <0,2,4,6>, LHS
+  1571223484U,	// <6,u,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1492978998U,	// <6,u,u,4>: Cost 2 vext1 <2,6,u,u>, RHS
+  497481882U,	// <6,u,u,5>: Cost 1 vext2 RHS, RHS
+  296144182U,	// <6,u,u,6>: Cost 1 vdup2 RHS
+  1906765128U,	// <6,u,u,7>: Cost 2 vzipr RHS, RHS
+  497482085U,	// <6,u,u,u>: Cost 1 vext2 RHS, LHS
+  1638318080U,	// <7,0,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638318090U,	// <7,0,0,1>: Cost 2 vext3 RHS, <0,0,1,1>
+  1638318100U,	// <7,0,0,2>: Cost 2 vext3 RHS, <0,0,2,2>
+  3646442178U,	// <7,0,0,3>: Cost 4 vext1 <3,7,0,0>, <3,7,0,0>
+  2712059941U,	// <7,0,0,4>: Cost 3 vext3 RHS, <0,0,4,1>
+  2651603364U,	// <7,0,0,5>: Cost 3 vext2 <5,6,7,0>, <0,5,1,6>
+  2590618445U,	// <7,0,0,6>: Cost 3 vext1 <6,7,0,0>, <6,7,0,0>
+  3785801798U,	// <7,0,0,7>: Cost 4 vext3 RHS, <0,0,7,7>
+  1638318153U,	// <7,0,0,u>: Cost 2 vext3 RHS, <0,0,u,1>
+  1516879974U,	// <7,0,1,0>: Cost 2 vext1 <6,7,0,1>, LHS
+  2693922911U,	// <7,0,1,1>: Cost 3 vext3 <1,5,3,7>, <0,1,1,5>
+  564576358U,	// <7,0,1,2>: Cost 1 vext3 RHS, LHS
+  2638996480U,	// <7,0,1,3>: Cost 3 vext2 <3,5,7,0>, <1,3,5,7>
+  1516883254U,	// <7,0,1,4>: Cost 2 vext1 <6,7,0,1>, RHS
+  2649613456U,	// <7,0,1,5>: Cost 3 vext2 <5,3,7,0>, <1,5,3,7>
+  1516884814U,	// <7,0,1,6>: Cost 2 vext1 <6,7,0,1>, <6,7,0,1>
+  2590626808U,	// <7,0,1,7>: Cost 3 vext1 <6,7,0,1>, <7,0,1,0>
+  564576412U,	// <7,0,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U,	// <7,0,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2692743344U,	// <7,0,2,1>: Cost 3 vext3 <1,3,5,7>, <0,2,1,5>
+  2712060084U,	// <7,0,2,2>: Cost 3 vext3 RHS, <0,2,2,0>
+  2712060094U,	// <7,0,2,3>: Cost 3 vext3 RHS, <0,2,3,1>
+  1638318284U,	// <7,0,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712060118U,	// <7,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2651604922U,	// <7,0,2,6>: Cost 3 vext2 <5,6,7,0>, <2,6,3,7>
+  2686255336U,	// <7,0,2,7>: Cost 3 vext3 <0,2,7,7>, <0,2,7,7>
+  1638318316U,	// <7,0,2,u>: Cost 2 vext3 RHS, <0,2,u,2>
+  2651605142U,	// <7,0,3,0>: Cost 3 vext2 <5,6,7,0>, <3,0,1,2>
+  2712060156U,	// <7,0,3,1>: Cost 3 vext3 RHS, <0,3,1,0>
+  2712060165U,	// <7,0,3,2>: Cost 3 vext3 RHS, <0,3,2,0>
+  2651605404U,	// <7,0,3,3>: Cost 3 vext2 <5,6,7,0>, <3,3,3,3>
+  2651605506U,	// <7,0,3,4>: Cost 3 vext2 <5,6,7,0>, <3,4,5,6>
+  2638998111U,	// <7,0,3,5>: Cost 3 vext2 <3,5,7,0>, <3,5,7,0>
+  2639661744U,	// <7,0,3,6>: Cost 3 vext2 <3,6,7,0>, <3,6,7,0>
+  3712740068U,	// <7,0,3,7>: Cost 4 vext2 <3,5,7,0>, <3,7,3,7>
+  2640989010U,	// <7,0,3,u>: Cost 3 vext2 <3,u,7,0>, <3,u,7,0>
+  2712060232U,	// <7,0,4,0>: Cost 3 vext3 RHS, <0,4,0,4>
+  1638318418U,	// <7,0,4,1>: Cost 2 vext3 RHS, <0,4,1,5>
+  1638318428U,	// <7,0,4,2>: Cost 2 vext3 RHS, <0,4,2,6>
+  3646474950U,	// <7,0,4,3>: Cost 4 vext1 <3,7,0,4>, <3,7,0,4>
+  2712060270U,	// <7,0,4,4>: Cost 3 vext3 RHS, <0,4,4,6>
+  1577864502U,	// <7,0,4,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  2651606388U,	// <7,0,4,6>: Cost 3 vext2 <5,6,7,0>, <4,6,4,6>
+  3787792776U,	// <7,0,4,7>: Cost 4 vext3 RHS, <0,4,7,5>
+  1638318481U,	// <7,0,4,u>: Cost 2 vext3 RHS, <0,4,u,5>
+  2590654566U,	// <7,0,5,0>: Cost 3 vext1 <6,7,0,5>, LHS
+  2651606736U,	// <7,0,5,1>: Cost 3 vext2 <5,6,7,0>, <5,1,7,3>
+  2712060334U,	// <7,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649616239U,	// <7,0,5,3>: Cost 3 vext2 <5,3,7,0>, <5,3,7,0>
+  2651606982U,	// <7,0,5,4>: Cost 3 vext2 <5,6,7,0>, <5,4,7,6>
+  2651607044U,	// <7,0,5,5>: Cost 3 vext2 <5,6,7,0>, <5,5,5,5>
+  1577865314U,	// <7,0,5,6>: Cost 2 vext2 <5,6,7,0>, <5,6,7,0>
+  2651607208U,	// <7,0,5,7>: Cost 3 vext2 <5,6,7,0>, <5,7,5,7>
+  1579192580U,	// <7,0,5,u>: Cost 2 vext2 <5,u,7,0>, <5,u,7,0>
+  2688393709U,	// <7,0,6,0>: Cost 3 vext3 <0,6,0,7>, <0,6,0,7>
+  2712060406U,	// <7,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  2688541183U,	// <7,0,6,2>: Cost 3 vext3 <0,6,2,7>, <0,6,2,7>
+  2655588936U,	// <7,0,6,3>: Cost 3 vext2 <6,3,7,0>, <6,3,7,0>
+  3762430481U,	// <7,0,6,4>: Cost 4 vext3 <0,6,4,7>, <0,6,4,7>
+  2651607730U,	// <7,0,6,5>: Cost 3 vext2 <5,6,7,0>, <6,5,0,7>
+  2651607864U,	// <7,0,6,6>: Cost 3 vext2 <5,6,7,0>, <6,6,6,6>
+  2651607886U,	// <7,0,6,7>: Cost 3 vext2 <5,6,7,0>, <6,7,0,1>
+  2688983605U,	// <7,0,6,u>: Cost 3 vext3 <0,6,u,7>, <0,6,u,7>
+  2651608058U,	// <7,0,7,0>: Cost 3 vext2 <5,6,7,0>, <7,0,1,2>
+  2932703334U,	// <7,0,7,1>: Cost 3 vzipl <7,7,7,7>, LHS
+  3066921062U,	// <7,0,7,2>: Cost 3 vtrnl <7,7,7,7>, LHS
+  3712742678U,	// <7,0,7,3>: Cost 4 vext2 <3,5,7,0>, <7,3,5,7>
+  2651608422U,	// <7,0,7,4>: Cost 3 vext2 <5,6,7,0>, <7,4,5,6>
+  2651608513U,	// <7,0,7,5>: Cost 3 vext2 <5,6,7,0>, <7,5,6,7>
+  2663552532U,	// <7,0,7,6>: Cost 3 vext2 <7,6,7,0>, <7,6,7,0>
+  2651608684U,	// <7,0,7,7>: Cost 3 vext2 <5,6,7,0>, <7,7,7,7>
+  2651608706U,	// <7,0,7,u>: Cost 3 vext2 <5,6,7,0>, <7,u,1,2>
+  1638318730U,	// <7,0,u,0>: Cost 2 vext3 RHS, <0,u,0,2>
+  1638318738U,	// <7,0,u,1>: Cost 2 vext3 RHS, <0,u,1,1>
+  564576925U,	// <7,0,u,2>: Cost 1 vext3 RHS, LHS
+  2572765898U,	// <7,0,u,3>: Cost 3 vext1 <3,7,0,u>, <3,7,0,u>
+  1638318770U,	// <7,0,u,4>: Cost 2 vext3 RHS, <0,u,4,6>
+  1577867418U,	// <7,0,u,5>: Cost 2 vext2 <5,6,7,0>, RHS
+  1516942165U,	// <7,0,u,6>: Cost 2 vext1 <6,7,0,u>, <6,7,0,u>
+  2651609344U,	// <7,0,u,7>: Cost 3 vext2 <5,6,7,0>, <u,7,0,1>
+  564576979U,	// <7,0,u,u>: Cost 1 vext3 RHS, LHS
+  2590687334U,	// <7,1,0,0>: Cost 3 vext1 <6,7,1,0>, LHS
+  2639003750U,	// <7,1,0,1>: Cost 3 vext2 <3,5,7,1>, LHS
+  2793357414U,	// <7,1,0,2>: Cost 3 vuzpl <7,0,1,2>, LHS
+  1638318838U,	// <7,1,0,3>: Cost 2 vext3 RHS, <1,0,3,2>
+  2590690614U,	// <7,1,0,4>: Cost 3 vext1 <6,7,1,0>, RHS
+  2712060679U,	// <7,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2590692182U,	// <7,1,0,6>: Cost 3 vext1 <6,7,1,0>, <6,7,1,0>
+  3785802521U,	// <7,1,0,7>: Cost 4 vext3 RHS, <1,0,7,1>
+  1638318883U,	// <7,1,0,u>: Cost 2 vext3 RHS, <1,0,u,2>
+  2712060715U,	// <7,1,1,0>: Cost 3 vext3 RHS, <1,1,0,1>
+  1638318900U,	// <7,1,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  3774300994U,	// <7,1,1,2>: Cost 4 vext3 <2,6,3,7>, <1,1,2,6>
+  1638318920U,	// <7,1,1,3>: Cost 2 vext3 RHS, <1,1,3,3>
+  2712060755U,	// <7,1,1,4>: Cost 3 vext3 RHS, <1,1,4,5>
+  2691416926U,	// <7,1,1,5>: Cost 3 vext3 <1,1,5,7>, <1,1,5,7>
+  2590700375U,	// <7,1,1,6>: Cost 3 vext1 <6,7,1,1>, <6,7,1,1>
+  3765158766U,	// <7,1,1,7>: Cost 4 vext3 <1,1,5,7>, <1,1,7,5>
+  1638318965U,	// <7,1,1,u>: Cost 2 vext3 RHS, <1,1,u,3>
+  2712060796U,	// <7,1,2,0>: Cost 3 vext3 RHS, <1,2,0,1>
+  2712060807U,	// <7,1,2,1>: Cost 3 vext3 RHS, <1,2,1,3>
+  3712747112U,	// <7,1,2,2>: Cost 4 vext2 <3,5,7,1>, <2,2,2,2>
+  1638318998U,	// <7,1,2,3>: Cost 2 vext3 RHS, <1,2,3,0>
+  2712060836U,	// <7,1,2,4>: Cost 3 vext3 RHS, <1,2,4,5>
+  2712060843U,	// <7,1,2,5>: Cost 3 vext3 RHS, <1,2,5,3>
+  2590708568U,	// <7,1,2,6>: Cost 3 vext1 <6,7,1,2>, <6,7,1,2>
+  2735948730U,	// <7,1,2,7>: Cost 3 vext3 RHS, <1,2,7,0>
+  1638319043U,	// <7,1,2,u>: Cost 2 vext3 RHS, <1,2,u,0>
+  2712060876U,	// <7,1,3,0>: Cost 3 vext3 RHS, <1,3,0,0>
+  1638319064U,	// <7,1,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2712060894U,	// <7,1,3,2>: Cost 3 vext3 RHS, <1,3,2,0>
+  2692596718U,	// <7,1,3,3>: Cost 3 vext3 <1,3,3,7>, <1,3,3,7>
+  2712060917U,	// <7,1,3,4>: Cost 3 vext3 RHS, <1,3,4,5>
+  1619002368U,	// <7,1,3,5>: Cost 2 vext3 <1,3,5,7>, <1,3,5,7>
+  2692817929U,	// <7,1,3,6>: Cost 3 vext3 <1,3,6,7>, <1,3,6,7>
+  2735948814U,	// <7,1,3,7>: Cost 3 vext3 RHS, <1,3,7,3>
+  1619223579U,	// <7,1,3,u>: Cost 2 vext3 <1,3,u,7>, <1,3,u,7>
+  2712060962U,	// <7,1,4,0>: Cost 3 vext3 RHS, <1,4,0,5>
+  2712060971U,	// <7,1,4,1>: Cost 3 vext3 RHS, <1,4,1,5>
+  2712060980U,	// <7,1,4,2>: Cost 3 vext3 RHS, <1,4,2,5>
+  2712060989U,	// <7,1,4,3>: Cost 3 vext3 RHS, <1,4,3,5>
+  3785802822U,	// <7,1,4,4>: Cost 4 vext3 RHS, <1,4,4,5>
+  2639007030U,	// <7,1,4,5>: Cost 3 vext2 <3,5,7,1>, RHS
+  2645642634U,	// <7,1,4,6>: Cost 3 vext2 <4,6,7,1>, <4,6,7,1>
+  3719384520U,	// <7,1,4,7>: Cost 4 vext2 <4,6,7,1>, <4,7,5,0>
+  2639007273U,	// <7,1,4,u>: Cost 3 vext2 <3,5,7,1>, RHS
+  2572812390U,	// <7,1,5,0>: Cost 3 vext1 <3,7,1,5>, LHS
+  2693776510U,	// <7,1,5,1>: Cost 3 vext3 <1,5,1,7>, <1,5,1,7>
+  3774301318U,	// <7,1,5,2>: Cost 4 vext3 <2,6,3,7>, <1,5,2,6>
+  1620182160U,	// <7,1,5,3>: Cost 2 vext3 <1,5,3,7>, <1,5,3,7>
+  2572815670U,	// <7,1,5,4>: Cost 3 vext1 <3,7,1,5>, RHS
+  3766486178U,	// <7,1,5,5>: Cost 4 vext3 <1,3,5,7>, <1,5,5,7>
+  2651615331U,	// <7,1,5,6>: Cost 3 vext2 <5,6,7,1>, <5,6,7,1>
+  2652278964U,	// <7,1,5,7>: Cost 3 vext2 <5,7,7,1>, <5,7,7,1>
+  1620550845U,	// <7,1,5,u>: Cost 2 vext3 <1,5,u,7>, <1,5,u,7>
+  3768108230U,	// <7,1,6,0>: Cost 4 vext3 <1,6,0,7>, <1,6,0,7>
+  2694440143U,	// <7,1,6,1>: Cost 3 vext3 <1,6,1,7>, <1,6,1,7>
+  2712061144U,	// <7,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2694587617U,	// <7,1,6,3>: Cost 3 vext3 <1,6,3,7>, <1,6,3,7>
+  3768403178U,	// <7,1,6,4>: Cost 4 vext3 <1,6,4,7>, <1,6,4,7>
+  2694735091U,	// <7,1,6,5>: Cost 3 vext3 <1,6,5,7>, <1,6,5,7>
+  3768550652U,	// <7,1,6,6>: Cost 4 vext3 <1,6,6,7>, <1,6,6,7>
+  2652279630U,	// <7,1,6,7>: Cost 3 vext2 <5,7,7,1>, <6,7,0,1>
+  2694956302U,	// <7,1,6,u>: Cost 3 vext3 <1,6,u,7>, <1,6,u,7>
+  2645644282U,	// <7,1,7,0>: Cost 3 vext2 <4,6,7,1>, <7,0,1,2>
+  2859062094U,	// <7,1,7,1>: Cost 3 vuzpr <6,7,0,1>, <6,7,0,1>
+  3779462437U,	// <7,1,7,2>: Cost 4 vext3 <3,5,1,7>, <1,7,2,3>
+  3121938534U,	// <7,1,7,3>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2554916150U,	// <7,1,7,4>: Cost 3 vext1 <0,7,1,7>, RHS
+  3769140548U,	// <7,1,7,5>: Cost 4 vext3 <1,7,5,7>, <1,7,5,7>
+  3726022164U,	// <7,1,7,6>: Cost 4 vext2 <5,7,7,1>, <7,6,7,0>
+  2554918508U,	// <7,1,7,7>: Cost 3 vext1 <0,7,1,7>, <7,7,7,7>
+  3121938539U,	// <7,1,7,u>: Cost 3 vtrnr <5,7,5,7>, LHS
+  2572836966U,	// <7,1,u,0>: Cost 3 vext1 <3,7,1,u>, LHS
+  1638319469U,	// <7,1,u,1>: Cost 2 vext3 RHS, <1,u,1,3>
+  2712061299U,	// <7,1,u,2>: Cost 3 vext3 RHS, <1,u,2,0>
+  1622173059U,	// <7,1,u,3>: Cost 2 vext3 <1,u,3,7>, <1,u,3,7>
+  2572840246U,	// <7,1,u,4>: Cost 3 vext1 <3,7,1,u>, RHS
+  1622320533U,	// <7,1,u,5>: Cost 2 vext3 <1,u,5,7>, <1,u,5,7>
+  2696136094U,	// <7,1,u,6>: Cost 3 vext3 <1,u,6,7>, <1,u,6,7>
+  2859060777U,	// <7,1,u,7>: Cost 3 vuzpr <6,7,0,1>, RHS
+  1622541744U,	// <7,1,u,u>: Cost 2 vext3 <1,u,u,7>, <1,u,u,7>
+  2712061364U,	// <7,2,0,0>: Cost 3 vext3 RHS, <2,0,0,2>
+  2712061373U,	// <7,2,0,1>: Cost 3 vext3 RHS, <2,0,1,2>
+  2712061380U,	// <7,2,0,2>: Cost 3 vext3 RHS, <2,0,2,0>
+  2712061389U,	// <7,2,0,3>: Cost 3 vext3 RHS, <2,0,3,0>
+  2712061404U,	// <7,2,0,4>: Cost 3 vext3 RHS, <2,0,4,6>
+  2696725990U,	// <7,2,0,5>: Cost 3 vext3 <2,0,5,7>, <2,0,5,7>
+  2712061417U,	// <7,2,0,6>: Cost 3 vext3 RHS, <2,0,6,1>
+  3785803251U,	// <7,2,0,7>: Cost 4 vext3 RHS, <2,0,7,2>
+  2696947201U,	// <7,2,0,u>: Cost 3 vext3 <2,0,u,7>, <2,0,u,7>
+  2712061446U,	// <7,2,1,0>: Cost 3 vext3 RHS, <2,1,0,3>
+  3785803276U,	// <7,2,1,1>: Cost 4 vext3 RHS, <2,1,1,0>
+  3785803285U,	// <7,2,1,2>: Cost 4 vext3 RHS, <2,1,2,0>
+  2712061471U,	// <7,2,1,3>: Cost 3 vext3 RHS, <2,1,3,1>
+  2712061482U,	// <7,2,1,4>: Cost 3 vext3 RHS, <2,1,4,3>
+  3766486576U,	// <7,2,1,5>: Cost 4 vext3 <1,3,5,7>, <2,1,5,0>
+  2712061500U,	// <7,2,1,6>: Cost 3 vext3 RHS, <2,1,6,3>
+  2602718850U,	// <7,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  2712061516U,	// <7,2,1,u>: Cost 3 vext3 RHS, <2,1,u,1>
+  2712061525U,	// <7,2,2,0>: Cost 3 vext3 RHS, <2,2,0,1>
+  2712061536U,	// <7,2,2,1>: Cost 3 vext3 RHS, <2,2,1,3>
+  1638319720U,	// <7,2,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638319730U,	// <7,2,2,3>: Cost 2 vext3 RHS, <2,2,3,3>
+  2712061565U,	// <7,2,2,4>: Cost 3 vext3 RHS, <2,2,4,5>
+  2698053256U,	// <7,2,2,5>: Cost 3 vext3 <2,2,5,7>, <2,2,5,7>
+  2712061584U,	// <7,2,2,6>: Cost 3 vext3 RHS, <2,2,6,6>
+  3771795096U,	// <7,2,2,7>: Cost 4 vext3 <2,2,5,7>, <2,2,7,5>
+  1638319775U,	// <7,2,2,u>: Cost 2 vext3 RHS, <2,2,u,3>
+  1638319782U,	// <7,2,3,0>: Cost 2 vext3 RHS, <2,3,0,1>
+  2693924531U,	// <7,2,3,1>: Cost 3 vext3 <1,5,3,7>, <2,3,1,5>
+  2700560061U,	// <7,2,3,2>: Cost 3 vext3 <2,6,3,7>, <2,3,2,6>
+  2693924551U,	// <7,2,3,3>: Cost 3 vext3 <1,5,3,7>, <2,3,3,7>
+  1638319822U,	// <7,2,3,4>: Cost 2 vext3 RHS, <2,3,4,5>
+  2698716889U,	// <7,2,3,5>: Cost 3 vext3 <2,3,5,7>, <2,3,5,7>
+  2712061665U,	// <7,2,3,6>: Cost 3 vext3 RHS, <2,3,6,6>
+  2735949540U,	// <7,2,3,7>: Cost 3 vext3 RHS, <2,3,7,0>
+  1638319854U,	// <7,2,3,u>: Cost 2 vext3 RHS, <2,3,u,1>
+  2712061692U,	// <7,2,4,0>: Cost 3 vext3 RHS, <2,4,0,6>
+  2712061698U,	// <7,2,4,1>: Cost 3 vext3 RHS, <2,4,1,3>
+  2712061708U,	// <7,2,4,2>: Cost 3 vext3 RHS, <2,4,2,4>
+  2712061718U,	// <7,2,4,3>: Cost 3 vext3 RHS, <2,4,3,5>
+  2712061728U,	// <7,2,4,4>: Cost 3 vext3 RHS, <2,4,4,6>
+  2699380522U,	// <7,2,4,5>: Cost 3 vext3 <2,4,5,7>, <2,4,5,7>
+  2712061740U,	// <7,2,4,6>: Cost 3 vext3 RHS, <2,4,6,0>
+  3809691445U,	// <7,2,4,7>: Cost 4 vext3 RHS, <2,4,7,0>
+  2699601733U,	// <7,2,4,u>: Cost 3 vext3 <2,4,u,7>, <2,4,u,7>
+  2699675470U,	// <7,2,5,0>: Cost 3 vext3 <2,5,0,7>, <2,5,0,7>
+  3766486867U,	// <7,2,5,1>: Cost 4 vext3 <1,3,5,7>, <2,5,1,3>
+  2699822944U,	// <7,2,5,2>: Cost 3 vext3 <2,5,2,7>, <2,5,2,7>
+  2692745065U,	// <7,2,5,3>: Cost 3 vext3 <1,3,5,7>, <2,5,3,7>
+  2699970418U,	// <7,2,5,4>: Cost 3 vext3 <2,5,4,7>, <2,5,4,7>
+  3766486907U,	// <7,2,5,5>: Cost 4 vext3 <1,3,5,7>, <2,5,5,7>
+  2700117892U,	// <7,2,5,6>: Cost 3 vext3 <2,5,6,7>, <2,5,6,7>
+  3771795334U,	// <7,2,5,7>: Cost 4 vext3 <2,2,5,7>, <2,5,7,0>
+  2692745110U,	// <7,2,5,u>: Cost 3 vext3 <1,3,5,7>, <2,5,u,7>
+  2572894310U,	// <7,2,6,0>: Cost 3 vext1 <3,7,2,6>, LHS
+  2712061860U,	// <7,2,6,1>: Cost 3 vext3 RHS, <2,6,1,3>
+  2700486577U,	// <7,2,6,2>: Cost 3 vext3 <2,6,2,7>, <2,6,2,7>
+  1626818490U,	// <7,2,6,3>: Cost 2 vext3 <2,6,3,7>, <2,6,3,7>
+  2572897590U,	// <7,2,6,4>: Cost 3 vext1 <3,7,2,6>, RHS
+  2700707788U,	// <7,2,6,5>: Cost 3 vext3 <2,6,5,7>, <2,6,5,7>
+  2700781525U,	// <7,2,6,6>: Cost 3 vext3 <2,6,6,7>, <2,6,6,7>
+  3774597086U,	// <7,2,6,7>: Cost 4 vext3 <2,6,7,7>, <2,6,7,7>
+  1627187175U,	// <7,2,6,u>: Cost 2 vext3 <2,6,u,7>, <2,6,u,7>
+  2735949802U,	// <7,2,7,0>: Cost 3 vext3 RHS, <2,7,0,1>
+  3780200434U,	// <7,2,7,1>: Cost 4 vext3 <3,6,2,7>, <2,7,1,0>
+  3773564928U,	// <7,2,7,2>: Cost 4 vext3 <2,5,2,7>, <2,7,2,5>
+  2986541158U,	// <7,2,7,3>: Cost 3 vzipr <5,5,7,7>, LHS
+  2554989878U,	// <7,2,7,4>: Cost 3 vext1 <0,7,2,7>, RHS
+  3775113245U,	// <7,2,7,5>: Cost 4 vext3 <2,7,5,7>, <2,7,5,7>
+  4060283228U,	// <7,2,7,6>: Cost 4 vzipr <5,5,7,7>, <0,4,2,6>
+  2554992236U,	// <7,2,7,7>: Cost 3 vext1 <0,7,2,7>, <7,7,7,7>
+  2986541163U,	// <7,2,7,u>: Cost 3 vzipr <5,5,7,7>, LHS
+  1638320187U,	// <7,2,u,0>: Cost 2 vext3 RHS, <2,u,0,1>
+  2693924936U,	// <7,2,u,1>: Cost 3 vext3 <1,5,3,7>, <2,u,1,5>
+  1638319720U,	// <7,2,u,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1628145756U,	// <7,2,u,3>: Cost 2 vext3 <2,u,3,7>, <2,u,3,7>
+  1638320227U,	// <7,2,u,4>: Cost 2 vext3 RHS, <2,u,4,5>
+  2702035054U,	// <7,2,u,5>: Cost 3 vext3 <2,u,5,7>, <2,u,5,7>
+  2702108791U,	// <7,2,u,6>: Cost 3 vext3 <2,u,6,7>, <2,u,6,7>
+  2735949945U,	// <7,2,u,7>: Cost 3 vext3 RHS, <2,u,7,0>
+  1628514441U,	// <7,2,u,u>: Cost 2 vext3 <2,u,u,7>, <2,u,u,7>
+  2712062091U,	// <7,3,0,0>: Cost 3 vext3 RHS, <3,0,0,0>
+  1638320278U,	// <7,3,0,1>: Cost 2 vext3 RHS, <3,0,1,2>
+  2712062109U,	// <7,3,0,2>: Cost 3 vext3 RHS, <3,0,2,0>
+  2590836886U,	// <7,3,0,3>: Cost 3 vext1 <6,7,3,0>, <3,0,1,2>
+  2712062128U,	// <7,3,0,4>: Cost 3 vext3 RHS, <3,0,4,1>
+  2712062138U,	// <7,3,0,5>: Cost 3 vext3 RHS, <3,0,5,2>
+  2590839656U,	// <7,3,0,6>: Cost 3 vext1 <6,7,3,0>, <6,7,3,0>
+  3311414017U,	// <7,3,0,7>: Cost 4 vrev <3,7,7,0>
+  1638320341U,	// <7,3,0,u>: Cost 2 vext3 RHS, <3,0,u,2>
+  2237164227U,	// <7,3,1,0>: Cost 3 vrev <3,7,0,1>
+  2712062182U,	// <7,3,1,1>: Cost 3 vext3 RHS, <3,1,1,1>
+  2712062193U,	// <7,3,1,2>: Cost 3 vext3 RHS, <3,1,2,3>
+  2692745468U,	// <7,3,1,3>: Cost 3 vext3 <1,3,5,7>, <3,1,3,5>
+  2712062214U,	// <7,3,1,4>: Cost 3 vext3 RHS, <3,1,4,6>
+  2693925132U,	// <7,3,1,5>: Cost 3 vext3 <1,5,3,7>, <3,1,5,3>
+  3768183059U,	// <7,3,1,6>: Cost 4 vext3 <1,6,1,7>, <3,1,6,1>
+  2692745504U,	// <7,3,1,7>: Cost 3 vext3 <1,3,5,7>, <3,1,7,5>
+  2696063273U,	// <7,3,1,u>: Cost 3 vext3 <1,u,5,7>, <3,1,u,5>
+  2712062254U,	// <7,3,2,0>: Cost 3 vext3 RHS, <3,2,0,1>
+  2712062262U,	// <7,3,2,1>: Cost 3 vext3 RHS, <3,2,1,0>
+  2712062273U,	// <7,3,2,2>: Cost 3 vext3 RHS, <3,2,2,2>
+  2712062280U,	// <7,3,2,3>: Cost 3 vext3 RHS, <3,2,3,0>
+  2712062294U,	// <7,3,2,4>: Cost 3 vext3 RHS, <3,2,4,5>
+  2712062302U,	// <7,3,2,5>: Cost 3 vext3 RHS, <3,2,5,4>
+  2700560742U,	// <7,3,2,6>: Cost 3 vext3 <2,6,3,7>, <3,2,6,3>
+  2712062319U,	// <7,3,2,7>: Cost 3 vext3 RHS, <3,2,7,3>
+  2712062325U,	// <7,3,2,u>: Cost 3 vext3 RHS, <3,2,u,0>
+  2712062335U,	// <7,3,3,0>: Cost 3 vext3 RHS, <3,3,0,1>
+  2636368158U,	// <7,3,3,1>: Cost 3 vext2 <3,1,7,3>, <3,1,7,3>
+  2637031791U,	// <7,3,3,2>: Cost 3 vext2 <3,2,7,3>, <3,2,7,3>
+  1638320540U,	// <7,3,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062374U,	// <7,3,3,4>: Cost 3 vext3 RHS, <3,3,4,4>
+  2704689586U,	// <7,3,3,5>: Cost 3 vext3 <3,3,5,7>, <3,3,5,7>
+  2590864235U,	// <7,3,3,6>: Cost 3 vext1 <6,7,3,3>, <6,7,3,3>
+  2704837060U,	// <7,3,3,7>: Cost 3 vext3 <3,3,7,7>, <3,3,7,7>
+  1638320540U,	// <7,3,3,u>: Cost 2 vext3 RHS, <3,3,3,3>
+  2712062416U,	// <7,3,4,0>: Cost 3 vext3 RHS, <3,4,0,1>
+  2712062426U,	// <7,3,4,1>: Cost 3 vext3 RHS, <3,4,1,2>
+  2566981640U,	// <7,3,4,2>: Cost 3 vext1 <2,7,3,4>, <2,7,3,4>
+  2712062447U,	// <7,3,4,3>: Cost 3 vext3 RHS, <3,4,3,5>
+  2712062456U,	// <7,3,4,4>: Cost 3 vext3 RHS, <3,4,4,5>
+  1638320642U,	// <7,3,4,5>: Cost 2 vext3 RHS, <3,4,5,6>
+  2648313204U,	// <7,3,4,6>: Cost 3 vext2 <5,1,7,3>, <4,6,4,6>
+  3311446789U,	// <7,3,4,7>: Cost 4 vrev <3,7,7,4>
+  1638320669U,	// <7,3,4,u>: Cost 2 vext3 RHS, <3,4,u,6>
+  2602819686U,	// <7,3,5,0>: Cost 3 vext1 <u,7,3,5>, LHS
+  1574571728U,	// <7,3,5,1>: Cost 2 vext2 <5,1,7,3>, <5,1,7,3>
+  2648977185U,	// <7,3,5,2>: Cost 3 vext2 <5,2,7,3>, <5,2,7,3>
+  2705869378U,	// <7,3,5,3>: Cost 3 vext3 <3,5,3,7>, <3,5,3,7>
+  2237491947U,	// <7,3,5,4>: Cost 3 vrev <3,7,4,5>
+  2706016852U,	// <7,3,5,5>: Cost 3 vext3 <3,5,5,7>, <3,5,5,7>
+  2648313954U,	// <7,3,5,6>: Cost 3 vext2 <5,1,7,3>, <5,6,7,0>
+  2692745823U,	// <7,3,5,7>: Cost 3 vext3 <1,3,5,7>, <3,5,7,0>
+  1579217159U,	// <7,3,5,u>: Cost 2 vext2 <5,u,7,3>, <5,u,7,3>
+  2706311800U,	// <7,3,6,0>: Cost 3 vext3 <3,6,0,7>, <3,6,0,7>
+  2654286249U,	// <7,3,6,1>: Cost 3 vext2 <6,1,7,3>, <6,1,7,3>
+  1581208058U,	// <7,3,6,2>: Cost 2 vext2 <6,2,7,3>, <6,2,7,3>
+  2706533011U,	// <7,3,6,3>: Cost 3 vext3 <3,6,3,7>, <3,6,3,7>
+  2706606748U,	// <7,3,6,4>: Cost 3 vext3 <3,6,4,7>, <3,6,4,7>
+  3780422309U,	// <7,3,6,5>: Cost 4 vext3 <3,6,5,7>, <3,6,5,7>
+  2712062637U,	// <7,3,6,6>: Cost 3 vext3 RHS, <3,6,6,6>
+  2706827959U,	// <7,3,6,7>: Cost 3 vext3 <3,6,7,7>, <3,6,7,7>
+  1585189856U,	// <7,3,6,u>: Cost 2 vext2 <6,u,7,3>, <6,u,7,3>
+  2693925571U,	// <7,3,7,0>: Cost 3 vext3 <1,5,3,7>, <3,7,0,1>
+  2693925584U,	// <7,3,7,1>: Cost 3 vext3 <1,5,3,7>, <3,7,1,5>
+  2700561114U,	// <7,3,7,2>: Cost 3 vext3 <2,6,3,7>, <3,7,2,6>
+  2572978916U,	// <7,3,7,3>: Cost 3 vext1 <3,7,3,7>, <3,7,3,7>
+  2693925611U,	// <7,3,7,4>: Cost 3 vext3 <1,5,3,7>, <3,7,4,5>
+  2707344118U,	// <7,3,7,5>: Cost 3 vext3 <3,7,5,7>, <3,7,5,7>
+  2654950894U,	// <7,3,7,6>: Cost 3 vext2 <6,2,7,3>, <7,6,2,7>
+  2648315500U,	// <7,3,7,7>: Cost 3 vext2 <5,1,7,3>, <7,7,7,7>
+  2693925643U,	// <7,3,7,u>: Cost 3 vext3 <1,5,3,7>, <3,7,u,1>
+  2237221578U,	// <7,3,u,0>: Cost 3 vrev <3,7,0,u>
+  1638320926U,	// <7,3,u,1>: Cost 2 vext3 RHS, <3,u,1,2>
+  1593153452U,	// <7,3,u,2>: Cost 2 vext2 <u,2,7,3>, <u,2,7,3>
+  1638320540U,	// <7,3,u,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  2237516526U,	// <7,3,u,4>: Cost 3 vrev <3,7,4,u>
+  1638320966U,	// <7,3,u,5>: Cost 2 vext3 RHS, <3,u,5,6>
+  2712062796U,	// <7,3,u,6>: Cost 3 vext3 RHS, <3,u,6,3>
+  2692967250U,	// <7,3,u,7>: Cost 3 vext3 <1,3,u,7>, <3,u,7,0>
+  1638320989U,	// <7,3,u,u>: Cost 2 vext3 RHS, <3,u,u,2>
+  2651635712U,	// <7,4,0,0>: Cost 3 vext2 <5,6,7,4>, <0,0,0,0>
+  1577893990U,	// <7,4,0,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2651635876U,	// <7,4,0,2>: Cost 3 vext2 <5,6,7,4>, <0,2,0,2>
+  3785804672U,	// <7,4,0,3>: Cost 4 vext3 RHS, <4,0,3,1>
+  2651636050U,	// <7,4,0,4>: Cost 3 vext2 <5,6,7,4>, <0,4,1,5>
+  1638468498U,	// <7,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638468508U,	// <7,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787795364U,	// <7,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1640459181U,	// <7,4,0,u>: Cost 2 vext3 RHS, <4,0,u,1>
+  2651636470U,	// <7,4,1,0>: Cost 3 vext2 <5,6,7,4>, <1,0,3,2>
+  2651636532U,	// <7,4,1,1>: Cost 3 vext2 <5,6,7,4>, <1,1,1,1>
+  2712062922U,	// <7,4,1,2>: Cost 3 vext3 RHS, <4,1,2,3>
+  2639029248U,	// <7,4,1,3>: Cost 3 vext2 <3,5,7,4>, <1,3,5,7>
+  2712062940U,	// <7,4,1,4>: Cost 3 vext3 RHS, <4,1,4,3>
+  2712062946U,	// <7,4,1,5>: Cost 3 vext3 RHS, <4,1,5,0>
+  2712062958U,	// <7,4,1,6>: Cost 3 vext3 RHS, <4,1,6,3>
+  3785804791U,	// <7,4,1,7>: Cost 4 vext3 RHS, <4,1,7,3>
+  2712062973U,	// <7,4,1,u>: Cost 3 vext3 RHS, <4,1,u,0>
+  3785804807U,	// <7,4,2,0>: Cost 4 vext3 RHS, <4,2,0,1>
+  3785804818U,	// <7,4,2,1>: Cost 4 vext3 RHS, <4,2,1,3>
+  2651637352U,	// <7,4,2,2>: Cost 3 vext2 <5,6,7,4>, <2,2,2,2>
+  2651637414U,	// <7,4,2,3>: Cost 3 vext2 <5,6,7,4>, <2,3,0,1>
+  3716753194U,	// <7,4,2,4>: Cost 4 vext2 <4,2,7,4>, <2,4,5,7>
+  2712063030U,	// <7,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  2712063036U,	// <7,4,2,6>: Cost 3 vext3 RHS, <4,2,6,0>
+  3773123658U,	// <7,4,2,7>: Cost 4 vext3 <2,4,5,7>, <4,2,7,5>
+  2712063054U,	// <7,4,2,u>: Cost 3 vext3 RHS, <4,2,u,0>
+  2651637910U,	// <7,4,3,0>: Cost 3 vext2 <5,6,7,4>, <3,0,1,2>
+  3712772348U,	// <7,4,3,1>: Cost 4 vext2 <3,5,7,4>, <3,1,3,5>
+  3785804906U,	// <7,4,3,2>: Cost 4 vext3 RHS, <4,3,2,1>
+  2651638172U,	// <7,4,3,3>: Cost 3 vext2 <5,6,7,4>, <3,3,3,3>
+  2651638274U,	// <7,4,3,4>: Cost 3 vext2 <5,6,7,4>, <3,4,5,6>
+  2639030883U,	// <7,4,3,5>: Cost 3 vext2 <3,5,7,4>, <3,5,7,4>
+  2712063122U,	// <7,4,3,6>: Cost 3 vext3 RHS, <4,3,6,5>
+  3712772836U,	// <7,4,3,7>: Cost 4 vext2 <3,5,7,4>, <3,7,3,7>
+  2641021782U,	// <7,4,3,u>: Cost 3 vext2 <3,u,7,4>, <3,u,7,4>
+  2714053802U,	// <7,4,4,0>: Cost 3 vext3 RHS, <4,4,0,2>
+  3785804978U,	// <7,4,4,1>: Cost 4 vext3 RHS, <4,4,1,1>
+  3716754505U,	// <7,4,4,2>: Cost 4 vext2 <4,2,7,4>, <4,2,7,4>
+  3785804998U,	// <7,4,4,3>: Cost 4 vext3 RHS, <4,4,3,3>
+  1638321360U,	// <7,4,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638468826U,	// <7,4,4,5>: Cost 2 vext3 RHS, <4,4,5,5>
+  1638468836U,	// <7,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  3785215214U,	// <7,4,4,7>: Cost 4 vext3 <4,4,7,7>, <4,4,7,7>
+  1640459509U,	// <7,4,4,u>: Cost 2 vext3 RHS, <4,4,u,5>
+  1517207654U,	// <7,4,5,0>: Cost 2 vext1 <6,7,4,5>, LHS
+  2573034640U,	// <7,4,5,1>: Cost 3 vext1 <3,7,4,5>, <1,5,3,7>
+  2712063246U,	// <7,4,5,2>: Cost 3 vext3 RHS, <4,5,2,3>
+  2573036267U,	// <7,4,5,3>: Cost 3 vext1 <3,7,4,5>, <3,7,4,5>
+  1517210934U,	// <7,4,5,4>: Cost 2 vext1 <6,7,4,5>, RHS
+  2711989549U,	// <7,4,5,5>: Cost 3 vext3 <4,5,5,7>, <4,5,5,7>
+  564579638U,	// <7,4,5,6>: Cost 1 vext3 RHS, RHS
+  2651639976U,	// <7,4,5,7>: Cost 3 vext2 <5,6,7,4>, <5,7,5,7>
+  564579656U,	// <7,4,5,u>: Cost 1 vext3 RHS, RHS
+  2712063307U,	// <7,4,6,0>: Cost 3 vext3 RHS, <4,6,0,1>
+  3767668056U,	// <7,4,6,1>: Cost 4 vext3 <1,5,3,7>, <4,6,1,5>
+  2651640314U,	// <7,4,6,2>: Cost 3 vext2 <5,6,7,4>, <6,2,7,3>
+  2655621708U,	// <7,4,6,3>: Cost 3 vext2 <6,3,7,4>, <6,3,7,4>
+  1638468980U,	// <7,4,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712063358U,	// <7,4,6,5>: Cost 3 vext3 RHS, <4,6,5,7>
+  2712063367U,	// <7,4,6,6>: Cost 3 vext3 RHS, <4,6,6,7>
+  2712210826U,	// <7,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1638469012U,	// <7,4,6,u>: Cost 2 vext3 RHS, <4,6,u,2>
+  2651640826U,	// <7,4,7,0>: Cost 3 vext2 <5,6,7,4>, <7,0,1,2>
+  3773713830U,	// <7,4,7,1>: Cost 4 vext3 <2,5,4,7>, <4,7,1,2>
+  3773713842U,	// <7,4,7,2>: Cost 4 vext3 <2,5,4,7>, <4,7,2,5>
+  3780349372U,	// <7,4,7,3>: Cost 4 vext3 <3,6,4,7>, <4,7,3,6>
+  2651641140U,	// <7,4,7,4>: Cost 3 vext2 <5,6,7,4>, <7,4,0,1>
+  2712210888U,	// <7,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  2712210898U,	// <7,4,7,6>: Cost 3 vext3 RHS, <4,7,6,1>
+  2651641452U,	// <7,4,7,7>: Cost 3 vext2 <5,6,7,4>, <7,7,7,7>
+  2713538026U,	// <7,4,7,u>: Cost 3 vext3 <4,7,u,7>, <4,7,u,7>
+  1517232230U,	// <7,4,u,0>: Cost 2 vext1 <6,7,4,u>, LHS
+  1577899822U,	// <7,4,u,1>: Cost 2 vext2 <5,6,7,4>, LHS
+  2712063489U,	// <7,4,u,2>: Cost 3 vext3 RHS, <4,u,2,3>
+  2573060846U,	// <7,4,u,3>: Cost 3 vext1 <3,7,4,u>, <3,7,4,u>
+  1640312342U,	// <7,4,u,4>: Cost 2 vext3 RHS, <4,u,4,6>
+  1638469146U,	// <7,4,u,5>: Cost 2 vext3 RHS, <4,u,5,1>
+  564579881U,	// <7,4,u,6>: Cost 1 vext3 RHS, RHS
+  2714054192U,	// <7,4,u,7>: Cost 3 vext3 RHS, <4,u,7,5>
+  564579899U,	// <7,4,u,u>: Cost 1 vext3 RHS, RHS
+  2579038310U,	// <7,5,0,0>: Cost 3 vext1 <4,7,5,0>, LHS
+  2636382310U,	// <7,5,0,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2796339302U,	// <7,5,0,2>: Cost 3 vuzpl <7,4,5,6>, LHS
+  3646810719U,	// <7,5,0,3>: Cost 4 vext1 <3,7,5,0>, <3,5,7,0>
+  2712063586U,	// <7,5,0,4>: Cost 3 vext3 RHS, <5,0,4,1>
+  2735951467U,	// <7,5,0,5>: Cost 3 vext3 RHS, <5,0,5,1>
+  2735951476U,	// <7,5,0,6>: Cost 3 vext3 RHS, <5,0,6,1>
+  2579043322U,	// <7,5,0,7>: Cost 3 vext1 <4,7,5,0>, <7,0,1,2>
+  2636382877U,	// <7,5,0,u>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211087U,	// <7,5,1,0>: Cost 3 vext3 RHS, <5,1,0,1>
+  3698180916U,	// <7,5,1,1>: Cost 4 vext2 <1,1,7,5>, <1,1,1,1>
+  3710124950U,	// <7,5,1,2>: Cost 4 vext2 <3,1,7,5>, <1,2,3,0>
+  2636383232U,	// <7,5,1,3>: Cost 3 vext2 <3,1,7,5>, <1,3,5,7>
+  2712211127U,	// <7,5,1,4>: Cost 3 vext3 RHS, <5,1,4,5>
+  2590994128U,	// <7,5,1,5>: Cost 3 vext1 <6,7,5,1>, <5,1,7,3>
+  2590995323U,	// <7,5,1,6>: Cost 3 vext1 <6,7,5,1>, <6,7,5,1>
+  1638469328U,	// <7,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638469337U,	// <7,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  3785805536U,	// <7,5,2,0>: Cost 4 vext3 RHS, <5,2,0,1>
+  3785805544U,	// <7,5,2,1>: Cost 4 vext3 RHS, <5,2,1,0>
+  3704817288U,	// <7,5,2,2>: Cost 4 vext2 <2,2,7,5>, <2,2,5,7>
+  2712063742U,	// <7,5,2,3>: Cost 3 vext3 RHS, <5,2,3,4>
+  3716761386U,	// <7,5,2,4>: Cost 4 vext2 <4,2,7,5>, <2,4,5,7>
+  2714054415U,	// <7,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  3774304024U,	// <7,5,2,6>: Cost 4 vext3 <2,6,3,7>, <5,2,6,3>
+  2712063777U,	// <7,5,2,7>: Cost 3 vext3 RHS, <5,2,7,3>
+  2712063787U,	// <7,5,2,u>: Cost 3 vext3 RHS, <5,2,u,4>
+  3634888806U,	// <7,5,3,0>: Cost 4 vext1 <1,7,5,3>, LHS
+  2636384544U,	// <7,5,3,1>: Cost 3 vext2 <3,1,7,5>, <3,1,7,5>
+  3710790001U,	// <7,5,3,2>: Cost 4 vext2 <3,2,7,5>, <3,2,7,5>
+  3710126492U,	// <7,5,3,3>: Cost 4 vext2 <3,1,7,5>, <3,3,3,3>
+  3634892086U,	// <7,5,3,4>: Cost 4 vext1 <1,7,5,3>, RHS
+  2639039076U,	// <7,5,3,5>: Cost 3 vext2 <3,5,7,5>, <3,5,7,5>
+  3713444533U,	// <7,5,3,6>: Cost 4 vext2 <3,6,7,5>, <3,6,7,5>
+  2693926767U,	// <7,5,3,7>: Cost 3 vext3 <1,5,3,7>, <5,3,7,0>
+  2712063864U,	// <7,5,3,u>: Cost 3 vext3 RHS, <5,3,u,0>
+  2579071078U,	// <7,5,4,0>: Cost 3 vext1 <4,7,5,4>, LHS
+  3646841856U,	// <7,5,4,1>: Cost 4 vext1 <3,7,5,4>, <1,3,5,7>
+  3716762698U,	// <7,5,4,2>: Cost 4 vext2 <4,2,7,5>, <4,2,7,5>
+  3646843491U,	// <7,5,4,3>: Cost 4 vext1 <3,7,5,4>, <3,5,7,4>
+  2579074358U,	// <7,5,4,4>: Cost 3 vext1 <4,7,5,4>, RHS
+  2636385590U,	// <7,5,4,5>: Cost 3 vext2 <3,1,7,5>, RHS
+  2645675406U,	// <7,5,4,6>: Cost 3 vext2 <4,6,7,5>, <4,6,7,5>
+  1638322118U,	// <7,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1638469583U,	// <7,5,4,u>: Cost 2 vext3 RHS, <5,4,u,6>
+  2714054611U,	// <7,5,5,0>: Cost 3 vext3 RHS, <5,5,0,1>
+  2652974800U,	// <7,5,5,1>: Cost 3 vext2 <5,u,7,5>, <5,1,7,3>
+  3710127905U,	// <7,5,5,2>: Cost 4 vext2 <3,1,7,5>, <5,2,7,3>
+  3785805808U,	// <7,5,5,3>: Cost 4 vext3 RHS, <5,5,3,3>
+  2712211450U,	// <7,5,5,4>: Cost 3 vext3 RHS, <5,5,4,4>
+  1638322180U,	// <7,5,5,5>: Cost 2 vext3 RHS, <5,5,5,5>
+  2712064014U,	// <7,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638469656U,	// <7,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  1638469665U,	// <7,5,5,u>: Cost 2 vext3 RHS, <5,5,u,7>
+  2712064036U,	// <7,5,6,0>: Cost 3 vext3 RHS, <5,6,0,1>
+  2714054707U,	// <7,5,6,1>: Cost 3 vext3 RHS, <5,6,1,7>
+  3785805879U,	// <7,5,6,2>: Cost 4 vext3 RHS, <5,6,2,2>
+  2712064066U,	// <7,5,6,3>: Cost 3 vext3 RHS, <5,6,3,4>
+  2712064076U,	// <7,5,6,4>: Cost 3 vext3 RHS, <5,6,4,5>
+  2714054743U,	// <7,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712064096U,	// <7,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  1638322274U,	// <7,5,6,7>: Cost 2 vext3 RHS, <5,6,7,0>
+  1638469739U,	// <7,5,6,u>: Cost 2 vext3 RHS, <5,6,u,0>
+  1511325798U,	// <7,5,7,0>: Cost 2 vext1 <5,7,5,7>, LHS
+  2692747392U,	// <7,5,7,1>: Cost 3 vext3 <1,3,5,7>, <5,7,1,3>
+  2585069160U,	// <7,5,7,2>: Cost 3 vext1 <5,7,5,7>, <2,2,2,2>
+  2573126390U,	// <7,5,7,3>: Cost 3 vext1 <3,7,5,7>, <3,7,5,7>
+  1511329078U,	// <7,5,7,4>: Cost 2 vext1 <5,7,5,7>, RHS
+  1638469800U,	// <7,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712211626U,	// <7,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2712211636U,	// <7,5,7,7>: Cost 3 vext3 RHS, <5,7,7,1>
+  1638469823U,	// <7,5,7,u>: Cost 2 vext3 RHS, <5,7,u,3>
+  1511333990U,	// <7,5,u,0>: Cost 2 vext1 <5,7,5,u>, LHS
+  2636388142U,	// <7,5,u,1>: Cost 3 vext2 <3,1,7,5>, LHS
+  2712211671U,	// <7,5,u,2>: Cost 3 vext3 RHS, <5,u,2,0>
+  2573134583U,	// <7,5,u,3>: Cost 3 vext1 <3,7,5,u>, <3,7,5,u>
+  1511337270U,	// <7,5,u,4>: Cost 2 vext1 <5,7,5,u>, RHS
+  1638469881U,	// <7,5,u,5>: Cost 2 vext3 RHS, <5,u,5,7>
+  2712064258U,	// <7,5,u,6>: Cost 3 vext3 RHS, <5,u,6,7>
+  1638469892U,	// <7,5,u,7>: Cost 2 vext3 RHS, <5,u,7,0>
+  1638469904U,	// <7,5,u,u>: Cost 2 vext3 RHS, <5,u,u,3>
+  2650324992U,	// <7,6,0,0>: Cost 3 vext2 <5,4,7,6>, <0,0,0,0>
+  1576583270U,	// <7,6,0,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712064300U,	// <7,6,0,2>: Cost 3 vext3 RHS, <6,0,2,4>
+  2255295336U,	// <7,6,0,3>: Cost 3 vrev <6,7,3,0>
+  2712064316U,	// <7,6,0,4>: Cost 3 vext3 RHS, <6,0,4,2>
+  2585088098U,	// <7,6,0,5>: Cost 3 vext1 <5,7,6,0>, <5,6,7,0>
+  2735952204U,	// <7,6,0,6>: Cost 3 vext3 RHS, <6,0,6,0>
+  2712211799U,	// <7,6,0,7>: Cost 3 vext3 RHS, <6,0,7,2>
+  1576583837U,	// <7,6,0,u>: Cost 2 vext2 <5,4,7,6>, LHS
+  1181340494U,	// <7,6,1,0>: Cost 2 vrev <6,7,0,1>
+  2650325812U,	// <7,6,1,1>: Cost 3 vext2 <5,4,7,6>, <1,1,1,1>
+  2650325910U,	// <7,6,1,2>: Cost 3 vext2 <5,4,7,6>, <1,2,3,0>
+  2650325976U,	// <7,6,1,3>: Cost 3 vext2 <5,4,7,6>, <1,3,1,3>
+  2579123510U,	// <7,6,1,4>: Cost 3 vext1 <4,7,6,1>, RHS
+  2650326160U,	// <7,6,1,5>: Cost 3 vext2 <5,4,7,6>, <1,5,3,7>
+  2714055072U,	// <7,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2712064425U,	// <7,6,1,7>: Cost 3 vext3 RHS, <6,1,7,3>
+  1181930390U,	// <7,6,1,u>: Cost 2 vrev <6,7,u,1>
+  2712211897U,	// <7,6,2,0>: Cost 3 vext3 RHS, <6,2,0,1>
+  2714055108U,	// <7,6,2,1>: Cost 3 vext3 RHS, <6,2,1,3>
+  2650326632U,	// <7,6,2,2>: Cost 3 vext2 <5,4,7,6>, <2,2,2,2>
+  2650326694U,	// <7,6,2,3>: Cost 3 vext2 <5,4,7,6>, <2,3,0,1>
+  2714055137U,	// <7,6,2,4>: Cost 3 vext3 RHS, <6,2,4,5>
+  2714055148U,	// <7,6,2,5>: Cost 3 vext3 RHS, <6,2,5,7>
+  2650326970U,	// <7,6,2,6>: Cost 3 vext2 <5,4,7,6>, <2,6,3,7>
+  1638470138U,	// <7,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638470147U,	// <7,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2650327190U,	// <7,6,3,0>: Cost 3 vext2 <5,4,7,6>, <3,0,1,2>
+  2255172441U,	// <7,6,3,1>: Cost 3 vrev <6,7,1,3>
+  2255246178U,	// <7,6,3,2>: Cost 3 vrev <6,7,2,3>
+  2650327452U,	// <7,6,3,3>: Cost 3 vext2 <5,4,7,6>, <3,3,3,3>
+  2712064562U,	// <7,6,3,4>: Cost 3 vext3 RHS, <6,3,4,5>
+  2650327627U,	// <7,6,3,5>: Cost 3 vext2 <5,4,7,6>, <3,5,4,7>
+  3713452726U,	// <7,6,3,6>: Cost 4 vext2 <3,6,7,6>, <3,6,7,6>
+  2700563016U,	// <7,6,3,7>: Cost 3 vext3 <2,6,3,7>, <6,3,7,0>
+  2712064593U,	// <7,6,3,u>: Cost 3 vext3 RHS, <6,3,u,0>
+  2650327954U,	// <7,6,4,0>: Cost 3 vext2 <5,4,7,6>, <4,0,5,1>
+  2735952486U,	// <7,6,4,1>: Cost 3 vext3 RHS, <6,4,1,3>
+  2735952497U,	// <7,6,4,2>: Cost 3 vext3 RHS, <6,4,2,5>
+  2255328108U,	// <7,6,4,3>: Cost 3 vrev <6,7,3,4>
+  2712212100U,	// <7,6,4,4>: Cost 3 vext3 RHS, <6,4,4,6>
+  1576586550U,	// <7,6,4,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  2714055312U,	// <7,6,4,6>: Cost 3 vext3 RHS, <6,4,6,0>
+  2712212126U,	// <7,6,4,7>: Cost 3 vext3 RHS, <6,4,7,5>
+  1576586793U,	// <7,6,4,u>: Cost 2 vext2 <5,4,7,6>, RHS
+  2579152998U,	// <7,6,5,0>: Cost 3 vext1 <4,7,6,5>, LHS
+  2650328784U,	// <7,6,5,1>: Cost 3 vext2 <5,4,7,6>, <5,1,7,3>
+  2714055364U,	// <7,6,5,2>: Cost 3 vext3 RHS, <6,5,2,7>
+  3785806538U,	// <7,6,5,3>: Cost 4 vext3 RHS, <6,5,3,4>
+  1576587206U,	// <7,6,5,4>: Cost 2 vext2 <5,4,7,6>, <5,4,7,6>
+  2650329092U,	// <7,6,5,5>: Cost 3 vext2 <5,4,7,6>, <5,5,5,5>
+  2650329186U,	// <7,6,5,6>: Cost 3 vext2 <5,4,7,6>, <5,6,7,0>
+  2712064753U,	// <7,6,5,7>: Cost 3 vext3 RHS, <6,5,7,7>
+  1181963162U,	// <7,6,5,u>: Cost 2 vrev <6,7,u,5>
+  2714055421U,	// <7,6,6,0>: Cost 3 vext3 RHS, <6,6,0,1>
+  2714055432U,	// <7,6,6,1>: Cost 3 vext3 RHS, <6,6,1,3>
+  2650329594U,	// <7,6,6,2>: Cost 3 vext2 <5,4,7,6>, <6,2,7,3>
+  3785806619U,	// <7,6,6,3>: Cost 4 vext3 RHS, <6,6,3,4>
+  2712212260U,	// <7,6,6,4>: Cost 3 vext3 RHS, <6,6,4,4>
+  2714055472U,	// <7,6,6,5>: Cost 3 vext3 RHS, <6,6,5,7>
+  1638323000U,	// <7,6,6,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470466U,	// <7,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  1638470475U,	// <7,6,6,u>: Cost 2 vext3 RHS, <6,6,u,7>
+  1638323022U,	// <7,6,7,0>: Cost 2 vext3 RHS, <6,7,0,1>
+  2712064854U,	// <7,6,7,1>: Cost 3 vext3 RHS, <6,7,1,0>
+  2712064865U,	// <7,6,7,2>: Cost 3 vext3 RHS, <6,7,2,2>
+  2712064872U,	// <7,6,7,3>: Cost 3 vext3 RHS, <6,7,3,0>
+  1638323062U,	// <7,6,7,4>: Cost 2 vext3 RHS, <6,7,4,5>
+  2712064894U,	// <7,6,7,5>: Cost 3 vext3 RHS, <6,7,5,4>
+  2712064905U,	// <7,6,7,6>: Cost 3 vext3 RHS, <6,7,6,6>
+  2712064915U,	// <7,6,7,7>: Cost 3 vext3 RHS, <6,7,7,7>
+  1638323094U,	// <7,6,7,u>: Cost 2 vext3 RHS, <6,7,u,1>
+  1638470559U,	// <7,6,u,0>: Cost 2 vext3 RHS, <6,u,0,1>
+  1576589102U,	// <7,6,u,1>: Cost 2 vext2 <5,4,7,6>, LHS
+  2712212402U,	// <7,6,u,2>: Cost 3 vext3 RHS, <6,u,2,2>
+  2712212409U,	// <7,6,u,3>: Cost 3 vext3 RHS, <6,u,3,0>
+  1638470599U,	// <7,6,u,4>: Cost 2 vext3 RHS, <6,u,4,5>
+  1576589466U,	// <7,6,u,5>: Cost 2 vext2 <5,4,7,6>, RHS
+  1638323000U,	// <7,6,u,6>: Cost 2 vext3 RHS, <6,6,6,6>
+  1638470624U,	// <7,6,u,7>: Cost 2 vext3 RHS, <6,u,7,3>
+  1638470631U,	// <7,6,u,u>: Cost 2 vext3 RHS, <6,u,u,1>
+  2712065007U,	// <7,7,0,0>: Cost 3 vext3 RHS, <7,0,0,0>
+  1638323194U,	// <7,7,0,1>: Cost 2 vext3 RHS, <7,0,1,2>
+  2712065025U,	// <7,7,0,2>: Cost 3 vext3 RHS, <7,0,2,0>
+  3646958337U,	// <7,7,0,3>: Cost 4 vext1 <3,7,7,0>, <3,7,7,0>
+  2712065044U,	// <7,7,0,4>: Cost 3 vext3 RHS, <7,0,4,1>
+  2585161907U,	// <7,7,0,5>: Cost 3 vext1 <5,7,7,0>, <5,7,7,0>
+  2591134604U,	// <7,7,0,6>: Cost 3 vext1 <6,7,7,0>, <6,7,7,0>
+  2591134714U,	// <7,7,0,7>: Cost 3 vext1 <6,7,7,0>, <7,0,1,2>
+  1638323257U,	// <7,7,0,u>: Cost 2 vext3 RHS, <7,0,u,2>
+  2712065091U,	// <7,7,1,0>: Cost 3 vext3 RHS, <7,1,0,3>
+  2712065098U,	// <7,7,1,1>: Cost 3 vext3 RHS, <7,1,1,1>
+  2712065109U,	// <7,7,1,2>: Cost 3 vext3 RHS, <7,1,2,3>
+  2692748384U,	// <7,7,1,3>: Cost 3 vext3 <1,3,5,7>, <7,1,3,5>
+  2585169206U,	// <7,7,1,4>: Cost 3 vext1 <5,7,7,1>, RHS
+  2693928048U,	// <7,7,1,5>: Cost 3 vext3 <1,5,3,7>, <7,1,5,3>
+  2585170766U,	// <7,7,1,6>: Cost 3 vext1 <5,7,7,1>, <6,7,0,1>
+  2735953024U,	// <7,7,1,7>: Cost 3 vext3 RHS, <7,1,7,1>
+  2695918731U,	// <7,7,1,u>: Cost 3 vext3 <1,u,3,7>, <7,1,u,3>
+  3770471574U,	// <7,7,2,0>: Cost 4 vext3 <2,0,5,7>, <7,2,0,5>
+  3785807002U,	// <7,7,2,1>: Cost 4 vext3 RHS, <7,2,1,0>
+  2712065189U,	// <7,7,2,2>: Cost 3 vext3 RHS, <7,2,2,2>
+  2712065196U,	// <7,7,2,3>: Cost 3 vext3 RHS, <7,2,3,0>
+  3773125818U,	// <7,7,2,4>: Cost 4 vext3 <2,4,5,7>, <7,2,4,5>
+  3766490305U,	// <7,7,2,5>: Cost 4 vext3 <1,3,5,7>, <7,2,5,3>
+  2700563658U,	// <7,7,2,6>: Cost 3 vext3 <2,6,3,7>, <7,2,6,3>
+  2735953107U,	// <7,7,2,7>: Cost 3 vext3 RHS, <7,2,7,3>
+  2701890780U,	// <7,7,2,u>: Cost 3 vext3 <2,u,3,7>, <7,2,u,3>
+  2712065251U,	// <7,7,3,0>: Cost 3 vext3 RHS, <7,3,0,1>
+  3766490350U,	// <7,7,3,1>: Cost 4 vext3 <1,3,5,7>, <7,3,1,3>
+  3774305530U,	// <7,7,3,2>: Cost 4 vext3 <2,6,3,7>, <7,3,2,6>
+  2637728196U,	// <7,7,3,3>: Cost 3 vext2 <3,3,7,7>, <3,3,7,7>
+  2712065291U,	// <7,7,3,4>: Cost 3 vext3 RHS, <7,3,4,5>
+  2585186486U,	// <7,7,3,5>: Cost 3 vext1 <5,7,7,3>, <5,7,7,3>
+  2639719095U,	// <7,7,3,6>: Cost 3 vext2 <3,6,7,7>, <3,6,7,7>
+  2640382728U,	// <7,7,3,7>: Cost 3 vext2 <3,7,7,7>, <3,7,7,7>
+  2641046361U,	// <7,7,3,u>: Cost 3 vext2 <3,u,7,7>, <3,u,7,7>
+  2712212792U,	// <7,7,4,0>: Cost 3 vext3 RHS, <7,4,0,5>
+  3646989312U,	// <7,7,4,1>: Cost 4 vext1 <3,7,7,4>, <1,3,5,7>
+  3785807176U,	// <7,7,4,2>: Cost 4 vext3 RHS, <7,4,2,3>
+  3646991109U,	// <7,7,4,3>: Cost 4 vext1 <3,7,7,4>, <3,7,7,4>
+  2712065371U,	// <7,7,4,4>: Cost 3 vext3 RHS, <7,4,4,4>
+  1638323558U,	// <7,7,4,5>: Cost 2 vext3 RHS, <7,4,5,6>
+  2712212845U,	// <7,7,4,6>: Cost 3 vext3 RHS, <7,4,6,4>
+  2591167846U,	// <7,7,4,7>: Cost 3 vext1 <6,7,7,4>, <7,4,5,6>
+  1638323585U,	// <7,7,4,u>: Cost 2 vext3 RHS, <7,4,u,6>
+  2585198694U,	// <7,7,5,0>: Cost 3 vext1 <5,7,7,5>, LHS
+  2712212884U,	// <7,7,5,1>: Cost 3 vext3 RHS, <7,5,1,7>
+  3711471393U,	// <7,7,5,2>: Cost 4 vext2 <3,3,7,7>, <5,2,7,3>
+  2649673590U,	// <7,7,5,3>: Cost 3 vext2 <5,3,7,7>, <5,3,7,7>
+  2712065455U,	// <7,7,5,4>: Cost 3 vext3 RHS, <7,5,4,7>
+  1577259032U,	// <7,7,5,5>: Cost 2 vext2 <5,5,7,7>, <5,5,7,7>
+  2712065473U,	// <7,7,5,6>: Cost 3 vext3 RHS, <7,5,6,7>
+  2712212936U,	// <7,7,5,7>: Cost 3 vext3 RHS, <7,5,7,5>
+  1579249931U,	// <7,7,5,u>: Cost 2 vext2 <5,u,7,7>, <5,u,7,7>
+  2591178854U,	// <7,7,6,0>: Cost 3 vext1 <6,7,7,6>, LHS
+  2735953374U,	// <7,7,6,1>: Cost 3 vext3 RHS, <7,6,1,0>
+  2712212974U,	// <7,7,6,2>: Cost 3 vext3 RHS, <7,6,2,7>
+  2655646287U,	// <7,7,6,3>: Cost 3 vext2 <6,3,7,7>, <6,3,7,7>
+  2591182134U,	// <7,7,6,4>: Cost 3 vext1 <6,7,7,6>, RHS
+  2656973553U,	// <7,7,6,5>: Cost 3 vext2 <6,5,7,7>, <6,5,7,7>
+  1583895362U,	// <7,7,6,6>: Cost 2 vext2 <6,6,7,7>, <6,6,7,7>
+  2712065556U,	// <7,7,6,7>: Cost 3 vext3 RHS, <7,6,7,0>
+  1585222628U,	// <7,7,6,u>: Cost 2 vext2 <6,u,7,7>, <6,u,7,7>
+  1523417190U,	// <7,7,7,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  2597159670U,	// <7,7,7,1>: Cost 3 vext1 <7,7,7,7>, <1,0,3,2>
+  2597160552U,	// <7,7,7,2>: Cost 3 vext1 <7,7,7,7>, <2,2,2,2>
+  2597161110U,	// <7,7,7,3>: Cost 3 vext1 <7,7,7,7>, <3,0,1,2>
+  1523420470U,	// <7,7,7,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  2651002296U,	// <7,7,7,5>: Cost 3 vext2 <5,5,7,7>, <7,5,5,7>
+  2657637906U,	// <7,7,7,6>: Cost 3 vext2 <6,6,7,7>, <7,6,6,7>
+  363253046U,	// <7,7,7,7>: Cost 1 vdup3 RHS
+  363253046U,	// <7,7,7,u>: Cost 1 vdup3 RHS
+  1523417190U,	// <7,7,u,0>: Cost 2 vext1 <7,7,7,7>, LHS
+  1638471298U,	// <7,7,u,1>: Cost 2 vext3 RHS, <7,u,1,2>
+  2712213132U,	// <7,7,u,2>: Cost 3 vext3 RHS, <7,u,2,3>
+  2712213138U,	// <7,7,u,3>: Cost 3 vext3 RHS, <7,u,3,0>
+  1523420470U,	// <7,7,u,4>: Cost 2 vext1 <7,7,7,7>, RHS
+  1638471338U,	// <7,7,u,5>: Cost 2 vext3 RHS, <7,u,5,6>
+  1595840756U,	// <7,7,u,6>: Cost 2 vext2 <u,6,7,7>, <u,6,7,7>
+  363253046U,	// <7,7,u,7>: Cost 1 vdup3 RHS
+  363253046U,	// <7,7,u,u>: Cost 1 vdup3 RHS
+  1638318080U,	// <7,u,0,0>: Cost 2 vext3 RHS, <0,0,0,0>
+  1638323923U,	// <7,u,0,1>: Cost 2 vext3 RHS, <u,0,1,2>
+  1662211804U,	// <7,u,0,2>: Cost 2 vext3 RHS, <u,0,2,2>
+  1638323941U,	// <7,u,0,3>: Cost 2 vext3 RHS, <u,0,3,2>
+  2712065773U,	// <7,u,0,4>: Cost 3 vext3 RHS, <u,0,4,1>
+  1662359286U,	// <7,u,0,5>: Cost 2 vext3 RHS, <u,0,5,1>
+  1662359296U,	// <7,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  2987150664U,	// <7,u,0,7>: Cost 3 vzipr <5,6,7,0>, RHS
+  1638323986U,	// <7,u,0,u>: Cost 2 vext3 RHS, <u,0,u,2>
+  1517469798U,	// <7,u,1,0>: Cost 2 vext1 <6,7,u,1>, LHS
+  1638318900U,	// <7,u,1,1>: Cost 2 vext3 RHS, <1,1,1,1>
+  564582190U,	// <7,u,1,2>: Cost 1 vext3 RHS, LHS
+  1638324023U,	// <7,u,1,3>: Cost 2 vext3 RHS, <u,1,3,3>
+  1517473078U,	// <7,u,1,4>: Cost 2 vext1 <6,7,u,1>, RHS
+  2693928777U,	// <7,u,1,5>: Cost 3 vext3 <1,5,3,7>, <u,1,5,3>
+  1517474710U,	// <7,u,1,6>: Cost 2 vext1 <6,7,u,1>, <6,7,u,1>
+  1640462171U,	// <7,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  564582244U,	// <7,u,1,u>: Cost 1 vext3 RHS, LHS
+  1638318244U,	// <7,u,2,0>: Cost 2 vext3 RHS, <0,2,0,2>
+  2712065907U,	// <7,u,2,1>: Cost 3 vext3 RHS, <u,2,1,0>
+  1638319720U,	// <7,u,2,2>: Cost 2 vext3 RHS, <2,2,2,2>
+  1638324101U,	// <7,u,2,3>: Cost 2 vext3 RHS, <u,2,3,0>
+  1638318284U,	// <7,u,2,4>: Cost 2 vext3 RHS, <0,2,4,6>
+  2712065947U,	// <7,u,2,5>: Cost 3 vext3 RHS, <u,2,5,4>
+  2700564387U,	// <7,u,2,6>: Cost 3 vext3 <2,6,3,7>, <u,2,6,3>
+  1640314796U,	// <7,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  1638324146U,	// <7,u,2,u>: Cost 2 vext3 RHS, <u,2,u,0>
+  1638324156U,	// <7,u,3,0>: Cost 2 vext3 RHS, <u,3,0,1>
+  1638319064U,	// <7,u,3,1>: Cost 2 vext3 RHS, <1,3,1,3>
+  2700564435U,	// <7,u,3,2>: Cost 3 vext3 <2,6,3,7>, <u,3,2,6>
+  1638320540U,	// <7,u,3,3>: Cost 2 vext3 RHS, <3,3,3,3>
+  1638324196U,	// <7,u,3,4>: Cost 2 vext3 RHS, <u,3,4,5>
+  1638324207U,	// <7,u,3,5>: Cost 2 vext3 RHS, <u,3,5,7>
+  2700564472U,	// <7,u,3,6>: Cost 3 vext3 <2,6,3,7>, <u,3,6,7>
+  2695919610U,	// <7,u,3,7>: Cost 3 vext3 <1,u,3,7>, <u,3,7,0>
+  1638324228U,	// <7,u,3,u>: Cost 2 vext3 RHS, <u,3,u,1>
+  2712066061U,	// <7,u,4,0>: Cost 3 vext3 RHS, <u,4,0,1>
+  1662212122U,	// <7,u,4,1>: Cost 2 vext3 RHS, <u,4,1,5>
+  1662212132U,	// <7,u,4,2>: Cost 2 vext3 RHS, <u,4,2,6>
+  2712066092U,	// <7,u,4,3>: Cost 3 vext3 RHS, <u,4,3,5>
+  1638321360U,	// <7,u,4,4>: Cost 2 vext3 RHS, <4,4,4,4>
+  1638324287U,	// <7,u,4,5>: Cost 2 vext3 RHS, <u,4,5,6>
+  1662359624U,	// <7,u,4,6>: Cost 2 vext3 RHS, <u,4,6,6>
+  1640314961U,	// <7,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  1638324314U,	// <7,u,4,u>: Cost 2 vext3 RHS, <u,4,u,6>
+  1517502566U,	// <7,u,5,0>: Cost 2 vext1 <6,7,u,5>, LHS
+  1574612693U,	// <7,u,5,1>: Cost 2 vext2 <5,1,7,u>, <5,1,7,u>
+  2712066162U,	// <7,u,5,2>: Cost 3 vext3 RHS, <u,5,2,3>
+  1638324351U,	// <7,u,5,3>: Cost 2 vext3 RHS, <u,5,3,7>
+  1576603592U,	// <7,u,5,4>: Cost 2 vext2 <5,4,7,u>, <5,4,7,u>
+  1577267225U,	// <7,u,5,5>: Cost 2 vext2 <5,5,7,u>, <5,5,7,u>
+  564582554U,	// <7,u,5,6>: Cost 1 vext3 RHS, RHS
+  1640462499U,	// <7,u,5,7>: Cost 2 vext3 RHS, <u,5,7,7>
+  564582572U,	// <7,u,5,u>: Cost 1 vext3 RHS, RHS
+  2712066223U,	// <7,u,6,0>: Cost 3 vext3 RHS, <u,6,0,1>
+  2712066238U,	// <7,u,6,1>: Cost 3 vext3 RHS, <u,6,1,7>
+  1581249023U,	// <7,u,6,2>: Cost 2 vext2 <6,2,7,u>, <6,2,7,u>
+  1638324432U,	// <7,u,6,3>: Cost 2 vext3 RHS, <u,6,3,7>
+  1638468980U,	// <7,u,6,4>: Cost 2 vext3 RHS, <4,6,4,6>
+  2712066274U,	// <7,u,6,5>: Cost 3 vext3 RHS, <u,6,5,7>
+  1583903555U,	// <7,u,6,6>: Cost 2 vext2 <6,6,7,u>, <6,6,7,u>
+  1640315117U,	// <7,u,6,7>: Cost 2 vext3 RHS, <u,6,7,0>
+  1638324477U,	// <7,u,6,u>: Cost 2 vext3 RHS, <u,6,u,7>
+  1638471936U,	// <7,u,7,0>: Cost 2 vext3 RHS, <u,7,0,1>
+  2692970763U,	// <7,u,7,1>: Cost 3 vext3 <1,3,u,7>, <u,7,1,3>
+  2700933399U,	// <7,u,7,2>: Cost 3 vext3 <2,6,u,7>, <u,7,2,6>
+  2573347601U,	// <7,u,7,3>: Cost 3 vext1 <3,7,u,7>, <3,7,u,7>
+  1638471976U,	// <7,u,7,4>: Cost 2 vext3 RHS, <u,7,4,5>
+  1511551171U,	// <7,u,7,5>: Cost 2 vext1 <5,7,u,7>, <5,7,u,7>
+  2712213815U,	// <7,u,7,6>: Cost 3 vext3 RHS, <u,7,6,2>
+  363253046U,	// <7,u,7,7>: Cost 1 vdup3 RHS
+  363253046U,	// <7,u,7,u>: Cost 1 vdup3 RHS
+  1638324561U,	// <7,u,u,0>: Cost 2 vext3 RHS, <u,u,0,1>
+  1638324571U,	// <7,u,u,1>: Cost 2 vext3 RHS, <u,u,1,2>
+  564582757U,	// <7,u,u,2>: Cost 1 vext3 RHS, LHS
+  1638324587U,	// <7,u,u,3>: Cost 2 vext3 RHS, <u,u,3,0>
+  1638324601U,	// <7,u,u,4>: Cost 2 vext3 RHS, <u,u,4,5>
+  1638324611U,	// <7,u,u,5>: Cost 2 vext3 RHS, <u,u,5,6>
+  564582797U,	// <7,u,u,6>: Cost 1 vext3 RHS, RHS
+  363253046U,	// <7,u,u,7>: Cost 1 vdup3 RHS
+  564582811U,	// <7,u,u,u>: Cost 1 vext3 RHS, LHS
+  135053414U,	// <u,0,0,0>: Cost 1 vdup0 LHS
+  1611489290U,	// <u,0,0,1>: Cost 2 vext3 LHS, <0,0,1,1>
+  1611489300U,	// <u,0,0,2>: Cost 2 vext3 LHS, <0,0,2,2>
+  2568054923U,	// <u,0,0,3>: Cost 3 vext1 <3,0,0,0>, <3,0,0,0>
+  1481706806U,	// <u,0,0,4>: Cost 2 vext1 <0,u,0,0>, RHS
+  2555449040U,	// <u,0,0,5>: Cost 3 vext1 <0,u,0,0>, <5,1,7,3>
+  2591282078U,	// <u,0,0,6>: Cost 3 vext1 <6,u,0,0>, <6,u,0,0>
+  2591945711U,	// <u,0,0,7>: Cost 3 vext1 <7,0,0,0>, <7,0,0,0>
+  135053414U,	// <u,0,0,u>: Cost 1 vdup0 LHS
+  1493655654U,	// <u,0,1,0>: Cost 2 vext1 <2,u,0,1>, LHS
+  1860550758U,	// <u,0,1,1>: Cost 2 vzipl LHS, LHS
+  537747563U,	// <u,0,1,2>: Cost 1 vext3 LHS, LHS
+  2625135576U,	// <u,0,1,3>: Cost 3 vext2 <1,2,u,0>, <1,3,1,3>
+  1493658934U,	// <u,0,1,4>: Cost 2 vext1 <2,u,0,1>, RHS
+  2625135760U,	// <u,0,1,5>: Cost 3 vext2 <1,2,u,0>, <1,5,3,7>
+  1517548447U,	// <u,0,1,6>: Cost 2 vext1 <6,u,0,1>, <6,u,0,1>
+  2591290362U,	// <u,0,1,7>: Cost 3 vext1 <6,u,0,1>, <7,0,1,2>
+  537747612U,	// <u,0,1,u>: Cost 1 vext3 LHS, LHS
+  1611489444U,	// <u,0,2,0>: Cost 2 vext3 LHS, <0,2,0,2>
+  2685231276U,	// <u,0,2,1>: Cost 3 vext3 LHS, <0,2,1,1>
+  1994768486U,	// <u,0,2,2>: Cost 2 vtrnl LHS, LHS
+  2685231294U,	// <u,0,2,3>: Cost 3 vext3 LHS, <0,2,3,1>
+  1611489484U,	// <u,0,2,4>: Cost 2 vext3 LHS, <0,2,4,6>
+  2712068310U,	// <u,0,2,5>: Cost 3 vext3 RHS, <0,2,5,7>
+  2625136570U,	// <u,0,2,6>: Cost 3 vext2 <1,2,u,0>, <2,6,3,7>
+  2591962097U,	// <u,0,2,7>: Cost 3 vext1 <7,0,0,2>, <7,0,0,2>
+  1611489516U,	// <u,0,2,u>: Cost 2 vext3 LHS, <0,2,u,2>
+  2954067968U,	// <u,0,3,0>: Cost 3 vzipr LHS, <0,0,0,0>
+  2685231356U,	// <u,0,3,1>: Cost 3 vext3 LHS, <0,3,1,0>
+  72589981U,	// <u,0,3,2>: Cost 1 vrev LHS
+  2625137052U,	// <u,0,3,3>: Cost 3 vext2 <1,2,u,0>, <3,3,3,3>
+  2625137154U,	// <u,0,3,4>: Cost 3 vext2 <1,2,u,0>, <3,4,5,6>
+  2639071848U,	// <u,0,3,5>: Cost 3 vext2 <3,5,u,0>, <3,5,u,0>
+  2639735481U,	// <u,0,3,6>: Cost 3 vext2 <3,6,u,0>, <3,6,u,0>
+  2597279354U,	// <u,0,3,7>: Cost 3 vext1 <7,u,0,3>, <7,u,0,3>
+  73032403U,	// <u,0,3,u>: Cost 1 vrev LHS
+  2687074636U,	// <u,0,4,0>: Cost 3 vext3 <0,4,0,u>, <0,4,0,u>
+  1611489618U,	// <u,0,4,1>: Cost 2 vext3 LHS, <0,4,1,5>
+  1611489628U,	// <u,0,4,2>: Cost 2 vext3 LHS, <0,4,2,6>
+  3629222038U,	// <u,0,4,3>: Cost 4 vext1 <0,u,0,4>, <3,0,1,2>
+  2555481398U,	// <u,0,4,4>: Cost 3 vext1 <0,u,0,4>, RHS
+  1551396150U,	// <u,0,4,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  2651680116U,	// <u,0,4,6>: Cost 3 vext2 <5,6,u,0>, <4,6,4,6>
+  2646150600U,	// <u,0,4,7>: Cost 3 vext2 <4,7,5,0>, <4,7,5,0>
+  1611932050U,	// <u,0,4,u>: Cost 2 vext3 LHS, <0,4,u,6>
+  2561458278U,	// <u,0,5,0>: Cost 3 vext1 <1,u,0,5>, LHS
+  1863532646U,	// <u,0,5,1>: Cost 2 vzipl RHS, LHS
+  2712068526U,	// <u,0,5,2>: Cost 3 vext3 RHS, <0,5,2,7>
+  2649689976U,	// <u,0,5,3>: Cost 3 vext2 <5,3,u,0>, <5,3,u,0>
+  2220237489U,	// <u,0,5,4>: Cost 3 vrev <0,u,4,5>
+  2651680772U,	// <u,0,5,5>: Cost 3 vext2 <5,6,u,0>, <5,5,5,5>
+  1577939051U,	// <u,0,5,6>: Cost 2 vext2 <5,6,u,0>, <5,6,u,0>
+  2830077238U,	// <u,0,5,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  1579266317U,	// <u,0,5,u>: Cost 2 vext2 <5,u,u,0>, <5,u,u,0>
+  2555494502U,	// <u,0,6,0>: Cost 3 vext1 <0,u,0,6>, LHS
+  2712068598U,	// <u,0,6,1>: Cost 3 vext3 RHS, <0,6,1,7>
+  1997750374U,	// <u,0,6,2>: Cost 2 vtrnl RHS, LHS
+  2655662673U,	// <u,0,6,3>: Cost 3 vext2 <6,3,u,0>, <6,3,u,0>
+  2555497782U,	// <u,0,6,4>: Cost 3 vext1 <0,u,0,6>, RHS
+  2651681459U,	// <u,0,6,5>: Cost 3 vext2 <5,6,u,0>, <6,5,0,u>
+  2651681592U,	// <u,0,6,6>: Cost 3 vext2 <5,6,u,0>, <6,6,6,6>
+  2651681614U,	// <u,0,6,7>: Cost 3 vext2 <5,6,u,0>, <6,7,0,1>
+  1997750428U,	// <u,0,6,u>: Cost 2 vtrnl RHS, LHS
+  2567446630U,	// <u,0,7,0>: Cost 3 vext1 <2,u,0,7>, LHS
+  2567447446U,	// <u,0,7,1>: Cost 3 vext1 <2,u,0,7>, <1,2,3,0>
+  2567448641U,	// <u,0,7,2>: Cost 3 vext1 <2,u,0,7>, <2,u,0,7>
+  2573421338U,	// <u,0,7,3>: Cost 3 vext1 <3,u,0,7>, <3,u,0,7>
+  2567449910U,	// <u,0,7,4>: Cost 3 vext1 <2,u,0,7>, RHS
+  2651682242U,	// <u,0,7,5>: Cost 3 vext2 <5,6,u,0>, <7,5,6,u>
+  2591339429U,	// <u,0,7,6>: Cost 3 vext1 <6,u,0,7>, <6,u,0,7>
+  2651682412U,	// <u,0,7,7>: Cost 3 vext2 <5,6,u,0>, <7,7,7,7>
+  2567452462U,	// <u,0,7,u>: Cost 3 vext1 <2,u,0,7>, LHS
+  135053414U,	// <u,0,u,0>: Cost 1 vdup0 LHS
+  1611489938U,	// <u,0,u,1>: Cost 2 vext3 LHS, <0,u,1,1>
+  537748125U,	// <u,0,u,2>: Cost 1 vext3 LHS, LHS
+  2685674148U,	// <u,0,u,3>: Cost 3 vext3 LHS, <0,u,3,1>
+  1611932338U,	// <u,0,u,4>: Cost 2 vext3 LHS, <0,u,4,6>
+  1551399066U,	// <u,0,u,5>: Cost 2 vext2 <1,2,u,0>, RHS
+  1517605798U,	// <u,0,u,6>: Cost 2 vext1 <6,u,0,u>, <6,u,0,u>
+  2830077481U,	// <u,0,u,7>: Cost 3 vuzpr <1,u,3,0>, RHS
+  537748179U,	// <u,0,u,u>: Cost 1 vext3 LHS, LHS
+  1544101961U,	// <u,1,0,0>: Cost 2 vext2 <0,0,u,1>, <0,0,u,1>
+  1558036582U,	// <u,1,0,1>: Cost 2 vext2 <2,3,u,1>, LHS
+  2619171051U,	// <u,1,0,2>: Cost 3 vext2 <0,2,u,1>, <0,2,u,1>
+  1611490038U,	// <u,1,0,3>: Cost 2 vext3 LHS, <1,0,3,2>
+  2555522358U,	// <u,1,0,4>: Cost 3 vext1 <0,u,1,0>, RHS
+  2712068871U,	// <u,1,0,5>: Cost 3 vext3 RHS, <1,0,5,1>
+  2591355815U,	// <u,1,0,6>: Cost 3 vext1 <6,u,1,0>, <6,u,1,0>
+  2597328512U,	// <u,1,0,7>: Cost 3 vext1 <7,u,1,0>, <7,u,1,0>
+  1611490083U,	// <u,1,0,u>: Cost 2 vext3 LHS, <1,0,u,2>
+  1481785446U,	// <u,1,1,0>: Cost 2 vext1 <0,u,1,1>, LHS
+  202162278U,	// <u,1,1,1>: Cost 1 vdup1 LHS
+  2555528808U,	// <u,1,1,2>: Cost 3 vext1 <0,u,1,1>, <2,2,2,2>
+  1611490120U,	// <u,1,1,3>: Cost 2 vext3 LHS, <1,1,3,3>
+  1481788726U,	// <u,1,1,4>: Cost 2 vext1 <0,u,1,1>, RHS
+  2689876828U,	// <u,1,1,5>: Cost 3 vext3 LHS, <1,1,5,5>
+  2591364008U,	// <u,1,1,6>: Cost 3 vext1 <6,u,1,1>, <6,u,1,1>
+  2592691274U,	// <u,1,1,7>: Cost 3 vext1 <7,1,1,1>, <7,1,1,1>
+  202162278U,	// <u,1,1,u>: Cost 1 vdup1 LHS
+  1499709542U,	// <u,1,2,0>: Cost 2 vext1 <3,u,1,2>, LHS
+  2689876871U,	// <u,1,2,1>: Cost 3 vext3 LHS, <1,2,1,3>
+  2631116445U,	// <u,1,2,2>: Cost 3 vext2 <2,2,u,1>, <2,2,u,1>
+  835584U,	// <u,1,2,3>: Cost 0 copy LHS
+  1499712822U,	// <u,1,2,4>: Cost 2 vext1 <3,u,1,2>, RHS
+  2689876907U,	// <u,1,2,5>: Cost 3 vext3 LHS, <1,2,5,3>
+  2631780282U,	// <u,1,2,6>: Cost 3 vext2 <2,3,u,1>, <2,6,3,7>
+  1523603074U,	// <u,1,2,7>: Cost 2 vext1 <7,u,1,2>, <7,u,1,2>
+  835584U,	// <u,1,2,u>: Cost 0 copy LHS
+  1487773798U,	// <u,1,3,0>: Cost 2 vext1 <1,u,1,3>, LHS
+  1611490264U,	// <u,1,3,1>: Cost 2 vext3 LHS, <1,3,1,3>
+  2685232094U,	// <u,1,3,2>: Cost 3 vext3 LHS, <1,3,2,0>
+  2018746470U,	// <u,1,3,3>: Cost 2 vtrnr LHS, LHS
+  1487777078U,	// <u,1,3,4>: Cost 2 vext1 <1,u,1,3>, RHS
+  1611490304U,	// <u,1,3,5>: Cost 2 vext3 LHS, <1,3,5,7>
+  2685674505U,	// <u,1,3,6>: Cost 3 vext3 LHS, <1,3,6,7>
+  2640407307U,	// <u,1,3,7>: Cost 3 vext2 <3,7,u,1>, <3,7,u,1>
+  1611490327U,	// <u,1,3,u>: Cost 2 vext3 LHS, <1,3,u,3>
+  1567992749U,	// <u,1,4,0>: Cost 2 vext2 <4,0,u,1>, <4,0,u,1>
+  2693121070U,	// <u,1,4,1>: Cost 3 vext3 <1,4,1,u>, <1,4,1,u>
+  2693194807U,	// <u,1,4,2>: Cost 3 vext3 <1,4,2,u>, <1,4,2,u>
+  1152386432U,	// <u,1,4,3>: Cost 2 vrev <1,u,3,4>
+  2555555126U,	// <u,1,4,4>: Cost 3 vext1 <0,u,1,4>, RHS
+  1558039862U,	// <u,1,4,5>: Cost 2 vext2 <2,3,u,1>, RHS
+  2645716371U,	// <u,1,4,6>: Cost 3 vext2 <4,6,u,1>, <4,6,u,1>
+  2597361284U,	// <u,1,4,7>: Cost 3 vext1 <7,u,1,4>, <7,u,1,4>
+  1152755117U,	// <u,1,4,u>: Cost 2 vrev <1,u,u,4>
+  1481818214U,	// <u,1,5,0>: Cost 2 vext1 <0,u,1,5>, LHS
+  2555560694U,	// <u,1,5,1>: Cost 3 vext1 <0,u,1,5>, <1,0,3,2>
+  2555561576U,	// <u,1,5,2>: Cost 3 vext1 <0,u,1,5>, <2,2,2,2>
+  1611490448U,	// <u,1,5,3>: Cost 2 vext3 LHS, <1,5,3,7>
+  1481821494U,	// <u,1,5,4>: Cost 2 vext1 <0,u,1,5>, RHS
+  2651025435U,	// <u,1,5,5>: Cost 3 vext2 <5,5,u,1>, <5,5,u,1>
+  2651689068U,	// <u,1,5,6>: Cost 3 vext2 <5,6,u,1>, <5,6,u,1>
+  2823966006U,	// <u,1,5,7>: Cost 3 vuzpr <0,u,1,1>, RHS
+  1611932861U,	// <u,1,5,u>: Cost 2 vext3 LHS, <1,5,u,7>
+  2555568230U,	// <u,1,6,0>: Cost 3 vext1 <0,u,1,6>, LHS
+  2689877199U,	// <u,1,6,1>: Cost 3 vext3 LHS, <1,6,1,7>
+  2712069336U,	// <u,1,6,2>: Cost 3 vext3 RHS, <1,6,2,7>
+  2685232353U,	// <u,1,6,3>: Cost 3 vext3 LHS, <1,6,3,7>
+  2555571510U,	// <u,1,6,4>: Cost 3 vext1 <0,u,1,6>, RHS
+  2689877235U,	// <u,1,6,5>: Cost 3 vext3 LHS, <1,6,5,7>
+  2657661765U,	// <u,1,6,6>: Cost 3 vext2 <6,6,u,1>, <6,6,u,1>
+  1584583574U,	// <u,1,6,7>: Cost 2 vext2 <6,7,u,1>, <6,7,u,1>
+  1585247207U,	// <u,1,6,u>: Cost 2 vext2 <6,u,u,1>, <6,u,u,1>
+  2561548390U,	// <u,1,7,0>: Cost 3 vext1 <1,u,1,7>, LHS
+  2561549681U,	// <u,1,7,1>: Cost 3 vext1 <1,u,1,7>, <1,u,1,7>
+  2573493926U,	// <u,1,7,2>: Cost 3 vext1 <3,u,1,7>, <2,3,0,1>
+  2042962022U,	// <u,1,7,3>: Cost 2 vtrnr RHS, LHS
+  2561551670U,	// <u,1,7,4>: Cost 3 vext1 <1,u,1,7>, RHS
+  2226300309U,	// <u,1,7,5>: Cost 3 vrev <1,u,5,7>
+  2658325990U,	// <u,1,7,6>: Cost 3 vext2 <6,7,u,1>, <7,6,1,u>
+  2658326124U,	// <u,1,7,7>: Cost 3 vext2 <6,7,u,1>, <7,7,7,7>
+  2042962027U,	// <u,1,7,u>: Cost 2 vtrnr RHS, LHS
+  1481842790U,	// <u,1,u,0>: Cost 2 vext1 <0,u,1,u>, LHS
+  202162278U,	// <u,1,u,1>: Cost 1 vdup1 LHS
+  2685674867U,	// <u,1,u,2>: Cost 3 vext3 LHS, <1,u,2,0>
+  835584U,	// <u,1,u,3>: Cost 0 copy LHS
+  1481846070U,	// <u,1,u,4>: Cost 2 vext1 <0,u,1,u>, RHS
+  1611933077U,	// <u,1,u,5>: Cost 2 vext3 LHS, <1,u,5,7>
+  2685674910U,	// <u,1,u,6>: Cost 3 vext3 LHS, <1,u,6,7>
+  1523652232U,	// <u,1,u,7>: Cost 2 vext1 <7,u,1,u>, <7,u,1,u>
+  835584U,	// <u,1,u,u>: Cost 0 copy LHS
+  1544110154U,	// <u,2,0,0>: Cost 2 vext2 <0,0,u,2>, <0,0,u,2>
+  1545437286U,	// <u,2,0,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  1545437420U,	// <u,2,0,2>: Cost 2 vext2 <0,2,u,2>, <0,2,u,2>
+  2685232589U,	// <u,2,0,3>: Cost 3 vext3 LHS, <2,0,3,0>
+  2619179346U,	// <u,2,0,4>: Cost 3 vext2 <0,2,u,2>, <0,4,1,5>
+  2712069606U,	// <u,2,0,5>: Cost 3 vext3 RHS, <2,0,5,7>
+  2689877484U,	// <u,2,0,6>: Cost 3 vext3 LHS, <2,0,6,4>
+  2659656273U,	// <u,2,0,7>: Cost 3 vext2 <7,0,u,2>, <0,7,2,u>
+  1545437853U,	// <u,2,0,u>: Cost 2 vext2 <0,2,u,2>, LHS
+  1550082851U,	// <u,2,1,0>: Cost 2 vext2 <1,0,u,2>, <1,0,u,2>
+  2619179828U,	// <u,2,1,1>: Cost 3 vext2 <0,2,u,2>, <1,1,1,1>
+  2619179926U,	// <u,2,1,2>: Cost 3 vext2 <0,2,u,2>, <1,2,3,0>
+  2685232671U,	// <u,2,1,3>: Cost 3 vext3 LHS, <2,1,3,1>
+  2555604278U,	// <u,2,1,4>: Cost 3 vext1 <0,u,2,1>, RHS
+  2619180176U,	// <u,2,1,5>: Cost 3 vext2 <0,2,u,2>, <1,5,3,7>
+  2689877564U,	// <u,2,1,6>: Cost 3 vext3 LHS, <2,1,6,3>
+  2602718850U,	// <u,2,1,7>: Cost 3 vext1 <u,7,2,1>, <7,u,1,2>
+  1158703235U,	// <u,2,1,u>: Cost 2 vrev <2,u,u,1>
+  1481867366U,	// <u,2,2,0>: Cost 2 vext1 <0,u,2,2>, LHS
+  2555609846U,	// <u,2,2,1>: Cost 3 vext1 <0,u,2,2>, <1,0,3,2>
+  269271142U,	// <u,2,2,2>: Cost 1 vdup2 LHS
+  1611490930U,	// <u,2,2,3>: Cost 2 vext3 LHS, <2,2,3,3>
+  1481870646U,	// <u,2,2,4>: Cost 2 vext1 <0,u,2,2>, RHS
+  2689877640U,	// <u,2,2,5>: Cost 3 vext3 LHS, <2,2,5,7>
+  2619180986U,	// <u,2,2,6>: Cost 3 vext2 <0,2,u,2>, <2,6,3,7>
+  2593436837U,	// <u,2,2,7>: Cost 3 vext1 <7,2,2,2>, <7,2,2,2>
+  269271142U,	// <u,2,2,u>: Cost 1 vdup2 LHS
+  408134301U,	// <u,2,3,0>: Cost 1 vext1 LHS, LHS
+  1481876214U,	// <u,2,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  1481877096U,	// <u,2,3,2>: Cost 2 vext1 LHS, <2,2,2,2>
+  1880326246U,	// <u,2,3,3>: Cost 2 vzipr LHS, LHS
+  408137014U,	// <u,2,3,4>: Cost 1 vext1 LHS, RHS
+  1529654992U,	// <u,2,3,5>: Cost 2 vext1 LHS, <5,1,7,3>
+  1529655802U,	// <u,2,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1529656314U,	// <u,2,3,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408139566U,	// <u,2,3,u>: Cost 1 vext1 LHS, LHS
+  1567853468U,	// <u,2,4,0>: Cost 2 vext2 <4,0,6,2>, <4,0,6,2>
+  2561598362U,	// <u,2,4,1>: Cost 3 vext1 <1,u,2,4>, <1,2,3,4>
+  2555627214U,	// <u,2,4,2>: Cost 3 vext1 <0,u,2,4>, <2,3,4,5>
+  2685232918U,	// <u,2,4,3>: Cost 3 vext3 LHS, <2,4,3,5>
+  2555628854U,	// <u,2,4,4>: Cost 3 vext1 <0,u,2,4>, RHS
+  1545440566U,	// <u,2,4,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1571982740U,	// <u,2,4,6>: Cost 2 vext2 <4,6,u,2>, <4,6,u,2>
+  2592125957U,	// <u,2,4,7>: Cost 3 vext1 <7,0,2,4>, <7,0,2,4>
+  1545440809U,	// <u,2,4,u>: Cost 2 vext2 <0,2,u,2>, RHS
+  2555633766U,	// <u,2,5,0>: Cost 3 vext1 <0,u,2,5>, LHS
+  2561606550U,	// <u,2,5,1>: Cost 3 vext1 <1,u,2,5>, <1,2,3,0>
+  2689877856U,	// <u,2,5,2>: Cost 3 vext3 LHS, <2,5,2,7>
+  2685233000U,	// <u,2,5,3>: Cost 3 vext3 LHS, <2,5,3,6>
+  1158441059U,	// <u,2,5,4>: Cost 2 vrev <2,u,4,5>
+  2645725188U,	// <u,2,5,5>: Cost 3 vext2 <4,6,u,2>, <5,5,5,5>
+  2689877892U,	// <u,2,5,6>: Cost 3 vext3 LHS, <2,5,6,7>
+  2823900470U,	// <u,2,5,7>: Cost 3 vuzpr <0,u,0,2>, RHS
+  1158736007U,	// <u,2,5,u>: Cost 2 vrev <2,u,u,5>
+  1481900134U,	// <u,2,6,0>: Cost 2 vext1 <0,u,2,6>, LHS
+  2555642614U,	// <u,2,6,1>: Cost 3 vext1 <0,u,2,6>, <1,0,3,2>
+  2555643496U,	// <u,2,6,2>: Cost 3 vext1 <0,u,2,6>, <2,2,2,2>
+  1611491258U,	// <u,2,6,3>: Cost 2 vext3 LHS, <2,6,3,7>
+  1481903414U,	// <u,2,6,4>: Cost 2 vext1 <0,u,2,6>, RHS
+  2689877964U,	// <u,2,6,5>: Cost 3 vext3 LHS, <2,6,5,7>
+  2689877973U,	// <u,2,6,6>: Cost 3 vext3 LHS, <2,6,6,7>
+  2645726030U,	// <u,2,6,7>: Cost 3 vext2 <4,6,u,2>, <6,7,0,1>
+  1611933671U,	// <u,2,6,u>: Cost 2 vext3 LHS, <2,6,u,7>
+  1585919033U,	// <u,2,7,0>: Cost 2 vext2 <7,0,u,2>, <7,0,u,2>
+  2573566710U,	// <u,2,7,1>: Cost 3 vext1 <3,u,2,7>, <1,0,3,2>
+  2567596115U,	// <u,2,7,2>: Cost 3 vext1 <2,u,2,7>, <2,u,2,7>
+  1906901094U,	// <u,2,7,3>: Cost 2 vzipr RHS, LHS
+  2555653430U,	// <u,2,7,4>: Cost 3 vext1 <0,u,2,7>, RHS
+  2800080230U,	// <u,2,7,5>: Cost 3 vuzpl LHS, <7,4,5,6>
+  2980643164U,	// <u,2,7,6>: Cost 3 vzipr RHS, <0,4,2,6>
+  2645726828U,	// <u,2,7,7>: Cost 3 vext2 <4,6,u,2>, <7,7,7,7>
+  1906901099U,	// <u,2,7,u>: Cost 2 vzipr RHS, LHS
+  408175266U,	// <u,2,u,0>: Cost 1 vext1 LHS, LHS
+  1545443118U,	// <u,2,u,1>: Cost 2 vext2 <0,2,u,2>, LHS
+  269271142U,	// <u,2,u,2>: Cost 1 vdup2 LHS
+  1611491416U,	// <u,2,u,3>: Cost 2 vext3 LHS, <2,u,3,3>
+  408177974U,	// <u,2,u,4>: Cost 1 vext1 LHS, RHS
+  1545443482U,	// <u,2,u,5>: Cost 2 vext2 <0,2,u,2>, RHS
+  1726339226U,	// <u,2,u,6>: Cost 2 vuzpl LHS, RHS
+  1529697274U,	// <u,2,u,7>: Cost 2 vext1 LHS, <7,0,1,2>
+  408180526U,	// <u,2,u,u>: Cost 1 vext1 LHS, LHS
+  1544781824U,	// <u,3,0,0>: Cost 2 vext2 LHS, <0,0,0,0>
+  471040156U,	// <u,3,0,1>: Cost 1 vext2 LHS, LHS
+  1544781988U,	// <u,3,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  2618523900U,	// <u,3,0,3>: Cost 3 vext2 LHS, <0,3,1,0>
+  1544782162U,	// <u,3,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  2238188352U,	// <u,3,0,5>: Cost 3 vrev <3,u,5,0>
+  2623169023U,	// <u,3,0,6>: Cost 3 vext2 LHS, <0,6,2,7>
+  2238335826U,	// <u,3,0,7>: Cost 3 vrev <3,u,7,0>
+  471040669U,	// <u,3,0,u>: Cost 1 vext2 LHS, LHS
+  1544782582U,	// <u,3,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  1544782644U,	// <u,3,1,1>: Cost 2 vext2 LHS, <1,1,1,1>
+  1544782742U,	// <u,3,1,2>: Cost 2 vext2 LHS, <1,2,3,0>
+  1544782808U,	// <u,3,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  2618524733U,	// <u,3,1,4>: Cost 3 vext2 LHS, <1,4,3,5>
+  1544782992U,	// <u,3,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  2618524897U,	// <u,3,1,6>: Cost 3 vext2 LHS, <1,6,3,7>
+  2703517987U,	// <u,3,1,7>: Cost 3 vext3 <3,1,7,u>, <3,1,7,u>
+  1544783213U,	// <u,3,1,u>: Cost 2 vext2 LHS, <1,u,1,3>
+  1529716838U,	// <u,3,2,0>: Cost 2 vext1 <u,u,3,2>, LHS
+  1164167966U,	// <u,3,2,1>: Cost 2 vrev <3,u,1,2>
+  1544783464U,	// <u,3,2,2>: Cost 2 vext2 LHS, <2,2,2,2>
+  1544783526U,	// <u,3,2,3>: Cost 2 vext2 LHS, <2,3,0,1>
+  1529720118U,	// <u,3,2,4>: Cost 2 vext1 <u,u,3,2>, RHS
+  2618525544U,	// <u,3,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544783802U,	// <u,3,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  2704181620U,	// <u,3,2,7>: Cost 3 vext3 <3,2,7,u>, <3,2,7,u>
+  1544783931U,	// <u,3,2,u>: Cost 2 vext2 LHS, <2,u,0,1>
+  1544784022U,	// <u,3,3,0>: Cost 2 vext2 LHS, <3,0,1,2>
+  1487922559U,	// <u,3,3,1>: Cost 2 vext1 <1,u,3,3>, <1,u,3,3>
+  1493895256U,	// <u,3,3,2>: Cost 2 vext1 <2,u,3,3>, <2,u,3,3>
+  336380006U,	// <u,3,3,3>: Cost 1 vdup3 LHS
+  1544784386U,	// <u,3,3,4>: Cost 2 vext2 LHS, <3,4,5,6>
+  2824054478U,	// <u,3,3,5>: Cost 3 vuzpr LHS, <2,3,4,5>
+  2238286668U,	// <u,3,3,6>: Cost 3 vrev <3,u,6,3>
+  2954069136U,	// <u,3,3,7>: Cost 3 vzipr LHS, <1,5,3,7>
+  336380006U,	// <u,3,3,u>: Cost 1 vdup3 LHS
+  1487929446U,	// <u,3,4,0>: Cost 2 vext1 <1,u,3,4>, LHS
+  1487930752U,	// <u,3,4,1>: Cost 2 vext1 <1,u,3,4>, <1,u,3,4>
+  2623171644U,	// <u,3,4,2>: Cost 3 vext2 LHS, <4,2,6,0>
+  2561673366U,	// <u,3,4,3>: Cost 3 vext1 <1,u,3,4>, <3,0,1,2>
+  1487932726U,	// <u,3,4,4>: Cost 2 vext1 <1,u,3,4>, RHS
+  471043382U,	// <u,3,4,5>: Cost 1 vext2 LHS, RHS
+  1592561012U,	// <u,3,4,6>: Cost 2 vext2 LHS, <4,6,4,6>
+  2238368598U,	// <u,3,4,7>: Cost 3 vrev <3,u,7,4>
+  471043625U,	// <u,3,4,u>: Cost 1 vext2 LHS, RHS
+  2555707494U,	// <u,3,5,0>: Cost 3 vext1 <0,u,3,5>, LHS
+  1574645465U,	// <u,3,5,1>: Cost 2 vext2 <5,1,u,3>, <5,1,u,3>
+  2567653106U,	// <u,3,5,2>: Cost 3 vext1 <2,u,3,5>, <2,3,u,5>
+  2555709954U,	// <u,3,5,3>: Cost 3 vext1 <0,u,3,5>, <3,4,5,6>
+  1592561606U,	// <u,3,5,4>: Cost 2 vext2 LHS, <5,4,7,6>
+  1592561668U,	// <u,3,5,5>: Cost 2 vext2 LHS, <5,5,5,5>
+  1592561762U,	// <u,3,5,6>: Cost 2 vext2 LHS, <5,6,7,0>
+  1750314294U,	// <u,3,5,7>: Cost 2 vuzpr LHS, RHS
+  1750314295U,	// <u,3,5,u>: Cost 2 vuzpr LHS, RHS
+  2623172897U,	// <u,3,6,0>: Cost 3 vext2 LHS, <6,0,1,2>
+  2561688962U,	// <u,3,6,1>: Cost 3 vext1 <1,u,3,6>, <1,u,3,6>
+  1581281795U,	// <u,3,6,2>: Cost 2 vext2 <6,2,u,3>, <6,2,u,3>
+  2706541204U,	// <u,3,6,3>: Cost 3 vext3 <3,6,3,u>, <3,6,3,u>
+  2623173261U,	// <u,3,6,4>: Cost 3 vext2 LHS, <6,4,5,6>
+  1164495686U,	// <u,3,6,5>: Cost 2 vrev <3,u,5,6>
+  1592562488U,	// <u,3,6,6>: Cost 2 vext2 LHS, <6,6,6,6>
+  1592562510U,	// <u,3,6,7>: Cost 2 vext2 LHS, <6,7,0,1>
+  1164716897U,	// <u,3,6,u>: Cost 2 vrev <3,u,u,6>
+  1487954022U,	// <u,3,7,0>: Cost 2 vext1 <1,u,3,7>, LHS
+  1487955331U,	// <u,3,7,1>: Cost 2 vext1 <1,u,3,7>, <1,u,3,7>
+  1493928028U,	// <u,3,7,2>: Cost 2 vext1 <2,u,3,7>, <2,u,3,7>
+  2561697942U,	// <u,3,7,3>: Cost 3 vext1 <1,u,3,7>, <3,0,1,2>
+  1487957302U,	// <u,3,7,4>: Cost 2 vext1 <1,u,3,7>, RHS
+  2707352311U,	// <u,3,7,5>: Cost 3 vext3 <3,7,5,u>, <3,7,5,u>
+  2655024623U,	// <u,3,7,6>: Cost 3 vext2 <6,2,u,3>, <7,6,2,u>
+  1592563308U,	// <u,3,7,7>: Cost 2 vext2 LHS, <7,7,7,7>
+  1487959854U,	// <u,3,7,u>: Cost 2 vext1 <1,u,3,7>, LHS
+  1544787667U,	// <u,3,u,0>: Cost 2 vext2 LHS, <u,0,1,2>
+  471045934U,	// <u,3,u,1>: Cost 1 vext2 LHS, LHS
+  1549432709U,	// <u,3,u,2>: Cost 2 vext2 LHS, <u,2,3,0>
+  336380006U,	// <u,3,u,3>: Cost 1 vdup3 LHS
+  1544788031U,	// <u,3,u,4>: Cost 2 vext2 LHS, <u,4,5,6>
+  471046298U,	// <u,3,u,5>: Cost 1 vext2 LHS, RHS
+  1549433040U,	// <u,3,u,6>: Cost 2 vext2 LHS, <u,6,3,7>
+  1750314537U,	// <u,3,u,7>: Cost 2 vuzpr LHS, RHS
+  471046501U,	// <u,3,u,u>: Cost 1 vext2 LHS, LHS
+  2625167360U,	// <u,4,0,0>: Cost 3 vext2 <1,2,u,4>, <0,0,0,0>
+  1551425638U,	// <u,4,0,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  2619195630U,	// <u,4,0,2>: Cost 3 vext2 <0,2,u,4>, <0,2,u,4>
+  2619343104U,	// <u,4,0,3>: Cost 3 vext2 <0,3,1,4>, <0,3,1,4>
+  2625167698U,	// <u,4,0,4>: Cost 3 vext2 <1,2,u,4>, <0,4,1,5>
+  1638329234U,	// <u,4,0,5>: Cost 2 vext3 RHS, <4,0,5,1>
+  1638329244U,	// <u,4,0,6>: Cost 2 vext3 RHS, <4,0,6,2>
+  3787803556U,	// <u,4,0,7>: Cost 4 vext3 RHS, <4,0,7,1>
+  1551426205U,	// <u,4,0,u>: Cost 2 vext2 <1,2,u,4>, LHS
+  2555748454U,	// <u,4,1,0>: Cost 3 vext1 <0,u,4,1>, LHS
+  2625168180U,	// <u,4,1,1>: Cost 3 vext2 <1,2,u,4>, <1,1,1,1>
+  1551426503U,	// <u,4,1,2>: Cost 2 vext2 <1,2,u,4>, <1,2,u,4>
+  2625168344U,	// <u,4,1,3>: Cost 3 vext2 <1,2,u,4>, <1,3,1,3>
+  2555751734U,	// <u,4,1,4>: Cost 3 vext1 <0,u,4,1>, RHS
+  1860554038U,	// <u,4,1,5>: Cost 2 vzipl LHS, RHS
+  2689879022U,	// <u,4,1,6>: Cost 3 vext3 LHS, <4,1,6,3>
+  2592248852U,	// <u,4,1,7>: Cost 3 vext1 <7,0,4,1>, <7,0,4,1>
+  1555408301U,	// <u,4,1,u>: Cost 2 vext2 <1,u,u,4>, <1,u,u,4>
+  2555756646U,	// <u,4,2,0>: Cost 3 vext1 <0,u,4,2>, LHS
+  2625168943U,	// <u,4,2,1>: Cost 3 vext2 <1,2,u,4>, <2,1,4,u>
+  2625169000U,	// <u,4,2,2>: Cost 3 vext2 <1,2,u,4>, <2,2,2,2>
+  2619197134U,	// <u,4,2,3>: Cost 3 vext2 <0,2,u,4>, <2,3,4,5>
+  2555759926U,	// <u,4,2,4>: Cost 3 vext1 <0,u,4,2>, RHS
+  2712071222U,	// <u,4,2,5>: Cost 3 vext3 RHS, <4,2,5,3>
+  1994771766U,	// <u,4,2,6>: Cost 2 vtrnl LHS, RHS
+  2592257045U,	// <u,4,2,7>: Cost 3 vext1 <7,0,4,2>, <7,0,4,2>
+  1994771784U,	// <u,4,2,u>: Cost 2 vtrnl LHS, RHS
+  2625169558U,	// <u,4,3,0>: Cost 3 vext2 <1,2,u,4>, <3,0,1,2>
+  2567709594U,	// <u,4,3,1>: Cost 3 vext1 <2,u,4,3>, <1,2,3,4>
+  2567710817U,	// <u,4,3,2>: Cost 3 vext1 <2,u,4,3>, <2,u,4,3>
+  2625169820U,	// <u,4,3,3>: Cost 3 vext2 <1,2,u,4>, <3,3,3,3>
+  2625169922U,	// <u,4,3,4>: Cost 3 vext2 <1,2,u,4>, <3,4,5,6>
+  2954069710U,	// <u,4,3,5>: Cost 3 vzipr LHS, <2,3,4,5>
+  2954068172U,	// <u,4,3,6>: Cost 3 vzipr LHS, <0,2,4,6>
+  3903849472U,	// <u,4,3,7>: Cost 4 vuzpr <1,u,3,4>, <1,3,5,7>
+  2954068174U,	// <u,4,3,u>: Cost 3 vzipr LHS, <0,2,4,u>
+  1505919078U,	// <u,4,4,0>: Cost 2 vext1 <4,u,4,4>, LHS
+  2567717831U,	// <u,4,4,1>: Cost 3 vext1 <2,u,4,4>, <1,2,u,4>
+  2567719010U,	// <u,4,4,2>: Cost 3 vext1 <2,u,4,4>, <2,u,4,4>
+  2570373542U,	// <u,4,4,3>: Cost 3 vext1 <3,3,4,4>, <3,3,4,4>
+  161926454U,	// <u,4,4,4>: Cost 1 vdup0 RHS
+  1551428918U,	// <u,4,4,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  1638329572U,	// <u,4,4,6>: Cost 2 vext3 RHS, <4,4,6,6>
+  2594927963U,	// <u,4,4,7>: Cost 3 vext1 <7,4,4,4>, <7,4,4,4>
+  161926454U,	// <u,4,4,u>: Cost 1 vdup0 RHS
+  1493983334U,	// <u,4,5,0>: Cost 2 vext1 <2,u,4,5>, LHS
+  2689879301U,	// <u,4,5,1>: Cost 3 vext3 LHS, <4,5,1,3>
+  1493985379U,	// <u,4,5,2>: Cost 2 vext1 <2,u,4,5>, <2,u,4,5>
+  2567727254U,	// <u,4,5,3>: Cost 3 vext1 <2,u,4,5>, <3,0,1,2>
+  1493986614U,	// <u,4,5,4>: Cost 2 vext1 <2,u,4,5>, RHS
+  1863535926U,	// <u,4,5,5>: Cost 2 vzipl RHS, RHS
+  537750838U,	// <u,4,5,6>: Cost 1 vext3 LHS, RHS
+  2830110006U,	// <u,4,5,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537750856U,	// <u,4,5,u>: Cost 1 vext3 LHS, RHS
+  1482047590U,	// <u,4,6,0>: Cost 2 vext1 <0,u,4,6>, LHS
+  2555790070U,	// <u,4,6,1>: Cost 3 vext1 <0,u,4,6>, <1,0,3,2>
+  2555790952U,	// <u,4,6,2>: Cost 3 vext1 <0,u,4,6>, <2,2,2,2>
+  2555791510U,	// <u,4,6,3>: Cost 3 vext1 <0,u,4,6>, <3,0,1,2>
+  1482050870U,	// <u,4,6,4>: Cost 2 vext1 <0,u,4,6>, RHS
+  2689879422U,	// <u,4,6,5>: Cost 3 vext3 LHS, <4,6,5,7>
+  1997753654U,	// <u,4,6,6>: Cost 2 vtrnl RHS, RHS
+  2712071562U,	// <u,4,6,7>: Cost 3 vext3 RHS, <4,6,7,1>
+  1482053422U,	// <u,4,6,u>: Cost 2 vext1 <0,u,4,6>, LHS
+  2567741542U,	// <u,4,7,0>: Cost 3 vext1 <2,u,4,7>, LHS
+  2567742362U,	// <u,4,7,1>: Cost 3 vext1 <2,u,4,7>, <1,2,3,4>
+  2567743589U,	// <u,4,7,2>: Cost 3 vext1 <2,u,4,7>, <2,u,4,7>
+  2573716286U,	// <u,4,7,3>: Cost 3 vext1 <3,u,4,7>, <3,u,4,7>
+  2567744822U,	// <u,4,7,4>: Cost 3 vext1 <2,u,4,7>, RHS
+  2712071624U,	// <u,4,7,5>: Cost 3 vext3 RHS, <4,7,5,0>
+  96808489U,	// <u,4,7,6>: Cost 1 vrev RHS
+  2651715180U,	// <u,4,7,7>: Cost 3 vext2 <5,6,u,4>, <7,7,7,7>
+  96955963U,	// <u,4,7,u>: Cost 1 vrev RHS
+  1482063974U,	// <u,4,u,0>: Cost 2 vext1 <0,u,4,u>, LHS
+  1551431470U,	// <u,4,u,1>: Cost 2 vext2 <1,2,u,4>, LHS
+  1494009958U,	// <u,4,u,2>: Cost 2 vext1 <2,u,4,u>, <2,u,4,u>
+  2555807894U,	// <u,4,u,3>: Cost 3 vext1 <0,u,4,u>, <3,0,1,2>
+  161926454U,	// <u,4,u,4>: Cost 1 vdup0 RHS
+  1551431834U,	// <u,4,u,5>: Cost 2 vext2 <1,2,u,4>, RHS
+  537751081U,	// <u,4,u,6>: Cost 1 vext3 LHS, RHS
+  2830110249U,	// <u,4,u,7>: Cost 3 vuzpr <1,u,3,4>, RHS
+  537751099U,	// <u,4,u,u>: Cost 1 vext3 LHS, RHS
+  2631811072U,	// <u,5,0,0>: Cost 3 vext2 <2,3,u,5>, <0,0,0,0>
+  1558069350U,	// <u,5,0,1>: Cost 2 vext2 <2,3,u,5>, LHS
+  2619203823U,	// <u,5,0,2>: Cost 3 vext2 <0,2,u,5>, <0,2,u,5>
+  2619867456U,	// <u,5,0,3>: Cost 3 vext2 <0,3,u,5>, <0,3,u,5>
+  1546273106U,	// <u,5,0,4>: Cost 2 vext2 <0,4,1,5>, <0,4,1,5>
+  2733010539U,	// <u,5,0,5>: Cost 3 vext3 LHS, <5,0,5,1>
+  2597622682U,	// <u,5,0,6>: Cost 3 vext1 <7,u,5,0>, <6,7,u,5>
+  1176539396U,	// <u,5,0,7>: Cost 2 vrev <5,u,7,0>
+  1558069917U,	// <u,5,0,u>: Cost 2 vext2 <2,3,u,5>, LHS
+  1505968230U,	// <u,5,1,0>: Cost 2 vext1 <4,u,5,1>, LHS
+  2624512887U,	// <u,5,1,1>: Cost 3 vext2 <1,1,u,5>, <1,1,u,5>
+  2631811990U,	// <u,5,1,2>: Cost 3 vext2 <2,3,u,5>, <1,2,3,0>
+  2618541056U,	// <u,5,1,3>: Cost 3 vext2 <0,1,u,5>, <1,3,5,7>
+  1505971510U,	// <u,5,1,4>: Cost 2 vext1 <4,u,5,1>, RHS
+  2627167419U,	// <u,5,1,5>: Cost 3 vext2 <1,5,u,5>, <1,5,u,5>
+  2579714554U,	// <u,5,1,6>: Cost 3 vext1 <4,u,5,1>, <6,2,7,3>
+  1638330064U,	// <u,5,1,7>: Cost 2 vext3 RHS, <5,1,7,3>
+  1638477529U,	// <u,5,1,u>: Cost 2 vext3 RHS, <5,1,u,3>
+  2561802342U,	// <u,5,2,0>: Cost 3 vext1 <1,u,5,2>, LHS
+  2561803264U,	// <u,5,2,1>: Cost 3 vext1 <1,u,5,2>, <1,3,5,7>
+  2631149217U,	// <u,5,2,2>: Cost 3 vext2 <2,2,u,5>, <2,2,u,5>
+  1558071026U,	// <u,5,2,3>: Cost 2 vext2 <2,3,u,5>, <2,3,u,5>
+  2561805622U,	// <u,5,2,4>: Cost 3 vext1 <1,u,5,2>, RHS
+  2714062607U,	// <u,5,2,5>: Cost 3 vext3 RHS, <5,2,5,3>
+  2631813050U,	// <u,5,2,6>: Cost 3 vext2 <2,3,u,5>, <2,6,3,7>
+  3092335926U,	// <u,5,2,7>: Cost 3 vtrnr <0,u,0,2>, RHS
+  1561389191U,	// <u,5,2,u>: Cost 2 vext2 <2,u,u,5>, <2,u,u,5>
+  2561810534U,	// <u,5,3,0>: Cost 3 vext1 <1,u,5,3>, LHS
+  2561811857U,	// <u,5,3,1>: Cost 3 vext1 <1,u,5,3>, <1,u,5,3>
+  2631813474U,	// <u,5,3,2>: Cost 3 vext2 <2,3,u,5>, <3,2,5,u>
+  2631813532U,	// <u,5,3,3>: Cost 3 vext2 <2,3,u,5>, <3,3,3,3>
+  2619869698U,	// <u,5,3,4>: Cost 3 vext2 <0,3,u,5>, <3,4,5,6>
+  3001847002U,	// <u,5,3,5>: Cost 3 vzipr LHS, <4,4,5,5>
+  2954070530U,	// <u,5,3,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  2018749750U,	// <u,5,3,7>: Cost 2 vtrnr LHS, RHS
+  2018749751U,	// <u,5,3,u>: Cost 2 vtrnr LHS, RHS
+  2573762662U,	// <u,5,4,0>: Cost 3 vext1 <3,u,5,4>, LHS
+  2620017634U,	// <u,5,4,1>: Cost 3 vext2 <0,4,1,5>, <4,1,5,0>
+  2573764338U,	// <u,5,4,2>: Cost 3 vext1 <3,u,5,4>, <2,3,u,5>
+  2573765444U,	// <u,5,4,3>: Cost 3 vext1 <3,u,5,4>, <3,u,5,4>
+  1570680053U,	// <u,5,4,4>: Cost 2 vext2 <4,4,u,5>, <4,4,u,5>
+  1558072630U,	// <u,5,4,5>: Cost 2 vext2 <2,3,u,5>, RHS
+  2645749143U,	// <u,5,4,6>: Cost 3 vext2 <4,6,u,5>, <4,6,u,5>
+  1638330310U,	// <u,5,4,7>: Cost 2 vext3 RHS, <5,4,7,6>
+  1558072873U,	// <u,5,4,u>: Cost 2 vext2 <2,3,u,5>, RHS
+  1506000998U,	// <u,5,5,0>: Cost 2 vext1 <4,u,5,5>, LHS
+  2561827984U,	// <u,5,5,1>: Cost 3 vext1 <1,u,5,5>, <1,5,3,7>
+  2579744360U,	// <u,5,5,2>: Cost 3 vext1 <4,u,5,5>, <2,2,2,2>
+  2579744918U,	// <u,5,5,3>: Cost 3 vext1 <4,u,5,5>, <3,0,1,2>
+  1506004278U,	// <u,5,5,4>: Cost 2 vext1 <4,u,5,5>, RHS
+  229035318U,	// <u,5,5,5>: Cost 1 vdup1 RHS
+  2712072206U,	// <u,5,5,6>: Cost 3 vext3 RHS, <5,5,6,6>
+  1638330392U,	// <u,5,5,7>: Cost 2 vext3 RHS, <5,5,7,7>
+  229035318U,	// <u,5,5,u>: Cost 1 vdup1 RHS
+  1500037222U,	// <u,5,6,0>: Cost 2 vext1 <3,u,5,6>, LHS
+  2561836436U,	// <u,5,6,1>: Cost 3 vext1 <1,u,5,6>, <1,u,5,6>
+  2567809133U,	// <u,5,6,2>: Cost 3 vext1 <2,u,5,6>, <2,u,5,6>
+  1500040006U,	// <u,5,6,3>: Cost 2 vext1 <3,u,5,6>, <3,u,5,6>
+  1500040502U,	// <u,5,6,4>: Cost 2 vext1 <3,u,5,6>, RHS
+  2714062935U,	// <u,5,6,5>: Cost 3 vext3 RHS, <5,6,5,7>
+  2712072288U,	// <u,5,6,6>: Cost 3 vext3 RHS, <5,6,6,7>
+  27705344U,	// <u,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,6,u>: Cost 0 copy RHS
+  1488101478U,	// <u,5,7,0>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488102805U,	// <u,5,7,1>: Cost 2 vext1 <1,u,5,7>, <1,u,5,7>
+  2561844840U,	// <u,5,7,2>: Cost 3 vext1 <1,u,5,7>, <2,2,2,2>
+  2561845398U,	// <u,5,7,3>: Cost 3 vext1 <1,u,5,7>, <3,0,1,2>
+  1488104758U,	// <u,5,7,4>: Cost 2 vext1 <1,u,5,7>, RHS
+  1638330536U,	// <u,5,7,5>: Cost 2 vext3 RHS, <5,7,5,7>
+  2712072362U,	// <u,5,7,6>: Cost 3 vext3 RHS, <5,7,6,0>
+  2042965302U,	// <u,5,7,7>: Cost 2 vtrnr RHS, RHS
+  1488107310U,	// <u,5,7,u>: Cost 2 vext1 <1,u,5,7>, LHS
+  1488109670U,	// <u,5,u,0>: Cost 2 vext1 <1,u,5,u>, LHS
+  1488110998U,	// <u,5,u,1>: Cost 2 vext1 <1,u,5,u>, <1,u,5,u>
+  2561853032U,	// <u,5,u,2>: Cost 3 vext1 <1,u,5,u>, <2,2,2,2>
+  1500056392U,	// <u,5,u,3>: Cost 2 vext1 <3,u,5,u>, <3,u,5,u>
+  1488112950U,	// <u,5,u,4>: Cost 2 vext1 <1,u,5,u>, RHS
+  229035318U,	// <u,5,u,5>: Cost 1 vdup1 RHS
+  2954111490U,	// <u,5,u,6>: Cost 3 vzipr LHS, <3,4,5,6>
+  27705344U,	// <u,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,u,u>: Cost 0 copy RHS
+  2619211776U,	// <u,6,0,0>: Cost 3 vext2 <0,2,u,6>, <0,0,0,0>
+  1545470054U,	// <u,6,0,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1545470192U,	// <u,6,0,2>: Cost 2 vext2 <0,2,u,6>, <0,2,u,6>
+  2255958969U,	// <u,6,0,3>: Cost 3 vrev <6,u,3,0>
+  1546797458U,	// <u,6,0,4>: Cost 2 vext2 <0,4,u,6>, <0,4,u,6>
+  2720624971U,	// <u,6,0,5>: Cost 3 vext3 <6,0,5,u>, <6,0,5,u>
+  2256180180U,	// <u,6,0,6>: Cost 3 vrev <6,u,6,0>
+  2960682294U,	// <u,6,0,7>: Cost 3 vzipr <1,2,u,0>, RHS
+  1545470621U,	// <u,6,0,u>: Cost 2 vext2 <0,2,u,6>, LHS
+  1182004127U,	// <u,6,1,0>: Cost 2 vrev <6,u,0,1>
+  2619212596U,	// <u,6,1,1>: Cost 3 vext2 <0,2,u,6>, <1,1,1,1>
+  2619212694U,	// <u,6,1,2>: Cost 3 vext2 <0,2,u,6>, <1,2,3,0>
+  2619212760U,	// <u,6,1,3>: Cost 3 vext2 <0,2,u,6>, <1,3,1,3>
+  2626511979U,	// <u,6,1,4>: Cost 3 vext2 <1,4,u,6>, <1,4,u,6>
+  2619212944U,	// <u,6,1,5>: Cost 3 vext2 <0,2,u,6>, <1,5,3,7>
+  2714063264U,	// <u,6,1,6>: Cost 3 vext3 RHS, <6,1,6,3>
+  2967326006U,	// <u,6,1,7>: Cost 3 vzipr <2,3,u,1>, RHS
+  1182594023U,	// <u,6,1,u>: Cost 2 vrev <6,u,u,1>
+  1506050150U,	// <u,6,2,0>: Cost 2 vext1 <4,u,6,2>, LHS
+  2579792630U,	// <u,6,2,1>: Cost 3 vext1 <4,u,6,2>, <1,0,3,2>
+  2619213416U,	// <u,6,2,2>: Cost 3 vext2 <0,2,u,6>, <2,2,2,2>
+  2619213478U,	// <u,6,2,3>: Cost 3 vext2 <0,2,u,6>, <2,3,0,1>
+  1506053430U,	// <u,6,2,4>: Cost 2 vext1 <4,u,6,2>, RHS
+  2633148309U,	// <u,6,2,5>: Cost 3 vext2 <2,5,u,6>, <2,5,u,6>
+  2619213754U,	// <u,6,2,6>: Cost 3 vext2 <0,2,u,6>, <2,6,3,7>
+  1638330874U,	// <u,6,2,7>: Cost 2 vext3 RHS, <6,2,7,3>
+  1638478339U,	// <u,6,2,u>: Cost 2 vext3 RHS, <6,2,u,3>
+  2619213974U,	// <u,6,3,0>: Cost 3 vext2 <0,2,u,6>, <3,0,1,2>
+  2255836074U,	// <u,6,3,1>: Cost 3 vrev <6,u,1,3>
+  2255909811U,	// <u,6,3,2>: Cost 3 vrev <6,u,2,3>
+  2619214236U,	// <u,6,3,3>: Cost 3 vext2 <0,2,u,6>, <3,3,3,3>
+  1564715549U,	// <u,6,3,4>: Cost 2 vext2 <3,4,u,6>, <3,4,u,6>
+  2639121006U,	// <u,6,3,5>: Cost 3 vext2 <3,5,u,6>, <3,5,u,6>
+  3001847012U,	// <u,6,3,6>: Cost 3 vzipr LHS, <4,4,6,6>
+  1880329526U,	// <u,6,3,7>: Cost 2 vzipr LHS, RHS
+  1880329527U,	// <u,6,3,u>: Cost 2 vzipr LHS, RHS
+  2567864422U,	// <u,6,4,0>: Cost 3 vext1 <2,u,6,4>, LHS
+  2733011558U,	// <u,6,4,1>: Cost 3 vext3 LHS, <6,4,1,3>
+  2567866484U,	// <u,6,4,2>: Cost 3 vext1 <2,u,6,4>, <2,u,6,4>
+  2638458005U,	// <u,6,4,3>: Cost 3 vext2 <3,4,u,6>, <4,3,6,u>
+  1570540772U,	// <u,6,4,4>: Cost 2 vext2 <4,4,6,6>, <4,4,6,6>
+  1545473334U,	// <u,6,4,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  1572015512U,	// <u,6,4,6>: Cost 2 vext2 <4,6,u,6>, <4,6,u,6>
+  2960715062U,	// <u,6,4,7>: Cost 3 vzipr <1,2,u,4>, RHS
+  1545473577U,	// <u,6,4,u>: Cost 2 vext2 <0,2,u,6>, RHS
+  2567872614U,	// <u,6,5,0>: Cost 3 vext1 <2,u,6,5>, LHS
+  2645757648U,	// <u,6,5,1>: Cost 3 vext2 <4,6,u,6>, <5,1,7,3>
+  2567874490U,	// <u,6,5,2>: Cost 3 vext1 <2,u,6,5>, <2,6,3,7>
+  2576501250U,	// <u,6,5,3>: Cost 3 vext1 <4,3,6,5>, <3,4,5,6>
+  1576660943U,	// <u,6,5,4>: Cost 2 vext2 <5,4,u,6>, <5,4,u,6>
+  2645757956U,	// <u,6,5,5>: Cost 3 vext2 <4,6,u,6>, <5,5,5,5>
+  2645758050U,	// <u,6,5,6>: Cost 3 vext2 <4,6,u,6>, <5,6,7,0>
+  2824080694U,	// <u,6,5,7>: Cost 3 vuzpr <0,u,2,6>, RHS
+  1182626795U,	// <u,6,5,u>: Cost 2 vrev <6,u,u,5>
+  1506082918U,	// <u,6,6,0>: Cost 2 vext1 <4,u,6,6>, LHS
+  2579825398U,	// <u,6,6,1>: Cost 3 vext1 <4,u,6,6>, <1,0,3,2>
+  2645758458U,	// <u,6,6,2>: Cost 3 vext2 <4,6,u,6>, <6,2,7,3>
+  2579826838U,	// <u,6,6,3>: Cost 3 vext1 <4,u,6,6>, <3,0,1,2>
+  1506086198U,	// <u,6,6,4>: Cost 2 vext1 <4,u,6,6>, RHS
+  2579828432U,	// <u,6,6,5>: Cost 3 vext1 <4,u,6,6>, <5,1,7,3>
+  296144182U,	// <u,6,6,6>: Cost 1 vdup2 RHS
+  1638331202U,	// <u,6,6,7>: Cost 2 vext3 RHS, <6,6,7,7>
+  296144182U,	// <u,6,6,u>: Cost 1 vdup2 RHS
+  432349286U,	// <u,6,7,0>: Cost 1 vext1 RHS, LHS
+  1506091766U,	// <u,6,7,1>: Cost 2 vext1 RHS, <1,0,3,2>
+  1506092648U,	// <u,6,7,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506093206U,	// <u,6,7,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432352809U,	// <u,6,7,4>: Cost 1 vext1 RHS, RHS
+  1506094800U,	// <u,6,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  1506095610U,	// <u,6,7,6>: Cost 2 vext1 RHS, <6,2,7,3>
+  1906904374U,	// <u,6,7,7>: Cost 2 vzipr RHS, RHS
+  432355118U,	// <u,6,7,u>: Cost 1 vext1 RHS, LHS
+  432357478U,	// <u,6,u,0>: Cost 1 vext1 RHS, LHS
+  1545475886U,	// <u,6,u,1>: Cost 2 vext2 <0,2,u,6>, LHS
+  1506100840U,	// <u,6,u,2>: Cost 2 vext1 RHS, <2,2,2,2>
+  1506101398U,	// <u,6,u,3>: Cost 2 vext1 RHS, <3,0,1,2>
+  432361002U,	// <u,6,u,4>: Cost 1 vext1 RHS, RHS
+  1545476250U,	// <u,6,u,5>: Cost 2 vext2 <0,2,u,6>, RHS
+  296144182U,	// <u,6,u,6>: Cost 1 vdup2 RHS
+  1880370486U,	// <u,6,u,7>: Cost 2 vzipr LHS, RHS
+  432363310U,	// <u,6,u,u>: Cost 1 vext1 RHS, LHS
+  1571356672U,	// <u,7,0,0>: Cost 2 vext2 RHS, <0,0,0,0>
+  497614950U,	// <u,7,0,1>: Cost 1 vext2 RHS, LHS
+  1571356836U,	// <u,7,0,2>: Cost 2 vext2 RHS, <0,2,0,2>
+  2573880146U,	// <u,7,0,3>: Cost 3 vext1 <3,u,7,0>, <3,u,7,0>
+  1571357010U,	// <u,7,0,4>: Cost 2 vext2 RHS, <0,4,1,5>
+  1512083716U,	// <u,7,0,5>: Cost 2 vext1 <5,u,7,0>, <5,u,7,0>
+  2621874741U,	// <u,7,0,6>: Cost 3 vext2 <0,6,u,7>, <0,6,u,7>
+  2585826298U,	// <u,7,0,7>: Cost 3 vext1 <5,u,7,0>, <7,0,1,2>
+  497615517U,	// <u,7,0,u>: Cost 1 vext2 RHS, LHS
+  1571357430U,	// <u,7,1,0>: Cost 2 vext2 RHS, <1,0,3,2>
+  1571357492U,	// <u,7,1,1>: Cost 2 vext2 RHS, <1,1,1,1>
+  1571357590U,	// <u,7,1,2>: Cost 2 vext2 RHS, <1,2,3,0>
+  1552114715U,	// <u,7,1,3>: Cost 2 vext2 <1,3,u,7>, <1,3,u,7>
+  2573888822U,	// <u,7,1,4>: Cost 3 vext1 <3,u,7,1>, RHS
+  1553441981U,	// <u,7,1,5>: Cost 2 vext2 <1,5,u,7>, <1,5,u,7>
+  2627847438U,	// <u,7,1,6>: Cost 3 vext2 <1,6,u,7>, <1,6,u,7>
+  2727408775U,	// <u,7,1,7>: Cost 3 vext3 <7,1,7,u>, <7,1,7,u>
+  1555432880U,	// <u,7,1,u>: Cost 2 vext2 <1,u,u,7>, <1,u,u,7>
+  2629838337U,	// <u,7,2,0>: Cost 3 vext2 <2,0,u,7>, <2,0,u,7>
+  1188058754U,	// <u,7,2,1>: Cost 2 vrev <7,u,1,2>
+  1571358312U,	// <u,7,2,2>: Cost 2 vext2 RHS, <2,2,2,2>
+  1571358374U,	// <u,7,2,3>: Cost 2 vext2 RHS, <2,3,0,1>
+  2632492869U,	// <u,7,2,4>: Cost 3 vext2 <2,4,u,7>, <2,4,u,7>
+  2633156502U,	// <u,7,2,5>: Cost 3 vext2 <2,5,u,7>, <2,5,u,7>
+  1560078311U,	// <u,7,2,6>: Cost 2 vext2 <2,6,u,7>, <2,6,u,7>
+  2728072408U,	// <u,7,2,7>: Cost 3 vext3 <7,2,7,u>, <7,2,7,u>
+  1561405577U,	// <u,7,2,u>: Cost 2 vext2 <2,u,u,7>, <2,u,u,7>
+  1571358870U,	// <u,7,3,0>: Cost 2 vext2 RHS, <3,0,1,2>
+  2627184913U,	// <u,7,3,1>: Cost 3 vext2 <1,5,u,7>, <3,1,5,u>
+  2633820523U,	// <u,7,3,2>: Cost 3 vext2 <2,6,u,7>, <3,2,6,u>
+  1571359132U,	// <u,7,3,3>: Cost 2 vext2 RHS, <3,3,3,3>
+  1571359234U,	// <u,7,3,4>: Cost 2 vext2 RHS, <3,4,5,6>
+  1512108295U,	// <u,7,3,5>: Cost 2 vext1 <5,u,7,3>, <5,u,7,3>
+  1518080992U,	// <u,7,3,6>: Cost 2 vext1 <6,u,7,3>, <6,u,7,3>
+  2640456465U,	// <u,7,3,7>: Cost 3 vext2 <3,7,u,7>, <3,7,u,7>
+  1571359518U,	// <u,7,3,u>: Cost 2 vext2 RHS, <3,u,1,2>
+  1571359634U,	// <u,7,4,0>: Cost 2 vext2 RHS, <4,0,5,1>
+  2573911067U,	// <u,7,4,1>: Cost 3 vext1 <3,u,7,4>, <1,3,u,7>
+  2645101622U,	// <u,7,4,2>: Cost 3 vext2 RHS, <4,2,5,3>
+  2573912918U,	// <u,7,4,3>: Cost 3 vext1 <3,u,7,4>, <3,u,7,4>
+  1571359952U,	// <u,7,4,4>: Cost 2 vext2 RHS, <4,4,4,4>
+  497618248U,	// <u,7,4,5>: Cost 1 vext2 RHS, RHS
+  1571360116U,	// <u,7,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  2645102024U,	// <u,7,4,7>: Cost 3 vext2 RHS, <4,7,5,0>
+  497618473U,	// <u,7,4,u>: Cost 1 vext2 RHS, RHS
+  2645102152U,	// <u,7,5,0>: Cost 3 vext2 RHS, <5,0,1,2>
+  1571360464U,	// <u,7,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  2645102334U,	// <u,7,5,2>: Cost 3 vext2 RHS, <5,2,3,4>
+  2645102447U,	// <u,7,5,3>: Cost 3 vext2 RHS, <5,3,7,0>
+  1571360710U,	// <u,7,5,4>: Cost 2 vext2 RHS, <5,4,7,6>
+  1571360772U,	// <u,7,5,5>: Cost 2 vext2 RHS, <5,5,5,5>
+  1571360866U,	// <u,7,5,6>: Cost 2 vext2 RHS, <5,6,7,0>
+  1571360936U,	// <u,7,5,7>: Cost 2 vext2 RHS, <5,7,5,7>
+  1571361017U,	// <u,7,5,u>: Cost 2 vext2 RHS, <5,u,5,7>
+  1530044518U,	// <u,7,6,0>: Cost 2 vext1 <u,u,7,6>, LHS
+  2645103016U,	// <u,7,6,1>: Cost 3 vext2 RHS, <6,1,7,2>
+  1571361274U,	// <u,7,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  2645103154U,	// <u,7,6,3>: Cost 3 vext2 RHS, <6,3,4,5>
+  1530047798U,	// <u,7,6,4>: Cost 2 vext1 <u,u,7,6>, RHS
+  1188386474U,	// <u,7,6,5>: Cost 2 vrev <7,u,5,6>
+  1571361592U,	// <u,7,6,6>: Cost 2 vext2 RHS, <6,6,6,6>
+  1571361614U,	// <u,7,6,7>: Cost 2 vext2 RHS, <6,7,0,1>
+  1571361695U,	// <u,7,6,u>: Cost 2 vext2 RHS, <6,u,0,1>
+  1571361786U,	// <u,7,7,0>: Cost 2 vext2 RHS, <7,0,1,2>
+  2573935616U,	// <u,7,7,1>: Cost 3 vext1 <3,u,7,7>, <1,3,5,7>
+  2645103781U,	// <u,7,7,2>: Cost 3 vext2 RHS, <7,2,2,2>
+  2573937497U,	// <u,7,7,3>: Cost 3 vext1 <3,u,7,7>, <3,u,7,7>
+  1571362150U,	// <u,7,7,4>: Cost 2 vext2 RHS, <7,4,5,6>
+  1512141067U,	// <u,7,7,5>: Cost 2 vext1 <5,u,7,7>, <5,u,7,7>
+  1518113764U,	// <u,7,7,6>: Cost 2 vext1 <6,u,7,7>, <6,u,7,7>
+  363253046U,	// <u,7,7,7>: Cost 1 vdup3 RHS
+  363253046U,	// <u,7,7,u>: Cost 1 vdup3 RHS
+  1571362515U,	// <u,7,u,0>: Cost 2 vext2 RHS, <u,0,1,2>
+  497620782U,	// <u,7,u,1>: Cost 1 vext2 RHS, LHS
+  1571362693U,	// <u,7,u,2>: Cost 2 vext2 RHS, <u,2,3,0>
+  1571362748U,	// <u,7,u,3>: Cost 2 vext2 RHS, <u,3,0,1>
+  1571362879U,	// <u,7,u,4>: Cost 2 vext2 RHS, <u,4,5,6>
+  497621146U,	// <u,7,u,5>: Cost 1 vext2 RHS, RHS
+  1571363024U,	// <u,7,u,6>: Cost 2 vext2 RHS, <u,6,3,7>
+  363253046U,	// <u,7,u,7>: Cost 1 vdup3 RHS
+  497621349U,	// <u,7,u,u>: Cost 1 vext2 RHS, LHS
+  135053414U,	// <u,u,0,0>: Cost 1 vdup0 LHS
+  471081121U,	// <u,u,0,1>: Cost 1 vext2 LHS, LHS
+  1544822948U,	// <u,u,0,2>: Cost 2 vext2 LHS, <0,2,0,2>
+  1616140005U,	// <u,u,0,3>: Cost 2 vext3 LHS, <u,0,3,2>
+  1544823122U,	// <u,u,0,4>: Cost 2 vext2 LHS, <0,4,1,5>
+  1512157453U,	// <u,u,0,5>: Cost 2 vext1 <5,u,u,0>, <5,u,u,0>
+  1662220032U,	// <u,u,0,6>: Cost 2 vext3 RHS, <u,0,6,2>
+  1194457487U,	// <u,u,0,7>: Cost 2 vrev <u,u,7,0>
+  471081629U,	// <u,u,0,u>: Cost 1 vext2 LHS, LHS
+  1544823542U,	// <u,u,1,0>: Cost 2 vext2 LHS, <1,0,3,2>
+  202162278U,	// <u,u,1,1>: Cost 1 vdup1 LHS
+  537753390U,	// <u,u,1,2>: Cost 1 vext3 LHS, LHS
+  1544823768U,	// <u,u,1,3>: Cost 2 vext2 LHS, <1,3,1,3>
+  1494248758U,	// <u,u,1,4>: Cost 2 vext1 <2,u,u,1>, RHS
+  1544823952U,	// <u,u,1,5>: Cost 2 vext2 LHS, <1,5,3,7>
+  1518138343U,	// <u,u,1,6>: Cost 2 vext1 <6,u,u,1>, <6,u,u,1>
+  1640322907U,	// <u,u,1,7>: Cost 2 vext3 RHS, <u,1,7,3>
+  537753444U,	// <u,u,1,u>: Cost 1 vext3 LHS, LHS
+  1482309734U,	// <u,u,2,0>: Cost 2 vext1 <0,u,u,2>, LHS
+  1194031451U,	// <u,u,2,1>: Cost 2 vrev <u,u,1,2>
+  269271142U,	// <u,u,2,2>: Cost 1 vdup2 LHS
+  835584U,	// <u,u,2,3>: Cost 0 copy LHS
+  1482313014U,	// <u,u,2,4>: Cost 2 vext1 <0,u,u,2>, RHS
+  2618566504U,	// <u,u,2,5>: Cost 3 vext2 LHS, <2,5,3,6>
+  1544824762U,	// <u,u,2,6>: Cost 2 vext2 LHS, <2,6,3,7>
+  1638479788U,	// <u,u,2,7>: Cost 2 vext3 RHS, <u,2,7,3>
+  835584U,	// <u,u,2,u>: Cost 0 copy LHS
+  408576723U,	// <u,u,3,0>: Cost 1 vext1 LHS, LHS
+  1482318582U,	// <u,u,3,1>: Cost 2 vext1 LHS, <1,0,3,2>
+  120371557U,	// <u,u,3,2>: Cost 1 vrev LHS
+  336380006U,	// <u,u,3,3>: Cost 1 vdup3 LHS
+  408579382U,	// <u,u,3,4>: Cost 1 vext1 LHS, RHS
+  1616140271U,	// <u,u,3,5>: Cost 2 vext3 LHS, <u,3,5,7>
+  1530098170U,	// <u,u,3,6>: Cost 2 vext1 LHS, <6,2,7,3>
+  1880329544U,	// <u,u,3,7>: Cost 2 vzipr LHS, RHS
+  408581934U,	// <u,u,3,u>: Cost 1 vext1 LHS, LHS
+  1488298086U,	// <u,u,4,0>: Cost 2 vext1 <1,u,u,4>, LHS
+  1488299437U,	// <u,u,4,1>: Cost 2 vext1 <1,u,u,4>, <1,u,u,4>
+  1659271204U,	// <u,u,4,2>: Cost 2 vext3 LHS, <u,4,2,6>
+  1194195311U,	// <u,u,4,3>: Cost 2 vrev <u,u,3,4>
+  161926454U,	// <u,u,4,4>: Cost 1 vdup0 RHS
+  471084342U,	// <u,u,4,5>: Cost 1 vext2 LHS, RHS
+  1571368308U,	// <u,u,4,6>: Cost 2 vext2 RHS, <4,6,4,6>
+  1640323153U,	// <u,u,4,7>: Cost 2 vext3 RHS, <u,4,7,6>
+  471084585U,	// <u,u,4,u>: Cost 1 vext2 LHS, RHS
+  1494278246U,	// <u,u,5,0>: Cost 2 vext1 <2,u,u,5>, LHS
+  1571368656U,	// <u,u,5,1>: Cost 2 vext2 RHS, <5,1,7,3>
+  1494280327U,	// <u,u,5,2>: Cost 2 vext1 <2,u,u,5>, <2,u,u,5>
+  1616140415U,	// <u,u,5,3>: Cost 2 vext3 LHS, <u,5,3,7>
+  1494281526U,	// <u,u,5,4>: Cost 2 vext1 <2,u,u,5>, RHS
+  229035318U,	// <u,u,5,5>: Cost 1 vdup1 RHS
+  537753754U,	// <u,u,5,6>: Cost 1 vext3 LHS, RHS
+  1750355254U,	// <u,u,5,7>: Cost 2 vuzpr LHS, RHS
+  537753772U,	// <u,u,5,u>: Cost 1 vext3 LHS, RHS
+  1482342502U,	// <u,u,6,0>: Cost 2 vext1 <0,u,u,6>, LHS
+  2556084982U,	// <u,u,6,1>: Cost 3 vext1 <0,u,u,6>, <1,0,3,2>
+  1571369466U,	// <u,u,6,2>: Cost 2 vext2 RHS, <6,2,7,3>
+  1611938000U,	// <u,u,6,3>: Cost 2 vext3 LHS, <u,6,3,7>
+  1482345782U,	// <u,u,6,4>: Cost 2 vext1 <0,u,u,6>, RHS
+  1194359171U,	// <u,u,6,5>: Cost 2 vrev <u,u,5,6>
+  296144182U,	// <u,u,6,6>: Cost 1 vdup2 RHS
+  27705344U,	// <u,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,u,6,u>: Cost 0 copy RHS
+  432496742U,	// <u,u,7,0>: Cost 1 vext1 RHS, LHS
+  1488324016U,	// <u,u,7,1>: Cost 2 vext1 <1,u,u,7>, <1,u,u,7>
+  1494296713U,	// <u,u,7,2>: Cost 2 vext1 <2,u,u,7>, <2,u,u,7>
+  1906901148U,	// <u,u,7,3>: Cost 2 vzipr RHS, LHS
+  432500283U,	// <u,u,7,4>: Cost 1 vext1 RHS, RHS
+  1506242256U,	// <u,u,7,5>: Cost 2 vext1 RHS, <5,1,7,3>
+  120699277U,	// <u,u,7,6>: Cost 1 vrev RHS
+  363253046U,	// <u,u,7,7>: Cost 1 vdup3 RHS
+  432502574U,	// <u,u,7,u>: Cost 1 vext1 RHS, LHS
+  408617688U,	// <u,u,u,0>: Cost 1 vext1 LHS, LHS
+  471086894U,	// <u,u,u,1>: Cost 1 vext2 LHS, LHS
+  537753957U,	// <u,u,u,2>: Cost 1 vext3 LHS, LHS
+  835584U,	// <u,u,u,3>: Cost 0 copy LHS
+  408620342U,	// <u,u,u,4>: Cost 1 vext1 LHS, RHS
+  471087258U,	// <u,u,u,5>: Cost 1 vext2 LHS, RHS
+  537753997U,	// <u,u,u,6>: Cost 1 vext3 LHS, RHS
+  27705344U,	// <u,u,u,7>: Cost 0 copy RHS
+  835584U,	// <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/lib/Target/ARM/ARMRegisterInfo.cpp b/lib/Target/ARM/ARMRegisterInfo.cpp
new file mode 100644
index 0000000..d5bc3f6
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.cpp
@@ -0,0 +1,41 @@
+//===- ARMRegisterInfo.cpp - ARM Register Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMRegisterInfo.h"
+#include "ARMSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+ARMRegisterInfo::ARMRegisterInfo(const ARMBaseInstrInfo &tii,
+                                 const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(tii, sti) {
+}
diff --git a/lib/Target/ARM/ARMRegisterInfo.h b/lib/Target/ARM/ARMRegisterInfo.h
new file mode 100644
index 0000000..041afd0
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.h
@@ -0,0 +1,43 @@
+//===- ARMRegisterInfo.h - ARM Register Information Impl --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the ARM implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMREGISTERINFO_H
+#define ARMREGISTERINFO_H
+
+#include "ARM.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "ARMBaseRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+  class Type;
+
+namespace ARM {
+  /// SubregIndex - The index of various subregister classes. Note that 
+  /// these indices must be kept in sync with the class indices in the 
+  /// ARMRegisterInfo.td file.
+  enum SubregIndex {
+    SSUBREG_0 = 1, SSUBREG_1 = 2, SSUBREG_2 = 3, SSUBREG_3 = 4,
+    DSUBREG_0 = 5, DSUBREG_1 = 6
+  };
+}
+
+struct ARMRegisterInfo : public ARMBaseRegisterInfo {
+public:
+  ARMRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
new file mode 100644
index 0000000..0d4200c
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -0,0 +1,410 @@
+//===- ARMRegisterInfo.td - ARM Register defs -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the ARM register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 4-bit ID numbers.
+class ARMReg<bits<4> num, string n, list<Register> subregs = []> : Register<n> {
+  field bits<4> Num;
+  let Namespace = "ARM";
+  let SubRegs = subregs;
+}
+
+class ARMFReg<bits<6> num, string n> : Register<n> {
+  field bits<6> Num;
+  let Namespace = "ARM";
+}
+
+// Integer registers
+def R0  : ARMReg< 0, "r0">,  DwarfRegNum<[0]>;
+def R1  : ARMReg< 1, "r1">,  DwarfRegNum<[1]>;
+def R2  : ARMReg< 2, "r2">,  DwarfRegNum<[2]>;
+def R3  : ARMReg< 3, "r3">,  DwarfRegNum<[3]>;
+def R4  : ARMReg< 4, "r4">,  DwarfRegNum<[4]>;
+def R5  : ARMReg< 5, "r5">,  DwarfRegNum<[5]>;
+def R6  : ARMReg< 6, "r6">,  DwarfRegNum<[6]>;
+def R7  : ARMReg< 7, "r7">,  DwarfRegNum<[7]>;
+def R8  : ARMReg< 8, "r8">,  DwarfRegNum<[8]>;
+def R9  : ARMReg< 9, "r9">,  DwarfRegNum<[9]>;
+def R10 : ARMReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : ARMReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : ARMReg<12, "r12">, DwarfRegNum<[12]>;
+def SP  : ARMReg<13, "sp">,  DwarfRegNum<[13]>;
+def LR  : ARMReg<14, "lr">,  DwarfRegNum<[14]>;
+def PC  : ARMReg<15, "pc">,  DwarfRegNum<[15]>;
+
+// Float registers
+def S0  : ARMFReg< 0, "s0">;  def S1  : ARMFReg< 1, "s1">;
+def S2  : ARMFReg< 2, "s2">;  def S3  : ARMFReg< 3, "s3">;
+def S4  : ARMFReg< 4, "s4">;  def S5  : ARMFReg< 5, "s5">;
+def S6  : ARMFReg< 6, "s6">;  def S7  : ARMFReg< 7, "s7">;
+def S8  : ARMFReg< 8, "s8">;  def S9  : ARMFReg< 9, "s9">;
+def S10 : ARMFReg<10, "s10">; def S11 : ARMFReg<11, "s11">;
+def S12 : ARMFReg<12, "s12">; def S13 : ARMFReg<13, "s13">;
+def S14 : ARMFReg<14, "s14">; def S15 : ARMFReg<15, "s15">;
+def S16 : ARMFReg<16, "s16">; def S17 : ARMFReg<17, "s17">;
+def S18 : ARMFReg<18, "s18">; def S19 : ARMFReg<19, "s19">;
+def S20 : ARMFReg<20, "s20">; def S21 : ARMFReg<21, "s21">;
+def S22 : ARMFReg<22, "s22">; def S23 : ARMFReg<23, "s23">;
+def S24 : ARMFReg<24, "s24">; def S25 : ARMFReg<25, "s25">;
+def S26 : ARMFReg<26, "s26">; def S27 : ARMFReg<27, "s27">;
+def S28 : ARMFReg<28, "s28">; def S29 : ARMFReg<29, "s29">;
+def S30 : ARMFReg<30, "s30">; def S31 : ARMFReg<31, "s31">;
+def SDummy : ARMFReg<63, "sINVALID">;
+
+// Aliases of the S* registers used to hold 64-bit fp values (doubles)
+def D0  : ARMReg< 0,  "d0", [S0,   S1]>;
+def D1  : ARMReg< 1,  "d1", [S2,   S3]>;
+def D2  : ARMReg< 2,  "d2", [S4,   S5]>;
+def D3  : ARMReg< 3,  "d3", [S6,   S7]>;
+def D4  : ARMReg< 4,  "d4", [S8,   S9]>;
+def D5  : ARMReg< 5,  "d5", [S10, S11]>;
+def D6  : ARMReg< 6,  "d6", [S12, S13]>;
+def D7  : ARMReg< 7,  "d7", [S14, S15]>;
+def D8  : ARMReg< 8,  "d8", [S16, S17]>;
+def D9  : ARMReg< 9,  "d9", [S18, S19]>;
+def D10 : ARMReg<10, "d10", [S20, S21]>;
+def D11 : ARMReg<11, "d11", [S22, S23]>;
+def D12 : ARMReg<12, "d12", [S24, S25]>;
+def D13 : ARMReg<13, "d13", [S26, S27]>;
+def D14 : ARMReg<14, "d14", [S28, S29]>;
+def D15 : ARMReg<15, "d15", [S30, S31]>;
+
+// VFP3 defines 16 additional double registers
+def D16 : ARMFReg<16, "d16">; def D17 : ARMFReg<17, "d17">;
+def D18 : ARMFReg<18, "d18">; def D19 : ARMFReg<19, "d19">;
+def D20 : ARMFReg<20, "d20">; def D21 : ARMFReg<21, "d21">;
+def D22 : ARMFReg<22, "d22">; def D23 : ARMFReg<23, "d23">;
+def D24 : ARMFReg<24, "d24">; def D25 : ARMFReg<25, "d25">;
+def D26 : ARMFReg<26, "d26">; def D27 : ARMFReg<27, "d27">;
+def D28 : ARMFReg<28, "d28">; def D29 : ARMFReg<29, "d29">;
+def D30 : ARMFReg<30, "d30">; def D31 : ARMFReg<31, "d31">;
+
+// Advanced SIMD (NEON) defines 16 quad-word aliases
+def Q0  : ARMReg< 0,  "q0", [D0,   D1]>;
+def Q1  : ARMReg< 1,  "q1", [D2,   D3]>;
+def Q2  : ARMReg< 2,  "q2", [D4,   D5]>;
+def Q3  : ARMReg< 3,  "q3", [D6,   D7]>;
+def Q4  : ARMReg< 4,  "q4", [D8,   D9]>;
+def Q5  : ARMReg< 5,  "q5", [D10, D11]>;
+def Q6  : ARMReg< 6,  "q6", [D12, D13]>;
+def Q7  : ARMReg< 7,  "q7", [D14, D15]>;
+def Q8  : ARMReg< 8,  "q8", [D16, D17]>;
+def Q9  : ARMReg< 9,  "q9", [D18, D19]>;
+def Q10 : ARMReg<10, "q10", [D20, D21]>;
+def Q11 : ARMReg<11, "q11", [D22, D23]>;
+def Q12 : ARMReg<12, "q12", [D24, D25]>;
+def Q13 : ARMReg<13, "q13", [D26, D27]>;
+def Q14 : ARMReg<14, "q14", [D28, D29]>;
+def Q15 : ARMReg<15, "q15", [D30, D31]>;
+
+// Current Program Status Register.
+def CPSR  : ARMReg<0, "cpsr">;
+
+def FPSCR : ARMReg<1, "fpscr">;
+
+// Register classes.
+//
+// pc  == Program Counter
+// lr  == Link Register
+// sp  == Stack Pointer
+// r12 == ip (scratch)
+// r7  == Frame Pointer (thumb-style backtraces)
+// r9  == May be reserved as Thread Register
+// r11 == Frame Pointer (arm-style backtraces)
+// r10 == Stack Limit
+//
+def GPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6,
+                                           R7, R8, R9, R10, R11, R12,
+                                           SP, LR, PC]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // FP is R11, R9 is available.
+    static const unsigned ARM_GPR_AO_1[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+      ARM::R8, ARM::R9, ARM::R10,
+      ARM::R11 };
+    // FP is R11, R9 is not available.
+    static const unsigned ARM_GPR_AO_2[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6, ARM::R7,
+      ARM::R8, ARM::R10,
+      ARM::R11 };
+    // FP is R7, R9 is available as non-callee-saved register.
+    // This is used by Darwin.
+    static const unsigned ARM_GPR_AO_3[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+      ARM::R9, ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6,
+      ARM::R8, ARM::R10,ARM::R11,ARM::R7 };
+    // FP is R7, R9 is not available.
+    static const unsigned ARM_GPR_AO_4[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6,
+      ARM::R8, ARM::R10,ARM::R11,
+      ARM::R7 };
+    // FP is R7, R9 is available as callee-saved register.
+    // This is used by non-Darwin platform in Thumb mode.
+    static const unsigned ARM_GPR_AO_5[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+      ARM::R12,ARM::LR,
+      ARM::R4, ARM::R5, ARM::R6,
+      ARM::R8, ARM::R9, ARM::R10,ARM::R11,ARM::R7 };
+
+    // For Thumb1 mode, we don't want to allocate hi regs at all, as we
+    // don't know how to spill them. If we make our prologue/epilogue code
+    // smarter at some point, we can go back to using the above allocation
+    // orders for the Thumb1 instructions that know how to use hi regs.
+    static const unsigned THUMB_GPR_AO[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+      ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
+
+    GPRClass::iterator
+    GPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      if (Subtarget.isThumb1Only())
+        return THUMB_GPR_AO;
+      if (Subtarget.isTargetDarwin()) {
+        if (Subtarget.isR9Reserved())
+          return ARM_GPR_AO_4;
+        else
+          return ARM_GPR_AO_3;
+      } else {
+        if (Subtarget.isR9Reserved())
+          return ARM_GPR_AO_2;
+        else if (Subtarget.isThumb())
+          return ARM_GPR_AO_5;
+        else
+          return ARM_GPR_AO_1;
+      }
+    }
+
+    GPRClass::iterator
+    GPRClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      GPRClass::iterator I;
+
+      if (Subtarget.isThumb1Only()) {
+        I = THUMB_GPR_AO + (sizeof(THUMB_GPR_AO)/sizeof(unsigned));
+        // Mac OS X requires FP not to be clobbered for backtracing purpose.
+        return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I;
+      }
+
+      if (Subtarget.isTargetDarwin()) {
+        if (Subtarget.isR9Reserved())
+          I = ARM_GPR_AO_4 + (sizeof(ARM_GPR_AO_4)/sizeof(unsigned));
+        else
+          I = ARM_GPR_AO_3 + (sizeof(ARM_GPR_AO_3)/sizeof(unsigned));
+      } else {
+        if (Subtarget.isR9Reserved())
+          I = ARM_GPR_AO_2 + (sizeof(ARM_GPR_AO_2)/sizeof(unsigned));
+        else if (Subtarget.isThumb())
+          I = ARM_GPR_AO_5 + (sizeof(ARM_GPR_AO_5)/sizeof(unsigned));
+        else
+          I = ARM_GPR_AO_1 + (sizeof(ARM_GPR_AO_1)/sizeof(unsigned));
+      }
+
+      // Mac OS X requires FP not to be clobbered for backtracing purpose.
+      return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I;
+    }
+  }];
+}
+
+// Thumb registers are R0-R7 normally. Some instructions can still use
+// the general GPR register class above (e.g. MOV).
+def tGPR : RegisterClass<"ARM", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned THUMB_tGPR_AO[] = {
+      ARM::R0, ARM::R1, ARM::R2, ARM::R3,
+      ARM::R4, ARM::R5, ARM::R6, ARM::R7 };
+
+    // FP is R7, only low registers available.
+    tGPRClass::iterator
+    tGPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      return THUMB_tGPR_AO;
+    }
+
+    tGPRClass::iterator
+    tGPRClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      tGPRClass::iterator I =
+        THUMB_tGPR_AO + (sizeof(THUMB_tGPR_AO)/sizeof(unsigned));
+      // Mac OS X requires FP not to be clobbered for backtracing purpose.
+      return (Subtarget.isTargetDarwin() || RI->hasFP(MF)) ? I-1 : I;
+    }
+  }];
+}
+
+// Scalar single precision floating point register class.
+def SPR : RegisterClass<"ARM", [f32], 32, [S0, S1, S2, S3, S4, S5, S6, S7, S8,
+  S9, S10, S11, S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22,
+  S23, S24, S25, S26, S27, S28, S29, S30, S31]>;
+
+// Subset of SPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def SPR_8 : RegisterClass<"ARM", [f32], 32,
+                          [S0, S1,  S2,  S3,  S4,  S5,  S6,  S7,
+                           S8, S9, S10, S11, S12, S13, S14, S15]>;
+
+// Dummy f32 regclass to represent impossible subreg indices.
+def SPR_INVALID : RegisterClass<"ARM", [f32], 32, [SDummy]> {
+  let CopyCost = -1;
+}
+
+// Scalar double precision floating point / generic 64-bit vector register
+// class.
+// ARM requires only word alignment for double. It's more performant if it
+// is double-word alignment though.
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                        [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+                         D8,  D9,  D10, D11, D12, D13, D14, D15,
+                         D16, D17, D18, D19, D20, D21, D22, D23,
+                         D24, D25, D26, D27, D28, D29, D30, D31]> {
+  let SubRegClassList = [SPR_INVALID, SPR_INVALID];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // VFP2
+    static const unsigned ARM_DPR_VFP2[] = {
+      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
+      ARM::D12, ARM::D13, ARM::D14, ARM::D15 };
+    // VFP3
+    static const unsigned ARM_DPR_VFP3[] = {
+      ARM::D0,  ARM::D1,  ARM::D2,  ARM::D3,
+      ARM::D4,  ARM::D5,  ARM::D6,  ARM::D7,
+      ARM::D8,  ARM::D9,  ARM::D10, ARM::D11,
+      ARM::D12, ARM::D13, ARM::D14, ARM::D15,
+      ARM::D16, ARM::D17, ARM::D18, ARM::D19,
+      ARM::D20, ARM::D21, ARM::D22, ARM::D23,
+      ARM::D24, ARM::D25, ARM::D26, ARM::D27,
+      ARM::D28, ARM::D29, ARM::D30, ARM::D31 };
+    DPRClass::iterator
+    DPRClass::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      if (Subtarget.hasVFP3())
+        return ARM_DPR_VFP3;
+      return ARM_DPR_VFP2;
+    }
+
+    DPRClass::iterator
+    DPRClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const ARMSubtarget &Subtarget = TM.getSubtarget<ARMSubtarget>();
+      if (Subtarget.hasVFP3())
+        return ARM_DPR_VFP3 + (sizeof(ARM_DPR_VFP3)/sizeof(unsigned));
+      else
+        return ARM_DPR_VFP2 + (sizeof(ARM_DPR_VFP2)/sizeof(unsigned));
+    }
+  }];
+}
+
+// Subset of DPR that are accessible with VFP2 (and so that also have
+// 32-bit SPR subregs).
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                             [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+                              D8,  D9,  D10, D11, D12, D13, D14, D15]> {
+  let SubRegClassList = [SPR, SPR];
+}
+
+// Subset of DPR which can be used as a source of NEON scalars for 16-bit
+// operations
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                          [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7]> {
+  let SubRegClassList = [SPR_8, SPR_8];
+}
+
+// Generic 128-bit vector register class.
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128,
+                        [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7,
+                         Q8,  Q9,  Q10, Q11, Q12, Q13, Q14, Q15]> {
+  let SubRegClassList = [SPR_INVALID, SPR_INVALID, SPR_INVALID, SPR_INVALID,
+                         DPR, DPR];
+}
+
+// Subset of QPR that have 32-bit SPR subregs.
+def QPR_VFP2 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                             128,
+                             [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7]> {
+  let SubRegClassList = [SPR, SPR, SPR, SPR, DPR_VFP2, DPR_VFP2];
+}
+
+// Subset of QPR that have DPR_8 and SPR_8 subregs.
+def QPR_8 : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                           128,
+                           [Q0,  Q1,  Q2,  Q3]> {
+  let SubRegClassList = [SPR_8, SPR_8, SPR_8, SPR_8, DPR_8, DPR_8];
+}
+
+// Condition code registers.
+def CCR : RegisterClass<"ARM", [i32], 32, [CPSR]>;
+
+//===----------------------------------------------------------------------===//
+// Subregister Set Definitions... now that we have all of the pieces, define the
+// sub registers for each register.
+//
+
+def arm_ssubreg_0 : PatLeaf<(i32 1)>;
+def arm_ssubreg_1 : PatLeaf<(i32 2)>;
+def arm_ssubreg_2 : PatLeaf<(i32 3)>;
+def arm_ssubreg_3 : PatLeaf<(i32 4)>;
+def arm_dsubreg_0 : PatLeaf<(i32 5)>;
+def arm_dsubreg_1 : PatLeaf<(i32 6)>;
+
+// S sub-registers of D registers.
+def : SubRegSet<1, [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+                    D8,  D9,  D10, D11, D12, D13, D14, D15],
+                   [S0,  S2,  S4,  S6,  S8,  S10, S12, S14,
+                    S16, S18, S20, S22, S24, S26, S28, S30]>;
+def : SubRegSet<2, [D0,  D1,  D2,  D3,  D4,  D5,  D6,  D7,
+                    D8,  D9,  D10, D11, D12, D13, D14, D15],
+                   [S1,  S3,  S5,  S7,  S9,  S11, S13, S15,
+                    S17, S19, S21, S23, S25, S27, S29, S31]>;
+
+// S sub-registers of Q registers.
+def : SubRegSet<1, [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7],
+                   [S0,  S4,  S8,  S12, S16, S20, S24, S28]>;
+def : SubRegSet<2, [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7],
+                   [S1,  S5,  S9,  S13, S17, S21, S25, S29]>;
+def : SubRegSet<3, [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7],
+                   [S2,  S6,  S10, S14, S18, S22, S26, S30]>;
+def : SubRegSet<4, [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7],
+                   [S3,  S7,  S11, S15, S19, S23, S27, S31]>;
+
+// D sub-registers of Q registers.
+def : SubRegSet<5, [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7,
+                    Q8,  Q9,  Q10, Q11, Q12, Q13, Q14, Q15],
+                   [D0,  D2,  D4,  D6,  D8,  D10, D12, D14,
+                    D16, D18, D20, D22, D24, D26, D28, D30]>;
+def : SubRegSet<6, [Q0,  Q1,  Q2,  Q3,  Q4,  Q5,  Q6,  Q7,
+                    Q8,  Q9,  Q10, Q11, Q12, Q13, Q14, Q15],
+                   [D1,  D3,  D5,  D7,  D9,  D11, D13, D15,
+                    D17, D19, D21, D23, D25, D27, D29, D31]>;
diff --git a/lib/Target/ARM/ARMRelocations.h b/lib/Target/ARM/ARMRelocations.h
new file mode 100644
index 0000000..2cc2950
--- /dev/null
+++ b/lib/Target/ARM/ARMRelocations.h
@@ -0,0 +1,56 @@
+//===- ARMRelocations.h - ARM Code Relocations ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the ARM target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMRELOCATIONS_H
+#define ARMRELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace ARM {
+    enum RelocationType {
+      // reloc_arm_absolute - Absolute relocation, just add the relocated value
+      // to the value already in memory.
+      reloc_arm_absolute,
+
+      // reloc_arm_relative - PC relative relocation, add the relocated value to
+      // the value already in memory, after we adjust it for where the PC is.
+      reloc_arm_relative,
+
+      // reloc_arm_cp_entry - PC relative relocation for constpool_entry's whose
+      // addresses are kept locally in a map.
+      reloc_arm_cp_entry,
+
+      // reloc_arm_vfp_cp_entry - Same as reloc_arm_cp_entry except the offset
+      // should be divided by 4.
+      reloc_arm_vfp_cp_entry,
+
+      // reloc_arm_machine_cp_entry - Relocation of an ARM machine constantpool
+      // entry.
+      reloc_arm_machine_cp_entry,
+
+      // reloc_arm_jt_base - PC relative relocation for jump tables whose
+      // addresses are kept locally in a map.
+      reloc_arm_jt_base,
+
+      // reloc_arm_pic_jt - PIC jump table entry relocation: dest bb - jt base.
+      reloc_arm_pic_jt,
+
+      // reloc_arm_branch - Branch address relocation.
+      reloc_arm_branch
+    };
+  }
+}
+
+#endif
+
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
new file mode 100644
index 0000000..fc4c5f5
--- /dev/null
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -0,0 +1,160 @@
+//===- ARMSchedule.td - ARM Scheduling Definitions ---------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across ARM processors
+//
+def FU_Issue   : FuncUnit; // instruction issue stage
+def FU_Pipe0   : FuncUnit; // integer pipeline 0
+def FU_Pipe1   : FuncUnit; // integer pipeline 1
+def FU_LdSt0   : FuncUnit; // pipeline 0 load/store unit
+def FU_LdSt1   : FuncUnit; // pipeline 1 load/store unit
+def FU_NPipe   : FuncUnit; // NEON ALU/MUL pipe
+def FU_NLSPipe : FuncUnit; // NEON load/store pipe
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for ARM (IIC_i* = integer, IIC_fp* = VFP, IIC_V* = NEON; trailing D/Q = double/quad-register form)
+//
+def IIC_iALUx      : InstrItinClass;
+def IIC_iALUi      : InstrItinClass;
+def IIC_iALUr      : InstrItinClass;
+def IIC_iALUsi     : InstrItinClass;
+def IIC_iALUsr     : InstrItinClass;
+def IIC_iUNAr      : InstrItinClass;
+def IIC_iUNAsi     : InstrItinClass;
+def IIC_iUNAsr     : InstrItinClass;
+def IIC_iCMPi      : InstrItinClass;
+def IIC_iCMPr      : InstrItinClass;
+def IIC_iCMPsi     : InstrItinClass;
+def IIC_iCMPsr     : InstrItinClass;
+def IIC_iMOVi      : InstrItinClass;
+def IIC_iMOVr      : InstrItinClass;
+def IIC_iMOVsi     : InstrItinClass;
+def IIC_iMOVsr     : InstrItinClass;
+def IIC_iCMOVi     : InstrItinClass;
+def IIC_iCMOVr     : InstrItinClass;
+def IIC_iCMOVsi    : InstrItinClass;
+def IIC_iCMOVsr    : InstrItinClass;
+def IIC_iMUL16     : InstrItinClass;
+def IIC_iMAC16     : InstrItinClass;
+def IIC_iMUL32     : InstrItinClass;
+def IIC_iMAC32     : InstrItinClass;
+def IIC_iMUL64     : InstrItinClass;
+def IIC_iMAC64     : InstrItinClass;
+def IIC_iLoadi     : InstrItinClass;
+def IIC_iLoadr     : InstrItinClass;
+def IIC_iLoadsi    : InstrItinClass;
+def IIC_iLoadiu    : InstrItinClass;
+def IIC_iLoadru    : InstrItinClass;
+def IIC_iLoadsiu   : InstrItinClass;
+def IIC_iLoadm     : InstrItinClass; // load multiple
+def IIC_iStorei    : InstrItinClass;
+def IIC_iStorer    : InstrItinClass;
+def IIC_iStoresi   : InstrItinClass;
+def IIC_iStoreiu   : InstrItinClass;
+def IIC_iStoreru   : InstrItinClass;
+def IIC_iStoresiu  : InstrItinClass;
+def IIC_iStorem    : InstrItinClass; // store multiple
+def IIC_Br         : InstrItinClass; // branch
+def IIC_fpSTAT     : InstrItinClass;
+def IIC_fpUNA32    : InstrItinClass;
+def IIC_fpUNA64    : InstrItinClass;
+def IIC_fpCMP32    : InstrItinClass;
+def IIC_fpCMP64    : InstrItinClass;
+def IIC_fpCVTSD    : InstrItinClass;
+def IIC_fpCVTDS    : InstrItinClass;
+def IIC_fpCVTIS    : InstrItinClass;
+def IIC_fpCVTID    : InstrItinClass;
+def IIC_fpCVTSI    : InstrItinClass;
+def IIC_fpCVTDI    : InstrItinClass;
+def IIC_fpALU32    : InstrItinClass;
+def IIC_fpALU64    : InstrItinClass;
+def IIC_fpMUL32    : InstrItinClass;
+def IIC_fpMUL64    : InstrItinClass;
+def IIC_fpMAC32    : InstrItinClass;
+def IIC_fpMAC64    : InstrItinClass;
+def IIC_fpDIV32    : InstrItinClass;
+def IIC_fpDIV64    : InstrItinClass;
+def IIC_fpSQRT32   : InstrItinClass;
+def IIC_fpSQRT64   : InstrItinClass;
+def IIC_fpLoad32   : InstrItinClass;
+def IIC_fpLoad64   : InstrItinClass;
+def IIC_fpLoadm    : InstrItinClass;
+def IIC_fpStore32  : InstrItinClass;
+def IIC_fpStore64  : InstrItinClass;
+def IIC_fpStorem   : InstrItinClass;
+def IIC_VLD1       : InstrItinClass;
+def IIC_VLD2       : InstrItinClass;
+def IIC_VLD3       : InstrItinClass;
+def IIC_VLD4       : InstrItinClass;
+def IIC_VST        : InstrItinClass;
+def IIC_VUNAD      : InstrItinClass;
+def IIC_VUNAQ      : InstrItinClass;
+def IIC_VBIND      : InstrItinClass;
+def IIC_VBINQ      : InstrItinClass;
+def IIC_VMOVImm    : InstrItinClass;
+def IIC_VMOVD      : InstrItinClass;
+def IIC_VMOVQ      : InstrItinClass;
+def IIC_VMOVIS     : InstrItinClass;
+def IIC_VMOVID     : InstrItinClass;
+def IIC_VMOVISL    : InstrItinClass;
+def IIC_VMOVSI     : InstrItinClass;
+def IIC_VMOVDI     : InstrItinClass;
+def IIC_VPERMD     : InstrItinClass;
+def IIC_VPERMQ     : InstrItinClass;
+def IIC_VPERMQ3    : InstrItinClass;
+def IIC_VMACD      : InstrItinClass;
+def IIC_VMACQ      : InstrItinClass;
+def IIC_VRECSD     : InstrItinClass;
+def IIC_VRECSQ     : InstrItinClass;
+def IIC_VCNTiD     : InstrItinClass;
+def IIC_VCNTiQ     : InstrItinClass;
+def IIC_VUNAiD     : InstrItinClass;
+def IIC_VUNAiQ     : InstrItinClass;
+def IIC_VQUNAiD    : InstrItinClass;
+def IIC_VQUNAiQ    : InstrItinClass;
+def IIC_VBINiD     : InstrItinClass;
+def IIC_VBINiQ     : InstrItinClass;
+def IIC_VSUBiD     : InstrItinClass;
+def IIC_VSUBiQ     : InstrItinClass;
+def IIC_VBINi4D    : InstrItinClass;
+def IIC_VBINi4Q    : InstrItinClass;
+def IIC_VSHLiD     : InstrItinClass;
+def IIC_VSHLiQ     : InstrItinClass;
+def IIC_VSHLi4D    : InstrItinClass;
+def IIC_VSHLi4Q    : InstrItinClass;
+def IIC_VPALiD     : InstrItinClass;
+def IIC_VPALiQ     : InstrItinClass;
+def IIC_VMULi16D   : InstrItinClass;
+def IIC_VMULi32D   : InstrItinClass;
+def IIC_VMULi16Q   : InstrItinClass;
+def IIC_VMULi32Q   : InstrItinClass;
+def IIC_VMACi16D   : InstrItinClass;
+def IIC_VMACi32D   : InstrItinClass;
+def IIC_VMACi16Q   : InstrItinClass;
+def IIC_VMACi32Q   : InstrItinClass;
+def IIC_VEXTD      : InstrItinClass;
+def IIC_VEXTQ      : InstrItinClass;
+def IIC_VTB1       : InstrItinClass;
+def IIC_VTB2       : InstrItinClass;
+def IIC_VTB3       : InstrItinClass;
+def IIC_VTB4       : InstrItinClass;
+def IIC_VTBX1      : InstrItinClass;
+def IIC_VTBX2      : InstrItinClass;
+def IIC_VTBX3      : InstrItinClass;
+def IIC_VTBX4      : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+def GenericItineraries : ProcessorItineraries<[]>; // empty list: no scheduling info
+
+
+include "ARMScheduleV6.td"
+include "ARMScheduleV7.td"
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
new file mode 100644
index 0000000..0fef466
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -0,0 +1,200 @@
+//===- ARMScheduleV6.td - ARM v6 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v6 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// Model based on ARM1176
+//
+// Scheduling information derived from "ARM1176JZF-S Technical Reference Manual".
+//
+def ARMV6Itineraries : ProcessorItineraries<[
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [FU_Pipe0]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi    , [InstrStage<1, [FU_Pipe0]>], [2, 2]>,
+  InstrItinData<IIC_iALUr    , [InstrStage<1, [FU_Pipe0]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi   , [InstrStage<1, [FU_Pipe0]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsr   , [InstrStage<2, [FU_Pipe0]>], [3, 3, 2, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr    , [InstrStage<1, [FU_Pipe0]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi   , [InstrStage<1, [FU_Pipe0]>], [2, 1]>,
+  InstrItinData<IIC_iUNAsr   , [InstrStage<2, [FU_Pipe0]>], [3, 2, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi    , [InstrStage<1, [FU_Pipe0]>], [2]>,
+  InstrItinData<IIC_iCMPr    , [InstrStage<1, [FU_Pipe0]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi   , [InstrStage<1, [FU_Pipe0]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr   , [InstrStage<2, [FU_Pipe0]>], [3, 2, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi    , [InstrStage<1, [FU_Pipe0]>], [2]>,
+  InstrItinData<IIC_iMOVr    , [InstrStage<1, [FU_Pipe0]>], [2, 2]>,
+  InstrItinData<IIC_iMOVsi   , [InstrStage<1, [FU_Pipe0]>], [2, 1]>,
+  InstrItinData<IIC_iMOVsr   , [InstrStage<2, [FU_Pipe0]>], [3, 2, 1]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi   , [InstrStage<1, [FU_Pipe0]>], [3]>,
+  InstrItinData<IIC_iCMOVr   , [InstrStage<1, [FU_Pipe0]>], [3, 2]>,
+  InstrItinData<IIC_iCMOVsi  , [InstrStage<1, [FU_Pipe0]>], [3, 1]>,
+  InstrItinData<IIC_iCMOVsr  , [InstrStage<1, [FU_Pipe0]>], [4, 2, 1]>,
+
+  // Integer multiply pipeline
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [FU_Pipe0]>], [4, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<1, [FU_Pipe0]>], [4, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<2, [FU_Pipe0]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<2, [FU_Pipe0]>], [5, 1, 1, 2]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<3, [FU_Pipe0]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<3, [FU_Pipe0]>], [6, 1, 1, 2]>,
+  
+  // Integer load pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoadi   , [InstrStage<1, [FU_Pipe0]>], [4, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoadr   , [InstrStage<1, [FU_Pipe0]>], [4, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iLoadsi  , [InstrStage<2, [FU_Pipe0]>], [5, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoadiu  , [InstrStage<1, [FU_Pipe0]>], [4, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoadru  , [InstrStage<1, [FU_Pipe0]>], [4, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoadsiu , [InstrStage<2, [FU_Pipe0]>], [5, 2, 2, 1]>,
+
+  //
+  // Load multiple
+  InstrItinData<IIC_iLoadm   , [InstrStage<3, [FU_Pipe0]>]>,
+
+  // Integer store pipeline
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStorei  , [InstrStage<1, [FU_Pipe0]>], [2, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStorer  , [InstrStage<1, [FU_Pipe0]>], [2, 1, 1]>,
+
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStoresi , [InstrStage<2, [FU_Pipe0]>], [2, 2, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStoreiu , [InstrStage<1, [FU_Pipe0]>], [2, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStoreru , [InstrStage<1, [FU_Pipe0]>], [2, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStoresiu, [InstrStage<2, [FU_Pipe0]>], [2, 2, 2, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStorem   , [InstrStage<3, [FU_Pipe0]>]>,
+  
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.  NOTE(review): wording copied from the A8 model; confirm for ARM1176.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [FU_Pipe0]>], [3]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [FU_Pipe0]>], [5, 2]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [FU_Pipe0]>], [5, 2]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [FU_Pipe0]>], [2, 2]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [FU_Pipe0]>], [2, 2]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [FU_Pipe0]>], [5, 2]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [FU_Pipe0]>], [5, 2]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [FU_Pipe0]>], [9, 2]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [FU_Pipe0]>], [9, 2]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [FU_Pipe0]>], [9, 2]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [FU_Pipe0]>], [9, 2]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [FU_Pipe0]>], [9, 2, 2]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [FU_Pipe0]>], [9, 2, 2]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [FU_Pipe0]>], [9, 2, 2]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<2, [FU_Pipe0]>], [9, 2, 2]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [FU_Pipe0]>], [9, 2, 2, 2]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<2, [FU_Pipe0]>], [9, 2, 2, 2]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<15, [FU_Pipe0]>], [20, 2, 2]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<29, [FU_Pipe0]>], [34, 2, 2]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32 , [InstrStage<15, [FU_Pipe0]>], [20, 2, 2]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64 , [InstrStage<29, [FU_Pipe0]>], [34, 2, 2]>,
+  //
+  // Single-precision FP Load
+  InstrItinData<IIC_fpLoad32 , [InstrStage<1, [FU_Pipe0]>], [5, 2, 2]>,
+  //
+  // Double-precision FP Load
+  InstrItinData<IIC_fpLoad64 , [InstrStage<1, [FU_Pipe0]>], [5, 2, 2]>,
+  //
+  // FP Load Multiple
+  InstrItinData<IIC_fpLoadm , [InstrStage<3, [FU_Pipe0]>]>,
+  //
+  // Single-precision FP Store
+  InstrItinData<IIC_fpStore32 , [InstrStage<1, [FU_Pipe0]>], [2, 2, 2]>,
+  //
+  // Double-precision FP Store
+  // (same single FU_Pipe0 issue as the single-precision store; no FU_Issue here)
+  InstrItinData<IIC_fpStore64 , [InstrStage<1, [FU_Pipe0]>], [2, 2, 2]>,
+  //
+  // FP Store Multiple
+  InstrItinData<IIC_fpStorem , [InstrStage<3, [FU_Pipe0]>]>
+]>;
diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td
new file mode 100644
index 0000000..bbbf413
--- /dev/null
+++ b/lib/Target/ARM/ARMScheduleV7.td
@@ -0,0 +1,587 @@
+//===- ARMScheduleV7.td - ARM v7 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM v7 processors.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Scheduling information derived from "Cortex-A8 Technical Reference Manual".
+//
+// Dual issue pipeline represented by FU_Pipe0 | FU_Pipe1
+//
+def CortexA8Itineraries : ProcessorItineraries<[
+
+  // Two fully-pipelined integer ALU pipelines
+  //
+  // No operand cycles
+  InstrItinData<IIC_iALUx    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+  //
+  // Binary Instructions that produce a result
+  InstrItinData<IIC_iALUi    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iALUr    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 2]>,
+  InstrItinData<IIC_iALUsi   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1]>,
+  InstrItinData<IIC_iALUsr   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2, 1, 1]>,
+  //
+  // Unary Instructions that produce a result
+  InstrItinData<IIC_iUNAr    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iUNAsi   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iUNAsr   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+  //
+  // Compare instructions
+  InstrItinData<IIC_iCMPi    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMPr    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 2]>,
+  InstrItinData<IIC_iCMPsi   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMPsr   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+  //
+  // Move instructions, unconditional
+  InstrItinData<IIC_iMOVi    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1]>,
+  InstrItinData<IIC_iMOVr    , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsi   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1]>,
+  InstrItinData<IIC_iMOVsr   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [1, 1, 1]>,
+  //
+  // Move instructions, conditional
+  InstrItinData<IIC_iCMOVi   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2]>,
+  InstrItinData<IIC_iCMOVr   , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsi  , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1]>,
+  InstrItinData<IIC_iCMOVsr  , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>], [2, 1, 1]>,
+
+  // Integer multiply pipeline
+  // Result written in E5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  //
+  InstrItinData<IIC_iMUL16   , [InstrStage<1, [FU_Pipe0]>], [5, 1, 1]>,
+  InstrItinData<IIC_iMAC16   , [InstrStage<1, [FU_Pipe1], 0>, 
+                                InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL32   , [InstrStage<1, [FU_Pipe1], 0>, 
+                                InstrStage<2, [FU_Pipe0]>], [6, 1, 1]>,
+  InstrItinData<IIC_iMAC32   , [InstrStage<1, [FU_Pipe1], 0>, 
+                                InstrStage<2, [FU_Pipe0]>], [6, 1, 1, 4]>,
+  InstrItinData<IIC_iMUL64   , [InstrStage<2, [FU_Pipe1], 0>, 
+                                InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>,
+  InstrItinData<IIC_iMAC64   , [InstrStage<2, [FU_Pipe1], 0>, 
+                                InstrStage<3, [FU_Pipe0]>], [6, 6, 1, 1]>,
+  
+  // Integer load pipeline
+  //
+  // loads have an extra cycle of latency, but are fully pipelined
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  //
+  // Immediate offset
+  InstrItinData<IIC_iLoadi   , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iLoadr   , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iLoadsi  , [InstrStage<2, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0], 0>,
+                                InstrStage<1, [FU_Pipe1]>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [4, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iLoadiu  , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 2, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iLoadru  , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 2, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iLoadsiu , [InstrStage<2, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0], 0>,
+                                InstrStage<1, [FU_Pipe1]>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [4, 3, 1, 1]>,
+  //
+  // Load multiple
+  InstrItinData<IIC_iLoadm   , [InstrStage<2, [FU_Issue], 0>,
+                                InstrStage<2, [FU_Pipe0], 0>,
+                                InstrStage<2, [FU_Pipe1]>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>]>,
+
+  // Integer store pipeline
+  //
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  //
+  // Immediate offset
+  InstrItinData<IIC_iStorei  , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 1]>,
+  //
+  // Register offset
+  InstrItinData<IIC_iStorer  , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+  //
+  // Scaled register offset, issues over 2 cycles
+  InstrItinData<IIC_iStoresi , [InstrStage<2, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0], 0>,
+                                InstrStage<1, [FU_Pipe1]>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 1, 1]>,
+  //
+  // Immediate offset with update
+  InstrItinData<IIC_iStoreiu , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [2, 3, 1]>,
+  //
+  // Register offset with update
+  InstrItinData<IIC_iStoreru  , [InstrStage<1, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [2, 3, 1, 1]>,
+  //
+  // Scaled register offset with update, issues over 2 cycles
+  InstrItinData<IIC_iStoresiu, [InstrStage<2, [FU_Issue], 0>,
+                                InstrStage<1, [FU_Pipe0], 0>,
+                                InstrStage<1, [FU_Pipe1]>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>], [3, 3, 1, 1]>,
+  //
+  // Store multiple
+  InstrItinData<IIC_iStorem  , [InstrStage<2, [FU_Issue], 0>,
+                                InstrStage<2, [FU_Pipe0], 0>,
+                                InstrStage<2, [FU_Pipe1]>,
+                                InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                                InstrStage<1, [FU_LdSt0]>]>,
+  
+  // Branch
+  //
+  // no delay slots, so the latency of a branch is unimportant
+  InstrItinData<IIC_Br      , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>]>,
+
+  // VFP
+  // Issue through integer pipeline, and execute in NEON unit. We assume
+  // RunFast mode so that NFP pipeline is used for single-precision when
+  // possible.
+  //
+  // FP Special Register to Integer Register File Move
+  InstrItinData<IIC_fpSTAT , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                              InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // Single-precision FP Unary
+  InstrItinData<IIC_fpUNA32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [7, 1]>,
+  //
+  // Double-precision FP Unary
+  InstrItinData<IIC_fpUNA64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<4, [FU_NPipe], 0>,
+                               InstrStage<4, [FU_NLSPipe]>], [4, 1]>,
+  //
+  // Single-precision FP Compare
+  InstrItinData<IIC_fpCMP32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [1, 1]>,
+  //
+  // Double-precision FP Compare
+  InstrItinData<IIC_fpCMP64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<4, [FU_NPipe], 0>,
+                               InstrStage<4, [FU_NLSPipe]>], [4, 1]>,
+  //
+  // Single to Double FP Convert
+  InstrItinData<IIC_fpCVTSD , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<7, [FU_NPipe], 0>,
+                               InstrStage<7, [FU_NLSPipe]>], [7, 1]>,
+  //
+  // Double to Single FP Convert
+  InstrItinData<IIC_fpCVTDS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<5, [FU_NPipe], 0>,
+                               InstrStage<5, [FU_NLSPipe]>], [5, 1]>,
+  //
+  // Single-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTSI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [7, 1]>,
+  //
+  // Double-Precision FP to Integer Convert
+  InstrItinData<IIC_fpCVTDI , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<8, [FU_NPipe], 0>,
+                               InstrStage<8, [FU_NLSPipe]>], [8, 1]>,
+  //
+  // Integer to Single-Precision FP Convert
+  InstrItinData<IIC_fpCVTIS , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [7, 1]>,
+  //
+  // Integer to Double-Precision FP Convert
+  InstrItinData<IIC_fpCVTID , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<8, [FU_NPipe], 0>,
+                               InstrStage<8, [FU_NLSPipe]>], [8, 1]>,
+  //
+  // Single-precision FP ALU
+  InstrItinData<IIC_fpALU32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP ALU
+  InstrItinData<IIC_fpALU64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<9, [FU_NPipe], 0>,
+                               InstrStage<9, [FU_NLSPipe]>], [9, 1, 1]>,
+  //
+  // Single-precision FP Multiply
+  InstrItinData<IIC_fpMUL32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [7, 1, 1]>,
+  //
+  // Double-precision FP Multiply
+  InstrItinData<IIC_fpMUL64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<11, [FU_NPipe], 0>,
+                               InstrStage<11, [FU_NLSPipe]>], [11, 1, 1]>,
+  //
+  // Single-precision FP MAC
+  InstrItinData<IIC_fpMAC32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [7, 2, 1, 1]>,
+  //
+  // Double-precision FP MAC
+  InstrItinData<IIC_fpMAC64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<19, [FU_NPipe], 0>,
+                               InstrStage<19, [FU_NLSPipe]>], [19, 2, 1, 1]>,
+  //
+  // Single-precision FP DIV
+  InstrItinData<IIC_fpDIV32 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<20, [FU_NPipe], 0>,
+                               InstrStage<20, [FU_NLSPipe]>], [20, 1, 1]>,
+  //
+  // Double-precision FP DIV
+  InstrItinData<IIC_fpDIV64 , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<29, [FU_NPipe], 0>,
+                               InstrStage<29, [FU_NLSPipe]>], [29, 1, 1]>,
+  //
+  // Single-precision FP SQRT
+  InstrItinData<IIC_fpSQRT32, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<19, [FU_NPipe], 0>,
+                               InstrStage<19, [FU_NLSPipe]>], [19, 1]>,
+  //
+  // Double-precision FP SQRT
+  InstrItinData<IIC_fpSQRT64, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<29, [FU_NPipe], 0>,
+                               InstrStage<29, [FU_NLSPipe]>], [29, 1]>,
+  //
+  // Single-precision FP Load
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpLoad32, [InstrStage<1, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // Double-precision FP Load
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpLoad64, [InstrStage<2, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0], 0>,
+                               InstrStage<1, [FU_Pipe1]>,
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // FP Load Multiple
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpLoadm,  [InstrStage<3, [FU_Issue], 0>, 
+                               InstrStage<2, [FU_Pipe0], 0>,
+                               InstrStage<2, [FU_Pipe1]>,
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // Single-precision FP Store
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStore32,[InstrStage<1, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // Double-precision FP Store
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStore64,[InstrStage<2, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0], 0>,
+                               InstrStage<1, [FU_Pipe1]>,
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // FP Store Multiple
+  // use FU_Issue to enforce the 1 load/store per cycle limit
+  InstrItinData<IIC_fpStorem, [InstrStage<3, [FU_Issue], 0>, 
+                               InstrStage<2, [FU_Pipe0], 0>,
+                               InstrStage<2, [FU_Pipe1]>,
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+
+  // NEON
+  // Issue through integer pipeline, and execute in NEON unit.
+  //
+  // VLD1
+  InstrItinData<IIC_VLD1,     [InstrStage<1, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // VLD2
+  InstrItinData<IIC_VLD2,     [InstrStage<1, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 2, 1]>,
+  //
+  // VLD3
+  InstrItinData<IIC_VLD3,     [InstrStage<1, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 1]>,
+  //
+  // VLD4
+  InstrItinData<IIC_VLD4,     [InstrStage<1, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 2, 2, 2, 1]>,
+  //
+  // VST
+  InstrItinData<IIC_VST,      [InstrStage<1, [FU_Issue], 0>, 
+                               InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_LdSt0], 0>,
+                               InstrStage<1, [FU_NLSPipe]>]>,
+  //
+  // Double-register FP Unary
+  InstrItinData<IIC_VUNAD,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [5, 2]>,
+  //
+  // Quad-register FP Unary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VUNAQ,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [6, 2]>,
+  //
+  // Double-register FP Binary
+  InstrItinData<IIC_VBIND,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [5, 2, 2]>,
+  //
+  // Quad-register FP Binary
+  // Result written in N5, but that is relative to the last cycle of multicycle,
+  // so we use 6 for those cases
+  InstrItinData<IIC_VBINQ,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [6, 2, 2]>,
+  //
+  // Move Immediate
+  InstrItinData<IIC_VMOVImm,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [3]>,
+  //
+  // Double-register Permute Move
+  InstrItinData<IIC_VMOVD,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 1]>,
+  //
+  // Quad-register Permute Move
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VMOVQ,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 1]>,
+  //
+  // Integer to Single-precision Move
+  InstrItinData<IIC_VMOVIS ,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 1]>,
+  //
+  // Integer to Double-precision Move
+  InstrItinData<IIC_VMOVID ,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Single-precision to Integer Move
+  InstrItinData<IIC_VMOVSI ,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>], [20, 1]>,
+  //
+  // Double-precision to Integer Move
+  InstrItinData<IIC_VMOVDI ,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>], [20, 20, 1]>,
+  //
+  // Integer to Lane Move
+  InstrItinData<IIC_VMOVISL , [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>,
+  //
+  // Double-register Permute
+  InstrItinData<IIC_VPERMD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 2, 1, 1]>,
+  //
+  // Quad-register Permute
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 3 for those cases
+  InstrItinData<IIC_VPERMQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 3, 1, 1]>,
+  //
+  // Quad-register Permute (3 cycle issue)
+  // Result written in N2, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VPERMQ3,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>,
+                               InstrStage<1, [FU_NPipe], 0>,
+                               InstrStage<2, [FU_NLSPipe]>], [4, 4, 1, 1]>,
+  //
+  // Double-register FP Multiply-Accumulate
+  InstrItinData<IIC_VMACD,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [9, 2, 2, 3]>,
+  //
+  // Quad-register FP Multiply-Accumulate
+  // Result written in N9, but that is relative to the last cycle of multicycle,
+  // so we use 10 for those cases
+  InstrItinData<IIC_VMACQ,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [10, 2, 2, 3]>,
+  //
+  // Double-register Reciprocal Step
+  InstrItinData<IIC_VRECSD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [9, 2, 2]>,
+  //
+  // Quad-register Reciprocal Step
+  InstrItinData<IIC_VRECSQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [10, 2, 2]>,
+  //
+  // Double-register Integer Count
+  InstrItinData<IIC_VCNTiD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Count
+  // Result written in N3, but that is relative to the last cycle of multicycle,
+  // so we use 4 for those cases
+  InstrItinData<IIC_VCNTiQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [4, 2, 2]>,
+  //
+  // Double-register Integer Unary
+  InstrItinData<IIC_VUNAiD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [4, 2]>,
+  //
+  // Quad-register Integer Unary
+  InstrItinData<IIC_VUNAiQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [4, 2]>,
+  //
+  // Double-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiD,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [4, 1]>,
+  //
+  // Quad-register Integer Q-Unary
+  InstrItinData<IIC_VQUNAiQ,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [4, 1]>,
+  //
+  // Double-register Integer Binary
+  InstrItinData<IIC_VBINiD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+  //
+  // Quad-register Integer Binary
+  InstrItinData<IIC_VBINiQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [3, 2, 2]>,
+  //
+  // Double-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4D,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [4, 2, 1]>,
+  //
+  // Quad-register Integer Binary (4 cycle)
+  InstrItinData<IIC_VBINi4Q,  [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [4, 2, 1]>,
+  //
+  // Double-register Integer Subtract
+  InstrItinData<IIC_VSUBiD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [3, 2, 1]>,
+  //
+  // Quad-register Integer Subtract
+  InstrItinData<IIC_VSUBiQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [3, 2, 1]>,
+  //
+  // Double-register Integer Shift
+  InstrItinData<IIC_VSHLiD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [3, 1, 1]>,
+  //
+  // Quad-register Integer Shift
+  InstrItinData<IIC_VSHLiQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [4, 1, 1]>,
+  //
+  // Double-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4D,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [4, 1, 1]>,
+  //
+  // Quad-register Integer Shift (4 cycle)
+  InstrItinData<IIC_VSHLi4Q,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [5, 1, 1]>,
+  //
+  // Double-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiD,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>,
+  //
+  // Quad-register Integer Pair Add Long
+  InstrItinData<IIC_VPALiQ,   [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [7, 3, 2, 1]>,
+  //
+  // Double-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [6, 2, 2]>,
+  //
+  // Double-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [7, 2, 1]>,
+  //
+  // Quad-register Integer Multiply (.8, .16)
+  InstrItinData<IIC_VMULi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [7, 2, 2]>,
+  //
+  // Quad-register Integer Multiply (.32)
+  InstrItinData<IIC_VMULi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>,
+                               InstrStage<2, [FU_NLSPipe], 0>,
+                               InstrStage<3, [FU_NPipe]>], [9, 2, 1]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>], [6, 2, 2, 3]>,
+  //
+  // Double-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [7, 2, 1, 3]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.8, .16)
+  InstrItinData<IIC_VMACi16Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NPipe]>], [7, 2, 2, 3]>,
+  //
+  // Quad-register Integer Multiply-Accumulate (.32)
+  InstrItinData<IIC_VMACi32Q, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NPipe]>,
+                               InstrStage<2, [FU_NLSPipe], 0>,
+                               InstrStage<3, [FU_NPipe]>], [9, 2, 1, 3]>,
+  //
+  // Double-register VEXT
+  InstrItinData<IIC_VEXTD,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>], [2, 1, 1]>,
+  //
+  // Quad-register VEXT
+  InstrItinData<IIC_VEXTQ,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 1, 1]>,
+  //
+  // VTB
+  InstrItinData<IIC_VTB1,     [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 2, 1]>,
+  InstrItinData<IIC_VTB2,     [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 2, 2, 1]>,
+  InstrItinData<IIC_VTB3,     [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>,
+                               InstrStage<1, [FU_NPipe], 0>,
+                               InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTB4,     [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>,
+                               InstrStage<1, [FU_NPipe], 0>,
+                               InstrStage<2, [FU_NLSPipe]>], [4, 2, 2, 3, 3, 1]>,
+  //
+  // VTBX
+  InstrItinData<IIC_VTBX1,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 1]>,
+  InstrItinData<IIC_VTBX2,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<2, [FU_NLSPipe]>], [3, 1, 2, 2, 1]>,
+  InstrItinData<IIC_VTBX3,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>,
+                               InstrStage<1, [FU_NPipe], 0>,
+                               InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 1]>,
+  InstrItinData<IIC_VTBX4,    [InstrStage<1, [FU_Pipe0, FU_Pipe1]>,
+                               InstrStage<1, [FU_NLSPipe]>,
+                               InstrStage<1, [FU_NPipe], 0>,
+                               InstrStage<2, [FU_NLSPipe]>], [4, 1, 2, 2, 3, 3, 1]>
+]>;
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
new file mode 100644
index 0000000..426862c
--- /dev/null
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -0,0 +1,177 @@
+//===-- ARMSubtarget.cpp - ARM Subtarget Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ARM specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMSubtarget.h"
+#include "ARMGenSubtarget.inc"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+static cl::opt<bool>
+ReserveR9("arm-reserve-r9", cl::Hidden,
+          cl::desc("Reserve R9, making it unavailable as GPR"));
+static cl::opt<bool>
+UseNEONFP("arm-use-neon-fp",
+          cl::desc("Use NEON for single-precision FP"),
+          cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+UseMOVT("arm-use-movt",
+        cl::init(true), cl::Hidden);
+
+ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &FS,
+                           bool isT)
+  : ARMArchVersion(V4T)
+  , ARMFPUType(None)
+  , UseNEONForSinglePrecisionFP(UseNEONFP)
+  , IsThumb(isT)
+  , ThumbMode(Thumb1)
+  , PostRAScheduler(false)
+  , IsR9Reserved(ReserveR9)
+  , UseMovt(UseMOVT)
+  , stackAlignment(4)
+  , CPUString("generic")
+  , TargetType(isELF) // Default to ELF unless otherwise specified.
+  , TargetABI(ARM_ABI_APCS) {
+  // default to soft float ABI
+  if (FloatABIType == FloatABI::Default)
+    FloatABIType = FloatABI::Soft;
+
+  // Determine default and user specified characteristics
+
+  // Parse features string.
+  CPUString = ParseSubtargetFeatures(FS, CPUString);
+
+  // Set the boolean corresponding to the current target triple, or the default
+  // if one cannot be determined, to true.
+  unsigned Len = TT.length();
+  unsigned Idx = 0;
+
+  if (Len >= 5 && TT.substr(0, 4) == "armv")
+    Idx = 4;
+  else if (Len >= 6 && TT.substr(0, 5) == "thumb") {
+    IsThumb = true;
+    if (Len >= 7 && TT[5] == 'v')
+      Idx = 6;
+  }
+  if (Idx) {
+    unsigned SubVer = TT[Idx];
+    if (SubVer > '4' && SubVer <= '9') {
+      if (SubVer >= '7') {
+        ARMArchVersion = V7A;
+      } else if (SubVer == '6') {
+        ARMArchVersion = V6;
+        if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2')
+          ARMArchVersion = V6T2;
+      } else if (SubVer == '5') {
+        ARMArchVersion = V5T;
+        if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == 'e')
+          ARMArchVersion = V5TE;
+      }
+      if (ARMArchVersion >= V6T2)
+        ThumbMode = Thumb2;
+    }
+  }
+
+  // Thumb2 implies at least V6T2.
+  if (ARMArchVersion < V6T2 && ThumbMode >= Thumb2)
+    ARMArchVersion = V6T2;
+
+  if (Len >= 10) {
+    if (TT.find("-darwin") != std::string::npos)
+      // arm-darwin
+      TargetType = isDarwin;
+  }
+
+  if (TT.find("eabi") != std::string::npos)
+    TargetABI = ARM_ABI_AAPCS;
+
+  if (isAAPCS_ABI())
+    stackAlignment = 8;
+
+  if (isTargetDarwin())
+    IsR9Reserved = ReserveR9 | (ARMArchVersion < V6);
+
+  if (!isThumb() || hasThumb2())
+    PostRAScheduler = true;
+
+  // Set CPU specific features.
+  if (CPUString == "cortex-a8") {
+    // On Cortex-a8, it's faster to perform some single-precision FP
+    // operations with NEON instructions.
+    if (UseNEONFP.getPosition() == 0)
+      UseNEONForSinglePrecisionFP = true;
+  }
+}
+
+/// GVIsIndirectSymbol - true if the GV will be accessed via an indirect symbol.
+bool
+ARMSubtarget::GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const {
+  if (RelocM == Reloc::Static)
+    return false;
+
+  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+  // load from stub.
+  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+
+  if (!isTargetDarwin()) {
+    // Extra load is needed for all externally visible.
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return false;
+    return true;
+  } else {
+    if (RelocM == Reloc::PIC_) {
+      // If this is a strong reference to a definition, it is definitely not
+      // through a stub.
+      if (!isDecl && !GV->isWeakForLinker())
+        return false;
+
+      // Unless we have a symbol with hidden visibility, we have to go through a
+      // normal $non_lazy_ptr stub because this symbol might be resolved late.
+      if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+        return true;
+
+      // If symbol visibility is hidden, we have a stub for common symbol
+      // references and external declarations.
+      if (isDecl || GV->hasCommonLinkage())
+        // Hidden $non_lazy_ptr reference.
+        return true;
+
+      return false;
+    } else {
+      // If this is a strong reference to a definition, it is definitely not
+      // through a stub.
+      if (!isDecl && !GV->isWeakForLinker())
+        return false;
+    
+      // Unless we have a symbol with hidden visibility, we have to go through a
+      // normal $non_lazy_ptr stub because this symbol might be resolved late.
+      if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+        return true;
+    }
+  }
+
+  return false;
+}
+
+bool ARMSubtarget::enablePostRAScheduler(
+           CodeGenOpt::Level OptLevel,
+           TargetSubtarget::AntiDepBreakMode& Mode,
+           RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtarget::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  CriticalPathRCs.push_back(&ARM::GPRRegClass);
+  return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
+}
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
new file mode 100644
index 0000000..3f06b7b
--- /dev/null
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -0,0 +1,156 @@
+//=====---- ARMSubtarget.h - Define Subtarget for the ARM -----*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMSUBTARGET_H
+#define ARMSUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtarget.h"
+#include "ARMBaseRegisterInfo.h"
+#include <string>
+
+namespace llvm {
+class GlobalValue;
+
+class ARMSubtarget : public TargetSubtarget {
+protected:
+  enum ARMArchEnum {
+    V4T, V5T, V5TE, V6, V6T2, V7A
+  };
+
+  enum ARMFPEnum {
+    None, VFPv2, VFPv3, NEON
+  };
+
+  enum ThumbTypeEnum {
+    Thumb1,
+    Thumb2
+  };
+
+  /// ARMArchVersion - ARM architecture version: V4T (base), V5T, V5TE,
+  /// V6, V6T2, V7A.
+  ARMArchEnum ARMArchVersion;
+
+  /// ARMFPUType - Floating Point Unit type.
+  ARMFPEnum ARMFPUType;
+
+  /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
+  /// specified. Use the method useNEONForSinglePrecisionFP() to
+  /// determine if NEON should actually be used.
+  bool UseNEONForSinglePrecisionFP;
+
+  /// IsThumb - True if we are in thumb mode, false if in ARM mode.
+  bool IsThumb;
+
+  /// ThumbMode - Indicates supported Thumb version.
+  ThumbTypeEnum ThumbMode;
+
+  /// PostRAScheduler - True if using post-register-allocation scheduler.
+  bool PostRAScheduler;
+
+  /// IsR9Reserved - True if R9 is not available as a general purpose register.
+  bool IsR9Reserved;
+
+  /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit
+  /// imms (including global addresses).
+  bool UseMovt;
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// CPUString - String name of used CPU.
+  std::string CPUString;
+
+  /// Selected instruction itineraries (one entry per itinerary class).
+  InstrItineraryData InstrItins;
+
+ public:
+  enum {
+    isELF, isDarwin
+  } TargetType;
+
+  enum {
+    ARM_ABI_APCS,
+    ARM_ABI_AAPCS // ARM EABI
+  } TargetABI;
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  ARMSubtarget(const std::string &TT, const std::string &FS, bool isThumb);
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const {
+    // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb.
+    // Change this once Thumb ldmia / stmia support is added.
+    return isThumb() ? 0 : 64;
+  }
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+
+  bool hasV4TOps()  const { return ARMArchVersion >= V4T;  }
+  bool hasV5TOps()  const { return ARMArchVersion >= V5T;  }
+  bool hasV5TEOps() const { return ARMArchVersion >= V5TE; }
+  bool hasV6Ops()   const { return ARMArchVersion >= V6;   }
+  bool hasV6T2Ops() const { return ARMArchVersion >= V6T2; }
+  bool hasV7Ops()   const { return ARMArchVersion >= V7A;  }
+
+  bool hasVFP2() const { return ARMFPUType >= VFPv2; }
+  bool hasVFP3() const { return ARMFPUType >= VFPv3; }
+  bool hasNEON() const { return ARMFPUType >= NEON;  }
+  bool useNEONForSinglePrecisionFP() const {
+    return hasNEON() && UseNEONForSinglePrecisionFP; }
+
+  bool isTargetDarwin() const { return TargetType == isDarwin; }
+  bool isTargetELF() const { return TargetType == isELF; }
+
+  bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
+  bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
+
+  bool isThumb() const { return IsThumb; }
+  bool isThumb1Only() const { return IsThumb && (ThumbMode == Thumb1); }
+  bool isThumb2() const { return IsThumb && (ThumbMode == Thumb2); }
+  bool hasThumb2() const { return ThumbMode >= Thumb2; }
+
+  bool isR9Reserved() const { return IsR9Reserved; }
+
+  bool useMovt() const { return UseMovt && hasV6T2Ops(); }
+
+  const std::string & getCPUString() const { return CPUString; }
+
+  /// enablePostRAScheduler - True at 'More' optimization.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+
+  /// getInstrItineraryData - Return the instruction itineraries based on
+  /// subtarget selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
+  /// symbol.
+  bool GVIsIndirectSymbol(GlobalValue *GV, Reloc::Model RelocM) const;
+};
+} // End llvm namespace
+
+#endif  // ARMSUBTARGET_H
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
new file mode 100644
index 0000000..7233f5c
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -0,0 +1,144 @@
+//===-- ARMTargetMachine.cpp - Define TargetMachine for ARM ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMTargetMachine.h"
+#include "ARMMCAsmInfo.h"
+#include "ARMFrameInfo.h"
+#include "ARM.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+static const MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  switch (TheTriple.getOS()) {
+  case Triple::Darwin:
+    return new ARMMCAsmInfoDarwin();
+  default:
+    return new ARMELFMCAsmInfo();
+  }
+}
+
+
+extern "C" void LLVMInitializeARMTarget() {
+  // Register the target.
+  RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget);
+  RegisterTargetMachine<ThumbTargetMachine> Y(TheThumbTarget);
+
+  // Register the target asm info.
+  RegisterAsmInfoFn A(TheARMTarget, createMCAsmInfo);
+  RegisterAsmInfoFn B(TheThumbTarget, createMCAsmInfo);
+}
+
+/// TargetMachine ctor - Create an ARM architecture model.
+///
+ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T,
+                                           const std::string &TT,
+                                           const std::string &FS,
+                                           bool isThumb)
+  : LLVMTargetMachine(T, TT),
+    Subtarget(TT, FS, isThumb),
+    FrameInfo(Subtarget),
+    JITInfo(),
+    InstrItins(Subtarget.getInstrItineraryData()) {
+  DefRelocModel = getRelocationModel();
+}
+
+ARMTargetMachine::ARMTargetMachine(const Target &T, const std::string &TT,
+                                   const std::string &FS)
+  : ARMBaseTargetMachine(T, TT, FS, false), InstrInfo(Subtarget),
+    DataLayout(Subtarget.isAPCS_ABI() ?
+               std::string("e-p:32:32-f64:32:32-i64:32:32-n32") :
+               std::string("e-p:32:32-f64:64:64-i64:64:64-n32")),
+    TLInfo(*this) {
+}
+
+ThumbTargetMachine::ThumbTargetMachine(const Target &T, const std::string &TT,
+                                       const std::string &FS)
+  : ARMBaseTargetMachine(T, TT, FS, true),
+    InstrInfo(Subtarget.hasThumb2()
+              ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget))
+              : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))),
+    DataLayout(Subtarget.isAPCS_ABI() ?
+               std::string("e-p:32:32-f64:32:32-i64:32:32-"
+                           "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32") :
+               std::string("e-p:32:32-f64:64:64-i64:64:64-"
+                           "i16:16:32-i8:8:32-i1:8:32-a:0:32-n32")),
+    TLInfo(*this) {
+}
+
+
+
+// Pass Pipeline Configuration
+bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM,
+                                           CodeGenOpt::Level OptLevel) {
+  PM.add(createARMISelDag(*this, OptLevel));
+  return false;
+}
+
+bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM,
+                                          CodeGenOpt::Level OptLevel) {
+  if (Subtarget.hasNEON())
+    PM.add(createNEONPreAllocPass());
+
+  // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+  if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
+    PM.add(createARMLoadStoreOptimizationPass(true));
+  return true;
+}
+
+bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM,
+                                        CodeGenOpt::Level OptLevel) {
+  // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+  if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only())
+    PM.add(createARMLoadStoreOptimizationPass());
+
+  // Expand some pseudo instructions into multiple instructions to allow
+  // proper scheduling.
+  PM.add(createARMExpandPseudoPass());
+
+  return true;
+}
+
+bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM,
+                                          CodeGenOpt::Level OptLevel) {
+  // FIXME: temporarily disabling load / store optimization pass for Thumb1.
+  if (OptLevel != CodeGenOpt::None) {
+    if (!Subtarget.isThumb1Only())
+      PM.add(createIfConverterPass());
+    if (Subtarget.hasNEON())
+      PM.add(createNEONMoveFixPass());
+  }
+
+  if (Subtarget.isThumb2()) {
+    PM.add(createThumb2ITBlockPass());
+    PM.add(createThumb2SizeReductionPass());
+  }
+
+  PM.add(createARMConstantIslandPass());
+  return true;
+}
+
+bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                          CodeGenOpt::Level OptLevel,
+                                          JITCodeEmitter &JCE) {
+  // FIXME: Move this to TargetJITInfo!
+  if (DefRelocModel == Reloc::Default)
+    setRelocationModel(Reloc::Static);
+
+  // Machine code emitter pass for ARM.
+  PM.add(createARMJITCodeEmitterPass(*this, JCE));
+  return false;
+}
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
new file mode 100644
index 0000000..88e67e3
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -0,0 +1,109 @@
+//===-- ARMTargetMachine.h - Define TargetMachine for ARM -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the ARM specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMTARGETMACHINE_H
+#define ARMTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "ARMInstrInfo.h"
+#include "ARMFrameInfo.h"
+#include "ARMJITInfo.h"
+#include "ARMSubtarget.h"
+#include "ARMISelLowering.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb2InstrInfo.h"
+
+namespace llvm {
+
+/// ARMBaseTargetMachine - Code shared between the ARM and Thumb target
+/// machines: subtarget, frame, JIT and itinerary data, plus the common
+/// codegen pass-pipeline hooks.
+class ARMBaseTargetMachine : public LLVMTargetMachine {
+protected:
+  // Protected so the ARM/Thumb subclasses can consult the subtarget.
+  ARMSubtarget        Subtarget;
+
+private:
+  ARMFrameInfo        FrameInfo;
+  ARMJITInfo          JITInfo;
+  InstrItineraryData  InstrItins;
+  Reloc::Model        DefRelocModel;    // Reloc model before it's overridden.
+
+public:
+  ARMBaseTargetMachine(const Target &T, const std::string &TT,
+                       const std::string &FS, bool isThumb);
+
+  virtual const ARMFrameInfo     *getFrameInfo() const { return &FrameInfo; }
+  virtual       ARMJITInfo       *getJITInfo()         { return &JITInfo; }
+  virtual const ARMSubtarget  *getSubtargetImpl() const { return &Subtarget; }
+  // NOTE: returns the itinerary data by value (a copy), not by reference.
+  virtual const InstrItineraryData getInstrItineraryData() const {
+    return InstrItins;
+  }
+
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+                              JITCodeEmitter &MCE);
+};
+
+/// ARMTargetMachine - ARM target machine.
+///
+class ARMTargetMachine : public ARMBaseTargetMachine {
+  ARMInstrInfo        InstrInfo;
+  const TargetData    DataLayout;       // Calculates type size & alignment
+  ARMTargetLowering   TLInfo;
+public:
+  ARMTargetMachine(const Target &T, const std::string &TT,
+                   const std::string &FS);
+
+  virtual const ARMRegisterInfo  *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  // const_cast is needed because TLInfo is stored by value in this const
+  // object while the interface hands out a mutable pointer.
+  virtual       ARMTargetLowering *getTargetLowering() const {
+    return const_cast<ARMTargetLowering*>(&TLInfo);
+  }
+
+  virtual const ARMInstrInfo     *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+};
+
+/// ThumbTargetMachine - Thumb target machine.
+/// Due to the way architectures are handled, this represents both
+///   Thumb-1 and Thumb-2.
+///
+class ThumbTargetMachine : public ARMBaseTargetMachine {
+  // NOTE(review): raw pointer, presumably heap-allocated by the constructor
+  // based on the subtarget -- confirm who (if anyone) deletes it.
+  ARMBaseInstrInfo    *InstrInfo;   // either Thumb1InstrInfo or Thumb2InstrInfo
+  const TargetData    DataLayout;   // Calculates type size & alignment
+  ARMTargetLowering   TLInfo;
+public:
+  ThumbTargetMachine(const Target &T, const std::string &TT,
+                     const std::string &FS);
+
+  /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo
+  virtual const ARMBaseRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo->getRegisterInfo();
+  }
+
+  // const_cast is needed because TLInfo is stored by value in this const
+  // object while the interface hands out a mutable pointer.
+  virtual ARMTargetLowering *getTargetLowering() const {
+    return const_cast<ARMTargetLowering*>(&TLInfo);
+  }
+
+  /// returns either Thumb1InstrInfo or Thumb2InstrInfo
+  virtual const ARMBaseInstrInfo *getInstrInfo() const { return InstrInfo; }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
new file mode 100644
index 0000000..9703403
--- /dev/null
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -0,0 +1,39 @@
+//===-- llvm/Target/ARMTargetObjectFile.h - ARM Object Info -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+#define LLVM_TARGET_ARM_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+
+  /// ARMElfTargetObjectFile - ELF object-file lowering for ARM.  For the
+  /// AAPCS ABI, static constructors/destructors are routed to the
+  /// .init_array/.fini_array sections instead of the ELF defaults.
+  class ARMElfTargetObjectFile : public TargetLoweringObjectFileELF {
+  public:
+    ARMElfTargetObjectFile() : TargetLoweringObjectFileELF() {}
+
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) {
+      TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+      if (TM.getSubtarget<ARMSubtarget>().isAAPCS_ABI()) {
+        StaticCtorSection =
+          getELFSection(".init_array", MCSectionELF::SHT_INIT_ARRAY, 
+                        MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+                        SectionKind::getDataRel());
+        StaticDtorSection =
+          getELFSection(".fini_array", MCSectionELF::SHT_FINI_ARRAY,
+                        MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+                        SectionKind::getDataRel());
+      }
+    }
+  };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
new file mode 100644
index 0000000..89c7769
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -0,0 +1,739 @@
+//===-- ARMAsmParser.cpp - Parse ARM assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmParser.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+using namespace llvm;
+
+namespace {
+struct ARMOperand;
+
+// The shift types for register controlled shifts in arm memory addressing.
+// The first four take a '#' shift amount; Rrx (rotate right with extend)
+// stands alone (see ParseShift).
+enum ShiftType {
+  Lsl,
+  Lsr,
+  Asr,
+  Ror,
+  Rrx
+};
+
+/// ARMAsmParser - Target assembly parser for ARM; drives the generic
+/// MCAsmParser/MCAsmLexer it wraps to build MCInsts from ARM syntax.
+/// Convention used by all Parse* helpers below: return false on success,
+/// true on failure (Error() always returns true).
+class ARMAsmParser : public TargetAsmParser {
+  MCAsmParser &Parser;
+
+private:
+  MCAsmParser &getParser() const { return Parser; }
+
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+  bool MaybeParseRegister(ARMOperand &Op, bool ParseWriteBack);
+
+  bool ParseRegisterList(ARMOperand &Op);
+
+  bool ParseMemory(ARMOperand &Op);
+
+  bool ParseMemoryOffsetReg(bool &Negative,
+                            bool &OffsetRegShifted,
+                            enum ShiftType &ShiftType,
+                            const MCExpr *&ShiftAmount,
+                            const MCExpr *&Offset,
+                            bool &OffsetIsReg,
+                            int &OffsetRegNum);
+
+  bool ParseShift(enum ShiftType &St, const MCExpr *&ShiftAmount);
+
+  bool ParseOperand(ARMOperand &Op);
+
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+  bool ParseDirectiveThumb(SMLoc L);
+
+  bool ParseDirectiveThumbFunc(SMLoc L);
+
+  bool ParseDirectiveCode(SMLoc L);
+
+  bool ParseDirectiveSyntax(SMLoc L);
+
+  // TODO - For now hacked versions of the next two are in here in this file to
+  // allow some parser testing until the table gen versions are implemented.
+
+  /// @name Auto-generated Match Functions
+  /// {
+  bool MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                        MCInst &Inst);
+
+  /// MatchRegisterName - Match the given string to a register name and return
+  /// its register number, or -1 if there is no match.  To allow return values
+  /// to be used directly in register lists, arm registers have values between
+  /// 0 and 15.
+  int MatchRegisterName(const StringRef &Name);
+
+  /// }
+
+
+public:
+  ARMAsmParser(const Target &T, MCAsmParser &_Parser)
+    : TargetAsmParser(T), Parser(_Parser) {}
+
+  virtual bool ParseInstruction(const StringRef &Name, SMLoc NameLoc,
+                                SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  virtual bool ParseDirective(AsmToken DirectiveID);
+};
+  
+/// ARMOperand - Instances of this class represent a parsed ARM machine
+/// instruction operand.  A tagged union: Kind selects which member of the
+/// union below is valid, and the accessors assert on mismatched access.
+struct ARMOperand : public MCParsedAsmOperand {
+  enum {
+    Token,
+    Register,
+    Immediate,
+    Memory
+  } Kind;
+
+
+  union {
+    struct {
+      const char *Data;
+      unsigned Length;
+    } Tok;
+
+    struct {
+      unsigned RegNum;
+      bool Writeback;
+    } Reg;
+
+    struct {
+      const MCExpr *Val;
+    } Imm;
+
+    // This is for all forms of ARM address expressions
+    struct {
+      unsigned BaseRegNum;
+      unsigned OffsetRegNum; // used when OffsetIsReg is true
+      const MCExpr *Offset; // used when OffsetIsReg is false
+      const MCExpr *ShiftAmount; // used when OffsetRegShifted is true
+      enum ShiftType ShiftType;  // used when OffsetRegShifted is true
+      unsigned
+        OffsetRegShifted : 1, // only used when OffsetIsReg is true
+        Preindexed : 1,
+        Postindexed : 1,
+        OffsetIsReg : 1,
+        Negative : 1, // only used when OffsetIsReg is true
+        Writeback : 1;
+    } Mem;
+
+  };
+
+  // NOTE: the token's characters are not owned; Tok.Data points into the
+  // source buffer held by the lexer.
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNum;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  bool isToken() const {return Kind == Token; }
+
+  bool isReg() const { return Kind == Register; }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  // Factory functions: each builds an operand with Kind set and the matching
+  // union member fully initialized.
+
+  static ARMOperand CreateToken(StringRef Str) {
+    ARMOperand Res;
+    Res.Kind = Token;
+    Res.Tok.Data = Str.data();
+    Res.Tok.Length = Str.size();
+    return Res;
+  }
+
+  static ARMOperand CreateReg(unsigned RegNum, bool Writeback) {
+    ARMOperand Res;
+    Res.Kind = Register;
+    Res.Reg.RegNum = RegNum;
+    Res.Reg.Writeback = Writeback;
+    return Res;
+  }
+
+  static ARMOperand CreateImm(const MCExpr *Val) {
+    ARMOperand Res;
+    Res.Kind = Immediate;
+    Res.Imm.Val = Val;
+    return Res;
+  }
+
+  static ARMOperand CreateMem(unsigned BaseRegNum, bool OffsetIsReg,
+                              const MCExpr *Offset, unsigned OffsetRegNum,
+                              bool OffsetRegShifted, enum ShiftType ShiftType,
+                              const MCExpr *ShiftAmount, bool Preindexed,
+                              bool Postindexed, bool Negative, bool Writeback) {
+    ARMOperand Res;
+    Res.Kind = Memory;
+    Res.Mem.BaseRegNum = BaseRegNum;
+    Res.Mem.OffsetIsReg = OffsetIsReg;
+    Res.Mem.Offset = Offset;
+    Res.Mem.OffsetRegNum = OffsetRegNum;
+    Res.Mem.OffsetRegShifted = OffsetRegShifted;
+    Res.Mem.ShiftType = ShiftType;
+    Res.Mem.ShiftAmount = ShiftAmount;
+    Res.Mem.Preindexed = Preindexed;
+    Res.Mem.Postindexed = Postindexed;
+    Res.Mem.Negative = Negative;
+    Res.Mem.Writeback = Writeback;
+    return Res;
+  }
+};
+
+} // end anonymous namespace.
+
+/// Try to parse a register name.  The token must be an Identifier when called,
+/// and if it is a register name a Reg operand is created, the token is eaten
+/// and false is returned.  Else true is returned and no token is eaten.
+/// When ParseWriteBack is set, an optional trailing '!' is consumed and
+/// recorded as the register's writeback flag.
+/// TODO this is likely to change to allow different register types and or to
+/// parse for a specific register type.
+bool ARMAsmParser::MaybeParseRegister(ARMOperand &Op, bool ParseWriteBack) {
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  // FIXME: Validate register for the current architecture; we have to do
+  // validation later, so maybe there is no need for this here.
+  int RegNum;
+
+  RegNum = MatchRegisterName(Tok.getString());
+  if (RegNum == -1)
+    return true;
+  Parser.Lex(); // Eat identifier token.
+
+  bool Writeback = false;
+  if (ParseWriteBack) {
+    const AsmToken &ExclaimTok = Parser.getTok();
+    if (ExclaimTok.is(AsmToken::Exclaim)) {
+      Writeback = true;
+      Parser.Lex(); // Eat exclaim token
+    }
+  }
+
+  Op = ARMOperand::CreateReg(RegNum, Writeback);
+
+  return false;
+}
+
+/// Parse a register list, return false if successful else return true or an 
+/// error.  The first token must be a '{' when called.
+/// NOTE(review): the parsed list (RegList bitmask) is only used for the
+/// duplicate/ordering diagnostics below; Op is never assigned, so the list
+/// is currently discarded -- confirm this is intentional until a register
+/// list operand kind exists.
+bool ARMAsmParser::ParseRegisterList(ARMOperand &Op) {
+  assert(Parser.getTok().is(AsmToken::LCurly) &&
+         "Token is not an Left Curly Brace");
+  Parser.Lex(); // Eat left curly brace token.
+
+  const AsmToken &RegTok = Parser.getTok();
+  SMLoc RegLoc = RegTok.getLoc();
+  if (RegTok.isNot(AsmToken::Identifier))
+    return Error(RegLoc, "register expected");
+  int RegNum = MatchRegisterName(RegTok.getString());
+  if (RegNum == -1)
+    return Error(RegLoc, "register expected");
+  Parser.Lex(); // Eat identifier token.
+  unsigned RegList = 1 << RegNum;
+
+  int HighRegNum = RegNum;
+  // TODO ranges like "{Rn-Rm}"
+  while (Parser.getTok().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat comma token.
+
+    const AsmToken &RegTok = Parser.getTok();
+    SMLoc RegLoc = RegTok.getLoc();
+    if (RegTok.isNot(AsmToken::Identifier))
+      return Error(RegLoc, "register expected");
+    int RegNum = MatchRegisterName(RegTok.getString());
+    if (RegNum == -1)
+      return Error(RegLoc, "register expected");
+
+    // Duplicates and out-of-order registers are only warnings, not errors.
+    if (RegList & (1 << RegNum))
+      Warning(RegLoc, "register duplicated in register list");
+    else if (RegNum <= HighRegNum)
+      Warning(RegLoc, "register not in ascending order in register list");
+    RegList |= 1 << RegNum;
+    HighRegNum = RegNum;
+
+    Parser.Lex(); // Eat identifier token.
+  }
+  const AsmToken &RCurlyTok = Parser.getTok();
+  if (RCurlyTok.isNot(AsmToken::RCurly))
+    return Error(RCurlyTok.getLoc(), "'}' expected");
+  Parser.Lex(); // Eat right curly brace token.
+
+  return false;
+}
+
+/// Parse an arm memory expression, return false if successful else return true
+/// or an error.  The first token must be a '[' when called.
+///
+/// Handles the preindexed form "[Rn, offset]{!}" and the postindexed form
+/// "[Rn]{, offset}", where offset is either a (possibly shifted, possibly
+/// negated) register or a '#' immediate expression.
+/// TODO Only preindexing and postindexing addressing are started, unindexed
+/// with option, etc are still to do.
+bool ARMAsmParser::ParseMemory(ARMOperand &Op) {
+  assert(Parser.getTok().is(AsmToken::LBrac) &&
+         "Token is not an Left Bracket");
+  Parser.Lex(); // Eat left bracket token.
+
+  const AsmToken &BaseRegTok = Parser.getTok();
+  if (BaseRegTok.isNot(AsmToken::Identifier))
+    return Error(BaseRegTok.getLoc(), "register expected");
+  if (MaybeParseRegister(Op, false))
+    return Error(BaseRegTok.getLoc(), "register expected");
+  int BaseRegNum = Op.getReg();
+
+  bool Preindexed = false;
+  bool Postindexed = false;
+  bool OffsetIsReg = false;
+  bool Negative = false;
+  bool Writeback = false;
+
+  // First look for preindexed address forms, that is after the "[Rn" we now
+  // have to see if the next token is a comma.
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.is(AsmToken::Comma)) {
+    Preindexed = true;
+    Parser.Lex(); // Eat comma token.
+    // Initialize everything CreateMem consumes; ParseMemoryOffsetReg only
+    // fills in the pieces that actually appear in the input (e.g. Offset is
+    // untouched when the offset is a register), so without these defaults
+    // uninitialized values could be stored in the operand.
+    int OffsetRegNum = -1;
+    bool OffsetRegShifted = false;
+    enum ShiftType ShiftType = Lsl;
+    const MCExpr *ShiftAmount = 0;
+    const MCExpr *Offset = 0;
+    if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType,
+                             ShiftAmount, Offset, OffsetIsReg, OffsetRegNum))
+      return true;
+    const AsmToken &RBracTok = Parser.getTok();
+    if (RBracTok.isNot(AsmToken::RBrac))
+      return Error(RBracTok.getLoc(), "']' expected");
+    Parser.Lex(); // Eat right bracket token.
+
+    // An optional trailing '!' requests base register writeback.
+    const AsmToken &ExclaimTok = Parser.getTok();
+    if (ExclaimTok.is(AsmToken::Exclaim)) {
+      Writeback = true;
+      Parser.Lex(); // Eat exclaim token
+    }
+    Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+                               OffsetRegShifted, ShiftType, ShiftAmount,
+                               Preindexed, Postindexed, Negative, Writeback);
+    return false;
+  }
+  // The "[Rn" we have so far was not followed by a comma.
+  else if (Tok.is(AsmToken::RBrac)) {
+    // This is a post indexing addressing form, that is a ']' follows after
+    // the "[Rn".
+    Postindexed = true;
+    Writeback = true;
+    Parser.Lex(); // Eat right bracket token.
+
+    // Same defaulting as above: if no offset follows at all, these defaults
+    // are exactly what gets recorded in the memory operand.
+    int OffsetRegNum = 0;
+    bool OffsetRegShifted = false;
+    enum ShiftType ShiftType = Lsl;
+    const MCExpr *ShiftAmount = 0;
+    const MCExpr *Offset = 0;
+
+    const AsmToken &NextTok = Parser.getTok();
+    if (NextTok.isNot(AsmToken::EndOfStatement)) {
+      if (NextTok.isNot(AsmToken::Comma))
+        return Error(NextTok.getLoc(), "',' expected");
+      Parser.Lex(); // Eat comma token.
+      if (ParseMemoryOffsetReg(Negative, OffsetRegShifted, ShiftType,
+                               ShiftAmount, Offset, OffsetIsReg, OffsetRegNum))
+        return true;
+    }
+
+    Op = ARMOperand::CreateMem(BaseRegNum, OffsetIsReg, Offset, OffsetRegNum,
+                               OffsetRegShifted, ShiftType, ShiftAmount,
+                               Preindexed, Postindexed, Negative, Writeback);
+    return false;
+  }
+
+  return true;
+}
+
+/// Parse the offset of a memory operand after we have seen "[Rn," or "[Rn],"
+/// we will parse the following (were +/- means that a plus or minus is
+/// optional):
+///   +/-Rm
+///   +/-Rm, shift
+///   #offset
+/// we return false on success or an error otherwise.
+///
+/// Every out-parameter is assigned a defined default up front, so the caller
+/// never reads garbage for the pieces that do not appear in the input (e.g.
+/// ShiftAmount when the offset register is unshifted, or Offset when the
+/// offset is a register).
+bool ARMAsmParser::ParseMemoryOffsetReg(bool &Negative,
+                                        bool &OffsetRegShifted,
+                                        enum ShiftType &ShiftType,
+                                        const MCExpr *&ShiftAmount,
+                                        const MCExpr *&Offset,
+                                        bool &OffsetIsReg,
+                                        int &OffsetRegNum) {
+  ARMOperand Op;
+  Negative = false;
+  OffsetRegShifted = false;
+  OffsetIsReg = false;
+  OffsetRegNum = -1;
+  ShiftType = Lsl;
+  ShiftAmount = 0;
+  Offset = 0;
+
+  // An optional '+' or '-' sign may precede the offset.
+  const AsmToken &NextTok = Parser.getTok();
+  if (NextTok.is(AsmToken::Plus))
+    Parser.Lex(); // Eat plus token.
+  else if (NextTok.is(AsmToken::Minus)) {
+    Negative = true;
+    Parser.Lex(); // Eat minus token
+  }
+  // See if there is a register following the "[Rn," or "[Rn]," we have so far.
+  const AsmToken &OffsetRegTok = Parser.getTok();
+  if (OffsetRegTok.is(AsmToken::Identifier)) {
+    OffsetIsReg = !MaybeParseRegister(Op, false);
+    if (OffsetIsReg)
+      OffsetRegNum = Op.getReg();
+  }
+  // If we parsed a register as the offset then there can be a shift after
+  // that.
+  if (OffsetRegNum != -1) {
+    // Look for a comma then a shift
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat comma token.
+
+      const AsmToken &Tok = Parser.getTok();
+      if (ParseShift(ShiftType, ShiftAmount))
+        return Error(Tok.getLoc(), "shift expected");
+      OffsetRegShifted = true;
+    }
+  }
+  else { // the "[Rn," or "[Rn,]" we have so far was not followed by "Rm"
+    // Look for #offset following the "[Rn," or "[Rn],"
+    const AsmToken &HashTok = Parser.getTok();
+    if (HashTok.isNot(AsmToken::Hash))
+      return Error(HashTok.getLoc(), "'#' expected");
+    Parser.Lex(); // Eat hash token.
+
+    if (getParser().ParseExpression(Offset))
+      return true;
+  }
+  return false;
+}
+
+/// ParseShift as one of these two:
+///   ( lsl | lsr | asr | ror ) , # shift_amount
+///   rrx
+/// Note the return polarity: false means a shift WAS parsed (success), true
+/// means no shift / error -- matching the other parse helpers in this file.
+bool ARMAsmParser::ParseShift(ShiftType &St, const MCExpr *&ShiftAmount) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return true;
+  const StringRef &ShiftName = Tok.getString();
+  if (ShiftName == "lsl" || ShiftName == "LSL")
+    St = Lsl;
+  else if (ShiftName == "lsr" || ShiftName == "LSR")
+    St = Lsr;
+  else if (ShiftName == "asr" || ShiftName == "ASR")
+    St = Asr;
+  else if (ShiftName == "ror" || ShiftName == "ROR")
+    St = Ror;
+  else if (ShiftName == "rrx" || ShiftName == "RRX")
+    St = Rrx;
+  else
+    return true;
+  Parser.Lex(); // Eat shift type token.
+
+  // Rrx stands alone.
+  if (St == Rrx)
+    return false;
+
+  // Otherwise, there must be a '#' and a shift amount.
+  const AsmToken &HashTok = Parser.getTok();
+  if (HashTok.isNot(AsmToken::Hash))
+    return Error(HashTok.getLoc(), "'#' expected");
+  Parser.Lex(); // Eat hash token.
+
+  if (getParser().ParseExpression(ShiftAmount))
+    return true;
+
+  return false;
+}
+
+/// A hack to allow some testing, to be replaced by a real table gen version.
+/// Matches "r0".."r15" in upper or lower case, plus the lower-case aliases
+/// fp (r11), ip (r12), sp (r13), lr (r14) and pc (r15).  Returns -1 when the
+/// name is not a recognized register.
+int ARMAsmParser::MatchRegisterName(const StringRef &Name) {
+  if (Name == "r0" || Name == "R0")
+    return 0;
+  else if (Name == "r1" || Name == "R1")
+    return 1;
+  else if (Name == "r2" || Name == "R2")
+    return 2;
+  else if (Name == "r3" || Name == "R3")
+    return 3;
+  else if (Name == "r4" || Name == "R4")
+    return 4;
+  else if (Name == "r5" || Name == "R5")
+    return 5;
+  else if (Name == "r6" || Name == "R6")
+    return 6;
+  else if (Name == "r7" || Name == "R7")
+    return 7;
+  else if (Name == "r8" || Name == "R8")
+    return 8;
+  else if (Name == "r9" || Name == "R9")
+    return 9;
+  else if (Name == "r10" || Name == "R10")
+    return 10;
+  else if (Name == "r11" || Name == "R11" || Name == "fp")
+    return 11;
+  else if (Name == "r12" || Name == "R12" || Name == "ip")
+    return 12;
+  else if (Name == "r13" || Name == "R13" || Name == "sp")
+    return 13;
+  else if (Name == "r14" || Name == "R14" || Name == "lr")
+    return 14;
+  else if (Name == "r15" || Name == "R15" || Name == "pc")
+    return 15;
+  return -1;
+}
+
+/// A hack to allow some testing, to be replaced by a real table gen version.
+/// Any mnemonic in the hard-coded list below is "matched" to a fixed
+/// "mov r2, r2" MOVr instruction; anything else returns true (no match).
+bool ARMAsmParser::
+MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                 MCInst &Inst) {
+  ARMOperand &Op0 = *(ARMOperand*)Operands[0];
+  assert(Op0.Kind == ARMOperand::Token && "First operand not a Token");
+  const StringRef &Mnemonic = Op0.getToken();
+  if (Mnemonic == "add" ||
+      Mnemonic == "stmfd" ||
+      Mnemonic == "str" ||
+      Mnemonic == "ldmfd" ||
+      Mnemonic == "ldr" ||
+      Mnemonic == "mov" ||
+      Mnemonic == "sub" ||
+      Mnemonic == "bl" ||
+      Mnemonic == "push" ||
+      Mnemonic == "blx" ||
+      Mnemonic == "pop") {
+    // Hard-coded to a valid instruction, till we have a real matcher.
+    Inst = MCInst();
+    Inst.setOpcode(ARM::MOVr);
+    Inst.addOperand(MCOperand::CreateReg(2));
+    Inst.addOperand(MCOperand::CreateReg(2));
+    Inst.addOperand(MCOperand::CreateImm(0));
+    Inst.addOperand(MCOperand::CreateImm(0));
+    Inst.addOperand(MCOperand::CreateReg(0));
+    return false;
+  }
+
+  return true;
+}
+
+/// Parse a arm instruction operand.  For now this parses the operand regardless
+/// of the mnemonic.  Dispatches on the leading token: identifier (register or
+/// symbol expression), '[' (memory), '{' (register list), '#' (immediate).
+bool ARMAsmParser::ParseOperand(ARMOperand &Op) {
+  switch (getLexer().getKind()) {
+  case AsmToken::Identifier:
+    // A register name is consumed here; any other identifier falls through
+    // below and is parsed as an expression.
+    if (!MaybeParseRegister(Op, true))
+      return false;
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    if (getParser().ParseExpression(IdVal))
+      return true;
+    Op = ARMOperand::CreateImm(IdVal);
+    return false;
+  case AsmToken::LBrac:
+    return ParseMemory(Op);
+  case AsmToken::LCurly:
+    return ParseRegisterList(Op);
+  case AsmToken::Hash:
+    // #42 -> immediate.
+    // TODO: ":lower16:" and ":upper16:" modifiers after # before immediate
+    Parser.Lex();
+    const MCExpr *ImmVal;
+    if (getParser().ParseExpression(ImmVal))
+      return true;
+    Op = ARMOperand::CreateImm(ImmVal);
+    return false;
+  default:
+    return Error(Parser.getTok().getLoc(), "unexpected token in operand");
+  }
+}
+
+/// Parse an arm instruction mnemonic followed by its operands.  The mnemonic
+/// itself is recorded as the first (Token) operand; comma-separated operands
+/// follow.  Returns true on error.
+/// NOTE(review): on an operand error the ARMOperands already pushed are left
+/// in Operands -- presumably the caller owns and frees them; confirm.
+bool ARMAsmParser::ParseInstruction(const StringRef &Name, SMLoc NameLoc,
+                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+  Operands.push_back(new ARMOperand(ARMOperand::CreateToken(Name)));
+
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+
+    // Read the first operand.
+    ARMOperand Op;
+    if (ParseOperand(Op)) return true;
+    Operands.push_back(new ARMOperand(Op));
+
+    while (getLexer().is(AsmToken::Comma)) {
+      Parser.Lex();  // Eat the comma.
+
+      // Parse and remember the operand.
+      if (ParseOperand(Op)) return true;
+      Operands.push_back(new ARMOperand(Op));
+    }
+  }
+  return false;
+}
+
+/// ParseDirective parses the arm specific directives (.word, .thumb,
+/// .thumb_func, .code, .syntax).  Returns true for any other directive,
+/// presumably so the generic parser can handle it -- confirm against the
+/// TargetAsmParser contract.
+bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return ParseDirectiveWord(4, DirectiveID.getLoc());
+  else if (IDVal == ".thumb")
+    return ParseDirectiveThumb(DirectiveID.getLoc());
+  else if (IDVal == ".thumb_func")
+    return ParseDirectiveThumbFunc(DirectiveID.getLoc());
+  else if (IDVal == ".code")
+    return ParseDirectiveCode(DirectiveID.getLoc());
+  else if (IDVal == ".syntax")
+    return ParseDirectiveSyntax(DirectiveID.getLoc());
+  return true;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+/// Emits each expression as a Size-byte value (Size is 4 for .word).
+bool ARMAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().ParseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size, 0/*addrspace*/);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+      
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  // Eat the EndOfStatement token.
+  Parser.Lex();
+  return false;
+}
+
+/// ParseDirectiveThumb
+///  ::= .thumb
+/// Currently only validates/consumes the statement; mode switching is TODO.
+bool ARMAsmParser::ParseDirectiveThumb(SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  Parser.Lex();
+
+  // TODO: set thumb mode
+  // TODO: tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveThumbFunc
+///  ::= .thumb_func symbol_name
+/// Accepts an identifier or string symbol name; marking the symbol as a
+/// thumb function is still TODO.
+bool ARMAsmParser::ParseDirectiveThumbFunc(SMLoc L) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier) && Tok.isNot(AsmToken::String))
+    return Error(L, "unexpected token in .thumb_func directive");
+  StringRef ATTRIBUTE_UNUSED SymbolName = Parser.getTok().getIdentifier();
+  Parser.Lex(); // Consume the identifier token.
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(L, "unexpected token in directive");
+  Parser.Lex();
+
+  // TODO: mark symbol as a thumb symbol
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveSyntax
+///  ::= .syntax unified | divided
+bool ARMAsmParser::ParseDirectiveSyntax(SMLoc L) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return Error(L, "unexpected token in .syntax directive");
+  const StringRef &Mode = Tok.getString();
+  // Parsed but not yet acted upon; placeholder until the streamer can be
+  // told about the syntax mode (see TODO below).
+  bool unified_syntax;
+  if (Mode == "unified" || Mode == "UNIFIED") {
+    Parser.Lex();
+    unified_syntax = true;
+  }
+  else if (Mode == "divided" || Mode == "DIVIDED") {
+    Parser.Lex();
+    unified_syntax = false;
+  }
+  else
+    return Error(L, "unrecognized syntax mode in .syntax directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(Parser.getTok().getLoc(), "unexpected token in directive");
+  Parser.Lex();
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// ParseDirectiveCode
+///  ::= .code 16 | 32
+bool ARMAsmParser::ParseDirectiveCode(SMLoc L) {
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Integer))
+    return Error(L, "unexpected token in .code directive");
+  int64_t Val = Parser.getTok().getIntVal();
+  // Parsed but not yet acted upon; placeholder until the streamer can be
+  // told about the mode (see TODO below).
+  bool thumb_mode;
+  if (Val == 16) {
+    Parser.Lex();
+    thumb_mode = true;
+  }
+  else if (Val == 32) {
+    Parser.Lex();
+    thumb_mode = false;
+  }
+  else
+    return Error(L, "invalid operand to .code directive");
+
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return Error(Parser.getTok().getLoc(), "unexpected token in directive");
+  Parser.Lex();
+
+  // TODO tell the MC streamer the mode
+  // getParser().getStreamer().Emit???();
+  return false;
+}
+
+/// Force static initialization: register this parser for both the ARM and
+/// Thumb targets so tools can look it up via the TargetRegistry.
+extern "C" void LLVMInitializeARMAsmParser() {
+  RegisterAsmParser<ARMAsmParser> X(TheARMTarget);
+  RegisterAsmParser<ARMAsmParser> Y(TheThumbTarget);
+}
diff --git a/lib/Target/ARM/AsmParser/CMakeLists.txt b/lib/Target/ARM/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..308c6cf
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMAsmParser
+  ARMAsmParser.cpp
+  )
+
diff --git a/lib/Target/ARM/AsmParser/Makefile b/lib/Target/ARM/AsmParser/Makefile
new file mode 100644
index 0000000..97e5612
--- /dev/null
+++ b/lib/Target/ARM/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMAsmParser
+
+# Hack: we need to include 'main' ARM target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
new file mode 100644
index 0000000..0a75c09
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMAsmPrinter.cpp
@@ -0,0 +1,1296 @@
+//===-- ARMAsmPrinter.cpp - Print machine code to an ARM .s file ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format ARM assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM.h"
+#include "ARMBuildAttrs.h"
+#include "ARMAddressingModes.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMInstPrinter.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMMCInstLower.h"
+#include "ARMTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MathExtras.h"
+#include <cctype>
+using namespace llvm;
+
+// Hidden flag gating the experimental MCInst-based emission path
+// (printInstructionThroughMCStreamer) instead of the classic asm writer.
+static cl::opt<bool>
+EnableMCInst("enable-arm-mcinst-printer", cl::Hidden,
+            cl::desc("enable experimental asmprinter gunk in the arm backend"));
+
+namespace {
+  /// ARMAsmPrinter - Emits GAS-syntax ARM/Thumb assembly for each
+  /// MachineFunction.  Most of the print* methods below are operand-printing
+  /// callbacks invoked by the TableGen-generated printInstruction() included
+  /// from ARMGenAsmWriter.inc.
+  class ARMAsmPrinter : public AsmPrinter {
+
+    /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+    /// make the right decision when printing asm code for different targets.
+    const ARMSubtarget *Subtarget;
+
+    /// AFI - Keep a pointer to ARMFunctionInfo for the current
+    /// MachineFunction.
+    ARMFunctionInfo *AFI;
+
+    /// MCP - Keep a pointer to constantpool entries of the current
+    /// MachineFunction.
+    const MachineConstantPool *MCP;
+
+  public:
+    explicit ARMAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                           MCContext &Ctx, MCStreamer &Streamer,
+                           const MCAsmInfo *T)
+      : AsmPrinter(O, TM, Ctx, Streamer, T), AFI(NULL), MCP(NULL) {
+      Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    }
+
+    virtual const char *getPassName() const {
+      return "ARM Assembly Printer";
+    }
+    
+    // Lowers MI to an MCInst and emits it through the MCStreamer; used when
+    // the -enable-arm-mcinst-printer flag is set.
+    void printInstructionThroughMCStreamer(const MachineInstr *MI);
+    
+
+    void printOperand(const MachineInstr *MI, int OpNum,
+                      const char *Modifier = 0);
+    void printSOImmOperand(const MachineInstr *MI, int OpNum);
+    void printSOImm2PartOperand(const MachineInstr *MI, int OpNum);
+    void printSORegOperand(const MachineInstr *MI, int OpNum);
+    void printAddrMode2Operand(const MachineInstr *MI, int OpNum);
+    void printAddrMode2OffsetOperand(const MachineInstr *MI, int OpNum);
+    void printAddrMode3Operand(const MachineInstr *MI, int OpNum);
+    void printAddrMode3OffsetOperand(const MachineInstr *MI, int OpNum);
+    void printAddrMode4Operand(const MachineInstr *MI, int OpNum,
+                               const char *Modifier = 0);
+    void printAddrMode5Operand(const MachineInstr *MI, int OpNum,
+                               const char *Modifier = 0);
+    void printAddrMode6Operand(const MachineInstr *MI, int OpNum);
+    void printAddrModePCOperand(const MachineInstr *MI, int OpNum,
+                                const char *Modifier = 0);
+    void printBitfieldInvMaskImmOperand (const MachineInstr *MI, int OpNum);
+
+    void printThumbS4ImmOperand(const MachineInstr *MI, int OpNum);
+    void printThumbITMask(const MachineInstr *MI, int OpNum);
+    void printThumbAddrModeRROperand(const MachineInstr *MI, int OpNum);
+    void printThumbAddrModeRI5Operand(const MachineInstr *MI, int OpNum,
+                                      unsigned Scale);
+    void printThumbAddrModeS1Operand(const MachineInstr *MI, int OpNum);
+    void printThumbAddrModeS2Operand(const MachineInstr *MI, int OpNum);
+    void printThumbAddrModeS4Operand(const MachineInstr *MI, int OpNum);
+    void printThumbAddrModeSPOperand(const MachineInstr *MI, int OpNum);
+
+    void printT2SOOperand(const MachineInstr *MI, int OpNum);
+    void printT2AddrModeImm12Operand(const MachineInstr *MI, int OpNum);
+    void printT2AddrModeImm8Operand(const MachineInstr *MI, int OpNum);
+    void printT2AddrModeImm8s4Operand(const MachineInstr *MI, int OpNum);
+    void printT2AddrModeImm8OffsetOperand(const MachineInstr *MI, int OpNum);
+    void printT2AddrModeSoRegOperand(const MachineInstr *MI, int OpNum);
+
+    void printPredicateOperand(const MachineInstr *MI, int OpNum);
+    void printSBitModifierOperand(const MachineInstr *MI, int OpNum);
+    void printPCLabel(const MachineInstr *MI, int OpNum);
+    void printRegisterList(const MachineInstr *MI, int OpNum);
+    void printCPInstOperand(const MachineInstr *MI, int OpNum,
+                            const char *Modifier);
+    void printJTBlockOperand(const MachineInstr *MI, int OpNum);
+    void printJT2BlockOperand(const MachineInstr *MI, int OpNum);
+    void printTBAddrMode(const MachineInstr *MI, int OpNum);
+    void printNoHashImmediate(const MachineInstr *MI, int OpNum);
+    void printVFPf32ImmOperand(const MachineInstr *MI, int OpNum);
+    void printVFPf64ImmOperand(const MachineInstr *MI, int OpNum);
+
+    // printHexNImmOperand - Print an immediate truncated to N bits as a
+    // "#0x..." hexadecimal literal.
+    void printHex8ImmOperand(const MachineInstr *MI, int OpNum) {
+      O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xff);
+    }
+    void printHex16ImmOperand(const MachineInstr *MI, int OpNum) {
+      O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffff);
+    }
+    void printHex32ImmOperand(const MachineInstr *MI, int OpNum) {
+      O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm() & 0xffffffff);
+    }
+    void printHex64ImmOperand(const MachineInstr *MI, int OpNum) {
+      O << "#0x" << utohexstr(MI->getOperand(OpNum).getImm());
+    }
+
+    virtual bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                 unsigned AsmVariant, const char *ExtraCode);
+    virtual bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode);
+
+    void printInstruction(const MachineInstr *MI);  // autogenerated.
+    static const char *getRegisterName(unsigned RegNo);
+
+    virtual void EmitInstruction(const MachineInstr *MI);
+    bool runOnMachineFunction(MachineFunction &F);
+    
+    virtual void EmitConstantPool() {} // we emit constant pools customly!
+    virtual void EmitFunctionEntryLabel();
+    void EmitStartOfAsmFile(Module &M);
+    void EmitEndOfAsmFile(Module &M);
+
+    MCSymbol *GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2,
+                                          const MachineBasicBlock *MBB) const;
+    MCSymbol *GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const;
+
+    /// EmitMachineConstantPoolValue - Print a machine constantpool value to
+    /// the .s file.
+    virtual void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+      // Choose the data directive matching the entry's allocated size.
+      switch (TM.getTargetData()->getTypeAllocSize(MCPV->getType())) {
+      case 1: O << MAI->getData8bitsDirective(0); break;
+      case 2: O << MAI->getData16bitsDirective(0); break;
+      case 4: O << MAI->getData32bitsDirective(0); break;
+      default: assert(0 && "Unknown CPV size");
+      }
+
+      ARMConstantPoolValue *ACPV = static_cast<ARMConstantPoolValue*>(MCPV);
+      SmallString<128> TmpNameStr;
+
+      if (ACPV->isLSDA()) {
+        raw_svector_ostream(TmpNameStr) << MAI->getPrivateGlobalPrefix() <<
+          "_LSDA_" << getFunctionNumber();
+        O << TmpNameStr.str();
+      } else if (ACPV->isBlockAddress()) {
+        O << GetBlockAddressSymbol(ACPV->getBlockAddress())->getName();
+      } else if (ACPV->isGlobalValue()) {
+        GlobalValue *GV = ACPV->getGV();
+        bool isIndirect = Subtarget->isTargetDarwin() &&
+          Subtarget->GVIsIndirectSymbol(GV, TM.getRelocationModel());
+        if (!isIndirect)
+          O << *GetGlobalValueSymbol(GV);
+        else {
+          // FIXME: Remove this when Darwin transition to @GOT like syntax.
+          MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+          O << *Sym;
+          
+          // Record the stub so the $non_lazy_ptr entry is materialized at
+          // end of file.
+          MachineModuleInfoMachO &MMIMachO =
+            MMI->getObjFileInfo<MachineModuleInfoMachO>();
+          MCSymbol *&StubSym =
+            GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(Sym) :
+                                        MMIMachO.getGVStubEntry(Sym);
+          if (StubSym == 0)
+            StubSym = GetGlobalValueSymbol(GV);
+        }
+      } else {
+        assert(ACPV->isExtSymbol() && "unrecognized constant pool value");
+        O << *GetExternalSymbolSymbol(ACPV->getSymbol());
+      }
+
+      // Optional "(modifier)" and PC-relative adjustment suffix.
+      if (ACPV->hasModifier()) O << "(" << ACPV->getModifier() << ")";
+      if (ACPV->getPCAdjustment() != 0) {
+        O << "-(" << MAI->getPrivateGlobalPrefix() << "PC"
+          << getFunctionNumber() << "_"  << ACPV->getLabelId()
+          << "+" << (unsigned)ACPV->getPCAdjustment();
+         if (ACPV->mustAddCurrentAddress())
+           O << "-.";
+         O << ')';
+      }
+      OutStreamer.AddBlankLine();
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AsmPrinter::getAnalysisUsage(AU);
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+      AU.addRequired<DwarfWriter>();
+    }
+  };
+} // end of anonymous namespace
+
+#include "ARMGenAsmWriter.inc"
+
+// EmitFunctionEntryLabel - For Thumb functions, switch the assembler to
+// 16-bit mode and mark the symbol with .thumb_func (Darwin requires the
+// symbol name on the directive) before emitting the entry label.
+void ARMAsmPrinter::EmitFunctionEntryLabel() {
+  if (AFI->isThumbFunction()) {
+    O << "\t.code\t16\n";
+    O << "\t.thumb_func";
+    if (Subtarget->isTargetDarwin())
+      O << '\t' << *CurrentFnSym;
+    O << '\n';
+  }
+  
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+/// runOnMachineFunction - This uses the printInstruction()
+/// method to print assembly for each instruction.
+///
+bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  // Cache per-function state used by the operand printers, then defer to
+  // the common AsmPrinter driver.
+  AFI = MF.getInfo<ARMFunctionInfo>();
+  MCP = MF.getConstantPool();
+
+  return AsmPrinter::runOnMachineFunction(MF);
+}
+
+/// printOperand - Print a generic machine operand (register, immediate,
+/// basic block, global, external symbol, CP or JT index).  The optional
+/// Modifier string selects alternate renderings: "dregpair"/"lane" for
+/// registers, "lo16"/"hi16" for :lower16:/:upper16: immediates and globals,
+/// and "call" to append "(PLT)" on ELF PIC calls.
+void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                 const char *Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  unsigned TF = MO.getTargetFlags();
+
+  switch (MO.getType()) {
+  default:
+    assert(0 && "<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
+      // Print a Q register as its two D subregisters in {lo,hi} list form.
+      unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
+      unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
+      O << '{'
+        << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
+        << '}';
+    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+      // Print an S register as "Dn[lane]" (even regnum -> lane 0, odd -> 1).
+      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+      unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+                                               &ARM::DPR_VFP2RegClass);
+      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
+    } else {
+      assert(!MO.getSubReg() && "Subregs should be eliminated!");
+      O << getRegisterName(Reg);
+    }
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#';
+    if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
+        (TF & ARMII::MO_LO16))
+      O << ":lower16:";
+    else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
+             (TF & ARMII::MO_HI16))
+      O << ":upper16:";
+    O << Imm;
+    break;
+  }
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    GlobalValue *GV = MO.getGlobal();
+
+    if ((Modifier && strcmp(Modifier, "lo16") == 0) ||
+        (TF & ARMII::MO_LO16))
+      O << ":lower16:";
+    else if ((Modifier && strcmp(Modifier, "hi16") == 0) ||
+             (TF & ARMII::MO_HI16))
+      O << ":upper16:";
+    O << *GetGlobalValueSymbol(GV);
+
+    printOffset(MO.getOffset());
+
+    if (isCallOp && Subtarget->isTargetELF() &&
+        TM.getRelocationModel() == Reloc::PIC_)
+      O << "(PLT)";
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isCallOp = Modifier && !strcmp(Modifier, "call");
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    
+    if (isCallOp && Subtarget->isTargetELF() &&
+        TM.getRelocationModel() == Reloc::PIC_)
+      O << "(PLT)";
+    break;
+  }
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *GetCPISymbol(MO.getIndex());
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    O << *GetJTISymbol(MO.getIndex());
+    break;
+  }
+}
+
+/// printSOImm - Print V as an ARM shifter-operand immediate: "#imm" or
+/// "#imm, rot" when a nonzero rotation is required, optionally followed by
+/// a comment showing the effective (rotated) value in verbose-asm mode.
+static void printSOImm(formatted_raw_ostream &O, int64_t V, bool VerboseAsm,
+                       const MCAsmInfo *MAI) {
+  // Break it up into two parts that make up a shifter immediate.
+  V = ARM_AM::getSOImmVal(V);
+  assert(V != -1 && "Not a valid so_imm value!");
+
+  unsigned Imm = ARM_AM::getSOImmValImm(V);
+  unsigned Rot = ARM_AM::getSOImmValRot(V);
+
+  // Print low-level immediate formation info, per
+  // A5.1.3: "Data-processing operands - Immediate".
+  if (Rot) {
+    O << "#" << Imm << ", " << Rot;
+    // Pretty printed version.
+    if (VerboseAsm) {
+      O.PadToColumn(MAI->getCommentColumn());
+      O << MAI->getCommentString() << ' ';
+      O << (int)ARM_AM::rotr32(Imm, Rot);
+    }
+  } else {
+    O << "#" << Imm;
+  }
+}
+
+/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit
+/// immediate in bits 0-7.
+void ARMAsmPrinter::printSOImmOperand(const MachineInstr *MI, int OpNum) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImm() && "Not a valid so_imm value!");
+  printSOImm(O, MO.getImm(), VerboseAsm, MAI);
+}
+
+/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
+/// followed by an 'orr' to materialize.
+// Note this prints the first immediate, then emits the entire trailing
+// "orr<pred> dst, dst, #imm2" line itself; operand indices 0 (dst) and
+// 2 (predicate) of the pseudo-instruction are hard-coded here.
+void ARMAsmPrinter::printSOImm2PartOperand(const MachineInstr *MI, int OpNum) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImm() && "Not a valid so_imm value!");
+  unsigned V1 = ARM_AM::getSOImmTwoPartFirst(MO.getImm());
+  unsigned V2 = ARM_AM::getSOImmTwoPartSecond(MO.getImm());
+  printSOImm(O, V1, VerboseAsm, MAI);
+  O << "\n\torr";
+  printPredicateOperand(MI, 2);
+  O << "\t";
+  printOperand(MI, 0);
+  O << ", ";
+  printOperand(MI, 0);
+  O << ", ";
+  printSOImm(O, V2, VerboseAsm, MAI);
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMAsmPrinter::printSORegOperand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+  O << getRegisterName(MO1.getReg());
+
+  // Print the shift opc.
+  O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()))
+    << " ";
+
+  // Register-shifted form if MO2 holds a register; otherwise an immediate
+  // shift amount is packed into MO3.
+  if (MO2.getReg()) {
+    O << getRegisterName(MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+  } else {
+    O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+  }
+}
+
+// printAddrMode2Operand - Print an addrmode2 (base + reg/imm offset with
+// optional shift) as "[Rn, ...]".  Operands: base reg, offset reg, packed
+// AM2 opcode/offset/shift immediate.
+void ARMAsmPrinter::printAddrMode2Operand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+
+  O << "[" << getRegisterName(MO1.getReg());
+
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm()))  // Don't print +0.
+      O << ", #"
+        << (char)ARM_AM::getAM2Op(MO3.getImm())
+        << ARM_AM::getAM2Offset(MO3.getImm());
+    O << "]";
+    return;
+  }
+
+  // Register offset, with optional shift.
+  O << ", "
+    << (char)ARM_AM::getAM2Op(MO3.getImm())
+    << getRegisterName(MO2.getReg());
+
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
+    O << ", "
+      << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
+      << " #" << ShImm;
+  O << "]";
+}
+
+// printAddrMode2OffsetOperand - Print just the post/pre-indexed offset part
+// of an addrmode2 operand (used by the writeback forms).
+void ARMAsmPrinter::printAddrMode2OffsetOperand(const MachineInstr *MI, int Op){
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    assert(ImmOffs && "Malformed indexed load / store!");
+    O << "#"
+      << (char)ARM_AM::getAM2Op(MO2.getImm())
+      << ImmOffs;
+    return;
+  }
+
+  O << (char)ARM_AM::getAM2Op(MO2.getImm())
+    << getRegisterName(MO1.getReg());
+
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
+    O << ", "
+      << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
+      << " #" << ShImm;
+}
+
+// printAddrMode3Operand - Print an addrmode3 (base + reg or 8-bit imm
+// offset) as "[Rn, ...]".
+void ARMAsmPrinter::printAddrMode3Operand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+  assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+  O << "[" << getRegisterName(MO1.getReg());
+
+  if (MO2.getReg()) {
+    O << ", "
+      << (char)ARM_AM::getAM3Op(MO3.getImm())
+      << getRegisterName(MO2.getReg())
+      << "]";
+    return;
+  }
+
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
+    O << ", #"
+      << (char)ARM_AM::getAM3Op(MO3.getImm())
+      << ImmOffs;
+  O << "]";
+}
+
+// printAddrMode3OffsetOperand - Print just the offset part of an addrmode3
+// operand (used by the pre/post-indexed writeback forms).
+void ARMAsmPrinter::printAddrMode3OffsetOperand(const MachineInstr *MI, int Op){
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+  if (MO1.getReg()) {
+    O << (char)ARM_AM::getAM3Op(MO2.getImm())
+      << getRegisterName(MO1.getReg());
+    return;
+  }
+
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  assert(ImmOffs && "Malformed indexed load / store!");
+  O << "#"
+    << (char)ARM_AM::getAM3Op(MO2.getImm())
+    << ImmOffs;
+}
+
+// printAddrMode4Operand - Print an addrmode4 (load/store-multiple) operand.
+// With Modifier "submode" the ia/ib/da/db suffix is printed (using the SP
+// push/pop aliases when the base is SP); with "wide" a ".w" is printed for
+// the ia submode; otherwise the base register plus optional "!" writeback
+// marker is printed.
+void ARMAsmPrinter::printAddrMode4Operand(const MachineInstr *MI, int Op,
+                                          const char *Modifier) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    if (MO1.getReg() == ARM::SP) {
+      // FIXME
+      bool isLDM = (MI->getOpcode() == ARM::LDM ||
+                    MI->getOpcode() == ARM::LDM_RET ||
+                    MI->getOpcode() == ARM::t2LDM ||
+                    MI->getOpcode() == ARM::t2LDM_RET);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+  } else if (Modifier && strcmp(Modifier, "wide") == 0) {
+    // Reuse 'Mode' computed above; the original redeclared a shadowing
+    // local with the identical initializer here.
+    if (Mode == ARM_AM::ia)
+      O << ".w";
+  } else {
+    printOperand(MI, Op);
+    if (ARM_AM::getAM4WBFlag(MO2.getImm()))
+      O << "!";
+  }
+}
+
+// printAddrMode5Operand - Print an addrmode5 (VFP load/store) operand as
+// "[Rn, #+/-imm*4]".  Modifier "submode" prints only the ia/db suffix;
+// "base" prints only the base register plus writeback "!".
+void ARMAsmPrinter::printAddrMode5Operand(const MachineInstr *MI, int Op,
+                                          const char *Modifier) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+
+  assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm());
+    O << ARM_AM::getAMSubModeStr(Mode);
+    return;
+  } else if (Modifier && strcmp(Modifier, "base") == 0) {
+    // Used for FSTM{D|S} and LSTM{D|S} operations.
+    O << getRegisterName(MO1.getReg());
+    if (ARM_AM::getAM5WBFlag(MO2.getImm()))
+      O << "!";
+    return;
+  }
+
+  O << "[" << getRegisterName(MO1.getReg());
+
+  // AM5 stores the offset in words; scale back to bytes for printing.
+  if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+    O << ", #"
+      << (char)ARM_AM::getAM5Op(MO2.getImm())
+      << ImmOffs*4;
+  }
+  O << "]";
+}
+
+// printAddrMode6Operand - Print an addrmode6 (NEON load/store) operand:
+// "[Rn, :align]" plus either "!" (writeback with no increment register) or
+// ", Rm" (writeback by register) when the WB flag is set.
+void ARMAsmPrinter::printAddrMode6Operand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+  const MachineOperand &MO4 = MI->getOperand(Op+3);
+
+  O << "[" << getRegisterName(MO1.getReg());
+  if (MO4.getImm()) {
+    // FIXME: Both darwin as and GNU as violate ARM docs here.
+    O << ", :" << MO4.getImm();
+  }
+  O << "]";
+
+  if (ARM_AM::getAM6WBFlag(MO3.getImm())) {
+    if (MO2.getReg() == 0)
+      O << "!";
+    else
+      O << ", " << getRegisterName(MO2.getReg());
+  }
+}
+
+// printAddrModePCOperand - Print a PC-relative operand as "[pc, +Rn]", or
+// just the associated PC label when Modifier is "label".
+void ARMAsmPrinter::printAddrModePCOperand(const MachineInstr *MI, int Op,
+                                           const char *Modifier) {
+  if (Modifier && strcmp(Modifier, "label") == 0) {
+    printPCLabel(MI, Op+1);
+    return;
+  }
+
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  assert(TargetRegisterInfo::isPhysicalRegister(MO1.getReg()));
+  O << "[pc, +" << getRegisterName(MO1.getReg()) << "]";
+}
+
+// printBitfieldInvMaskImmOperand - The operand is the bit-inverted mask of a
+// bitfield instruction (BFC/BFI); print it as "#lsb, #width" describing the
+// field covered by the zero bits of the mask.
+void
+ARMAsmPrinter::printBitfieldInvMaskImmOperand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO = MI->getOperand(Op);
+  // Check the operand kind *before* reading it; the original only asserted
+  // after getImm() had already been called.
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = CountTrailingZeros_32(v);
+  int32_t width = (32 - CountLeadingZeros_32(v)) - lsb;
+  O << "#" << lsb << ", #" << width;
+}
+
+//===--------------------------------------------------------------------===//
+
+// printThumbS4ImmOperand - Print a word-scaled Thumb immediate as a byte
+// offset (value * 4).
+void ARMAsmPrinter::printThumbS4ImmOperand(const MachineInstr *MI, int Op) {
+  O << "#" <<  MI->getOperand(Op).getImm() * 4;
+}
+
+// printThumbITMask - Print the then/else suffix string ("t"/"e" letters) of
+// a Thumb2 IT instruction's mask operand.
+void
+ARMAsmPrinter::printThumbITMask(const MachineInstr *MI, int Op) {
+  // (3 - the number of trailing zeros) is the number of then / else.
+  unsigned Mask = MI->getOperand(Op).getImm();
+  unsigned NumTZ = CountTrailingZeros_32(Mask);
+  assert(NumTZ <= 3 && "Invalid IT mask!");
+  for (unsigned Pos = 3, e = NumTZ; Pos > e; --Pos) {
+    // NOTE(review): 't' is printed when the mask bit is clear -- confirm
+    // this polarity matches the encoding produced by the Thumb2 IT pass.
+    bool T = (Mask & (1 << Pos)) == 0;
+    if (T)
+      O << 't';
+    else
+      O << 'e';
+  }
+}
+
+// printThumbAddrModeRROperand - Print a Thumb register+register address as
+// "[Rn, Rm]".
+void
+ARMAsmPrinter::printThumbAddrModeRROperand(const MachineInstr *MI, int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  O << "[" << getRegisterName(MO1.getReg());
+  O << ", " << getRegisterName(MO2.getReg()) << "]";
+}
+
+// printThumbAddrModeRI5Operand - Print a Thumb base + (reg | 5-bit imm)
+// address; the immediate is scaled by Scale (1/2/4 for the S1/S2/S4 forms).
+void
+ARMAsmPrinter::printThumbAddrModeRI5Operand(const MachineInstr *MI, int Op,
+                                            unsigned Scale) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  const MachineOperand &MO3 = MI->getOperand(Op+2);
+
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+
+  O << "[" << getRegisterName(MO1.getReg());
+  if (MO3.getReg())
+    O << ", " << getRegisterName(MO3.getReg());
+  else if (unsigned ImmOffs = MO2.getImm())
+    O << ", #+" << ImmOffs * Scale;
+  O << "]";
+}
+
+// Scale-specific wrappers around printThumbAddrModeRI5Operand for byte,
+// halfword and word accesses respectively.
+void
+ARMAsmPrinter::printThumbAddrModeS1Operand(const MachineInstr *MI, int Op) {
+  printThumbAddrModeRI5Operand(MI, Op, 1);
+}
+void
+ARMAsmPrinter::printThumbAddrModeS2Operand(const MachineInstr *MI, int Op) {
+  printThumbAddrModeRI5Operand(MI, Op, 2);
+}
+void
+ARMAsmPrinter::printThumbAddrModeS4Operand(const MachineInstr *MI, int Op) {
+  printThumbAddrModeRI5Operand(MI, Op, 4);
+}
+
+// printThumbAddrModeSPOperand - Print an SP-relative Thumb address as
+// "[Rn, #+imm*4]" (word-scaled immediate).
+void ARMAsmPrinter::printThumbAddrModeSPOperand(const MachineInstr *MI,int Op) {
+  const MachineOperand &MO1 = MI->getOperand(Op);
+  const MachineOperand &MO2 = MI->getOperand(Op+1);
+  O << "[" << getRegisterName(MO1.getReg());
+  if (unsigned ImmOffs = MO2.getImm())
+    O << ", #+" << ImmOffs*4;
+  O << "]";
+}
+
+//===--------------------------------------------------------------------===//
+
+// Constant shifts t2_so_reg is a 2-operand unit corresponding to the Thumb2
+// register with shift forms.
+// REG 0   0           - e.g. R5
+// REG IMM, SH_OPC     - e.g. R5, LSL #3
+void ARMAsmPrinter::printT2SOOperand(const MachineInstr *MI, int OpNum) {
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
+
+  unsigned Reg = MO1.getReg();
+  assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+  O << getRegisterName(Reg);
+
+  // Print the shift opc.
+  O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO2.getImm()))
+    << " ";
+
+  // Thumb2 only supports immediate shift amounts here (no register shifts).
+  assert(MO2.isImm() && "Not a valid t2_so_reg value!");
+  O << "#" << ARM_AM::getSORegOffset(MO2.getImm());
+}
+
+// printT2AddrModeImm12Operand - Print a Thumb2 base + unsigned 12-bit
+// immediate address as "[Rn, #+imm]".
+void ARMAsmPrinter::printT2AddrModeImm12Operand(const MachineInstr *MI,
+                                                int OpNum) {
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << "[" << getRegisterName(MO1.getReg());
+
+  unsigned OffImm = MO2.getImm();
+  if (OffImm)  // Don't print +0.
+    O << ", #+" << OffImm;
+  O << "]";
+}
+
+// printT2AddrModeImm8Operand - Print a Thumb2 base + signed 8-bit immediate
+// address as "[Rn, #+/-imm]".
+void ARMAsmPrinter::printT2AddrModeImm8Operand(const MachineInstr *MI,
+                                               int OpNum) {
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << "[" << getRegisterName(MO1.getReg());
+
+  int32_t OffImm = (int32_t)MO2.getImm();
+  // Don't print +0.
+  if (OffImm < 0)
+    O << ", #-" << -OffImm;
+  else if (OffImm > 0)
+    O << ", #+" << OffImm;
+  O << "]";
+}
+
+// printT2AddrModeImm8s4Operand - Print a Thumb2 base + signed, word-scaled
+// 8-bit immediate address as "[Rn, #+/-imm]".
+void ARMAsmPrinter::printT2AddrModeImm8s4Operand(const MachineInstr *MI,
+                                                 int OpNum) {
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
+
+  O << "[" << getRegisterName(MO1.getReg());
+
+  // NOTE(review): dividing by 4 and multiplying back truncates any offset
+  // that is not a multiple of 4 -- presumably the operand is always
+  // word-aligned here; confirm against the instruction definitions.
+  int32_t OffImm = (int32_t)MO2.getImm() / 4;
+  // Don't print +0.
+  if (OffImm < 0)
+    O << ", #-" << -OffImm * 4;
+  else if (OffImm > 0)
+    O << ", #+" << OffImm * 4;
+  O << "]";
+}
+
+// printT2AddrModeImm8OffsetOperand - Print just the signed 8-bit offset of
+// a Thumb2 pre/post-indexed address (no brackets, no base register).
+void ARMAsmPrinter::printT2AddrModeImm8OffsetOperand(const MachineInstr *MI,
+                                                     int OpNum) {
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  int32_t OffImm = (int32_t)MO1.getImm();
+  // Don't print +0.
+  if (OffImm < 0)
+    O << "#-" << -OffImm;
+  else if (OffImm > 0)
+    O << "#+" << OffImm;
+}
+
+// printT2AddrModeSoRegOperand - Print a Thumb2 base + shifted-register
+// address as "[Rn, Rm, lsl #sh]" (shift amount 0-3, lsl only).
+void ARMAsmPrinter::printT2AddrModeSoRegOperand(const MachineInstr *MI,
+                                                int OpNum) {
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1);
+  const MachineOperand &MO3 = MI->getOperand(OpNum+2);
+
+  O << "[" << getRegisterName(MO1.getReg());
+
+  assert(MO2.getReg() && "Invalid so_reg load / store address!");
+  O << ", " << getRegisterName(MO2.getReg());
+
+  unsigned ShAmt = MO3.getImm();
+  if (ShAmt) {
+    assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!");
+    O << ", lsl #" << ShAmt;
+  }
+  O << "]";
+}
+
+
+//===--------------------------------------------------------------------===//
+
+// printPredicateOperand - Print the condition-code suffix ("eq", "ne", ...)
+// for a predicated instruction; AL (always) prints nothing.
+void ARMAsmPrinter::printPredicateOperand(const MachineInstr *MI, int OpNum) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+// printSBitModifierOperand - Print the 's' suffix when the instruction's
+// optional CPSR-def operand is present (i.e. the flags-setting form).
+void ARMAsmPrinter::printSBitModifierOperand(const MachineInstr *MI, int OpNum){
+  unsigned Reg = MI->getOperand(OpNum).getReg();
+  if (Reg) {
+    assert(Reg == ARM::CPSR && "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+// printPCLabel - Print the private "<prefix>PC<fn>_<id>" label used for
+// PC-relative address computations (picadd etc.).
+void ARMAsmPrinter::printPCLabel(const MachineInstr *MI, int OpNum) {
+  int Id = (int)MI->getOperand(OpNum).getImm();
+  O << MAI->getPrivateGlobalPrefix()
+    << "PC" << getFunctionNumber() << "_" << Id;
+}
+
+// printRegisterList - Print the "{r0, r1, ...}" operand of a load/store
+// multiple, skipping implicit operands.
+void ARMAsmPrinter::printRegisterList(const MachineInstr *MI, int OpNum) {
+  O << "{";
+  // Always skip the first operand, it's the optional (and implicit writeback).
+  for (unsigned i = OpNum+1, e = MI->getNumOperands(); i != e; ++i) {
+    if (MI->getOperand(i).isImplicit())
+      continue;
+    if ((int)i != OpNum+1) O << ", ";
+    printOperand(MI, i);
+  }
+  O << "}";
+}
+
+// printCPInstOperand - Print one aspect of a CONSTANTPOOL_ENTRY pseudo:
+// Modifier "label" emits the pool entry's label, "cpentry" emits its data.
+void ARMAsmPrinter::printCPInstOperand(const MachineInstr *MI, int OpNum,
+                                       const char *Modifier) {
+  assert(Modifier && "This operand only works with a modifier!");
+  // There are two aspects to a CONSTANTPOOL_ENTRY operand, the label and the
+  // data itself.
+  if (!strcmp(Modifier, "label")) {
+    unsigned ID = MI->getOperand(OpNum).getImm();
+    OutStreamer.EmitLabel(GetCPISymbol(ID));
+  } else {
+    assert(!strcmp(Modifier, "cpentry") && "Unknown modifier for CPE");
+    unsigned CPI = MI->getOperand(OpNum).getIndex();
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPI];
+
+    if (MCPE.isMachineConstantPoolEntry()) {
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    } else {
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    }
+  }
+}
+
+// GetARMSetPICJumpTableLabel2 - Create the "<prefix><fn>_<uid>_<uid2>_set_<bb>"
+// symbol used with .set for PIC jump-table entries.
+MCSymbol *ARMAsmPrinter::
+GetARMSetPICJumpTableLabel2(unsigned uid, unsigned uid2,
+                            const MachineBasicBlock *MBB) const {
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix()
+    << getFunctionNumber() << '_' << uid << '_' << uid2
+    << "_set_" << MBB->getNumber();
+  return OutContext.GetOrCreateSymbol(Name.str());
+}
+
+// GetARMJTIPICJumpTableLabel2 - Create the "<prefix>JTI<fn>_<uid>_<uid2>"
+// symbol marking the start of a jump table.
+MCSymbol *ARMAsmPrinter::
+GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const {
+  SmallString<60> Name;
+  raw_svector_ostream(Name) << MAI->getPrivateGlobalPrefix() << "JTI"
+    << getFunctionNumber() << '_' << uid << '_' << uid2;
+  return OutContext.GetOrCreateSymbol(Name.str());
+}
+
+// printJTBlockOperand - Emit an inline (ARM-mode) jump table: the JTI label
+// followed by one 32-bit entry per target block.  For PIC, entries are
+// block-minus-table differences, emitted via .set when available.
+void ARMAsmPrinter::printJTBlockOperand(const MachineInstr *MI, int OpNum) {
+  assert(!Subtarget->isThumb2() && "Thumb2 should use double-jump jumptables!");
+
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  
+  unsigned JTI = MO1.getIndex();
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  const char *JTEntryDirective = MAI->getData32bitsDirective();
+
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  bool UseSet= MAI->hasSetDirective() && TM.getRelocationModel() == Reloc::PIC_;
+  SmallPtrSet<MachineBasicBlock*, 8> JTSets;
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    // Only emit the .set for a block the first time it appears in the table.
+    bool isNew = JTSets.insert(MBB);
+
+    if (UseSet && isNew) {
+      O << "\t.set\t"
+        << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB) << ','
+        << *MBB->getSymbol(OutContext) << '-' << *JTISymbol << '\n';
+    }
+
+    O << JTEntryDirective << ' ';
+    if (UseSet)
+      O << *GetARMSetPICJumpTableLabel2(JTI, MO2.getImm(), MBB);
+    else if (TM.getRelocationModel() == Reloc::PIC_)
+      O << *MBB->getSymbol(OutContext) << '-' << *JTISymbol;
+    else
+      O << *MBB->getSymbol(OutContext);
+
+    if (i != e-1)
+      O << '\n';
+  }
+}
+
+void ARMAsmPrinter::printJT2BlockOperand(const MachineInstr *MI, int OpNum) {
+  const MachineOperand &MO1 = MI->getOperand(OpNum);
+  const MachineOperand &MO2 = MI->getOperand(OpNum+1); // Unique Id
+  unsigned JTI = MO1.getIndex();
+  
+  MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel2(JTI, MO2.getImm());
+  OutStreamer.EmitLabel(JTISymbol);
+
+  const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+  const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+  const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+  bool ByteOffset = false, HalfWordOffset = false;
+  if (MI->getOpcode() == ARM::t2TBB)
+    ByteOffset = true;
+  else if (MI->getOpcode() == ARM::t2TBH)
+    HalfWordOffset = true;
+
+  for (unsigned i = 0, e = JTBBs.size(); i != e; ++i) {
+    MachineBasicBlock *MBB = JTBBs[i];
+    if (ByteOffset)
+      O << MAI->getData8bitsDirective();
+    else if (HalfWordOffset)
+      O << MAI->getData16bitsDirective();
+    
+    if (ByteOffset || HalfWordOffset)
+      O << '(' << *MBB->getSymbol(OutContext) << "-" << *JTISymbol << ")/2";
+    else
+      O << "\tb.w " << *MBB->getSymbol(OutContext);
+
+    if (i != e-1)
+      O << '\n';
+  }
+
+  // Make sure the instruction that follows TBB is 2-byte aligned.
+  // FIXME: Constant island pass should insert an "ALIGN" instruction instead.
+  if (ByteOffset && (JTBBs.size() & 1)) {
+    O << '\n';
+    EmitAlignment(1);
+  }
+}
+
+void ARMAsmPrinter::printTBAddrMode(const MachineInstr *MI, int OpNum) {
+  O << "[pc, " << getRegisterName(MI->getOperand(OpNum).getReg());
+  if (MI->getOpcode() == ARM::t2TBH)
+    O << ", lsl #1";
+  O << ']';
+}
+
+void ARMAsmPrinter::printNoHashImmediate(const MachineInstr *MI, int OpNum) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+void ARMAsmPrinter::printVFPf32ImmOperand(const MachineInstr *MI, int OpNum) {
+  const ConstantFP *FP = MI->getOperand(OpNum).getFPImm();
+  O << '#' << FP->getValueAPF().convertToFloat();
+  if (VerboseAsm) {
+    O.PadToColumn(MAI->getCommentColumn());
+    O << MAI->getCommentString() << ' ';
+    WriteAsOperand(O, FP, /*PrintType=*/false);
+  }
+}
+
+void ARMAsmPrinter::printVFPf64ImmOperand(const MachineInstr *MI, int OpNum) {
+  const ConstantFP *FP = MI->getOperand(OpNum).getFPImm();
+  O << '#' << FP->getValueAPF().convertToDouble();
+  if (VerboseAsm) {
+    O.PadToColumn(MAI->getCommentColumn());
+    O << MAI->getCommentString() << ' ';
+    WriteAsOperand(O, FP, /*PrintType=*/false);
+  }
+}
+
+bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                                    unsigned AsmVariant, const char *ExtraCode){
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'a': // Print as a memory address.
+      if (MI->getOperand(OpNum).isReg()) {
+        O << "[" << getRegisterName(MI->getOperand(OpNum).getReg()) << "]";
+        return false;
+      }
+      // Fallthrough
+    case 'c': // Don't print "#" before an immediate operand.
+      if (!MI->getOperand(OpNum).isImm())
+        return true;
+      printNoHashImmediate(MI, OpNum);
+      return false;
+    case 'P': // Print a VFP double precision register.
+    case 'q': // Print a NEON quad precision register.
+      printOperand(MI, OpNum);
+      return false;
+    case 'Q':
+      if (TM.getTargetData()->isLittleEndian())
+        break;
+      // Fallthrough
+    case 'R':
+      if (TM.getTargetData()->isBigEndian())
+        break;
+      // Fallthrough
+    case 'H': // Write second word of DI / DF reference.
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNum).isReg() ||
+          OpNum+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNum+1).isReg())
+        return true;
+      ++OpNum;   // Return the high-part.
+    }
+  }
+
+  printOperand(MI, OpNum);
+  return false;
+}
+
+bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNum, unsigned AsmVariant,
+                                          const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << getRegisterName(MO.getReg()) << "]";
+  return false;
+}
+
+void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (EnableMCInst) {
+    printInstructionThroughMCStreamer(MI);
+  } else {
+    int Opc = MI->getOpcode();
+    if (Opc == ARM::CONSTPOOL_ENTRY)
+      EmitAlignment(2);
+    
+    printInstruction(MI);
+    OutStreamer.AddBlankLine();
+  }
+}
+
+void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (Subtarget->isTargetDarwin()) {
+    Reloc::Model RelocM = TM.getRelocationModel();
+    if (RelocM == Reloc::PIC_ || RelocM == Reloc::DynamicNoPIC) {
+      // Declare all the text sections up front (before the DWARF sections
+      // emitted by AsmPrinter::doInitialization) so the assembler will keep
+      // them together at the beginning of the object file.  This helps
+      // avoid out-of-range branches that are due to a fundamental limitation of
+      // the way symbol offsets are encoded with the current Darwin ARM
+      // relocations.
+      TargetLoweringObjectFileMachO &TLOFMacho = 
+        static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+      OutStreamer.SwitchSection(TLOFMacho.getTextSection());
+      OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
+      OutStreamer.SwitchSection(TLOFMacho.getConstTextCoalSection());
+      if (RelocM == Reloc::DynamicNoPIC) {
+        const MCSection *sect =
+          TLOFMacho.getMachOSection("__TEXT", "__symbol_stub4",
+                                    MCSectionMachO::S_SYMBOL_STUBS,
+                                    12, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      } else {
+        const MCSection *sect =
+          TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub4",
+                                    MCSectionMachO::S_SYMBOL_STUBS,
+                                    16, SectionKind::getText());
+        OutStreamer.SwitchSection(sect);
+      }
+    }
+  }
+
+  // Use unified assembler syntax.
+  O << "\t.syntax unified\n";
+
+  // Emit ARM Build Attributes
+  if (Subtarget->isTargetELF()) {
+    // CPU Type
+    std::string CPUString = Subtarget->getCPUString();
+    if (CPUString != "generic")
+      O << "\t.cpu " << CPUString << '\n';
+
+    // FIXME: Emit FPU type
+    if (Subtarget->hasVFP2())
+      O << "\t.eabi_attribute " << ARMBuildAttrs::VFP_arch << ", 2\n";
+
+    // Signal various FP modes.
+    if (!UnsafeFPMath)
+      O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_denormal << ", 1\n"
+        << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_exceptions << ", 1\n";
+
+    if (FiniteOnlyFPMath())
+      O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_number_model << ", 1\n";
+    else
+      O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_FP_number_model << ", 3\n";
+
+    // 8-bytes alignment stuff.
+    O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_needed << ", 1\n"
+      << "\t.eabi_attribute " << ARMBuildAttrs::ABI_align8_preserved << ", 1\n";
+
+    // Hard float.  Use both S and D registers and conform to AAPCS-VFP.
+    if (Subtarget->isAAPCS_ABI() && FloatABIType == FloatABI::Hard)
+      O << "\t.eabi_attribute " << ARMBuildAttrs::ABI_HardFP_use << ", 3\n"
+        << "\t.eabi_attribute " << ARMBuildAttrs::ABI_VFP_args << ", 1\n";
+
+    // FIXME: Should we signal R9 usage?
+  }
+}
+
+
+void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetDarwin()) {
+    // All darwin targets use mach-o.
+    TargetLoweringObjectFileMachO &TLOFMacho =
+      static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+    O << '\n';
+
+    // Output non-lazy-pointers for external and common global variables.
+    MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetGVStubList();
+    
+    if (!Stubs.empty()) {
+      // Switch with ".non_lazy_symbol_pointer" directive.
+      OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
+      EmitAlignment(2);
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        O << *Stubs[i].first << ":\n\t.indirect_symbol ";
+        O << *Stubs[i].second << "\n\t.long\t0\n";
+      }
+    }
+
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+      EmitAlignment(2);
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i)
+        O << *Stubs[i].first << ":\n\t.long " << *Stubs[i].second << "\n";
+    }
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+
+void ARMAsmPrinter::printInstructionThroughMCStreamer(const MachineInstr *MI) {
+  ARMMCInstLower MCInstLowering(OutContext, *Mang, *this);
+  switch (MI->getOpcode()) {
+  case ARM::t2MOVi32imm:
+    assert(0 && "Should be lowered by thumb2it pass");
+  default: break;
+  case ARM::PICADD: { // FIXME: Remove asm string from td file.
+    // This is a pseudo op for a label + instruction sequence, which looks like:
+    // LPC0:
+    //     add r0, pc, r0
+    // This adds the address of LPC0 to r0.
+    
+    // Emit the label.
+    // FIXME: MOVE TO SHARED PLACE.
+    unsigned Id = (unsigned)MI->getOperand(2).getImm();
+    const char *Prefix = MAI->getPrivateGlobalPrefix();
+    MCSymbol *Label =OutContext.GetOrCreateSymbol(Twine(Prefix)
+                         + "PC" + Twine(getFunctionNumber()) + "_" + Twine(Id));
+    OutStreamer.EmitLabel(Label);
+    
+    
+    // Form and emit the add.
+    MCInst AddInst;
+    AddInst.setOpcode(ARM::ADDrr);
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    AddInst.addOperand(MCOperand::CreateReg(ARM::PC));
+    AddInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+    OutStreamer.EmitInstruction(AddInst);
+    return;
+  }
+  case ARM::CONSTPOOL_ENTRY: { // FIXME: Remove asm string from td file.
+    /// CONSTPOOL_ENTRY - This instruction represents an entry in the constant
+    /// pool of the function.  The first operand is the ID# for this instruction, the
+    /// second is the index into the MachineConstantPool that this is, the third
+    /// is the size in bytes of this constant pool entry.
+    unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
+    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+
+    EmitAlignment(2);
+    OutStreamer.EmitLabel(GetCPISymbol(LabelId));
+
+    const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
+    if (MCPE.isMachineConstantPoolEntry())
+      EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+    else
+      EmitGlobalConstant(MCPE.Val.ConstVal);
+    
+    return;
+  }
+  case ARM::MOVi2pieces: { // FIXME: Remove asmstring from td file.
+    // This is a hack that lowers as a two instruction sequence.
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
+
+    unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
+    unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));
+      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV1));
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+
+      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::ORRri);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // dstreg
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));     // inreg
+      TmpInst.addOperand(MCOperand::CreateImm(SOImmValV2)); // so_imm
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      TmpInst.addOperand(MCOperand::CreateReg(0));          // cc_out
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    return; 
+  }
+  case ARM::MOVi32imm: { // FIXME: Remove asmstring from td file.
+    // This is a hack that lowers as a two instruction sequence.
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned ImmVal = (unsigned)MI->getOperand(1).getImm();
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVi16);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
+      TmpInst.addOperand(MCOperand::CreateImm(ImmVal & 65535)); // lower16(imm)
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    
+    {
+      MCInst TmpInst;
+      TmpInst.setOpcode(ARM::MOVTi16);
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // dstreg
+      TmpInst.addOperand(MCOperand::CreateReg(DstReg));         // srcreg
+      TmpInst.addOperand(MCOperand::CreateImm(ImmVal >> 16));   // upper16(imm)
+      
+      // Predicate.
+      TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(2).getImm()));
+      TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(3).getReg()));
+      
+      OutStreamer.EmitInstruction(TmpInst);
+    }
+    
+    return;
+  }
+  }
+      
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+static MCInstPrinter *createARMMCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             raw_ostream &O) {
+  if (SyntaxVariant == 0)
+    return new ARMInstPrinter(O, MAI, false);
+  return 0;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeARMAsmPrinter() {
+  RegisterAsmPrinter<ARMAsmPrinter> X(TheARMTarget);
+  RegisterAsmPrinter<ARMAsmPrinter> Y(TheThumbTarget);
+
+  TargetRegistry::RegisterMCInstPrinter(TheARMTarget, createARMMCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheThumbTarget, createARMMCInstPrinter);
+}
+
diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp
new file mode 100644
index 0000000..d7d8e09
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.cpp
@@ -0,0 +1,356 @@
+//===-- ARMInstPrinter.cpp - Convert ARM MCInst to assembly syntax --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "ARM.h" // FIXME: FACTOR ENUMS BETTER.
+#include "ARMInstPrinter.h"
+#include "ARMAddressingModes.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#define ARMAsmPrinter ARMInstPrinter  // FIXME: REMOVE.
+#include "ARMGenAsmWriter.inc"
+#undef MachineInstr
+#undef ARMAsmPrinter
+
+void ARMInstPrinter::printInst(const MCInst *MI) { printInstruction(MI); }
+
+void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                  const char *Modifier) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    if (Modifier && strcmp(Modifier, "dregpair") == 0) {
+      // FIXME: Breaks e.g. ARM/vmul.ll.
+      assert(0);
+      /*
+      unsigned DRegLo = TRI->getSubReg(Reg, 5); // arm_dsubreg_0
+      unsigned DRegHi = TRI->getSubReg(Reg, 6); // arm_dsubreg_1
+      O << '{'
+      << getRegisterName(DRegLo) << ',' << getRegisterName(DRegHi)
+      << '}';*/
+    } else if (Modifier && strcmp(Modifier, "lane") == 0) {
+      assert(0);
+      /*
+      unsigned RegNum = ARMRegisterInfo::getRegisterNumbering(Reg);
+      unsigned DReg = TRI->getMatchingSuperReg(Reg, RegNum & 1 ? 2 : 1,
+                                               &ARM::DPR_VFP2RegClass);
+      O << getRegisterName(DReg) << '[' << (RegNum & 1) << ']';
+       */
+    } else {
+      O << getRegisterName(Reg);
+    }
+  } else if (Op.isImm()) {
+    assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+    O << '#' << Op.getImm();
+  } else {
+    assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+static void printSOImm(raw_ostream &O, int64_t V, bool VerboseAsm,
+                       const MCAsmInfo *MAI) {
+  // Break it up into two parts that make up a shifter immediate.
+  V = ARM_AM::getSOImmVal(V);
+  assert(V != -1 && "Not a valid so_imm value!");
+  
+  unsigned Imm = ARM_AM::getSOImmValImm(V);
+  unsigned Rot = ARM_AM::getSOImmValRot(V);
+  
+  // Print low-level immediate formation info, per
+  // A5.1.3: "Data-processing operands - Immediate".
+  if (Rot) {
+    O << "#" << Imm << ", " << Rot;
+    // Pretty printed version.
+    if (VerboseAsm)
+      O << ' ' << MAI->getCommentString()
+      << ' ' << (int)ARM_AM::rotr32(Imm, Rot);
+  } else {
+    O << "#" << Imm;
+  }
+}
+
+
+/// printSOImmOperand - SOImm is 4-bit rotate amount in bits 8-11 with 8-bit
+/// immediate in bits 0-7.
+void ARMInstPrinter::printSOImmOperand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  assert(MO.isImm() && "Not a valid so_imm value!");
+  printSOImm(O, MO.getImm(), VerboseAsm, &MAI);
+}
+
+/// printSOImm2PartOperand - SOImm is broken into two pieces using a 'mov'
+/// followed by an 'orr' to materialize.
+void ARMInstPrinter::printSOImm2PartOperand(const MCInst *MI, unsigned OpNum) {
+  // FIXME: REMOVE this method.
+  abort();
+}
+
+// so_reg is a 4-operand unit corresponding to register forms of the A5.1
+// "Addressing Mode 1 - Data-processing operands" forms.  This includes:
+//    REG 0   0           - e.g. R5
+//    REG REG 0,SH_OPC    - e.g. R5, ROR R3
+//    REG 0   IMM,SH_OPC  - e.g. R5, LSL #3
+void ARMInstPrinter::printSORegOperand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  O << getRegisterName(MO1.getReg());
+  
+  // Print the shift opc.
+  O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(MO3.getImm()))
+    << ' ';
+  
+  if (MO2.getReg()) {
+    O << getRegisterName(MO2.getReg());
+    assert(ARM_AM::getSORegOffset(MO3.getImm()) == 0);
+  } else {
+    O << "#" << ARM_AM::getSORegOffset(MO3.getImm());
+  }
+}
+
+
+void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op) {
+  const MCOperand &MO1 = MI->getOperand(Op);
+  const MCOperand &MO2 = MI->getOperand(Op+1);
+  const MCOperand &MO3 = MI->getOperand(Op+2);
+  
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, Op);
+    return;
+  }
+  
+  O << "[" << getRegisterName(MO1.getReg());
+  
+  if (!MO2.getReg()) {
+    if (ARM_AM::getAM2Offset(MO3.getImm()))  // Don't print +0.
+      O << ", #"
+      << (char)ARM_AM::getAM2Op(MO3.getImm())
+      << ARM_AM::getAM2Offset(MO3.getImm());
+    O << "]";
+    return;
+  }
+  
+  O << ", "
+  << (char)ARM_AM::getAM2Op(MO3.getImm())
+  << getRegisterName(MO2.getReg());
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
+    O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
+    << " #" << ShImm;
+  O << "]";
+}  
+
+void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (!MO1.getReg()) {
+    unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm());
+    assert(ImmOffs && "Malformed indexed load / store!");
+    O << '#' << (char)ARM_AM::getAM2Op(MO2.getImm()) << ImmOffs;
+    return;
+  }
+  
+  O << (char)ARM_AM::getAM2Op(MO2.getImm()) << getRegisterName(MO1.getReg());
+  
+  if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
+    O << ", "
+    << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
+    << " #" << ShImm;
+}
+
+void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  O << '[' << getRegisterName(MO1.getReg());
+  
+  if (MO2.getReg()) {
+    O << ", " << (char)ARM_AM::getAM3Op(MO3.getImm())
+      << getRegisterName(MO2.getReg()) << ']';
+    return;
+  }
+  
+  if (unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()))
+    O << ", #"
+    << (char)ARM_AM::getAM3Op(MO3.getImm())
+    << ImmOffs;
+  O << ']';
+}
+
+void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI,
+                                                 unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (MO1.getReg()) {
+    O << (char)ARM_AM::getAM3Op(MO2.getImm())
+    << getRegisterName(MO1.getReg());
+    return;
+  }
+  
+  unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm());
+  assert(ImmOffs && "Malformed indexed load / store!");
+  O << "#"
+  << (char)ARM_AM::getAM3Op(MO2.getImm())
+  << ImmOffs;
+}
+
+
+void ARMInstPrinter::printAddrMode4Operand(const MCInst *MI, unsigned OpNum,
+                                           const char *Modifier) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    if (MO1.getReg() == ARM::SP) {
+      // FIXME
+      bool isLDM = (MI->getOpcode() == ARM::LDM ||
+                    MI->getOpcode() == ARM::LDM_RET ||
+                    MI->getOpcode() == ARM::t2LDM ||
+                    MI->getOpcode() == ARM::t2LDM_RET);
+      O << ARM_AM::getAMSubModeAltStr(Mode, isLDM);
+    } else
+      O << ARM_AM::getAMSubModeStr(Mode);
+  } else if (Modifier && strcmp(Modifier, "wide") == 0) {
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM4SubMode(MO2.getImm());
+    if (Mode == ARM_AM::ia)
+      O << ".w";
+  } else {
+    printOperand(MI, OpNum);
+    if (ARM_AM::getAM4WBFlag(MO2.getImm()))
+      O << "!";
+  }
+}
+
+void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                                           const char *Modifier) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  
+  if (!MO1.isReg()) {   // FIXME: This is for CP entries, but isn't right.
+    printOperand(MI, OpNum);
+    return;
+  }
+  
+  if (Modifier && strcmp(Modifier, "submode") == 0) {
+    ARM_AM::AMSubMode Mode = ARM_AM::getAM5SubMode(MO2.getImm());
+    O << ARM_AM::getAMSubModeStr(Mode);
+    return;
+  } else if (Modifier && strcmp(Modifier, "base") == 0) {
+    // Used for FSTM{D|S} and FLDM{D|S} operations.
+    O << getRegisterName(MO1.getReg());
+    if (ARM_AM::getAM5WBFlag(MO2.getImm()))
+      O << "!";
+    return;
+  }
+  
+  O << "[" << getRegisterName(MO1.getReg());
+  
+  if (unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm())) {
+    O << ", #"
+      << (char)ARM_AM::getAM5Op(MO2.getImm())
+      << ImmOffs*4;
+  }
+  O << "]";
+}
+
+void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum) {
+  const MCOperand &MO1 = MI->getOperand(OpNum);
+  const MCOperand &MO2 = MI->getOperand(OpNum+1);
+  const MCOperand &MO3 = MI->getOperand(OpNum+2);
+  
+  // FIXME: No support yet for specifying alignment.
+  O << '[' << getRegisterName(MO1.getReg()) << ']';
+  
+  if (ARM_AM::getAM6WBFlag(MO3.getImm())) {
+    if (MO2.getReg() == 0)
+      O << '!';
+    else
+      O << ", " << getRegisterName(MO2.getReg());
+  }
+}
+
+void ARMInstPrinter::printAddrModePCOperand(const MCInst *MI, unsigned OpNum,
+                                            const char *Modifier) {
+  assert(0 && "FIXME: Implement printAddrModePCOperand");
+}
+
+void ARMInstPrinter::printBitfieldInvMaskImmOperand (const MCInst *MI,
+                                                     unsigned OpNum) {
+  const MCOperand &MO = MI->getOperand(OpNum);
+  uint32_t v = ~MO.getImm();
+  int32_t lsb = CountTrailingZeros_32(v);
+  int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb;
+  assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!");
+  O << '#' << lsb << ", #" << width;
+}
+
+void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum) {
+  O << "{";
+  // Always skip the first operand; it's the optional (and implicit) writeback.
+  for (unsigned i = OpNum+1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i != OpNum+1) O << ", ";
+    O << getRegisterName(MI->getOperand(i).getReg());
+  }
+  O << "}";
+}
+
+void ARMInstPrinter::printPredicateOperand(const MCInst *MI, unsigned OpNum) {
+  ARMCC::CondCodes CC = (ARMCC::CondCodes)MI->getOperand(OpNum).getImm();
+  if (CC != ARMCC::AL)
+    O << ARMCondCodeToString(CC);
+}
+
+void ARMInstPrinter::printSBitModifierOperand(const MCInst *MI, unsigned OpNum){
+  if (MI->getOperand(OpNum).getReg()) {
+    assert(MI->getOperand(OpNum).getReg() == ARM::CPSR &&
+           "Expect ARM CPSR register!");
+    O << 's';
+  }
+}
+
+
+
+void ARMInstPrinter::printCPInstOperand(const MCInst *MI, unsigned OpNum,
+                                        const char *Modifier) {
+  // FIXME: remove this.
+  abort();
+}
+
+void ARMInstPrinter::printNoHashImmediate(const MCInst *MI, unsigned OpNum) {
+  O << MI->getOperand(OpNum).getImm();
+}
+
+
+void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum) {
+  // FIXME: remove this.
+  abort();
+}
+
+void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum) {
+  O << "#" <<  MI->getOperand(OpNum).getImm() * 4;
+}
diff --git a/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h
new file mode 100644
index 0000000..23a7f05
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMInstPrinter.h
@@ -0,0 +1,96 @@
+//===-- ARMInstPrinter.h - Convert ARM MCInst to assembly syntax ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an ARM MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARMINSTPRINTER_H
+#define ARMINSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MCOperand;
+  
+class ARMInstPrinter : public MCInstPrinter {
+  bool VerboseAsm;
+public:
+  ARMInstPrinter(raw_ostream &O, const MCAsmInfo &MAI, bool verboseAsm)
+    : MCInstPrinter(O, MAI), VerboseAsm(verboseAsm) {}
+
+  virtual void printInst(const MCInst *MI);
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI);
+  static const char *getRegisterName(unsigned RegNo);
+
+
+  void printOperand(const MCInst *MI, unsigned OpNo,
+                    const char *Modifier = 0);
+    
+  void printSOImmOperand(const MCInst *MI, unsigned OpNum);
+  void printSOImm2PartOperand(const MCInst *MI, unsigned OpNum);
+  
+  void printSORegOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode2Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode2OffsetOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode3Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode3OffsetOperand(const MCInst *MI, unsigned OpNum);
+  void printAddrMode4Operand(const MCInst *MI, unsigned OpNum,
+                             const char *Modifier = 0);
+  void printAddrMode5Operand(const MCInst *MI, unsigned OpNum,
+                             const char *Modifier = 0);
+  void printAddrMode6Operand(const MCInst *MI, unsigned OpNum);
+  void printAddrModePCOperand(const MCInst *MI, unsigned OpNum,
+                              const char *Modifier = 0);
+    
+  void printBitfieldInvMaskImmOperand(const MCInst *MI, unsigned OpNum);
+
+  void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum);
+  void printThumbITMask(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeRROperand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeRI5Operand(const MCInst *MI, unsigned OpNum,
+                                    unsigned Scale) {}
+  void printThumbAddrModeS1Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeS2Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeS4Operand(const MCInst *MI, unsigned OpNum) {}
+  void printThumbAddrModeSPOperand(const MCInst *MI, unsigned OpNum) {}
+  
+  void printT2SOOperand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm12Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeImm8OffsetOperand(const MCInst *MI, unsigned OpNum) {}
+  void printT2AddrModeSoRegOperand(const MCInst *MI, unsigned OpNum) {}
+  
+  void printPredicateOperand(const MCInst *MI, unsigned OpNum);
+  void printSBitModifierOperand(const MCInst *MI, unsigned OpNum);
+  void printRegisterList(const MCInst *MI, unsigned OpNum);
+  void printCPInstOperand(const MCInst *MI, unsigned OpNum,
+                          const char *Modifier);
+  void printJTBlockOperand(const MCInst *MI, unsigned OpNum) {}
+  void printJT2BlockOperand(const MCInst *MI, unsigned OpNum) {}
+  void printTBAddrMode(const MCInst *MI, unsigned OpNum) {}
+  void printNoHashImmediate(const MCInst *MI, unsigned OpNum);
+  void printVFPf32ImmOperand(const MCInst *MI, int OpNum) {}
+  void printVFPf64ImmOperand(const MCInst *MI, int OpNum) {}
+  void printHex8ImmOperand(const MCInst *MI, int OpNum) {}
+  void printHex16ImmOperand(const MCInst *MI, int OpNum) {}
+  void printHex32ImmOperand(const MCInst *MI, int OpNum) {}
+  void printHex64ImmOperand(const MCInst *MI, int OpNum) {}
+
+  void printPCLabel(const MCInst *MI, unsigned OpNum);  
+  // FIXME: Implement.
+  void PrintSpecial(const MCInst *MI, const char *Kind) {}
+};
+  
+}
+
+#endif
diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp
new file mode 100644
index 0000000..1b2dd48
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.cpp
@@ -0,0 +1,161 @@
+//===-- ARMMCInstLower.cpp - Convert ARM MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower ARM MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCInstLower.h"
+//#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+//#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+
+#if 0
+const ARMSubtarget &ARMMCInstLower::getSubtarget() const {
+  return AsmPrinter.getSubtarget();
+}
+
+MachineModuleInfoMachO &ARMMCInstLower::getMachOMMI() const {
+  assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin");
+  return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); 
+}
+#endif
+
+MCSymbol *ARMMCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  // FIXME: HANDLE PLT references how??
+  switch (MO.getTargetFlags()) {
+  default: assert(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+  
+  return Printer.GetGlobalValueSymbol(MO.getGlobal());
+}
+
+MCSymbol *ARMMCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  // FIXME: HANDLE PLT references how??
+  switch (MO.getTargetFlags()) {
+  default: assert(0 && "Unknown target flag on GV operand");
+  case 0: break;
+  }
+  
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+
+
+MCSymbol *ARMMCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+    << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+    default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+MCSymbol *ARMMCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+    << Printer.getFunctionNumber() << '_' << MO.getIndex();
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+  
+MCOperand ARMMCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+  
+#if 0
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  }
+#endif
+  
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+
+void ARMMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      assert(0 && "unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      assert(!MO.getSubReg() && "Subregs should be eliminated!");
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                       MO.getMBB()->getSymbol(Ctx), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO, Printer.GetBlockAddressSymbol(
+                                              MO.getBlockAddress()));
+      break;
+    }
+    
+    OutMI.addOperand(MCOp);
+  }
+  
+}
diff --git a/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h
new file mode 100644
index 0000000..383d30d
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/ARMMCInstLower.h
@@ -0,0 +1,56 @@
+//===-- ARMMCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ARM_MCINSTLOWER_H
+#define ARM_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCAsmInfo;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+  class Mangler;
+  //class ARMSubtarget;
+  
+/// ARMMCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class VISIBILITY_HIDDEN ARMMCInstLower {
+  MCContext &Ctx;
+  Mangler &Mang;
+  AsmPrinter &Printer;
+
+  //const ARMSubtarget &getSubtarget() const;
+public:
+  ARMMCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
+    : Ctx(ctx), Mang(mang), Printer(printer) {}
+  
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  //MCSymbol *GetPICBaseSymbol() const;
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+  
+/*
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+ */
+};
+
+}
+
+#endif
diff --git a/lib/Target/ARM/AsmPrinter/CMakeLists.txt b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..4e299f8
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,8 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMAsmPrinter
+  ARMAsmPrinter.cpp
+  ARMInstPrinter.cpp
+  ARMMCInstLower.cpp
+  )
+add_dependencies(LLVMARMAsmPrinter ARMCodeGenTable_gen)
diff --git a/lib/Target/ARM/AsmPrinter/Makefile b/lib/Target/ARM/AsmPrinter/Makefile
new file mode 100644
index 0000000..208becc
--- /dev/null
+++ b/lib/Target/ARM/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMAsmPrinter
+
+# Hack: we need to include 'main' arm target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
new file mode 100644
index 0000000..964551f
--- /dev/null
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -0,0 +1,40 @@
+set(LLVM_TARGET_DEFINITIONS ARM.td)
+
+tablegen(ARMGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(ARMGenRegisterNames.inc -gen-register-enums)
+tablegen(ARMGenRegisterInfo.inc -gen-register-desc)
+tablegen(ARMGenInstrNames.inc -gen-instr-enums)
+tablegen(ARMGenInstrInfo.inc -gen-instr-desc)
+tablegen(ARMGenCodeEmitter.inc -gen-emitter)
+tablegen(ARMGenAsmWriter.inc -gen-asm-writer)
+tablegen(ARMGenDAGISel.inc -gen-dag-isel)
+tablegen(ARMGenCallingConv.inc -gen-callingconv)
+tablegen(ARMGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(ARMCodeGen
+  ARMBaseInstrInfo.cpp
+  ARMBaseRegisterInfo.cpp
+  ARMCodeEmitter.cpp
+  ARMConstantIslandPass.cpp
+  ARMConstantPoolValue.cpp
+  ARMExpandPseudoInsts.cpp
+  ARMISelDAGToDAG.cpp
+  ARMISelLowering.cpp
+  ARMInstrInfo.cpp
+  ARMJITInfo.cpp
+  ARMLoadStoreOptimizer.cpp
+  ARMMCAsmInfo.cpp
+  ARMRegisterInfo.cpp
+  ARMSubtarget.cpp
+  ARMTargetMachine.cpp
+  NEONMoveFix.cpp
+  NEONPreAllocPass.cpp
+  Thumb1InstrInfo.cpp
+  Thumb1RegisterInfo.cpp
+  Thumb2ITBlockPass.cpp
+  Thumb2InstrInfo.cpp
+  Thumb2RegisterInfo.cpp
+  Thumb2SizeReduction.cpp
+  )
+
+target_link_libraries (LLVMARMCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/ARM/Makefile b/lib/Target/ARM/Makefile
new file mode 100644
index 0000000..a8dd38c
--- /dev/null
+++ b/lib/Target/ARM/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/ARM/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMARMCodeGen
+TARGET = ARM
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = ARMGenRegisterInfo.h.inc ARMGenRegisterNames.inc \
+                ARMGenRegisterInfo.inc ARMGenInstrNames.inc \
+                ARMGenInstrInfo.inc ARMGenAsmWriter.inc \
+                ARMGenDAGISel.inc ARMGenSubtarget.inc \
+                ARMGenCodeEmitter.inc ARMGenCallingConv.inc
+
+DIRS = AsmPrinter AsmParser TargetInfo
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/NEONMoveFix.cpp b/lib/Target/ARM/NEONMoveFix.cpp
new file mode 100644
index 0000000..3c0414d
--- /dev/null
+++ b/lib/Target/ARM/NEONMoveFix.cpp
@@ -0,0 +1,141 @@
+//===-- NEONMoveFix.cpp - Convert vfp reg-reg moves into neon ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "neon-mov-fix"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMInstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumVMovs, "Number of reg-reg moves converted");
+
+namespace {
+  struct NEONMoveFixPass : public MachineFunctionPass {
+    static char ID;
+    NEONMoveFixPass() : MachineFunctionPass(&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "NEON reg-reg move conversion";
+    }
+
+  private:
+    const TargetRegisterInfo *TRI;
+    const ARMBaseInstrInfo *TII;
+
+    typedef DenseMap<unsigned, const MachineInstr*> RegMap;
+
+    bool InsertMoves(MachineBasicBlock &MBB);
+  };
+  char NEONMoveFixPass::ID = 0;
+}
+
+bool NEONMoveFixPass::InsertMoves(MachineBasicBlock &MBB) {
+  RegMap Defs;
+  bool Modified = false;
+
+  // Walk over MBB tracking the def points of the registers.
+  MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
+  MachineBasicBlock::iterator NextMII;
+  for (; MII != E; MII = NextMII) {
+    NextMII = llvm::next(MII);
+    MachineInstr *MI = &*MII;
+
+    if (MI->getOpcode() == ARM::VMOVD &&
+        !TII->isPredicated(MI)) {
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      // If we do not find an instruction defining the reg, this means the
+      // register should be live-in for this BB. It's always better to use
+      // NEON reg-reg moves.
+      unsigned Domain = ARMII::DomainNEON;
+      RegMap::iterator DefMI = Defs.find(SrcReg);
+      if (DefMI != Defs.end()) {
+        Domain = DefMI->second->getDesc().TSFlags & ARMII::DomainMask;
+        // Instructions in general domain are subreg accesses.
+        // Map them to NEON reg-reg moves.
+        if (Domain == ARMII::DomainGeneral)
+          Domain = ARMII::DomainNEON;
+      }
+
+      if (Domain & ARMII::DomainNEON) {
+        // Convert VMOVD to VMOVDneon
+        unsigned DestReg = MI->getOperand(0).getReg();
+
+        DEBUG({errs() << "vmov convert: "; MI->dump();});
+
+        // It's safe to ignore imp-defs / imp-uses here, since:
+        //  - We're running late, no intelligent condegen passes should be run
+        //    afterwards
+        //  - The imp-defs / imp-uses are superregs only, we don't care about
+        //    them.
+        AddDefaultPred(BuildMI(MBB, *MI, MI->getDebugLoc(),
+                             TII->get(ARM::VMOVDneon), DestReg).addReg(SrcReg));
+        MBB.erase(MI);
+        MachineBasicBlock::iterator I = prior(NextMII);
+        MI = &*I;
+
+        DEBUG({errs() << "        into: "; MI->dump();});
+
+        Modified = true;
+        ++NumVMovs;
+      } else {
+        assert((Domain & ARMII::DomainVFP) && "Invalid domain!");
+        // Do nothing.
+      }
+    }
+
+    // Update def information.
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand& MO = MI->getOperand(i);
+      if (!MO.isReg() || !MO.isDef())
+        continue;
+      unsigned MOReg = MO.getReg();
+
+      Defs[MOReg] = MI;
+      // Catch subregs as well.
+      for (const unsigned *R = TRI->getSubRegisters(MOReg); *R; ++R)
+        Defs[*R] = MI;
+    }
+  }
+
+  return Modified;
+}
+
+bool NEONMoveFixPass::runOnMachineFunction(MachineFunction &Fn) {
+  ARMFunctionInfo *AFI = Fn.getInfo<ARMFunctionInfo>();
+  const TargetMachine &TM = Fn.getTarget();
+
+  if (AFI->isThumbFunction())
+    return false;
+
+  TRI = TM.getRegisterInfo();
+  TII = static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= InsertMoves(MBB);
+  }
+
+  return Modified;
+}
+
+/// createNEONMoveFixPass - Returns an instance of the NEON reg-reg moves fix
+/// pass.
+FunctionPass *llvm::createNEONMoveFixPass() {
+  return new NEONMoveFixPass();
+}
diff --git a/lib/Target/ARM/NEONPreAllocPass.cpp b/lib/Target/ARM/NEONPreAllocPass.cpp
new file mode 100644
index 0000000..d9942c8
--- /dev/null
+++ b/lib/Target/ARM/NEONPreAllocPass.cpp
@@ -0,0 +1,394 @@
+//===-- NEONPreAllocPass.cpp - Allocate adjacent NEON registers--*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "neon-prealloc"
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+namespace {
+  class NEONPreAllocPass : public MachineFunctionPass {
+    const TargetInstrInfo *TII;
+
+  public:
+    static char ID;
+    NEONPreAllocPass() : MachineFunctionPass(&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "NEON register pre-allocation pass";
+    }
+
+  private:
+    bool PreAllocNEONRegisters(MachineBasicBlock &MBB);
+  };
+
+  char NEONPreAllocPass::ID = 0;
+}
+
+static bool isNEONMultiRegOp(int Opcode, unsigned &FirstOpnd, unsigned &NumRegs,
+                             unsigned &Offset, unsigned &Stride) {
+  // Default to unit stride with no offset.
+  Stride = 1;
+  Offset = 0;
+
+  switch (Opcode) {
+  default:
+    break;
+
+  case ARM::VLD2d8:
+  case ARM::VLD2d16:
+  case ARM::VLD2d32:
+  case ARM::VLD2d64:
+  case ARM::VLD2LNd8:
+  case ARM::VLD2LNd16:
+  case ARM::VLD2LNd32:
+    FirstOpnd = 0;
+    NumRegs = 2;
+    return true;
+
+  case ARM::VLD2q8:
+  case ARM::VLD2q16:
+  case ARM::VLD2q32:
+    FirstOpnd = 0;
+    NumRegs = 4;
+    return true;
+
+  case ARM::VLD2LNq16a:
+  case ARM::VLD2LNq32a:
+    FirstOpnd = 0;
+    NumRegs = 2;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD2LNq16b:
+  case ARM::VLD2LNq32b:
+    FirstOpnd = 0;
+    NumRegs = 2;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD3d8:
+  case ARM::VLD3d16:
+  case ARM::VLD3d32:
+  case ARM::VLD3d64:
+  case ARM::VLD3LNd8:
+  case ARM::VLD3LNd16:
+  case ARM::VLD3LNd32:
+    FirstOpnd = 0;
+    NumRegs = 3;
+    return true;
+
+  case ARM::VLD3q8a:
+  case ARM::VLD3q16a:
+  case ARM::VLD3q32a:
+    FirstOpnd = 0;
+    NumRegs = 3;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD3q8b:
+  case ARM::VLD3q16b:
+  case ARM::VLD3q32b:
+    FirstOpnd = 0;
+    NumRegs = 3;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD3LNq16a:
+  case ARM::VLD3LNq32a:
+    FirstOpnd = 0;
+    NumRegs = 3;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD3LNq16b:
+  case ARM::VLD3LNq32b:
+    FirstOpnd = 0;
+    NumRegs = 3;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD4d8:
+  case ARM::VLD4d16:
+  case ARM::VLD4d32:
+  case ARM::VLD4d64:
+  case ARM::VLD4LNd8:
+  case ARM::VLD4LNd16:
+  case ARM::VLD4LNd32:
+    FirstOpnd = 0;
+    NumRegs = 4;
+    return true;
+
+  case ARM::VLD4q8a:
+  case ARM::VLD4q16a:
+  case ARM::VLD4q32a:
+    FirstOpnd = 0;
+    NumRegs = 4;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD4q8b:
+  case ARM::VLD4q16b:
+  case ARM::VLD4q32b:
+    FirstOpnd = 0;
+    NumRegs = 4;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD4LNq16a:
+  case ARM::VLD4LNq32a:
+    FirstOpnd = 0;
+    NumRegs = 4;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VLD4LNq16b:
+  case ARM::VLD4LNq32b:
+    FirstOpnd = 0;
+    NumRegs = 4;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VST2d8:
+  case ARM::VST2d16:
+  case ARM::VST2d32:
+  case ARM::VST2d64:
+  case ARM::VST2LNd8:
+  case ARM::VST2LNd16:
+  case ARM::VST2LNd32:
+    FirstOpnd = 4;
+    NumRegs = 2;
+    return true;
+
+  case ARM::VST2q8:
+  case ARM::VST2q16:
+  case ARM::VST2q32:
+    FirstOpnd = 4;
+    NumRegs = 4;
+    return true;
+
+  case ARM::VST2LNq16a:
+  case ARM::VST2LNq32a:
+    FirstOpnd = 4;
+    NumRegs = 2;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VST2LNq16b:
+  case ARM::VST2LNq32b:
+    FirstOpnd = 4;
+    NumRegs = 2;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VST3d8:
+  case ARM::VST3d16:
+  case ARM::VST3d32:
+  case ARM::VST3d64:
+  case ARM::VST3LNd8:
+  case ARM::VST3LNd16:
+  case ARM::VST3LNd32:
+    FirstOpnd = 4;
+    NumRegs = 3;
+    return true;
+
+  case ARM::VST3q8a:
+  case ARM::VST3q16a:
+  case ARM::VST3q32a:
+    FirstOpnd = 5;
+    NumRegs = 3;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VST3q8b:
+  case ARM::VST3q16b:
+  case ARM::VST3q32b:
+    FirstOpnd = 5;
+    NumRegs = 3;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VST3LNq16a:
+  case ARM::VST3LNq32a:
+    FirstOpnd = 4;
+    NumRegs = 3;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VST3LNq16b:
+  case ARM::VST3LNq32b:
+    FirstOpnd = 4;
+    NumRegs = 3;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VST4d8:
+  case ARM::VST4d16:
+  case ARM::VST4d32:
+  case ARM::VST4d64:
+  case ARM::VST4LNd8:
+  case ARM::VST4LNd16:
+  case ARM::VST4LNd32:
+    FirstOpnd = 4;
+    NumRegs = 4;
+    return true;
+
+  case ARM::VST4q8a:
+  case ARM::VST4q16a:
+  case ARM::VST4q32a:
+    FirstOpnd = 5;
+    NumRegs = 4;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VST4q8b:
+  case ARM::VST4q16b:
+  case ARM::VST4q32b:
+    FirstOpnd = 5;
+    NumRegs = 4;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VST4LNq16a:
+  case ARM::VST4LNq32a:
+    FirstOpnd = 4;
+    NumRegs = 4;
+    Offset = 0;
+    Stride = 2;
+    return true;
+
+  case ARM::VST4LNq16b:
+  case ARM::VST4LNq32b:
+    FirstOpnd = 4;
+    NumRegs = 4;
+    Offset = 1;
+    Stride = 2;
+    return true;
+
+  case ARM::VTBL2:
+    FirstOpnd = 1;
+    NumRegs = 2;
+    return true;
+
+  case ARM::VTBL3:
+    FirstOpnd = 1;
+    NumRegs = 3;
+    return true;
+
+  case ARM::VTBL4:
+    FirstOpnd = 1;
+    NumRegs = 4;
+    return true;
+
+  case ARM::VTBX2:
+    FirstOpnd = 2;
+    NumRegs = 2;
+    return true;
+
+  case ARM::VTBX3:
+    FirstOpnd = 2;
+    NumRegs = 3;
+    return true;
+
+  case ARM::VTBX4:
+    FirstOpnd = 2;
+    NumRegs = 4;
+    return true;
+  }
+
+  return false;
+}
+
+bool NEONPreAllocPass::PreAllocNEONRegisters(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  for (; MBBI != E; ++MBBI) {
+    MachineInstr *MI = &*MBBI;
+    unsigned FirstOpnd, NumRegs, Offset, Stride;
+    if (!isNEONMultiRegOp(MI->getOpcode(), FirstOpnd, NumRegs, Offset, Stride))
+      continue;
+
+    MachineBasicBlock::iterator NextI = llvm::next(MBBI);
+    for (unsigned R = 0; R < NumRegs; ++R) {
+      MachineOperand &MO = MI->getOperand(FirstOpnd + R);
+      assert(MO.isReg() && MO.getSubReg() == 0 && "unexpected operand");
+      unsigned VirtReg = MO.getReg();
+      assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+             "expected a virtual register");
+
+      // For now, just assign a fixed set of adjacent registers.
+      // This leaves plenty of room for future improvements.
+      static const unsigned NEONDRegs[] = {
+        ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+        ARM::D4, ARM::D5, ARM::D6, ARM::D7
+      };
+      MO.setReg(NEONDRegs[Offset + R * Stride]);
+
+      if (MO.isUse()) {
+        // Insert a copy from VirtReg.
+        TII->copyRegToReg(MBB, MBBI, MO.getReg(), VirtReg,
+                          ARM::DPRRegisterClass, ARM::DPRRegisterClass);
+        if (MO.isKill()) {
+          MachineInstr *CopyMI = prior(MBBI);
+          CopyMI->findRegisterUseOperand(VirtReg)->setIsKill();
+        }
+        MO.setIsKill();
+      } else if (MO.isDef() && !MO.isDead()) {
+        // Add a copy to VirtReg.
+        TII->copyRegToReg(MBB, NextI, VirtReg, MO.getReg(),
+                          ARM::DPRRegisterClass, ARM::DPRRegisterClass);
+      }
+    }
+  }
+
+  return Modified;
+}
+
+bool NEONPreAllocPass::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= PreAllocNEONRegisters(MBB);
+  }
+
+  return Modified;
+}
+
+/// createNEONPreAllocPass - returns an instance of the NEON register
+/// pre-allocation pass.
+FunctionPass *llvm::createNEONPreAllocPass() {
+  return new NEONPreAllocPass();
+}
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
new file mode 100644
index 0000000..6b605bb
--- /dev/null
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -0,0 +1,248 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb specific).
+//===---------------------------------------------------------------------===//
+
+* Add support for compiling functions in both ARM and Thumb mode, then taking
+  the smallest.
+
+* Add support for compiling individual basic blocks in thumb mode, when in a 
+  larger ARM function.  This can be used for presumed cold code, like paths
+  to abort (failure path of asserts), EH handling code, etc.
+
+* Thumb doesn't have normal pre/post increment addressing modes, but you can
+  load/store 32-bit integers with pre/postinc by using load/store multiple
+  instrs with a single register.
+
+* Make better use of high registers r8, r10, r11, r12 (ip). Some variants of add
+  and cmp instructions can use high registers. Also, we can use them as
+  temporaries to spill values into.
+
+* In thumb mode, short, byte, and bool preferred alignments are currently set
+  to 4 to accommodate ISA restriction (i.e. add sp, #imm, imm must be multiple
+  of 4).
+
+//===---------------------------------------------------------------------===//
+
+Potential jumptable improvements:
+
+* If we know function size is less than (1 << 16) * 2 bytes, we can use 16-bit
+  jumptable entries (e.g. (L1 - L2) >> 1). Or even smaller entries if the
+  function is even smaller. This also applies to ARM.
+
+* Thumb jumptable codegen can improve given some help from the assembler. This
+  is what we generate right now:
+
+	.set PCRELV0, (LJTI1_0_0-(LPCRELL0+4))
+LPCRELL0:
+	mov r1, #PCRELV0
+	add r1, pc
+	ldr r0, [r0, r1]
+	mov pc, r0 
+	.align	2
+LJTI1_0_0:
+	.long	 LBB1_3
+        ...
+
+Note there is another pc relative add that we can take advantage of.
+     add r1, pc, #imm_8 * 4
+
+We should be able to generate:
+
+LPCRELL0:
+	add r1, LJTI1_0_0
+	ldr r0, [r0, r1]
+	mov pc, r0 
+	.align	2
+LJTI1_0_0:
+	.long	 LBB1_3
+
+if the assembler can translate the add to:
+       add r1, pc, #((LJTI1_0_0-(LPCRELL0+4))&0xfffffffc)
+
+Note the assembler also does something similar to constpool load:
+LPCRELL0:
+     ldr r0, LCPI1_0
+=>
+     ldr r0, pc, #((LCPI1_0-(LPCRELL0+4))&0xfffffffc)
+
+
+//===---------------------------------------------------------------------===//
+
+We compile the following:
+
+define i16 @func_entry_2E_ce(i32 %i) {
+        switch i32 %i, label %bb12.exitStub [
+                 i32 0, label %bb4.exitStub
+                 i32 1, label %bb9.exitStub
+                 i32 2, label %bb4.exitStub
+                 i32 3, label %bb4.exitStub
+                 i32 7, label %bb9.exitStub
+                 i32 8, label %bb.exitStub
+                 i32 9, label %bb9.exitStub
+        ]
+
+bb12.exitStub:
+        ret i16 0
+
+bb4.exitStub:
+        ret i16 1
+
+bb9.exitStub:
+        ret i16 2
+
+bb.exitStub:
+        ret i16 3
+}
+
+into:
+
+_func_entry_2E_ce:
+        mov r2, #1
+        lsl r2, r0
+        cmp r0, #9
+        bhi LBB1_4      @bb12.exitStub
+LBB1_1: @newFuncRoot
+        mov r1, #13
+        tst r2, r1
+        bne LBB1_5      @bb4.exitStub
+LBB1_2: @newFuncRoot
+        ldr r1, LCPI1_0
+        tst r2, r1
+        bne LBB1_6      @bb9.exitStub
+LBB1_3: @newFuncRoot
+        mov r1, #1
+        lsl r1, r1, #8
+        tst r2, r1
+        bne LBB1_7      @bb.exitStub
+LBB1_4: @bb12.exitStub
+        mov r0, #0
+        bx lr
+LBB1_5: @bb4.exitStub
+        mov r0, #1
+        bx lr
+LBB1_6: @bb9.exitStub
+        mov r0, #2
+        bx lr
+LBB1_7: @bb.exitStub
+        mov r0, #3
+        bx lr
+LBB1_8:
+        .align  2
+LCPI1_0:
+        .long   642
+
+
+gcc compiles to:
+
+	cmp	r0, #9
+	@ lr needed for prologue
+	bhi	L2
+	ldr	r3, L11
+	mov	r2, #1
+	mov	r1, r2, asl r0
+	ands	r0, r3, r2, asl r0
+	movne	r0, #2
+	bxne	lr
+	tst	r1, #13
+	beq	L9
+L3:
+	mov	r0, r2
+	bx	lr
+L9:
+	tst	r1, #256
+	movne	r0, #3
+	bxne	lr
+L2:
+	mov	r0, #0
+	bx	lr
+L12:
+	.align 2
+L11:
+	.long	642
+        
+
+GCC is doing a couple of clever things here:
+  1. It is predicating one of the returns.  This isn't a clear win though: in
+     cases where that return isn't taken, it is replacing one condbranch with
+     two 'ne' predicated instructions.
+  2. It is sinking the shift of "1 << i" into the tst, and using ands instead of
+     tst.  This will probably require whole function isel.
+  3. GCC emits:
+  	tst	r1, #256
+     we emit:
+        mov r1, #1
+        lsl r1, r1, #8
+        tst r2, r1
+  
+
+//===---------------------------------------------------------------------===//
+
+When spilling in thumb mode and the sp offset is too large to fit in the ldr /
+str offset field, we load the offset from a constpool entry and add it to sp:
+
+ldr r2, LCPI
+add r2, sp
+ldr r2, [r2]
+
+These instructions preserve the condition code which is important if the spill
+is between a cmp and a bcc instruction. However, we can use the (potentially)
+cheaper sequence if we know it's ok to clobber the condition register.
+
+add r2, sp, #255 * 4
+add r2, #132
+ldr r2, [r2, #7 * 4]
+
+This is especially bad when dynamic alloca is used. All the fixed size stack
+objects are referenced off the frame pointer with negative offsets. See
+oggenc for an example.
+
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen test/CodeGen/ARM/select.ll f7:
+
+	ldr r5, LCPI1_0
+LPC0:
+	add r5, pc
+	ldr r6, LCPI1_1
+	ldr r2, LCPI1_2
+	mov r3, r6
+	mov lr, pc
+	bx r5
+
+//===---------------------------------------------------------------------===//
+
+Make register allocator / spiller smarter so we can re-materialize "mov r, imm",
+etc. Almost all Thumb instructions clobber condition code.
+
+//===---------------------------------------------------------------------===//
+
+Add ldmia, stmia support.
+
+//===---------------------------------------------------------------------===//
+
+Thumb load / store address mode offsets are scaled. The values kept in the
+instruction operands are pre-scale values. This probably ought to be changed
+to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
+
+//===---------------------------------------------------------------------===//
+
+We need to make (some of the) Thumb1 instructions predicable. That will allow
+shrinking of predicated Thumb2 instructions. To allow this, we need to be able
+to toggle the 's' bit since they do not set CPSR when they are inside IT blocks.
+
+//===---------------------------------------------------------------------===//
+
+Make use of hi register variants of cmp: tCMPhir / tCMPZhir.
+
+//===---------------------------------------------------------------------===//
+
+Thumb1 immediate field sometimes keep pre-scaled values. See
+Thumb1RegisterInfo::eliminateFrameIndex. This is inconsistent with ARM and
+Thumb2.
+
+//===---------------------------------------------------------------------===//
+
+Rather than having tBR_JTr print a ".align 2" and constant island pass pad it,
+add a target specific ALIGN instruction instead. That way, GetInstSizeInBytes
+won't have to over-estimate. It can also be used for loop alignment pass.
diff --git a/lib/Target/ARM/README-Thumb2.txt b/lib/Target/ARM/README-Thumb2.txt
new file mode 100644
index 0000000..e7c2552
--- /dev/null
+++ b/lib/Target/ARM/README-Thumb2.txt
@@ -0,0 +1,6 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend (Thumb2 specific).
+//===---------------------------------------------------------------------===//
+
+Make sure jumptable destinations are below the jumptable in order to make use
+of tbb / tbh.
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
new file mode 100644
index 0000000..9efb5a1
--- /dev/null
+++ b/lib/Target/ARM/README.txt
@@ -0,0 +1,587 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the ARM backend.
+//===---------------------------------------------------------------------===//
+
+Reimplement 'select' in terms of 'SEL'.
+
+* We would really like to support UXTAB16, but we need to prove that the
+  add doesn't need to overflow between the two 16-bit chunks.
+
+* Implement pre/post increment support.  (e.g. PR935)
+* Implement smarter constant generation for binops with large immediates.
+
+//===---------------------------------------------------------------------===//
+
+Crazy idea:  Consider code that uses lots of 8-bit or 16-bit values.  By the
+time regalloc happens, these values are now in a 32-bit register, usually with
+the top-bits known to be sign or zero extended.  If spilled, we should be able
+to spill these to a 8-bit or 16-bit stack slot, zero or sign extending as part
+of the reload.
+
+Doing this reduces the size of the stack frame (important for thumb etc), and
+also increases the likelihood that we will be able to reload multiple values
+from the stack with a single load.
+
+//===---------------------------------------------------------------------===//
+
+The constant island pass is in good shape.  Some cleanups might be desirable,
+but there is unlikely to be much improvement in the generated code.
+
+1.  There may be some advantage to trying to be smarter about the initial
+placement, rather than putting everything at the end.
+
+2.  There might be some compile-time efficiency to be had by representing
+consecutive islands as a single block rather than multiple blocks.
+
+3.  Use a priority queue to sort constant pool users in inverse order of
+    position so we always process the one closest to the end of the function
+    first. This may simplify CreateNewWater.
+
+//===---------------------------------------------------------------------===//
+
+Eliminate copysign custom expansion. We are still generating crappy code with
+default expansion + if-conversion.
+
+//===---------------------------------------------------------------------===//
+
+Eliminate one instruction from:
+
+define i32 @_Z6slow4bii(i32 %x, i32 %y) {
+        %tmp = icmp sgt i32 %x, %y
+        %retval = select i1 %tmp, i32 %x, i32 %y
+        ret i32 %retval
+}
+
+__Z6slow4bii:
+        cmp r0, r1
+        movgt r1, r0
+        mov r0, r1
+        bx lr
+=>
+
+__Z6slow4bii:
+        cmp r0, r1
+        movle r0, r1
+        bx lr
+
+//===---------------------------------------------------------------------===//
+
+Implement long long "X-3" with instructions that fold the immediate in.  These
+were disabled due to badness with the ARM carry flag on subtracts.
+
+//===---------------------------------------------------------------------===//
+
+More load / store optimizations:
+1) Better representation for block transfer? This is from Olden/power:
+
+	fldd d0, [r4]
+	fstd d0, [r4, #+32]
+	fldd d0, [r4, #+8]
+	fstd d0, [r4, #+40]
+	fldd d0, [r4, #+16]
+	fstd d0, [r4, #+48]
+	fldd d0, [r4, #+24]
+	fstd d0, [r4, #+56]
+
+If we can spare the registers, it would be better to use fldm and fstm here.
+Need major register allocator enhancement though.
+
+2) Can we recognize the relative position of constantpool entries? i.e. Treat
+
+	ldr r0, LCPI17_3
+	ldr r1, LCPI17_4
+	ldr r2, LCPI17_5
+
+   as
+	ldr r0, LCPI17
+	ldr r1, LCPI17+4
+	ldr r2, LCPI17+8
+
+   Then the ldr's can be combined into a single ldm. See Olden/power.
+
+Note for ARM v4 gcc uses ldmia to load a pair of 32-bit values to represent a
+double 64-bit FP constant:
+
+	adr	r0, L6
+	ldmia	r0, {r0-r1}
+
+	.align 2
+L6:
+	.long	-858993459
+	.long	1074318540
+
+3) struct copies appear to be done field by field 
+instead of by words, at least sometimes:
+
+struct foo { int x; short s; char c1; char c2; };
+void cpy(struct foo*a, struct foo*b) { *a = *b; }
+
+llvm code (-O2)
+        ldrb r3, [r1, #+6]
+        ldr r2, [r1]
+        ldrb r12, [r1, #+7]
+        ldrh r1, [r1, #+4]
+        str r2, [r0]
+        strh r1, [r0, #+4]
+        strb r3, [r0, #+6]
+        strb r12, [r0, #+7]
+gcc code (-O2)
+        ldmia   r1, {r1-r2}
+        stmia   r0, {r1-r2}
+
+In this benchmark poor handling of aggregate copies has shown up as
+having a large effect on size, and possibly speed as well (we don't have
+a good way to measure on ARM).
+
+//===---------------------------------------------------------------------===//
+
+* Consider this silly example:
+
+double bar(double x) {  
+  double r = foo(3.1);
+  return x+r;
+}
+
+_bar:
+        stmfd sp!, {r4, r5, r7, lr}
+        add r7, sp, #8
+        mov r4, r0
+        mov r5, r1
+        fldd d0, LCPI1_0
+        fmrrd r0, r1, d0
+        bl _foo
+        fmdrr d0, r4, r5
+        fmsr s2, r0
+        fsitod d1, s2
+        faddd d0, d1, d0
+        fmrrd r0, r1, d0
+        ldmfd sp!, {r4, r5, r7, pc}
+
+Ignore the prologue and epilogue stuff for a second. Note 
+	mov r4, r0
+	mov r5, r1
+the copies to callee-save registers and the fact that they are only being used by the
+fmdrr instruction. It would have been better had the fmdrr been scheduled
+before the call and place the result in a callee-save DPR register. The two
+mov ops would not have been necessary.
+
+//===---------------------------------------------------------------------===//
+
+Calling convention related stuff:
+
+* gcc's parameter passing implementation is terrible and we suffer as a result:
+
+e.g.
+struct s {
+  double d1;
+  int s1;
+};
+
+void foo(struct s S) {
+  printf("%g, %d\n", S.d1, S.s1);
+}
+
+'S' is passed via registers r0, r1, r2. But gcc stores them to the stack, and
+then reloads them to r1, r2, and r3 before issuing the call (r0 contains the
+address of the format string):
+
+	stmfd	sp!, {r7, lr}
+	add	r7, sp, #0
+	sub	sp, sp, #12
+	stmia	sp, {r0, r1, r2}
+	ldmia	sp, {r1-r2}
+	ldr	r0, L5
+	ldr	r3, [sp, #8]
+L2:
+	add	r0, pc, r0
+	bl	L_printf$stub
+
+Instead of a stmia, ldmia, and a ldr, wouldn't it be better to do three moves?
+
+* Returning an aggregate type is even worse:
+
+e.g.
+struct s foo(void) {
+  struct s S = {1.1, 2};
+  return S;
+}
+
+	mov	ip, r0
+	ldr	r0, L5
+	sub	sp, sp, #12
+L2:
+	add	r0, pc, r0
+	@ lr needed for prologue
+	ldmia	r0, {r0, r1, r2}
+	stmia	sp, {r0, r1, r2}
+	stmia	ip, {r0, r1, r2}
+	mov	r0, ip
+	add	sp, sp, #12
+	bx	lr
+
+r0 (and later ip) is the hidden parameter from caller to store the value in. The
+first ldmia loads the constants into r0, r1, r2. The last stmia stores r0, r1,
+r2 into the address passed in. However, there is one additional stmia that
+stores r0, r1, and r2 to some stack location. The store is dead.
+
+The llvm-gcc generated code looks like this:
+
+csretcc void %foo(%struct.s* %agg.result) {
+entry:
+	%S = alloca %struct.s, align 4		; <%struct.s*> [#uses=1]
+	%memtmp = alloca %struct.s		; <%struct.s*> [#uses=1]
+	cast %struct.s* %S to sbyte*		; <sbyte*>:0 [#uses=2]
+	call void %llvm.memcpy.i32( sbyte* %0, sbyte* cast ({ double, int }* %C.0.904 to sbyte*), uint 12, uint 4 )
+	cast %struct.s* %agg.result to sbyte*		; <sbyte*>:1 [#uses=2]
+	call void %llvm.memcpy.i32( sbyte* %1, sbyte* %0, uint 12, uint 0 )
+	cast %struct.s* %memtmp to sbyte*		; <sbyte*>:2 [#uses=1]
+	call void %llvm.memcpy.i32( sbyte* %2, sbyte* %1, uint 12, uint 0 )
+	ret void
+}
+
+llc ends up issuing two memcpy's (the first memcpy becomes 3 loads from
+constantpool). Perhaps we should 1) fix llvm-gcc so the memcpy is translated
+into a number of load and stores, or 2) custom lower memcpy (of small size) to
+be ldmia / stmia. I think option 2 is better but the current register
+allocator cannot allocate a chunk of registers at a time.
+
+A feasible temporary solution is to use specific physical registers at the
+lowering time for small (<= 4 words?) transfer size.
+
+* ARM CSRet calling convention requires the hidden argument to be returned by
+the callee.
+
+//===---------------------------------------------------------------------===//
+
+We can definitely do a better job on BB placements to eliminate some branches.
+It's very common to see llvm generated assembly code that looks like this:
+
+LBB3:
+ ...
+LBB4:
+...
+  beq LBB3
+  b LBB2
+
+If BB4 is the only predecessor of BB3, then we can emit BB3 after BB4. We can
+then eliminate beq and turn the unconditional branch to LBB2 into a bne.
+
+See McCat/18-imp/ComputeBoundingBoxes for an example.
+
+//===---------------------------------------------------------------------===//
+
+Pre-/post- indexed load / stores:
+
+1) We should not make the pre/post- indexed load/store transform if the base ptr
+is guaranteed to be live beyond the load/store. This can happen if the base
+ptr is live out of the block we are performing the optimization. e.g.
+
+mov r1, r2
+ldr r3, [r1], #4
+...
+
+vs.
+
+ldr r3, [r2]
+add r1, r2, #4
+...
+
+In most cases, this is just a wasted optimization. However, sometimes it can
+negatively impact the performance because two-address code is more restrictive
+when it comes to scheduling.
+
+Unfortunately, liveout information is currently unavailable during DAG combine
+time.
+
+2) Consider splitting an indexed load / store into a pair of add/sub +
+   load/store to solve #1 (in TwoAddressInstructionPass.cpp).
+
+3) Enhance LSR to generate more opportunities for indexed ops.
+
+4) Once we added support for multiple result patterns, write indexed loads
+   patterns instead of C++ instruction selection code.
+
+5) Use VLDM / VSTM to emulate indexed FP load / store.
+
+//===---------------------------------------------------------------------===//
+
+Implement support for some more tricky ways to materialize immediates.  For
+example, to get 0xffff8000, we can use:
+
+mov r9, #&3f8000
+sub r9, r9, #&400000
+
+//===---------------------------------------------------------------------===//
+
+We sometimes generate multiple add / sub instructions to update sp in prologue
+and epilogue if the inc / dec value is too large to fit in a single immediate
+operand. In some cases, perhaps it might be better to load the value from a
+constantpool instead.
+
+//===---------------------------------------------------------------------===//
+
+GCC generates significantly better code for this function.
+
+int foo(int StackPtr, unsigned char *Line, unsigned char *Stack, int LineLen) {
+    int i = 0;
+
+    if (StackPtr != 0) {
+       while (StackPtr != 0 && i < (((LineLen) < (32768))? (LineLen) : (32768)))
+          Line[i++] = Stack[--StackPtr];
+        if (LineLen > 32768)
+        {
+            while (StackPtr != 0 && i < LineLen)
+            {
+                i++;
+                --StackPtr;
+            }
+        }
+    }
+    return StackPtr;
+}
+
+//===---------------------------------------------------------------------===//
+
+This should compile to the mlas instruction:
+int mlas(int x, int y, int z) { return ((x * y + z) < 0) ? 7 : 13; }
+
+//===---------------------------------------------------------------------===//
+
+At some point, we should triage these to see if they still apply to us:
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19598
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18560
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=27016
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11831
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11826
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11825
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11824
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11823
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=11820
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10982
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=10242
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9831
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9760
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9759
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9703
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9702
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=9663
+
+http://www.inf.u-szeged.hu/gcc-arm/
+http://citeseer.ist.psu.edu/debus04linktime.html
+
+//===---------------------------------------------------------------------===//
+
+gcc generates smaller code for this function at -O2 or -Os:
+
+void foo(signed char* p) {
+  if (*p == 3)
+     bar();
+   else if (*p == 4)
+    baz();
+  else if (*p == 5)
+    quux();
+}
+
+llvm decides it's a good idea to turn the repeated if...else into a
+binary tree, as if it were a switch; the resulting code requires -1 
+compare-and-branches when *p<=2 or *p==5, the same number if *p==4
+or *p>6, and +1 if *p==3.  So it should be a speed win
+(on balance).  However, the revised code is larger, with 4 conditional 
+branches instead of 3.
+
+More seriously, there is a byte->word extend before
+each comparison, where there should be only one, and the condition codes
+are not remembered when the same two values are compared twice.
+
+//===---------------------------------------------------------------------===//
+
+More LSR enhancements possible:
+
+1. Teach LSR about pre- and post- indexed ops to allow iv increment be merged
+   in a load / store.
+2. Allow iv reuse even when a type conversion is required. For example, i8
+   and i32 load / store addressing modes are identical.
+
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+int foo(int a, int b, int c, int d) {
+  long long acc = (long long)a * (long long)b;
+  acc += (long long)c * (long long)d;
+  return (int)(acc >> 32);
+}
+
+Should compile to use SMLAL (Signed Multiply Accumulate Long) which multiplies 
+two signed 32-bit values to produce a 64-bit value, and accumulates this with 
+a 64-bit value.
+
+We currently get this with both v4 and v6:
+
+_foo:
+        smull r1, r0, r1, r0
+        smull r3, r2, r3, r2
+        adds r3, r3, r1
+        adc r0, r2, r0
+        bx lr
+
+//===---------------------------------------------------------------------===//
+
+This:
+        #include <algorithm>
+        std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+        { return std::make_pair(a + b, a + b < a); }
+        bool no_overflow(unsigned a, unsigned b)
+        { return !full_add(a, b).second; }
+
+Should compile to:
+
+_Z8full_addjj:
+	adds	r2, r1, r2
+	movcc	r1, #0
+	movcs	r1, #1
+	str	r2, [r0, #0]
+	strb	r1, [r0, #4]
+	mov	pc, lr
+
+_Z11no_overflowjj:
+	cmn	r0, r1
+	movcs	r0, #0
+	movcc	r0, #1
+	mov	pc, lr
+
+not:
+
+__Z8full_addjj:
+        add r3, r2, r1
+        str r3, [r0]
+        mov r2, #1
+        mov r12, #0
+        cmp r3, r1
+        movlo r12, r2
+        str r12, [r0, #+4]
+        bx lr
+__Z11no_overflowjj:
+        add r3, r1, r0
+        mov r2, #1
+        mov r1, #0
+        cmp r3, r0
+        movhs r1, r2
+        mov r0, r1
+        bx lr
+
+//===---------------------------------------------------------------------===//
+
+Some of the NEON intrinsics may be appropriate for more general use, either
+as target-independent intrinsics or perhaps elsewhere in the ARM backend.
+Some of them may also be lowered to target-independent SDNodes, and perhaps
+some new SDNodes could be added.
+
+For example, maximum, minimum, and absolute value operations are well-defined
+and standard operations, both for vector and scalar types.
+
+The current NEON-specific intrinsics for count leading zeros and count one
+bits could perhaps be replaced by the target-independent ctlz and ctpop
+intrinsics.  It may also make sense to add a target-independent "ctls"
+intrinsic for "count leading sign bits".  Likewise, the backend could use
+the target-independent SDNodes for these operations.
+
+ARMv6 has scalar saturating and halving adds and subtracts.  The same
+intrinsics could possibly be used for both NEON's vector implementations of
+those operations and the ARMv6 scalar versions.
+
+//===---------------------------------------------------------------------===//
+
+ARM::MOVCCr is commutable (by flipping the condition). But we need to implement
+ARMInstrInfo::commuteInstruction() to support it.
+
+//===---------------------------------------------------------------------===//
+
+Split out LDR (literal) from the normal ARM LDR instruction. Also consider
+splitting LDR into imm12 and so_reg forms. This allows us to clean up some
+code. e.g. ARMLoadStoreOptimizer does not need to look at LDR (literal) and
+LDR (so_reg) while ARMConstantIslandPass only needs to worry about
+LDR (literal).
+
+//===---------------------------------------------------------------------===//
+
+Constant island pass should make use of full range SoImm values for LEApcrel.
+Be careful though as the last attempt caused infinite looping on lencod.
+
+//===---------------------------------------------------------------------===//
+
+Predication issue. This function:   
+
+extern unsigned array[ 128 ];
+int     foo( int x ) {
+  int     y;
+  y = array[ x & 127 ];
+  if ( x & 128 )
+     y = 123456789 & ( y >> 2 );
+  else
+     y = 123456789 & y;
+  return y;
+}
+
+compiles to:
+
+_foo:
+	and r1, r0, #127
+	ldr r2, LCPI1_0
+	ldr r2, [r2]
+	ldr r1, [r2, +r1, lsl #2]
+	mov r2, r1, lsr #2
+	tst r0, #128
+	moveq r2, r1
+	ldr r0, LCPI1_1
+	and r0, r2, r0
+	bx lr
+
+It would be better to do something like this, to fold the shift into the
+conditional move:
+
+	and r1, r0, #127
+	ldr r2, LCPI1_0
+	ldr r2, [r2]
+	ldr r1, [r2, +r1, lsl #2]
+	tst r0, #128
+	movne r1, r1, lsr #2
+	ldr r0, LCPI1_1
+	and r0, r1, r0
+	bx lr
+
+it saves an instruction and a register.
+
+//===---------------------------------------------------------------------===//
+
+It might be profitable to cse MOVi16 if there are lots of 32-bit immediates
+with the same bottom half.
+
+//===---------------------------------------------------------------------===//
+
+Robert Muth started working on an alternate jump table implementation that
+does not put the tables in-line in the text.  This is more like the llvm
+default jump table implementation.  This might be useful sometime.  Several
+revisions of patches are on the mailing list, beginning at:
+http://lists.cs.uiuc.edu/pipermail/llvmdev/2009-June/022763.html
+
+//===---------------------------------------------------------------------===//
+
+Make use of the "rbit" instruction.
+
+//===---------------------------------------------------------------------===//
+
+Take a look at test/CodeGen/Thumb2/machine-licm.ll. ARM should be taught how
+to licm and cse the unnecessary load from cp#1.
+
+//===---------------------------------------------------------------------===//
+
+The CMN instruction sets the flags like an ADD instruction, while CMP sets
+them like a subtract. Therefore to be able to use CMN for comparisons other
+than the Z bit, we'll need additional logic to reverse the conditionals
+associated with the comparison. Perhaps a pseudo-instruction for the comparison,
+with a post-codegen pass to clean up and handle the condition codes?
+See PR5694 for testcase.
diff --git a/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
new file mode 100644
index 0000000..163a0a9
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- ARMTargetInfo.cpp - ARM Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+// Target descriptor singletons for the ARM and Thumb targets; filled in by
+// the RegisterTarget helpers below.
+Target llvm::TheARMTarget, llvm::TheThumbTarget;
+
+// Entry point invoked by LLVM's target-registration machinery.  Registers the
+// "arm" and "thumb" targets (both advertising JIT support) with the
+// TargetRegistry so they can be looked up by name or triple.
+extern "C" void LLVMInitializeARMTargetInfo() { 
+  RegisterTarget<Triple::arm, /*HasJIT=*/true>
+    X(TheARMTarget, "arm", "ARM");
+
+  RegisterTarget<Triple::thumb, /*HasJIT=*/true>
+    Y(TheThumbTarget, "thumb", "Thumb");
+}
diff --git a/lib/Target/ARM/TargetInfo/CMakeLists.txt b/lib/Target/ARM/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..3910bb0
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMARMInfo
+  ARMTargetInfo.cpp
+  )
+
+add_dependencies(LLVMARMInfo ARMCodeGenTable_gen)
diff --git a/lib/Target/ARM/TargetInfo/Makefile b/lib/Target/ARM/TargetInfo/Makefile
new file mode 100644
index 0000000..6292ab1
--- /dev/null
+++ b/lib/Target/ARM/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/ARM/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMARMInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
new file mode 100644
index 0000000..7f42c82
--- /dev/null
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -0,0 +1,250 @@
+//===- Thumb1InstrInfo.cpp - Thumb-1 Instruction Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb1InstrInfo.h"
+#include "ARM.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/ADT/SmallVector.h"
+#include "Thumb1InstrInfo.h"
+
+using namespace llvm;
+
+// Construct the Thumb-1 instruction info.  Forwards the subtarget to the
+// shared ARM base class and initializes the Thumb-1 register info, which
+// needs a back-reference to this instruction info.
+Thumb1InstrInfo::Thumb1InstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+}
+
+// getUnindexedOpcode - Return the non-pre/post-incrementing version of the
+// given opcode.  Thumb-1 never provides one here, so this always returns 0
+// ("no such opcode").
+unsigned Thumb1InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  return 0;
+}
+
+// copyRegToReg - Emit a Thumb-1 register-to-register copy before iterator I.
+// Picks the move opcode based on the source/destination register classes
+// (full GPR vs. the low-register-only tGPR class).  Returns false when no
+// copy can be emitted for the given class pair.
+bool Thumb1InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *DestRC,
+                                   const TargetRegisterClass *SrcRC) const {
+  // Borrow the debug location from the instruction we insert before, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (DestRC == ARM::GPRRegisterClass) {
+    if (SrcRC == ARM::GPRRegisterClass) {
+      // GPR -> GPR copy.
+      BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg);
+      return true;
+    } else if (SrcRC == ARM::tGPRRegisterClass) {
+      // Low register -> GPR copy.
+      BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg);
+      return true;
+    }
+  } else if (DestRC == ARM::tGPRRegisterClass) {
+    if (SrcRC == ARM::GPRRegisterClass) {
+      // GPR -> low register copy.
+      BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg);
+      return true;
+    } else if (SrcRC == ARM::tGPRRegisterClass) {
+      // Low register -> low register copy.
+      BuildMI(MBB, I, DL, get(ARM::tMOVr), DestReg).addReg(SrcReg);
+      return true;
+    }
+  }
+
+  // Unsupported register class combination.
+  return false;
+}
+
+// canFoldMemoryOperand - Return true if operand Ops[0] of the given Thumb-1
+// register move could be folded into a stack load/store.  Mirrors the checks
+// done by foldMemoryOperandImpl: only a single operand may be folded, and
+// high physical registers are rejected because tSpill/tRestore cannot
+// encode them.
+bool Thumb1InstrInfo::
+canFoldMemoryOperand(const MachineInstr *MI,
+                     const SmallVectorImpl<unsigned> &Ops) const {
+  if (Ops.size() != 1) return false;
+
+  unsigned OpNum = Ops[0];
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default: break;
+  case ARM::tMOVr:
+  case ARM::tMOVtgpr2gpr:
+  case ARM::tMOVgpr2tgpr:
+  case ARM::tMOVgpr2gpr: {
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+          !isARMLowRegister(SrcReg))
+        // tSpill cannot take a high register operand.
+        return false;
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+          !isARMLowRegister(DstReg))
+        // tRestore cannot target a high register operand.
+        return false;
+    }
+    return true;
+  }
+  }
+
+  return false;
+}
+
+// storeRegToStackSlot - Spill SrcReg to frame index FI with a tSpill.
+// Thumb-1 can only spill the tGPR class or a low physical register, which
+// the assert below enforces; the following if-condition repeats the same
+// test, so no instruction is emitted for anything else.
+void Thumb1InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC) const {
+  // Borrow the debug location from the insertion point, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  assert((RC == ARM::tGPRRegisterClass ||
+          (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+           isARMLowRegister(SrcReg))) && "Unknown regclass!");
+
+  if (RC == ARM::tGPRRegisterClass ||
+      (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+       isARMLowRegister(SrcReg))) {
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    // Attach a memory operand describing the fixed stack slot so later
+    // passes know the store's size and alignment.
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                              MachineMemOperand::MOStore, 0,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tSpill))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
+
+// loadRegFromStackSlot - Reload DestReg from frame index FI with a tRestore.
+// As with storeRegToStackSlot, only the tGPR class or a low physical
+// register is supported (asserted and re-checked below).
+void Thumb1InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const {
+  // Borrow the debug location from the insertion point, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  assert((RC == ARM::tGPRRegisterClass ||
+          (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+           isARMLowRegister(DestReg))) && "Unknown regclass!");
+
+  if (RC == ARM::tGPRRegisterClass ||
+      (TargetRegisterInfo::isPhysicalRegister(DestReg) &&
+       isARMLowRegister(DestReg))) {
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    // Attach a memory operand describing the fixed stack slot so later
+    // passes know the load's size and alignment.
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                              MachineMemOperand::MOLoad, 0,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::tRestore), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+  }
+}
+
+// spillCalleeSavedRegisters - Save all callee-saved registers in CSI with a
+// single predicated tPUSH inserted before MI.  Returns false when CSI is
+// empty so the generic spilling code runs instead.
+bool Thumb1InstrInfo::
+spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator MI,
+                          const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  // Borrow the debug location from the insertion point, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, get(ARM::tPUSH));
+  AddDefaultPred(MIB);
+  MIB.addReg(0); // No write back.
+  // Walk CSI in reverse so register operands are appended in reverse order
+  // of the callee-saved list.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    MIB.addReg(Reg, RegState::Kill);
+  }
+  return true;
+}
+
+// restoreCalleeSavedRegisters - Restore the callee-saved registers in CSI
+// with a single tPOP.  If LR was saved (and the function is not vararg), the
+// pop is turned into tPOP_RET popping straight into PC, and the original
+// return instruction MI is erased.  Returns false when CSI is empty so the
+// generic code runs instead.
+bool Thumb1InstrInfo::
+restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  if (CSI.empty())
+    return false;
+
+  bool isVarArg = AFI->getVarArgsRegSaveSize() > 0;
+  // NOTE(review): unlike the other methods in this file, MI is dereferenced
+  // without an MBB.end() guard — confirm callers always pass a valid
+  // (terminator) instruction here.
+  DebugLoc DL = MI->getDebugLoc();
+  // Build the pop detached from any block; it is inserted at the end, and
+  // only if at least one register operand was added.
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::tPOP));
+  AddDefaultPred(MIB);
+  MIB.addReg(0); // No write back.
+
+  // Despite the name, NumRegs is a flag: "at least one register was added".
+  bool NumRegs = false;
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    if (Reg == ARM::LR) {
+      // Special epilogue for vararg functions. See emitEpilogue
+      if (isVarArg)
+        continue;
+      // Pop directly into PC to return, and drop the old return instruction.
+      Reg = ARM::PC;
+      (*MIB).setDesc(get(ARM::tPOP_RET));
+      MI = MBB.erase(MI);
+    }
+    MIB.addReg(Reg, getDefRegState(true));
+    NumRegs = true;
+  }
+
+  // It's illegal to emit pop instruction without operands.
+  if (NumRegs)
+    MBB.insert(MI, &*MIB);
+
+  return true;
+}
+
+// foldMemoryOperandImpl - Fold a Thumb-1 register move into a stack access:
+// if the move's def operand (OpNum == 0) is being spilled, emit a tSpill of
+// the move's source; if its use operand is being reloaded, emit a tRestore
+// into the move's destination.  Returns NULL when folding is impossible
+// (more than one operand requested, a non-move opcode, or a high physical
+// register that tSpill/tRestore cannot encode).
+MachineInstr *Thumb1InstrInfo::
+foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                      const SmallVectorImpl<unsigned> &Ops, int FI) const {
+  if (Ops.size() != 1) return NULL;
+
+  unsigned OpNum = Ops[0];
+  unsigned Opc = MI->getOpcode();
+  MachineInstr *NewMI = NULL;
+  switch (Opc) {
+  default: break;
+  case ARM::tMOVr:
+  case ARM::tMOVtgpr2gpr:
+  case ARM::tMOVgpr2tgpr:
+  case ARM::tMOVgpr2gpr: {
+    if (OpNum == 0) { // move -> store
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      bool isKill = MI->getOperand(1).isKill();
+      if (TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
+          !isARMLowRegister(SrcReg))
+        // tSpill cannot take a high register operand.
+        break;
+      // Preserve the kill flag from the folded move's source operand.
+      NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tSpill))
+                             .addReg(SrcReg, getKillRegState(isKill))
+                             .addFrameIndex(FI).addImm(0));
+    } else {          // move -> load
+      unsigned DstReg = MI->getOperand(0).getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+          !isARMLowRegister(DstReg))
+        // tRestore cannot target a high register operand.
+        break;
+      // Preserve the dead flag from the folded move's destination operand.
+      bool isDead = MI->getOperand(0).isDead();
+      NewMI = AddDefaultPred(BuildMI(MF, MI->getDebugLoc(), get(ARM::tRestore))
+                             .addReg(DstReg,
+                                     RegState::Define | getDeadRegState(isDead))
+                             .addFrameIndex(FI).addImm(0));
+    }
+    break;
+  }
+  }
+
+  return NewMI;
+}
diff --git a/lib/Target/ARM/Thumb1InstrInfo.h b/lib/Target/ARM/Thumb1InstrInfo.h
new file mode 100644
index 0000000..516ddf1
--- /dev/null
+++ b/lib/Target/ARM/Thumb1InstrInfo.h
@@ -0,0 +1,79 @@
+//===- Thumb1InstrInfo.h - Thumb-1 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB1INSTRUCTIONINFO_H
+#define THUMB1INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "Thumb1RegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+/// Thumb1InstrInfo - Thumb-1 specific instruction information. Overrides the
+/// ARM base implementation for operations where the Thumb-1 encodings differ
+/// (low-register-only spills/restores, push/pop callee-save handling, and
+/// move folding into tSpill / tRestore).
+class Thumb1InstrInfo : public ARMBaseInstrInfo {
+  Thumb1RegisterInfo RI;
+public:
+  explicit Thumb1InstrInfo(const ARMSubtarget &STI);
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const Thumb1RegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// spillCalleeSavedRegisters / restoreCalleeSavedRegisters - Emit a tPUSH /
+  /// tPOP covering the callee-saved registers in CSI. Return true to signal
+  /// the target handled the spill/restore itself.
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   const std::vector<CalleeSavedInfo> &CSI) const;
+
+  /// copyRegToReg - Emit a register-to-register copy, choosing the proper
+  /// Thumb-1 move variant for the given register classes.
+  bool copyRegToReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator I,
+                            unsigned DestReg, unsigned SrcReg,
+                            const TargetRegisterClass *DestRC,
+                            const TargetRegisterClass *SrcRC) const;
+  /// storeRegToStackSlot - Store SrcReg to the given frame index.
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+
+  /// loadRegFromStackSlot - Load DestReg from the given frame index.
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC) const;
+
+  /// canFoldMemoryOperand - Return true if foldMemoryOperandImpl would
+  /// succeed for MI with the given operand indices.
+  bool canFoldMemoryOperand(const MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops) const;
+
+  /// foldMemoryOperandImpl - Fold a register-register move into a tSpill /
+  /// tRestore on the given frame index; returns NULL if not foldable.
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const;
+
+  /// Folding against another load is not supported for Thumb-1.
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr* LoadMI) const {
+    return 0;
+  }
+};
+}
+
+#endif // THUMB1INSTRUCTIONINFO_H
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.cpp b/lib/Target/ARM/Thumb1RegisterInfo.cpp
new file mode 100644
index 0000000..d6630ce
--- /dev/null
+++ b/lib/Target/ARM/Thumb1RegisterInfo.cpp
@@ -0,0 +1,850 @@
+//===- Thumb1RegisterInfo.cpp - Thumb-1 Register Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "Thumb1InstrInfo.h"
+#include "Thumb1RegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Thumb1RegisterInfo adds no state of its own; it simply forwards to the ARM
+// base register info. All Thumb-1 specific behavior lives in the virtual
+// overrides defined below.
+Thumb1RegisterInfo::Thumb1RegisterInfo(const ARMBaseInstrInfo &tii,
+                                       const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(tii, sti) {
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate. Creates a 4-byte-aligned i32 constant-pool entry
+/// holding Val and emits a tLDRcp of that entry into DestReg (sub-register
+/// SubIdx, or 0 for the whole register), predicated on Pred/PredReg.
+void Thumb1RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator &MBBI,
+                                           DebugLoc dl,
+                                           unsigned DestReg, unsigned SubIdx,
+                                           int Val,
+                                           ARMCC::CondCodes Pred,
+                                           unsigned PredReg) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  Constant *C = ConstantInt::get(
+          Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  // 4-byte alignment for the i32 entry.
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRcp))
+          .addReg(DestReg, getDefRegState(true), SubIdx)
+          .addConstantPoolIndex(Idx).addImm(Pred).addReg(PredReg);
+}
+
+/// getPhysicalRegisterRegClass - Map a physical register to the most
+/// restrictive register class it belongs to in Thumb-1 mode: low registers
+/// (r0-r7) get tGPR, the remaining core registers get GPR, and anything else
+/// is deferred to the generic implementation.
+const TargetRegisterClass*
+Thumb1RegisterInfo::getPhysicalRegisterRegClass(unsigned Reg, EVT VT) const {
+  // r0-r7: the only registers most Thumb-1 instructions can encode.
+  if (isARMLowRegister(Reg))
+    return ARM::tGPRRegisterClass;
+
+  switch (Reg) {
+  case ARM::R8:  case ARM::R9:  case ARM::R10:  case ARM::R11:
+  case ARM::R12: case ARM::SP:  case ARM::LR:   case ARM::PC:
+    // High core registers: full GPR class.
+    return ARM::GPRRegisterClass;
+  default:
+    // Non-core registers (e.g. FP/NEON): fall back to the generic lookup.
+    return TargetRegisterInfo::getPhysicalRegisterRegClass(Reg, VT);
+  }
+}
+
+/// hasReservedCallFrame - Decide whether the outgoing-argument area can be
+/// folded into the fixed stack frame. Thumb load/store immediates are small,
+/// so a large call frame produces poor code and may even make it impossible
+/// to scavenge a register; in that case (or when the function has
+/// variable-sized objects) SP is adjusted around each call instead.
+bool Thumb1RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Half of imm8 * 4: beyond this, reserving the call frame hurts codegen.
+  const unsigned Limit = ((1 << 8) - 1) * 4 / 2;
+  if (MFI->getMaxCallFrameSize() >= Limit)
+    return false;
+
+  // Dynamic allocas force dynamic SP adjustment as well.
+  return !MFI->hasVarSizedObjects();
+}
+
+
+/// emitThumbRegPlusImmInReg - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. The immediate is first
+/// materialized in a scratch register (mov / mov+rsb sequence, or a
+/// constant-pool load for values outside [-255, 255]) and then combined with
+/// BaseReg via tADDrr / tSUBrr / tADDhirr. Subtraction has no high-register
+/// form, so for high registers (or when the condition flags must be
+/// preserved) the negative value is materialized and added instead.
+static
+void emitThumbRegPlusImmInReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned DestReg, unsigned BaseReg,
+                              int NumBytes, bool CanChangeCC,
+                              const TargetInstrInfo &TII,
+                              const Thumb1RegisterInfo& MRI,
+                              DebugLoc dl) {
+    MachineFunction &MF = *MBB.getParent();
+    bool isHigh = !isARMLowRegister(DestReg) ||
+                  (BaseReg != 0 && !isARMLowRegister(BaseReg));
+    bool isSub = false;
+    // Subtract doesn't have a high register version. Load the negative value
+    // instead if either the base or dest register is a high register. Also
+    // do not issue sub as part of the sequence if the condition register is
+    // to be preserved.
+    if (NumBytes < 0 && !isHigh && CanChangeCC) {
+      isSub = true;
+      NumBytes = -NumBytes;
+    }
+    // SP cannot be the target of the scratch mov/load; use a fresh virtual
+    // low register (left for the scavenger to assign).
+    unsigned LdReg = DestReg;
+    if (DestReg == ARM::SP) {
+      assert(BaseReg == ARM::SP && "Unexpected!");
+      LdReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
+    }
+
+    if (NumBytes <= 255 && NumBytes >= 0)
+      // Fits in a mov imm8 directly.
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes);
+    else if (NumBytes < 0 && NumBytes >= -255) {
+      // Small negative value: mov then negate with rsb.
+      // NOTE(review): the mov is emitted with the negative immediate itself
+      // and then negated — confirm the intended operand is not -NumBytes.
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8), LdReg))
+        .addImm(NumBytes);
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tRSB), LdReg))
+        .addReg(LdReg, RegState::Kill);
+    } else
+      // Anything else comes from the constant pool.
+      MRI.emitLoadConstPool(MBB, MBBI, dl, LdReg, 0, NumBytes);
+
+    // Emit add / sub.
+    int Opc = (isSub) ? ARM::tSUBrr : (isHigh ? ARM::tADDhirr : ARM::tADDrr);
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+    // tADDhirr does not set flags, so it takes no 's' (CPSR def) operand.
+    if (Opc != ARM::tADDhirr)
+      MIB = AddDefaultT1CC(MIB);
+    if (DestReg == ARM::SP || isSub)
+      MIB.addReg(BaseReg).addReg(LdReg, RegState::Kill);
+    else
+      MIB.addReg(LdReg).addReg(BaseReg, RegState::Kill);
+    AddDefaultPred(MIB);
+}
+
+/// calcNumMI - Return the number of instructions needed to materialize the
+/// specific add / sub r, c: at most one leading tADDrSPi-sized chunk (when
+/// Opc is tADDrSPi, with the remainder handled by 8-bit-immediate tADDi8),
+/// then one instruction per immediate chunk, plus one for any ExtraOpc
+/// fixup instruction.
+static unsigned calcNumMI(int Opc, int ExtraOpc, unsigned Bytes,
+                          unsigned NumBits, unsigned Scale) {
+  unsigned Count = 0;
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+
+  if (Opc == ARM::tADDrSPi) {
+    // First instruction consumes up to one tADDrSPi chunk; the rest is
+    // done with tADDi8 (8 bits, scale 1).
+    Bytes -= (Bytes > Chunk) ? Chunk : Bytes;
+    ++Count;
+    Chunk = (1 << 8) - 1;
+  }
+
+  // One instruction per full chunk, plus one for any remainder.
+  Count += Bytes / Chunk;
+  if ((Bytes % Chunk) != 0)
+    ++Count;
+
+  if (ExtraOpc)
+    ++Count;
+  return Count;
+}
+
+/// emitThumbRegPlusImmediate - Emits a series of instructions to materialize
+/// a destreg = basereg + immediate in Thumb code. Chooses between the
+/// SP-relative forms (tADDspi / tSUBspi / tADDrSPi) and the generic
+/// low-register immediate forms (tADDi3/i8, tSUBi3/i8), splitting the
+/// immediate into encodable chunks; falls back to materializing the
+/// immediate in a register when the chunked expansion would be too long.
+static
+void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI,
+                               unsigned DestReg, unsigned BaseReg,
+                               int NumBytes, const TargetInstrInfo &TII,
+                               const Thumb1RegisterInfo& MRI,
+                               DebugLoc dl) {
+  bool isSub = NumBytes < 0;
+  unsigned Bytes = (unsigned)NumBytes;
+  if (isSub) Bytes = -NumBytes;
+  bool isMul4 = (Bytes & 3) == 0;
+  bool isTwoAddr = false;
+  bool DstNotEqBase = false;
+  unsigned NumBits = 1;
+  unsigned Scale = 1;
+  int Opc = 0;
+  int ExtraOpc = 0;
+  bool NeedCC = false;
+  bool NeedPred = false;
+
+  // Pick the opcode, immediate width and scale for the base case.
+  if (DestReg == BaseReg && BaseReg == ARM::SP) {
+    assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+    NumBits = 7;
+    Scale = 4;
+    Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+    isTwoAddr = true;
+  } else if (!isSub && BaseReg == ARM::SP) {
+    // r1 = add sp, 403
+    // =>
+    // r1 = add sp, 100 * 4
+    // r1 = add r1, 3
+    if (!isMul4) {
+      // tADDrSPi scales by 4; the low two bits are patched up afterwards.
+      Bytes &= ~3;
+      ExtraOpc = ARM::tADDi3;
+    }
+    NumBits = 8;
+    Scale = 4;
+    Opc = ARM::tADDrSPi;
+  } else {
+    // sp = sub sp, c
+    // r1 = sub sp, c
+    // r8 = sub sp, c
+    if (DestReg != BaseReg)
+      DstNotEqBase = true;
+    NumBits = 8;
+    if (DestReg == ARM::SP) {
+      Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+      assert(isMul4 && "Thumb sp inc / dec size must be multiple of 4!");
+      NumBits = 7;
+      Scale = 4;
+    } else {
+      Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+      NumBits = 8;
+      NeedPred = NeedCC = true;
+    }
+    isTwoAddr = true;
+  }
+
+  // If the chunked expansion would be too long, materialize the immediate
+  // in a register instead.
+  unsigned NumMIs = calcNumMI(Opc, ExtraOpc, Bytes, NumBits, Scale);
+  unsigned Threshold = (DestReg == ARM::SP) ? 3 : 2;
+  if (NumMIs > Threshold) {
+    // This will expand into too many instructions. Load the immediate from a
+    // constpool entry.
+    emitThumbRegPlusImmInReg(MBB, MBBI, DestReg, BaseReg, NumBytes, true, TII,
+                             MRI, dl);
+    return;
+  }
+
+  // Two-address forms need Dest == Base first; get part of the immediate (or
+  // a plain move) done while copying Base into Dest.
+  if (DstNotEqBase) {
+    if (isARMLowRegister(DestReg) && isARMLowRegister(BaseReg)) {
+      // If both are low registers, emit DestReg = add BaseReg, max(Imm, 7)
+      unsigned Chunk = (1 << 3) - 1;
+      unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+      Bytes -= ThisVal;
+      const TargetInstrDesc &TID = TII.get(isSub ? ARM::tSUBi3 : ARM::tADDi3);
+      const MachineInstrBuilder MIB =
+        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg));
+      AddDefaultPred(MIB.addReg(BaseReg, RegState::Kill).addImm(ThisVal));
+    } else {
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), DestReg)
+        .addReg(BaseReg, RegState::Kill);
+    }
+    BaseReg = DestReg;
+  }
+
+  // Emit one add/sub per immediate chunk until the value is consumed.
+  unsigned Chunk = ((1 << NumBits) - 1) * Scale;
+  while (Bytes) {
+    unsigned ThisVal = (Bytes > Chunk) ? Chunk : Bytes;
+    Bytes -= ThisVal;
+    ThisVal /= Scale;
+    // Build the new tADD / tSUB.
+    if (isTwoAddr) {
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+      if (NeedCC)
+        MIB = AddDefaultT1CC(MIB);
+      MIB .addReg(DestReg).addImm(ThisVal);
+      if (NeedPred)
+        MIB = AddDefaultPred(MIB);
+    }
+    else {
+      // Only the first tADDrSPi reads SP, so SP itself must not be killed.
+      bool isKill = BaseReg != ARM::SP;
+      MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg);
+      if (NeedCC)
+        MIB = AddDefaultT1CC(MIB);
+      MIB.addReg(BaseReg, getKillRegState(isKill)).addImm(ThisVal);
+      if (NeedPred)
+        MIB = AddDefaultPred(MIB);
+      BaseReg = DestReg;
+
+      if (Opc == ARM::tADDrSPi) {
+        // r4 = add sp, imm
+        // r4 = add r4, imm
+        // ...
+        // Subsequent chunks go through the two-address tADDi8/tSUBi8 form.
+        NumBits = 8;
+        Scale = 1;
+        Chunk = ((1 << NumBits) - 1) * Scale;
+        Opc = isSub ? ARM::tSUBi8 : ARM::tADDi8;
+        NeedPred = NeedCC = isTwoAddr = true;
+      }
+    }
+  }
+
+  // Patch up the low two bits dropped for the tADDrSPi path.
+  if (ExtraOpc) {
+    const TargetInstrDesc &TID = TII.get(ExtraOpc);
+    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg))
+                   .addReg(DestReg, RegState::Kill)
+                   .addImm(((unsigned)NumBytes) & 3));
+  }
+}
+
+/// emitSPUpdate - Adjust SP by NumBytes bytes (negative values grow the
+/// stack) by delegating to emitThumbRegPlusImmediate with SP as both source
+/// and destination.
+static void emitSPUpdate(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI,
+                         const TargetInstrInfo &TII, DebugLoc dl,
+                         const Thumb1RegisterInfo &MRI,
+                         int NumBytes) {
+  emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, ARM::SP, NumBytes, TII,
+                            MRI, dl);
+}
+
+/// eliminateCallFramePseudoInstr - Remove the ADJCALLSTACK{DOWN,UP} pseudo
+/// at I. When the call frame is not reserved (e.g. the function has dynamic
+/// allocas), the pseudo is replaced with explicit SP adjustments; otherwise
+/// it is simply erased, since the frame size already covers the calls.
+void Thumb1RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // If we have alloca, convert as follows:
+    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
+    // ADJCALLSTACKUP   -> add, sp, sp, amount
+    MachineInstr *Old = I;
+    DebugLoc dl = Old->getDebugLoc();
+    unsigned Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      // Replace the pseudo instruction with a new instruction...
+      unsigned Opc = Old->getOpcode();
+      if (Opc == ARM::ADJCALLSTACKDOWN || Opc == ARM::tADJCALLSTACKDOWN) {
+        emitSPUpdate(MBB, I, TII, dl, *this, -Amount);
+      } else {
+        assert(Opc == ARM::ADJCALLSTACKUP || Opc == ARM::tADJCALLSTACKUP);
+        emitSPUpdate(MBB, I, TII, dl, *this, Amount);
+      }
+    }
+  }
+  MBB.erase(I);
+}
+
+/// emitThumbConstant - Emit a series of instructions to materialize a
+/// constant: a mov of the low (up to 255) chunk, a reg-plus-immediate
+/// expansion for any remainder, and a final rsb to negate the result for
+/// negative constants. Clobbers the condition flags (T1 'S' forms).
+static void emitThumbConstant(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator &MBBI,
+                              unsigned DestReg, int Imm,
+                              const TargetInstrInfo &TII,
+                              const Thumb1RegisterInfo& MRI,
+                              DebugLoc dl) {
+  bool isSub = Imm < 0;
+  // Materialize the magnitude, then negate at the end if needed.
+  if (isSub) Imm = -Imm;
+
+  int Chunk = (1 << 8) - 1;
+  int ThisVal = (Imm > Chunk) ? Chunk : Imm;
+  Imm -= ThisVal;
+  AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVi8),
+                                        DestReg))
+                 .addImm(ThisVal));
+  if (Imm > 0)
+    emitThumbRegPlusImmediate(MBB, MBBI, DestReg, DestReg, Imm, TII, MRI, dl);
+  if (isSub) {
+    // rsb DestReg, DestReg, #0 negates the accumulated magnitude.
+    const TargetInstrDesc &TID = TII.get(ARM::tRSB);
+    AddDefaultPred(AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TID, DestReg))
+                   .addReg(DestReg, RegState::Kill));
+  }
+}
+
+/// removeOperands - Strip operands [i, end) from MI. RemoveOperand shifts
+/// later operands down, so erasing repeatedly at the same index removes the
+/// whole tail.
+static void removeOperands(MachineInstr &MI, unsigned i) {
+  unsigned Count = MI.getNumOperands() - i;
+  while (Count--)
+    MI.RemoveOperand(i);
+}
+
+/// rewriteFrameIndex - Placeholder override; the Thumb-1 frame-index
+/// rewriting currently lives entirely in eliminateFrameIndex below, so this
+/// always reports that no residual offset remains.
+int Thumb1RegisterInfo::
+rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                  unsigned FrameReg, int Offset,
+                  unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const
+{
+  // if/when eliminateFrameIndex() conforms with ARMBaseRegisterInfo
+  // version then can pull out Thumb1 specific parts here
+  return 0;
+}
+
+/// saveScavengerRegister - Spill the register so it can be used by the
+/// register scavenger. Return true. Instead of a stack slot, Reg is parked
+/// in R12 between I and UseMI; if anything in that range touches R12, the
+/// restore point (UseMI) is pulled forward to just before that clobber.
+bool
+Thumb1RegisterInfo::saveScavengerRegister(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          MachineBasicBlock::iterator &UseMI,
+                                          const TargetRegisterClass *RC,
+                                          unsigned Reg) const {
+  // Thumb1 can't use the emergency spill slot on the stack because
+  // ldr/str immediate offsets must be positive, and if we're referencing
+  // off the frame pointer (if, for example, there are alloca() calls in
+  // the function), the offset will be negative. Use R12 instead since that's
+  // a call clobbered register that we know won't be used in Thumb1 mode.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  BuildMI(MBB, I, DL, TII.get(ARM::tMOVtgpr2gpr)).
+    addReg(ARM::R12, RegState::Define).addReg(Reg, RegState::Kill);
+
+  // The UseMI is where we would like to restore the register. If there's
+  // interference with R12 before then, however, we'll need to restore it
+  // before that instead and adjust the UseMI.
+  bool done = false;
+  for (MachineBasicBlock::iterator II = I; !done && II != UseMI ; ++II) {
+    // If this instruction affects R12, adjust our restore point.
+    for (unsigned i = 0, e = II->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = II->getOperand(i);
+      // Only physical-register operands can interfere with R12.
+      if (!MO.isReg() || MO.isUndef() || !MO.getReg() ||
+          TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+        continue;
+      if (MO.getReg() == ARM::R12) {
+        UseMI = II;
+        done = true;
+        break;
+      }
+    }
+  }
+  // Restore the register from R12
+  BuildMI(MBB, UseMI, DL, TII.get(ARM::tMOVgpr2tgpr)).
+    addReg(Reg, RegState::Define).addReg(ARM::R12, RegState::Kill);
+
+  return true;
+}
+
+/// eliminateFrameIndex - Replace the abstract frame-index operand of the
+/// instruction at II with a concrete base register (SP, or FP when the
+/// function has dynamic allocas) plus an immediate offset. Small offsets are
+/// folded directly into the instruction; larger ones are split or
+/// materialized into a register. For stores that need a scratch register, a
+/// fresh virtual register is created and returned (with its value reported
+/// through *Value) for the register scavenger to assign; otherwise 0 is
+/// returned.
+unsigned
+Thumb1RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, int *Value,
+                                        RegScavenger *RS) const{
+  unsigned VReg = 0;
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Locate the frame-index operand.
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  // Compute the SP-relative offset of the object, then rebase it on the
+  // callee-saved spill areas or the frame pointer as appropriate.
+  unsigned FrameReg = ARM::SP;
+  int FrameIndex = MI.getOperand(i).getIndex();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MF.getFrameInfo()->getStackSize() + SPAdj;
+
+  if (AFI->isGPRCalleeSavedArea1Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea1Offset();
+  else if (AFI->isGPRCalleeSavedArea2Frame(FrameIndex))
+    Offset -= AFI->getGPRCalleeSavedArea2Offset();
+  else if (hasFP(MF)) {
+    assert(SPAdj == 0 && "Unexpected");
+    // There are alloca()'s in this function, must reference off the frame
+    // pointer instead.
+    FrameReg = getFrameRegister(MF);
+    Offset -= AFI->getFramePtrSpillOffset();
+  }
+
+  unsigned Opcode = MI.getOpcode();
+  const TargetInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+
+  if (Opcode == ARM::tADDrSPi) {
+    Offset += MI.getOperand(i+1).getImm();
+
+    // Can't use tADDrSPi if it's based off the frame pointer.
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (FrameReg != ARM::SP) {
+      Opcode = ARM::tADDi3;
+      MI.setDesc(TII.get(Opcode));
+      NumBits = 3;
+    } else {
+      NumBits = 8;
+      Scale = 4;
+      assert((Offset & 3) == 0 &&
+             "Thumb add/sub sp, #imm immediate must be multiple of 4!");
+    }
+
+    unsigned PredReg;
+    if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::tMOVgpr2tgpr));
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      // Remove offset and remaining explicit predicate operands.
+      // NOTE(review): the while-condition "(!isReg() || !isImm())" is true
+      // for every operand (no operand is both), so this strips *all*
+      // operands after i — confirm that is the intent.
+      do MI.RemoveOperand(i+1);
+      while (MI.getNumOperands() > i+1 &&
+             (!MI.getOperand(i+1).isReg() || !MI.getOperand(i+1).isImm()));
+      return 0;
+    }
+
+    // Common case: small offset, fits into instruction.
+    unsigned Mask = (1 << NumBits) - 1;
+    if (((Offset / Scale) & ~Mask) == 0) {
+      // Replace the FrameIndex with sp / fp
+      if (Opcode == ARM::tADDi3) {
+        // tADDi3 has a different operand layout: rebuild the operand list.
+        removeOperands(MI, i);
+        MachineInstrBuilder MIB(&MI);
+        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg)
+                       .addImm(Offset / Scale));
+      } else {
+        MI.getOperand(i).ChangeToRegister(FrameReg, false);
+        MI.getOperand(i+1).ChangeToImmediate(Offset / Scale);
+      }
+      return 0;
+    }
+
+    unsigned DestReg = MI.getOperand(0).getReg();
+    unsigned Bytes = (Offset > 0) ? Offset : -Offset;
+    unsigned NumMIs = calcNumMI(Opcode, 0, Bytes, NumBits, Scale);
+    // MI would expand into a large number of instructions. Don't try to
+    // simplify the immediate.
+    if (NumMIs > 2) {
+      emitThumbRegPlusImmediate(MBB, II, DestReg, FrameReg, Offset, TII,
+                                *this, dl);
+      MBB.erase(II);
+      return 0;
+    }
+
+    if (Offset > 0) {
+      // Translate r0 = add sp, imm to
+      // r0 = add sp, 255*4
+      // r0 = add r0, (imm - 255*4)
+      if (Opcode == ARM::tADDi3) {
+        removeOperands(MI, i);
+        MachineInstrBuilder MIB(&MI);
+        AddDefaultPred(AddDefaultT1CC(MIB).addReg(FrameReg).addImm(Mask));
+      } else {
+        MI.getOperand(i).ChangeToRegister(FrameReg, false);
+        MI.getOperand(i+1).ChangeToImmediate(Mask);
+      }
+      Offset = (Offset - Mask * Scale);
+      MachineBasicBlock::iterator NII = llvm::next(II);
+      emitThumbRegPlusImmediate(MBB, NII, DestReg, DestReg, Offset, TII,
+                                *this, dl);
+    } else {
+      // Translate r0 = add sp, -imm to
+      // r0 = -imm (this is then translated into a series of instructons)
+      // r0 = add r0, sp
+      emitThumbConstant(MBB, II, DestReg, Offset, TII, *this, dl);
+
+      MI.setDesc(TII.get(ARM::tADDhirr));
+      MI.getOperand(i).ChangeToRegister(DestReg, false, false, true);
+      MI.getOperand(i+1).ChangeToRegister(FrameReg, false);
+      if (Opcode == ARM::tADDi3) {
+        MachineInstrBuilder MIB(&MI);
+        AddDefaultPred(MIB);
+      }
+    }
+    return 0;
+  } else {
+    // Load / store path: fold the offset into the addressing-mode immediate.
+    unsigned ImmIdx = 0;
+    int InstrOffs = 0;
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    switch (AddrMode) {
+    case ARMII::AddrModeT1_s: {
+      ImmIdx = i+1;
+      InstrOffs = MI.getOperand(ImmIdx).getImm();
+      // SP-relative gets the 8-bit immediate; FP-relative only 5 bits.
+      NumBits = (FrameReg == ARM::SP) ? 8 : 5;
+      Scale = 4;
+      break;
+    }
+    default:
+      llvm_unreachable("Unsupported addressing mode!");
+      break;
+    }
+
+    Offset += InstrOffs * Scale;
+    assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+
+    // Common case: small offset, fits into instruction.
+    MachineOperand &ImmOp = MI.getOperand(ImmIdx);
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with sp
+      MI.getOperand(i).ChangeToRegister(FrameReg, false);
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      return 0;
+    }
+
+    bool isThumSpillRestore = Opcode == ARM::tRestore || Opcode == ARM::tSpill;
+    if (AddrMode == ARMII::AddrModeT1_s) {
+      // Thumb tLDRspi, tSTRspi. These will change to instructions that use
+      // a different base register.
+      NumBits = 5;
+      Mask = (1 << NumBits) - 1;
+    }
+    // If this is a thumb spill / restore, we will be using a constpool load to
+    // materialize the offset.
+    if (AddrMode == ARMII::AddrModeT1_s && isThumSpillRestore)
+      ImmOp.ChangeToImmediate(0);
+    else {
+      // Otherwise, it didn't fit. Pull in what we can to simplify the immed.
+      ImmedOffset = ImmedOffset & Mask;
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset &= ~(Mask*Scale);
+    }
+  }
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above, handle the rest, providing a register that is
+  // SP+LargeImm.
+  assert(Offset && "This code isn't needed if offset already handled!");
+
+  // Remove predicate first.
+  int PIdx = MI.findFirstPredOperandIdx();
+  if (PIdx != -1)
+    removeOperands(MI, PIdx);
+
+  if (Desc.mayLoad()) {
+    // Use the destination register to materialize sp + offset.
+    unsigned TmpReg = MI.getOperand(0).getReg();
+    bool UseRR = false;
+    if (Opcode == ARM::tRestore) {
+      if (FrameReg == ARM::SP)
+        emitThumbRegPlusImmInReg(MBB, II, TmpReg, FrameReg,
+                                 Offset, false, TII, *this, dl);
+      else {
+        emitLoadConstPool(MBB, II, dl, TmpReg, 0, Offset);
+        UseRR = true;
+      }
+    } else {
+      emitThumbRegPlusImmediate(MBB, II, TmpReg, FrameReg, Offset, TII,
+                                *this, dl);
+    }
+
+    MI.setDesc(TII.get(ARM::tLDR));
+    MI.getOperand(i).ChangeToRegister(TmpReg, false, false, true);
+    if (UseRR)
+      // Use [reg, reg] addrmode.
+      MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
+    else  // tLDR has an extra register operand.
+      MI.addOperand(MachineOperand::CreateReg(0, false));
+  } else if (Desc.mayStore()) {
+      // Stores can't clobber their data register; scavenge a fresh virtual
+      // register for the address computation instead.
+      VReg = MF.getRegInfo().createVirtualRegister(ARM::tGPRRegisterClass);
+      assert (Value && "Frame index virtual allocated, but Value arg is NULL!");
+      *Value = Offset;
+      bool UseRR = false;
+
+      if (Opcode == ARM::tSpill) {
+        if (FrameReg == ARM::SP)
+          emitThumbRegPlusImmInReg(MBB, II, VReg, FrameReg,
+                                   Offset, false, TII, *this, dl);
+        else {
+          emitLoadConstPool(MBB, II, dl, VReg, 0, Offset);
+          UseRR = true;
+        }
+      } else
+        emitThumbRegPlusImmediate(MBB, II, VReg, FrameReg, Offset, TII,
+                                  *this, dl);
+      MI.setDesc(TII.get(ARM::tSTR));
+      MI.getOperand(i).ChangeToRegister(VReg, false, false, true);
+      if (UseRR)  // Use [reg, reg] addrmode.
+        MI.addOperand(MachineOperand::CreateReg(FrameReg, false));
+      else // tSTR has an extra register operand.
+        MI.addOperand(MachineOperand::CreateReg(0, false));
+  } else
+    assert(false && "Unexpected opcode!");
+
+  // Add predicate back if it's needed.
+  if (MI.getDesc().isPredicable()) {
+    MachineInstrBuilder MIB(&MI);
+    AddDefaultPred(MIB);
+  }
+  return VReg;
+}
+
+/// emitPrologue - Insert the Thumb-1 function prologue into the entry block:
+/// optional vararg register-save area, callee-saved register spill areas
+/// (recorded in ARMFunctionInfo by area), frame-pointer setup when required,
+/// and the final SP adjustment for the remaining local stack.
+void Thumb1RegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo  *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  unsigned NumBytes = MFI->getStackSize();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+  // Thumb add/sub sp, imm8 instructions implicitly multiply the offset by 4.
+  NumBytes = (NumBytes + 3) & ~3;
+  MFI->setStackSize(NumBytes);
+
+  // Determine the sizes of each callee-save spill areas and record which frame
+  // belongs to which callee-save spill areas.
+  unsigned GPRCS1Size = 0, GPRCS2Size = 0, DPRCSSize = 0;
+  int FramePtrSpillFI = 0;
+
+  if (VARegSaveSize)
+    emitSPUpdate(MBB, MBBI, TII, dl, *this, -VARegSaveSize);
+
+  // Leaf frames with no callee saves: just allocate the locals.
+  if (!AFI->hasStackFrame()) {
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
+    return;
+  }
+
+  // Classify each callee-saved register into its spill area: low registers
+  // and LR in area 1; r8-r11 in area 2 on Darwin, area 1 elsewhere; the rest
+  // (DPRs) in the DPR area.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    int FI = CSI[i].getFrameIdx();
+    switch (Reg) {
+    case ARM::R4:
+    case ARM::R5:
+    case ARM::R6:
+    case ARM::R7:
+    case ARM::LR:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      AFI->addGPRCalleeSavedArea1Frame(FI);
+      GPRCS1Size += 4;
+      break;
+    case ARM::R8:
+    case ARM::R9:
+    case ARM::R10:
+    case ARM::R11:
+      if (Reg == FramePtr)
+        FramePtrSpillFI = FI;
+      if (STI.isTargetDarwin()) {
+        AFI->addGPRCalleeSavedArea2Frame(FI);
+        GPRCS2Size += 4;
+      } else {
+        AFI->addGPRCalleeSavedArea1Frame(FI);
+        GPRCS1Size += 4;
+      }
+      break;
+    default:
+      AFI->addDPRCalleeSavedAreaFrame(FI);
+      DPRCSSize += 8;
+    }
+  }
+
+  // Skip past the callee-save push emitted by spillCalleeSavedRegisters.
+  if (MBBI != MBB.end() && MBBI->getOpcode() == ARM::tPUSH) {
+    ++MBBI;
+    if (MBBI != MBB.end())
+      dl = MBBI->getDebugLoc();
+  }
+
+  // Darwin ABI requires FP to point to the stack slot that contains the
+  // previous FP.
+  if (STI.isTargetDarwin() || hasFP(MF)) {
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), FramePtr)
+      .addFrameIndex(FramePtrSpillFI).addImm(0);
+  }
+
+  // Determine starting offsets of spill areas.
+  unsigned DPRCSOffset  = NumBytes - (GPRCS1Size + GPRCS2Size + DPRCSSize);
+  unsigned GPRCS2Offset = DPRCSOffset + DPRCSSize;
+  unsigned GPRCS1Offset = GPRCS2Offset + GPRCS2Size;
+  AFI->setFramePtrSpillOffset(MFI->getObjectOffset(FramePtrSpillFI) + NumBytes);
+  AFI->setGPRCalleeSavedArea1Offset(GPRCS1Offset);
+  AFI->setGPRCalleeSavedArea2Offset(GPRCS2Offset);
+  AFI->setDPRCalleeSavedAreaOffset(DPRCSOffset);
+
+  NumBytes = DPRCSOffset;
+  if (NumBytes) {
+    // Insert it after all the callee-save spills.
+    emitSPUpdate(MBB, MBBI, TII, dl, *this, -NumBytes);
+  }
+
+  if (STI.isTargetELF() && hasFP(MF)) {
+    MFI->setOffsetAdjustment(MFI->getOffsetAdjustment() -
+                             AFI->getFramePtrSpillOffset());
+  }
+
+  AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
+  AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
+  AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
+}
+
+/// isCalleeSavedRegister - Return true if Reg appears in the
+/// zero-terminated callee-saved register list CSRegs.
+static bool isCalleeSavedRegister(unsigned Reg, const unsigned *CSRegs) {
+  for (const unsigned *R = CSRegs; *R != 0; ++R)
+    if (*R == Reg)
+      return true;
+  return false;
+}
+
+/// isCSRestore - Return true if MI is a tRestore of a callee-saved register
+/// from a frame index, i.e. part of the epilogue's callee-save reload code.
+static bool isCSRestore(MachineInstr *MI, const unsigned *CSRegs) {
+  if (MI->getOpcode() != ARM::tRestore)
+    return false;
+  if (!MI->getOperand(1).isFI())
+    return false;
+  return isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs);
+}
+
+/// emitEpilogue - Insert the Thumb-1 function epilogue before the return in
+/// MBB: rewind SP to the callee-save area (via FP when the frame has dynamic
+/// allocas), then, for vararg functions, pop LR into R3, free the vararg
+/// save area, and return through R3 instead of the original return.
+void Thumb1RegisterInfo::emitEpilogue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert((MBBI->getOpcode() == ARM::tBX_RET ||
+          MBBI->getOpcode() == ARM::tPOP_RET) &&
+         "Can only insert epilog into returning blocks");
+  DebugLoc dl = MBBI->getDebugLoc();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
+  unsigned VARegSaveSize = AFI->getVarArgsRegSaveSize();
+  int NumBytes = (int)MFI->getStackSize();
+
+  if (!AFI->hasStackFrame()) {
+    // Leaf frame: just free the locals.
+    if (NumBytes != 0)
+      emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
+  } else {
+    // Unwind MBBI to point to first LDR / VLDRD.
+    const unsigned *CSRegs = getCalleeSavedRegs();
+    if (MBBI != MBB.begin()) {
+      do
+        --MBBI;
+      while (MBBI != MBB.begin() && isCSRestore(MBBI, CSRegs));
+      if (!isCSRestore(MBBI, CSRegs))
+        ++MBBI;
+    }
+
+    // Move SP to start of FP callee save spill area.
+    NumBytes -= (AFI->getGPRCalleeSavedArea1Size() +
+                 AFI->getGPRCalleeSavedArea2Size() +
+                 AFI->getDPRCalleeSavedAreaSize());
+
+    if (hasFP(MF)) {
+      NumBytes = AFI->getFramePtrSpillOffset() - NumBytes;
+      // Reset SP based on frame pointer only if the stack frame extends beyond
+      // frame pointer stack slot or target is ELF and the function has FP.
+      if (NumBytes)
+        emitThumbRegPlusImmediate(MBB, MBBI, ARM::SP, FramePtr, -NumBytes,
+                                  TII, *this, dl);
+      else
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVtgpr2gpr), ARM::SP)
+          .addReg(FramePtr);
+    } else {
+      // Place the SP bump before an existing tPOP so the reloads see the
+      // restored SP.
+      if (MBBI->getOpcode() == ARM::tBX_RET &&
+          &MBB.front() != MBBI &&
+          prior(MBBI)->getOpcode() == ARM::tPOP) {
+        MachineBasicBlock::iterator PMBBI = prior(MBBI);
+        emitSPUpdate(MBB, PMBBI, TII, dl, *this, NumBytes);
+      } else
+        emitSPUpdate(MBB, MBBI, TII, dl, *this, NumBytes);
+    }
+  }
+
+  if (VARegSaveSize) {
+    // Epilogue for vararg functions: pop LR to R3 and branch off it.
+    AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::tPOP)))
+      .addReg(0) // No write back.
+      .addReg(ARM::R3, RegState::Define);
+
+    emitSPUpdate(MBB, MBBI, TII, dl, *this, VARegSaveSize);
+
+    BuildMI(MBB, MBBI, dl, TII.get(ARM::tBX_RET_vararg))
+      .addReg(ARM::R3, RegState::Kill);
+    // The original return is superseded by tBX_RET_vararg.
+    MBB.erase(MBBI);
+  }
+}
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
new file mode 100644
index 0000000..37ad388
--- /dev/null
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -0,0 +1,70 @@
+//===- Thumb1RegisterInfo.h - Thumb-1 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-1 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB1REGISTERINFO_H
+#define THUMB1REGISTERINFO_H
+
+#include "ARM.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+  class Type;
+
+struct Thumb1RegisterInfo : public ARMBaseRegisterInfo {
+public:
+  Thumb1RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+ void emitLoadConstPool(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator &MBBI,
+                        DebugLoc dl,
+                        unsigned DestReg, unsigned SubIdx, int Val,
+                        ARMCC::CondCodes Pred = ARMCC::AL,
+                        unsigned PredReg = 0) const;
+
+  /// Code Generation virtual methods...
+  /// getPhysicalRegisterRegClass - Return the register class for the given
+  /// physical register / value type pair.
+  const TargetRegisterClass *
+    getPhysicalRegisterRegClass(unsigned Reg, EVT VT = MVT::Other) const;
+
+  /// hasReservedCallFrame - True if the maximum call frame is included in
+  /// the stack frame set up in the prologue (no dynamic adjustment needed).
+  bool hasReservedCallFrame(MachineFunction &MF) const;
+
+  /// eliminateCallFramePseudoInstr - Remove the ADJCALLSTACKDOWN /
+  /// ADJCALLSTACKUP pseudo instructions bracketing call sites.
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  // rewrite MI to access 'Offset' bytes from the FP. Return the offset that
+  // could not be handled directly in MI.
+  int rewriteFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                        unsigned FrameReg, int Offset,
+                        unsigned MOVOpc, unsigned ADDriOpc, unsigned SUBriOpc) const;
+
+  /// saveScavengerRegister - Spill Reg so the register scavenger can use it.
+  bool saveScavengerRegister(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I,
+                             MachineBasicBlock::iterator &UseMI,
+                             const TargetRegisterClass *RC,
+                             unsigned Reg) const;
+  /// eliminateFrameIndex - Replace the abstract frame index in the
+  /// instruction at II with a real base register plus offset.
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  /// emitPrologue / emitEpilogue - Insert Thumb-1 function prologue and
+  /// epilogue code.
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+};
+}
+
+#endif // THUMB1REGISTERINFO_H
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
new file mode 100644
index 0000000..f5ba155
--- /dev/null
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -0,0 +1,119 @@
+//===-- Thumb2ITBlockPass.cpp - Insert Thumb IT blocks ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "thumb2-it"
+#include "ARM.h"
+#include "ARMMachineFunctionInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumITs,     "Number of IT blocks inserted");
+
+namespace {
+  /// Thumb2ITBlockPass - Machine function pass that inserts t2IT
+  /// instructions in front of runs of predicated Thumb-2 instructions.
+  struct Thumb2ITBlockPass : public MachineFunctionPass {
+    static char ID;
+    Thumb2ITBlockPass() : MachineFunctionPass(&ID) {}
+
+    const Thumb2InstrInfo *TII; // Set per function in runOnMachineFunction.
+    ARMFunctionInfo *AFI;       // Set per function in runOnMachineFunction.
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "Thumb IT blocks insertion pass";
+    }
+
+  private:
+    // Insert IT instructions for one basic block; returns true on change.
+    bool InsertITBlocks(MachineBasicBlock &MBB);
+  };
+  char Thumb2ITBlockPass::ID = 0;
+}
+
+/// getPredicate - Return the predicate of MI for IT-block formation, setting
+/// PredReg to the predicate register operand. Conditional branches (tBcc /
+/// t2Bcc) are reported as AL so they are never placed inside an IT block.
+static ARMCC::CondCodes getPredicate(const MachineInstr *MI, unsigned &PredReg){
+  unsigned Opc = MI->getOpcode();
+  if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
+    return ARMCC::AL;
+  return llvm::getInstrPredicate(MI, PredReg);
+}
+
+/// InsertITBlocks - Scan MBB for predicated instructions; before each run,
+/// emit a t2IT instruction and fold up to three following instructions with
+/// the same (or opposite) condition into its mask. Returns true if any IT
+/// instruction was inserted.
+bool Thumb2ITBlockPass::InsertITBlocks(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+  while (MBBI != E) {
+    MachineInstr *MI = &*MBBI;
+    DebugLoc dl = MI->getDebugLoc();
+    unsigned PredReg = 0;
+    ARMCC::CondCodes CC = getPredicate(MI, PredReg);
+
+    // Unpredicated instructions need no IT block; skip them.
+    if (CC == ARMCC::AL) {
+      ++MBBI;
+      continue;
+    }
+
+    // Insert an IT instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(ARM::t2IT))
+      .addImm(CC);
+    ++MBBI;
+
+    // Finalize IT mask.
+    ARMCC::CondCodes OCC = ARMCC::getOppositeCondition(CC);
+    unsigned Mask = 0, Pos = 3;
+    // Branches, including tricky ones like LDM_RET, need to end an IT
+    // block so check the instruction we just put in the block.
+    while (MBBI != E && Pos &&
+           (!MI->getDesc().isBranch() && !MI->getDesc().isReturn())) {
+      MachineInstr *NMI = &*MBBI;
+      MI = NMI;
+      DebugLoc ndl = NMI->getDebugLoc();
+      unsigned NPredReg = 0;
+      ARMCC::CondCodes NCC = getPredicate(NMI, NPredReg);
+      if (NCC == OCC) {
+        // Opposite condition: mark the 'else' slot in the mask.
+        Mask |= (1 << Pos);
+      } else if (NCC != CC)
+        break;
+      --Pos;
+      ++MBBI;
+    }
+    // Terminating bit: the lowest set bit in the mask marks the end of
+    // the IT block.
+    Mask |= (1 << Pos);
+    MIB.addImm(Mask);
+    Modified = true;
+    ++NumITs;
+  }
+
+  return Modified;
+}
+
+/// runOnMachineFunction - Insert IT blocks throughout Fn. Thumb functions
+/// only; returns false (no change) for ARM-mode functions.
+bool Thumb2ITBlockPass::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetMachine &TM = Fn.getTarget();
+  AFI = Fn.getInfo<ARMFunctionInfo>();
+  TII = static_cast<const Thumb2InstrInfo*>(TM.getInstrInfo());
+
+  if (!AFI->isThumbFunction())
+    return false;
+
+  bool Modified = false;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock &MBB = *MFI;
+    Modified |= InsertITBlocks(MBB);
+  }
+
+  return Modified;
+}
+
+/// createThumb2ITBlockPass - Returns an instance of the Thumb2 IT blocks
+/// insertion pass. The newly allocated pass is owned by the caller
+/// (typically the pass manager).
+FunctionPass *llvm::createThumb2ITBlockPass() {
+  return new Thumb2ITBlockPass();
+}
diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp
new file mode 100644
index 0000000..20f13f1
--- /dev/null
+++ b/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -0,0 +1,482 @@
+//===- Thumb2InstrInfo.cpp - Thumb-2 Instruction Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Thumb2InstrInfo.h"
+#include "ARM.h"
+#include "ARMConstantPoolValue.h"
+#include "ARMAddressingModes.h"
+#include "ARMGenInstrInfo.inc"
+#include "ARMMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/ADT/SmallVector.h"
+#include "Thumb2InstrInfo.h"
+
+using namespace llvm;
+
+// Construct Thumb-2 instruction info; the embedded register info (RI) is
+// initialized with this instruction info and the subtarget.
+Thumb2InstrInfo::Thumb2InstrInfo(const ARMSubtarget &STI)
+  : ARMBaseInstrInfo(STI), RI(*this, STI) {
+}
+
+// getUnindexedOpcode - Return the non-pre/post-incrementing version of Opc.
+// Not implemented for Thumb-2 yet; always returns 0 ("no such opcode").
+unsigned Thumb2InstrInfo::getUnindexedOpcode(unsigned Opc) const {
+  // FIXME
+  return 0;
+}
+
+/// copyRegToReg - Emit a register-to-register copy before I. Handles the
+/// three GPR/tGPR combinations with the appropriate tMOV* variant and
+/// delegates everything else (SPR, DPR, QPR) to the ARM base class.
+/// Returns true if a copy was emitted.
+bool
+Thumb2InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DestReg, unsigned SrcReg,
+                              const TargetRegisterClass *DestRC,
+                              const TargetRegisterClass *SrcRC) const {
+  // Use the debug location of the insertion point when one exists.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (DestRC == ARM::GPRRegisterClass &&
+      SrcRC == ARM::GPRRegisterClass) {
+    BuildMI(MBB, I, DL, get(ARM::tMOVgpr2gpr), DestReg).addReg(SrcReg);
+    return true;
+  } else if (DestRC == ARM::GPRRegisterClass &&
+             SrcRC == ARM::tGPRRegisterClass) {
+    BuildMI(MBB, I, DL, get(ARM::tMOVtgpr2gpr), DestReg).addReg(SrcReg);
+    return true;
+  } else if (DestRC == ARM::tGPRRegisterClass &&
+             SrcRC == ARM::GPRRegisterClass) {
+    BuildMI(MBB, I, DL, get(ARM::tMOVgpr2tgpr), DestReg).addReg(SrcReg);
+    return true;
+  }
+
+  // Handle SPR, DPR, and QPR copies.
+  return ARMBaseInstrInfo::copyRegToReg(MBB, I, DestReg, SrcReg, DestRC, SrcRC);
+}
+
+/// storeRegToStackSlot - Store SrcReg to stack frame index FI before I.
+/// GPRs use a predicated t2STRi12 with a frame-index memory operand;
+/// other register classes are handled by the ARM base class.
+void Thumb2InstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == ARM::GPRRegisterClass) {
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    // Attach a memory operand describing the fixed stack slot so later
+    // passes can reason about the access.
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                              MachineMemOperand::MOStore, 0,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2STRi12))
+                   .addReg(SrcReg, getKillRegState(isKill))
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  ARMBaseInstrInfo::storeRegToStackSlot(MBB, I, SrcReg, isKill, FI, RC);
+}
+
+/// loadRegFromStackSlot - Load DestReg from stack frame index FI before I.
+/// GPRs use a predicated t2LDRi12 with a frame-index memory operand;
+/// other register classes are handled by the ARM base class.
+void Thumb2InstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == ARM::GPRRegisterClass) {
+    MachineFunction &MF = *MBB.getParent();
+    MachineFrameInfo &MFI = *MF.getFrameInfo();
+    // Attach a memory operand describing the fixed stack slot so later
+    // passes can reason about the access.
+    MachineMemOperand *MMO =
+      MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                              MachineMemOperand::MOLoad, 0,
+                              MFI.getObjectSize(FI),
+                              MFI.getObjectAlignment(FI));
+    AddDefaultPred(BuildMI(MBB, I, DL, get(ARM::t2LDRi12), DestReg)
+                   .addFrameIndex(FI).addImm(0).addMemOperand(MMO));
+    return;
+  }
+
+  ARMBaseInstrInfo::loadRegFromStackSlot(MBB, I, DestReg, FI, RC);
+}
+
+/// emitT2RegPlusImmediate - Emit a Thumb-2 instruction sequence computing
+/// DestReg = BaseReg + NumBytes (NumBytes may be negative). Chooses between
+/// movw/movt + add/sub, SP-relative tADDspi/tSUBspi, and repeated
+/// t2ADDri/t2SUBri (so_imm / imm12) forms depending on the registers
+/// involved and the magnitude of the offset.
+void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator &MBBI, DebugLoc dl,
+                               unsigned DestReg, unsigned BaseReg, int NumBytes,
+                               ARMCC::CondCodes Pred, unsigned PredReg,
+                               const ARMBaseInstrInfo &TII) {
+  // Work with the magnitude; remember the sign for add vs. sub selection.
+  bool isSub = NumBytes < 0;
+  if (isSub) NumBytes = -NumBytes;
+
+  // If profitable, use a movw or movt to materialize the offset.
+  // FIXME: Use the scavenger to grab a scratch register.
+  if (DestReg != ARM::SP && DestReg != BaseReg &&
+      NumBytes >= 4096 &&
+      ARM_AM::getT2SOImmVal(NumBytes) == -1) {
+    bool Fits = false;
+    if (NumBytes < 65536) {
+      // Use a movw to materialize the 16-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), DestReg)
+        .addImm(NumBytes)
+        .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+      Fits = true;
+    } else if ((NumBytes & 0xffff) == 0) {
+      // Use a movt to materialize the 32-bit constant.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVTi16), DestReg)
+        .addReg(DestReg)
+        .addImm(NumBytes >> 16)
+        .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+      Fits = true;
+    }
+
+    if (Fits) {
+      // Combine the materialized constant with BaseReg.
+      if (isSub) {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr), DestReg)
+          .addReg(BaseReg, RegState::Kill)
+          .addReg(DestReg, RegState::Kill)
+          .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(ARM::t2ADDrr), DestReg)
+          .addReg(DestReg, RegState::Kill)
+          .addReg(BaseReg, RegState::Kill)
+        .addImm((unsigned)Pred).addReg(PredReg).addReg(0);
+      }
+      return;
+    }
+  }
+
+  // Otherwise chip away at NumBytes with a sequence of add/sub
+  // instructions, each consuming as many bits as its encoding allows.
+  while (NumBytes) {
+    unsigned ThisVal = NumBytes;
+    unsigned Opc = 0;
+    if (DestReg == ARM::SP && BaseReg != ARM::SP) {
+      // mov sp, rn. Note t2MOVr cannot be used.
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVgpr2gpr),DestReg).addReg(BaseReg);
+      BaseReg = ARM::SP;
+      continue;
+    }
+
+    if (BaseReg == ARM::SP) {
+      // sub sp, sp, #imm7
+      if (DestReg == ARM::SP && (ThisVal < ((1 << 7)-1) * 4)) {
+        assert((ThisVal & 3) == 0 && "Stack update is not multiple of 4?");
+        Opc = isSub ? ARM::tSUBspi : ARM::tADDspi;
+        // FIXME: Fix Thumb1 immediate encoding.
+        BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+          .addReg(BaseReg).addImm(ThisVal/4);
+        NumBytes = 0;
+        continue;
+      }
+
+      // sub rd, sp, so_imm
+      Opc = isSub ? ARM::t2SUBrSPi : ARM::t2ADDrSPi;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    } else {
+      assert(DestReg != ARM::SP && BaseReg != ARM::SP);
+      Opc = isSub ? ARM::t2SUBri : ARM::t2ADDri;
+      if (ARM_AM::getT2SOImmVal(NumBytes) != -1) {
+        NumBytes = 0;
+      } else if (ThisVal < 4096) {
+        Opc = isSub ? ARM::t2SUBri12 : ARM::t2ADDri12;
+        NumBytes = 0;
+      } else {
+        // FIXME: Move this to ARMAddressingModes.h?
+        unsigned RotAmt = CountLeadingZeros_32(ThisVal);
+        ThisVal = ThisVal & ARM_AM::rotr32(0xff000000U, RotAmt);
+        NumBytes &= ~ThisVal;
+        assert(ARM_AM::getT2SOImmVal(ThisVal) != -1 &&
+               "Bit extraction didn't work?");
+      }
+    }
+
+    // Build the new ADD / SUB.
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg)
+                                .addReg(BaseReg, RegState::Kill)
+                                .addImm(ThisVal)));
+
+    // Subsequent iterations accumulate on top of DestReg.
+    BaseReg = DestReg;
+  }
+}
+
+// negativeOffsetOpcode - Map an i12 (positive-offset) Thumb-2 load/store
+// opcode to its i8 (negative-offset) counterpart. i8 opcodes map to
+// themselves; returns 0 for opcodes with no negative-offset form.
+static unsigned
+negativeOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi12:   return ARM::t2LDRi8;
+  case ARM::t2LDRHi12:  return ARM::t2LDRHi8;
+  case ARM::t2LDRBi12:  return ARM::t2LDRBi8;
+  case ARM::t2LDRSHi12: return ARM::t2LDRSHi8;
+  case ARM::t2LDRSBi12: return ARM::t2LDRSBi8;
+  case ARM::t2STRi12:   return ARM::t2STRi8;
+  case ARM::t2STRBi12:  return ARM::t2STRBi8;
+  case ARM::t2STRHi12:  return ARM::t2STRHi8;
+
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+// positiveOffsetOpcode - Map an i8 (negative-offset) Thumb-2 load/store
+// opcode to its i12 (positive-offset) counterpart. i12 opcodes map to
+// themselves; returns 0 for opcodes with no positive-offset form.
+static unsigned
+positiveOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRi8:   return ARM::t2LDRi12;
+  case ARM::t2LDRHi8:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBi8:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHi8: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBi8: return ARM::t2LDRSBi12;
+  case ARM::t2STRi8:   return ARM::t2STRi12;
+  case ARM::t2STRBi8:  return ARM::t2STRBi12;
+  case ARM::t2STRHi8:  return ARM::t2STRHi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+// immediateOffsetOpcode - Map a register-offset ("s") Thumb-2 load/store
+// opcode to its immediate-offset (i12) form. Existing immediate forms (i12
+// and i8) map to themselves; returns 0 for opcodes with no immediate form.
+static unsigned
+immediateOffsetOpcode(unsigned opcode)
+{
+  switch (opcode) {
+  case ARM::t2LDRs:   return ARM::t2LDRi12;
+  case ARM::t2LDRHs:  return ARM::t2LDRHi12;
+  case ARM::t2LDRBs:  return ARM::t2LDRBi12;
+  case ARM::t2LDRSHs: return ARM::t2LDRSHi12;
+  case ARM::t2LDRSBs: return ARM::t2LDRSBi12;
+  case ARM::t2STRs:   return ARM::t2STRi12;
+  case ARM::t2STRBs:  return ARM::t2STRBi12;
+  case ARM::t2STRHs:  return ARM::t2STRHi12;
+
+  case ARM::t2LDRi12:
+  case ARM::t2LDRHi12:
+  case ARM::t2LDRBi12:
+  case ARM::t2LDRSHi12:
+  case ARM::t2LDRSBi12:
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+  case ARM::t2LDRi8:
+  case ARM::t2LDRHi8:
+  case ARM::t2LDRBi8:
+  case ARM::t2LDRSHi8:
+  case ARM::t2LDRSBi8:
+  case ARM::t2STRi8:
+  case ARM::t2STRBi8:
+  case ARM::t2STRHi8:
+    return opcode;
+
+  default:
+    break;
+  }
+
+  return 0;
+}
+
+/// rewriteT2FrameIndex - Rewrite MI's frame-index operand (at FrameRegIdx)
+/// to use FrameReg plus as much of Offset as the instruction's encoding can
+/// absorb. On return, Offset holds the residual (signed) amount that could
+/// not be folded into MI; returns true iff the entire offset was folded.
+bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                               unsigned FrameReg, int &Offset,
+                               const ARMBaseInstrInfo &TII) {
+  unsigned Opcode = MI.getOpcode();
+  const TargetInstrDesc &Desc = MI.getDesc();
+  unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+  bool isSub = false;
+
+  // Memory operands in inline assembly always use AddrModeT2_i12.
+  if (Opcode == ARM::INLINEASM)
+    AddrMode = ARMII::AddrModeT2_i12; // FIXME. mode for thumb2?
+
+  if (Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
+    Offset += MI.getOperand(FrameRegIdx+1).getImm();
+
+    bool isSP = FrameReg == ARM::SP;
+    unsigned PredReg;
+    if (Offset == 0 && getInstrPredicate(&MI, PredReg) == ARMCC::AL) {
+      // Turn it into a move.
+      MI.setDesc(TII.get(ARM::tMOVgpr2gpr));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      // Remove offset and remaining explicit predicate operands.
+      // NOTE(review): an operand cannot be both a register and an
+      // immediate, so this condition is always true and the loop strips
+      // all operands after FrameRegIdx — confirm that is the intent.
+      do MI.RemoveOperand(FrameRegIdx+1);
+      while (MI.getNumOperands() > FrameRegIdx+1 &&
+             (!MI.getOperand(FrameRegIdx+1).isReg() ||
+              !MI.getOperand(FrameRegIdx+1).isImm()));
+      return true;
+    }
+
+    // Select add vs. sub (and SP-relative vs. plain form) by offset sign.
+    if (Offset < 0) {
+      Offset = -Offset;
+      isSub = true;
+      MI.setDesc(TII.get(isSP ? ARM::t2SUBrSPi : ARM::t2SUBri));
+    } else {
+      MI.setDesc(TII.get(isSP ? ARM::t2ADDrSPi : ARM::t2ADDri));
+    }
+
+    // Common case: small offset, fits into instruction.
+    if (ARM_AM::getT2SOImmVal(Offset) != -1) {
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      Offset = 0;
+      return true;
+    }
+    // Another common case: imm12.
+    if (Offset < 4096) {
+      unsigned NewOpc = isSP
+        ? (isSub ? ARM::t2SUBrSPi12 : ARM::t2ADDrSPi12)
+        : (isSub ? ARM::t2SUBri12   : ARM::t2ADDri12);
+      MI.setDesc(TII.get(NewOpc));
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(Offset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, extract 8 adjacent bits from the immediate into this
+    // t2ADDri/t2SUBri.
+    unsigned RotAmt = CountLeadingZeros_32(Offset);
+    unsigned ThisImmVal = Offset & ARM_AM::rotr32(0xff000000U, RotAmt);
+
+    // We will handle these bits from offset, clear them.
+    Offset &= ~ThisImmVal;
+
+    assert(ARM_AM::getT2SOImmVal(ThisImmVal) != -1 &&
+           "Bit extraction didn't work?");
+    MI.getOperand(FrameRegIdx+1).ChangeToImmediate(ThisImmVal);
+  } else {
+
+    // AddrMode4 and AddrMode6 cannot handle any offset.
+    if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6)
+      return false;
+
+    // AddrModeT2_so cannot handle any offset. If there is no offset
+    // register then we change to an immediate version.
+    unsigned NewOpc = Opcode;
+    if (AddrMode == ARMII::AddrModeT2_so) {
+      unsigned OffsetReg = MI.getOperand(FrameRegIdx+1).getReg();
+      if (OffsetReg != 0) {
+        MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+        return Offset == 0;
+      }
+
+      // Drop the dead offset register and switch to the i12 form with a
+      // zero immediate.
+      MI.RemoveOperand(FrameRegIdx+1);
+      MI.getOperand(FrameRegIdx+1).ChangeToImmediate(0);
+      NewOpc = immediateOffsetOpcode(Opcode);
+      AddrMode = ARMII::AddrModeT2_i12;
+    }
+
+    unsigned NumBits = 0;
+    unsigned Scale = 1;
+    if (AddrMode == ARMII::AddrModeT2_i8 || AddrMode == ARMII::AddrModeT2_i12) {
+      // i8 supports only negative, and i12 supports only positive, so
+      // based on Offset sign convert Opcode to the appropriate
+      // instruction
+      Offset += MI.getOperand(FrameRegIdx+1).getImm();
+      if (Offset < 0) {
+        NewOpc = negativeOffsetOpcode(Opcode);
+        NumBits = 8;
+        isSub = true;
+        Offset = -Offset;
+      } else {
+        NewOpc = positiveOffsetOpcode(Opcode);
+        NumBits = 12;
+      }
+    } else if (AddrMode == ARMII::AddrMode5) {
+      // VFP address mode.
+      const MachineOperand &OffOp = MI.getOperand(FrameRegIdx+1);
+      int InstrOffs = ARM_AM::getAM5Offset(OffOp.getImm());
+      if (ARM_AM::getAM5Op(OffOp.getImm()) == ARM_AM::sub)
+        InstrOffs *= -1;
+      NumBits = 8;
+      Scale = 4;
+      Offset += InstrOffs * 4;
+      assert((Offset & (Scale-1)) == 0 && "Can't encode this offset!");
+      if (Offset < 0) {
+        Offset = -Offset;
+        isSub = true;
+      }
+    } else {
+      llvm_unreachable("Unsupported addressing mode!");
+    }
+
+    if (NewOpc != Opcode)
+      MI.setDesc(TII.get(NewOpc));
+
+    MachineOperand &ImmOp = MI.getOperand(FrameRegIdx+1);
+
+    // Attempt to fold address computation
+    // Common case: small offset, fits into instruction.
+    int ImmedOffset = Offset / Scale;
+    unsigned Mask = (1 << NumBits) - 1;
+    if ((unsigned)Offset <= Mask * Scale) {
+      // Replace the FrameIndex with fp/sp
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+      if (isSub) {
+        if (AddrMode == ARMII::AddrMode5)
+          // FIXME: Not consistent.
+          ImmedOffset |= 1 << NumBits;
+        else
+          ImmedOffset = -ImmedOffset;
+      }
+      ImmOp.ChangeToImmediate(ImmedOffset);
+      Offset = 0;
+      return true;
+    }
+
+    // Otherwise, offset doesn't fit. Pull in what we can to simplify
+    ImmedOffset = ImmedOffset & Mask;
+    if (isSub) {
+      if (AddrMode == ARMII::AddrMode5)
+        // FIXME: Not consistent.
+        ImmedOffset |= 1 << NumBits;
+      else {
+        ImmedOffset = -ImmedOffset;
+        if (ImmedOffset == 0)
+          // Change the opcode back if the encoded offset is zero.
+          MI.setDesc(TII.get(positiveOffsetOpcode(NewOpc)));
+      }
+    }
+    ImmOp.ChangeToImmediate(ImmedOffset);
+    Offset &= ~(Mask*Scale);
+  }
+
+  // Report the residual offset (restoring its sign) to the caller.
+  Offset = (isSub) ? -Offset : Offset;
+  return Offset == 0;
+}
diff --git a/lib/Target/ARM/Thumb2InstrInfo.h b/lib/Target/ARM/Thumb2InstrInfo.h
new file mode 100644
index 0000000..a0f89a6
--- /dev/null
+++ b/lib/Target/ARM/Thumb2InstrInfo.h
@@ -0,0 +1,58 @@
+//===- Thumb2InstrInfo.h - Thumb-2 Instruction Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB2INSTRUCTIONINFO_H
+#define THUMB2INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "ARM.h"
+#include "ARMInstrInfo.h"
+#include "Thumb2RegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+
+class Thumb2InstrInfo : public ARMBaseInstrInfo {
+  Thumb2RegisterInfo RI; // Thumb-2 register info owned by this object.
+public:
+  explicit Thumb2InstrInfo(const ARMSubtarget &STI);
+
+  // Return the non-pre/post incrementing version of 'Opc'. Return 0
+  // if there is not such an opcode.
+  unsigned getUnindexedOpcode(unsigned Opc) const;
+
+  /// copyRegToReg - Emit a register copy; handles GPR/tGPR combinations
+  /// here and defers other classes to the base. Returns true on success.
+  bool copyRegToReg(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator I,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *DestRC,
+                    const TargetRegisterClass *SrcRC) const;
+
+  /// storeRegToStackSlot - Store SrcReg to the FrameIndex stack slot.
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC) const;
+
+  /// loadRegFromStackSlot - Load DestReg from the FrameIndex stack slot.
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC) const;
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  const Thumb2RegisterInfo &getRegisterInfo() const { return RI; }
+};
+}
+
+#endif // THUMB2INSTRUCTIONINFO_H
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.cpp b/lib/Target/ARM/Thumb2RegisterInfo.cpp
new file mode 100644
index 0000000..f24d3e2
--- /dev/null
+++ b/lib/Target/ARM/Thumb2RegisterInfo.cpp
@@ -0,0 +1,62 @@
+//===- Thumb2RegisterInfo.cpp - Thumb-2 Register Information ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseInstrInfo.h"
+#include "ARMMachineFunctionInfo.h"
+#include "ARMSubtarget.h"
+#include "Thumb2InstrInfo.h"
+#include "Thumb2RegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+// Construct Thumb-2 register info; all state lives in the ARM base class.
+Thumb2RegisterInfo::Thumb2RegisterInfo(const ARMBaseInstrInfo &tii,
+                                       const ARMSubtarget &sti)
+  : ARMBaseRegisterInfo(tii, sti) {
+}
+
+/// emitLoadConstPool - Emits a load from constpool to materialize the
+/// specified immediate. Creates (or reuses) an i32 constant-pool entry for
+/// Val and emits a t2LDRpci of it into DestReg.
+void Thumb2RegisterInfo::emitLoadConstPool(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator &MBBI,
+                                           DebugLoc dl,
+                                           unsigned DestReg, unsigned SubIdx,
+                                           int Val,
+                                           ARMCC::CondCodes Pred,
+                                           unsigned PredReg) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineConstantPool *ConstantPool = MF.getConstantPool();
+  Constant *C = ConstantInt::get(
+           Type::getInt32Ty(MBB.getParent()->getFunction()->getContext()), Val);
+  // 4-byte aligned constant-pool entry for the 32-bit value.
+  unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+
+  // NOTE(review): the Pred / PredReg parameters are ignored here — the load
+  // is always emitted with an AL predicate and no predicate register.
+  // Confirm no caller relies on a non-AL predicate being honored.
+  BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
+    .addReg(DestReg, getDefRegState(true), SubIdx)
+    .addConstantPoolIndex(Idx).addImm((int64_t)ARMCC::AL).addReg(0);
+}
diff --git a/lib/Target/ARM/Thumb2RegisterInfo.h b/lib/Target/ARM/Thumb2RegisterInfo.h
new file mode 100644
index 0000000..b3cf2e5
--- /dev/null
+++ b/lib/Target/ARM/Thumb2RegisterInfo.h
@@ -0,0 +1,42 @@
+//===- Thumb2RegisterInfo.h - Thumb-2 Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Thumb-2 implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef THUMB2REGISTERINFO_H
+#define THUMB2REGISTERINFO_H
+
+#include "ARM.h"
+#include "ARMRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+  class ARMSubtarget;
+  class ARMBaseInstrInfo;
+  class Type;
+
+/// Thumb2RegisterInfo - Thumb-2 register information; inherits almost all
+/// behavior from ARMBaseRegisterInfo, overriding only constant-pool loads.
+struct Thumb2RegisterInfo : public ARMBaseRegisterInfo {
+public:
+  Thumb2RegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &STI);
+
+  /// emitLoadConstPool - Emits a load from constpool to materialize the
+  /// specified immediate.
+  void emitLoadConstPool(MachineBasicBlock &MBB,
+                         MachineBasicBlock::iterator &MBBI,
+                         DebugLoc dl,
+                         unsigned DestReg, unsigned SubIdx, int Val,
+                         ARMCC::CondCodes Pred = ARMCC::AL,
+                         unsigned PredReg = 0) const;
+};
+}
+
+#endif // THUMB2REGISTERINFO_H
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
new file mode 100644
index 0000000..5086eff
--- /dev/null
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -0,0 +1,709 @@
+//===-- Thumb2SizeReduction.cpp - Thumb2 code size reduction pass -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "t2-reduce-size"
+#include "ARM.h"
+#include "ARMAddressingModes.h"
+#include "ARMBaseRegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumNarrows,  "Number of 32-bit instrs reduced to 16-bit ones");
+STATISTIC(Num2Addrs,   "Number of 32-bit instrs reduced to 2addr 16-bit ones");
+STATISTIC(NumLdSts,    "Number of 32-bit load / store reduced to 16-bit ones");
+
+static cl::opt<int> ReduceLimit("t2-reduce-limit",
+                                cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimit2Addr("t2-reduce-limit2",
+                                     cl::init(-1), cl::Hidden);
+static cl::opt<int> ReduceLimitLdSt("t2-reduce-limit3",
+                                     cl::init(-1), cl::Hidden);
+
+namespace {
+  /// ReduceTable - A static table with information on mapping from wide
+  /// opcodes to narrow
+  struct ReduceEntry {
+    unsigned WideOpc;      // Wide opcode
+    unsigned NarrowOpc1;   // Narrow opcode to transform to
+    unsigned NarrowOpc2;   // Narrow opcode when it's two-address
+    uint8_t  Imm1Limit;    // Limit of immediate field (bits)
+    uint8_t  Imm2Limit;    // Limit of immediate field when it's two-address
+    unsigned LowRegs1 : 1; // Only possible if low-registers are used
+    unsigned LowRegs2 : 1; // Only possible if low-registers are used (2addr)
+    unsigned PredCC1  : 2; // 0 - If predicated, cc is on and vice versa.
+                           // 1 - No cc field.
+                           // 2 - Always set CPSR.
+    unsigned PredCC2  : 2; // Same encoding as PredCC1, for the 2addr form.
+    unsigned Special  : 1; // Needs to be dealt with specially
+  };
+
+  // One row per wide (32-bit) Thumb2 opcode that may be narrowed. A zero in
+  // NarrowOpc1 / NarrowOpc2 means no narrow form of that kind exists. The
+  // pass constructor builds ReduceOpcodeMap from WideOpc to the row index,
+  // asserting that no WideOpc appears twice.
+  static const ReduceEntry ReduceTable[] = {
+    // Wide,        Narrow1,      Narrow2,     imm1,imm2,  lo1, lo2, P/C, S
+    { ARM::t2ADCrr, 0,            ARM::tADC,     0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2ADDri, ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  0,0, 0 },
+    { ARM::t2ADDrr, ARM::tADDrr,  ARM::tADDhirr, 0,   0,    1,   0,  0,1, 0 },
+    // Note: immediate scale is 4.
+    { ARM::t2ADDrSPi,ARM::tADDrSPi,0,            8,   0,    1,   0,  1,0, 0 },
+    { ARM::t2ADDSri,ARM::tADDi3,  ARM::tADDi8,   3,   8,    1,   1,  2,2, 1 },
+    { ARM::t2ADDSrr,ARM::tADDrr,  0,             0,   0,    1,   0,  2,0, 1 },
+    { ARM::t2ANDrr, 0,            ARM::tAND,     0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2ASRri, ARM::tASRri,  0,             5,   0,    1,   0,  0,0, 0 },
+    { ARM::t2ASRrr, 0,            ARM::tASRrr,   0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2BICrr, 0,            ARM::tBIC,     0,   0,    0,   1,  0,0, 0 },
+    //FIXME: Disable CMN, as CCodes are backwards from compare expectations
+    //{ ARM::t2CMNrr, ARM::tCMN,    0,             0,   0,    1,   0,  2,0, 0 },
+    { ARM::t2CMPri, ARM::tCMPi8,  0,             8,   0,    1,   0,  2,0, 0 },
+    { ARM::t2CMPrr, ARM::tCMPhir, 0,             0,   0,    0,   0,  2,0, 0 },
+    { ARM::t2CMPzri,ARM::tCMPzi8, 0,             8,   0,    1,   0,  2,0, 0 },
+    { ARM::t2CMPzrr,ARM::tCMPzhir,0,             0,   0,    0,   0,  2,0, 0 },
+    { ARM::t2EORrr, 0,            ARM::tEOR,     0,   0,    0,   1,  0,0, 0 },
+    // FIXME: adr.n immediate offset must be multiple of 4.
+    //{ ARM::t2LEApcrelJT,ARM::tLEApcrelJT, 0,     0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2LSLri, ARM::tLSLri,  0,             5,   0,    1,   0,  0,0, 0 },
+    { ARM::t2LSLrr, 0,            ARM::tLSLrr,   0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2LSRri, ARM::tLSRri,  0,             5,   0,    1,   0,  0,0, 0 },
+    { ARM::t2LSRrr, 0,            ARM::tLSRrr,   0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2MOVi,  ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 0 },
+    { ARM::t2MOVi16,ARM::tMOVi8,  0,             8,   0,    1,   0,  0,0, 1 },
+    // FIXME: Do we need the 16-bit 'S' variant?
+    { ARM::t2MOVr,ARM::tMOVgpr2gpr,0,            0,   0,    0,   0,  1,0, 0 },
+    { ARM::t2MOVCCr,0,            ARM::tMOVCCr,  0,   0,    0,   0,  0,1, 0 },
+    { ARM::t2MOVCCi,0,            ARM::tMOVCCi,  0,   8,    0,   1,  0,1, 0 },
+    { ARM::t2MUL,   0,            ARM::tMUL,     0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2MVNr,  ARM::tMVN,    0,             0,   0,    1,   0,  0,0, 0 },
+    { ARM::t2ORRrr, 0,            ARM::tORR,     0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2REV,   ARM::tREV,    0,             0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2REV16, ARM::tREV16,  0,             0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2REVSH, ARM::tREVSH,  0,             0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2RORrr, 0,            ARM::tROR,     0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2RSBri, ARM::tRSB,    0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2RSBSri,ARM::tRSB,    0,             0,   0,    1,   0,  2,0, 1 },
+    { ARM::t2SBCrr, 0,            ARM::tSBC,     0,   0,    0,   1,  0,0, 0 },
+    { ARM::t2SUBri, ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  0,0, 0 },
+    { ARM::t2SUBrr, ARM::tSUBrr,  0,             0,   0,    1,   0,  0,0, 0 },
+    { ARM::t2SUBSri,ARM::tSUBi3,  ARM::tSUBi8,   3,   8,    1,   1,  2,2, 0 },
+    { ARM::t2SUBSrr,ARM::tSUBrr,  0,             0,   0,    1,   0,  2,0, 0 },
+    { ARM::t2SXTBr, ARM::tSXTB,   0,             0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2SXTHr, ARM::tSXTH,   0,             0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2TSTrr, ARM::tTST,    0,             0,   0,    1,   0,  2,0, 0 },
+    { ARM::t2UXTBr, ARM::tUXTB,   0,             0,   0,    1,   0,  1,0, 0 },
+    { ARM::t2UXTHr, ARM::tUXTH,   0,             0,   0,    1,   0,  1,0, 0 },
+
+    // FIXME: Clean this up after splitting each Thumb load / store opcode
+    // into multiple ones.
+    { ARM::t2LDRi12,ARM::tLDR,    ARM::tLDRspi,  5,   8,    1,   0,  0,0, 1 },
+    { ARM::t2LDRs,  ARM::tLDR,    0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRBi12,ARM::tLDRB,  0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRBs, ARM::tLDRB,   0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRHi12,ARM::tLDRH,  0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRHs, ARM::tLDRH,   0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRSBs,ARM::tLDRSB,  0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2LDRSHs,ARM::tLDRSH,  0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRi12,ARM::tSTR,    ARM::tSTRspi,  5,   8,    1,   0,  0,0, 1 },
+    { ARM::t2STRs,  ARM::tSTR,    0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRBi12,ARM::tSTRB,  0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRBs, ARM::tSTRB,   0,             0,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRHi12,ARM::tSTRH,  0,             5,   0,    1,   0,  0,0, 1 },
+    { ARM::t2STRHs, ARM::tSTRH,   0,             0,   0,    1,   0,  0,0, 1 },
+
+    { ARM::t2LDM_RET,0,           ARM::tPOP_RET, 0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2LDM,   ARM::tLDM,    ARM::tPOP,     0,   0,    1,   1,  1,1, 1 },
+    { ARM::t2STM,   ARM::tSTM,    ARM::tPUSH,    0,   0,    1,   1,  1,1, 1 },
+  };
+
+  /// Thumb2SizeReduce - Machine function pass that rewrites 32-bit Thumb2
+  /// instructions into equivalent 16-bit ones, per ReduceTable, when the
+  /// operands, immediates, predication and CPSR liveness permit.
+  class Thumb2SizeReduce : public MachineFunctionPass {
+  public:
+    static char ID;
+    Thumb2SizeReduce();
+
+    // Cached instruction info, set up in runOnMachineFunction.
+    const Thumb2InstrInfo *TII;
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "Thumb2 instruction size reduction pass";
+    }
+
+  private:
+    /// ReduceOpcodeMap - Maps wide opcode to index of entry in ReduceTable.
+    DenseMap<unsigned, unsigned> ReduceOpcodeMap;
+
+    /// VerifyPredAndCC - Check that MI's predication and CPSR-setting state
+    /// are compatible with the narrow opcode selected from Entry; may update
+    /// HasCC / CCDead for the narrow form.
+    bool VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                         bool is2Addr, ARMCC::CondCodes Pred,
+                         bool LiveCPSR, bool &HasCC, bool &CCDead);
+
+    /// ReduceLoadStore - Reduce a 32-bit load / store (or load / store
+    /// multiple) to its 16-bit equivalent.
+    bool ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                         const ReduceEntry &Entry);
+
+    /// ReduceSpecial - Handle table entries marked Special (loads / stores,
+    /// S-suffixed arithmetic, RSB with zero immediate, MOVi16, ...).
+    bool ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry, bool LiveCPSR);
+
+    /// ReduceTo2Addr - Reduce a 32-bit instruction to a 16-bit two-address
+    /// instruction.
+    bool ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                       const ReduceEntry &Entry,
+                       bool LiveCPSR);
+
+    /// ReduceToNarrow - Reduce a 32-bit instruction to a 16-bit
+    /// non-two-address instruction.
+    bool ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                        const ReduceEntry &Entry,
+                        bool LiveCPSR);
+
+    /// ReduceMBB - Reduce width of instructions in the specified basic block.
+    bool ReduceMBB(MachineBasicBlock &MBB);
+  };
+  char Thumb2SizeReduce::ID = 0;
+}
+
+Thumb2SizeReduce::Thumb2SizeReduce() : MachineFunctionPass(&ID) {
+  // Build the wide-opcode -> ReduceTable-index map once at construction.
+  for (unsigned i = 0, e = array_lengthof(ReduceTable); i != e; ++i) {
+    bool Inserted =
+      ReduceOpcodeMap.insert(std::make_pair(ReduceTable[i].WideOpc, i)).second;
+    (void)Inserted;
+    assert(Inserted && "Duplicated entries?");
+  }
+}
+
+/// HasImplicitCPSRDef - Return true if CPSR appears in TID's
+/// null-terminated implicit-def list.
+static bool HasImplicitCPSRDef(const TargetInstrDesc &TID) {
+  const unsigned *ImpDefs = TID.ImplicitDefs;
+  while (unsigned Reg = *ImpDefs++)
+    if (Reg == ARM::CPSR)
+      return true;
+  return false;
+}
+
+/// VerifyPredAndCC - Given MI's predicate (Pred) and whether it currently has
+/// an explicit CPSR def (HasCC), decide if the narrow opcode chosen from
+/// Entry can be used. HasCC / CCDead are updated in place to describe the
+/// CPSR def the narrow instruction should carry. Returns true if legal.
+bool
+Thumb2SizeReduce::VerifyPredAndCC(MachineInstr *MI, const ReduceEntry &Entry,
+                                  bool is2Addr, ARMCC::CondCodes Pred,
+                                  bool LiveCPSR, bool &HasCC, bool &CCDead) {
+  if ((is2Addr  && Entry.PredCC2 == 0) ||
+      (!is2Addr && Entry.PredCC1 == 0)) {
+    // PredCC == 0: the narrow opcode sets CPSR exactly when unpredicated.
+    if (Pred == ARMCC::AL) {
+      // Not predicated, must set CPSR.
+      if (!HasCC) {
+        // Original instruction was not setting CPSR, but CPSR is not
+        // currently live anyway. It's ok to set it. The CPSR def is
+        // dead though.
+        if (!LiveCPSR) {
+          HasCC = true;
+          CCDead = true;
+          return true;
+        }
+        return false;
+      }
+    } else {
+      // Predicated, must not set CPSR.
+      if (HasCC)
+        return false;
+    }
+  } else if ((is2Addr  && Entry.PredCC2 == 2) ||
+             (!is2Addr && Entry.PredCC1 == 2)) {
+    /// Old opcode has an optional def of CPSR.
+    if (HasCC)
+      return true;
+    // If the old opcode has no implicit CPSR def either, then it's not ok,
+    // since the new opcode's CPSR def is not meant to be thrown away.
+    // e.g. CMP.
+    if (!HasImplicitCPSRDef(MI->getDesc()))
+      return false;
+    // Adopt the implicit CPSR def as the narrow instruction's CPSR def.
+    HasCC = true;
+  } else {
+    // 16-bit instruction does not set CPSR.
+    if (HasCC)
+      return false;
+  }
+
+  return true;
+}
+
+/// VerifyLowRegs - Check that every explicit register operand of MI is one
+/// the 16-bit form can encode: a low register, or PC / LR / SP where the
+/// specific opcode allows it.
+static bool VerifyLowRegs(MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+  bool isPCOk = (Opc == ARM::t2LDM_RET) || (Opc == ARM::t2LDM);
+  bool isLROk = (Opc == ARM::t2STM);
+  bool isSPOk = isPCOk || isLROk || (Opc == ARM::t2ADDrSPi);
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    if (!MO.isReg() || MO.isImplicit())
+      continue;
+    unsigned Reg = MO.getReg();
+    // No register, CPSR, and low registers are always fine.
+    if (Reg == 0 || Reg == ARM::CPSR || isARMLowRegister(Reg))
+      continue;
+    // High registers are only acceptable in the opcode-specific cases below.
+    bool RegOk =
+      (isPCOk && Reg == ARM::PC) ||
+      (isLROk && Reg == ARM::LR) ||
+      (Reg == ARM::SP &&
+       (isSPOk ||
+        // Special case for these ldr / str with sp as base register.
+        (i == 1 && (Opc == ARM::t2LDRi12 || Opc == ARM::t2STRi12))));
+    if (!RegOk)
+      return false;
+  }
+  return true;
+}
+
+/// ReduceLoadStore - Try to replace MI, a 32-bit Thumb2 load / store (or
+/// load / store multiple), with the 16-bit equivalent described by Entry.
+/// Returns true (and erases MI) on success.
+bool
+Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
+                                  const ReduceEntry &Entry) {
+  if (ReduceLimitLdSt != -1 && ((int)NumLdSts >= ReduceLimitLdSt))
+    return false;
+
+  unsigned Scale = 1;          // Immediate offset scale for this access size.
+  bool HasImmOffset = false;   // MI carries an immediate offset operand.
+  bool HasShift = false;       // MI uses a (reg, reg, shift) address.
+  bool HasOffReg = true;       // Narrow form takes an offset register.
+  bool isLdStMul = false;      // MI is a load / store multiple.
+  unsigned Opc = Entry.NarrowOpc1;
+  unsigned OpNum = 3; // First 'rest' of operands.
+  uint8_t  ImmLimit = Entry.Imm1Limit;
+  switch (Entry.WideOpc) {
+  default:
+    llvm_unreachable("Unexpected Thumb2 load / store opcode!");
+  case ARM::t2LDRi12:
+  case ARM::t2STRi12: {
+    unsigned BaseReg = MI->getOperand(1).getReg();
+    if (BaseReg == ARM::SP) {
+      // SP-relative word accesses use the NarrowOpc2 (sp-based) form, which
+      // has no offset register.
+      Opc = Entry.NarrowOpc2;
+      ImmLimit = Entry.Imm2Limit;
+      HasOffReg = false;
+    }
+    Scale = 4;
+    HasImmOffset = true;
+    break;
+  }
+  case ARM::t2LDRBi12:
+  case ARM::t2STRBi12:
+    HasImmOffset = true;
+    break;
+  case ARM::t2LDRHi12:
+  case ARM::t2STRHi12:
+    Scale = 2;
+    HasImmOffset = true;
+    break;
+  case ARM::t2LDRs:
+  case ARM::t2LDRBs:
+  case ARM::t2LDRHs:
+  case ARM::t2LDRSBs:
+  case ARM::t2LDRSHs:
+  case ARM::t2STRs:
+  case ARM::t2STRBs:
+  case ARM::t2STRHs:
+    HasShift = true;
+    OpNum = 4;
+    break;
+  case ARM::t2LDM_RET:
+  case ARM::t2LDM:
+  case ARM::t2STM: {
+    OpNum = 0;
+    unsigned BaseReg = MI->getOperand(0).getReg();
+    unsigned Mode = MI->getOperand(1).getImm();
+    if (BaseReg == ARM::SP && ARM_AM::getAM4WBFlag(Mode)) {
+      // SP with writeback becomes tPOP / tPOP_RET / tPUSH.
+      Opc = Entry.NarrowOpc2;
+      OpNum = 2;
+    } else if (Entry.WideOpc == ARM::t2LDM_RET ||
+               !isARMLowRegister(BaseReg) ||
+               !ARM_AM::getAM4WBFlag(Mode) ||
+               ARM_AM::getAM4SubMode(Mode) != ARM_AM::ia) {
+      // tLDM / tSTM require a low base register, writeback, and IA mode.
+      return false;
+    }
+    isLdStMul = true;
+    break;
+  }
+  }
+
+  unsigned OffsetReg = 0;
+  bool OffsetKill = false;
+  if (HasShift) {
+    OffsetReg  = MI->getOperand(2).getReg();
+    OffsetKill = MI->getOperand(2).isKill();
+    if (MI->getOperand(3).getImm())
+      // Thumb1 addressing mode doesn't support shift.
+      return false;
+  }
+
+  unsigned OffsetImm = 0;
+  if (HasImmOffset) {
+    OffsetImm = MI->getOperand(2).getImm();
+    unsigned MaxOffset = ((1 << ImmLimit) - 1) * Scale;
+    if ((OffsetImm & (Scale-1)) || OffsetImm > MaxOffset)
+      // Make sure the immediate field fits.
+      return false;
+  }
+
+  // Add the 16-bit load / store instruction.
+  // FIXME: Thumb1 addressing mode encode both immediate and register offset.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, TII->get(Opc));
+  if (!isLdStMul) {
+    MIB.addOperand(MI->getOperand(0)).addOperand(MI->getOperand(1));
+    if (Opc != ARM::tLDRSB && Opc != ARM::tLDRSH) {
+      // tLDRSB and tLDRSH do not have an immediate offset field. On the other
+      // hand, it must have an offset register.
+      // FIXME: Remove this special case.
+      MIB.addImm(OffsetImm/Scale);
+    }
+    assert((!HasShift || OffsetReg) && "Invalid so_reg load / store address!");
+
+    if (HasOffReg)
+      MIB.addReg(OffsetReg, getKillRegState(OffsetKill));
+  }
+
+  // Transfer the rest of operands.
+  for (unsigned e = MI->getNumOperands(); OpNum != e; ++OpNum)
+    MIB.addOperand(MI->getOperand(OpNum));
+
+  // Transfer memoperands.
+  (*MIB).setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase(MI);
+  ++NumLdSts;
+  return true;
+}
+
+/// ReduceSpecial - Handle ReduceTable entries marked Special: loads / stores
+/// go through ReduceLoadStore; the remaining opcodes have extra operand or
+/// predication constraints checked here before narrowing.
+bool
+Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR) {
+  if (Entry.LowRegs1 && !VerifyLowRegs(MI))
+    return false;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (TID.mayLoad() || TID.mayStore())
+    return ReduceLoadStore(MBB, MI, Entry);
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) {
+  default: break;
+  case ARM::t2ADDSri: 
+  case ARM::t2ADDSrr: {
+    // ADDS can only narrow when unpredicated (the 16-bit forms always set
+    // CPSR).
+    unsigned PredReg = 0;
+    if (getInstrPredicate(MI, PredReg) == ARMCC::AL) {
+      switch (Opc) {
+      default: break;
+      case ARM::t2ADDSri: {
+        // Prefer the two-address tADDi8 form; fall through to tADDi3 if the
+        // destination and first source registers differ.
+        if (ReduceTo2Addr(MBB, MI, Entry, LiveCPSR))
+          return true;
+        // fallthrough
+      }
+      case ARM::t2ADDSrr:
+        return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+      }
+    }
+    break;
+  }
+  case ARM::t2RSBri:
+  case ARM::t2RSBSri:
+    // tRSB is "negate": only a reversed subtract from zero can narrow.
+    if (MI->getOperand(2).getImm() == 0)
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+    break;
+  case ARM::t2MOVi16:
+    // Can convert only 'pure' immediate operands, not immediates obtained as
+    // globals' addresses.
+    if (MI->getOperand(1).isImm())
+      return ReduceToNarrow(MBB, MI, Entry, LiveCPSR);
+    break;
+  }
+  return false;
+}
+
+/// ReduceTo2Addr - Try to replace MI with the 16-bit two-address form
+/// (Entry.NarrowOpc2). Requires the destination register to equal the first
+/// source register. Returns true (and erases MI) on success.
+bool
+Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
+                                const ReduceEntry &Entry,
+                                bool LiveCPSR) {
+
+  if (ReduceLimit2Addr != -1 && ((int)Num2Addrs >= ReduceLimit2Addr))
+    return false;
+
+  const TargetInstrDesc &TID = MI->getDesc();
+  unsigned Reg0 = MI->getOperand(0).getReg();
+  unsigned Reg1 = MI->getOperand(1).getReg();
+  // Two-address form: destination must be the same as the first source.
+  if (Reg0 != Reg1)
+    return false;
+  if (Entry.LowRegs2 && !isARMLowRegister(Reg0))
+    return false;
+  if (Entry.Imm2Limit) {
+    // Immediate variant: the value must fit in Imm2Limit bits.
+    unsigned Imm = MI->getOperand(2).getImm();
+    unsigned Limit = (1 << Entry.Imm2Limit) - 1;
+    if (Imm > Limit)
+      return false;
+  } else {
+    // Register variant: the second source must also be a low register.
+    unsigned Reg2 = MI->getOperand(2).getReg();
+    if (Entry.LowRegs2 && !isARMLowRegister(Reg2))
+      return false;
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc2);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewTID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewTID.isPredicable();
+  }
+
+  // Determine whether MI has an explicit (optional) CPSR def and whether
+  // that def is dead.
+  bool HasCC = false;
+  bool CCDead = false;
+  if (TID.hasOptionalDef()) {
+    unsigned NumOps = TID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, true, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewTID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = TID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+      continue;
+    if (SkipPred && TID.OpInfo[i].isPredicate())
+      continue;
+    MIB.addOperand(MI->getOperand(i));
+  }
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase(MI);
+  ++Num2Addrs;
+  return true;
+}
+
+/// ReduceToNarrow - Try to replace MI with the 16-bit non-two-address form
+/// (Entry.NarrowOpc1), checking register classes, immediate range and
+/// predication. Returns true (and erases MI) on success.
+bool
+Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
+                                 const ReduceEntry &Entry,
+                                 bool LiveCPSR) {
+  if (ReduceLimit != -1 && ((int)NumNarrows >= ReduceLimit))
+    return false;
+
+  // Immediate operands must fit in Imm1Limit bits, scaled by 4 for
+  // t2ADDrSPi (see the table note).
+  unsigned Limit = ~0U;
+  unsigned Scale = (Entry.WideOpc == ARM::t2ADDrSPi) ? 4 : 1;
+  if (Entry.Imm1Limit)
+    Limit = ((1 << Entry.Imm1Limit) - 1) * Scale;
+
+  // Verify every operand is encodable in the narrow form.
+  const TargetInstrDesc &TID = MI->getDesc();
+  for (unsigned i = 0, e = TID.getNumOperands(); i != e; ++i) {
+    if (TID.OpInfo[i].isPredicate())
+      continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg()) {
+      unsigned Reg = MO.getReg();
+      if (!Reg || Reg == ARM::CPSR)
+        continue;
+      if (Entry.WideOpc == ARM::t2ADDrSPi && Reg == ARM::SP)
+        continue;
+      if (Entry.LowRegs1 && !isARMLowRegister(Reg))
+        return false;
+    } else if (MO.isImm() &&
+               !TID.OpInfo[i].isPredicate()) {
+      if (((unsigned)MO.getImm()) > Limit || (MO.getImm() & (Scale-1)) != 0)
+        return false;
+    }
+  }
+
+  // Check if it's possible / necessary to transfer the predicate.
+  const TargetInstrDesc &NewTID = TII->get(Entry.NarrowOpc1);
+  unsigned PredReg = 0;
+  ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
+  bool SkipPred = false;
+  if (Pred != ARMCC::AL) {
+    if (!NewTID.isPredicable())
+      // Can't transfer predicate, fail.
+      return false;
+  } else {
+    SkipPred = !NewTID.isPredicable();
+  }
+
+  // Determine whether MI has an explicit (optional) CPSR def and whether
+  // that def is dead.
+  bool HasCC = false;
+  bool CCDead = false;
+  if (TID.hasOptionalDef()) {
+    unsigned NumOps = TID.getNumOperands();
+    HasCC = (MI->getOperand(NumOps-1).getReg() == ARM::CPSR);
+    if (HasCC && MI->getOperand(NumOps-1).isDead())
+      CCDead = true;
+  }
+  if (!VerifyPredAndCC(MI, Entry, false, Pred, LiveCPSR, HasCC, CCDead))
+    return false;
+
+  // Add the 16-bit instruction.
+  DebugLoc dl = MI->getDebugLoc();
+  MachineInstrBuilder MIB = BuildMI(MBB, *MI, dl, NewTID);
+  MIB.addOperand(MI->getOperand(0));
+  if (NewTID.hasOptionalDef()) {
+    if (HasCC)
+      AddDefaultT1CC(MIB, CCDead);
+    else
+      AddNoT1CC(MIB);
+  }
+
+  // Transfer the rest of operands.
+  unsigned NumOps = TID.getNumOperands();
+  for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
+    if (i < NumOps && TID.OpInfo[i].isOptionalDef())
+      continue;
+    if ((TID.getOpcode() == ARM::t2RSBSri ||
+         TID.getOpcode() == ARM::t2RSBri) && i == 2)
+      // Skip the zero immediate operand, it's now implicit.
+      continue;
+    bool isPred = (i < NumOps && TID.OpInfo[i].isPredicate());
+    if (SkipPred && isPred)
+        continue;
+    const MachineOperand &MO = MI->getOperand(i);
+    // Scaled immediates (t2ADDrSPi) are divided back down for the narrow
+    // encoding.
+    if (Scale > 1 && !isPred && MO.isImm())
+      MIB.addImm(MO.getImm() / Scale);
+    else {
+      if (MO.isReg() && MO.isImplicit() && MO.getReg() == ARM::CPSR)
+        // Skip implicit def of CPSR. Either it's modeled as an optional
+        // def now or it's already an implicit def on the new instruction.
+        continue;
+      MIB.addOperand(MO);
+    }
+  }
+  // If the wide opcode was unpredicable (e.g. compare-style opcodes) but the
+  // narrow one is predicable, give it the default (AL) predicate.
+  if (!TID.isPredicable() && NewTID.isPredicable())
+    AddDefaultPred(MIB);
+
+  DEBUG(errs() << "Converted 32-bit: " << *MI << "       to 16-bit: " << *MIB);
+
+  MBB.erase(MI);
+  ++NumNarrows;
+  return true;
+}
+
+/// UpdateCPSRDef - Return the CPSR liveness state after MI executes: CPSR is
+/// live if MI has a non-dead CPSR def, otherwise liveness is unchanged.
+static bool UpdateCPSRDef(MachineInstr &MI, bool LiveCPSR) {
+  bool DefinesLiveCPSR = false;
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (MO.isReg() && MO.isDef() && !MO.isUndef() &&
+        MO.getReg() == ARM::CPSR && !MO.isDead())
+      DefinesLiveCPSR = true;
+  }
+  return DefinesLiveCPSR || LiveCPSR;
+}
+
+/// UpdateCPSRUse - Return the CPSR liveness state after accounting for MI's
+/// CPSR uses: a kill of CPSR ends its live range.
+static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) {
+  for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    if (!MO.isReg() || MO.isUndef() || MO.isDef() ||
+        MO.getReg() != ARM::CPSR)
+      continue;
+    // A CPSR use can only appear while CPSR is being tracked as live.
+    assert(LiveCPSR && "CPSR liveness tracking is wrong!");
+    if (MO.isKill())
+      return false;
+  }
+
+  return LiveCPSR;
+}
+
+/// ReduceMBB - Walk MBB, tracking CPSR liveness, and attempt to narrow each
+/// instruction that has a ReduceTable entry. Returns true if any instruction
+/// was replaced.
+bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
+  bool Modified = false;
+
+  bool LiveCPSR = false;
+  // Yes, CPSR could be livein.
+  for (MachineBasicBlock::const_livein_iterator I = MBB.livein_begin(),
+         E = MBB.livein_end(); I != E; ++I) {
+    if (*I == ARM::CPSR) {
+      LiveCPSR = true;
+      break;
+    }
+  }
+
+  MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
+  MachineBasicBlock::iterator NextMII;
+  for (; MII != E; MII = NextMII) {
+    // NextMII is captured up front so a successful reduction (which erases
+    // MII's instruction) does not invalidate the iteration.
+    NextMII = llvm::next(MII);
+
+    MachineInstr *MI = &*MII;
+    LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR);
+
+    unsigned Opcode = MI->getOpcode();
+    DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode);
+    if (OPI != ReduceOpcodeMap.end()) {
+      const ReduceEntry &Entry = ReduceTable[OPI->second];
+      // Ignore "special" cases for now.
+      if (Entry.Special) {
+        if (ReduceSpecial(MBB, MI, Entry, LiveCPSR)) {
+          Modified = true;
+          // The old MI was erased; prior(NextMII) is the new 16-bit one.
+          MachineBasicBlock::iterator I = prior(NextMII);
+          MI = &*I;
+        }
+        goto ProcessNext;
+      }
+
+      // Try to transform to a 16-bit two-address instruction.
+      if (Entry.NarrowOpc2 && ReduceTo2Addr(MBB, MI, Entry, LiveCPSR)) {
+        Modified = true;
+        MachineBasicBlock::iterator I = prior(NextMII);
+        MI = &*I;
+        goto ProcessNext;
+      }
+
+      // Try to transform to a 16-bit non-two-address instruction.
+      if (Entry.NarrowOpc1 && ReduceToNarrow(MBB, MI, Entry, LiveCPSR)) {
+        Modified = true;
+        MachineBasicBlock::iterator I = prior(NextMII);
+        MI = &*I;
+      }
+    }
+
+  ProcessNext:
+    // Update CPSR liveness with the (possibly new) instruction's defs.
+    LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR);
+  }
+
+  return Modified;
+}
+
+bool Thumb2SizeReduce::runOnMachineFunction(MachineFunction &MF) {
+  // Cache the instruction info, then reduce each block independently.
+  TII = static_cast<const Thumb2InstrInfo*>(MF.getTarget().getInstrInfo());
+
+  bool Modified = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+    Modified |= ReduceMBB(*I);
+  return Modified;
+}
+
+/// createThumb2SizeReductionPass - Returns an instance of the Thumb2 size
+/// reduction pass, which rewrites 32-bit Thumb2 instructions into equivalent
+/// 16-bit ones where legal.
+FunctionPass *llvm::createThumb2SizeReductionPass() {
+  return new Thumb2SizeReduce();
+}
diff --git a/lib/Target/Alpha/Alpha.h b/lib/Target/Alpha/Alpha.h
new file mode 100644
index 0000000..5cf4866
--- /dev/null
+++ b/lib/Target/Alpha/Alpha.h
@@ -0,0 +1,46 @@
+//===-- Alpha.h - Top-level interface for Alpha representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Alpha back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_ALPHA_H
+#define TARGET_ALPHA_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  // Forward declarations for the types referenced below. JITCodeEmitter and
+  // Target were previously picked up only via transitive includes; declaring
+  // them here keeps the header self-contained.
+  class AlphaTargetMachine;
+  class FunctionPass;
+  class JITCodeEmitter;
+  class Target;
+  class formatted_raw_ostream;
+
+  // Factory functions for the passes that make up the Alpha code generator.
+  FunctionPass *createAlphaISelDag(AlphaTargetMachine &TM);
+  FunctionPass *createAlphaPatternInstructionSelector(TargetMachine &TM);
+  FunctionPass *createAlphaJITCodeEmitterPass(AlphaTargetMachine &TM,
+                                              JITCodeEmitter &JCE);
+  FunctionPass *createAlphaLLRPPass(AlphaTargetMachine &tm);
+  FunctionPass *createAlphaBranchSelectionPass();
+
+  // The singleton Target object registered for Alpha.
+  extern Target TheAlphaTarget;
+
+} // end namespace llvm;
+
+// Defines symbolic names for Alpha registers.  This defines a mapping from
+// register name to register number.
+//
+#include "AlphaGenRegisterNames.inc"
+
+// Defines symbolic names for the Alpha instructions.
+//
+#include "AlphaGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Alpha/Alpha.td b/lib/Target/Alpha/Alpha.td
new file mode 100644
index 0000000..6efdf55
--- /dev/null
+++ b/lib/Target/Alpha/Alpha.td
@@ -0,0 +1,72 @@
+//===- Alpha.td - Describe the Alpha Target Machine --------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//Alpha is little endian
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features
+//===----------------------------------------------------------------------===//
+
+// CIX: count/bit-manipulation instruction extension (available on ev67).
+// Fixed typo in the user-visible description: "extentions" -> "extensions".
+def FeatureCIX : SubtargetFeature<"cix", "HasCT", "true",
+                                  "Enable CIX extensions">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Schedule Description
+//===----------------------------------------------------------------------===//
+
+include "AlphaSchedule.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "AlphaInstrInfo.td"
+
+def AlphaInstrInfo : InstrInfo {
+  // Define how we want to layout our target-specific information field.
+ // let TSFlagsFields = [];
+ // let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// Alpha Processor Definitions
+//===----------------------------------------------------------------------===//
+
+// All processor models share the 21264 itineraries; only ev67 enables the
+// CIX feature.
+def : Processor<"generic", Alpha21264Itineraries, []>;
+def : Processor<"ev6"    , Alpha21264Itineraries, []>;
+def : Processor<"ev67"   , Alpha21264Itineraries, [FeatureCIX]>;
+
+//===----------------------------------------------------------------------===//
+// The Alpha Target
+//===----------------------------------------------------------------------===//
+
+
+def Alpha : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = AlphaInstrInfo;
+}
diff --git a/lib/Target/Alpha/AlphaBranchSelector.cpp b/lib/Target/Alpha/AlphaBranchSelector.cpp
new file mode 100644
index 0000000..001656e
--- /dev/null
+++ b/lib/Target/Alpha/AlphaBranchSelector.cpp
@@ -0,0 +1,66 @@
+//===-- AlphaBranchSelector.cpp - Convert Pseudo branches ---------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Replace Pseudo COND_BRANCH_* with their appropriate real branch
+// Simplified version of the PPC Branch Selector
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/MC/MCAsmInfo.h"
+using namespace llvm;
+
+namespace {
+  /// AlphaBSel - Machine function pass that rewrites the COND_BRANCH_I /
+  /// COND_BRANCH_F pseudo instructions into the real branch opcode carried
+  /// in their first operand.
+  struct AlphaBSel : public MachineFunctionPass {
+    static char ID;
+    AlphaBSel() : MachineFunctionPass(&ID) {}
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "Alpha Branch Selection";
+    }
+  };
+  char AlphaBSel::ID = 0;
+}
+
+/// createAlphaBranchSelectionPass - returns an instance of the Branch
+/// Selection Pass, which replaces the COND_BRANCH_* pseudo instructions with
+/// real branches.
+FunctionPass *llvm::createAlphaBranchSelectionPass() {
+  return new AlphaBSel();
+}
+
+/// runOnMachineFunction - Replace each COND_BRANCH_I / COND_BRANCH_F pseudo
+/// with the real branch opcode stored in its first operand. Returns true iff
+/// at least one instruction was rewritten (the original unconditionally
+/// returned true, over-reporting modification to the pass manager). The
+/// TargetInstrInfo lookup is hoisted out of the loops since it is
+/// loop-invariant.
+bool AlphaBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  bool Changed = false;
+
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI) {
+      if (MBBI->getOpcode() == Alpha::COND_BRANCH_I ||
+          MBBI->getOpcode() == Alpha::COND_BRANCH_F) {
+
+        // condbranch operands:
+        // 0. bc opcode
+        // 1. reg
+        // 2. target MBB
+        MBBI->setDesc(TII->get(MBBI->getOperand(0).getImm()));
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+
diff --git a/lib/Target/Alpha/AlphaCallingConv.td b/lib/Target/Alpha/AlphaCallingConv.td
new file mode 100644
index 0000000..38ada69
--- /dev/null
+++ b/lib/Target/Alpha/AlphaCallingConv.td
@@ -0,0 +1,37 @@
+//===- AlphaCallingConv.td - Calling Conventions for Alpha -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for Alpha architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Alpha Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_Alpha : CallingConv<[
+  // i64 is returned in register R0
+  CCIfType<[i64], CCAssignToReg<[R0]>>,
+
+  // f32 / f64 are returned in F0/F1 (first FP result in F0, second in F1)
+  CCIfType<[f32, f64], CCAssignToReg<[F0, F1]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Alpha Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_Alpha : CallingConv<[
+  // The first 6 arguments are passed in registers, whether integer or
+  // floating-point.  Each argument slot shadows the corresponding register
+  // in the other bank, so slot N consumes both RN+16 and FN+16 regardless
+  // of the argument's class.
+  CCIfType<[i64], CCAssignToRegWithShadow<[R16, R17, R18, R19, R20, R21],
+                                          [F16, F17, F18, F19, F20, F21]>>,
+
+  CCIfType<[f32, f64], CCAssignToRegWithShadow<[F16, F17, F18, F19, F20, F21],
+                                               [R16, R17, R18, R19, R20, R21]>>,
+
+  // Stack slots are 8 bytes in size and 8-byte aligned.
+  CCIfType<[i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
diff --git a/lib/Target/Alpha/AlphaCodeEmitter.cpp b/lib/Target/Alpha/AlphaCodeEmitter.cpp
new file mode 100644
index 0000000..eb5e429
--- /dev/null
+++ b/lib/Target/Alpha/AlphaCodeEmitter.cpp
@@ -0,0 +1,219 @@
+//===-- Alpha/AlphaCodeEmitter.cpp - Convert Alpha code to machine code ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the Alpha machine instructions
+// into relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "alpha-emitter"
+#include "AlphaTargetMachine.h"
+#include "AlphaRelocations.h"
+#include "Alpha.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  // AlphaCodeEmitter - Machine function pass that walks every instruction in
+  // the function and emits its binary encoding (plus any needed relocations)
+  // through the JITCodeEmitter.
+  class AlphaCodeEmitter : public MachineFunctionPass {
+    JITCodeEmitter &MCE;          // Destination for the emitted machine code.
+    const AlphaInstrInfo *II;     // Set per-function in runOnMachineFunction.
+  public:
+    static char ID;
+
+    AlphaCodeEmitter(JITCodeEmitter &mce) : MachineFunctionPass(&ID),
+    MCE(mce) {}
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+
+    unsigned getBinaryCodeForInstr(const MachineInstr &MI);
+
+    /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO);
+    
+    bool runOnMachineFunction(MachineFunction &MF);
+    
+    virtual const char *getPassName() const {
+      return "Alpha Machine Code Emitter";
+    }
+    
+  private:
+    // emitBasicBlock - Emit every (non-pseudo) instruction in MBB.
+    void emitBasicBlock(MachineBasicBlock &MBB);
+  };
+}
+
+char AlphaCodeEmitter::ID = 0;
+
+
+/// createAlphaJITCodeEmitterPass - Return a pass that emits the collected
+/// Alpha code to the specified JCE object.
+
+FunctionPass *llvm::createAlphaJITCodeEmitterPass(AlphaTargetMachine &TM,
+                                                  JITCodeEmitter &JCE) {
+  return new AlphaCodeEmitter(JCE);
+}
+
+bool AlphaCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  II = ((AlphaTargetMachine&)MF.getTarget()).getInstrInfo();
+
+  // Re-emit the whole function as long as finishFunction asks for a retry
+  // (e.g. the emitter's buffer was too small).
+  do {
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I)
+      emitBasicBlock(*I);
+  } while (MCE.finishFunction(MF));
+
+  // Machine code emission never mutates the MachineFunction itself.
+  return false;
+}
+
+void AlphaCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
+  MCE.StartMachineBasicBlock(&MBB);
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+       I != E; ++I) {
+    const MachineInstr &MI = *I;
+    MCE.processDebugLoc(MI.getDebugLoc(), true);
+    switch(MI.getOpcode()) {
+    default:
+      // Every real Alpha instruction is one little-endian word.
+      MCE.emitWordLE(getBinaryCodeForInstr(*I));
+      break;
+    case Alpha::ALTENT:
+    case Alpha::PCLABEL:
+    case Alpha::MEMLABEL:
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      break; //skip these pseudo-instructions; they produce no machine code
+    }
+    MCE.processDebugLoc(MI.getDebugLoc(), false);
+  }
+}
+
+// getAlphaRegNumber - Map an Alpha register enum value to its 5-bit hardware
+// register number.  Integer register Rn and FP register Fn share number n.
+static unsigned getAlphaRegNumber(unsigned Reg) {
+  switch (Reg) {
+  case Alpha::R0  : case Alpha::F0  : return 0;
+  case Alpha::R1  : case Alpha::F1  : return 1;
+  case Alpha::R2  : case Alpha::F2  : return 2;
+  case Alpha::R3  : case Alpha::F3  : return 3;
+  case Alpha::R4  : case Alpha::F4  : return 4;
+  case Alpha::R5  : case Alpha::F5  : return 5;
+  case Alpha::R6  : case Alpha::F6  : return 6;
+  case Alpha::R7  : case Alpha::F7  : return 7;
+  case Alpha::R8  : case Alpha::F8  : return 8;
+  case Alpha::R9  : case Alpha::F9  : return 9;
+  case Alpha::R10 : case Alpha::F10 : return 10;
+  case Alpha::R11 : case Alpha::F11 : return 11;
+  case Alpha::R12 : case Alpha::F12 : return 12;
+  case Alpha::R13 : case Alpha::F13 : return 13;
+  case Alpha::R14 : case Alpha::F14 : return 14;
+  case Alpha::R15 : case Alpha::F15 : return 15;
+  case Alpha::R16 : case Alpha::F16 : return 16;
+  case Alpha::R17 : case Alpha::F17 : return 17;
+  case Alpha::R18 : case Alpha::F18 : return 18;
+  case Alpha::R19 : case Alpha::F19 : return 19;
+  case Alpha::R20 : case Alpha::F20 : return 20;
+  case Alpha::R21 : case Alpha::F21 : return 21;
+  case Alpha::R22 : case Alpha::F22 : return 22;
+  case Alpha::R23 : case Alpha::F23 : return 23;
+  case Alpha::R24 : case Alpha::F24 : return 24;
+  case Alpha::R25 : case Alpha::F25 : return 25;
+  case Alpha::R26 : case Alpha::F26 : return 26;
+  case Alpha::R27 : case Alpha::F27 : return 27;
+  case Alpha::R28 : case Alpha::F28 : return 28;
+  case Alpha::R29 : case Alpha::F29 : return 29;
+  case Alpha::R30 : case Alpha::F30 : return 30;
+  case Alpha::R31 : case Alpha::F31 : return 31;
+  default:
+    llvm_unreachable("Unhandled reg");
+  }
+}
+
+unsigned AlphaCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                             const MachineOperand &MO) {
+
+  unsigned rv = 0; // Return value; defaults to 0 for unhandled cases
+                   // or things that get fixed up later by the JIT.
+
+  if (MO.isReg()) {
+    rv = getAlphaRegNumber(MO.getReg());
+  } else if (MO.isImm()) {
+    rv = MO.getImm();
+  } else if (MO.isGlobal() || MO.isSymbol() || MO.isCPI()) {
+    // Address-like operands: encode 0 now and record a relocation so the
+    // JIT can patch in the real address later.
+    DEBUG(errs() << MO << " is a relocated op for " << MI << "\n");
+    unsigned Reloc = 0;
+    int Offset = 0;
+    bool useGOT = false;
+    // Choose the relocation type from the opcode: calls use reloc_bsr,
+    // GP-relative memory ops use gprellow/gprelhigh, GOT loads use
+    // reloc_literal, and LDAg/LDAHg use reloc_gpdist.
+    switch (MI.getOpcode()) {
+    case Alpha::BSR:
+      Reloc = Alpha::reloc_bsr;
+      break;
+    case Alpha::LDLr:
+    case Alpha::LDQr:
+    case Alpha::LDBUr:
+    case Alpha::LDWUr:
+    case Alpha::LDSr:
+    case Alpha::LDTr:
+    case Alpha::LDAr:
+    case Alpha::STQr:
+    case Alpha::STLr:
+    case Alpha::STWr:
+    case Alpha::STBr:
+    case Alpha::STSr:
+    case Alpha::STTr:
+      Reloc = Alpha::reloc_gprellow;
+      break;
+    case Alpha::LDAHr:
+      Reloc = Alpha::reloc_gprelhigh;
+      break;
+    case Alpha::LDQl:
+      Reloc = Alpha::reloc_literal;
+      useGOT = true;
+      break;
+    case Alpha::LDAg:
+    case Alpha::LDAHg:
+      Reloc = Alpha::reloc_gpdist;
+      // Extra immediate carried in operand 3 is forwarded as the relocation
+      // offset (presumably a gpdist sequence number -- confirm against the
+      // instruction definitions).
+      Offset = MI.getOperand(3).getImm();
+      break;
+    default:
+      llvm_unreachable("unknown relocatable instruction");
+    }
+    if (MO.isGlobal())
+      MCE.addRelocation(MachineRelocation::getGV(MCE.getCurrentPCOffset(),
+                                                 Reloc, MO.getGlobal(), Offset,
+                                                 isa<Function>(MO.getGlobal()),
+                                                 useGOT));
+    else if (MO.isSymbol())
+      MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                     Reloc, MO.getSymbolName(),
+                                                     Offset, true));
+    else
+     MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getIndex(), Offset));
+  } else if (MO.isMBB()) {
+    MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                               Alpha::reloc_bsr, MO.getMBB()));
+  } else {
+#ifndef NDEBUG
+    errs() << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+
+  return rv;
+}
+
+#include "AlphaGenCodeEmitter.inc"
diff --git a/lib/Target/Alpha/AlphaISelDAGToDAG.cpp b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
new file mode 100644
index 0000000..eaefef9
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelDAGToDAG.cpp
@@ -0,0 +1,453 @@
+//===-- AlphaISelDAGToDAG.cpp - Alpha pattern matching inst selector ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for Alpha,
+// converting from a legalized dag to a Alpha dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaTargetMachine.h"
+#include "AlphaISelLowering.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+
+  //===--------------------------------------------------------------------===//
+  /// AlphaDAGToDAGISel - Alpha specific code to select Alpha machine
+  /// instructions for SelectionDAG operations.
+  class AlphaDAGToDAGISel : public SelectionDAGISel {
+    static const int64_t IMM_LOW  = -32768;
+    static const int64_t IMM_HIGH = 32767;
+    static const int64_t IMM_MULT = 65536;
+    static const int64_t IMM_FULLHIGH = IMM_HIGH + IMM_HIGH * IMM_MULT;
+    static const int64_t IMM_FULLLOW = IMM_LOW + IMM_LOW  * IMM_MULT;
+
+    // get_ldah16 - High 16-bit chunk for materializing x with an LDAH/LDA
+    // pair, compensating for the sign-extension of the low chunk.
+    static int64_t get_ldah16(int64_t x) {
+      int64_t y = x / IMM_MULT;
+      if (x % IMM_MULT > IMM_HIGH)
+        ++y;
+      return y;
+    }
+
+    // get_lda16 - Matching low 16-bit chunk for get_ldah16.
+    static int64_t get_lda16(int64_t x) {
+      return x - get_ldah16(x) * IMM_MULT;
+    }
+
+    /// get_zapImm - Return a zap mask if X is a valid immediate for a zapnot
+    /// instruction (if not, return 0).  Note that this code accepts partial
+    /// zap masks.  For example (and LHS, 1) is a valid zap, as long we know
+    /// that the bits 1-7 of LHS are already zero.  If LHS is non-null, we are
+    /// in checking mode.  If LHS is null, we assume that the mask has already
+    /// been validated before.
+    uint64_t get_zapImm(SDValue LHS, uint64_t Constant) {
+      uint64_t BitsToCheck = 0;
+      unsigned Result = 0;
+      for (unsigned i = 0; i != 8; ++i) {
+        if (((Constant >> 8*i) & 0xFF) == 0) {
+          // nothing to do.
+        } else {
+          Result |= 1 << i;
+          if (((Constant >> 8*i) & 0xFF) == 0xFF) {
+            // If the entire byte is set, zapnot the byte.
+          } else if (LHS.getNode() == 0) {
+            // Otherwise, if the mask was previously validated, we know it's
+            // okay to zapnot this entire byte even though all the bits aren't
+            // set.
+          } else {
+            // Otherwise we don't know that it's okay to zapnot this entire
+            // byte.  Only do this iff we can prove that the missing bits are
+            // already null, so the bytezap doesn't need to really null them.
+            // Use 0xFFULL: shifting the int literal 0xFF by up to 56 bits
+            // would be undefined behavior.
+            BitsToCheck |= ~Constant & (0xFFULL << 8*i);
+          }
+        }
+      }
+
+      // If there are missing bits in a byte (for example, X & 0xEF00), check
+      // to see if the missing bits (0x1000) are already known zero.  If not,
+      // the zap isn't okay to do, as it won't clear all the required bits.
+      if (BitsToCheck &&
+          !CurDAG->MaskedValueIsZero(LHS,
+                                     APInt(LHS.getValueSizeInBits(),
+                                           BitsToCheck)))
+        return 0;
+
+      return Result;
+    }
+
+    // get_zapImm - Unchecked variant: return the zapnot byte mask for x, or
+    // 0 if some byte of x is only partially set.
+    static uint64_t get_zapImm(uint64_t x) {
+      unsigned build = 0;
+      for(int i = 0; i != 8; ++i) {
+        if ((x & 0x00FF) == 0x00FF)
+          build |= 1 << i;
+        else if ((x & 0x00FF) != 0)
+          return 0;
+        x >>= 8;
+      }
+      return build;
+    }
+
+    // getNearPower2 - Return the power of two nearest to x.
+    static uint64_t getNearPower2(uint64_t x) {
+      if (!x) return 0;
+      unsigned at = CountLeadingZeros_64(x);
+      // Shift in 64 bits: "1 << (63 - at)" would overflow a 32-bit int for
+      // small 'at'.
+      uint64_t complow = 1ULL << (63 - at);
+      // comphigh is 2^(64-at).  When at == 0 this wraps to 0, but the
+      // unsigned subtraction below still yields the correct distance to 2^64.
+      uint64_t comphigh = complow << 1;
+      //cerr << x << ":" << complow << ":" << comphigh << "\n";
+      if (abs64(complow - x) <= abs64(comphigh - x))
+        return complow;
+      else
+        return comphigh;
+    }
+
+    // chkRemNearPower2 - Return true if the distance between x and its
+    // nearest power of two is exactly r (in the direction selected by swap).
+    static bool chkRemNearPower2(uint64_t x, uint64_t r, bool swap) {
+      uint64_t y = getNearPower2(x);
+      if (swap)
+        return (y - x) == r;
+      else
+        return (x - y) == r;
+    }
+
+    // isFPZ/isFPZn/isFPZp - Match FP constant zero / -0.0 / +0.0.
+    static bool isFPZ(SDValue N) {
+      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+      return (CN && (CN->getValueAPF().isZero()));
+    }
+    static bool isFPZn(SDValue N) {
+      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+      return (CN && CN->getValueAPF().isNegZero());
+    }
+    static bool isFPZp(SDValue N) {
+      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
+      return (CN && CN->getValueAPF().isPosZero());
+    }
+
+  public:
+    explicit AlphaDAGToDAGISel(AlphaTargetMachine &TM)
+      : SelectionDAGISel(TM)
+    {}
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(int64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDNode *N);
+
+    /// InstructionSelect - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelect();
+
+    virtual const char *getPassName() const {
+      return "Alpha DAG->DAG Pattern Instruction Selection";
+    }
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDValue> &OutOps) {
+      SDValue Op0;
+      switch (ConstraintCode) {
+      default: return true;
+      case 'm':   // memory
+        Op0 = Op;
+        break;
+      }
+
+      OutOps.push_back(Op0);
+      return false;
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "AlphaGenDAGISel.inc"
+
+private:
+    /// getTargetMachine - Return a reference to the TargetMachine, casted
+    /// to the target-specific type.
+    const AlphaTargetMachine &getTargetMachine() {
+      return static_cast<const AlphaTargetMachine &>(TM);
+    }
+
+    /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+    /// to the target-specific type.
+    const AlphaInstrInfo *getInstrInfo() {
+      return getTargetMachine().getInstrInfo();
+    }
+
+    SDNode *getGlobalBaseReg();
+    SDNode *getGlobalRetAddr();
+    void SelectCALL(SDNode *Op);
+
+  };
+}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.  Returns a register node for the
+/// function's global base register.
+///
+SDNode *AlphaDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+/// getGlobalRetAddr - Grab the return address.  Returns a register node for
+/// the function's global return-address register.
+///
+SDNode *AlphaDAGToDAGISel::getGlobalRetAddr() {
+  unsigned GlobalRetAddr = getInstrInfo()->getGlobalRetAddr(MF);
+  return CurDAG->getRegister(GlobalRetAddr, TLI.getPointerTy()).getNode();
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void AlphaDAGToDAGISel::InstructionSelect() {
+  // Select target instructions for the DAG.
+  SelectRoot(*CurDAG);
+  // Selection may orphan nodes; drop anything no longer reachable.
+  CurDAG->RemoveDeadNodes();
+}
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.  Returns the
+// replacement node, or NULL when the node was handled in place (or was
+// already selected).  Anything not special-cased here falls through to the
+// TableGen-generated SelectCode.
+SDNode *AlphaDAGToDAGISel::Select(SDNode *N) {
+  if (N->isMachineOpcode()) {
+    return NULL;   // Already selected.
+  }
+  DebugLoc dl = N->getDebugLoc();
+
+  switch (N->getOpcode()) {
+  default: break;
+  case AlphaISD::CALL:
+    SelectCALL(N);
+    return NULL;
+
+  case ISD::FrameIndex: {
+    // Frame indices become an LDA of the frame slot with offset 0.
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    return CurDAG->SelectNodeTo(N, Alpha::LDA, MVT::i64,
+                                CurDAG->getTargetFrameIndex(FI, MVT::i32),
+                                getI64Imm(0));
+  }
+  case ISD::GLOBAL_OFFSET_TABLE:
+    return getGlobalBaseReg();
+  case AlphaISD::GlobalRetAddr:
+    return getGlobalRetAddr();
+  
+  case AlphaISD::DivCall: {
+    // Division is done via a runtime call: operands in R24/R25, callee
+    // address in R27, result copied back out of R27.
+    SDValue Chain = CurDAG->getEntryNode();
+    SDValue N0 = N->getOperand(0);
+    SDValue N1 = N->getOperand(1);
+    SDValue N2 = N->getOperand(2);
+    Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R24, N1, 
+                                 SDValue(0,0));
+    Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R25, N2, 
+                                 Chain.getValue(1));
+    Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R27, N0, 
+                                 Chain.getValue(1));
+    SDNode *CNode =
+      CurDAG->getMachineNode(Alpha::JSRs, dl, MVT::Other, MVT::Flag, 
+                             Chain, Chain.getValue(1));
+    Chain = CurDAG->getCopyFromReg(Chain, dl, Alpha::R27, MVT::i64, 
+                                   SDValue(CNode, 1));
+    return CurDAG->SelectNodeTo(N, Alpha::BISr, MVT::i64, Chain, Chain);
+  }
+
+  case ISD::READCYCLECOUNTER: {
+    SDValue Chain = N->getOperand(0);
+    return CurDAG->getMachineNode(Alpha::RPCC, dl, MVT::i64, MVT::Other,
+                                  Chain);
+  }
+
+  case ISD::Constant: {
+    uint64_t uval = cast<ConstantSDNode>(N)->getZExtValue();
+    
+    // Zero is just a copy from the always-zero register R31.
+    if (uval == 0) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                Alpha::R31, MVT::i64);
+      ReplaceUses(SDValue(N, 0), Result);
+      return NULL;
+    }
+
+    int64_t val = (int64_t)uval;
+    int32_t val32 = (int32_t)val;
+    if (val <= IMM_HIGH + IMM_HIGH * IMM_MULT &&
+        val >= IMM_LOW  + IMM_LOW  * IMM_MULT)
+      break; //(LDAH (LDA))
+    if ((uval >> 32) == 0 && //empty upper bits
+        val32 <= IMM_HIGH + IMM_HIGH * IMM_MULT)
+      // val32 >= IMM_LOW  + IMM_LOW  * IMM_MULT) //always true
+      break; //(zext (LDAH (LDA)))
+    //Else use the constant pool
+    ConstantInt *C = ConstantInt::get(
+                                Type::getInt64Ty(*CurDAG->getContext()), uval);
+    SDValue CPI = CurDAG->getTargetConstantPool(C, MVT::i64);
+    SDNode *Tmp = CurDAG->getMachineNode(Alpha::LDAHr, dl, MVT::i64, CPI,
+                                         SDValue(getGlobalBaseReg(), 0));
+    return CurDAG->SelectNodeTo(N, Alpha::LDQr, MVT::i64, MVT::Other, 
+                                CPI, SDValue(Tmp, 0), CurDAG->getEntryNode());
+  }
+  case ISD::TargetConstantFP:
+  case ISD::ConstantFP: {
+    // Only +-0.0 can be materialized directly (via copy-sign from F31);
+    // anything else is unsupported here.
+    ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+    bool isDouble = N->getValueType(0) == MVT::f64;
+    EVT T = isDouble ? MVT::f64 : MVT::f32;
+    if (CN->getValueAPF().isPosZero()) {
+      return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYST : Alpha::CPYSS,
+                                  T, CurDAG->getRegister(Alpha::F31, T),
+                                  CurDAG->getRegister(Alpha::F31, T));
+    } else if (CN->getValueAPF().isNegZero()) {
+      return CurDAG->SelectNodeTo(N, isDouble ? Alpha::CPYSNT : Alpha::CPYSNS,
+                                  T, CurDAG->getRegister(Alpha::F31, T),
+                                  CurDAG->getRegister(Alpha::F31, T));
+    } else {
+      llvm_report_error("Unhandled FP constant type");
+    }
+    break;
+  }
+
+  case ISD::SETCC:
+    // FP compares: Alpha only has CMPTEQ/CMPTLT/CMPTLE/CMPTUN, so other
+    // predicates are built by swapping operands (rev) and/or comparing the
+    // result against zero (inv).  Unordered predicates also OR in CMPTUN.
+    if (N->getOperand(0).getNode()->getValueType(0).isFloatingPoint()) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+      unsigned Opc = Alpha::WTF;
+      bool rev = false;
+      bool inv = false;
+      switch(CC) {
+      default: DEBUG(N->dump(CurDAG)); llvm_unreachable("Unknown FP comparison!");
+      case ISD::SETEQ: case ISD::SETOEQ: case ISD::SETUEQ:
+        Opc = Alpha::CMPTEQ; break;
+      case ISD::SETLT: case ISD::SETOLT: case ISD::SETULT: 
+        Opc = Alpha::CMPTLT; break;
+      case ISD::SETLE: case ISD::SETOLE: case ISD::SETULE: 
+        Opc = Alpha::CMPTLE; break;
+      case ISD::SETGT: case ISD::SETOGT: case ISD::SETUGT: 
+        Opc = Alpha::CMPTLT; rev = true; break;
+      case ISD::SETGE: case ISD::SETOGE: case ISD::SETUGE: 
+        Opc = Alpha::CMPTLE; rev = true; break;
+      case ISD::SETNE: case ISD::SETONE: case ISD::SETUNE:
+        Opc = Alpha::CMPTEQ; inv = true; break;
+      case ISD::SETO:
+        Opc = Alpha::CMPTUN; inv = true; break;
+      case ISD::SETUO:
+        Opc = Alpha::CMPTUN; break;
+      };
+      SDValue tmp1 = N->getOperand(rev?1:0);
+      SDValue tmp2 = N->getOperand(rev?0:1);
+      SDNode *cmp = CurDAG->getMachineNode(Opc, dl, MVT::f64, tmp1, tmp2);
+      if (inv) 
+        cmp = CurDAG->getMachineNode(Alpha::CMPTEQ, dl, 
+                                     MVT::f64, SDValue(cmp, 0), 
+                                     CurDAG->getRegister(Alpha::F31, MVT::f64));
+      switch(CC) {
+      case ISD::SETUEQ: case ISD::SETULT: case ISD::SETULE:
+      case ISD::SETUNE: case ISD::SETUGT: case ISD::SETUGE:
+       {
+         SDNode* cmp2 = CurDAG->getMachineNode(Alpha::CMPTUN, dl, MVT::f64,
+                                               tmp1, tmp2);
+         cmp = CurDAG->getMachineNode(Alpha::ADDT, dl, MVT::f64, 
+                                      SDValue(cmp2, 0), SDValue(cmp, 0));
+         break;
+       }
+      default: break;
+      }
+
+      // Move the FP result into an integer register and normalize it to 0/1.
+      SDNode* LD = CurDAG->getMachineNode(Alpha::FTOIT, dl,
+                                          MVT::i64, SDValue(cmp, 0));
+      return CurDAG->getMachineNode(Alpha::CMPULT, dl, MVT::i64, 
+                                    CurDAG->getRegister(Alpha::R31, MVT::i64),
+                                    SDValue(LD,0));
+    }
+    break;
+
+  case ISD::AND: {
+    // Try to turn (and (srl x, S), M) into (srl (zapnot x, zap), S).
+    ConstantSDNode* SC = NULL;
+    ConstantSDNode* MC = NULL;
+    if (N->getOperand(0).getOpcode() == ISD::SRL &&
+        (MC = dyn_cast<ConstantSDNode>(N->getOperand(1))) &&
+        (SC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1)))) {
+      uint64_t sval = SC->getZExtValue();
+      uint64_t mval = MC->getZExtValue();
+      // If the result is a zap, let the autogened stuff handle it.
+      if (get_zapImm(N->getOperand(0), mval))
+        break;
+      // given mask X, and shift S, we want to see if there is any zap in the
+      // mask if we play around with the bottom S bits
+      // NOTE(review): assumes sval != 0 here; sval == 0 would shift by 64.
+      uint64_t dontcare = (~0ULL) >> (64 - sval);
+      uint64_t mask = mval << sval;
+      
+      if (get_zapImm(mask | dontcare))
+        mask = mask | dontcare;
+      
+      if (get_zapImm(mask)) {
+        SDValue Z = 
+          SDValue(CurDAG->getMachineNode(Alpha::ZAPNOTi, dl, MVT::i64,
+                                         N->getOperand(0).getOperand(0),
+                                         getI64Imm(get_zapImm(mask))), 0);
+        return CurDAG->getMachineNode(Alpha::SRLr, dl, MVT::i64, Z, 
+                                      getI64Imm(sval));
+      }
+    }
+    break;
+  }
+
+  }
+
+  return SelectCode(N);
+}
+
+// SelectCALL - Lower AlphaISD::CALL.  GP-relative callees go through BSR
+// (with the GOT address placed in R29); everything else goes through an
+// indirect JSR with the callee address in R27.
+void AlphaDAGToDAGISel::SelectCALL(SDNode *N) {
+  //TODO: add flag stuff to prevent nondeterministic breakage!
+
+  SDValue Chain = N->getOperand(0);
+  SDValue Addr = N->getOperand(1);
+  SDValue InFlag = N->getOperand(N->getNumOperands() - 1);
+  DebugLoc dl = N->getDebugLoc();
+
+   if (Addr.getOpcode() == AlphaISD::GPRelLo) {
+     SDValue GOT = SDValue(getGlobalBaseReg(), 0);
+     Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R29, GOT, InFlag);
+     InFlag = Chain.getValue(1);
+     Chain = SDValue(CurDAG->getMachineNode(Alpha::BSR, dl, MVT::Other, 
+                                            MVT::Flag, Addr.getOperand(0),
+                                            Chain, InFlag), 0);
+   } else {
+     Chain = CurDAG->getCopyToReg(Chain, dl, Alpha::R27, Addr, InFlag);
+     InFlag = Chain.getValue(1);
+     Chain = SDValue(CurDAG->getMachineNode(Alpha::JSR, dl, MVT::Other,
+                                            MVT::Flag, Chain, InFlag), 0);
+   }
+   InFlag = Chain.getValue(1);
+
+  // Rewire the call node's chain and flag results to the new call sequence.
+  ReplaceUses(SDValue(N, 0), Chain);
+  ReplaceUses(SDValue(N, 1), InFlag);
+}
+
+
+/// createAlphaISelDag - This pass converts a legalized DAG into a 
+/// Alpha-specific DAG, ready for instruction scheduling.  The caller takes
+/// ownership of the returned pass.
+///
+FunctionPass *llvm::createAlphaISelDag(AlphaTargetMachine &TM) {
+  return new AlphaDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Alpha/AlphaISelLowering.cpp b/lib/Target/Alpha/AlphaISelLowering.cpp
new file mode 100644
index 0000000..0bbe567
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.cpp
@@ -0,0 +1,921 @@
+//===-- AlphaISelLowering.cpp - Alpha DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AlphaISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaISelLowering.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// AddLiveIn - This helper function adds the specified physical register to the
+/// MachineFunction as a live in value.  It also creates a corresponding virtual
+/// register for it, and returns that virtual register.
+static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
+                          TargetRegisterClass *RC) {
+  assert(RC->contains(PReg) && "Not the correct regclass!");
+  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+  MF.getRegInfo().addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+AlphaTargetLowering::AlphaTargetLowering(TargetMachine &TM)
+  : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+  // Set up the TargetLowering object.
+  //I am having problems with shr n i8 1
+  setShiftAmountType(MVT::i64);
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  // Alpha is a 64-bit target: i64 lives in the GPRs; f64/f32 in the FP
+  // register files.
+  addRegisterClass(MVT::i64, Alpha::GPRCRegisterClass);
+  addRegisterClass(MVT::f64, Alpha::F8RCRegisterClass);
+  addRegisterClass(MVT::f32, Alpha::F4RCRegisterClass);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8,  Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  //  setOperationAction(ISD::BRIND,        MVT::Other,   Expand);
+  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::FREM, MVT::f32, Expand);
+  setOperationAction(ISD::FREM, MVT::f64, Expand);
+
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+
+  // Bit-count operations are only available when the subtarget has the
+  // CIX extension; otherwise expand them.
+  if (!TM.getSubtarget<AlphaSubtarget>().hasCT()) {
+    setOperationAction(ISD::CTPOP    , MVT::i64  , Expand);
+    setOperationAction(ISD::CTTZ     , MVT::i64  , Expand);
+    setOperationAction(ISD::CTLZ     , MVT::i64  , Expand);
+  }
+  setOperationAction(ISD::BSWAP    , MVT::i64, Expand);
+  setOperationAction(ISD::ROTL     , MVT::i64, Expand);
+  setOperationAction(ISD::ROTR     , MVT::i64, Expand);
+
+  // Integer divide/remainder are custom lowered (to constant-divisor
+  // expansions or the __divq/__remq family of calls) in LowerOperation.
+  setOperationAction(ISD::SREM     , MVT::i64, Custom);
+  setOperationAction(ISD::UREM     , MVT::i64, Custom);
+  setOperationAction(ISD::SDIV     , MVT::i64, Custom);
+  setOperationAction(ISD::UDIV     , MVT::i64, Custom);
+
+  setOperationAction(ISD::ADDC     , MVT::i64, Expand);
+  setOperationAction(ISD::ADDE     , MVT::i64, Expand);
+  setOperationAction(ISD::SUBC     , MVT::i64, Expand);
+  setOperationAction(ISD::SUBE     , MVT::i64, Expand);
+
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Expand);
+
+  // We don't support sin/cos/sqrt/pow
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+
+  setOperationAction(ISD::SETCC, MVT::f32, Promote);
+
+  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Promote);
+
+  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+  // Not implemented yet.
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+  // We want to legalize GlobalAddress and ConstantPool and
+  // ExternalSymbols nodes into the appropriate instructions to
+  // materialize the address.
+  setOperationAction(ISD::GlobalAddress,  MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool,   MVT::i64, Custom);
+  setOperationAction(ISD::ExternalSymbol, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND,   MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,   MVT::Other, Custom);
+  setOperationAction(ISD::VAARG,   MVT::i32,   Custom);
+
+  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+
+  // $30 is the stack pointer on Alpha.
+  setStackPointerRegisterToSaveRestore(Alpha::R30);
+
+  setJumpBufSize(272);
+  setJumpBufAlignment(16);
+
+  computeRegisterProperties();
+}
+
+/// getSetCCResultType - setcc results are always produced as i64 (0 or 1,
+/// per ZeroOrOneBooleanContent) regardless of the compared type.
+MVT::SimpleValueType AlphaTargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i64;
+}
+
+/// getTargetNodeName - Return a human-readable name for the given
+/// target-specific (AlphaISD) DAG node opcode, or null if unknown.
+const char *AlphaTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case AlphaISD::CVTQT_: return "Alpha::CVTQT_";
+  case AlphaISD::CVTQS_: return "Alpha::CVTQS_";
+  case AlphaISD::CVTTQ_: return "Alpha::CVTTQ_";
+  case AlphaISD::GPRelHi: return "Alpha::GPRelHi";
+  case AlphaISD::GPRelLo: return "Alpha::GPRelLo";
+  case AlphaISD::RelLit: return "Alpha::RelLit";
+  case AlphaISD::GlobalRetAddr: return "Alpha::GlobalRetAddr";
+  case AlphaISD::CALL:   return "Alpha::CALL";
+  case AlphaISD::DivCall: return "Alpha::DivCall";
+  case AlphaISD::RET_FLAG: return "Alpha::RET_FLAG";
+  case AlphaISD::COND_BRANCH_I: return "Alpha::COND_BRANCH_I";
+  case AlphaISD::COND_BRANCH_F: return "Alpha::COND_BRANCH_F";
+  }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+/// Always 4, i.e. functions are aligned to 2^4 = 16 bytes.
+unsigned AlphaTargetLowering::getFunctionAlignment(const Function *F) const {
+  return 4;
+}
+
+/// LowerJumpTable - Materialize a jump table address as a GP-relative
+/// hi/lo pair computed off the global offset table.
+static SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  // FIXME there isn't really any debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
+  SDValue Hi = DAG.getNode(AlphaISD::GPRelHi,  dl, MVT::i64, JTI,
+                             DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+  SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, JTI, Hi);
+  return Lo;
+}
+
+//http://www.cs.arizona.edu/computer.help/policy/DIGITAL_unix/
+//AA-PY8AC-TET1_html/callCH3.html#BLOCK21
+
+//For now, just use variable size stack frame format
+
+//In a standard call, the first six items are passed in registers $16
+//- $21 and/or registers $f16 - $f21. (See Section 4.1.2 for details
+//of argument-to-register correspondence.) The remaining items are
+//collected in a memory argument list that is a naturally aligned
+//array of quadwords. In a standard call, this list, if present, must
+//be passed at 0(SP).
+//7 ... n         0(SP) ... (n-7)*8(SP)
+
+// //#define FP    $15
+// //#define RA    $26
+// //#define PV    $27
+// //#define GP    $29
+// //#define SP    $30
+
+#include "AlphaGenCallingConv.inc"
+
+/// LowerCall - Lower an outgoing call: assign arguments to registers or
+/// stack slots per CC_Alpha, bracket the sequence in CALLSEQ_START/END,
+/// emit the AlphaISD::CALL node, and finally copy the results back out
+/// of the return registers via LowerCallResult.
+SDValue
+AlphaTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               bool &isTailCall,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) {
+  // Alpha target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_Alpha);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getConstant(NumBytes,
+                                                      getPointerTy(), true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    SDValue Arg = Outs[i].Val;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: assert(0 && "Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      // Lazily materialize the stack pointer ($30) only if some argument
+      // actually goes to memory.
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, dl, Alpha::R30, MVT::i64);
+
+      SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   StackPtr,
+                                   DAG.getIntPtrConstant(VA.getLocMemOffset()));
+
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                         PseudoSourceValue::getStack(), 0));
+    }
+  }
+
+  // Transform all store nodes into one single node because all store nodes are
+  // independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag is
+  // necessary since all emitted instructions must be stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(AlphaISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers, per
+/// RetCC_Alpha.  Values narrower than 64 bits arrive promoted and are
+/// truncated back to their declared type.
+///
+SDValue
+AlphaTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                     CallingConv::ID CallConv, bool isVarArg,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                                     DebugLoc dl, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &InVals) {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+                 *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Alpha);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+
+    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                               VA.getLocVT(), InFlag).getValue(1);
+    SDValue RetValue = Chain.getValue(0);
+    InFlag = Chain.getValue(2);
+
+    // If this is an 8/16/32-bit value, it is really passed promoted to 64
+    // bits. Insert an assert[sz]ext to capture this, then truncate to the
+    // right size.
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      RetValue = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), RetValue,
+                             DAG.getValueType(VA.getValVT()));
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      RetValue = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), RetValue,
+                             DAG.getValueType(VA.getValVT()));
+
+    if (VA.getLocInfo() != CCValAssign::Full)
+      RetValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), RetValue);
+
+    InVals.push_back(RetValue);
+  }
+
+  return Chain;
+}
+
+/// LowerFormalArguments - Lower incoming arguments.  The first six are
+/// passed in $16-$21 (integer) and/or $f16-$f21 (floating point); the
+/// rest are loaded from fixed stack slots starting at 0(SP).  For
+/// varargs functions, all six int and six fp argument registers are
+/// additionally spilled to known negative-offset slots so va_arg can
+/// find them (see LowerVAARG).
+SDValue
+AlphaTargetLowering::LowerFormalArguments(SDValue Chain,
+                                          CallingConv::ID CallConv, bool isVarArg,
+                                          const SmallVectorImpl<ISD::InputArg>
+                                            &Ins,
+                                          DebugLoc dl, SelectionDAG &DAG,
+                                          SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  unsigned args_int[] = {
+    Alpha::R16, Alpha::R17, Alpha::R18, Alpha::R19, Alpha::R20, Alpha::R21};
+  unsigned args_float[] = {
+    Alpha::F16, Alpha::F17, Alpha::F18, Alpha::F19, Alpha::F20, Alpha::F21};
+
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    SDValue argt;
+    EVT ObjectVT = Ins[ArgNo].VT;
+    SDValue ArgVal;
+
+    if (ArgNo  < 6) {
+      // Register argument: attach the physreg as a live-in and read it
+      // through the created virtual register.  Note args_int/args_float
+      // entries are overwritten with the vreg for the varargs spill below.
+      switch (ObjectVT.getSimpleVT().SimpleTy) {
+      default:
+        assert(false && "Invalid value type!");
+      case MVT::f64:
+        args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
+                                      &Alpha::F8RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, args_float[ArgNo], ObjectVT);
+        break;
+      case MVT::f32:
+        args_float[ArgNo] = AddLiveIn(MF, args_float[ArgNo],
+                                      &Alpha::F4RCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, args_float[ArgNo], ObjectVT);
+        break;
+      case MVT::i64:
+        args_int[ArgNo] = AddLiveIn(MF, args_int[ArgNo],
+                                    &Alpha::GPRCRegClass);
+        ArgVal = DAG.getCopyFromReg(Chain, dl, args_int[ArgNo], MVT::i64);
+        break;
+      }
+    } else { //more args
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(8, 8 * (ArgNo - 6), true, false);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      //from this parameter
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i64);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0);
+    }
+    InVals.push_back(ArgVal);
+  }
+
+  // If the functions takes variable number of arguments, copy all regs to stack
+  if (isVarArg) {
+    VarArgsOffset = Ins.size() * 8;
+    std::vector<SDValue> LS;
+    for (int i = 0; i < 6; ++i) {
+      // Still-physical entries were not consumed by a named argument above;
+      // make them live-in now so they can be spilled.
+      if (TargetRegisterInfo::isPhysicalRegister(args_int[i]))
+        args_int[i] = AddLiveIn(MF, args_int[i], &Alpha::GPRCRegClass);
+      SDValue argt = DAG.getCopyFromReg(Chain, dl, args_int[i], MVT::i64);
+      int FI = MFI->CreateFixedObject(8, -8 * (6 - i), true, false);
+      if (i == 0) VarArgsBase = FI;
+      SDValue SDFI = DAG.getFrameIndex(FI, MVT::i64);
+      LS.push_back(DAG.getStore(Chain, dl, argt, SDFI, NULL, 0));
+
+      if (TargetRegisterInfo::isPhysicalRegister(args_float[i]))
+        args_float[i] = AddLiveIn(MF, args_float[i], &Alpha::F8RCRegClass);
+      argt = DAG.getCopyFromReg(Chain, dl, args_float[i], MVT::f64);
+      // FP regs are spilled 6*8 bytes below the int regs.
+      FI = MFI->CreateFixedObject(8, - 8 * (12 - i), true, false);
+      SDFI = DAG.getFrameIndex(FI, MVT::i64);
+      LS.push_back(DAG.getStore(Chain, dl, argt, SDFI, NULL, 0));
+    }
+
+    //Set up a token factor with all the stack traffic
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &LS[0], LS.size());
+  }
+
+  return Chain;
+}
+
+/// LowerReturn - Lower a return: restore the return address into $26,
+/// copy up to two return values into $0/$1 (integer) or $f0/$f1
+/// (floating point), and emit AlphaISD::RET_FLAG.
+SDValue
+AlphaTargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 DebugLoc dl, SelectionDAG &DAG) {
+
+  SDValue Copy = DAG.getCopyToReg(Chain, dl, Alpha::R26,
+                                  DAG.getNode(AlphaISD::GlobalRetAddr,
+                                              DebugLoc::getUnknownLoc(),
+                                              MVT::i64),
+                                  SDValue());
+  switch (Outs.size()) {
+  default:
+    llvm_unreachable("Do not know how to return this many arguments!");
+  case 0:
+    break;
+    //return SDValue(); // ret void is legal
+  case 1: {
+    EVT ArgVT = Outs[0].Val.getValueType();
+    unsigned ArgReg;
+    if (ArgVT.isInteger())
+      ArgReg = Alpha::R0;
+    else {
+      assert(ArgVT.isFloatingPoint());
+      ArgReg = Alpha::F0;
+    }
+    Copy = DAG.getCopyToReg(Copy, dl, ArgReg,
+                            Outs[0].Val, Copy.getValue(1));
+    // NOTE(review): the reg is only added when the liveout list is empty,
+    // unlike the find-based check in the two-value case -- confirm intended.
+    if (DAG.getMachineFunction().getRegInfo().liveout_empty())
+      DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg);
+    break;
+  }
+  case 2: {
+    // Both values are assumed to share a register bank; the class of the
+    // first value picks int ($0/$1) vs fp ($f0/$f1) for both.
+    EVT ArgVT = Outs[0].Val.getValueType();
+    unsigned ArgReg1, ArgReg2;
+    if (ArgVT.isInteger()) {
+      ArgReg1 = Alpha::R0;
+      ArgReg2 = Alpha::R1;
+    } else {
+      assert(ArgVT.isFloatingPoint());
+      ArgReg1 = Alpha::F0;
+      ArgReg2 = Alpha::F1;
+    }
+    Copy = DAG.getCopyToReg(Copy, dl, ArgReg1,
+                            Outs[0].Val, Copy.getValue(1));
+    if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
+                  DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg1)
+        == DAG.getMachineFunction().getRegInfo().liveout_end())
+      DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg1);
+    Copy = DAG.getCopyToReg(Copy, dl, ArgReg2,
+                            Outs[1].Val, Copy.getValue(1));
+    if (std::find(DAG.getMachineFunction().getRegInfo().liveout_begin(),
+                   DAG.getMachineFunction().getRegInfo().liveout_end(), ArgReg2)
+        == DAG.getMachineFunction().getRegInfo().liveout_end())
+      DAG.getMachineFunction().getRegInfo().addLiveOut(ArgReg2);
+    break;
+  }
+  }
+  return DAG.getNode(AlphaISD::RET_FLAG, dl,
+                     MVT::Other, Copy, Copy.getValue(1));
+}
+
+/// LowerVAARG - Shared helper for VAARG lowering.  The Alpha va_list is a
+/// {base pointer, i32 offset} pair: DataPtr is computed as base+offset
+/// (biased down by 6*8 bytes for fp types while offset < 6*8, i.e. while
+/// still reading from the register save area), and the stored offset is
+/// then advanced by 8.  Returns the updated chain and the address of the
+/// argument via the Chain/DataPtr out-parameters.
+void AlphaTargetLowering::LowerVAARG(SDNode *N, SDValue &Chain,
+                                     SDValue &DataPtr, SelectionDAG &DAG) {
+  Chain = N->getOperand(0);
+  SDValue VAListP = N->getOperand(1);
+  const Value *VAListS = cast<SrcValueSDNode>(N->getOperand(2))->getValue();
+  DebugLoc dl = N->getDebugLoc();
+
+  SDValue Base = DAG.getLoad(MVT::i64, dl, Chain, VAListP, VAListS, 0);
+  SDValue Tmp = DAG.getNode(ISD::ADD, dl, MVT::i64, VAListP,
+                              DAG.getConstant(8, MVT::i64));
+  SDValue Offset = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Base.getValue(1),
+                                    Tmp, NULL, 0, MVT::i32);
+  DataPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Base, Offset);
+  if (N->getValueType(0).isFloatingPoint())
+  {
+    //if fp && Offset < 6*8, then subtract 6*8 from DataPtr
+    SDValue FPDataPtr = DAG.getNode(ISD::SUB, dl, MVT::i64, DataPtr,
+                                      DAG.getConstant(8*6, MVT::i64));
+    SDValue CC = DAG.getSetCC(dl, MVT::i64, Offset,
+                                DAG.getConstant(8*6, MVT::i64), ISD::SETLT);
+    DataPtr = DAG.getNode(ISD::SELECT, dl, MVT::i64, CC, FPDataPtr, DataPtr);
+  }
+
+  // Write back offset + 8 (stored as an i32) for the next va_arg.
+  SDValue NewOffset = DAG.getNode(ISD::ADD, dl, MVT::i64, Offset,
+                                    DAG.getConstant(8, MVT::i64));
+  Chain = DAG.getTruncStore(Offset.getValue(1), dl, NewOffset, Tmp, NULL, 0,
+                            MVT::i32);
+}
+
+/// LowerOperation - Provide custom lowering hooks for the operations
+/// marked Custom in the constructor.
+///
+SDValue AlphaTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
+  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
+
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default: break;    // Don't custom lower most intrinsics.
+    case Intrinsic::alpha_umulh:
+      // alpha.umulh maps directly to a 64x64 -> high-64 unsigned multiply.
+      return DAG.getNode(ISD::MULHU, dl, MVT::i64,
+                         Op.getOperand(1), Op.getOperand(2));
+    }
+  }
+  // NOTE(review): there is no break above, so an intrinsic not handled by
+  // the inner switch falls through into the SRL_PARTS case -- confirm
+  // this is intended.
+
+  case ISD::SRL_PARTS: {
+    SDValue ShOpLo = Op.getOperand(0);
+    SDValue ShOpHi = Op.getOperand(1);
+    SDValue ShAmt  = Op.getOperand(2);
+    // bm = 64 - shift amount.
+    SDValue bm = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                             DAG.getConstant(64, MVT::i64), ShAmt);
+    SDValue BMCC = DAG.getSetCC(dl, MVT::i64, bm,
+                                DAG.getConstant(0, MVT::i64), ISD::SETLE);
+    // if 64 - shAmt <= 0
+    SDValue Hi_Neg = DAG.getConstant(0, MVT::i64);
+    SDValue ShAmt_Neg = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                    DAG.getConstant(0, MVT::i64), bm);
+    SDValue Lo_Neg = DAG.getNode(ISD::SRL, dl, MVT::i64, ShOpHi, ShAmt_Neg);
+    // else
+    SDValue carries = DAG.getNode(ISD::SHL, dl, MVT::i64, ShOpHi, bm);
+    SDValue Hi_Pos =  DAG.getNode(ISD::SRL, dl, MVT::i64, ShOpHi, ShAmt);
+    SDValue Lo_Pos = DAG.getNode(ISD::SRL, dl, MVT::i64, ShOpLo, ShAmt);
+    Lo_Pos = DAG.getNode(ISD::OR, dl, MVT::i64, Lo_Pos, carries);
+    // Merge
+    SDValue Hi = DAG.getNode(ISD::SELECT, dl, MVT::i64, BMCC, Hi_Neg, Hi_Pos);
+    SDValue Lo = DAG.getNode(ISD::SELECT, dl, MVT::i64, BMCC, Lo_Neg, Lo_Pos);
+    SDValue Ops[2] = { Lo, Hi };
+    return DAG.getMergeValues(Ops, 2, dl);
+  }
+    //  case ISD::SRA_PARTS:
+
+    //  case ISD::SHL_PARTS:
+
+
+  case ISD::SINT_TO_FP: {
+    assert(Op.getOperand(0).getValueType() == MVT::i64 &&
+           "Unhandled SINT_TO_FP type in custom expander!");
+    // Move the integer bits into an FP register, then convert with
+    // CVTQT (to f64) or CVTQS (to f32).
+    SDValue LD;
+    bool isDouble = Op.getValueType() == MVT::f64;
+    LD = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, Op.getOperand(0));
+    SDValue FP = DAG.getNode(isDouble?AlphaISD::CVTQT_:AlphaISD::CVTQS_, dl,
+                               isDouble?MVT::f64:MVT::f32, LD);
+    return FP;
+  }
+  case ISD::FP_TO_SINT: {
+    bool isDouble = Op.getOperand(0).getValueType() == MVT::f64;
+    SDValue src = Op.getOperand(0);
+
+    if (!isDouble) //Promote
+      src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, src);
+
+    // CVTTQ converts in the FP register file; move the bits back to i64.
+    src = DAG.getNode(AlphaISD::CVTTQ_, dl, MVT::f64, src);
+
+    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, src);
+  }
+  case ISD::ConstantPool: {
+    ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+    Constant *C = CP->getConstVal();
+    SDValue CPI = DAG.getTargetConstantPool(C, MVT::i64, CP->getAlignment());
+    // FIXME there isn't really any debug info here
+
+    SDValue Hi = DAG.getNode(AlphaISD::GPRelHi,  dl, MVT::i64, CPI,
+                               DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+    SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, CPI, Hi);
+    return Lo;
+  }
+  case ISD::GlobalTLSAddress:
+    llvm_unreachable("TLS not implemented for Alpha.");
+  case ISD::GlobalAddress: {
+    GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+    GlobalValue *GV = GSDN->getGlobal();
+    SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i64, GSDN->getOffset());
+    // FIXME there isn't really any debug info here
+
+    //    if (!GV->hasWeakLinkage() && !GV->isDeclaration() && !GV->hasLinkOnceLinkage()) {
+    // Local symbols are addressed GP-relative; everything else goes
+    // through a literal (GOT) load.
+    if (GV->hasLocalLinkage()) {
+      SDValue Hi = DAG.getNode(AlphaISD::GPRelHi,  dl, MVT::i64, GA,
+                                DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+      SDValue Lo = DAG.getNode(AlphaISD::GPRelLo, dl, MVT::i64, GA, Hi);
+      return Lo;
+    } else
+      return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64, GA,
+                         DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+  }
+  case ISD::ExternalSymbol: {
+    return DAG.getNode(AlphaISD::RelLit, dl, MVT::i64,
+                       DAG.getTargetExternalSymbol(cast<ExternalSymbolSDNode>(Op)
+                                                   ->getSymbol(), MVT::i64),
+                       DAG.getGLOBAL_OFFSET_TABLE(MVT::i64));
+  }
+
+  case ISD::UREM:
+  case ISD::SREM:
+    //Expand only on constant case
+    if (Op.getOperand(1).getOpcode() == ISD::Constant) {
+      // rem = a - (a / b) * b, with the division built as multiplies.
+      EVT VT = Op.getNode()->getValueType(0);
+      SDValue Tmp1 = Op.getNode()->getOpcode() == ISD::UREM ?
+        BuildUDIV(Op.getNode(), DAG, NULL) :
+        BuildSDIV(Op.getNode(), DAG, NULL);
+      Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Op.getOperand(1));
+      Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Op.getOperand(0), Tmp1);
+      return Tmp1;
+    }
+    //fall through
+  case ISD::SDIV:
+  case ISD::UDIV:
+    if (Op.getValueType().isInteger()) {
+      if (Op.getOperand(1).getOpcode() == ISD::Constant)
+        return Op.getOpcode() == ISD::SDIV ? BuildSDIV(Op.getNode(), DAG, NULL)
+          : BuildUDIV(Op.getNode(), DAG, NULL);
+      // Non-constant divisor: call the runtime division routines through
+      // the special DivCall node.
+      const char* opstr = 0;
+      switch (Op.getOpcode()) {
+      case ISD::UREM: opstr = "__remqu"; break;
+      case ISD::SREM: opstr = "__remq";  break;
+      case ISD::UDIV: opstr = "__divqu"; break;
+      case ISD::SDIV: opstr = "__divq";  break;
+      }
+      SDValue Tmp1 = Op.getOperand(0),
+        Tmp2 = Op.getOperand(1),
+        Addr = DAG.getExternalSymbol(opstr, MVT::i64);
+      return DAG.getNode(AlphaISD::DivCall, dl, MVT::i64, Addr, Tmp1, Tmp2);
+    }
+    break;
+
+  case ISD::VAARG: {
+    SDValue Chain, DataPtr;
+    LowerVAARG(Op.getNode(), Chain, DataPtr, DAG);
+
+    SDValue Result;
+    if (Op.getValueType() == MVT::i32)
+      Result = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Chain, DataPtr,
+                              NULL, 0, MVT::i32);
+    else
+      Result = DAG.getLoad(Op.getValueType(), dl, Chain, DataPtr, NULL, 0);
+    return Result;
+  }
+  case ISD::VACOPY: {
+    // Copy the {base, offset} va_list pair from SrcP to DestP.
+    SDValue Chain = Op.getOperand(0);
+    SDValue DestP = Op.getOperand(1);
+    SDValue SrcP = Op.getOperand(2);
+    const Value *DestS = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+    const Value *SrcS = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+    SDValue Val = DAG.getLoad(getPointerTy(), dl, Chain, SrcP, SrcS, 0);
+    SDValue Result = DAG.getStore(Val.getValue(1), dl, Val, DestP, DestS, 0);
+    SDValue NP = DAG.getNode(ISD::ADD, dl, MVT::i64, SrcP,
+                               DAG.getConstant(8, MVT::i64));
+    Val = DAG.getExtLoad(ISD::SEXTLOAD, dl, MVT::i64, Result,
+                         NP, NULL,0, MVT::i32);
+    SDValue NPD = DAG.getNode(ISD::ADD, dl, MVT::i64, DestP,
+                                DAG.getConstant(8, MVT::i64));
+    return DAG.getTruncStore(Val.getValue(1), dl, Val, NPD, NULL, 0, MVT::i32);
+  }
+  case ISD::VASTART: {
+    SDValue Chain = Op.getOperand(0);
+    SDValue VAListP = Op.getOperand(1);
+    const Value *VAListS = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+    // vastart stores the address of the VarArgsBase and VarArgsOffset
+    SDValue FR  = DAG.getFrameIndex(VarArgsBase, MVT::i64);
+    SDValue S1  = DAG.getStore(Chain, dl, FR, VAListP, VAListS, 0);
+    SDValue SA2 = DAG.getNode(ISD::ADD, dl, MVT::i64, VAListP,
+                                DAG.getConstant(8, MVT::i64));
+    return DAG.getTruncStore(S1, dl, DAG.getConstant(VarArgsOffset, MVT::i64),
+                             SA2, NULL, 0, MVT::i32);
+  }
+  case ISD::RETURNADDR:
+    return DAG.getNode(AlphaISD::GlobalRetAddr, DebugLoc::getUnknownLoc(),
+                       MVT::i64);
+      //FIXME: implement
+  case ISD::FRAMEADDR:          break;
+  }
+
+  return SDValue();
+}
+
+/// ReplaceNodeResults - Custom-promote illegal-typed results.  The only
+/// node handled is an i32 VAARG, which is re-lowered through LowerVAARG
+/// and loaded at the computed address; the load's chain is pushed as the
+/// node's second result.
+void AlphaTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>&Results,
+                                             SelectionDAG &DAG) {
+  DebugLoc dl = N->getDebugLoc();
+  assert(N->getValueType(0) == MVT::i32 &&
+         N->getOpcode() == ISD::VAARG &&
+         "Unknown node to custom promote!");
+
+  SDValue Chain, DataPtr;
+  LowerVAARG(N, Chain, DataPtr, DAG);
+  SDValue Res = DAG.getLoad(N->getValueType(0), dl, Chain, DataPtr, NULL, 0);
+  Results.push_back(Res);
+  Results.push_back(SDValue(Res.getNode(), 1));
+}
+
+
+//Inline Asm
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.  'f' (FP reg) and 'r' (GP reg) are
+/// register-class constraints; everything else defers to the base class.
+AlphaTargetLowering::ConstraintType
+AlphaTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+    case 'f':
+    case 'r':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegClassForInlineAsmConstraint - Return the full list of registers
+/// usable for a single-letter inline-asm constraint: all 32 FP registers
+/// for 'f', all 32 general-purpose registers for 'r'.
+std::vector<unsigned> AlphaTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;  // Unknown constraint letter
+    case 'f':
+      return make_vector<unsigned>(Alpha::F0 , Alpha::F1 , Alpha::F2 ,
+                                   Alpha::F3 , Alpha::F4 , Alpha::F5 ,
+                                   Alpha::F6 , Alpha::F7 , Alpha::F8 ,
+                                   Alpha::F9 , Alpha::F10, Alpha::F11,
+                                   Alpha::F12, Alpha::F13, Alpha::F14,
+                                   Alpha::F15, Alpha::F16, Alpha::F17,
+                                   Alpha::F18, Alpha::F19, Alpha::F20,
+                                   Alpha::F21, Alpha::F22, Alpha::F23,
+                                   Alpha::F24, Alpha::F25, Alpha::F26,
+                                   Alpha::F27, Alpha::F28, Alpha::F29,
+                                   Alpha::F30, Alpha::F31, 0);
+    case 'r':
+      return make_vector<unsigned>(Alpha::R0 , Alpha::R1 , Alpha::R2 ,
+                                   Alpha::R3 , Alpha::R4 , Alpha::R5 ,
+                                   Alpha::R6 , Alpha::R7 , Alpha::R8 ,
+                                   Alpha::R9 , Alpha::R10, Alpha::R11,
+                                   Alpha::R12, Alpha::R13, Alpha::R14,
+                                   Alpha::R15, Alpha::R16, Alpha::R17,
+                                   Alpha::R18, Alpha::R19, Alpha::R20,
+                                   Alpha::R21, Alpha::R22, Alpha::R23,
+                                   Alpha::R24, Alpha::R25, Alpha::R26,
+                                   Alpha::R27, Alpha::R28, Alpha::R29,
+                                   Alpha::R30, Alpha::R31, 0);
+    }
+  }
+
+  return std::vector<unsigned>();
+}
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// EmitInstrWithCustomInserter - Expand the atomic pseudo-instructions
+/// CAS32/64, LAS32/64 and SWAP32/64 into an explicit load-locked /
+/// store-conditional retry loop, splitting the block around the loop.
+/// Returns the sink block that execution continues in.
+MachineBasicBlock *
+AlphaTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  assert((MI->getOpcode() == Alpha::CAS32 ||
+          MI->getOpcode() == Alpha::CAS64 ||
+          MI->getOpcode() == Alpha::LAS32 ||
+          MI->getOpcode() == Alpha::LAS64 ||
+          MI->getOpcode() == Alpha::SWAP32 ||
+          MI->getOpcode() == Alpha::SWAP64) &&
+         "Unexpected instr type to insert");
+
+  bool is32 = MI->getOpcode() == Alpha::CAS32 ||
+    MI->getOpcode() == Alpha::LAS32 ||
+    MI->getOpcode() == Alpha::SWAP32;
+
+  //Load locked store conditional for atomic ops take on the same form
+  //start:
+  //ll
+  //do stuff (maybe branch to exit)
+  //sc
+  //test sc and maybe branch to start
+  //exit:
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  DebugLoc dl = MI->getDebugLoc();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *llscMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+  // Inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
+         E = BB->succ_end(); I != E; ++I)
+    EM->insert(std::make_pair(*I, sinkMBB));
+
+  sinkMBB->transferSuccessors(thisMBB);
+
+  F->insert(It, llscMBB);
+  F->insert(It, sinkMBB);
+
+  BuildMI(thisMBB, dl, TII->get(Alpha::BR)).addMBB(llscMBB);
+
+  unsigned reg_res = MI->getOperand(0).getReg(),
+    reg_ptr = MI->getOperand(1).getReg(),
+    reg_v2 = MI->getOperand(2).getReg(),
+    reg_store = F->getRegInfo().createVirtualRegister(&Alpha::GPRCRegClass);
+
+  // Load-locked the current memory value into the result register.
+  BuildMI(llscMBB, dl, TII->get(is32 ? Alpha::LDL_L : Alpha::LDQ_L),
+          reg_res).addImm(0).addReg(reg_ptr);
+  switch (MI->getOpcode()) {
+  case Alpha::CAS32:
+  case Alpha::CAS64: {
+    // Compare-and-swap: exit the loop if the loaded value != compare value;
+    // otherwise store the swap value (operand 3).
+    unsigned reg_cmp
+      = F->getRegInfo().createVirtualRegister(&Alpha::GPRCRegClass);
+    BuildMI(llscMBB, dl, TII->get(Alpha::CMPEQ), reg_cmp)
+      .addReg(reg_v2).addReg(reg_res);
+    BuildMI(llscMBB, dl, TII->get(Alpha::BEQ))
+      .addImm(0).addReg(reg_cmp).addMBB(sinkMBB);
+    BuildMI(llscMBB, dl, TII->get(Alpha::BISr), reg_store)
+      .addReg(Alpha::R31).addReg(MI->getOperand(3).getReg());
+    break;
+  }
+  case Alpha::LAS32:
+  case Alpha::LAS64: {
+    // Load-and-add: the value to store is loaded value + operand.
+    BuildMI(llscMBB, dl,TII->get(is32 ? Alpha::ADDLr : Alpha::ADDQr), reg_store)
+      .addReg(reg_res).addReg(reg_v2);
+    break;
+  }
+  case Alpha::SWAP32:
+  case Alpha::SWAP64: {
+    // Swap: unconditionally store the new value.
+    BuildMI(llscMBB, dl, TII->get(Alpha::BISr), reg_store)
+      .addReg(reg_v2).addReg(reg_v2);
+    break;
+  }
+  }
+  // Store-conditional; reg_store becomes 0 on failure, in which case retry.
+  BuildMI(llscMBB, dl, TII->get(is32 ? Alpha::STL_C : Alpha::STQ_C), reg_store)
+    .addReg(reg_store).addImm(0).addReg(reg_ptr);
+  BuildMI(llscMBB, dl, TII->get(Alpha::BEQ))
+    .addImm(0).addReg(reg_store).addMBB(llscMBB);
+  BuildMI(llscMBB, dl, TII->get(Alpha::BR)).addMBB(sinkMBB);
+
+  thisMBB->addSuccessor(llscMBB);
+  llscMBB->addSuccessor(llscMBB);
+  llscMBB->addSuccessor(sinkMBB);
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+
+  return sinkMBB;
+}
+
+/// isOffsetFoldingLegal - Return false: constant offsets may not be folded
+/// into global-address nodes on Alpha.
+bool
+AlphaTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Alpha target isn't yet aware of offsets.
+  return false;
+}
+
+/// isFPImmLegal - Only +/-0.0 can be materialized without a constant-pool
+/// load (via register F31, which reads as zero).
+bool AlphaTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return false;
+  // +0.0   F31
+  // +0.0f  F31
+  // -0.0  -F31
+  // -0.0f -F31
+  return Imm.isZero() || Imm.isNegZero();
+}
diff --git a/lib/Target/Alpha/AlphaISelLowering.h b/lib/Target/Alpha/AlphaISelLowering.h
new file mode 100644
index 0000000..0f17025
--- /dev/null
+++ b/lib/Target/Alpha/AlphaISelLowering.h
@@ -0,0 +1,138 @@
+//===-- AlphaISelLowering.h - Alpha DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Alpha uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+#define LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
+
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "Alpha.h"
+
+namespace llvm {
+
+  namespace AlphaISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      // These correspond to the identically-named Alpha instructions.
+      CVTQT_, CVTQS_, CVTTQ_,
+
+      /// GPRelHi/GPRelLo - These represent the high and low 16-bit
+      /// parts of a global address respectively.
+      GPRelHi, GPRelLo, 
+
+      /// RelLit - Literal Relocation of a Global
+      RelLit,
+
+      /// GlobalRetAddr - used to restore the return address
+      GlobalRetAddr,
+      
+      /// CALL - Normal call.
+      CALL,
+
+      /// DivCall - used for special library calls for div and rem
+      DivCall,
+      
+      /// return flag operand
+      RET_FLAG,
+
+      /// CHAIN = COND_BRANCH CHAIN, OPC, (G|F)PRC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  
+      /// *PRC is the input register to compare to zero,
+      /// OPC is the branch opcode to use (e.g. Alpha::BEQ),
+      /// DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH_I, COND_BRANCH_F
+
+    };
+  }
+
+  /// AlphaTargetLowering - Alpha-specific SelectionDAG lowering hooks.
+  class AlphaTargetLowering : public TargetLowering {
+    int VarArgsOffset;  // What is the offset to the first vaarg
+    int VarArgsBase;    // What is the base FrameIndex
+  public:
+    explicit AlphaTargetLowering(TargetMachine &TM);
+    
+    /// getSetCCResultType - Get the SETCC result ValueType
+    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG);
+
+    /// getTargetNodeName - Friendly names for dumps of the AlphaISD nodes.
+    const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// LowerCallResult - Recover the return values of a call described by
+    /// Ins, appending them to InVals.
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+
+    /// getConstraintType - Classify a single-letter inline-asm constraint.
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    /// getRegClassForInlineAsmConstraint - Registers usable for a
+    /// register-class inline-asm constraint ('f' or 'r').
+    std::vector<unsigned> 
+      getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                        EVT VT) const;
+
+    /// EmitInstrWithCustomInserter - Expand atomic pseudo instructions
+    /// (CAS/LAS/SWAP) into load-locked/store-conditional loops.
+    MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+  private:
+    // Helpers for custom lowering.
+    void LowerVAARG(SDNode *N, SDValue &Chain, SDValue &DataPtr,
+                    SelectionDAG &DAG);
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+  };
+}
+
+#endif   // LLVM_TARGET_ALPHA_ALPHAISELLOWERING_H
diff --git a/lib/Target/Alpha/AlphaInstrFormats.td b/lib/Target/Alpha/AlphaInstrFormats.td
new file mode 100644
index 0000000..6d82875
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrFormats.td
@@ -0,0 +1,268 @@
+//===- AlphaInstrFormats.td - Alpha Instruction Formats ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//3.3:
+//Memory
+//Branch
+//Operate
+//Floating-point
+//PALcode
+
+// Immediate operand kinds used by the formats below; the names follow the
+// (s|u)<bits>imm convention (signed/unsigned, bit width).  All are carried
+// as i64 values.
+def u8imm   : Operand<i64>;
+def s14imm  : Operand<i64>;
+def s16imm  : Operand<i64>;
+def s21imm  : Operand<i64>;
+def s64imm  : Operand<i64>;
+def u64imm  : Operand<i64>;
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+// Alpha instruction baseline
+// Common base of all Alpha instructions: a 32-bit encoding whose top six
+// bits (31-26) hold the major opcode.
+class InstAlpha<bits<6> op, string asmstr, InstrItinClass itin> : Instruction {
+  field bits<32> Inst;   // Full 32-bit instruction encoding.
+  let Namespace = "Alpha";
+  let AsmString = asmstr;
+  let Inst{31-26} = op;
+  let Itinerary = itin;
+}
+
+
+//3.3.1
+// Memory format: Ra, base register Rb, and a 16-bit displacement.
+class MForm<bits<6> opcode, bit load, string asmstr, list<dag> pattern, InstrItinClass itin> 
+        : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let canFoldAsLoad = load;
+  let Defs = [R28]; //We may use this for frame index calculations, so reserve it here
+
+  bits<5> Ra;
+  bits<16> disp;
+  bits<5> Rb;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-0} = disp;
+}
+// Memory format with a fixed function code in the displacement field,
+// writing Ra (e.g. memory-barrier/counter style instructions).
+class MfcForm<bits<6> opcode, bits<16> fc, string asmstr, InstrItinClass itin> 
+        : InstAlpha<opcode, asmstr, itin> {    
+  bits<5> Ra;
+
+  let OutOperandList = (ops GPRC:$RA);
+  let InOperandList = (ops);
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = 0;
+  let Inst{15-0} = fc;
+}
+// Same as MfcForm but with no register operands at all.
+class MfcPForm<bits<6> opcode, bits<16> fc, string asmstr, InstrItinClass itin> 
+        : InstAlpha<opcode, asmstr, itin> {    
+  let OutOperandList = (ops);
+  let InOperandList = (ops);
+  let Inst{25-21} = 0;
+  let Inst{20-16} = 0;
+  let Inst{15-0} = fc;
+}
+
+// Memory branch format: Ra, Rb, 2-bit branch-type field TB, and a 14-bit
+// displacement.
+class MbrForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, InstrItinClass itin>
+    : InstAlpha<opcode, asmstr, itin> {
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<14> disp;
+
+  let OutOperandList = (ops);
+  let InOperandList = OL;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-14} = TB;
+  let Inst{13-0} = disp;
+}
+// MbrForm variant that also carries a selection pattern.
+class MbrpForm<bits<6> opcode, bits<2> TB, dag OL, string asmstr, list<dag> pattern, InstrItinClass itin>
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern=pattern;
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<14> disp;
+
+  let OutOperandList = (ops);
+  let InOperandList = OL;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-14} = TB;
+  let Inst{13-0} = disp;
+}
+
+//3.3.2
+// Basic-block target operand for branch instructions.
+def target : Operand<OtherVT> {}
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+// Branch format with no selection pattern: Ra plus a 21-bit displacement.
+class BFormN<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+   : InstAlpha<opcode, asmstr, itin> {
+  let OutOperandList = (ops);
+  let InOperandList = OL;
+  bits<64> Opc; //dummy
+  bits<5> Ra;
+  bits<21> disp;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-0} = disp;
+}
+}
+
+// Branch format with a selection pattern; the sole input operand is the
+// destination basic block.
+let isBranch = 1, isTerminator = 1 in
+class BFormD<bits<6> opcode, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OutOperandList = (ops);
+  let InOperandList = (ops target:$DISP);
+  bits<5> Ra;
+  bits<21> disp;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-0} = disp;
+}
+
+//3.3.3
+// Operate format, register-register: Rc = Ra op Rb, with a 7-bit function
+// code selecting the operation (bit 12 = 0 marks the register form).
+class OForm<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OutOperandList = (outs GPRC:$RC);
+  let InOperandList = (ins GPRC:$RA, GPRC:$RB);
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<7> Function = fun;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+// Operate format with only one source register (Ra field hard-wired to 31,
+// the zero register): Rc = op Rb.
+class OForm2<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OutOperandList = (outs GPRC:$RC);
+  let InOperandList = (ins GPRC:$RB);
+
+  bits<5> Rc;
+  bits<5> Rb;
+  bits<7> Function = fun;
+
+  let Inst{25-21} = 31;
+  let Inst{20-16} = Rb;
+  let Inst{15-13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+// Operate format for conditional moves: RDEST = RCOND ? RTRUE : RFALSE,
+// where RFALSE is tied to RDEST (two-address form).
+class OForm4<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OutOperandList = (outs GPRC:$RDEST);
+  let InOperandList = (ins GPRC:$RCOND, GPRC:$RTRUE, GPRC:$RFALSE);
+  let Constraints = "$RFALSE = $RDEST";
+  let DisableEncoding = "$RFALSE";
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<5> Rb;
+  bits<7> Function = fun;
+
+//  let isTwoAddress = 1;
+  let Inst{25-21} = Ra;
+  let Inst{20-16} = Rb;
+  let Inst{15-13} = 0;
+  let Inst{12} = 0;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+
+// Operate format, register-literal: Rc = Ra op LIT, where LIT is an 8-bit
+// literal in bits 20-13 (bit 12 = 1 marks the literal form).
+class OFormL<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OutOperandList = (outs GPRC:$RC);
+  let InOperandList = (ins GPRC:$RA, u8imm:$L);
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<8> LIT;
+  bits<7> Function = fun;
+
+  let Inst{25-21} = Ra;
+  let Inst{20-13} = LIT;
+  let Inst{12} = 1;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+// Conditional-move format with an immediate true operand (literal form of
+// OForm4); RFALSE is tied to RDEST.
+class OForm4L<bits<6> opcode, bits<7> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+  let OutOperandList = (outs GPRC:$RDEST);
+  let InOperandList = (ins GPRC:$RCOND, s64imm:$RTRUE, GPRC:$RFALSE);
+  let Constraints = "$RFALSE = $RDEST";
+  let DisableEncoding = "$RFALSE";
+
+  bits<5> Rc;
+  bits<5> Ra;
+  bits<8> LIT;
+  bits<7> Function = fun;
+
+//  let isTwoAddress = 1;
+  let Inst{25-21} = Ra;
+  let Inst{20-13} = LIT;
+  let Inst{12} = 1;
+  let Inst{11-5} = Function;
+  let Inst{4-0} = Rc;
+}
+
+//3.3.4
+// Floating-point operate format: Fc = Fa op Fb, with an 11-bit function
+// code in bits 15-5.
+class FPForm<bits<6> opcode, bits<11> fun, string asmstr, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<opcode, asmstr, itin> {
+  let Pattern = pattern;
+
+  bits<5> Fc;
+  bits<5> Fa;
+  bits<5> Fb;
+  bits<11> Function = fun;
+
+  let Inst{25-21} = Fa;
+  let Inst{20-16} = Fb;
+  let Inst{15-5} = Function;
+  let Inst{4-0} = Fc;
+}
+
+//3.3.5
+// PALcode format: a single 26-bit function code selecting the PAL routine.
+class PALForm<bits<6> opcode, dag OL, string asmstr, InstrItinClass itin>
+    : InstAlpha<opcode, asmstr, itin> {
+  let OutOperandList = (ops);
+  let InOperandList = OL;
+  bits<26> Function;
+
+  let Inst{25-0} = Function;
+}
+
+
+// Pseudo instructions.
+class PseudoInstAlpha<dag OOL, dag IOL, string nm, list<dag> pattern, InstrItinClass itin> 
+    : InstAlpha<0, nm, itin>  {
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let Pattern = pattern;
+
+}
diff --git a/lib/Target/Alpha/AlphaInstrInfo.cpp b/lib/Target/Alpha/AlphaInstrInfo.cpp
new file mode 100644
index 0000000..39f0749
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.cpp
@@ -0,0 +1,454 @@
+//===- AlphaInstrInfo.cpp - Alpha Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaMachineFunctionInfo.h"
+#include "AlphaGenInstrInfo.inc"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+// Initialize from the TableGen-generated instruction table (AlphaInsts) and
+// construct the register-info object bound to this instruction info.
+AlphaInstrInfo::AlphaInstrInfo()
+  : TargetInstrInfoImpl(AlphaInsts, array_lengthof(AlphaInsts)),
+    RI(*this) { }
+
+
+/// isMoveInstr - Return true if MI is a register-to-register move: an
+/// integer BIS or floating-point CPYS variant whose two source operands are
+/// the same register.  Reports source/dest; sub-register indices are 0.
+bool AlphaInstrInfo::isMoveInstr(const MachineInstr& MI,
+                                 unsigned& sourceReg, unsigned& destReg,
+                                 unsigned& SrcSR, unsigned& DstSR) const {
+  unsigned oc = MI.getOpcode();
+  if (oc == Alpha::BISr   || 
+      oc == Alpha::CPYSS  || 
+      oc == Alpha::CPYST  ||
+      oc == Alpha::CPYSSt || 
+      oc == Alpha::CPYSTs) {
+    // or r1, r2, r2 
+    // cpys(s|t) r1 r2 r2
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           MI.getOperand(2).isReg() &&
+           "invalid Alpha BIS instruction!");
+    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      SrcSR = DstSR = 0;
+      return true;
+    }
+  }
+  return false;
+}
+
+/// isLoadFromStackSlot - If MI is a load whose address is a frame index,
+/// return the destination register and set FrameIndex; otherwise return 0.
+unsigned 
+AlphaInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                    int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  case Alpha::LDL:
+  case Alpha::LDQ:
+  case Alpha::LDBU:
+  case Alpha::LDWU:
+  case Alpha::LDS:
+  case Alpha::LDT:
+    if (MI->getOperand(1).isFI()) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If MI is a store whose address is a frame index,
+/// return the source register and set FrameIndex; otherwise return 0.
+unsigned 
+AlphaInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                   int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  case Alpha::STL:
+  case Alpha::STQ:
+  case Alpha::STB:
+  case Alpha::STW:
+  case Alpha::STS:
+  case Alpha::STT:
+    if (MI->getOperand(1).isFI()) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// isAlphaIntCondCode - Return true if Opcode is one of the integer
+/// conditional branches (as opposed to the FB* floating-point branches).
+static bool isAlphaIntCondCode(unsigned Opcode) {
+  switch (Opcode) {
+  case Alpha::BEQ: 
+  case Alpha::BNE: 
+  case Alpha::BGE: 
+  case Alpha::BGT: 
+  case Alpha::BLE: 
+  case Alpha::BLT: 
+  case Alpha::BLBC: 
+  case Alpha::BLBS:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// InsertBranch - Insert an unconditional (BR) or conditional
+/// (COND_BRANCH_I/COND_BRANCH_F) branch at the end of MBB, as described by
+/// TBB/FBB/Cond (the form produced by AnalyzeBranch).  Returns the number
+/// of instructions inserted.
+unsigned AlphaInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                                      MachineBasicBlock *TBB,
+                                      MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME this should probably have a DebugLoc argument
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) && 
+         "Alpha branch conditions have two components!");
+
+  // One-way branch.
+  if (FBB == 0) {
+    if (Cond.empty())   // Unconditional branch
+      BuildMI(&MBB, dl, get(Alpha::BR)).addMBB(TBB);
+    else                // Conditional branch
+      if (isAlphaIntCondCode(Cond[0].getImm()))
+        BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_I))
+          .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+      else
+        BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_F))
+          .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+    return 1;
+  }
+  
+  // Two-way Conditional Branch.
+  if (isAlphaIntCondCode(Cond[0].getImm()))
+    BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_I))
+      .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+  else
+    BuildMI(&MBB, dl, get(Alpha::COND_BRANCH_F))
+      .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
+  BuildMI(&MBB, dl, get(Alpha::BR)).addMBB(FBB);
+  return 2;
+}
+
+/// copyRegToReg - Emit a register-to-register copy (BIS for GPRs, CPYSS/
+/// CPYST for f32/f64 FP regs).  Returns false if the copy is not supported
+/// (cross-class copies or unknown register classes).
+bool AlphaInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  const TargetRegisterClass *DestRC,
+                                  const TargetRegisterClass *SrcRC) const {
+  //cerr << "copyRegToReg " << DestReg << " <- " << SrcReg << "\n";
+  if (DestRC != SrcRC) {
+    // Not yet supported!
+    return false;
+  }
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  if (DestRC == Alpha::GPRCRegisterClass) {
+    BuildMI(MBB, MI, DL, get(Alpha::BISr), DestReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg);
+  } else if (DestRC == Alpha::F4RCRegisterClass) {
+    BuildMI(MBB, MI, DL, get(Alpha::CPYSS), DestReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg);
+  } else if (DestRC == Alpha::F8RCRegisterClass) {
+    BuildMI(MBB, MI, DL, get(Alpha::CPYST), DestReg)
+      .addReg(SrcReg)
+      .addReg(SrcReg);
+  } else {
+    // Attempt to copy register that is not GPR or FPR
+    return false;
+  }
+  
+  return true;
+}
+
+/// storeRegToStackSlot - Spill SrcReg to the stack slot FrameIdx using the
+/// store matching its register class (STS/STT/STQ).
+void
+AlphaInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill, int FrameIdx,
+                                    const TargetRegisterClass *RC) const {
+  //cerr << "Trying to store " << getPrettyName(SrcReg) << " to "
+  //     << FrameIdx << "\n";
+  //BuildMI(MBB, MI, Alpha::WTF, 0).addReg(SrcReg);
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  if (RC == Alpha::F4RCRegisterClass)
+    BuildMI(MBB, MI, DL, get(Alpha::STS))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31)
+  else if (RC == Alpha::F8RCRegisterClass)
+    BuildMI(MBB, MI, DL, get(Alpha::STT))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else if (RC == Alpha::GPRCRegisterClass)
+    BuildMI(MBB, MI, DL, get(Alpha::STQ))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else
+    llvm_unreachable("Unhandled register class");
+}
+
+/// loadRegFromStackSlot - Reload DestReg from the stack slot FrameIdx using
+/// the load matching its register class (LDS/LDT/LDQ).
+void
+AlphaInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC) const {
+  //cerr << "Trying to load " << getPrettyName(DestReg) << " to "
+  //     << FrameIdx << "\n";
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  if (RC == Alpha::F4RCRegisterClass)
+    BuildMI(MBB, MI, DL, get(Alpha::LDS), DestReg)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else if (RC == Alpha::F8RCRegisterClass)
+    BuildMI(MBB, MI, DL, get(Alpha::LDT), DestReg)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else if (RC == Alpha::GPRCRegisterClass)
+    BuildMI(MBB, MI, DL, get(Alpha::LDQ), DestReg)
+      .addFrameIndex(FrameIdx).addReg(Alpha::F31);
+  else
+    llvm_unreachable("Unhandled register class");
+}
+
+/// foldMemoryOperandImpl - Fold a register-to-register move through a stack
+/// slot: a move whose def is folded becomes a store, one whose use is folded
+/// becomes a load.  Returns the new instruction or NULL if not foldable.
+MachineInstr *AlphaInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                    MachineInstr *MI,
+                                          const SmallVectorImpl<unsigned> &Ops,
+                                                    int FrameIndex) const {
+   if (Ops.size() != 1) return NULL;
+
+   // Make sure this is a reg-reg copy.
+   unsigned Opc = MI->getOpcode();
+
+   MachineInstr *NewMI = NULL;
+   switch(Opc) {
+   default:
+     break;
+   case Alpha::BISr:
+   case Alpha::CPYSS:
+   case Alpha::CPYST:
+     if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+       if (Ops[0] == 0) {  // move -> store
+         unsigned InReg = MI->getOperand(1).getReg();
+         bool isKill = MI->getOperand(1).isKill();
+         bool isUndef = MI->getOperand(1).isUndef();
+         Opc = (Opc == Alpha::BISr) ? Alpha::STQ : 
+           ((Opc == Alpha::CPYSS) ? Alpha::STS : Alpha::STT);
+         NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+           .addReg(InReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+           .addFrameIndex(FrameIndex)
+           .addReg(Alpha::F31);
+       } else {           // load -> move
+         unsigned OutReg = MI->getOperand(0).getReg();
+         bool isDead = MI->getOperand(0).isDead();
+         bool isUndef = MI->getOperand(0).isUndef();
+         Opc = (Opc == Alpha::BISr) ? Alpha::LDQ : 
+           ((Opc == Alpha::CPYSS) ? Alpha::LDS : Alpha::LDT);
+         NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
+           .addReg(OutReg, RegState::Define | getDeadRegState(isDead) |
+                   getUndefRegState(isUndef))
+           .addFrameIndex(FrameIndex)
+           .addReg(Alpha::F31);
+       }
+     }
+     break;
+   }
+  return NewMI;
+}
+
+/// AlphaRevCondCode - Return the branch opcode testing the opposite
+/// condition (integer B* and floating-point FB* branches).
+static unsigned AlphaRevCondCode(unsigned Opcode) {
+  switch (Opcode) {
+  case Alpha::BEQ: return Alpha::BNE;
+  case Alpha::BNE: return Alpha::BEQ;
+  case Alpha::BGE: return Alpha::BLT;
+  case Alpha::BGT: return Alpha::BLE;
+  case Alpha::BLE: return Alpha::BGT;
+  case Alpha::BLT: return Alpha::BGE;
+  case Alpha::BLBC: return Alpha::BLBS;
+  case Alpha::BLBS: return Alpha::BLBC;
+  case Alpha::FBEQ: return Alpha::FBNE;
+  case Alpha::FBNE: return Alpha::FBEQ;
+  case Alpha::FBGE: return Alpha::FBLT;
+  case Alpha::FBGT: return Alpha::FBLE;
+  case Alpha::FBLE: return Alpha::FBGT;
+  case Alpha::FBLT: return Alpha::FBGE;
+  default:
+    llvm_unreachable("Unknown opcode");
+  }
+  return 0; // Not reached
+}
+
+/// AnalyzeBranch - Decompose the terminators of MBB into the standard
+/// TBB/FBB/Cond form.  Returns false on success (fallthrough, BR,
+/// COND_BRANCH, or COND_BRANCH + BR endings) and true if the block's
+/// terminators cannot be understood.
+bool AlphaInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (LastInst->getOpcode() == Alpha::BR) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (LastInst->getOpcode() == Alpha::COND_BRANCH_I ||
+               LastInst->getOpcode() == Alpha::COND_BRANCH_F) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(2).getMBB();
+      Cond.push_back(LastInst->getOperand(0));
+      Cond.push_back(LastInst->getOperand(1));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+  
+  // If the block ends with Alpha::BR and Alpha::COND_BRANCH_*, handle it.
+  if ((SecondLastInst->getOpcode() == Alpha::COND_BRANCH_I ||
+      SecondLastInst->getOpcode() == Alpha::COND_BRANCH_F) && 
+      LastInst->getOpcode() == Alpha::BR) {
+    TBB =  SecondLastInst->getOperand(2).getMBB();
+    Cond.push_back(SecondLastInst->getOperand(0));
+    Cond.push_back(SecondLastInst->getOperand(1));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+  
+  // If the block ends with two Alpha::BRs, handle it.  The second one is not
+  // executed, so remove it.
+  if (SecondLastInst->getOpcode() == Alpha::BR && 
+      LastInst->getOpcode() == Alpha::BR) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+/// RemoveBranch - Delete the branch instruction(s) at the end of MBB
+/// (an optional trailing BR and an optional COND_BRANCH before it).
+/// Returns how many were removed (0, 1 or 2).
+unsigned AlphaInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (I->getOpcode() != Alpha::BR && 
+      I->getOpcode() != Alpha::COND_BRANCH_I &&
+      I->getOpcode() != Alpha::COND_BRANCH_F)
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (I->getOpcode() != Alpha::COND_BRANCH_I && 
+      I->getOpcode() != Alpha::COND_BRANCH_F)
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+/// insertNoop - Insert a no-op before MI: "bis r31, r31, r31" (the canonical
+/// Alpha nop, writing the always-zero register).
+void AlphaInstrInfo::insertNoop(MachineBasicBlock &MBB, 
+                                MachineBasicBlock::iterator MI) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  BuildMI(MBB, MI, DL, get(Alpha::BISr), Alpha::R31)
+    .addReg(Alpha::R31)
+    .addReg(Alpha::R31);
+}
+
+/// ReverseBranchCondition - Invert the condition produced by AnalyzeBranch
+/// by swapping the branch opcode (Cond[0]) for its opposite.  Returns false
+/// (meaning the condition was successfully reversed).
+bool AlphaInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid Alpha branch opcode!");
+  Cond[0].setImm(AlphaRevCondCode(Cond[0].getImm()));
+  return false;
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// global base register value (a copy of R29).  Output instructions required
+/// to initialize the register in the function entry block, if necessary.
+///
+unsigned AlphaInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  AlphaMachineFunctionInfo *AlphaFI = MF->getInfo<AlphaMachineFunctionInfo>();
+  unsigned GlobalBaseReg = AlphaFI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+
+  GlobalBaseReg = RegInfo.createVirtualRegister(&Alpha::GPRCRegClass);
+  bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalBaseReg, Alpha::R29,
+                              &Alpha::GPRCRegClass, &Alpha::GPRCRegClass);
+  assert(Ok && "Couldn't assign to global base register!");
+  Ok = Ok; // Silence warning when assertions are turned off.
+  RegInfo.addLiveIn(Alpha::R29);
+
+  // Cache the result so later queries reuse the same virtual register.
+  AlphaFI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
+
+/// getGlobalRetAddr - Return a virtual register initialized with the
+/// global return-address register value (a copy of R26).  Output
+/// instructions required to initialize the register in the function entry
+/// block, if necessary.
+///
+unsigned AlphaInstrInfo::getGlobalRetAddr(MachineFunction *MF) const {
+  AlphaMachineFunctionInfo *AlphaFI = MF->getInfo<AlphaMachineFunctionInfo>();
+  unsigned GlobalRetAddr = AlphaFI->getGlobalRetAddr();
+  if (GlobalRetAddr != 0)
+    return GlobalRetAddr;
+
+  // Insert the set of GlobalRetAddr into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+
+  GlobalRetAddr = RegInfo.createVirtualRegister(&Alpha::GPRCRegClass);
+  bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalRetAddr, Alpha::R26,
+                              &Alpha::GPRCRegClass, &Alpha::GPRCRegClass);
+  assert(Ok && "Couldn't assign to global return address register!");
+  Ok = Ok; // Silence warning when assertions are turned off.
+  RegInfo.addLiveIn(Alpha::R26);
+
+  // Cache the result so later queries reuse the same virtual register.
+  AlphaFI->setGlobalRetAddr(GlobalRetAddr);
+  return GlobalRetAddr;
+}
diff --git a/lib/Target/Alpha/AlphaInstrInfo.h b/lib/Target/Alpha/AlphaInstrInfo.h
new file mode 100644
index 0000000..c3b6044
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.h
@@ -0,0 +1,98 @@
+//===- AlphaInstrInfo.h - Alpha Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHAINSTRUCTIONINFO_H
+#define ALPHAINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "AlphaRegisterInfo.h"
+
+namespace llvm {
+
+class AlphaInstrInfo : public TargetInstrInfoImpl {
+  const AlphaRegisterInfo RI;
+public:
+  AlphaInstrInfo();
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const AlphaRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register to register move and return
+  /// the source and dest operands and their sub-register indices by reference.
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg,
+                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+  
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond) const;
+  virtual bool copyRegToReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, unsigned SrcReg,
+                            const TargetRegisterClass *DestRC,
+                            const TargetRegisterClass *SrcRC) const;
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC) const;
+  
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              int FrameIndex) const;
+
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              MachineInstr* LoadMI) const {
+    return 0;
+  }
+  
+  bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  void insertNoop(MachineBasicBlock &MBB, 
+                  MachineBasicBlock::iterator MI) const;
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// global base register value. Output instructions required to initialize
+  /// the register in the function entry block, if necessary.
+  ///
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+
+  /// getGlobalRetAddr - Return a virtual register initialized with the
+  /// global return address register value. Output instructions required to
+  /// initialize the register in the function entry block, if necessary.
+  ///
+  unsigned getGlobalRetAddr(MachineFunction *MF) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaInstrInfo.td b/lib/Target/Alpha/AlphaInstrInfo.td
new file mode 100644
index 0000000..8917e86
--- /dev/null
+++ b/lib/Target/Alpha/AlphaInstrInfo.td
@@ -0,0 +1,1143 @@
+//===- AlphaInstrInfo.td - The Alpha Instruction Set -------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+include "AlphaInstrFormats.td"
+
+//********************
+//Custom DAG Nodes
+//********************
+
+def SDTFPUnaryOpUnC  : SDTypeProfile<1, 1, [
+  SDTCisFP<1>, SDTCisFP<0>
+]>;
+def Alpha_cvtqt   : SDNode<"AlphaISD::CVTQT_",    SDTFPUnaryOpUnC, []>;
+def Alpha_cvtqs   : SDNode<"AlphaISD::CVTQS_",    SDTFPUnaryOpUnC, []>;
+def Alpha_cvttq   : SDNode<"AlphaISD::CVTTQ_"  ,  SDTFPUnaryOp, []>;
+def Alpha_gprello : SDNode<"AlphaISD::GPRelLo",   SDTIntBinOp, []>;
+def Alpha_gprelhi : SDNode<"AlphaISD::GPRelHi",   SDTIntBinOp, []>;
+def Alpha_rellit  : SDNode<"AlphaISD::RelLit",    SDTIntBinOp, [SDNPMayLoad]>;
+
+def retflag       : SDNode<"AlphaISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_AlphaCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i64> ]>;
+def SDT_AlphaCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i64>,
+                                           SDTCisVT<1, i64> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AlphaCallSeqStart,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_AlphaCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+//********************
+//Patterns for matching
+//********************
+def invX : SDNodeXForm<imm, [{ //invert
+  return getI64Imm(~N->getZExtValue());
+}]>;
+def negX : SDNodeXForm<imm, [{ //negate
+  return getI64Imm(~N->getZExtValue() + 1);
+}]>;
+def SExt32 : SDNodeXForm<imm, [{ //signed extend int to long
+  return getI64Imm(((int64_t)N->getZExtValue() << 32) >> 32);
+}]>;
+def SExt16 : SDNodeXForm<imm, [{ //signed extend int to long
+  return getI64Imm(((int64_t)N->getZExtValue() << 48) >> 48);
+}]>;
+def LL16 : SDNodeXForm<imm, [{ //lda part of constant
+  return getI64Imm(get_lda16(N->getZExtValue()));
+}]>;
+def LH16 : SDNodeXForm<imm, [{ //ldah part of constant (or more if too big)
+  return getI64Imm(get_ldah16(N->getZExtValue()));
+}]>;
+def iZAPX : SDNodeXForm<and, [{ // get imm to ZAPi
+  ConstantSDNode *RHS = cast<ConstantSDNode>(N->getOperand(1));
+  return getI64Imm(get_zapImm(SDValue(), RHS->getZExtValue()));
+}]>;
+def nearP2X : SDNodeXForm<imm, [{
+  return getI64Imm(Log2_64(getNearPower2((uint64_t)N->getZExtValue())));
+}]>;
+def nearP2RemX : SDNodeXForm<imm, [{
+  uint64_t x =
+    abs64(N->getZExtValue() - getNearPower2((uint64_t)N->getZExtValue()));
+  return getI64Imm(Log2_64(x));
+}]>;
+
+def immUExt8  : PatLeaf<(imm), [{ //imm fits in 8 bit zero extended field
+  return (uint64_t)N->getZExtValue() == (uint8_t)N->getZExtValue();
+}]>;
+def immUExt8inv  : PatLeaf<(imm), [{ //inverted imm fits in 8 bit zero extended field
+  return (uint64_t)~N->getZExtValue() == (uint8_t)~N->getZExtValue();
+}], invX>;
+def immUExt8neg  : PatLeaf<(imm), [{ //negated imm fits in 8 bit zero extended field
+  return ((uint64_t)~N->getZExtValue() + 1) ==
+         (uint8_t)((uint64_t)~N->getZExtValue() + 1);
+}], negX>;
+def immSExt16  : PatLeaf<(imm), [{ //imm fits in 16 bit sign extended field
+  return ((int64_t)N->getZExtValue() << 48) >> 48 ==
+         (int64_t)N->getZExtValue();
+}]>;
+def immSExt16int  : PatLeaf<(imm), [{ //(int)imm fits in a 16 bit sign extended field
+  return ((int64_t)N->getZExtValue() << 48) >> 48 ==
+         ((int64_t)N->getZExtValue() << 32) >> 32;
+}], SExt16>;
+
+def zappat : PatFrag<(ops node:$LHS), (and node:$LHS, imm:$L), [{
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  if (!RHS) return 0;
+  uint64_t build = get_zapImm(N->getOperand(0), (uint64_t)RHS->getZExtValue());
+  return build != 0;
+}]>;
+
+def immFPZ  : PatLeaf<(fpimm), [{ //the only fpconstant nodes are +/- 0.0
+  (void)N; // silence warning.
+  return true;
+}]>;
+
+def immRem1 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),1,0);}]>;
+def immRem2 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),2,0);}]>;
+def immRem3 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),3,0);}]>;
+def immRem4 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),4,0);}]>;
+def immRem5 :PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),5,0);}]>;
+def immRem1n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),1,1);}]>;
+def immRem2n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),2,1);}]>;
+def immRem3n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),3,1);}]>;
+def immRem4n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),4,1);}]>;
+def immRem5n:PatLeaf<(imm),[{return chkRemNearPower2(N->getZExtValue(),5,1);}]>;
+
+def immRemP2n : PatLeaf<(imm), [{
+  return isPowerOf2_64(getNearPower2((uint64_t)N->getZExtValue()) -
+                         N->getZExtValue());
+}]>;
+def immRemP2 : PatLeaf<(imm), [{
+  return isPowerOf2_64(N->getZExtValue() -
+                         getNearPower2((uint64_t)N->getZExtValue()));
+}]>;
+def immUExt8ME : PatLeaf<(imm), [{ //use this imm for mulqi
+  int64_t d =  abs64((int64_t)N->getZExtValue() -
+               (int64_t)getNearPower2((uint64_t)N->getZExtValue()));
+  if (isPowerOf2_64(d)) return false;
+  switch (d) {
+    case 1: case 3: case 5: return false; 
+    default: return (uint64_t)N->getZExtValue() == (uint8_t)N->getZExtValue();
+  };
+}]>;
+
+def intop : PatFrag<(ops node:$op), (sext_inreg node:$op, i32)>;
+def add4  : PatFrag<(ops node:$op1, node:$op2),
+                    (add (shl node:$op1, 2), node:$op2)>;
+def sub4  : PatFrag<(ops node:$op1, node:$op2),
+                    (sub (shl node:$op1, 2), node:$op2)>;
+def add8  : PatFrag<(ops node:$op1, node:$op2),
+                    (add (shl node:$op1, 3), node:$op2)>;
+def sub8  : PatFrag<(ops node:$op1, node:$op2),
+                    (sub (shl node:$op1, 3), node:$op2)>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class CmpOpFrag<dag res> : PatFrag<(ops node:$R), res>;
+
+//Pseudo ops for selection
+
+def WTF : PseudoInstAlpha<(outs), (ins variable_ops), "#wtf", [], s_pseudo>;
+
+let hasCtrlDep = 1, Defs = [R30], Uses = [R30] in {
+def ADJUSTSTACKUP : PseudoInstAlpha<(outs), (ins s64imm:$amt),
+                "; ADJUP $amt", 
+                [(callseq_start timm:$amt)], s_pseudo>;
+def ADJUSTSTACKDOWN : PseudoInstAlpha<(outs), (ins s64imm:$amt1, s64imm:$amt2),
+                "; ADJDOWN $amt1",
+                [(callseq_end timm:$amt1, timm:$amt2)], s_pseudo>;
+}
+
+def ALTENT : PseudoInstAlpha<(outs), (ins s64imm:$TARGET), "$$$TARGET..ng:\n", [], s_pseudo>;
+def PCLABEL : PseudoInstAlpha<(outs), (ins s64imm:$num), "PCMARKER_$num:\n",[], s_pseudo>;
+def MEMLABEL : PseudoInstAlpha<(outs), (ins s64imm:$i, s64imm:$j, s64imm:$k, s64imm:$m),
+         "LSMARKER$$$i$$$j$$$k$$$m:", [], s_pseudo>;
+
+
+let usesCustomInserter = 1 in {   // Expanded after instruction selection.
+def CAS32 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$cmp, GPRC:$swp), "",
+      [(set GPRC:$dst, (atomic_cmp_swap_32 GPRC:$ptr, GPRC:$cmp, GPRC:$swp))], s_pseudo>;
+def CAS64 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$cmp, GPRC:$swp), "",
+      [(set GPRC:$dst, (atomic_cmp_swap_64 GPRC:$ptr, GPRC:$cmp, GPRC:$swp))], s_pseudo>;
+
+def LAS32 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+      [(set GPRC:$dst, (atomic_load_add_32 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+def LAS64 :PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+      [(set GPRC:$dst, (atomic_load_add_64 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+
+def SWAP32 : PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+        [(set GPRC:$dst, (atomic_swap_32 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+def SWAP64 :PseudoInstAlpha<(outs GPRC:$dst), (ins GPRC:$ptr, GPRC:$swp), "",
+        [(set GPRC:$dst, (atomic_swap_64 GPRC:$ptr, GPRC:$swp))], s_pseudo>;
+}
+
+//***********************
+//Real instructions
+//***********************
+
+//Operation Form:
+
+//conditional moves, int
+
+multiclass cmov_inst<bits<7> fun, string asmstr, PatFrag OpNode> {
+def r : OForm4<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"),
+             [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), GPRC:$RTRUE, GPRC:$RFALSE))], s_cmov>;
+def i : OForm4L<0x11, fun, !strconcat(asmstr, " $RCOND,$RTRUE,$RDEST"),
+             [(set GPRC:$RDEST, (select (OpNode GPRC:$RCOND), immUExt8:$RTRUE, GPRC:$RFALSE))], s_cmov>;
+}
+
+defm CMOVEQ  : cmov_inst<0x24, "cmoveq",  CmpOpFrag<(seteq node:$R, 0)>>;
+defm CMOVNE  : cmov_inst<0x26, "cmovne",  CmpOpFrag<(setne node:$R, 0)>>;
+defm CMOVLT  : cmov_inst<0x44, "cmovlt",  CmpOpFrag<(setlt node:$R, 0)>>;
+defm CMOVLE  : cmov_inst<0x64, "cmovle",  CmpOpFrag<(setle node:$R, 0)>>;
+defm CMOVGT  : cmov_inst<0x66, "cmovgt",  CmpOpFrag<(setgt node:$R, 0)>>;
+defm CMOVGE  : cmov_inst<0x46, "cmovge",  CmpOpFrag<(setge node:$R, 0)>>;
+defm CMOVLBC : cmov_inst<0x16, "cmovlbc", CmpOpFrag<(xor   node:$R, 1)>>;
+defm CMOVLBS : cmov_inst<0x14, "cmovlbs", CmpOpFrag<(and   node:$R, 1)>>;
+
+//General pattern for cmov
+def : Pat<(select GPRC:$which, GPRC:$src1, GPRC:$src2),
+      (CMOVNEr GPRC:$src2, GPRC:$src1, GPRC:$which)>;
+def : Pat<(select GPRC:$which, GPRC:$src1, immUExt8:$src2),
+      (CMOVEQi GPRC:$src1, immUExt8:$src2, GPRC:$which)>;
+
+//Invert sense when we can for constants:
+def : Pat<(select (setne GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVEQi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setgt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVLEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setge GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVLTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setlt GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVGEi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+def : Pat<(select (setle GPRC:$RCOND, 0), GPRC:$RTRUE, immUExt8:$RFALSE),
+          (CMOVGTi GPRC:$RCOND, immUExt8:$RFALSE, GPRC:$RTRUE)>;
+
+multiclass all_inst<bits<6> opc, bits<7> funl, bits<7> funq, 
+                    string asmstr, PatFrag OpNode, InstrItinClass itin> {
+  def Lr : OForm< opc, funl, !strconcat(asmstr, "l $RA,$RB,$RC"),
+               [(set GPRC:$RC, (intop (OpNode GPRC:$RA, GPRC:$RB)))], itin>;
+  def Li : OFormL<opc, funl, !strconcat(asmstr, "l $RA,$L,$RC"),
+               [(set GPRC:$RC, (intop (OpNode GPRC:$RA, immUExt8:$L)))], itin>;
+  def Qr : OForm< opc, funq, !strconcat(asmstr, "q $RA,$RB,$RC"),
+               [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>;
+  def Qi : OFormL<opc, funq, !strconcat(asmstr, "q $RA,$L,$RC"),
+               [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>;
+}
+
+defm MUL   : all_inst<0x13, 0x00, 0x20, "mul",   BinOpFrag<(mul node:$LHS, node:$RHS)>, s_imul>;
+defm ADD   : all_inst<0x10, 0x00, 0x20, "add",   BinOpFrag<(add node:$LHS, node:$RHS)>, s_iadd>;
+defm S4ADD : all_inst<0x10, 0x02, 0x22, "s4add", add4, s_iadd>;
+defm S8ADD : all_inst<0x10, 0x12, 0x32, "s8add", add8, s_iadd>;
+defm S4SUB : all_inst<0x10, 0x0B, 0x2B, "s4sub", sub4, s_iadd>;
+defm S8SUB : all_inst<0x10, 0x1B, 0x3B, "s8sub", sub8, s_iadd>;
+defm SUB   : all_inst<0x10, 0x09, 0x29, "sub",   BinOpFrag<(sub node:$LHS, node:$RHS)>, s_iadd>;
+//Const cases since legalize does sub x, int -> add x, inv(int) + 1
+def : Pat<(intop (add GPRC:$RA, immUExt8neg:$L)), (SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add GPRC:$RA, immUExt8neg:$L), (SUBQi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(intop (add4 GPRC:$RA, immUExt8neg:$L)), (S4SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add4 GPRC:$RA, immUExt8neg:$L), (S4SUBQi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(intop (add8 GPRC:$RA, immUExt8neg:$L)), (S8SUBLi GPRC:$RA, immUExt8neg:$L)>;
+def : Pat<(add8 GPRC:$RA, immUExt8neg:$L), (S8SUBQi GPRC:$RA, immUExt8neg:$L)>;
+
+multiclass log_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> {
+def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, GPRC:$RB))], itin>;
+def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8:$L))], itin>;
+}
+multiclass inv_inst<bits<6> opc, bits<7> fun, string asmstr, SDNode OpNode, InstrItinClass itin> {
+def r : OForm<opc, fun, !strconcat(asmstr, " $RA,$RB,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, (not GPRC:$RB)))], itin>;
+def i : OFormL<opc, fun, !strconcat(asmstr, " $RA,$L,$RC"),
+              [(set GPRC:$RC, (OpNode GPRC:$RA, immUExt8inv:$L))], itin>;
+}
+
+defm AND   : log_inst<0x11, 0x00, "and",   and,   s_ilog>;
+defm BIC   : inv_inst<0x11, 0x08, "bic",   and,   s_ilog>;
+defm BIS   : log_inst<0x11, 0x20, "bis",   or,    s_ilog>;
+defm ORNOT : inv_inst<0x11, 0x28, "ornot", or,    s_ilog>;
+defm XOR   : log_inst<0x11, 0x40, "xor",   xor,   s_ilog>;
+defm EQV   : inv_inst<0x11, 0x48, "eqv",   xor,   s_ilog>;
+
+defm SL    : log_inst<0x12, 0x39, "sll",   shl,   s_ishf>;
+defm SRA   : log_inst<0x12, 0x3c, "sra",   sra,   s_ishf>;
+defm SRL   : log_inst<0x12, 0x34, "srl",   srl,   s_ishf>;
+defm UMULH : log_inst<0x13, 0x30, "umulh", mulhu, s_imul>;
+
+def CTLZ     : OForm2<0x1C, 0x32, "CTLZ $RB,$RC", 
+                      [(set GPRC:$RC, (ctlz GPRC:$RB))], s_imisc>;
+def CTPOP    : OForm2<0x1C, 0x30, "CTPOP $RB,$RC", 
+                      [(set GPRC:$RC, (ctpop GPRC:$RB))], s_imisc>;
+def CTTZ     : OForm2<0x1C, 0x33, "CTTZ $RB,$RC", 
+                      [(set GPRC:$RC, (cttz GPRC:$RB))], s_imisc>;
+def EXTBL    : OForm< 0x12, 0x06, "EXTBL $RA,$RB,$RC", 
+                      [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 255))], s_ishf>;
+def EXTWL    : OForm< 0x12, 0x16, "EXTWL $RA,$RB,$RC", 
+                      [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 65535))], s_ishf>;
+def EXTLL    : OForm< 0x12, 0x26, "EXTLL $RA,$RB,$RC", 
+                      [(set GPRC:$RC, (and (srl GPRC:$RA, (shl GPRC:$RB, 3)), 4294967295))], s_ishf>;
+def SEXTB    : OForm2<0x1C, 0x00, "sextb $RB,$RC", 
+                      [(set GPRC:$RC, (sext_inreg GPRC:$RB, i8))], s_ishf>;
+def SEXTW    : OForm2<0x1C, 0x01, "sextw $RB,$RC", 
+                      [(set GPRC:$RC, (sext_inreg GPRC:$RB, i16))], s_ishf>;
+
+//def EXTBLi   : OFormL<0x12, 0x06, "EXTBL $RA,$L,$RC", []>; //Extract byte low
+//def EXTLH    : OForm< 0x12, 0x6A, "EXTLH $RA,$RB,$RC", []>; //Extract longword high
+//def EXTLHi   : OFormL<0x12, 0x6A, "EXTLH $RA,$L,$RC", []>; //Extract longword high
+//def EXTLLi   : OFormL<0x12, 0x26, "EXTLL $RA,$L,$RC", []>; //Extract longword low
+//def EXTQH    : OForm< 0x12, 0x7A, "EXTQH $RA,$RB,$RC", []>; //Extract quadword high
+//def EXTQHi   : OFormL<0x12, 0x7A, "EXTQH $RA,$L,$RC", []>; //Extract quadword high
+//def EXTQ     : OForm< 0x12, 0x36, "EXTQ $RA,$RB,$RC", []>; //Extract quadword low
+//def EXTQi    : OFormL<0x12, 0x36, "EXTQ $RA,$L,$RC", []>; //Extract quadword low
+//def EXTWH    : OForm< 0x12, 0x5A, "EXTWH $RA,$RB,$RC", []>; //Extract word high
+//def EXTWHi   : OFormL<0x12, 0x5A, "EXTWH $RA,$L,$RC", []>; //Extract word high
+//def EXTWLi   : OFormL<0x12, 0x16, "EXTWL $RA,$L,$RC", []>; //Extract word low
+
+//def INSBL    : OForm< 0x12, 0x0B, "INSBL $RA,$RB,$RC", []>; //Insert byte low
+//def INSBLi   : OFormL<0x12, 0x0B, "INSBL $RA,$L,$RC", []>; //Insert byte low
+//def INSLH    : OForm< 0x12, 0x67, "INSLH $RA,$RB,$RC", []>; //Insert longword high
+//def INSLHi   : OFormL<0x12, 0x67, "INSLH $RA,$L,$RC", []>; //Insert longword high
+//def INSLL    : OForm< 0x12, 0x2B, "INSLL $RA,$RB,$RC", []>; //Insert longword low
+//def INSLLi   : OFormL<0x12, 0x2B, "INSLL $RA,$L,$RC", []>; //Insert longword low
+//def INSQH    : OForm< 0x12, 0x77, "INSQH $RA,$RB,$RC", []>; //Insert quadword high
+//def INSQHi   : OFormL<0x12, 0x77, "INSQH $RA,$L,$RC", []>; //Insert quadword high
+//def INSQL    : OForm< 0x12, 0x3B, "INSQL $RA,$RB,$RC", []>; //Insert quadword low
+//def INSQLi   : OFormL<0x12, 0x3B, "INSQL $RA,$L,$RC", []>; //Insert quadword low
+//def INSWH    : OForm< 0x12, 0x57, "INSWH $RA,$RB,$RC", []>; //Insert word high
+//def INSWHi   : OFormL<0x12, 0x57, "INSWH $RA,$L,$RC", []>; //Insert word high
+//def INSWL    : OForm< 0x12, 0x1B, "INSWL $RA,$RB,$RC", []>; //Insert word low
+//def INSWLi   : OFormL<0x12, 0x1B, "INSWL $RA,$L,$RC", []>; //Insert word low
+
+//def MSKBL    : OForm< 0x12, 0x02, "MSKBL $RA,$RB,$RC", []>; //Mask byte low
+//def MSKBLi   : OFormL<0x12, 0x02, "MSKBL $RA,$L,$RC", []>; //Mask byte low
+//def MSKLH    : OForm< 0x12, 0x62, "MSKLH $RA,$RB,$RC", []>; //Mask longword high
+//def MSKLHi   : OFormL<0x12, 0x62, "MSKLH $RA,$L,$RC", []>; //Mask longword high
+//def MSKLL    : OForm< 0x12, 0x22, "MSKLL $RA,$RB,$RC", []>; //Mask longword low
+//def MSKLLi   : OFormL<0x12, 0x22, "MSKLL $RA,$L,$RC", []>; //Mask longword low
+//def MSKQH    : OForm< 0x12, 0x72, "MSKQH $RA,$RB,$RC", []>; //Mask quadword high
+//def MSKQHi   : OFormL<0x12, 0x72, "MSKQH $RA,$L,$RC", []>; //Mask quadword high
+//def MSKQL    : OForm< 0x12, 0x32, "MSKQL $RA,$RB,$RC", []>; //Mask quadword low
+//def MSKQLi   : OFormL<0x12, 0x32, "MSKQL $RA,$L,$RC", []>; //Mask quadword low
+//def MSKWH    : OForm< 0x12, 0x52, "MSKWH $RA,$RB,$RC", []>; //Mask word high
+//def MSKWHi   : OFormL<0x12, 0x52, "MSKWH $RA,$L,$RC", []>; //Mask word high
+//def MSKWL    : OForm< 0x12, 0x12, "MSKWL $RA,$RB,$RC", []>; //Mask word low
+//def MSKWLi   : OFormL<0x12, 0x12, "MSKWL $RA,$L,$RC", []>; //Mask word low
+                      
+def ZAPNOTi  : OFormL<0x12, 0x31, "zapnot $RA,$L,$RC", [], s_ishf>;
+
+// Define the pattern that produces ZAPNOTi.
+def : Pat<(zappat:$imm GPRC:$RA),
+          (ZAPNOTi GPRC:$RA, (iZAPX GPRC:$imm))>;
+
+
+//Comparison, int
+//So this is a waste of what this instruction can do, but it still saves something
+def CMPBGE  : OForm< 0x10, 0x0F, "cmpbge $RA,$RB,$RC", 
+                     [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), (and GPRC:$RB, 255)))], s_ilog>;
+def CMPBGEi : OFormL<0x10, 0x0F, "cmpbge $RA,$L,$RC",
+                     [(set GPRC:$RC, (setuge (and GPRC:$RA, 255), immUExt8:$L))], s_ilog>;
+def CMPEQ   : OForm< 0x10, 0x2D, "cmpeq $RA,$RB,$RC", 
+                     [(set GPRC:$RC, (seteq GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPEQi  : OFormL<0x10, 0x2D, "cmpeq $RA,$L,$RC", 
+                     [(set GPRC:$RC, (seteq GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPLE   : OForm< 0x10, 0x6D, "cmple $RA,$RB,$RC", 
+                     [(set GPRC:$RC, (setle GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPLEi  : OFormL<0x10, 0x6D, "cmple $RA,$L,$RC",
+                     [(set GPRC:$RC, (setle GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPLT   : OForm< 0x10, 0x4D, "cmplt $RA,$RB,$RC",
+                     [(set GPRC:$RC, (setlt GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPLTi  : OFormL<0x10, 0x4D, "cmplt $RA,$L,$RC",
+                     [(set GPRC:$RC, (setlt GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPULE  : OForm< 0x10, 0x3D, "cmpule $RA,$RB,$RC",
+                     [(set GPRC:$RC, (setule GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPULEi : OFormL<0x10, 0x3D, "cmpule $RA,$L,$RC",
+                     [(set GPRC:$RC, (setule GPRC:$RA, immUExt8:$L))], s_iadd>;
+def CMPULT  : OForm< 0x10, 0x1D, "cmpult $RA,$RB,$RC",
+                     [(set GPRC:$RC, (setult GPRC:$RA, GPRC:$RB))], s_iadd>;
+def CMPULTi : OFormL<0x10, 0x1D, "cmpult $RA,$L,$RC", 
+                      [(set GPRC:$RC, (setult GPRC:$RA, immUExt8:$L))], s_iadd>;
+
+//Patterns for unsupported int comparisons
+def : Pat<(setueq GPRC:$X, GPRC:$Y), (CMPEQ GPRC:$X, GPRC:$Y)>;
+def : Pat<(setueq GPRC:$X, immUExt8:$Y), (CMPEQi GPRC:$X, immUExt8:$Y)>;
+
+def : Pat<(setugt GPRC:$X, GPRC:$Y), (CMPULT GPRC:$Y, GPRC:$X)>;
+def : Pat<(setugt immUExt8:$X, GPRC:$Y), (CMPULTi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setuge GPRC:$X, GPRC:$Y), (CMPULE GPRC:$Y, GPRC:$X)>;
+def : Pat<(setuge immUExt8:$X, GPRC:$Y), (CMPULEi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setgt GPRC:$X, GPRC:$Y), (CMPLT GPRC:$Y, GPRC:$X)>;
+def : Pat<(setgt immUExt8:$X, GPRC:$Y), (CMPLTi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setge GPRC:$X, GPRC:$Y), (CMPLE GPRC:$Y, GPRC:$X)>;
+def : Pat<(setge immUExt8:$X, GPRC:$Y), (CMPLEi GPRC:$Y, immUExt8:$X)>;
+
+def : Pat<(setne GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
+def : Pat<(setne GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;
+
+def : Pat<(setune GPRC:$X, GPRC:$Y), (CMPEQi (CMPEQ GPRC:$X, GPRC:$Y), 0)>;
+def : Pat<(setune GPRC:$X, immUExt8:$Y), (CMPEQi (CMPEQi GPRC:$X, immUExt8:$Y), 0)>;
+
+
+let isReturn = 1, isTerminator = 1, isBarrier = 1, Ra = 31, Rb = 26, disp = 1, Uses = [R26] in {
+  def RETDAG : MbrForm< 0x1A, 0x02, (ops), "ret $$31,($$26),1", s_jsr>; //Return from subroutine
+  def RETDAGp : MbrpForm< 0x1A, 0x02, (ops), "ret $$31,($$26),1", [(retflag)], s_jsr>; //Return from subroutine
+}
+
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1, Ra = 31, disp = 0 in
+def JMP : MbrpForm< 0x1A, 0x00, (ops GPRC:$RS), "jmp $$31,($RS),0", 
+          [(brind GPRC:$RS)], s_jsr>; //Jump
+
+let isCall = 1, Ra = 26,
+    Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
+            R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
+            F0, F1,
+            F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+            F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R29] in {
+    def BSR : BFormD<0x34, "bsr $$26,$$$DISP..ng", [], s_jsr>; //Branch to subroutine
+}
+let isCall = 1, Ra = 26, Rb = 27, disp = 0,
+    Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19,
+            R20, R21, R22, R23, R24, R25, R26, R27, R28, R29,
+            F0, F1,
+            F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+            F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30], Uses = [R27, R29] in {
+    def JSR : MbrForm< 0x1A, 0x01, (ops ), "jsr $$26,($$27),0", s_jsr>; //Jump to subroutine
+}
+
+let isCall = 1, Ra = 23, Rb = 27, disp = 0,
+    Defs = [R23, R24, R25, R27, R28], Uses = [R24, R25, R27] in
+  def JSRs : MbrForm< 0x1A, 0x01, (ops ), "jsr $$23,($$27),0", s_jsr>; //Jump to div or rem
+
+
+def JSR_COROUTINE : MbrForm< 0x1A, 0x03, (ops GPRC:$RD, GPRC:$RS, s14imm:$DISP), "jsr_coroutine $RD,($RS),$DISP", s_jsr>; //Jump to subroutine return
+
+
+let OutOperandList = (ops GPRC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDQ   : MForm<0x29, 1, "ldq $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDQr  : MForm<0x29, 1, "ldq $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDL   : MForm<0x28, 1, "ldl $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (sextloadi32 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDLr  : MForm<0x28, 1, "ldl $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (sextloadi32 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDBU  : MForm<0x0A, 1, "ldbu $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (zextloadi8 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDBUr : MForm<0x0A, 1, "ldbu $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (zextloadi8 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+def LDWU  : MForm<0x0C, 1, "ldwu $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (zextloadi16 (add GPRC:$RB, immSExt16:$DISP)))], s_ild>;
+def LDWUr : MForm<0x0C, 1, "ldwu $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (zextloadi16 (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_ild>;
+}
+
+
+let OutOperandList = (ops), InOperandList = (ops GPRC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STB   : MForm<0x0E, 0, "stb $RA,$DISP($RB)",
+                 [(truncstorei8 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STBr  : MForm<0x0E, 0, "stb $RA,$DISP($RB)\t\t!gprellow",
+                 [(truncstorei8 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STW   : MForm<0x0D, 0, "stw $RA,$DISP($RB)",
+                 [(truncstorei16 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STWr  : MForm<0x0D, 0, "stw $RA,$DISP($RB)\t\t!gprellow",
+                 [(truncstorei16 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STL   : MForm<0x2C, 0, "stl $RA,$DISP($RB)",
+                 [(truncstorei32 GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STLr  : MForm<0x2C, 0, "stl $RA,$DISP($RB)\t\t!gprellow",
+                 [(truncstorei32 GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+def STQ   : MForm<0x2D, 0, "stq $RA,$DISP($RB)",
+                 [(store GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_ist>;
+def STQr  : MForm<0x2D, 0, "stq $RA,$DISP($RB)\t\t!gprellow",
+                 [(store GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_ist>;
+}
+
+//Load address
+let OutOperandList = (ops GPRC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDA   : MForm<0x08, 0, "lda $RA,$DISP($RB)",
+                 [(set GPRC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_lda>;
+def LDAr  : MForm<0x08, 0, "lda $RA,$DISP($RB)\t\t!gprellow",
+                 [(set GPRC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_lda>;  //Load address
+def LDAH  : MForm<0x09, 0, "ldah $RA,$DISP($RB)",
+                 [], s_lda>;  //Load address high
+def LDAHr : MForm<0x09, 0, "ldah $RA,$DISP($RB)\t\t!gprelhigh",
+                 [(set GPRC:$RA, (Alpha_gprelhi tglobaladdr:$DISP, GPRC:$RB))], s_lda>;  //Load address high
+}
+
+//Floating-point loads/stores: S_floating (32-bit, F4RC) and T_floating
+//(64-bit, F8RC), each in a plain reg+disp form and a !gprellow relocated form.
+let OutOperandList = (ops), InOperandList = (ops F4RC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STS  : MForm<0x26, 0, "sts $RA,$DISP($RB)",
+                [(store F4RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>;
+def STSr : MForm<0x26, 0, "sts $RA,$DISP($RB)\t\t!gprellow",
+                [(store F4RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>;
+}
+let OutOperandList = (ops F4RC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDS  : MForm<0x22, 1, "lds $RA,$DISP($RB)",
+                [(set F4RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>;
+def LDSr : MForm<0x22, 1, "lds $RA,$DISP($RB)\t\t!gprellow",
+                [(set F4RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>;
+}
+let OutOperandList = (ops), InOperandList = (ops F8RC:$RA, s64imm:$DISP, GPRC:$RB) in {
+def STT  : MForm<0x27, 0, "stt $RA,$DISP($RB)",
+                 [(store F8RC:$RA, (add GPRC:$RB, immSExt16:$DISP))], s_fst>;
+def STTr : MForm<0x27, 0, "stt $RA,$DISP($RB)\t\t!gprellow",
+                 [(store F8RC:$RA, (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB))], s_fst>;
+}
+let OutOperandList = (ops F8RC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in {
+def LDT  : MForm<0x23, 1, "ldt $RA,$DISP($RB)",
+                [(set F8RC:$RA, (load (add GPRC:$RB, immSExt16:$DISP)))], s_fld>;
+def LDTr : MForm<0x23, 1, "ldt $RA,$DISP($RB)\t\t!gprellow",
+                [(set F8RC:$RA, (load (Alpha_gprello tglobaladdr:$DISP, GPRC:$RB)))], s_fld>;
+}
+
+
+//constpool rels: constant-pool addresses use the same !gprellow/!gprelhigh
+//relocated forms as tglobaladdr above.
+def : Pat<(i64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDQr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (sextloadi32 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDLr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (zextloadi8 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDBUr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (zextloadi16 (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDWUr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprello tconstpool:$DISP, GPRC:$RB)),
+          (LDAr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprelhi tconstpool:$DISP, GPRC:$RB)),
+          (LDAHr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(f32 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDSr tconstpool:$DISP, GPRC:$RB)>;
+def : Pat<(f64 (load (Alpha_gprello tconstpool:$DISP, GPRC:$RB))),
+          (LDTr tconstpool:$DISP, GPRC:$RB)>;
+
+//jumptable rels: only the address computation (lda/ldah pair) is needed.
+def : Pat<(i64 (Alpha_gprelhi tjumptable:$DISP, GPRC:$RB)),
+          (LDAHr tjumptable:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (Alpha_gprello tjumptable:$DISP, GPRC:$RB)),
+          (LDAr tjumptable:$DISP, GPRC:$RB)>;
+
+
+//misc ext patterns: anyext loads are free to use the zero/sign-extending
+//load forms, so map them onto LDBU/LDWU/LDL.
+def : Pat<(i64 (extloadi8 (add GPRC:$RB, immSExt16:$DISP))),
+          (LDBU   immSExt16:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (extloadi16 (add GPRC:$RB, immSExt16:$DISP))),
+          (LDWU  immSExt16:$DISP, GPRC:$RB)>;
+def : Pat<(i64 (extloadi32 (add GPRC:$RB, immSExt16:$DISP))),
+          (LDL   immSExt16:$DISP, GPRC:$RB)>;
+
+//0 disp patterns: a bare register address is the reg+disp form with a
+//zero displacement.
+def : Pat<(i64 (load GPRC:$addr)),
+          (LDQ  0, GPRC:$addr)>;
+def : Pat<(f64 (load GPRC:$addr)),
+          (LDT  0, GPRC:$addr)>;
+def : Pat<(f32 (load GPRC:$addr)),
+          (LDS  0, GPRC:$addr)>;
+def : Pat<(i64 (sextloadi32 GPRC:$addr)),
+          (LDL  0, GPRC:$addr)>;
+def : Pat<(i64 (zextloadi16 GPRC:$addr)),
+          (LDWU 0, GPRC:$addr)>;
+def : Pat<(i64 (zextloadi8 GPRC:$addr)),
+          (LDBU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi8 GPRC:$addr)),
+          (LDBU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi16 GPRC:$addr)),
+          (LDWU 0, GPRC:$addr)>;
+def : Pat<(i64 (extloadi32 GPRC:$addr)),
+          (LDL  0, GPRC:$addr)>;
+
+def : Pat<(store GPRC:$DATA, GPRC:$addr),
+          (STQ  GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(store F8RC:$DATA, GPRC:$addr),
+          (STT  F8RC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(store F4RC:$DATA, GPRC:$addr),
+          (STS  F4RC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei32 GPRC:$DATA, GPRC:$addr),
+          (STL  GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei16 GPRC:$DATA, GPRC:$addr),
+          (STW GPRC:$DATA, 0, GPRC:$addr)>;
+def : Pat<(truncstorei8 GPRC:$DATA, GPRC:$addr),
+          (STB GPRC:$DATA, 0, GPRC:$addr)>;
+
+
+//load address, relocated gpdisp form: $DISP is carried as an operand but the
+//asm prints a fixed 0 with a !gpdisp!$NUM relocation pair.
+//NOTE(review): mayLoad = 1 on pure address computations looks intentional
+//(keeps the pair from being moved/deleted?) -- confirm against the emitter.
+let OutOperandList = (ops GPRC:$RA),
+    InOperandList = (ops s16imm:$DISP, GPRC:$RB, s16imm:$NUM),
+    mayLoad = 1 in {
+def LDAg  : MForm<0x08, 1, "lda $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>;  //Load address
+def LDAHg : MForm<0x09, 1, "ldah $RA,0($RB)\t\t!gpdisp!$NUM", [], s_lda>;  //Load address
+}
+
+//Load quad, relocated literal form: loads a symbol's address from the
+//literal pool via the !literal relocation.
+let OutOperandList = (ops GPRC:$RA), InOperandList = (ops s64imm:$DISP, GPRC:$RB) in 
+def LDQl : MForm<0x29, 1, "ldq $RA,$DISP($RB)\t\t!literal",
+                 [(set GPRC:$RA, (Alpha_rellit tglobaladdr:$DISP, GPRC:$RB))], s_ild>;
+//External symbols go through the same literal-pool load.
+def : Pat<(Alpha_rellit texternalsym:$ext, GPRC:$RB),
+          (LDQl texternalsym:$ext, GPRC:$RB)>;
+
+//Store conditional (stl_c/stq_c): the instruction writes the lock-success
+//flag back into $RA, modeled here as a tied hidden output $RR.
+let OutOperandList = (outs GPRC:$RR),
+    InOperandList = (ins GPRC:$RA, s64imm:$DISP, GPRC:$RB),
+    Constraints = "$RA = $RR",
+    DisableEncoding = "$RR" in {
+//Mnemonics fixed: opcodes 0x2F/0x2E are the store-conditional instructions
+//stq_c/stl_c; the old strings "stq_l"/"stl_l" were a copy/paste from the
+//load-locked block below and would print the wrong mnemonic.
+//NOTE(review): no mayStore flag is set here (the LDQ_L block sets mayLoad);
+//confirm MForm supplies it, else these stores may be treated as side-effect
+//free.
+def STQ_C : MForm<0x2F, 0, "stq_c $RA,$DISP($RB)", [], s_ist>;
+def STL_C : MForm<0x2E, 0, "stl_c $RA,$DISP($RB)", [], s_ist>;
+}
+//Load locked (ldl_l/ldq_l); pairs with the store-conditional defs above.
+let OutOperandList = (ops GPRC:$RA),
+    InOperandList = (ops s64imm:$DISP, GPRC:$RB),
+    mayLoad = 1 in {
+def LDQ_L : MForm<0x2B, 1, "ldq_l $RA,$DISP($RB)", [], s_ild>;
+def LDL_L : MForm<0x2A, 1, "ldl_l $RA,$DISP($RB)", [], s_ild>;
+}
+
+def RPCC : MfcForm<0x18, 0xC000, "rpcc $RA", s_rpcc>; //Read process cycle counter
+def MB  : MfcPForm<0x18, 0x4000, "mb",  s_imisc>; //memory barrier
+def WMB : MfcPForm<0x18, 0x4400, "wmb", s_imisc>; //write memory barrier
+
+//membarrier operands: (load-load, load-store, store-load, store-store, dev).
+//The first (more specific) pattern picks wmb when the store-store bit is 1;
+//everything else falls through to a full mb.
+//NOTE(review): the wmb pattern fires whenever ss==1 even if other ordering
+//bits are also set -- verify wmb is strong enough in that case.
+def : Pat<(membarrier (i64 imm:$ll), (i64 imm:$ls), (i64 imm:$sl), (i64 1), (i64 imm:$dev)),
+          (WMB)>;
+def : Pat<(membarrier (i64 imm:$ll), (i64 imm:$ls), (i64 imm:$sl), (i64 imm:$ss), (i64 imm:$dev)),
+          (MB)>;
+
+//Basic Floating point ops
+
+//Floats (S_floating, F4RC). The /su qualifier in the asm strings selects
+//software-completion + underflow-to-zero trap handling.
+
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F4RC:$RB), Fa = 31 in 
+def SQRTS : FPForm<0x14, 0x58B, "sqrts/su $RB,$RC",
+                   [(set F4RC:$RC, (fsqrt F4RC:$RB))], s_fsqrts>;
+
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F4RC:$RA, F4RC:$RB) in {
+def ADDS  : FPForm<0x16, 0x580, "adds/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fadd F4RC:$RA, F4RC:$RB))], s_fadd>;
+def SUBS  : FPForm<0x16, 0x581, "subs/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fsub F4RC:$RA, F4RC:$RB))], s_fadd>;
+def DIVS  : FPForm<0x16, 0x583, "divs/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fdiv F4RC:$RA, F4RC:$RB))], s_fdivs>;
+def MULS  : FPForm<0x16, 0x582, "muls/su $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fmul F4RC:$RA, F4RC:$RB))], s_fmul>;
+
+//cpys takes the sign from $RA and the rest from $RB, hence the swapped
+//operand order relative to ISD::fcopysign (magnitude first, sign second).
+def CPYSS  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fcopysign F4RC:$RB, F4RC:$RA))], s_fadd>;
+def CPYSES : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNS : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F4RC:$RA)))], s_fadd>;
+}
+
+//Doubles (T_floating, F8RC); mirrors the single-precision block above.
+
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in 
+def SQRTT : FPForm<0x14, 0x5AB, "sqrtt/su $RB,$RC",
+                   [(set F8RC:$RC, (fsqrt F8RC:$RB))], s_fsqrtt>;
+
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RA, F8RC:$RB) in {
+def ADDT  : FPForm<0x16, 0x5A0, "addt/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fadd F8RC:$RA, F8RC:$RB))], s_fadd>;
+def SUBT  : FPForm<0x16, 0x5A1, "subt/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fsub F8RC:$RA, F8RC:$RB))], s_fadd>;
+def DIVT  : FPForm<0x16, 0x5A3, "divt/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fdiv F8RC:$RA, F8RC:$RB))], s_fdivt>;
+def MULT  : FPForm<0x16, 0x5A2, "mult/su $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fmul F8RC:$RA, F8RC:$RB))], s_fmul>;
+
+def CPYST  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fcopysign F8RC:$RB, F8RC:$RA))], s_fadd>;
+def CPYSET : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNT : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F8RC:$RA)))], s_fadd>;
+
+//Compares have no ISel patterns here; they are emitted explicitly by the
+//select/brcond patterns below, which test the compare result with fcmov/fb*.
+def CMPTEQ : FPForm<0x16, 0x5A5, "cmpteq/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (seteq F8RC:$RA, F8RC:$RB))]>;
+def CMPTLE : FPForm<0x16, 0x5A7, "cmptle/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (setle F8RC:$RA, F8RC:$RB))]>;
+def CMPTLT : FPForm<0x16, 0x5A6, "cmptlt/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (setlt F8RC:$RA, F8RC:$RB))]>;
+def CMPTUN : FPForm<0x16, 0x5A4, "cmptun/su $RA,$RB,$RC", [], s_fadd>;
+//                    [(set F8RC:$RC, (setuo F8RC:$RA, F8RC:$RB))]>;
+}
+
+//More CPYS forms: mixed-width copysign (sign register and magnitude register
+//in different precision classes).
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F4RC:$RA, F8RC:$RB) in {
+def CPYSTs  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fcopysign F8RC:$RB, F4RC:$RA))], s_fadd>;
+def CPYSNTs : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F8RC:$RC, (fneg (fcopysign F8RC:$RB, F4RC:$RA)))], s_fadd>;
+}
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F8RC:$RA, F4RC:$RB) in {
+def CPYSSt  : FPForm<0x17, 0x020, "cpys $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fcopysign F4RC:$RB, F8RC:$RA))], s_fadd>;
+def CPYSESt : FPForm<0x17, 0x022, "cpyse $RA,$RB,$RC",[], s_fadd>; //Copy sign and exponent
+def CPYSNSt : FPForm<0x17, 0x021, "cpysn $RA,$RB,$RC",
+                   [(set F4RC:$RC, (fneg (fcopysign F4RC:$RB, F8RC:$RA)))], s_fadd>;
+}
+
+//conditional moves, floats
+//isTwoAddress ties the first input ($RFALSE) to $RDEST, so the fcmov only
+//overwrites the destination with $RTRUE when the condition on $RCOND holds
+//(presumably -- the tie is established by the two-address pass; verify).
+let OutOperandList = (ops F4RC:$RDEST), InOperandList = (ops F4RC:$RFALSE, F4RC:$RTRUE, F8RC:$RCOND),
+    isTwoAddress = 1 in {
+def FCMOVEQS : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if = zero
+def FCMOVGES : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if >= zero
+def FCMOVGTS : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if > zero
+def FCMOVLES : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if <= zero
+def FCMOVLTS : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST",[], s_fcmov>; // FCMOVE if < zero
+def FCMOVNES : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST",[], s_fcmov>; //FCMOVE if != zero
+}
+//conditional moves, doubles
+let OutOperandList = (ops F8RC:$RDEST), InOperandList = (ops F8RC:$RFALSE, F8RC:$RTRUE, F8RC:$RCOND),
+    isTwoAddress = 1 in {
+def FCMOVEQT : FPForm<0x17, 0x02A, "fcmoveq $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVGET : FPForm<0x17, 0x02D, "fcmovge $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVGTT : FPForm<0x17, 0x02F, "fcmovgt $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVLET : FPForm<0x17, 0x02E, "fcmovle $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVLTT : FPForm<0x17, 0x02C, "fcmovlt $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+def FCMOVNET : FPForm<0x17, 0x02B, "fcmovne $RCOND,$RTRUE,$RDEST", [], s_fcmov>;
+}
+
+//misc FP selects
+//Select double
+//Each setcc is lowered to a cmpt* compare plus an fcmov on the compare
+//result. The ordered/unordered/don't-care variants of each condition all
+//map to the same sequence here (the /su compares handle the NaN cases --
+//TODO confirm this matches the intended unordered semantics).
+
+def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVEQT F8RC:$sf, F8RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+//gt/ge are implemented by swapping the compare operands of lt/le.
+def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F8RC:$st, F8RC:$sf),
+      (FCMOVNET F8RC:$sf, F8RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+
+//Select single: same structure, but the condition is still a double compare
+//while the selected values are singles.
+def : Pat<(select (seteq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setoeq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setueq F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setne F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setone F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setune F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVEQS F4RC:$sf, F4RC:$st, (CMPTEQ F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setgt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setogt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setugt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setoge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+def : Pat<(select (setuge F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RB, F8RC:$RA))>;
+
+def : Pat<(select (setlt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setolt F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setult F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLT F8RC:$RA, F8RC:$RB))>;
+
+def : Pat<(select (setle F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setole F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+def : Pat<(select (setule F8RC:$RA, F8RC:$RB), F4RC:$st, F4RC:$sf),
+      (FCMOVNES F4RC:$sf, F4RC:$st, (CMPTLE F8RC:$RA, F8RC:$RB))>;
+
+
+
+//Bit-pattern moves between integer and FP register files (no conversion).
+let OutOperandList = (ops GPRC:$RC), InOperandList = (ops F4RC:$RA), Fb = 31 in 
+def FTOIS : FPForm<0x1C, 0x078, "ftois $RA,$RC",
+        [(set GPRC:$RC, (bitconvert F4RC:$RA))], s_ftoi>; //Floating to integer move, S_floating
+let OutOperandList = (ops GPRC:$RC), InOperandList = (ops F8RC:$RA), Fb = 31 in 
+def FTOIT : FPForm<0x1C, 0x070, "ftoit $RA,$RC",
+        [(set GPRC:$RC, (bitconvert F8RC:$RA))], s_ftoi>; //Floating to integer move
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops GPRC:$RA), Fb = 31 in 
+def ITOFS : FPForm<0x14, 0x004, "itofs $RA,$RC",
+    	[(set F4RC:$RC, (bitconvert GPRC:$RA))], s_itof>; //Integer to floating move, S_floating
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops GPRC:$RA), Fb = 31 in 
+def ITOFT : FPForm<0x14, 0x024, "itoft $RA,$RC",
+        [(set F8RC:$RC, (bitconvert GPRC:$RA))], s_itof>; //Integer to floating move
+
+
+//Conversions between the integer-in-FP-register quad format and S/T floats
+//(Alpha_cvtqs/cvtqt/cvttq are target nodes), plus S<->T extend/round.
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in 
+def CVTQS : FPForm<0x16, 0x7BC, "cvtqs/sui $RB,$RC",
+        [(set F4RC:$RC, (Alpha_cvtqs F8RC:$RB))], s_fadd>;
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in 
+def CVTQT : FPForm<0x16, 0x7BE, "cvtqt/sui $RB,$RC",
+        [(set F8RC:$RC, (Alpha_cvtqt F8RC:$RB))], s_fadd>;
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in 
+def CVTTQ : FPForm<0x16, 0x52F, "cvttq/svc $RB,$RC",
+        [(set F8RC:$RC, (Alpha_cvttq F8RC:$RB))], s_fadd>;
+let OutOperandList = (ops F8RC:$RC), InOperandList = (ops F4RC:$RB), Fa = 31 in 
+def CVTST : FPForm<0x16, 0x6AC, "cvtst/s $RB,$RC",
+                   [(set F8RC:$RC, (fextend F4RC:$RB))], s_fadd>;
+let OutOperandList = (ops F4RC:$RC), InOperandList = (ops F8RC:$RB), Fa = 31 in 
+def CVTTS : FPForm<0x16, 0x7AC, "cvtts/sui $RB,$RC",
+                   [(set F4RC:$RC, (fround F8RC:$RB))], s_fadd>;
+
+//Select on an integer condition: move the condition into an FP register,
+//then fcmoveq replaces $st with $sf only when the condition is zero.
+def :  Pat<(select GPRC:$RC, F8RC:$st, F8RC:$sf),
+       (f64 (FCMOVEQT  F8RC:$st, F8RC:$sf, (ITOFT GPRC:$RC)))>; 
+def :  Pat<(select GPRC:$RC, F4RC:$st, F4RC:$sf),
+       (f32 (FCMOVEQS  F4RC:$st, F4RC:$sf, (ITOFT GPRC:$RC)))>; 
+
+/////////////////////////////////////////////////////////
+//Branching
+/////////////////////////////////////////////////////////
+//Helper classes for conditional branches: integer condition (test a GPR)
+//and floating condition (test an F8 register). $opc carries the pseudo
+//branch code used by COND_BRANCH lowering.
+class br_icc<bits<6> opc, string asmstr>
+  : BFormN<opc, (ops u64imm:$opc, GPRC:$R, target:$dst), 
+    !strconcat(asmstr, " $R,$dst"),  s_icbr>;
+class br_fcc<bits<6> opc, string asmstr>
+  : BFormN<opc, (ops u64imm:$opc, F8RC:$R, target:$dst), 
+    !strconcat(asmstr, " $R,$dst"),  s_fbr>;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+//Unconditional branch; Ra = 31 (zero register) discards the return address.
+let Ra = 31 in
+def BR : BFormD<0x30, "br $$31,$DISP", [(br bb:$DISP)], s_ubr>;
+
+//Pseudo conditional branches: the real branch opcode travels as the $opc
+//immediate (produced by the immBRCond XForm below) until lowering.
+def COND_BRANCH_I : BFormN<0, (ops u64imm:$opc, GPRC:$R, target:$dst), 
+                    "{:comment} COND_BRANCH imm:$opc, GPRC:$R, bb:$dst", 
+                    s_icbr>;
+def COND_BRANCH_F : BFormN<0, (ops u64imm:$opc, F8RC:$R, target:$dst), 
+                    "{:comment} COND_BRANCH imm:$opc, F8RC:$R, bb:$dst",
+                    s_fbr>;
+//Branches, int
+def BEQ  : br_icc<0x39, "beq">;
+def BGE  : br_icc<0x3E, "bge">;
+def BGT  : br_icc<0x3F, "bgt">;
+def BLBC : br_icc<0x38, "blbc">;
+def BLBS : br_icc<0x3C, "blbs">;
+def BLE  : br_icc<0x3B, "ble">;
+def BLT  : br_icc<0x3A, "blt">;
+def BNE  : br_icc<0x3D, "bne">;
+
+//Branches, float
+def FBEQ : br_fcc<0x31, "fbeq">;
+def FBGE : br_fcc<0x36, "fbge">;
+def FBGT : br_fcc<0x37, "fbgt">;
+def FBLE : br_fcc<0x33, "fble">;
+def FBLT : br_fcc<0x32, "fblt">;
+//Fixed: fbne is opcode 0x35; the previous value 0x36 duplicated fbge's
+//opcode, so fbne would have been encoded as (and behaved like) fbge.
+def FBNE : br_fcc<0x35, "fbne">;
+}
+
+//An ugly trick to get the opcode as an imm I can use
+//Maps the small condition codes used by the brcond patterns below onto the
+//actual branch-instruction enum values: 0-7 are the integer branches,
+//20-25 the floating branches.
+def immBRCond : SDNodeXForm<imm, [{
+  switch((uint64_t)N->getZExtValue()) {
+    default: assert(0 && "Unknown branch type");
+    case 0:  return getI64Imm(Alpha::BEQ);
+    case 1:  return getI64Imm(Alpha::BNE);
+    case 2:  return getI64Imm(Alpha::BGE);
+    case 3:  return getI64Imm(Alpha::BGT);
+    case 4:  return getI64Imm(Alpha::BLE);
+    case 5:  return getI64Imm(Alpha::BLT);
+    case 6:  return getI64Imm(Alpha::BLBS);
+    case 7:  return getI64Imm(Alpha::BLBC);
+    case 20: return getI64Imm(Alpha::FBEQ);
+    case 21: return getI64Imm(Alpha::FBNE);
+    case 22: return getI64Imm(Alpha::FBGE);
+    case 23: return getI64Imm(Alpha::FBGT);
+    case 24: return getI64Imm(Alpha::FBLE);
+    case 25: return getI64Imm(Alpha::FBLT);
+  }
+}]>;
+
+//Int cond patterns: compare-with-zero forms map directly onto the Alpha
+//test-against-zero branches (codes per immBRCond above).
+def : Pat<(brcond (seteq GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 0),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 2),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 3),  GPRC:$RA, bb:$DISP)>;
+//(x & 1) as a branch condition uses blbs (branch on low bit set).
+def : Pat<(brcond (and   GPRC:$RA, 1), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 6),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 4),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 5),  GPRC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, 0), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 1),  GPRC:$RA, bb:$DISP)>;
+
+//Generic nonzero condition: branch-if-not-equal-zero.
+def : Pat<(brcond GPRC:$RA, bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 1), GPRC:$RA, bb:$DISP)>;
+//reg-reg / reg-imm inequality: cmpeq then branch if the result is zero.
+def : Pat<(brcond (setne GPRC:$RA, GPRC:$RB), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 0), (CMPEQ GPRC:$RA, GPRC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setne GPRC:$RA, immUExt8:$L), bb:$DISP), 
+      (COND_BRANCH_I (immBRCond 0), (CMPEQi GPRC:$RA, immUExt8:$L), bb:$DISP)>;
+
+//FP cond patterns: against the FP zero immediate the fb* branches test the
+//register directly; reg-reg comparisons go through cmpt* and then branch on
+//the compare result being nonzero (code 21 = FBNE) or zero (code 20 = FBEQ).
+def : Pat<(brcond (seteq F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 20),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setne F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setge F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 22),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setgt F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 23),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setle F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 24),  F8RC:$RA, bb:$DISP)>;
+def : Pat<(brcond (setlt F8RC:$RA, immFPZ), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 25),  F8RC:$RA, bb:$DISP)>;
+
+
+def : Pat<(brcond (seteq F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setoeq F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setlt F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setolt F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+def : Pat<(brcond (setle F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setole F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+//gt/ge swap the compare operands of lt/le.
+def : Pat<(brcond (setgt F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setogt F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLT F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+def : Pat<(brcond (setge F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setoge F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 21), (CMPTLE F8RC:$RB, F8RC:$RA), bb:$DISP)>;
+
+//ne branches when the cmpteq result is zero.
+def : Pat<(brcond (setne F8RC:$RA, F8RC:$RB), bb:$DISP),  
+      (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setone F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, F8RC:$RB), bb:$DISP), 
+      (COND_BRANCH_F (immBRCond 20), (CMPTEQ F8RC:$RA, F8RC:$RB), bb:$DISP)>;
+
+
+//Ordered/unordered compare-against-zero forms reuse the direct fb* branches.
+def : Pat<(brcond (setoeq F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setueq F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 20), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setoge F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setuge F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 22), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setogt F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setugt F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 23), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setole F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setule F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 24), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setolt F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setult F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 25), F8RC:$RA,bb:$DISP)>;
+
+def : Pat<(brcond (setone F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+def : Pat<(brcond (setune F8RC:$RA, immFPZ), bb:$DISP),   
+      (COND_BRANCH_F (immBRCond 21), F8RC:$RA,bb:$DISP)>;
+
+//End Branches
+
+//S_floating : IEEE Single
+//T_floating : IEEE Double
+
+//Unused instructions
+//Mnemonic Format Opcode Description
+//CALL_PAL Pcd 00 Trap to PALcode
+//ECB Mfc 18.E800 Evict cache block
+//EXCB Mfc 18.0400 Exception barrier
+//FETCH Mfc 18.8000 Prefetch data
+//FETCH_M Mfc 18.A000 Prefetch data, modify intent
+//LDQ_U Mem 0B Load unaligned quadword
+//MB Mfc 18.4000 Memory barrier
+//STQ_U Mem 0F Store unaligned quadword
+//TRAPB Mfc 18.0000 Trap barrier
+//WH64 Mfc 18.F800 Write hint  64 bytes
+//WMB Mfc 18.4400 Write memory barrier
+//MF_FPCR F-P 17.025 Move from FPCR
+//MT_FPCR F-P 17.024 Move to FPCR
+//These are in the Multimedia extensions, so let's not use them yet
+//def MAXSB8  : OForm<0x1C, 0x3E, "MAXSB8 $RA,$RB,$RC">; //Vector signed byte maximum
+//def MAXSW4 : OForm< 0x1C, 0x3F, "MAXSW4 $RA,$RB,$RC">; //Vector signed word maximum
+//def MAXUB8  : OForm<0x1C, 0x3C, "MAXUB8 $RA,$RB,$RC">; //Vector unsigned byte maximum
+//def MAXUW4 : OForm< 0x1C, 0x3D, "MAXUW4 $RA,$RB,$RC">; //Vector unsigned word maximum
+//def MINSB8 : OForm< 0x1C, 0x38, "MINSB8 $RA,$RB,$RC">; //Vector signed byte minimum
+//def MINSW4 : OForm< 0x1C, 0x39, "MINSW4 $RA,$RB,$RC">; //Vector signed word minimum
+//def MINUB8 : OForm< 0x1C, 0x3A, "MINUB8 $RA,$RB,$RC">; //Vector unsigned byte minimum
+//def MINUW4 : OForm< 0x1C, 0x3B, "MINUW4 $RA,$RB,$RC">; //Vector unsigned word minimum
+//def PERR : OForm< 0x1C, 0x31, "PERR $RA,$RB,$RC">; //Pixel error
+//def PKLB : OForm< 0x1C, 0x37, "PKLB $RA,$RB,$RC">; //Pack longwords to bytes
+//def PKWB  : OForm<0x1C, 0x36, "PKWB $RA,$RB,$RC">; //Pack words to bytes
+//def UNPKBL : OForm< 0x1C, 0x35, "UNPKBL $RA,$RB,$RC">; //Unpack bytes to longwords
+//def UNPKBW : OForm< 0x1C, 0x34, "UNPKBW $RA,$RB,$RC">; //Unpack bytes to words
+//CVTLQ F-P 17.010 Convert longword to quadword
+//CVTQL F-P 17.030 Convert quadword to longword
+
+
+//Constant handling: immediates that fit in an ldah+lda pair (16 high bits
+//plus 16 signed low bits).
+
+def immConst2Part  : PatLeaf<(imm), [{
+  //true if imm fits in a LDAH LDA pair
+  int64_t val = (int64_t)N->getZExtValue();
+  return (val <= IMM_FULLHIGH  && val >= IMM_FULLLOW);
+}]>;
+def immConst2PartInt  : PatLeaf<(imm), [{
+  //true if imm fits in a LDAH LDA pair with zeroext
+  uint64_t uval = N->getZExtValue();
+  int32_t val32 = (int32_t)uval;
+  return ((uval >> 32) == 0 && //empty upper bits
+          val32 <= IMM_FULLHIGH);
+//          val32 >= IMM_FULLLOW  + IMM_LOW  * IMM_MULT); //Always True
+}], SExt32>;
+
+//Materialize a two-part constant: ldah builds the high half off R31 (zero),
+//lda adds the signed low half.
+def : Pat<(i64 immConst2Part:$imm),
+          (LDA (LL16 immConst2Part:$imm), (LDAH (LH16 immConst2Part:$imm), R31))>;
+
+def : Pat<(i64 immSExt16:$imm),
+          (LDA immSExt16:$imm, R31)>;
+
+//32-bit zero-extended constants: build sign-extended, then zapnot 15 clears
+//the upper four bytes.
+def : Pat<(i64 immSExt16int:$imm),
+          (ZAPNOTi (LDA (SExt16 immSExt16int:$imm), R31), 15)>;
+def : Pat<(i64 immConst2PartInt:$imm),
+          (ZAPNOTi (LDA (LL16 (SExt32 immConst2PartInt:$imm)), 
+                        (LDAH (LH16 (SExt32 immConst2PartInt:$imm)), R31)), 15)>;
+
+
+//TODO: I want to just define these like this!
+//def : Pat<(i64 0),
+//          (R31)>;
+//def : Pat<(f64 0.0),
+//          (F31)>;
+//def : Pat<(f64 -0.0),
+//          (CPYSNT F31, F31)>;
+//def : Pat<(f32 0.0),
+//          (F31)>;
+//def : Pat<(f32 -0.0),
+//          (CPYSNS F31, F31)>;
+
+//Misc Patterns:
+
+//sext_inreg from i32 is addl with zero (addl writes a sign-extended result).
+def : Pat<(sext_inreg GPRC:$RB, i32),
+          (ADDLi GPRC:$RB, 0)>;
+
+//fabs = copysign with the sign of F31 (the FP zero register).
+def : Pat<(fabs F8RC:$RB),
+          (CPYST F31, F8RC:$RB)>;
+def : Pat<(fabs F4RC:$RB),
+          (CPYSS F31, F4RC:$RB)>;
+def : Pat<(fneg F8RC:$RB),
+          (CPYSNT F8RC:$RB, F8RC:$RB)>;
+def : Pat<(fneg F4RC:$RB),
+          (CPYSNS F4RC:$RB, F4RC:$RB)>;
+
+//copysign of a negated value folds into cpysn.
+def : Pat<(fcopysign F4RC:$A, (fneg F4RC:$B)),
+          (CPYSNS F4RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F8RC:$B)),
+          (CPYSNT F8RC:$B, F8RC:$A)>;
+def : Pat<(fcopysign F4RC:$A, (fneg F8RC:$B)),
+          (CPYSNSt F8RC:$B, F4RC:$A)>;
+def : Pat<(fcopysign F8RC:$A, (fneg F4RC:$B)),
+          (CPYSNTs F4RC:$B, F8RC:$A)>;
+
+//Yes, signed multiply high is ugly
+//(signed mulh from umulh corrected by the operands' sign contributions)
+def : Pat<(mulhs GPRC:$RA, GPRC:$RB),
+          (SUBQr (UMULHr GPRC:$RA, GPRC:$RB), (ADDQr (CMOVGEr GPRC:$RB, R31, GPRC:$RA), 
+                                                     (CMOVGEr GPRC:$RA, R31, GPRC:$RB)))>;
+
+//Stupid crazy arithmetic stuff:
+//Strength-reduce small constant multiplies onto the scaled add/sub forms
+//(s4addq = x*4+y, s8addq = x*8+y, s4subq = x*4-y, s8subq = x*8-y).
+//AddedComplexity makes these win over a generic mulq.
+let AddedComplexity = 1 in {
+def : Pat<(mul GPRC:$RA, 5), (S4ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 9), (S8ADDQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 3), (S4SUBQr GPRC:$RA, GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, 7), (S8SUBQr GPRC:$RA, GPRC:$RA)>;
+
+//slight tree expansion if we are multiplying near to a power of 2
+//n is above a power of 2
+def : Pat<(mul GPRC:$RA, immRem1:$imm), 
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem1:$imm)), GPRC:$RA)>;
+def : Pat<(mul GPRC:$RA, immRem2:$imm), 
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem2:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem3:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem3:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRem4:$imm),
+          (S4ADDQr GPRC:$RA, (SLr GPRC:$RA, (nearP2X immRem4:$imm)))>;
+def : Pat<(mul GPRC:$RA, immRem5:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRem5:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+def : Pat<(mul GPRC:$RA, immRemP2:$imm),
+          (ADDQr (SLr GPRC:$RA, (nearP2X immRemP2:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2:$imm)))>;
+
+//n is below a power of 2
+//FIXME: figure out why something is truncating the imm to 32bits
+// this will fix 2007-11-27-mulneg3
+//def : Pat<(mul GPRC:$RA, immRem1n:$imm), 
+//          (SUBQr (SLr GPRC:$RA, (nearP2X immRem1n:$imm)), GPRC:$RA)>;
+//def : Pat<(mul GPRC:$RA, immRem2n:$imm), 
+//          (SUBQr (SLr GPRC:$RA, (nearP2X immRem2n:$imm)), (ADDQr GPRC:$RA, GPRC:$RA))>;
+//def : Pat<(mul GPRC:$RA, immRem3n:$imm),
+//          (SUBQr (SLr GPRC:$RA, (nearP2X immRem3n:$imm)), (S4SUBQr GPRC:$RA, GPRC:$RA))>;
+//def : Pat<(mul GPRC:$RA, immRem4n:$imm),
+//          (SUBQr (SLr GPRC:$RA, (nearP2X immRem4n:$imm)), (SLi GPRC:$RA, 2))>;
+//def : Pat<(mul GPRC:$RA, immRem5n:$imm),
+//          (SUBQr (SLr GPRC:$RA, (nearP2X immRem5n:$imm)), (S4ADDQr GPRC:$RA, GPRC:$RA))>;
+//def : Pat<(mul GPRC:$RA, immRemP2n:$imm),
+//          (SUBQr (SLr GPRC:$RA, (nearP2X immRemP2n:$imm)), (SLi GPRC:$RA, (nearP2RemX immRemP2n:$imm)))>;
+} //Added complexity
diff --git a/lib/Target/Alpha/AlphaJITInfo.cpp b/lib/Target/Alpha/AlphaJITInfo.cpp
new file mode 100644
index 0000000..cb8eb51
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.cpp
@@ -0,0 +1,310 @@
+//===-- AlphaJITInfo.cpp - Implement the JIT interfaces for the Alpha ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the Alpha target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "AlphaJITInfo.h"
+#include "AlphaRelocations.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdlib>
+using namespace llvm;
+
// Helpers for hand-assembling Alpha instruction words when patching stubs.
// Operate-format instruction with an 8-bit literal operand (bit 12 set).
#define BUILD_OFormatI(Op, RA, LIT, FUN, RC) \
  ((Op << 26) | (RA << 21) | (LIT << 13) | (1 << 12) | (FUN << 5) | (RC))
// Operate-format instruction with a register operand RB.
#define BUILD_OFormat(Op, RA, RB, FUN, RC) \
  ((Op << 26) | (RA << 21) | (RB << 16) | (FUN << 5) | (RC))

// lda RD, IMM16(RS): RD = RS + sext(IMM16).
#define BUILD_LDA(RD, RS, IMM16) \
  ((0x08 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
// ldah RD, IMM16(RS): RD = RS + IMM16 * 65536.
#define BUILD_LDAH(RD, RS, IMM16) \
  ((0x09 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))

// ldq RD, IMM16(RS): load quadword.
#define BUILD_LDQ(RD, RS, IMM16) \
  ((0x29 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 0xFFFF))

// Memory-format jumps; bits 15:14 select jmp (0) vs jsr (1).
#define BUILD_JMP(RD, RS, IMM16) \
  ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x00 << 14) | ((IMM16) & 0x3FFF))
#define BUILD_JSR(RD, RS, IMM16) \
  ((0x1A << 26) | ((RD) << 21) | ((RS) << 16) | (0x01 << 14) | ((IMM16) & 0x3FFF))

// sll RS, IMM8 -> RD (shift left logical by literal).
#define BUILD_SLLi(RD, RS, IMM8) \
  (BUILD_OFormatI(0x12, RS, IMM8, 0x39, RD))

// bis RS, IMM8 -> RD (or with literal).
#define BUILD_ORi(RD, RS, IMM8) \
  (BUILD_OFormatI(0x11, RS, IMM8, 0x20, RD))

// bis RS, RT -> RD; with RS == RT this is a register move.
#define BUILD_OR(RD, RS, RT) \
  (BUILD_OFormat(0x11, RS, RT, 0x20, RD))
+
+
+
// EmitBranchToAt - Overwrite the 19-word stub at 'At' with a sequence that
// materializes the 64-bit address 'To' in $27 (pv) and jumps to it.  The
// layout here must stay in sync with getStubLayout().
static void EmitBranchToAt(void *At, void *To) {
  unsigned long Fn = (unsigned long)To;

  unsigned *AtI = (unsigned*)At;

  // Word 0: stash the stub's own address (pv, $27) in $0 so the compilation
  // callback can discover which stub it was entered through.
  AtI[0] = BUILD_OR(0, 27, 27);

  DEBUG(errs() << "Stub targeting " << To << "\n");

  // Words 1-16: build the target address in $27, one byte per iteration,
  // most-significant byte first: shift left 8, then or in the next byte.
  for (int x = 1; x <= 8; ++x) {
    AtI[2*x - 1] = BUILD_SLLi(27,27,8);
    unsigned d = (Fn >> (64 - 8 * x)) & 0x00FF;
    //DEBUG(errs() << "outputing " << hex << d << dec << "\n");
    AtI[2*x] = BUILD_ORi(27, 27, d);
  }
  AtI[17] = BUILD_JMP(31,27,0); //jump, preserving ra, and setting pv
  AtI[18] = 0x00FFFFFF; //mark this as a stub
}
+
+void AlphaJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  //FIXME
+  llvm_unreachable(0);
+}
+
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+//static AlphaJITInfo* AlphaJTI;
+
+extern "C" {
+#ifdef __alpha
+
+  void AlphaCompilationCallbackC(long* oldpv, void* CameFromStub)
+  {
+    void* Target = JITCompilerFunction(CameFromStub);
+
+    //rewrite the stub to an unconditional branch
+    if (((unsigned*)CameFromStub)[18] == 0x00FFFFFF) {
+      DEBUG(errs() << "Came from a stub, rewriting\n");
+      EmitBranchToAt(CameFromStub, Target);
+    } else {
+      DEBUG(errs() << "confused, didn't come from stub at " << CameFromStub
+                   << " old jump vector " << oldpv
+                   << " new jump vector " << Target << "\n");
+    }
+
+    //Change pv to new Target
+    *oldpv = (long)Target;
+  }
+
+  void AlphaCompilationCallback(void);
+
+  asm(
+      ".text\n"
+      ".globl AlphaComilationCallbackC\n"
+      ".align 4\n"
+      ".globl AlphaCompilationCallback\n"
+      ".ent AlphaCompilationCallback\n"
+"AlphaCompilationCallback:\n"
+      //      //get JIT's GOT
+      "ldgp $29, 0($27)\n"
+      //Save args, callee saved, and perhaps others?
+      //args: $16-$21 $f16-$f21     (12)
+      //callee: $9-$14 $f2-$f9      (14)
+      //others: fp:$15 ra:$26 pv:$27 (3)
+      "lda $30, -232($30)\n"
+      "stq $16,   0($30)\n"
+      "stq $17,   8($30)\n"
+      "stq $18,  16($30)\n"
+      "stq $19,  24($30)\n"
+      "stq $20,  32($30)\n"
+      "stq $21,  40($30)\n"
+      "stt $f16, 48($30)\n"
+      "stt $f17, 56($30)\n"
+      "stt $f18, 64($30)\n"
+      "stt $f19, 72($30)\n"
+      "stt $f20, 80($30)\n"
+      "stt $f21, 88($30)\n"
+      "stq $9,   96($30)\n"
+      "stq $10, 104($30)\n"
+      "stq $11, 112($30)\n"
+      "stq $12, 120($30)\n"
+      "stq $13, 128($30)\n"
+      "stq $14, 136($30)\n"
+      "stt $f2, 144($30)\n"
+      "stt $f3, 152($30)\n"
+      "stt $f4, 160($30)\n"
+      "stt $f5, 168($30)\n"
+      "stt $f6, 176($30)\n"
+      "stt $f7, 184($30)\n"
+      "stt $f8, 192($30)\n"
+      "stt $f9, 200($30)\n"
+      "stq $15, 208($30)\n"
+      "stq $26, 216($30)\n"
+      "stq $27, 224($30)\n"
+
+      "addq $30, 224, $16\n" //pass the addr of saved pv as the first arg
+      "bis $0, $0, $17\n" //pass the roughly stub addr in second arg
+      "jsr $26, AlphaCompilationCallbackC\n" //call without saving ra
+
+      "ldq $16,   0($30)\n"
+      "ldq $17,   8($30)\n"
+      "ldq $18,  16($30)\n"
+      "ldq $19,  24($30)\n"
+      "ldq $20,  32($30)\n"
+      "ldq $21,  40($30)\n"
+      "ldt $f16, 48($30)\n"
+      "ldt $f17, 56($30)\n"
+      "ldt $f18, 64($30)\n"
+      "ldt $f19, 72($30)\n"
+      "ldt $f20, 80($30)\n"
+      "ldt $f21, 88($30)\n"
+      "ldq $9,   96($30)\n"
+      "ldq $10, 104($30)\n"
+      "ldq $11, 112($30)\n"
+      "ldq $12, 120($30)\n"
+      "ldq $13, 128($30)\n"
+      "ldq $14, 136($30)\n"
+      "ldt $f2, 144($30)\n"
+      "ldt $f3, 152($30)\n"
+      "ldt $f4, 160($30)\n"
+      "ldt $f5, 168($30)\n"
+      "ldt $f6, 176($30)\n"
+      "ldt $f7, 184($30)\n"
+      "ldt $f8, 192($30)\n"
+      "ldt $f9, 200($30)\n"
+      "ldq $15, 208($30)\n"
+      "ldq $26, 216($30)\n"
+      "ldq $27, 224($30)\n" //this was updated in the callback with the target
+
+      "lda $30, 232($30)\n" //restore sp
+      "jmp $31, ($27)\n" //jump to the new function
+      ".end AlphaCompilationCallback\n"
+      );
+#else
+  void AlphaCompilationCallback() {
+    llvm_unreachable("Cannot call AlphaCompilationCallback() on a non-Alpha arch!");
+  }
+#endif
+}
+
/// getStubLayout - Size/alignment of the stubs produced by emitFunctionStub;
/// must match the sequence written by EmitBranchToAt.
TargetJITInfo::StubLayout AlphaJITInfo::getStubLayout() {
  // The stub contains 19 4-byte instructions, aligned at 4 bytes:
  // R0 = R27
  // 8 x "R27 <<= 8; R27 |= 8-bits-of-Target"  == 16 instructions
  // JMP R27
  // Magic number so the compilation callback can recognize the stub.
  StubLayout Result = {19 * 4, 4};
  return Result;
}
+
/// emitFunctionStub - Emit a lazy-compilation stub that transfers control to
/// Fn.  Space is reserved through the emitter first (so its buffer pointers
/// advance), then back-patched in place by EmitBranchToAt.
void *AlphaJITInfo::emitFunctionStub(const Function* F, void *Fn,
                                     JITCodeEmitter &JCE) {
  //assert(Fn == AlphaCompilationCallback && "Where are you going?\n");
  //Do things in a stupid slow way!
  void* Addr = (void*)(intptr_t)JCE.getCurrentPCValue();
  // Reserve the 19 words described by getStubLayout().
  for (int x = 0; x < 19; ++ x)
    JCE.emitWordLE(0);
  EmitBranchToAt(Addr, Fn);
  DEBUG(errs() << "Emitting Stub to " << Fn << " at [" << Addr << "]\n");
  return Addr;
}
+
/// getLazyResolverFunction - Record the JIT's compile-on-demand entry point
/// (used later by AlphaCompilationCallbackC) and return the assembly
/// trampoline that stubs branch to.
TargetJITInfo::LazyResolverFn
AlphaJITInfo::getLazyResolverFunction(JITCompilerFn F) {
  JITCompilerFunction = F;
  //  setZerothGOTEntry((void*)AlphaCompilationCallback);
  return AlphaCompilationCallback;
}
+
//These describe LDAx: lda/ldah take signed 16-bit displacements, so a
//constant l is split as getUpper16(l) * 65536 + getLower16(l).
static const int IMM_LOW  = -32768;
static const int IMM_HIGH = 32767;
static const int IMM_MULT = 65536;

// getUpper16 - The ldah half of l: the multiple of 65536 chosen so the
// remaining low part fits in a signed 16-bit immediate.  The +-1 tweaks
// handle remainders of the truncating division that fall outside
// [IMM_LOW, IMM_HIGH] (the negative case occurs for negative l).
static long getUpper16(long l)
{
  long y = l / IMM_MULT;
  if (l % IMM_MULT > IMM_HIGH)
    ++y;
  if (l % IMM_MULT < IMM_LOW)
    --y;
  assert((short)y == y && "displacement out of range");
  return y;
}

// getLower16 - The lda half of l: l - getUpper16(l)*65536, guaranteed to
// fit a signed 16-bit immediate.
static long getLower16(long l)
{
  long h = getUpper16(l);
  long y = l - h * IMM_MULT;
  assert(y == (short)y && "Displacement out of range");
  return y;
}
+
/// relocate - Resolve the relocations recorded for one JITed function.  Most
/// cases compute a signed 16-bit displacement and store it into the low half
/// of the instruction word (the doCommon path at the bottom); reloc_bsr
/// instead ors a 21-bit branch displacement into the instruction.
void AlphaJITInfo::relocate(void *Function, MachineRelocation *MR,
                            unsigned NumRelocs, unsigned char* GOTBase) {
  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
    // Relocations record a byte offset; instructions are 4 bytes each.
    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
    long idx = 0;
    bool doCommon = true;
    switch ((Alpha::RelocationType)MR->getRelocationType()) {
    default: llvm_unreachable("Unknown relocation type!");
    case Alpha::reloc_literal:
      //This is a LDQl: byte displacement of the value's GOT slot (8 bytes
      //per slot) from the slot the GP is biased to (GOToffset).
      idx = MR->getGOTIndex();
      DEBUG(errs() << "Literal relocation to slot " << idx);
      idx = (idx - GOToffset) * 8;
      DEBUG(errs() << " offset " << idx << "\n");
      break;
    case Alpha::reloc_gprellow:
      // Low 16 bits of the result's GP-relative address.
      idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
      idx = getLower16(idx);
      DEBUG(errs() << "gprellow relocation offset " << idx << "\n");
      DEBUG(errs() << " Pointer is " << (void*)MR->getResultPointer()
           << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n");
      break;
    case Alpha::reloc_gprelhigh:
      // High (ldah) 16 bits of the result's GP-relative address.
      idx = (unsigned char*)MR->getResultPointer() - &GOTBase[GOToffset * 8];
      idx = getUpper16(idx);
      DEBUG(errs() << "gprelhigh relocation offset " << idx << "\n");
      DEBUG(errs() << " Pointer is " << (void*)MR->getResultPointer()
            << " GOT is " << (void*)&GOTBase[GOToffset * 8] << "\n");
      break;
    case Alpha::reloc_gpdist:
      // Paired LDAH/LDA computing GP relative to the LDAH's own pc; the
      // LDAH's position is remembered in gpdistmap (keyed by function and
      // the pair's constant) so the matching LDA can use it.
      switch (*RelocPos >> 26) {
      case 0x09: //LDAH
        idx = &GOTBase[GOToffset * 8] - (unsigned char*)RelocPos;
        idx = getUpper16(idx);
        DEBUG(errs() << "LDAH: " << idx << "\n");
        //add the relocation to the map
        gpdistmap[std::make_pair(Function, MR->getConstantVal())] = RelocPos;
        break;
      case 0x08: //LDA
        assert(gpdistmap[std::make_pair(Function, MR->getConstantVal())] &&
               "LDAg without seeing LDAHg");
        idx = &GOTBase[GOToffset * 8] -
          (unsigned char*)gpdistmap[std::make_pair(Function, MR->getConstantVal())];
        idx = getLower16(idx);
        DEBUG(errs() << "LDA: " << idx << "\n");
        break;
      default:
        llvm_unreachable("Cannot handle gpdist yet");
      }
      break;
    case Alpha::reloc_bsr: {
      // 21-bit pc-relative displacement counted in instructions.
      idx = (((unsigned char*)MR->getResultPointer() -
             (unsigned char*)RelocPos) >> 2) + 1; //skip first 2 inst of fun
      *RelocPos |= (idx & ((1 << 21)-1));
      doCommon = false;
      break;
    }
    }
    if (doCommon) {
      // Write the 16-bit displacement into the instruction's low half --
      // the first two bytes, since the host is little-endian like the
      // emitted code (see emitWordLE in emitFunctionStub).
      short x = (short)idx;
      assert(x == idx);
      *(short*)RelocPos = x;
    }
  }
}
diff --git a/lib/Target/Alpha/AlphaJITInfo.h b/lib/Target/Alpha/AlphaJITInfo.h
new file mode 100644
index 0000000..bd358a4
--- /dev/null
+++ b/lib/Target/Alpha/AlphaJITInfo.h
@@ -0,0 +1,53 @@
+//===- AlphaJITInfo.h - Alpha impl. of the JIT interface ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHA_JITINFO_H
+#define ALPHA_JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+#include <map>
+
+namespace llvm {
+  class TargetMachine;
+
  /// AlphaJITInfo - Alpha implementation of the TargetJITInfo hooks: lazy
  /// compilation stubs, the resolver trampoline, and relocation fixups.
  class AlphaJITInfo : public TargetJITInfo {
  protected:
    TargetMachine &TM;

    //because gpdist are paired and relative to the pc of the first inst,
    //we need to have some state: map (function code address, gpdist
    //constant) to the location of the pair's first (LDAH) instruction.
    std::map<std::pair<void*, int>, void*> gpdistmap;
  public:
    // Alpha JITed code always addresses globals through the GOT.
    explicit AlphaJITInfo(TargetMachine &tm) : TM(tm)
    { useGOT = true; }

    virtual StubLayout getStubLayout();
    virtual void *emitFunctionStub(const Function* F, void *Fn,
                                   JITCodeEmitter &JCE);
    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
    virtual void relocate(void *Function, MachineRelocation *MR,
                          unsigned NumRelocs, unsigned char* GOTBase);

    /// replaceMachineCodeForFunction - Make it so that calling the function
    /// whose machine code is at OLD turns into a call to NEW, perhaps by
    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
    /// code.
    ///
    virtual void replaceMachineCodeForFunction(void *Old, void *New);
  private:
    // GOT slot the GP is biased to; relocate() measures GP-relative
    // displacements from &GOTBase[GOToffset * 8].
    static const unsigned GOToffset = 4096;

  };
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaLLRP.cpp b/lib/Target/Alpha/AlphaLLRP.cpp
new file mode 100644
index 0000000..0c51bc5
--- /dev/null
+++ b/lib/Target/Alpha/AlphaLLRP.cpp
@@ -0,0 +1,158 @@
+//===-- AlphaLLRP.cpp - Alpha Load Load Replay Trap elimination pass. -- --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Here we check for potential replay traps introduced by the spiller
+// We also align some branch targets if we can do so for free.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "alpha-nops"
+#include "Alpha.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+STATISTIC(nopintro, "Number of nops inserted");
+STATISTIC(nopalign, "Number of nops inserted for alignment");
+
namespace {
  cl::opt<bool>
  AlignAll("alpha-align-all", cl::Hidden,
                   cl::desc("Align all blocks"));

  /// AlphaLLRPPass - Insert BISr nops so that two stack (R30-based) memory
  /// operations with the same base register and offset never land in the
  /// same 4-instruction fetch block, avoiding load-load replay traps; can
  /// also pad block ends out to a fetch-block boundary.
  struct AlphaLLRPPass : public MachineFunctionPass {
    /// Target machine description which we query for reg. names, data
    /// layout, etc.
    ///
    AlphaTargetMachine &TM;

    static char ID;
    AlphaLLRPPass(AlphaTargetMachine &tm) 
      : MachineFunctionPass(&ID), TM(tm) { }

    virtual const char *getPassName() const {
      return "Alpha NOP inserter";
    }

    // runOnMachineFunction - Walk each block tracking up to the last three
    // memory instructions of the current fetch block in prev[]; when a new
    // stack access matches one of them (same base reg and immediate), emit
    // 1-3 nops to push it into the next fetch block.  Returns true if any
    // nop was inserted.
    bool runOnMachineFunction(MachineFunction &F) {
      const TargetInstrInfo *TII = F.getTarget().getInstrInfo();
      bool Changed = false;
      MachineInstr* prev[3] = {0,0,0}; // earlier mem ops in this fetch block
      DebugLoc dl = DebugLoc::getUnknownLoc();
      unsigned count = 0;              // instruction count, mod 4 = slot
      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
           FI != FE; ++FI) {
        MachineBasicBlock& MBB = *FI;
        bool ub = false;               // block ends in unconditional branch
        for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
          if (count%4 == 0)
            prev[0] = prev[1] = prev[2] = 0; //Slots cleared at fetch boundary
          ++count;
          MachineInstr *MI = I++;
          switch (MI->getOpcode()) {
          case Alpha::LDQ:  case Alpha::LDL:
          case Alpha::LDWU: case Alpha::LDBU:
          case Alpha::LDT: case Alpha::LDS:
          case Alpha::STQ:  case Alpha::STL:
          case Alpha::STW:  case Alpha::STB:
          case Alpha::STT: case Alpha::STS:
           // Operand 2 is the base register, operand 1 the immediate offset.
           if (MI->getOperand(2).getReg() == Alpha::R30) {
             if (prev[0] && 
                 prev[0]->getOperand(2).getReg() == MI->getOperand(2).getReg()&&
                 prev[0]->getOperand(1).getImm() == MI->getOperand(1).getImm()){
               // Conflict three slots back: one nop reaches the boundary.
               prev[0] = prev[1];
               prev[1] = prev[2];
               prev[2] = 0;
               BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
                 .addReg(Alpha::R31)
                 .addReg(Alpha::R31); 
               Changed = true; nopintro += 1;
               count += 1;
             } else if (prev[1] 
                        && prev[1]->getOperand(2).getReg() == 
                        MI->getOperand(2).getReg()
                        && prev[1]->getOperand(1).getImm() == 
                        MI->getOperand(1).getImm()) {
               // Conflict two slots back: need two nops.
               prev[0] = prev[2];
               prev[1] = prev[2] = 0;
               BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
                 .addReg(Alpha::R31)
                 .addReg(Alpha::R31); 
               BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
                 .addReg(Alpha::R31)
                 .addReg(Alpha::R31);
               Changed = true; nopintro += 2;
               count += 2;
             } else if (prev[2] 
                        && prev[2]->getOperand(2).getReg() == 
                        MI->getOperand(2).getReg()
                        && prev[2]->getOperand(1).getImm() == 
                        MI->getOperand(1).getImm()) {
               // Conflict in the immediately preceding slot: three nops.
               prev[0] = prev[1] = prev[2] = 0;
               BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
                 .addReg(Alpha::R31).addReg(Alpha::R31);
               BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
                 .addReg(Alpha::R31).addReg(Alpha::R31);
               BuildMI(MBB, MI, dl, TII->get(Alpha::BISr), Alpha::R31)
                 .addReg(Alpha::R31).addReg(Alpha::R31);
               Changed = true; nopintro += 3;
               count += 3;
             }
             // Record this stack access as the newest slot occupant.
             prev[0] = prev[1];
             prev[1] = prev[2];
             prev[2] = MI;
             break;
           }
           // Non-stack memory op: shift the window without recording it.
           prev[0] = prev[1];
           prev[1] = prev[2];
           prev[2] = 0;
           break;
          case Alpha::ALTENT:
          case Alpha::MEMLABEL:
          case Alpha::PCLABEL:
            // Labels/markers emit no machine code; don't count them.
            --count;
            break;
          case Alpha::BR:
          case Alpha::JMP:
            ub = true;
            //fall through
          default:
            prev[0] = prev[1];
            prev[1] = prev[2];
            prev[2] = 0;
            break;
          }
        }
        if (ub || AlignAll) {
          //we can align stuff for free at this point
          while (count % 4) {
            BuildMI(MBB, MBB.end(), dl, TII->get(Alpha::BISr), Alpha::R31)
              .addReg(Alpha::R31).addReg(Alpha::R31);
            ++count;
            ++nopalign;
            prev[0] = prev[1];
            prev[1] = prev[2];
            prev[2] = 0;
          }
        }
      }
      return Changed;
    }
  };
  char AlphaLLRPPass::ID = 0;
} // end of anonymous namespace
+
+FunctionPass *llvm::createAlphaLLRPPass(AlphaTargetMachine &tm) {
+  return new AlphaLLRPPass(tm);
+}
diff --git a/lib/Target/Alpha/AlphaMCAsmInfo.cpp b/lib/Target/Alpha/AlphaMCAsmInfo.cpp
new file mode 100644
index 0000000..c67c6a2
--- /dev/null
+++ b/lib/Target/Alpha/AlphaMCAsmInfo.cpp
@@ -0,0 +1,23 @@
+//===-- AlphaMCAsmInfo.cpp - Alpha asm properties ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the AlphaMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaMCAsmInfo.h"
+using namespace llvm;
+
// AlphaMCAsmInfo - Configure assembly-syntax properties for the Alpha
// target's assembler.  TT (the target triple) is currently unused.
AlphaMCAsmInfo::AlphaMCAsmInfo(const Target &T, const StringRef &TT) {
  AlignmentIsInBytes = false;          // .align argument is a power of two
  PrivateGlobalPrefix = "$";           // assembler-local symbol prefix
  GPRel32Directive = ".gprel32";       // 32-bit GP-relative data directive
  WeakRefDirective = "\t.weak\t";
  HasSetDirective = false;             // don't emit .set for aliases
}
diff --git a/lib/Target/Alpha/AlphaMCAsmInfo.h b/lib/Target/Alpha/AlphaMCAsmInfo.h
new file mode 100644
index 0000000..c27065d
--- /dev/null
+++ b/lib/Target/Alpha/AlphaMCAsmInfo.h
@@ -0,0 +1,29 @@
+//=====-- AlphaMCAsmInfo.h - Alpha asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the AlphaMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHATARGETASMINFO_H
+#define ALPHATARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+
  /// AlphaMCAsmInfo - Assembly-syntax properties for the Alpha target; all
  /// customization happens in the constructor (see AlphaMCAsmInfo.cpp).
  struct AlphaMCAsmInfo : public MCAsmInfo {
    explicit AlphaMCAsmInfo(const Target &T, const StringRef &TT);
  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AlphaMachineFunctionInfo.h b/lib/Target/Alpha/AlphaMachineFunctionInfo.h
new file mode 100644
index 0000000..8221fc7
--- /dev/null
+++ b/lib/Target/Alpha/AlphaMachineFunctionInfo.h
@@ -0,0 +1,48 @@
+//====- AlphaMachineFuctionInfo.h - Alpha machine function info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Alpha-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHAMACHINEFUNCTIONINFO_H
+#define ALPHAMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// AlphaMachineFunctionInfo - This class is derived from MachineFunction
+/// private Alpha target-specific information for each MachineFunction.
+class AlphaMachineFunctionInfo : public MachineFunctionInfo {
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+  /// GlobalRetAddr = keeps track of the virtual register initialized for
+  /// the return address value.
+  unsigned GlobalRetAddr;
+
+public:
+  AlphaMachineFunctionInfo() : GlobalBaseReg(0), GlobalRetAddr(0) {}
+
+  explicit AlphaMachineFunctionInfo(MachineFunction &MF) : GlobalBaseReg(0),
+                                                           GlobalRetAddr(0) {}
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+
+  unsigned getGlobalRetAddr() const { return GlobalRetAddr; }
+  void setGlobalRetAddr(unsigned Reg) { GlobalRetAddr = Reg; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.cpp b/lib/Target/Alpha/AlphaRegisterInfo.cpp
new file mode 100644
index 0000000..64bdd62
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.cpp
@@ -0,0 +1,342 @@
+//===- AlphaRegisterInfo.cpp - Alpha Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "Alpha.h"
+#include "AlphaRegisterInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+using namespace llvm;
+
//These describe LDAx: lda/ldah take signed 16-bit displacements, so a
//constant l is split as getUpper16(l) * 65536 + getLower16(l).
static const int IMM_LOW  = -32768;
static const int IMM_HIGH = 32767;
static const int IMM_MULT = 65536;

// getUpper16 - The ldah half of l, chosen so the remaining low part fits a
// signed 16-bit immediate.  Unlike the previous version, this also adjusts
// when the truncating division leaves a remainder below IMM_LOW -- which
// happens for negative l, e.g. the negated stack size emitPrologue passes
// here -- matching the identical (correct) helper in AlphaJITInfo.cpp.
static long getUpper16(long l)
{
  long y = l / IMM_MULT;
  if (l % IMM_MULT > IMM_HIGH)
    ++y;
  if (l % IMM_MULT < IMM_LOW)
    --y;
  return y;
}

// getLower16 - The lda half of l: l - getUpper16(l)*65536, which now fits a
// signed 16-bit immediate for negative l as well.
static long getLower16(long l)
{
  long h = getUpper16(l);
  return l - h * IMM_MULT;
}
+
// curgpdist numbers the paired LDAHg/LDAg gp-distance relocations emitted in
// emitPrologue; it is pre-incremented once per function.
AlphaRegisterInfo::AlphaRegisterInfo(const TargetInstrInfo &tii)
  : AlphaGenRegisterInfo(Alpha::ADJUSTSTACKDOWN, Alpha::ADJUSTSTACKUP),
    TII(tii), curgpdist(0)
{
}
+
// getCalleeSavedRegs - Null-terminated list of callee-saved registers:
// integer $9-$14 and floating-point $f2-$f9.
const unsigned* AlphaRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
                                                                         const {
  static const unsigned CalleeSavedRegs[] = {
    Alpha::R9, Alpha::R10,
    Alpha::R11, Alpha::R12,
    Alpha::R13, Alpha::R14,
    Alpha::F2, Alpha::F3,
    Alpha::F4, Alpha::F5,
    Alpha::F6, Alpha::F7,
    Alpha::F8, Alpha::F9,  0
  };
  return CalleeSavedRegs;
}
+
// getCalleeSavedRegClasses - Register classes for the registers returned by
// getCalleeSavedRegs, in the same order: GPRC for R9-R14, F8RC for F2-F9.
const TargetRegisterClass* const*
AlphaRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
    &Alpha::GPRCRegClass, &Alpha::GPRCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,
    &Alpha::F8RCRegClass, &Alpha::F8RCRegClass,  0
  };
  return CalleeSavedRegClasses;
}
+
// getReservedRegs - Registers unavailable to the allocator: R15 (frame
// pointer when used), R30 (stack pointer), and R31 (the zero register).
BitVector AlphaRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(Alpha::R15);
  Reserved.set(Alpha::R30);
  Reserved.set(Alpha::R31);
  return Reserved;
}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
// hasFP - Return true if the specified function should have a dedicated
// frame pointer register.  Currently this is only the case when the function
// has variable sized allocas.
// NOTE(review): unlike other targets, a "frame pointer elimination disabled"
// option is NOT consulted here -- confirm that is intentional.
bool AlphaRegisterInfo::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo *MFI = MF.getFrameInfo();
  return MFI->hasVarSizedObjects();
}
+
// eliminateCallFramePseudoInstr - Remove the ADJUSTSTACKDOWN/ADJUSTSTACKUP
// pseudos, materializing them as real SP adjustments only when a frame
// pointer is present (otherwise the call-frame area lives in the fixed
// frame and no dynamic adjustment is needed).
void AlphaRegisterInfo::
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I) const {
  if (hasFP(MF)) {
    // If we have a frame pointer, turn the adjcallstackdown instruction into
    // 'lda R30, -<amt>(R30)' and the adjcallstackup into 'lda R30, <amt>(R30)'.
    MachineInstr *Old = I;
    uint64_t Amount = Old->getOperand(0).getImm();
    if (Amount != 0) {
      // We need to keep the stack aligned properly.  To do this, we round the
      // amount of space needed for the outgoing arguments up to the next
      // alignment boundary.
      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
      Amount = (Amount+Align-1)/Align*Align;

      MachineInstr *New;
      if (Old->getOpcode() == Alpha::ADJUSTSTACKDOWN) {
        New=BuildMI(MF, Old->getDebugLoc(), TII.get(Alpha::LDA), Alpha::R30)
          .addImm(-Amount).addReg(Alpha::R30);
      } else {
         assert(Old->getOpcode() == Alpha::ADJUSTSTACKUP);
         New=BuildMI(MF, Old->getDebugLoc(), TII.get(Alpha::LDA), Alpha::R30)
          .addImm(Amount).addReg(Alpha::R30);
      }

      // Replace the pseudo instruction with a new instruction...
      MBB.insert(I, New);
    }
  }

  MBB.erase(I);
}
+
+//Alpha has a slightly funny stack:
+//Args
+//<- incoming SP
+//fixed locals (and spills, callee saved, etc)
+//<- FP
+//variable locals
+//<- SP
+
// eliminateFrameIndex - Rewrite the abstract frame index in *II into a
// concrete [immediate + base-register] pair.  The base is R15 when a frame
// pointer exists, otherwise R30; offsets that don't fit a signed 16-bit
// immediate are split with an extra LDAH through scratch register R28.
unsigned
AlphaRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                       int SPAdj, int *Value,
                                       RegScavenger *RS) const {
  assert(SPAdj == 0 && "Unexpected");

  unsigned i = 0;
  MachineInstr &MI = *II;
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction &MF = *MBB.getParent();
  bool FP = hasFP(MF);

  // Locate the frame-index operand; the following operand holds the base.
  while (!MI.getOperand(i).isFI()) {
    ++i;
    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
  }

  int FrameIndex = MI.getOperand(i).getIndex();

  // Add the base register of R30 (SP) or R15 (FP).
  MI.getOperand(i + 1).ChangeToRegister(FP ? Alpha::R15 : Alpha::R30, false);

  // Now add the frame object offset to the offset from the virtual frame index.
  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);

  DEBUG(errs() << "FI: " << FrameIndex << " Offset: " << Offset << "\n");

  Offset += MF.getFrameInfo()->getStackSize();

  DEBUG(errs() << "Corrected Offset " << Offset
       << " for stack size: " << MF.getFrameInfo()->getStackSize() << "\n");

  if (Offset > IMM_HIGH || Offset < IMM_LOW) {
    DEBUG(errs() << "Unconditionally using R28 for evil purposes Offset: "
          << Offset << "\n");
    //so in this case, we need to use a temporary register, and move the
    //original inst off the SP/FP
    //fix up the old:
    MI.getOperand(i + 1).ChangeToRegister(Alpha::R28, false);
    MI.getOperand(i).ChangeToImmediate(getLower16(Offset));
    //insert the new: ldah R28, upper16(Offset)(SP/FP) before MI
    MachineInstr* nMI=BuildMI(MF, MI.getDebugLoc(),
                              TII.get(Alpha::LDAH), Alpha::R28)
      .addImm(getUpper16(Offset)).addReg(FP ? Alpha::R15 : Alpha::R30);
    MBB.insert(II, nMI);
  } else {
    MI.getOperand(i).ChangeToImmediate(Offset);
  }
  return 0;
}
+
+
+void AlphaRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+  bool FP = hasFP(MF);
+
+  //handle GOP offset
+  BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAHg), Alpha::R29)
+    .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
+    .addReg(Alpha::R27).addImm(++curgpdist);
+  BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAg), Alpha::R29)
+    .addGlobalAddress(const_cast<Function*>(MF.getFunction()))
+    .addReg(Alpha::R29).addImm(curgpdist);
+
+  //evil const_cast until MO stuff setup to handle const
+  BuildMI(MBB, MBBI, dl, TII.get(Alpha::ALTENT))
+    .addGlobalAddress(const_cast<Function*>(MF.getFunction()));
+
+  // Get the number of bytes to allocate from the FrameInfo
+  long NumBytes = MFI->getStackSize();
+
+  if (FP)
+    NumBytes += 8; //reserve space for the old FP
+
+  // Do we need to allocate space on the stack?
+  if (NumBytes == 0) return;
+
+  unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+  NumBytes = (NumBytes+Align-1)/Align*Align;
+
+  // Update frame info to pretend that this is part of the stack...
+  MFI->setStackSize(NumBytes);
+
+  // adjust stack pointer: r30 -= numbytes
+  NumBytes = -NumBytes;
+  if (NumBytes >= IMM_LOW) {
+    BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes)
+      .addReg(Alpha::R30);
+  } else if (getUpper16(NumBytes) >= IMM_LOW) {
+    BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAH), Alpha::R30)
+      .addImm(getUpper16(NumBytes)).addReg(Alpha::R30);
+    BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30)
+      .addImm(getLower16(NumBytes)).addReg(Alpha::R30);
+  } else {
+    std::string msg;
+    raw_string_ostream Msg(msg); 
+    Msg << "Too big a stack frame at " + NumBytes;
+    llvm_report_error(Msg.str());
+  }
+
+  //now if we need to, save the old FP and set the new
+  if (FP)
+  {
+    BuildMI(MBB, MBBI, dl, TII.get(Alpha::STQ))
+      .addReg(Alpha::R15).addImm(0).addReg(Alpha::R30);
+    //this must be the last instr in the prolog
+    BuildMI(MBB, MBBI, dl, TII.get(Alpha::BISr), Alpha::R15)
+      .addReg(Alpha::R30).addReg(Alpha::R30);
+  }
+
+}
+
+void AlphaRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  assert((MBBI->getOpcode() == Alpha::RETDAG ||
+          MBBI->getOpcode() == Alpha::RETDAGp)
+         && "Can only insert epilog into returning blocks");
+  DebugLoc dl = MBBI->getDebugLoc();
+
+  bool FP = hasFP(MF);
+
+  // Get the number of bytes allocated from the FrameInfo...
+  long NumBytes = MFI->getStackSize();
+
+  //now if we need to, restore the old FP
+  if (FP) {
+    //copy the FP into the SP (discards allocas)
+    BuildMI(MBB, MBBI, dl, TII.get(Alpha::BISr), Alpha::R30).addReg(Alpha::R15)
+      .addReg(Alpha::R15);
+    //restore the FP
+    BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDQ), Alpha::R15)
+      .addImm(0).addReg(Alpha::R15);
+  }
+
+  if (NumBytes != 0) {
+    if (NumBytes <= IMM_HIGH) {
+      BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30).addImm(NumBytes)
+        .addReg(Alpha::R30);
+    } else if (getUpper16(NumBytes) <= IMM_HIGH) {
+      BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDAH), Alpha::R30)
+        .addImm(getUpper16(NumBytes)).addReg(Alpha::R30);
+      BuildMI(MBB, MBBI, dl, TII.get(Alpha::LDA), Alpha::R30)
+        .addImm(getLower16(NumBytes)).addReg(Alpha::R30);
+    } else {
+      std::string msg;
+      raw_string_ostream Msg(msg); 
+      Msg << "Too big a stack frame at " + NumBytes;
+      llvm_report_error(Msg.str());
+    }
+  }
+}
+
// getRARegister - Return-address register for debug info; not implemented
// for Alpha -- aborts if queried.
unsigned AlphaRegisterInfo::getRARegister() const {
  llvm_unreachable("What is the return address register");
  return 0;
}
+
+unsigned AlphaRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return hasFP(MF) ? Alpha::R15 : Alpha::R30;
+}
+
// getEHExceptionRegister - Exception-handling register queries are not
// implemented for Alpha; aborts if called.
unsigned AlphaRegisterInfo::getEHExceptionRegister() const {
  llvm_unreachable("What is the exception register");
  return 0;
}
+
// getEHHandlerRegister - Exception-handler register queries are not
// implemented for Alpha; aborts if called.
unsigned AlphaRegisterInfo::getEHHandlerRegister() const {
  llvm_unreachable("What is the exception handler register");
  return 0;
}
+
// getDwarfRegNum - DWARF register numbering is not implemented for Alpha;
// aborts if called.
int AlphaRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
  llvm_unreachable("What is the dwarf register number");
  return -1;
}
+
+#include "AlphaGenRegisterInfo.inc"
+
+std::string AlphaRegisterInfo::getPrettyName(unsigned reg)
+{
+  std::string s(RegisterDescriptors[reg].Name);
+  return s;
+}
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.h b/lib/Target/Alpha/AlphaRegisterInfo.h
new file mode 100644
index 0000000..a971e21
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.h
@@ -0,0 +1,71 @@
+//===- AlphaRegisterInfo.h - Alpha Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Alpha implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHAREGISTERINFO_H
+#define ALPHAREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "AlphaGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class Type;
+
+struct AlphaRegisterInfo : public AlphaGenRegisterInfo {
+  const TargetInstrInfo &TII;  // non-owning reference to the target's instruction info
+
+  AlphaRegisterInfo(const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;  // whether MF needs a frame pointer (R15)
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  //void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+  static std::string getPrettyName(unsigned reg);  // human-readable register name from the generated descriptor table
+  
+private:
+  mutable int curgpdist;  // presumably a counter for gpdist relocation pairs; verify in AlphaRegisterInfo.cpp
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AlphaRegisterInfo.td b/lib/Target/Alpha/AlphaRegisterInfo.td
new file mode 100644
index 0000000..35e6804
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRegisterInfo.td
@@ -0,0 +1,171 @@
+//===- AlphaRegisterInfo.td - The Alpha Register File ------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Alpha register set.
+//
+//===----------------------------------------------------------------------===//
+
+class AlphaReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "Alpha";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 64-bit general-purpose registers
+class GPR<bits<5> num, string n> : AlphaReg<n> {
+  let Num = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : AlphaReg<n> {
+  let Num = num;
+}
+
+//#define FP    $15
+//#define RA    $26
+//#define PV    $27
+//#define GP    $29
+//#define SP    $30
+
+// General-purpose registers
+def R0  : GPR< 0,  "$0">, DwarfRegNum<[0]>;
+def R1  : GPR< 1,  "$1">, DwarfRegNum<[1]>;
+def R2  : GPR< 2,  "$2">, DwarfRegNum<[2]>;
+def R3  : GPR< 3,  "$3">, DwarfRegNum<[3]>;
+def R4  : GPR< 4,  "$4">, DwarfRegNum<[4]>;
+def R5  : GPR< 5,  "$5">, DwarfRegNum<[5]>;
+def R6  : GPR< 6,  "$6">, DwarfRegNum<[6]>;
+def R7  : GPR< 7,  "$7">, DwarfRegNum<[7]>;
+def R8  : GPR< 8,  "$8">, DwarfRegNum<[8]>;
+def R9  : GPR< 9,  "$9">, DwarfRegNum<[9]>;
+def R10 : GPR<10, "$10">, DwarfRegNum<[10]>;
+def R11 : GPR<11, "$11">, DwarfRegNum<[11]>;
+def R12 : GPR<12, "$12">, DwarfRegNum<[12]>;
+def R13 : GPR<13, "$13">, DwarfRegNum<[13]>;
+def R14 : GPR<14, "$14">, DwarfRegNum<[14]>;
+def R15 : GPR<15, "$15">, DwarfRegNum<[15]>;
+def R16 : GPR<16, "$16">, DwarfRegNum<[16]>;
+def R17 : GPR<17, "$17">, DwarfRegNum<[17]>;
+def R18 : GPR<18, "$18">, DwarfRegNum<[18]>;
+def R19 : GPR<19, "$19">, DwarfRegNum<[19]>;
+def R20 : GPR<20, "$20">, DwarfRegNum<[20]>;
+def R21 : GPR<21, "$21">, DwarfRegNum<[21]>;
+def R22 : GPR<22, "$22">, DwarfRegNum<[22]>;
+def R23 : GPR<23, "$23">, DwarfRegNum<[23]>;
+def R24 : GPR<24, "$24">, DwarfRegNum<[24]>;
+def R25 : GPR<25, "$25">, DwarfRegNum<[25]>;
+def R26 : GPR<26, "$26">, DwarfRegNum<[26]>;
+def R27 : GPR<27, "$27">, DwarfRegNum<[27]>;
+def R28 : GPR<28, "$28">, DwarfRegNum<[28]>;
+def R29 : GPR<29, "$29">, DwarfRegNum<[29]>;
+def R30 : GPR<30, "$30">, DwarfRegNum<[30]>;
+def R31 : GPR<31, "$31">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0  : FPR< 0,  "$f0">, DwarfRegNum<[33]>;
+def F1  : FPR< 1,  "$f1">, DwarfRegNum<[34]>;
+def F2  : FPR< 2,  "$f2">, DwarfRegNum<[35]>;
+def F3  : FPR< 3,  "$f3">, DwarfRegNum<[36]>;
+def F4  : FPR< 4,  "$f4">, DwarfRegNum<[37]>;
+def F5  : FPR< 5,  "$f5">, DwarfRegNum<[38]>;
+def F6  : FPR< 6,  "$f6">, DwarfRegNum<[39]>;
+def F7  : FPR< 7,  "$f7">, DwarfRegNum<[40]>;
+def F8  : FPR< 8,  "$f8">, DwarfRegNum<[41]>;
+def F9  : FPR< 9,  "$f9">, DwarfRegNum<[42]>;
+def F10 : FPR<10, "$f10">, DwarfRegNum<[43]>;
+def F11 : FPR<11, "$f11">, DwarfRegNum<[44]>;
+def F12 : FPR<12, "$f12">, DwarfRegNum<[45]>;
+def F13 : FPR<13, "$f13">, DwarfRegNum<[46]>;
+def F14 : FPR<14, "$f14">, DwarfRegNum<[47]>;
+def F15 : FPR<15, "$f15">, DwarfRegNum<[48]>;
+def F16 : FPR<16, "$f16">, DwarfRegNum<[49]>;
+def F17 : FPR<17, "$f17">, DwarfRegNum<[50]>;
+def F18 : FPR<18, "$f18">, DwarfRegNum<[51]>;
+def F19 : FPR<19, "$f19">, DwarfRegNum<[52]>;
+def F20 : FPR<20, "$f20">, DwarfRegNum<[53]>;
+def F21 : FPR<21, "$f21">, DwarfRegNum<[54]>;
+def F22 : FPR<22, "$f22">, DwarfRegNum<[55]>;
+def F23 : FPR<23, "$f23">, DwarfRegNum<[56]>;
+def F24 : FPR<24, "$f24">, DwarfRegNum<[57]>;
+def F25 : FPR<25, "$f25">, DwarfRegNum<[58]>;
+def F26 : FPR<26, "$f26">, DwarfRegNum<[59]>;
+def F27 : FPR<27, "$f27">, DwarfRegNum<[60]>;
+def F28 : FPR<28, "$f28">, DwarfRegNum<[61]>;
+def F29 : FPR<29, "$f29">, DwarfRegNum<[62]>;
+def F30 : FPR<30, "$f30">, DwarfRegNum<[63]>;
+def F31 : FPR<31, "$f31">, DwarfRegNum<[64]>;
+
+  // //#define FP    $15
+  // //#define RA    $26
+  // //#define PV    $27
+  // //#define GP    $29
+  // //#define SP    $30
+  // $28 is undefined after any and all calls
+
+/// Register classes
+def GPRC : RegisterClass<"Alpha", [i64], 64,
+     // Volatile
+     [R0, R1, R2, R3, R4, R5, R6, R7, R8, R16, R17, R18, R19, R20, R21, R22,
+      R23, R24, R25, R28, 
+     //Special meaning, but volatile
+     R27, //procedure address
+     R26, //return address
+     R29, //global offset table address
+     // Non-volatile
+     R9, R10, R11, R12, R13, R14,
+// Don't allocate 15, 30, 31
+     R15, R30, R31 ]> //zero
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GPRCClass::iterator
+    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+        return end()-3;
+    }
+  }];
+}
+
+def F4RC : RegisterClass<"Alpha", [f32], 64, [F0, F1, 
+        F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+        F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+        // Saved:
+        F2, F3, F4, F5, F6, F7, F8, F9,
+        F31 ]> //zero
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    F4RCClass::iterator
+    F4RCClass::allocation_order_end(const MachineFunction &MF) const {
+        return end()-1;
+    }
+  }];
+}
+
+def F8RC : RegisterClass<"Alpha", [f64], 64, [F0, F1, 
+        F10, F11, F12, F13, F14, F15, F16, F17, F18, F19,
+        F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+        // Saved:
+        F2, F3, F4, F5, F6, F7, F8, F9,
+        F31 ]> //zero
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    F8RCClass::iterator
+    F8RCClass::allocation_order_end(const MachineFunction &MF) const {
+        return end()-1;
+    }
+  }];
+}
diff --git a/lib/Target/Alpha/AlphaRelocations.h b/lib/Target/Alpha/AlphaRelocations.h
new file mode 100644
index 0000000..4c92045
--- /dev/null
+++ b/lib/Target/Alpha/AlphaRelocations.h
@@ -0,0 +1,31 @@
+//===- AlphaRelocations.h - Alpha Code Relocations --------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the Alpha target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHARELOCATIONS_H
+#define ALPHARELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace Alpha {
+    enum RelocationType {  // Alpha-specific relocation kinds (see MachineRelocation)
+      reloc_literal,       // NOTE(review): meanings below inferred from names; confirm in AlphaJITInfo.cpp
+      reloc_gprellow,      // presumably the low half of a gp-relative offset (pairs with reloc_gprelhigh)
+      reloc_gprelhigh,     // presumably the high half of a gp-relative offset
+      reloc_gpdist,        // gp displacement pair -- confirm usage
+      reloc_bsr            // presumably a pc-relative branch-subroutine target
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/Alpha/AlphaSchedule.td b/lib/Target/Alpha/AlphaSchedule.td
new file mode 100644
index 0000000..b7b4560
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSchedule.td
@@ -0,0 +1,84 @@
+//===- AlphaSchedule.td - Alpha Scheduling Definitions -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//This is table 2-2 from the 21264 compiler writers guide
+//modified some
+
+//Pipelines
+
+def L0   : FuncUnit;
+def L1   : FuncUnit;
+def FST0 : FuncUnit;
+def FST1 : FuncUnit;
+def U0   : FuncUnit;
+def U1   : FuncUnit;
+def FA   : FuncUnit;
+def FM   : FuncUnit;
+
+def s_ild   : InstrItinClass;
+def s_fld   : InstrItinClass;
+def s_ist   : InstrItinClass;
+def s_fst   : InstrItinClass;
+def s_lda   : InstrItinClass;
+def s_rpcc  : InstrItinClass;
+def s_rx    : InstrItinClass;
+def s_mxpr  : InstrItinClass;
+def s_icbr  : InstrItinClass;
+def s_ubr   : InstrItinClass;
+def s_jsr   : InstrItinClass;
+def s_iadd  : InstrItinClass;
+def s_ilog  : InstrItinClass;
+def s_ishf  : InstrItinClass;
+def s_cmov  : InstrItinClass;
+def s_imul  : InstrItinClass;
+def s_imisc : InstrItinClass;
+def s_fbr   : InstrItinClass;
+def s_fadd  : InstrItinClass;
+def s_fmul  : InstrItinClass;
+def s_fcmov : InstrItinClass;
+def s_fdivt : InstrItinClass;
+def s_fdivs : InstrItinClass;
+def s_fsqrts: InstrItinClass;
+def s_fsqrtt: InstrItinClass;
+def s_ftoi  : InstrItinClass;
+def s_itof  : InstrItinClass;
+def s_pseudo : InstrItinClass;
+
+//Table 2-4 Instruction Class Latency in Cycles
+//modified some
+
+def Alpha21264Itineraries : ProcessorItineraries<[
+  InstrItinData<s_ild    , [InstrStage<3, [L0, L1]>]>,
+  InstrItinData<s_fld    , [InstrStage<4, [L0, L1]>]>,
+  InstrItinData<s_ist    , [InstrStage<0, [L0, L1]>]>,
+  InstrItinData<s_fst    , [InstrStage<0, [FST0, FST1, L0, L1]>]>,
+  InstrItinData<s_lda    , [InstrStage<1, [L0, L1, U0, U1]>]>,
+  InstrItinData<s_rpcc   , [InstrStage<1, [L1]>]>,
+  InstrItinData<s_rx     , [InstrStage<1, [L1]>]>,
+  InstrItinData<s_mxpr   , [InstrStage<1, [L0, L1]>]>,
+  InstrItinData<s_icbr   , [InstrStage<0, [U0, U1]>]>,
+  InstrItinData<s_ubr    , [InstrStage<3, [U0, U1]>]>,
+  InstrItinData<s_jsr    , [InstrStage<3, [L0]>]>,
+  InstrItinData<s_iadd   , [InstrStage<1, [L0, U0, L1, U1]>]>,
+  InstrItinData<s_ilog   , [InstrStage<1, [L0, U0, L1, U1]>]>,
+  InstrItinData<s_ishf   , [InstrStage<1, [U0, U1]>]>,
+  InstrItinData<s_cmov   , [InstrStage<1, [L0, U0, L1, U1]>]>,
+  InstrItinData<s_imul   , [InstrStage<7, [U1]>]>,
+  InstrItinData<s_imisc  , [InstrStage<3, [U0]>]>,
+  InstrItinData<s_fbr    , [InstrStage<0, [FA]>]>,
+  InstrItinData<s_fadd   , [InstrStage<6, [FA]>]>,
+  InstrItinData<s_fmul   , [InstrStage<6, [FM]>]>,
+  InstrItinData<s_fcmov  , [InstrStage<6, [FA]>]>,
+  InstrItinData<s_fdivs  , [InstrStage<12, [FA]>]>,
+  InstrItinData<s_fdivt  , [InstrStage<15, [FA]>]>,
+  InstrItinData<s_fsqrts , [InstrStage<18, [FA]>]>,
+  InstrItinData<s_fsqrtt , [InstrStage<33, [FA]>]>,
+  InstrItinData<s_ftoi   , [InstrStage<3, [FST0, FST1, L0, L1]>]>,
+  InstrItinData<s_itof   , [InstrStage<4, [L0, L1]>]>
+]>;
diff --git a/lib/Target/Alpha/AlphaSubtarget.cpp b/lib/Target/Alpha/AlphaSubtarget.cpp
new file mode 100644
index 0000000..bda7104
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSubtarget.cpp
@@ -0,0 +1,25 @@
+//===- AlphaSubtarget.cpp - Alpha Subtarget Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Alpha specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AlphaSubtarget.h"
+#include "Alpha.h"
+#include "AlphaGenSubtarget.inc"
+using namespace llvm;
+
+AlphaSubtarget::AlphaSubtarget(const std::string &TT, const std::string &FS)
+  : HasCT(false) {  // feature flags default off; the feature string may enable them below
+  std::string CPU = "generic";  // no CPU auto-detection; always start from "generic"
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/Alpha/AlphaSubtarget.h b/lib/Target/Alpha/AlphaSubtarget.h
new file mode 100644
index 0000000..f0eb93c
--- /dev/null
+++ b/lib/Target/Alpha/AlphaSubtarget.h
@@ -0,0 +1,46 @@
+//=====-- AlphaSubtarget.h - Define Subtarget for the Alpha --*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Alpha specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHASUBTARGET_H
+#define ALPHASUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+
+class AlphaSubtarget : public TargetSubtarget {
+protected:
+
+  bool HasCT;  // presumably the CIX "count" extension flag -- confirm against Alpha.td
+
+  InstrItineraryData InstrItins;  // scheduling itineraries (see AlphaSchedule.td); initialization not visible here
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  AlphaSubtarget(const std::string &TT, const std::string &FS);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+
+  bool hasCT() const { return HasCT; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Alpha/AlphaTargetMachine.cpp b/lib/Target/Alpha/AlphaTargetMachine.cpp
new file mode 100644
index 0000000..5169a01
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetMachine.cpp
@@ -0,0 +1,61 @@
+//===-- AlphaTargetMachine.cpp - Define TargetMachine for Alpha -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "AlphaJITInfo.h"
+#include "AlphaMCAsmInfo.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeAlphaTarget() { 
+  // Register the target.
+  RegisterTargetMachine<AlphaTargetMachine> X(TheAlphaTarget);
+  RegisterAsmInfo<AlphaMCAsmInfo> Y(TheAlphaTarget);  // asm-dialect/syntax info for the Alpha target
+}
+
+AlphaTargetMachine::AlphaTargetMachine(const Target &T, const std::string &TT,
+                                       const std::string &FS)
+  : LLVMTargetMachine(T, TT),
+    DataLayout("e-f128:128:128-n64"),  // little-endian, f128 aligned to 128 bits, 64-bit native integers
+    FrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),  // 16-byte stack alignment, zero local-area offset
+    JITInfo(*this),
+    Subtarget(TT, FS),
+    TLInfo(*this) {
+  setRelocationModel(Reloc::PIC_);  // unconditionally use position-independent code
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool AlphaTargetMachine::addInstSelector(PassManagerBase &PM,
+                                         CodeGenOpt::Level OptLevel) {
+  PM.add(createAlphaISelDag(*this));  // DAG-to-DAG instruction selector; OptLevel is unused
+  return false;  // false = success
+}
+bool AlphaTargetMachine::addPreEmitPass(PassManagerBase &PM,
+                                        CodeGenOpt::Level OptLevel) {
+  // Must run branch selection immediately preceding the asm printer
+  PM.add(createAlphaBranchSelectionPass());
+  PM.add(createAlphaLLRPPass(*this));  // LLRP pass; see AlphaLLRP.cpp for its semantics
+  return false;  // false = success
+}
+bool AlphaTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                        CodeGenOpt::Level OptLevel,
+                                        JITCodeEmitter &JCE) {
+  PM.add(createAlphaJITCodeEmitterPass(*this, JCE));  // machine-code emission for the JIT
+  return false;  // false = success
+}
diff --git a/lib/Target/Alpha/AlphaTargetMachine.h b/lib/Target/Alpha/AlphaTargetMachine.h
new file mode 100644
index 0000000..6f3a774
--- /dev/null
+++ b/lib/Target/Alpha/AlphaTargetMachine.h
@@ -0,0 +1,64 @@
+//===-- AlphaTargetMachine.h - Define TargetMachine for Alpha ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Alpha-specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef ALPHA_TARGETMACHINE_H
+#define ALPHA_TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaJITInfo.h"
+#include "AlphaISelLowering.h"
+#include "AlphaSubtarget.h"
+
+namespace llvm {
+
+class GlobalValue;
+
+class AlphaTargetMachine : public LLVMTargetMachine {
+  const TargetData DataLayout;       // Calculates type size & alignment
+  AlphaInstrInfo InstrInfo;
+  TargetFrameInfo FrameInfo;
+  AlphaJITInfo JITInfo;
+  AlphaSubtarget Subtarget;
+  AlphaTargetLowering TLInfo;
+
+public:
+  AlphaTargetMachine(const Target &T, const std::string &TT,
+                     const std::string &FS);
+
+  virtual const AlphaInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual const AlphaSubtarget   *getSubtargetImpl() const{ return &Subtarget; }
+  virtual const AlphaRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();  // register info is owned by InstrInfo
+  }
+  virtual AlphaTargetLowering* getTargetLowering() const {
+    return const_cast<AlphaTargetLowering*>(&TLInfo);  // const_cast: base API returns a non-const pointer
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual AlphaJITInfo* getJITInfo() {
+    return &JITInfo;
+  }
+
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+                              JITCodeEmitter &JCE);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp
new file mode 100644
index 0000000..733a46c
--- /dev/null
+++ b/lib/Target/Alpha/AsmPrinter/AlphaAsmPrinter.cpp
@@ -0,0 +1,170 @@
+//===-- AlphaAsmPrinter.cpp - Alpha LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format Alpha assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Alpha.h"
+#include "AlphaInstrInfo.h"
+#include "AlphaTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+namespace {
+  struct AlphaAsmPrinter : public AsmPrinter {
+    /// AlphaAsmPrinter - prints MachineInstrs as GAS-syntax Alpha assembly.
+    ///
+
+    explicit AlphaAsmPrinter(formatted_raw_ostream &o, TargetMachine &tm,
+                             MCContext &Ctx, MCStreamer &Streamer,
+                             const MCAsmInfo *T)
+      : AsmPrinter(o, tm, Ctx, Streamer, T) {}
+
+    virtual const char *getPassName() const {
+      return "Alpha Assembly Printer";
+    }
+    void printInstruction(const MachineInstr *MI);  // autogenerated; see AlphaGenAsmWriter.inc
+    void EmitInstruction(const MachineInstr *MI) {
+      printInstruction(MI);
+      OutStreamer.AddBlankLine();
+    }
+    static const char *getRegisterName(unsigned RegNo);  // autogenerated; see AlphaGenAsmWriter.inc
+
+    void printOp(const MachineOperand &MO, bool IsCallOp = false);
+    void printOperand(const MachineInstr *MI, int opNum);
+    void printBaseOffsetPair(const MachineInstr *MI, int i, bool brackets=true);
+    virtual void EmitFunctionBodyStart();
+    virtual void EmitFunctionBodyEnd(); 
+    void EmitStartOfAsmFile(Module &M);
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI,
+                               unsigned OpNo,
+                               unsigned AsmVariant,
+                               const char *ExtraCode);
+  };
+} // end of anonymous namespace
+
+#include "AlphaGenAsmWriter.inc"
+
+void AlphaAsmPrinter::printOperand(const MachineInstr *MI, int opNum)
+{
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.getType() == MachineOperand::MO_Register) {
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           "Not physreg??");
+    O << getRegisterName(MO.getReg());
+  } else if (MO.isImm()) {
+    O << MO.getImm();
+    assert(MO.getImm() < (1 << 30));  // NOTE(review): range check happens after the value is already printed
+  } else {
+    printOp(MO);  // globals, symbols, basic blocks, etc. are handled by printOp
+  }
+}
+
+
+void AlphaAsmPrinter::printOp(const MachineOperand &MO, bool IsCallOp) {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << getRegisterName(MO.getReg());
+    return;
+
+  case MachineOperand::MO_Immediate:
+    llvm_unreachable("printOp() does not handle immediate values");  // immediates are printed directly by printOperand
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getIndex();
+    return;
+
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    return;
+
+  case MachineOperand::MO_GlobalAddress:
+    O << *GetGlobalValueSymbol(MO.getGlobal());
+    return;
+
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    return;
+
+  default:
+    O << "<unknown operand type: " << MO.getType() << ">";  // emit a diagnostic placeholder rather than aborting
+    return;
+  }
+}
+
+/// EmitFunctionBodyStart - Targets can override this to emit stuff before
+/// the first basic block in the function.
+void AlphaAsmPrinter::EmitFunctionBodyStart() {
+  O << "\t.ent " << *CurrentFnSym << "\n";  // GAS directive marking the start of a procedure
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void AlphaAsmPrinter::EmitFunctionBodyEnd() {
+  O << "\t.end " << *CurrentFnSym << "\n";  // GAS directive closing the matching .ent
+}
+
+void AlphaAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  if (TM.getSubtarget<AlphaSubtarget>().hasCT())
+    O << "\t.arch ev6\n"; //This might need to be ev67, so leave this test here
+  else
+    O << "\t.arch ev6\n";
+  O << "\t.set noat\n";  // keep the assembler from implicitly using $at
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool AlphaAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode) {
+  printOperand(MI, OpNo);  // NOTE(review): ExtraCode modifiers are silently ignored here
+  return false;  // false = operand was printed
+}
+
+bool AlphaAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                            unsigned OpNo,
+                                            unsigned AsmVariant,
+                                            const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  O << "0(";  // memory operands are printed as a zero-offset base: "0(reg)"
+  printOperand(MI, OpNo);
+  O << ")";
+  return false;  // false = operand was printed
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeAlphaAsmPrinter() { 
+  RegisterAsmPrinter<AlphaAsmPrinter> X(TheAlphaTarget);  // hooks the printer into the target registry
+}
diff --git a/lib/Target/Alpha/AsmPrinter/CMakeLists.txt b/lib/Target/Alpha/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..992c218
--- /dev/null
+++ b/lib/Target/Alpha/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAlphaAsmPrinter
+  AlphaAsmPrinter.cpp
+  )
+add_dependencies(LLVMAlphaAsmPrinter AlphaCodeGenTable_gen)
diff --git a/lib/Target/Alpha/AsmPrinter/Makefile b/lib/Target/Alpha/AsmPrinter/Makefile
new file mode 100644
index 0000000..3c64a3c
--- /dev/null
+++ b/lib/Target/Alpha/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Alpha/AsmPrinter/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMAlphaAsmPrinter
+
+# Hack: we need to include 'main' alpha target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Alpha/CMakeLists.txt b/lib/Target/Alpha/CMakeLists.txt
new file mode 100644
index 0000000..b4f41ae
--- /dev/null
+++ b/lib/Target/Alpha/CMakeLists.txt
@@ -0,0 +1,28 @@
+set(LLVM_TARGET_DEFINITIONS Alpha.td)
+
+tablegen(AlphaGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(AlphaGenRegisterNames.inc -gen-register-enums)
+tablegen(AlphaGenRegisterInfo.inc -gen-register-desc)
+tablegen(AlphaGenInstrNames.inc -gen-instr-enums)
+tablegen(AlphaGenInstrInfo.inc -gen-instr-desc)
+tablegen(AlphaGenCodeEmitter.inc -gen-emitter)
+tablegen(AlphaGenAsmWriter.inc -gen-asm-writer)
+tablegen(AlphaGenDAGISel.inc -gen-dag-isel)
+tablegen(AlphaGenCallingConv.inc -gen-callingconv)
+tablegen(AlphaGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(AlphaCodeGen
+  AlphaBranchSelector.cpp
+  AlphaCodeEmitter.cpp
+  AlphaInstrInfo.cpp
+  AlphaISelDAGToDAG.cpp
+  AlphaISelLowering.cpp
+  AlphaJITInfo.cpp
+  AlphaLLRP.cpp
+  AlphaMCAsmInfo.cpp
+  AlphaRegisterInfo.cpp
+  AlphaSubtarget.cpp
+  AlphaTargetMachine.cpp
+  )
+
+target_link_libraries (LLVMAlphaCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/Alpha/Makefile b/lib/Target/Alpha/Makefile
new file mode 100644
index 0000000..54d53ab
--- /dev/null
+++ b/lib/Target/Alpha/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/Alpha/Makefile -------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMAlphaCodeGen
+TARGET = Alpha
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = AlphaGenRegisterInfo.h.inc AlphaGenRegisterNames.inc \
+                AlphaGenRegisterInfo.inc AlphaGenInstrNames.inc \
+                AlphaGenInstrInfo.inc AlphaGenCodeEmitter.inc \
+                AlphaGenAsmWriter.inc AlphaGenDAGISel.inc \
+                AlphaGenCallingConv.inc AlphaGenSubtarget.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Alpha/README.txt b/lib/Target/Alpha/README.txt
new file mode 100644
index 0000000..9ae1517
--- /dev/null
+++ b/lib/Target/Alpha/README.txt
@@ -0,0 +1,42 @@
+***
+
+add gcc builtins for alpha instructions
+
+
+***
+
+custom expand byteswap into nifty 
+extract/insert/mask byte/word/longword/quadword low/high
+sequences
+
+***
+
+see if any of the extract/insert/mask operations can be added
+
+***
+
+match more interesting things for cmovlbc cmovlbs (move if low bit clear/set)
+
+***
+
+lower srem and urem
+
+remq(i,j):  i - (j * divq(i,j)) if j != 0
+remqu(i,j): i - (j * divqu(i,j)) if j != 0
+reml(i,j):  i - (j * divl(i,j)) if j != 0
+remlu(i,j): i - (j * divlu(i,j)) if j != 0
+
+***
+
+add crazy vector instructions (MVI):
+
+(MIN|MAX)(U|S)(B8|W4) min and max, signed and unsigned, byte and word
+PKWB, UNPKBW pack/unpack word to byte
+PKLB UNPKBL pack/unpack long to byte
+PERR pixel error (sum across bytes of bytewise abs(i8v8 a - i8v8 b))
+
+cmpbytes bytewise cmpeq of i8v8 a and i8v8 b (not part of MVI extensions)
+
+this has some good examples for other operations that can be synthesised well 
+from these rather meager vector ops (such as saturating add).
+http://www.alphalinux.org/docs/MVI-full.html
diff --git a/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
new file mode 100644
index 0000000..f7099b9
--- /dev/null
+++ b/lib/Target/Alpha/TargetInfo/AlphaTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- AlphaTargetInfo.cpp - Alpha Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Alpha.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+llvm::Target llvm::TheAlphaTarget;
+
+extern "C" void LLVMInitializeAlphaTargetInfo() { 
+  RegisterTarget<Triple::alpha, /*HasJIT=*/true>
+    X(TheAlphaTarget, "alpha", "Alpha [experimental]");
+}
diff --git a/lib/Target/Alpha/TargetInfo/CMakeLists.txt b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..2a7291b
--- /dev/null
+++ b/lib/Target/Alpha/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMAlphaInfo
+  AlphaTargetInfo.cpp
+  )
+
+add_dependencies(LLVMAlphaInfo AlphaCodeGenTable_gen)
diff --git a/lib/Target/Alpha/TargetInfo/Makefile b/lib/Target/Alpha/TargetInfo/Makefile
new file mode 100644
index 0000000..de01d7f
--- /dev/null
+++ b/lib/Target/Alpha/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Alpha/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMAlphaInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Blackfin/AsmPrinter/BlackfinAsmPrinter.cpp b/lib/Target/Blackfin/AsmPrinter/BlackfinAsmPrinter.cpp
new file mode 100644
index 0000000..fe13e14
--- /dev/null
+++ b/lib/Target/Blackfin/AsmPrinter/BlackfinAsmPrinter.cpp
@@ -0,0 +1,151 @@
+//===-- BlackfinAsmPrinter.cpp - Blackfin LLVM assembly writer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format BLACKFIN assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Blackfin.h"
+#include "BlackfinInstrInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+namespace {
+  class BlackfinAsmPrinter : public AsmPrinter {
+  public:
+    BlackfinAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                       MCContext &Ctx, MCStreamer &Streamer,
+                       const MCAsmInfo *MAI)
+      : AsmPrinter(O, TM, Ctx, Streamer, MAI) {}
+
+    virtual const char *getPassName() const {
+      return "Blackfin Assembly Printer";
+    }
+
+    void printOperand(const MachineInstr *MI, int opNum);
+    void printMemoryOperand(const MachineInstr *MI, int opNum);
+    void printInstruction(const MachineInstr *MI);  // autogenerated.
+    static const char *getRegisterName(unsigned RegNo);
+
+    void EmitInstruction(const MachineInstr *MI) {
+      printInstruction(MI);
+      OutStreamer.AddBlankLine();
+    }
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode);
+  };
+} // end of anonymous namespace
+
+#include "BlackfinGenAsmWriter.inc"
+
+extern "C" void LLVMInitializeBlackfinAsmPrinter() {
+  RegisterAsmPrinter<BlackfinAsmPrinter> X(TheBlackfinTarget);
+}
+
+void BlackfinAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+  const MachineOperand &MO = MI->getOperand (opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           "Virtual registers should be already mapped!");
+    O << getRegisterName(MO.getReg());
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    O << *GetGlobalValueSymbol(MO.getGlobal());
+    printOffset(MO.getOffset());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getIndex();
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    break;
+  default:
+    llvm_unreachable("<unknown operand type>");
+    break;
+  }
+}
+
+void BlackfinAsmPrinter::printMemoryOperand(const MachineInstr *MI, int opNum) {
+  printOperand(MI, opNum);
+
+  if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0)
+    return;
+
+  O << " + ";
+  printOperand(MI, opNum+1);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool BlackfinAsmPrinter::PrintAsmOperand(const MachineInstr *MI,
+                                         unsigned OpNo,
+                                         unsigned AsmVariant,
+                                         const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'r':
+      break;
+    }
+  }
+
+  printOperand(MI, OpNo);
+
+  return false;
+}
+
+bool BlackfinAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                               unsigned OpNo,
+                                               unsigned AsmVariant,
+                                               const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true;  // Unknown modifier
+
+  O << '[';
+  printOperand(MI, OpNo);
+  O << ']';
+
+  return false;
+}
diff --git a/lib/Target/Blackfin/AsmPrinter/CMakeLists.txt b/lib/Target/Blackfin/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..795aebf
--- /dev/null
+++ b/lib/Target/Blackfin/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMBlackfinAsmPrinter
+  BlackfinAsmPrinter.cpp
+  )
+add_dependencies(LLVMBlackfinAsmPrinter BlackfinCodeGenTable_gen)
diff --git a/lib/Target/Blackfin/AsmPrinter/Makefile b/lib/Target/Blackfin/AsmPrinter/Makefile
new file mode 100644
index 0000000..091d4df
--- /dev/null
+++ b/lib/Target/Blackfin/AsmPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/Blackfin/AsmPrinter/Makefile -------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMBlackfinAsmPrinter
+
+# Hack: we need to include 'main' Blackfin target directory to grab private
+# headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Blackfin/Blackfin.h b/lib/Target/Blackfin/Blackfin.h
new file mode 100644
index 0000000..ec1fa86
--- /dev/null
+++ b/lib/Target/Blackfin/Blackfin.h
@@ -0,0 +1,38 @@
+//=== Blackfin.h - Top-level interface for Blackfin backend -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Blackfin back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_BLACKFIN_H
+#define TARGET_BLACKFIN_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+  class FunctionPass;
+  class BlackfinTargetMachine;
+
+  FunctionPass *createBlackfinISelDag(BlackfinTargetMachine &TM,
+                                      CodeGenOpt::Level OptLevel);
+  extern Target TheBlackfinTarget;
+
+} // end namespace llvm
+
+// Defines symbolic names for Blackfin registers.  This defines a mapping from
+// register name to register number.
+#include "BlackfinGenRegisterNames.inc"
+
+// Defines symbolic names for the Blackfin instructions.
+#include "BlackfinGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Blackfin/Blackfin.td b/lib/Target/Blackfin/Blackfin.td
new file mode 100644
index 0000000..cd90962
--- /dev/null
+++ b/lib/Target/Blackfin/Blackfin.td
@@ -0,0 +1,202 @@
+//===- Blackfin.td - Describe the Blackfin Target Machine --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Blackfin Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureSDRAM : SubtargetFeature<"sdram", "sdram", "true",
+    "Build for SDRAM">;
+
+def FeatureICPLB : SubtargetFeature<"icplb", "icplb", "true",
+    "Assume instruction cache lookaside buffers are enabled at runtime">;
+
+//===----------------------------------------------------------------------===//
+// Bugs in the silicon become workarounds in the compiler.
+// See http://www.analog.com/ for the full list of IC anomalies.
+//===----------------------------------------------------------------------===//
+
+def WA_MI_SHIFT : SubtargetFeature<"mi-shift-anomaly","wa_mi_shift", "true",
+    "Work around 05000074 - "
+    "Multi-Issue Instruction with dsp32shiftimm and P-reg Store">;
+
+def WA_CSYNC : SubtargetFeature<"csync-anomaly","wa_csync", "true",
+    "Work around 05000244 - "
+    "If I-Cache Is On, CSYNC/SSYNC/IDLE Around Change of Control">;
+
+def WA_SPECLD : SubtargetFeature<"specld-anomaly","wa_specld", "true",
+    "Work around 05000245 - "
+    "Access in the Shadow of a Conditional Branch">;
+
+def WA_HWLOOP : SubtargetFeature<"hwloop-anomaly","wa_hwloop", "true",
+    "Work around 05000257 - "
+    "Interrupt/Exception During Short Hardware Loop">;
+
+def WA_MMR_STALL : SubtargetFeature<"mmr-stall-anomaly","wa_mmr_stall", "true",
+    "Work around 05000283 - "
+    "System MMR Write Is Stalled Indefinitely when Killed">;
+
+def WA_LCREGS : SubtargetFeature<"lcregs-anomaly","wa_lcregs", "true",
+    "Work around 05000312 - "
+    "SSYNC, CSYNC, or Loads to LT, LB and LC Registers Are Interrupted">;
+
+def WA_KILLED_MMR : SubtargetFeature<"killed-mmr-anomaly",
+                                     "wa_killed_mmr", "true",
+    "Work around 05000315 - "
+    "Killed System MMR Write Completes Erroneously on Next System MMR Access">;
+
+def WA_RETS : SubtargetFeature<"rets-anomaly", "wa_rets", "true",
+    "Work around 05000371 - "
+    "Possible RETS Register Corruption when Subroutine Is under 5 Cycles">;
+
+def WA_IND_CALL : SubtargetFeature<"ind-call-anomaly", "wa_ind_call", "true",
+    "Work around 05000426 - "
+    "Speculative Fetches of Indirect-Pointer Instructions">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "BlackfinRegisterInfo.td"
+include "BlackfinCallingConv.td"
+include "BlackfinIntrinsics.td"
+include "BlackfinInstrInfo.td"
+
+def BlackfinInstrInfo : InstrInfo {}
+
+//===----------------------------------------------------------------------===//
+// Blackfin processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, string Suffix, list<SubtargetFeature> Features>
+ : Processor<!strconcat(Name, Suffix), NoItineraries, Features>;
+
+def : Proc<"generic", "", []>;
+
+multiclass Core<string Name,string Suffix,
+                list<SubtargetFeature> Features> {
+  def : Proc<Name, Suffix, Features>;
+  def : Proc<Name, "", Features>;
+  def : Proc<Name, "-none", []>;
+}
+
+multiclass CoreEdinburgh<string Name>
+      : Core<Name, "-0.6", [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS]> {
+  def : Proc<Name, "-0.5",
+        [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+         WA_RETS]>;
+  def : Proc<Name, "-0.4",
+        [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+         WA_KILLED_MMR, WA_RETS]>;
+  def : Proc<Name, "-0.3",
+        [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+         WA_KILLED_MMR, WA_RETS]>;
+  def : Proc<Name, "-any",
+        [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+         WA_KILLED_MMR, WA_RETS]>;
+}
+multiclass CoreBraemar<string Name>
+       : Core<Name, "-0.3",
+         [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]> {
+  def  : Proc<Name, "-0.2",
+         [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+          WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+  def  : Proc<Name, "-any",
+         [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+          WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreStirling<string Name>
+      : Core<Name, "-0.5", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+  def : Proc<Name, "-0.4",
+        [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]>;
+  def : Proc<Name, "-0.3",
+        [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+         WA_RETS, WA_IND_CALL]>;
+  def : Proc<Name, "-any",
+        [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+         WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreMoab<string Name>
+      : Core<Name, "-0.3", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+  def : Proc<Name, "-0.2", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+  def : Proc<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+  def : Proc<Name, "-0.0",
+        [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]>;
+  def : Proc<Name, "-any",
+        [WA_MI_SHIFT, WA_SPECLD, WA_LCREGS, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreTeton<string Name>
+      : Core<Name, "-0.5",
+        [WA_MI_SHIFT, WA_SPECLD, WA_MMR_STALL, WA_LCREGS, WA_KILLED_MMR,
+         WA_RETS, WA_IND_CALL]> {
+  def : Proc<Name, "-0.3",
+        [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+         WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+  def : Proc<Name, "-any",
+        [WA_MI_SHIFT, WA_CSYNC, WA_SPECLD, WA_HWLOOP, WA_MMR_STALL, WA_LCREGS,
+         WA_KILLED_MMR, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreKookaburra<string Name>
+      : Core<Name, "-0.2", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+  def : Proc<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+  def : Proc<Name, "-0.0", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+  def : Proc<Name, "-any", [WA_MI_SHIFT, WA_SPECLD, WA_RETS, WA_IND_CALL]>;
+}
+multiclass CoreMockingbird<string Name>
+      : Core<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+  def : Proc<Name, "-0.0", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+  def : Proc<Name, "-any", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+}
+multiclass CoreBrodie<string Name>
+      : Core<Name, "-0.1", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]> {
+  def : Proc<Name, "-0.0", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+  def : Proc<Name, "-any", [WA_MI_SHIFT, WA_SPECLD, WA_IND_CALL]>;
+}
+
+defm BF512 : CoreBrodie<"bf512">;
+defm BF514 : CoreBrodie<"bf514">;
+defm BF516 : CoreBrodie<"bf516">;
+defm BF518 : CoreBrodie<"bf518">;
+defm BF522 : CoreMockingbird<"bf522">;
+defm BF523 : CoreKookaburra<"bf523">;
+defm BF524 : CoreMockingbird<"bf524">;
+defm BF525 : CoreKookaburra<"bf525">;
+defm BF526 : CoreMockingbird<"bf526">;
+defm BF527 : CoreKookaburra<"bf527">;
+defm BF531 : CoreEdinburgh<"bf531">;
+defm BF532 : CoreEdinburgh<"bf532">;
+defm BF533 : CoreEdinburgh<"bf533">;
+defm BF534 : CoreBraemar<"bf534">;
+defm BF536 : CoreBraemar<"bf536">;
+defm BF537 : CoreBraemar<"bf537">;
+defm BF538 : CoreStirling<"bf538">;
+defm BF539 : CoreStirling<"bf539">;
+defm BF542 : CoreMoab<"bf542">;
+defm BF544 : CoreMoab<"bf544">;
+defm BF548 : CoreMoab<"bf548">;
+defm BF549 : CoreMoab<"bf549">;
+defm BF561 : CoreTeton<"bf561">;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Blackfin : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = BlackfinInstrInfo;
+}
diff --git a/lib/Target/Blackfin/BlackfinCallingConv.td b/lib/Target/Blackfin/BlackfinCallingConv.td
new file mode 100644
index 0000000..0abc84c
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinCallingConv.td
@@ -0,0 +1,30 @@
+//===--- BlackfinCallingConv.td - Calling Conventions ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Blackfin architectures.
+//
+//===----------------------------------------------------------------------===//
+
+// Blackfin C Calling convention.
+def CC_Blackfin : CallingConv<[
+  CCIfType<[i16], CCPromoteToType<i32>>,
+  CCIfSRet<CCAssignToReg<[P0]>>,
+  CCAssignToReg<[R0, R1, R2]>,
+  CCAssignToStack<4, 4>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Blackfin C return-value convention.
+def RetCC_Blackfin : CallingConv<[
+  CCIfType<[i16], CCPromoteToType<i32>>,
+  CCAssignToReg<[R0, R1]>
+]>;
diff --git a/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
new file mode 100644
index 0000000..2c9cc60
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinISelDAGToDAG.cpp
@@ -0,0 +1,189 @@
+//===- BlackfinISelDAGToDAG.cpp - A dag to dag inst selector for Blackfin -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the Blackfin target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Blackfin.h"
+#include "BlackfinISelLowering.h"
+#include "BlackfinTargetMachine.h"
+#include "BlackfinRegisterInfo.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+/// BlackfinDAGToDAGISel - Blackfin specific code to select blackfin machine
+/// instructions for SelectionDAG operations.
+namespace {
+  class BlackfinDAGToDAGISel : public SelectionDAGISel {
+    /// Subtarget - Keep a pointer to the Blackfin Subtarget around so that we
+    /// can make the right decision when generating code for different targets.
+    //const BlackfinSubtarget &Subtarget;
+  public:
+    BlackfinDAGToDAGISel(BlackfinTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel) {}
+
+    virtual void InstructionSelect();
+
+    virtual const char *getPassName() const {
+      return "Blackfin DAG->DAG Pattern Instruction Selection";
+    }
+
+    // Include the pieces autogenerated from the target description.
+#include "BlackfinGenDAGISel.inc"
+
+  private:
+    SDNode *Select(SDNode *N);
+    bool SelectADDRspii(SDNode *Op, SDValue Addr,
+                        SDValue &Base, SDValue &Offset);
+
+    // Walk the DAG after instruction selection, fixing register class issues.
+    void FixRegisterClasses(SelectionDAG &DAG);
+
+    const BlackfinInstrInfo &getInstrInfo() {
+      return *static_cast<const BlackfinTargetMachine&>(TM).getInstrInfo();
+    }
+    const BlackfinRegisterInfo *getRegisterInfo() {
+      return static_cast<const BlackfinTargetMachine&>(TM).getRegisterInfo();
+    }
+  };
+}  // end anonymous namespace
+
+FunctionPass *llvm::createBlackfinISelDag(BlackfinTargetMachine &TM,
+                                          CodeGenOpt::Level OptLevel) {
+  return new BlackfinDAGToDAGISel(TM, OptLevel);
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void BlackfinDAGToDAGISel::InstructionSelect() {
+  // Select target instructions for the DAG.
+  SelectRoot(*CurDAG);
+  DEBUG(errs() << "Selected selection DAG before regclass fixup:\n");
+  DEBUG(CurDAG->dump());
+  FixRegisterClasses(*CurDAG);
+}
+
+SDNode *BlackfinDAGToDAGISel::Select(SDNode *N) {
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::FrameIndex: {
+    // Selects to ADDpp FI, 0 which in turn will become ADDimm7 SP, imm or ADDpp
+    // SP, Px
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32);
+    return CurDAG->SelectNodeTo(N, BF::ADDpp, MVT::i32, TFI,
+                                CurDAG->getTargetConstant(0, MVT::i32));
+  }
+  }
+
+  return SelectCode(N);
+}
+
+bool BlackfinDAGToDAGISel::SelectADDRspii(SDNode *Op,
+                                          SDValue Addr,
+                                          SDValue &Base,
+                                          SDValue &Offset) {
+  FrameIndexSDNode *FIN = 0;
+  if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = 0;
+    if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) &&
+        (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) &&
+        (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+      // Constant positive word offset from frame index
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+static inline bool isCC(const TargetRegisterClass *RC) {
+  return RC == &BF::AnyCCRegClass || BF::AnyCCRegClass.hasSubClass(RC);
+}
+
+static inline bool isDCC(const TargetRegisterClass *RC) {
+  return RC == &BF::DRegClass || BF::DRegClass.hasSubClass(RC) || isCC(RC);
+}
+
+static void UpdateNodeOperand(SelectionDAG &DAG,
+                              SDNode *N,
+                              unsigned Num,
+                              SDValue Val) {
+  SmallVector<SDValue, 8> ops(N->op_begin(), N->op_end());
+  ops[Num] = Val;
+  SDValue New = DAG.UpdateNodeOperands(SDValue(N, 0), ops.data(), ops.size());
+  DAG.ReplaceAllUsesWith(N, New.getNode());
+}
+
+// After instruction selection, insert COPY_TO_REGCLASS nodes to help in
+// choosing the proper register classes.
+void BlackfinDAGToDAGISel::FixRegisterClasses(SelectionDAG &DAG) {
+  const BlackfinInstrInfo &TII = getInstrInfo();
+  const BlackfinRegisterInfo *TRI = getRegisterInfo();
+  DAG.AssignTopologicalOrder();
+  HandleSDNode Dummy(DAG.getRoot());
+
+  for (SelectionDAG::allnodes_iterator NI = DAG.allnodes_begin();
+       NI != DAG.allnodes_end(); ++NI) {
+    if (NI->use_empty() || !NI->isMachineOpcode())
+      continue;
+    const TargetInstrDesc &DefTID = TII.get(NI->getMachineOpcode());
+    for (SDNode::use_iterator UI = NI->use_begin(); !UI.atEnd(); ++UI) {
+      if (!UI->isMachineOpcode())
+        continue;
+
+      if (UI.getUse().getResNo() >= DefTID.getNumDefs())
+        continue;
+      const TargetRegisterClass *DefRC =
+        DefTID.OpInfo[UI.getUse().getResNo()].getRegClass(TRI);
+
+      const TargetInstrDesc &UseTID = TII.get(UI->getMachineOpcode());
+      if (UseTID.getNumDefs()+UI.getOperandNo() >= UseTID.getNumOperands())
+        continue;
+      const TargetRegisterClass *UseRC =
+        UseTID.OpInfo[UseTID.getNumDefs()+UI.getOperandNo()].getRegClass(TRI);
+      if (!DefRC || !UseRC)
+        continue;
+      // We cannot copy CC <-> !(CC/D)
+      if ((isCC(DefRC) && !isDCC(UseRC)) || (isCC(UseRC) && !isDCC(DefRC))) {
+        SDNode *Copy =
+          DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                             NI->getDebugLoc(),
+                             MVT::i32,
+                             UI.getUse().get(),
+                             DAG.getTargetConstant(BF::DRegClassID, MVT::i32));
+        UpdateNodeOperand(DAG, *UI, UI.getOperandNo(), SDValue(Copy, 0));
+      }
+    }
+  }
+  DAG.setRoot(Dummy.getValue());
+}
+
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.cpp b/lib/Target/Blackfin/BlackfinISelLowering.cpp
new file mode 100644
index 0000000..269707a
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinISelLowering.cpp
@@ -0,0 +1,613 @@
+//===- BlackfinISelLowering.cpp - Blackfin DAG Lowering Implementation ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Blackfin uses to lower LLVM code
+// into a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinISelLowering.h"
+#include "BlackfinTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+// Construct the Blackfin lowering object: declare the legal register classes
+// and mark which operations need Promote/Expand/Custom handling.
+BlackfinTargetLowering::BlackfinTargetLowering(TargetMachine &TM)
+  : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+  setShiftAmountType(MVT::i16);
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setStackPointerRegisterToSaveRestore(BF::SP);
+  setIntDivIsCheap(false);
+
+  // Set up the legal register classes.
+  addRegisterClass(MVT::i32, BF::DRegisterClass);
+  addRegisterClass(MVT::i16, BF::D16RegisterClass);
+
+  computeRegisterProperties();
+
+  // Blackfin doesn't have i1 loads or stores
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // Addresses are wrapped in a BFISD::Wrapper node (see LowerGlobalAddress
+  // and LowerJumpTable).
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i32, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT,     MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,     MVT::Other, Expand);
+
+  // i16 registers don't do much
+  setOperationAction(ISD::AND,   MVT::i16, Promote);
+  setOperationAction(ISD::OR,    MVT::i16, Promote);
+  setOperationAction(ISD::XOR,   MVT::i16, Promote);
+  setOperationAction(ISD::CTPOP, MVT::i16, Promote);
+  // The expansion of CTLZ/CTTZ uses AND/OR, so we might as well promote
+  // immediately.
+  setOperationAction(ISD::CTLZ,  MVT::i16, Promote);
+  setOperationAction(ISD::CTTZ,  MVT::i16, Promote);
+  setOperationAction(ISD::SETCC, MVT::i16, Promote);
+
+  // Blackfin has no division
+  setOperationAction(ISD::SDIV,    MVT::i16, Expand);
+  setOperationAction(ISD::SDIV,    MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i16, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM,    MVT::i16, Expand);
+  setOperationAction(ISD::SREM,    MVT::i32, Expand);
+  setOperationAction(ISD::UDIV,    MVT::i16, Expand);
+  setOperationAction(ISD::UDIV,    MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i16, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM,    MVT::i16, Expand);
+  setOperationAction(ISD::UREM,    MVT::i32, Expand);
+
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
+  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
+
+  // No native carry-in operations; ADDE/SUBE are custom-expanded in LowerADDE.
+  setOperationAction(ISD::ADDE, MVT::i32, Custom);
+  setOperationAction(ISD::SUBE, MVT::i32, Custom);
+
+  // Blackfin has no intrinsics for these particular operations.
+  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // i32 has native CTPOP, but not CTLZ/CTTZ
+  setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ, MVT::i32, Expand);
+
+  // READCYCLECOUNTER needs special type legalization.
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
+
+  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+}
+
+// Map Blackfin-specific DAG opcodes to printable names for debug dumps.
+// Returns null for opcodes that are not BFISD nodes.
+const char *BlackfinTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  if (Opcode == BFISD::CALL)
+    return "BFISD::CALL";
+  if (Opcode == BFISD::RET_FLAG)
+    return "BFISD::RET_FLAG";
+  if (Opcode == BFISD::Wrapper)
+    return "BFISD::Wrapper";
+  return 0;
+}
+
+/// getSetCCResultType - Return the value type to use for the result of SETCC.
+MVT::SimpleValueType BlackfinTargetLowering::getSetCCResultType(EVT VT) const {
+  // SETCC always sets the CC register. Technically that is an i1 register, but
+  // that type is not legal, so we treat it as an i32 register.
+  return MVT::i32;
+}
+
+// Lower a GlobalAddress node by wrapping the target global address in a
+// BFISD::Wrapper node.
+SDValue BlackfinTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                   SelectionDAG &DAG) {
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue TGA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+  return DAG.getNode(BFISD::Wrapper, Op.getDebugLoc(), MVT::i32, TGA);
+}
+
+// Lower a JumpTable node by wrapping the target jump-table reference in a
+// BFISD::Wrapper node.
+SDValue BlackfinTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+  int Index = cast<JumpTableSDNode>(Op)->getIndex();
+  SDValue TJT = DAG.getTargetJumpTable(Index, MVT::i32);
+  return DAG.getNode(BFISD::Wrapper, Op.getDebugLoc(), MVT::i32, TJT);
+}
+
+// LowerFormalArguments - Lower incoming arguments: values assigned to
+// registers by CC_Blackfin are copied out of their physregs into fresh
+// virtual registers; stack-passed values are loaded from fixed frame slots.
+SDValue
+BlackfinTargetLowering::LowerFormalArguments(SDValue Chain,
+                                             CallingConv::ID CallConv, bool isVarArg,
+                                            const SmallVectorImpl<ISD::InputArg>
+                                               &Ins,
+                                             DebugLoc dl, SelectionDAG &DAG,
+                                             SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+  CCInfo.AllocateStack(12, 4);  // ABI requires 12 bytes stack space
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Blackfin);
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      // Arguments arriving in P0 use the pointer class; all other register
+      // arguments use the D class.
+      TargetRegisterClass *RC = VA.getLocReg() == BF::P0 ?
+        BF::PRegisterClass : BF::DRegisterClass;
+      assert(RC->contains(VA.getLocReg()) && "Unexpected regclass in CCState");
+      assert(RC->hasType(RegVT) && "Unexpected regclass in CCState");
+
+      // Copy the incoming physreg into a fresh virtual register.
+      unsigned Reg = MF.getRegInfo().createVirtualRegister(RC);
+      MF.getRegInfo().addLiveIn(VA.getLocReg(), Reg);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+
+      if (VA.getLocInfo() != CCValAssign::Full)
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+      InVals.push_back(ArgValue);
+    } else {
+      assert(VA.isMemLoc() && "CCValAssign must be RegLoc or MemLoc");
+      // Stack-passed argument: create a fixed object at the ABI-specified
+      // offset and load the value from it.
+      unsigned ObjSize = VA.getLocVT().getStoreSize();
+      int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(),
+                                      true, false);
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
+    }
+  }
+
+  return Chain;
+}
+
+// LowerReturn - Lower a return: copy return values into the registers
+// assigned by RetCC_Blackfin, glue the copies together, and emit a
+// BFISD::RET_FLAG node.
+SDValue
+BlackfinTargetLowering::LowerReturn(SDValue Chain,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
+                 RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Blackfin);
+
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue Opi = Outs[i].Val;
+
+    // Expand to i32 if necessary
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Opi = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Opi);
+      break;
+    case CCValAssign::ZExt:
+      Opi = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Opi);
+      break;
+    case CCValAssign::AExt:
+      Opi = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Opi);
+      break;
+    }
+    // Thread the (initially empty) flag through each copy so that all emitted
+    // copies are glued together, matching the pattern used in LowerCall.
+    // Previously SDValue() was passed here, leaving the copies un-glued.
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Opi, Flag);
+    Flag = Chain.getValue(1);
+  }
+
+  // Glue the return to the last copy (if any) so the scheduler cannot
+  // separate the copies from the return.
+  if (Flag.getNode()) {
+    return DAG.getNode(BFISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+  } else {
+    return DAG.getNode(BFISD::RET_FLAG, dl, MVT::Other, Chain);
+  }
+}
+
+// LowerCall - Lower an outgoing call: arguments are assigned by CC_Blackfin
+// to registers or to stack slots above the 12 reserved ABI bytes; results
+// are copied back out of the registers assigned by RetCC_Blackfin.
+SDValue
+BlackfinTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  bool &isTailCall,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) {
+  // Blackfin target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs,
+                 *DAG.getContext());
+  CCInfo.AllocateStack(12, 4);  // ABI requires 12 bytes stack space
+  CCInfo.AnalyzeCallOperands(Outs, CC_Blackfin);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  unsigned ArgsSize = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = Outs[i].Val;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc() && "CCValAssign must be RegLoc or MemLoc");
+      int Offset = VA.getLocMemOffset();
+      assert(Offset%4 == 0 && "Unaligned LocMemOffset");
+      assert(VA.getLocVT()==MVT::i32 && "Illegal CCValAssign type");
+      // Store the argument at SP + Offset; the stores are collected and
+      // merged into a single TokenFactor below.
+      SDValue SPN = DAG.getCopyFromReg(Chain, dl, BF::SP, MVT::i32);
+      SDValue OffsetN = DAG.getIntPtrConstant(Offset);
+      OffsetN = DAG.getNode(ISD::ADD, dl, MVT::i32, SPN, OffsetN);
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, OffsetN,
+                                         PseudoSourceValue::getStack(),
+                                         Offset));
+    }
+  }
+
+  // Transform all store nodes into one single node because
+  // all store nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  // Emit the call node. The flag operand is only present when register
+  // copies were emitted above.
+  std::vector<EVT> NodeTys;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+  SDValue Ops[] = { Chain, Callee, InFlag };
+  Chain = DAG.getNode(BFISD::CALL, dl, NodeTys, Ops,
+                      InFlag.getNode() ? 3 : 2);
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CallConv, isVarArg, DAG.getTarget(), RVLocs,
+                 *DAG.getContext());
+
+  RVInfo.AnalyzeCallResult(Ins, RetCC_Blackfin);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &RV = RVLocs[i];
+    unsigned Reg = RV.getLocReg();
+
+    // CopyFromReg with a glue operand produces (value, chain, flag).
+    Chain = DAG.getCopyFromReg(Chain, dl, Reg,
+                               RVLocs[i].getLocVT(), InFlag);
+    SDValue Val = Chain.getValue(0);
+    InFlag = Chain.getValue(2);
+    Chain = Chain.getValue(1);
+
+    // Callee is responsible for extending any i16 return values.
+    switch (RV.getLocInfo()) {
+    case CCValAssign::SExt:
+      Val = DAG.getNode(ISD::AssertSext, dl, RV.getLocVT(), Val,
+                        DAG.getValueType(RV.getValVT()));
+      break;
+    case CCValAssign::ZExt:
+      Val = DAG.getNode(ISD::AssertZext, dl, RV.getLocVT(), Val,
+                        DAG.getValueType(RV.getValVT()));
+      break;
+    default:
+      break;
+    }
+
+    // Truncate to valtype
+    if (RV.getLocInfo() != CCValAssign::Full)
+      Val = DAG.getNode(ISD::TRUNCATE, dl, RV.getValVT(), Val);
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+// Expansion of ADDE / SUBE. This is a bit involved since blackfin doesn't have
+// add-with-carry instructions.
+SDValue BlackfinTargetLowering::LowerADDE(SDValue Op, SelectionDAG &DAG) {
+  // Operands: lhs, rhs, carry-in (AC0 flag)
+  // Results: sum, carry-out (AC0 flag)
+  DebugLoc dl = Op.getDebugLoc();
+
+  // ADDE uses ADD, SUBE uses SUB; the carry plumbing below is shared.
+  unsigned Opcode = Op.getOpcode()==ISD::ADDE ? BF::ADD : BF::SUB;
+
+  // zext incoming carry flag in AC0 to 32 bits
+  SDNode* CarryIn = DAG.getMachineNode(BF::MOVE_cc_ac0, dl, MVT::i32,
+                                       /* flag= */ Op.getOperand(2));
+  CarryIn = DAG.getMachineNode(BF::MOVECC_zext, dl, MVT::i32,
+                               SDValue(CarryIn, 0));
+
+  // Add operands, produce sum and carry flag
+  SDNode *Sum = DAG.getMachineNode(Opcode, dl, MVT::i32, MVT::Flag,
+                                   Op.getOperand(0), Op.getOperand(1));
+
+  // Store intermediate carry from Sum
+  SDNode* Carry1 = DAG.getMachineNode(BF::MOVE_cc_ac0, dl, MVT::i32,
+                                      /* flag= */ SDValue(Sum, 1));
+
+  // Add incoming carry, again producing an output flag
+  Sum = DAG.getMachineNode(Opcode, dl, MVT::i32, MVT::Flag,
+                           SDValue(Sum, 0), SDValue(CarryIn, 0));
+
+  // Update AC0 with the intermediate carry, producing a flag.
+  SDNode *CarryOut = DAG.getMachineNode(BF::OR_ac0_cc, dl, MVT::Flag,
+                                        SDValue(Carry1, 0));
+
+  // Compose (i32, flag) pair
+  SDValue ops[2] = { SDValue(Sum, 0), SDValue(CarryOut, 0) };
+  return DAG.getMergeValues(ops, 2, dl);
+}
+
+// LowerOperation - Dispatch custom lowering for the node kinds marked
+// 'Custom' in the constructor.
+SDValue BlackfinTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default:
+    Op.getNode()->dump();
+    llvm_unreachable("Should not custom lower this!");
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    llvm_unreachable("TLS not implemented for Blackfin.");
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+    // Frame & Return address.  Currently unimplemented
+  case ISD::FRAMEADDR:          return SDValue();
+  case ISD::RETURNADDR:         return SDValue();
+  case ISD::ADDE:
+  case ISD::SUBE:               return LowerADDE(Op, DAG);
+  }
+}
+
+// ReplaceNodeResults - Custom type legalization; currently only the i64
+// READCYCLECOUNTER result needs it.
+void
+BlackfinTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue> &Results,
+                                           SelectionDAG &DAG) {
+  DebugLoc dl = N->getDebugLoc();
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Do not know how to custom type legalize this operation!");
+    return;
+  case ISD::READCYCLECOUNTER: {
+    // The low part of the cycle counter is in CYCLES, the high part in
+    // CYCLES2. Reading CYCLES will latch the value of CYCLES2, so we must read
+    // CYCLES2 last.
+    SDValue TheChain = N->getOperand(0);
+    SDValue lo = DAG.getCopyFromReg(TheChain, dl, BF::CYCLES, MVT::i32);
+    SDValue hi = DAG.getCopyFromReg(lo.getValue(1), dl, BF::CYCLES2, MVT::i32);
+    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, lo, hi));
+    // Outgoing chain. If we were to use the chain from lo instead, it would be
+    // possible to entirely eliminate the CYCLES2 read in (i32 (trunc
+    // readcyclecounter)). Unfortunately this could possibly delay the CYCLES2
+    // read beyond the next CYCLES read, leading to invalid results.
+    Results.push_back(hi.getValue(1));
+    return;
+  }
+  }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned BlackfinTargetLowering::getFunctionAlignment(const Function *F) const {
+  // All functions are aligned on 4-byte (2^2) boundaries.
+  return 2;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Blackfin Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+BlackfinTargetLowering::ConstraintType
+BlackfinTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() != 1)
+    return TargetLowering::getConstraintType(Constraint);
+
+  switch (Constraint[0]) {
+    // The standard 'r' constraint and the Blackfin-specific letters below
+    // all name a class of registers.
+  case 'r':
+  case 'a': case 'd': case 'z': case 'D': case 'W': case 'e': case 'b':
+  case 'v': case 'f': case 'c': case 't': case 'u': case 'k': case 'x':
+  case 'y': case 'w':
+    return C_RegisterClass;
+
+    // These letters each name one specific register.
+  case 'A': case 'B': case 'C': case 'Z': case 'Y':
+    return C_Register;
+  }
+
+  // Not implemented: q0-q7, qA. Use {R2} etc instead
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Return register no and class for a C_Register
+/// constraint.
+std::pair<unsigned, const TargetRegisterClass*> BlackfinTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+  typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
+  using namespace BF;
+
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default: break;
+      // Standard constraints
+    case 'r':
+      // 16-bit values go in the D16 class; everything else in D or P.
+      return RCPair(0U, VT == MVT::i16 ? D16RegisterClass : DPRegisterClass);
+
+      // Blackfin-specific register-class constraints
+    case 'a': return RCPair(0U, PRegisterClass);
+    case 'd': return RCPair(0U, DRegisterClass);
+    case 'e': return RCPair(0U, AccuRegisterClass);
+    case 'b': return RCPair(0U, IRegisterClass);
+    case 'v': return RCPair(0U, BRegisterClass);
+    case 'f': return RCPair(0U, MRegisterClass);
+    case 'x': return RCPair(0U, GRRegisterClass);
+    case 'w': return RCPair(0U, ALLRegisterClass);
+
+      // Blackfin-specific single-register constraints
+    case 'A': return RCPair(A0, AccuRegisterClass);
+    case 'B': return RCPair(A1, AccuRegisterClass);
+    case 'C': return RCPair(CC, JustCCRegisterClass);
+    case 'Z': return RCPair(P3, PRegisterClass);
+    case 'Y': return RCPair(P1, PRegisterClass);
+    }
+  }
+
+  // Not implemented: q0-q7, qA. Use {R2} etc instead.
+  // Constraints z, D, W, c, t, u, k, and y use non-existing classes, defer to
+  // getRegClassForInlineAsmConstraint()
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+// getRegClassForInlineAsmConstraint - Return explicit register lists for the
+// constraint letters whose register sets don't exist as register classes
+// (see the note in getRegForInlineAsmConstraint). Each list is terminated
+// by a 0 sentinel consumed by make_vector.
+std::vector<unsigned> BlackfinTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint, EVT VT) const {
+  using namespace BF;
+
+  // Only single-letter constraints are handled here.
+  if (Constraint.size() != 1)
+    return std::vector<unsigned>();
+
+  switch (Constraint[0]) {
+  case 'z': return make_vector<unsigned>(P0, P1, P2, 0);
+  case 'D': return make_vector<unsigned>(R0, R2, R4, R6, 0);
+  case 'W': return make_vector<unsigned>(R1, R3, R5, R7, 0);
+  case 'c': return make_vector<unsigned>(I0, I1, I2, I3,
+                                         B0, B1, B2, B3,
+                                         L0, L1, L2, L3, 0);
+  case 't': return make_vector<unsigned>(LT0, LT1, 0);
+  case 'u': return make_vector<unsigned>(LB0, LB1, 0);
+  case 'k': return make_vector<unsigned>(LC0, LC1, 0);
+  case 'y': return make_vector<unsigned>(RETS, RETN, RETI, RETX, RETE,
+                                         ASTAT, SEQSTAT, USP, 0);
+  }
+
+  return std::vector<unsigned>();
+}
+
+// isOffsetFoldingLegal - Disallow folding offsets into global addresses.
+bool BlackfinTargetLowering::
+isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Blackfin target isn't yet aware of offsets.
+  return false;
+}
diff --git a/lib/Target/Blackfin/BlackfinISelLowering.h b/lib/Target/Blackfin/BlackfinISelLowering.h
new file mode 100644
index 0000000..5f39910
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinISelLowering.h
@@ -0,0 +1,81 @@
+//===- BlackfinISelLowering.h - Blackfin DAG Lowering Interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Blackfin uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFIN_ISELLOWERING_H
+#define BLACKFIN_ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "Blackfin.h"
+
+namespace llvm {
+
+  namespace BFISD {
+    // Blackfin-specific SelectionDAG node opcodes, numbered after the
+    // target-independent opcodes.
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      CALL,                     // A call instruction.
+      RET_FLAG,                 // Return with a flag operand.
+      Wrapper                   // Address wrapper
+    };
+  }
+
+  // BlackfinTargetLowering - Lowering interface for the Blackfin target.
+  // Implementations live in BlackfinISelLowering.cpp.
+  class BlackfinTargetLowering : public TargetLowering {
+    int VarArgsFrameOffset;   // Frame offset to start of varargs area.
+  public:
+    BlackfinTargetLowering(TargetMachine &TM);
+    // SETCC results are treated as i32 (the CC register is not a legal i1).
+    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+    // Entry point for nodes marked 'Custom' in the constructor.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+    // Custom type legalization (currently READCYCLECOUNTER).
+    virtual void ReplaceNodeResults(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG);
+
+    int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+
+    // Inline assembly constraint support.
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+    std::vector<unsigned>
+    getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                      EVT VT) const;
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+    const char *getTargetNodeName(unsigned Opcode) const;
+    unsigned getFunctionAlignment(const Function *F) const;
+
+  private:
+    // Custom lowering helpers called from LowerOperation.
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerADDE(SDValue Op, SelectionDAG &DAG);
+
+    // Calling convention hooks (CC_Blackfin / RetCC_Blackfin).
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+  };
+} // end namespace llvm
+
+#endif    // BLACKFIN_ISELLOWERING_H
diff --git a/lib/Target/Blackfin/BlackfinInstrFormats.td b/lib/Target/Blackfin/BlackfinInstrFormats.td
new file mode 100644
index 0000000..d8e6e25
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrFormats.td
@@ -0,0 +1,34 @@
+//===--- BlackfinInstrFormats.td ---------------------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+// Base class for all Blackfin instructions: carries the encoding field and
+// the common operand/asm/pattern plumbing.
+class InstBfin<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Instruction {
+  // Instruction encoding bits (not yet filled in by any format).
+  field bits<32> Inst;
+
+  // All Blackfin definitions live in the BF namespace.
+  let Namespace = "BF";
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+}
+
+// Single-word (16-bit) instructions. No format-specific encoding is
+// defined yet.
+class F1<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstBfin<outs, ins, asmstr, pattern> {
+}
+
+// Double-word (32-bit) instructions. No format-specific encoding is
+// defined yet.
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstBfin<outs, ins, asmstr, pattern> {
+}
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.cpp b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
new file mode 100644
index 0000000..3fd5d4d
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.cpp
@@ -0,0 +1,280 @@
+//===- BlackfinInstrInfo.cpp - Blackfin Instruction Information -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinInstrInfo.h"
+#include "BlackfinSubtarget.h"
+#include "Blackfin.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "BlackfinGenInstrInfo.inc"
+
+using namespace llvm;
+
+BlackfinInstrInfo::BlackfinInstrInfo(BlackfinSubtarget &ST)
+  : TargetInstrInfoImpl(BlackfinInsts, array_lengthof(BlackfinInsts)), // Table from BlackfinGenInstrInfo.inc.
+    RI(ST, *this),
+    Subtarget(ST) {}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+bool BlackfinInstrInfo::isMoveInstr(const MachineInstr &MI,
+                                    unsigned &SrcReg,
+                                    unsigned &DstReg,
+                                    unsigned &SrcSR,
+                                    unsigned &DstSR) const {
+  SrcSR = DstSR = 0; // No sub-registers.
+  switch (MI.getOpcode()) {
+  case BF::MOVE:        // Plain register move and the CC-flavored move forms
+  case BF::MOVE_ncccc:  // all encode dst in operand 0, src in operand 1.
+  case BF::MOVE_ccncc:
+  case BF::MOVECC_zext:
+  case BF::MOVECC_nz:
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  case BF::SLL16i: // 16-bit shift-left by an immediate of 0 is a plain copy.
+    if (MI.getOperand(2).getImm()!=0)
+      return false;
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned BlackfinInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                                int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case BF::LOAD32fi: // Pseudo FI loads: operand 0 = dst, 1 = frame index,
+  case BF::LOAD16fi: // 2 = byte offset (only offset 0 is a direct slot load).
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stored-to stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned BlackfinInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                               int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case BF::STORE32fi: // Pseudo FI stores: operand 0 = value, 1 = frame index,
+  case BF::STORE16fi: // 2 = byte offset (only offset 0 is a direct slot store).
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned BlackfinInstrInfo::
+InsertBranch(MachineBasicBlock &MBB,
+             MachineBasicBlock *TBB,
+             MachineBasicBlock *FBB,
+             const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME this should probably have a DebugLoc operand
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "Branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, dl, get(BF::JUMPa)).addMBB(TBB);
+    return 1; // One instruction inserted.
+  }
+
+  // Conditional branch.
+  llvm_unreachable("Implement conditional branches!"); // TODO: BF::JUMPcc.
+}
+
+static bool inClass(const TargetRegisterClass &Test,
+                    unsigned Reg,
+                    const TargetRegisterClass *RC) {
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return Test.contains(Reg); // Physreg: membership is decided by the register itself.
+  else
+    return &Test==RC || Test.hasSubClass(RC); // Virtreg: decided by its class RC.
+}
+
+bool BlackfinInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     unsigned DestReg,
+                                     unsigned SrcReg,
+                                     const TargetRegisterClass *DestRC,
+                                     const TargetRegisterClass *SrcRC) const {
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+
+  if (inClass(BF::ALLRegClass, DestReg, DestRC) &&
+      inClass(BF::ALLRegClass, SrcReg,  SrcRC)) {
+    BuildMI(MBB, I, dl, get(BF::MOVE), DestReg).addReg(SrcReg); // Generic 32-bit move.
+    return true;
+  }
+
+  if (inClass(BF::D16RegClass, DestReg, DestRC) &&
+      inClass(BF::D16RegClass, SrcReg,  SrcRC)) {
+    BuildMI(MBB, I, dl, get(BF::SLL16i), DestReg).addReg(SrcReg).addImm(0); // 16-bit copy via shift-by-0.
+    return true;
+  }
+
+  if (inClass(BF::AnyCCRegClass, SrcReg, SrcRC) &&
+      inClass(BF::DRegClass, DestReg, DestRC)) {
+    if (inClass(BF::NotCCRegClass, SrcReg, SrcRC)) {
+      BuildMI(MBB, I, dl, get(BF::MOVENCC_z), DestReg).addReg(SrcReg); // Flag -> D reg...
+      BuildMI(MBB, I, dl, get(BF::BITTGL), DestReg).addReg(DestReg).addImm(0); // ...then flip bit 0 to undo the NotCC inversion.
+    } else {
+      BuildMI(MBB, I, dl, get(BF::MOVECC_zext), DestReg).addReg(SrcReg);
+    }
+    return true;
+  }
+
+  if (inClass(BF::AnyCCRegClass, DestReg, DestRC) &&
+      inClass(BF::DRegClass, SrcReg,  SrcRC)) {
+    if (inClass(BF::NotCCRegClass, DestReg, DestRC))
+      BuildMI(MBB, I, dl, get(BF::SETEQri_not), DestReg).addReg(SrcReg); // NotCC = (src == 0).
+    else
+      BuildMI(MBB, I, dl, get(BF::MOVECC_nz), DestReg).addReg(SrcReg); // CC = (src != 0).
+    return true;
+  }
+
+  if (inClass(BF::NotCCRegClass, DestReg, DestRC) &&
+      inClass(BF::JustCCRegClass, SrcReg,  SrcRC)) {
+    BuildMI(MBB, I, dl, get(BF::MOVE_ncccc), DestReg).addReg(SrcReg); // CC -> NotCC.
+    return true;
+  }
+
+  if (inClass(BF::JustCCRegClass, DestReg, DestRC) &&
+      inClass(BF::NotCCRegClass, SrcReg,  SrcRC)) {
+    BuildMI(MBB, I, dl, get(BF::MOVE_ccncc), DestReg).addReg(SrcReg); // NotCC -> CC.
+    return true;
+  }
+
+  llvm_unreachable((std::string("Bad regclasses for reg-to-reg copy: ")+
+                    SrcRC->getName() + " -> " + DestRC->getName()).c_str());
+  return false;
+}
+
+void
+BlackfinInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I,
+                                       unsigned SrcReg,
+                                       bool isKill,
+                                       int FI,
+                                       const TargetRegisterClass *RC) const {
+  DebugLoc DL = I != MBB.end() ?
+    I->getDebugLoc() : DebugLoc::getUnknownLoc();
+
+  if (inClass(BF::DPRegClass, SrcReg, RC)) { // 32-bit D/P regs spill as a word.
+    BuildMI(MBB, I, DL, get(BF::STORE32fi))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addFrameIndex(FI)
+      .addImm(0);
+    return;
+  }
+
+  if (inClass(BF::D16RegClass, SrcReg, RC)) { // 16-bit half regs spill as a halfword.
+    BuildMI(MBB, I, DL, get(BF::STORE16fi))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addFrameIndex(FI)
+      .addImm(0);
+    return;
+  }
+
+  if (inClass(BF::AnyCCRegClass, SrcReg, RC)) { // Flag regs spill as a byte pseudo.
+    BuildMI(MBB, I, DL, get(BF::STORE8fi))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addFrameIndex(FI)
+      .addImm(0);
+    return;
+  }
+
+  llvm_unreachable((std::string("Cannot store regclass to stack slot: ")+
+                    RC->getName()).c_str());
+}
+
+void BlackfinInstrInfo::
+storeRegToAddr(MachineFunction &MF,
+               unsigned SrcReg,
+               bool isKill,
+               SmallVectorImpl<MachineOperand> &Addr,
+               const TargetRegisterClass *RC,
+               SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  llvm_unreachable("storeRegToAddr not implemented"); // Unimplemented stub; aborts if reached.
+}
+
+void
+BlackfinInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned DestReg,
+                                        int FI,
+                                        const TargetRegisterClass *RC) const {
+  DebugLoc DL = I != MBB.end() ?
+    I->getDebugLoc() : DebugLoc::getUnknownLoc();
+  if (inClass(BF::DPRegClass, DestReg, RC)) { // 32-bit D/P regs reload as a word.
+    BuildMI(MBB, I, DL, get(BF::LOAD32fi), DestReg)
+      .addFrameIndex(FI)
+      .addImm(0);
+    return;
+  }
+
+  if (inClass(BF::D16RegClass, DestReg, RC)) { // 16-bit half regs reload as a halfword.
+    BuildMI(MBB, I, DL, get(BF::LOAD16fi), DestReg)
+      .addFrameIndex(FI)
+      .addImm(0);
+    return;
+  }
+
+  if (inClass(BF::AnyCCRegClass, DestReg, RC)) { // Flag regs reload via the byte pseudo.
+    BuildMI(MBB, I, DL, get(BF::LOAD8fi), DestReg)
+      .addFrameIndex(FI)
+      .addImm(0);
+    return;
+  }
+
+  llvm_unreachable("Cannot load regclass from stack slot");
+}
+
+void BlackfinInstrInfo::
+loadRegFromAddr(MachineFunction &MF,
+                unsigned DestReg,
+                SmallVectorImpl<MachineOperand> &Addr,
+                const TargetRegisterClass *RC,
+                SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  llvm_unreachable("loadRegFromAddr not implemented"); // Unimplemented stub; aborts if reached.
+}
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.h b/lib/Target/Blackfin/BlackfinInstrInfo.h
new file mode 100644
index 0000000..ea3429c
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.h
@@ -0,0 +1,80 @@
+//===- BlackfinInstrInfo.h - Blackfin Instruction Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFININSTRUCTIONINFO_H
+#define BLACKFININSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "BlackfinRegisterInfo.h"
+
+namespace llvm {
+
+  class BlackfinInstrInfo : public TargetInstrInfoImpl {
+    const BlackfinRegisterInfo RI;       // Owned register-info instance, exposed below.
+    const BlackfinSubtarget& Subtarget;  // Subtarget the instruction info was built for.
+  public:
+    explicit BlackfinInstrInfo(BlackfinSubtarget &ST);
+
+    /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+    /// such, whenever a client has an instance of instruction info, it should
+    /// always be able to get register info as well (through this method).
+    virtual const BlackfinRegisterInfo &getRegisterInfo() const { return RI; }
+
+    virtual bool isMoveInstr(const MachineInstr &MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+    virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                         int &FrameIndex) const;
+
+    virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                        int &FrameIndex) const;
+
+    virtual unsigned
+    InsertBranch(MachineBasicBlock &MBB,
+                 MachineBasicBlock *TBB,
+                 MachineBasicBlock *FBB,
+                 const SmallVectorImpl<MachineOperand> &Cond) const; // Conditional form not implemented yet.
+
+    virtual bool copyRegToReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I,
+                              unsigned DestReg, unsigned SrcReg,
+                              const TargetRegisterClass *DestRC,
+                              const TargetRegisterClass *SrcRC) const;
+
+    virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     unsigned SrcReg, bool isKill,
+                                     int FrameIndex,
+                                     const TargetRegisterClass *RC) const;
+
+    virtual void storeRegToAddr(MachineFunction &MF,
+                                unsigned SrcReg, bool isKill,
+                                SmallVectorImpl<MachineOperand> &Addr,
+                                const TargetRegisterClass *RC,
+                                SmallVectorImpl<MachineInstr*> &NewMIs) const; // Currently an unimplemented stub.
+
+    virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      unsigned DestReg, int FrameIndex,
+                                      const TargetRegisterClass *RC) const;
+
+    virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const; // Currently an unimplemented stub.
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinInstrInfo.td b/lib/Target/Blackfin/BlackfinInstrInfo.td
new file mode 100644
index 0000000..88ff85f
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinInstrInfo.td
@@ -0,0 +1,874 @@
+//===- BlackfinInstrInfo.td - Target Description for Blackfin Target ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Blackfin instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "BlackfinInstrFormats.td"
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_BfinCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_BfinCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def BfinCallseqStart : SDNode<"ISD::CALLSEQ_START", SDT_BfinCallSeqStart,
+                              [SDNPHasChain, SDNPOutFlag]>;
+def BfinCallseqEnd   : SDNode<"ISD::CALLSEQ_END",   SDT_BfinCallSeqEnd,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def SDT_BfinCall  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def BfinCall      : SDNode<"BFISD::CALL", SDT_BfinCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def BfinRet: SDNode<"BFISD::RET_FLAG", SDTNone,
+                    [SDNPHasChain, SDNPOptInFlag]>;
+
+def BfinWrapper: SDNode<"BFISD::Wrapper", SDTIntUnaryOp>;
+
+//===----------------------------------------------------------------------===//
+// Transformations
+//===----------------------------------------------------------------------===//
+
+def trailingZeros_xform : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingZeros(),
+                                   MVT::i32);
+}]>;
+
+def trailingOnes_xform : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getAPIntValue().countTrailingOnes(),
+                                   MVT::i32);
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant((unsigned short)N->getZExtValue(), MVT::i16);
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 16, MVT::i16);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Immediates
+//===----------------------------------------------------------------------===//
+
+def imm3  : PatLeaf<(imm), [{return isInt<3>(N->getSExtValue());}]>;
+def uimm3 : PatLeaf<(imm), [{return isUint<3>(N->getZExtValue());}]>;
+def uimm4 : PatLeaf<(imm), [{return isUint<4>(N->getZExtValue());}]>;
+def uimm5 : PatLeaf<(imm), [{return isUint<5>(N->getZExtValue());}]>;
+
+def uimm5m2 : PatLeaf<(imm), [{
+    uint64_t value = N->getZExtValue();
+    return value % 2 == 0 && isUint<5>(value);
+}]>;
+
+def uimm6m4 : PatLeaf<(imm), [{
+    uint64_t value = N->getZExtValue();
+    return value % 4 == 0 && isUint<6>(value);
+}]>;
+
+def imm7   : PatLeaf<(imm), [{return isInt<7>(N->getSExtValue());}]>;
+def imm16  : PatLeaf<(imm), [{return isInt<16>(N->getSExtValue());}]>;
+def uimm16 : PatLeaf<(imm), [{return isUint<16>(N->getZExtValue());}]>;
+
+def ximm16 : PatLeaf<(imm), [{
+    int64_t value = N->getSExtValue();
+    return value < (1<<16) && value >= -(1<<15);
+}]>;
+
+def imm17m2 : PatLeaf<(imm), [{
+    int64_t value = N->getSExtValue();
+    return value % 2 == 0 && isInt<17>(value);
+}]>;
+
+def imm18m4 : PatLeaf<(imm), [{
+    int64_t value = N->getSExtValue();
+    return value % 4 == 0 && isInt<18>(value);
+}]>;
+
+// 32-bit bitmask transformed to a bit number
+def uimm5mask : Operand<i32>, PatLeaf<(imm), [{
+    return isPowerOf2_32(N->getZExtValue());
+}], trailingZeros_xform>;
+
+// 32-bit inverse bitmask transformed to a bit number
+def uimm5imask : Operand<i32>, PatLeaf<(imm), [{
+    return isPowerOf2_32(~N->getZExtValue());
+}], trailingOnes_xform>;
+
+//===----------------------------------------------------------------------===//
+// Operands
+//===----------------------------------------------------------------------===//
+
+def calltarget : Operand<iPTR>;
+
+def brtarget : Operand<OtherVT>;
+
+// Addressing modes
+def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+
+// Address operands
+def MEMii : Operand<i32> {
+  let PrintMethod = "printMemoryOperand";
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstBfin<outs, ins, asmstr, pattern>;
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                              "${:comment}ADJCALLSTACKDOWN $amt",
+                              [(BfinCallseqStart timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "${:comment}ADJCALLSTACKUP $amt1 $amt2",
+                            [(BfinCallseqEnd timm:$amt1, timm:$amt2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Table C-9. Program Flow Control Instructions
+//===----------------------------------------------------------------------===//
+
+let isBranch = 1, isTerminator = 1 in {
+
+let isIndirectBranch = 1 in
+def JUMPp : F1<(outs), (ins P:$target),
+               "JUMP ($target);",
+               [(brind P:$target)]>;
+
+// TODO JUMP (PC-P)
+
+// NOTE: assembler chooses between JUMP.S and JUMP.L
+def JUMPa : F1<(outs), (ins brtarget:$target),
+               "jump $target;",
+               [(br bb:$target)]>;
+
+def JUMPcc : F1<(outs), (ins AnyCC:$cc, brtarget:$target),
+               "if $cc jump $target;",
+               [(brcond AnyCC:$cc, bb:$target)]>;
+}
+
+let isCall = 1,
+    Defs   = [R0, R1, R2, R3, P0, P1, P2, LB0, LB1, LC0, LC1, RETS, ASTAT] in {
+def CALLa: F1<(outs), (ins calltarget:$func, variable_ops),
+              "call $func;", []>;
+def CALLp: F1<(outs), (ins P:$func, variable_ops),
+              "call ($func);", [(BfinCall P:$func)]>;
+}
+
+let isReturn     = 1,
+    isTerminator = 1,
+    isBarrier    = 1,
+    Uses         = [RETS] in
+def RTS: F1<(outs), (ins), "rts;", [(BfinRet)]>;
+
+//===----------------------------------------------------------------------===//
+// Table C-10. Load / Store Instructions
+//===----------------------------------------------------------------------===//
+
+// Immediate constant loads
+
+// sext immediate, i32 D/P regs
+def LOADimm7: F1<(outs DP:$dst), (ins i32imm:$src),
+                 "$dst = $src (x);",
+                 [(set DP:$dst, imm7:$src)]>;
+
+// zext immediate, i32 reg groups 0-3
+def LOADuimm16: F2<(outs GR:$dst), (ins i32imm:$src),
+                   "$dst = $src (z);",
+                   [(set GR:$dst, uimm16:$src)]>;
+
+// sext immediate, i32 reg groups 0-3
+def LOADimm16: F2<(outs GR:$dst), (ins i32imm:$src),
+                  "$dst = $src (x);",
+                  [(set GR:$dst, imm16:$src)]>;
+
+// Pseudo-instruction for loading a general 32-bit constant.
+def LOAD32imm: Pseudo<(outs GR:$dst), (ins i32imm:$src),
+                      "$dst.h = ($src >> 16); $dst.l = ($src & 0xffff);",
+                      [(set GR:$dst, imm:$src)]>;
+
+def LOAD32sym: Pseudo<(outs GR:$dst), (ins i32imm:$src),
+                      "$dst.h = $src; $dst.l = $src;", []>;
+
+
+// 16-bit immediate, i16 reg groups 0-3
+def LOAD16i: F2<(outs GR16:$dst), (ins i16imm:$src),
+                 "$dst = $src;", []>;
+
+def : Pat<(BfinWrapper (i32 tglobaladdr:$addr)),
+          (LOAD32sym tglobaladdr:$addr)>;
+
+def : Pat<(BfinWrapper (i32 tjumptable:$addr)),
+          (LOAD32sym tjumptable:$addr)>;
+
+// We cannot copy from GR16 to D16, and codegen wants to insert copies if we
+// emit GR16 instructions. As a hack, we use this fake instruction instead.
+def LOAD16i_d16: F2<(outs D16:$dst), (ins i16imm:$src),
+                    "$dst = $src;",
+                    [(set D16:$dst, ximm16:$src)]>;
+
+// Memory loads with patterns
+
+def LOAD32p: F1<(outs DP:$dst), (ins P:$ptr),
+                "$dst = [$ptr];",
+                [(set DP:$dst, (load P:$ptr))]>;
+
+// Pseudo-instruction for loading a stack slot
+def LOAD32fi: Pseudo<(outs DP:$dst), (ins MEMii:$mem),
+                     "${:comment}FI $dst = [$mem];",
+                     [(set DP:$dst, (load ADDRspii:$mem))]>;
+
+// Note: Expands to multiple insns
+def LOAD16fi: Pseudo<(outs D16:$dst), (ins MEMii:$mem),
+                     "${:comment}FI $dst = [$mem];",
+                     [(set D16:$dst, (load ADDRspii:$mem))]>;
+
+// Pseudo-instruction for loading a stack slot, used for AnyCC regs.
+// Replaced with Load D + CC=D
+def LOAD8fi: Pseudo<(outs AnyCC:$dst), (ins MEMii:$mem),
+                    "${:comment}FI $dst = B[$mem];",
+                    [(set AnyCC:$dst, (load ADDRspii:$mem))]>;
+
+def LOAD32p_uimm6m4: F1<(outs DP:$dst), (ins P:$ptr, i32imm:$off),
+                        "$dst = [$ptr + $off];",
+                        [(set DP:$dst, (load (add P:$ptr, uimm6m4:$off)))]>;
+
+def LOAD32p_imm18m4: F2<(outs DP:$dst), (ins P:$ptr, i32imm:$off),
+                         "$dst = [$ptr + $off];",
+                         [(set DP:$dst, (load (add P:$ptr, imm18m4:$off)))]>;
+
+def LOAD32p_16z: F1<(outs D:$dst), (ins P:$ptr),
+                    "$dst = W[$ptr] (z);",
+                    [(set D:$dst, (zextloadi16 P:$ptr))]>;
+
+def : Pat<(i32 (extloadi16 P:$ptr)),(LOAD32p_16z P:$ptr)>;
+
+def LOAD32p_uimm5m2_16z: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+                            "$dst = w[$ptr + $off] (z);",
+                            [(set D:$dst, (zextloadi16 (add P:$ptr,
+                                                        uimm5m2:$off)))]>;
+
+def : Pat<(i32 (extloadi16 (add P:$ptr, uimm5m2:$off))),
+          (LOAD32p_uimm5m2_16z P:$ptr, imm:$off)>;
+
+def LOAD32p_imm17m2_16z: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+                            "$dst = w[$ptr + $off] (z);",
+                            [(set D:$dst,
+                                  (zextloadi16 (add P:$ptr, imm17m2:$off)))]>;
+
+def : Pat<(i32 (extloadi16 (add P:$ptr, imm17m2:$off))),
+          (LOAD32p_imm17m2_16z P:$ptr, imm:$off)>;
+
+def LOAD32p_16s: F1<(outs D:$dst), (ins P:$ptr),
+                    "$dst = w[$ptr] (x);",
+                    [(set D:$dst, (sextloadi16 P:$ptr))]>;
+
+def LOAD32p_uimm5m2_16s: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+                            "$dst = w[$ptr + $off] (x);",
+                            [(set D:$dst,
+                                  (sextloadi16 (add P:$ptr, uimm5m2:$off)))]>;
+
+def LOAD32p_imm17m2_16s: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+                            "$dst = w[$ptr + $off] (x);",
+                            [(set D:$dst,
+                                  (sextloadi16 (add P:$ptr, imm17m2:$off)))]>;
+
+def LOAD16pi: F1<(outs D16:$dst), (ins PI:$ptr),
+                "$dst = w[$ptr];",
+                [(set D16:$dst, (load PI:$ptr))]>;
+
+def LOAD32p_8z: F1<(outs D:$dst), (ins P:$ptr),
+                   "$dst = B[$ptr] (z);",
+                   [(set D:$dst, (zextloadi8 P:$ptr))]>;
+
+def : Pat<(i32 (extloadi8 P:$ptr)), (LOAD32p_8z P:$ptr)>;
+def : Pat<(i16 (extloadi8 P:$ptr)),
+          (EXTRACT_SUBREG (LOAD32p_8z P:$ptr), bfin_subreg_lo16)>;
+def : Pat<(i16 (zextloadi8 P:$ptr)),
+          (EXTRACT_SUBREG (LOAD32p_8z P:$ptr), bfin_subreg_lo16)>;
+
+def LOAD32p_imm16_8z: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+                         "$dst = b[$ptr + $off] (z);",
+                         [(set D:$dst, (zextloadi8 (add P:$ptr, imm16:$off)))]>;
+
+def : Pat<(i32 (extloadi8 (add P:$ptr, imm16:$off))),
+          (LOAD32p_imm16_8z P:$ptr, imm:$off)>;
+def : Pat<(i16 (extloadi8 (add P:$ptr, imm16:$off))),
+          (EXTRACT_SUBREG (LOAD32p_imm16_8z P:$ptr, imm:$off),
+                           bfin_subreg_lo16)>;
+def : Pat<(i16 (zextloadi8 (add P:$ptr, imm16:$off))),
+          (EXTRACT_SUBREG (LOAD32p_imm16_8z P:$ptr, imm:$off),
+                           bfin_subreg_lo16)>;
+
+def LOAD32p_8s: F1<(outs D:$dst), (ins P:$ptr),
+                   "$dst = b[$ptr] (x);",
+                   [(set D:$dst, (sextloadi8 P:$ptr))]>;
+
+def : Pat<(i16 (sextloadi8 P:$ptr)),
+          (EXTRACT_SUBREG (LOAD32p_8s P:$ptr), bfin_subreg_lo16)>;
+
+def LOAD32p_imm16_8s: F1<(outs D:$dst), (ins P:$ptr, i32imm:$off),
+                         "$dst = b[$ptr + $off] (x);",
+                         [(set D:$dst, (sextloadi8 (add P:$ptr, imm16:$off)))]>;
+
+def : Pat<(i16 (sextloadi8 (add P:$ptr, imm16:$off))),
+          (EXTRACT_SUBREG (LOAD32p_imm16_8s P:$ptr, imm:$off),
+                           bfin_subreg_lo16)>;
+// Memory loads without patterns
+
+let mayLoad = 1 in {
+
+multiclass LOAD_incdec<RegisterClass drc, RegisterClass prc,
+                       string mem="", string suf=";"> {
+  def _inc : F1<(outs drc:$dst, prc:$ptr_wb), (ins prc:$ptr),
+                !strconcat(!subst("M", mem, "$dst = M[$ptr++]"), suf), []>;
+  def _dec : F1<(outs drc:$dst, prc:$ptr_wb), (ins prc:$ptr),
+                !strconcat(!subst("M", mem, "$dst = M[$ptr--]"), suf), []>;
+}
+multiclass LOAD_incdecpost<RegisterClass drc, RegisterClass prc,
+                           string mem="", string suf=";">
+         : LOAD_incdec<drc, prc, mem, suf> {
+  def _post : F1<(outs drc:$dst, prc:$ptr_wb), (ins prc:$ptr, prc:$off),
+                 !strconcat(!subst("M", mem, "$dst = M[$ptr++$off]"), suf), []>;
+}
+
+defm LOAD32p:    LOAD_incdec<DP, P>;
+defm LOAD32i:    LOAD_incdec<D, I>;
+defm LOAD8z32p:  LOAD_incdec<D, P, "b", " (z);">;
+defm LOAD8s32p:  LOAD_incdec<D, P, "b", " (x);">;
+defm LOADhi:     LOAD_incdec<D16, I, "w">;
+defm LOAD16z32p: LOAD_incdecpost<D, P, "w", " (z);">;
+defm LOAD16s32p: LOAD_incdecpost<D, P, "w", " (x);">;
+
+def LOAD32p_post: F1<(outs D:$dst, P:$ptr_wb), (ins P:$ptr, P:$off),
+                     "$dst = [$ptr ++ $off];", []>;
+
+// Note: $fp MUST be FP
+def LOAD32fp_nimm7m4: F1<(outs DP:$dst), (ins P:$fp, i32imm:$off),
+                         "$dst = [$fp - $off];", []>;
+
+def LOAD32i:      F1<(outs D:$dst), (ins I:$ptr),
+                     "$dst = [$ptr];", []>;
+def LOAD32i_post: F1<(outs D:$dst, I:$ptr_wb), (ins I:$ptr, M:$off),
+                     "$dst = [$ptr ++ $off];", []>;
+
+
+
+def LOADhp_post: F1<(outs D16:$dst, P:$ptr_wb), (ins P:$ptr, P:$off),
+                    "$dst = w[$ptr ++ $off];", []>;
+
+
+}
+
+// Memory stores with patterns
+def STORE32p: F1<(outs), (ins DP:$val, P:$ptr),
+                 "[$ptr] = $val;",
+                 [(store DP:$val, P:$ptr)]>;
+
+// Pseudo-instructions for storing to a stack slot
+def STORE32fi: Pseudo<(outs), (ins DP:$val, MEMii:$mem),
+                      "${:comment}FI [$mem] = $val;",
+                      [(store DP:$val, ADDRspii:$mem)]>;
+
+// Note: This stack-storing pseudo-instruction is expanded to multiple insns
+def STORE16fi: Pseudo<(outs), (ins D16:$val, MEMii:$mem),
+                  "${:comment}FI [$mem] = $val;",
+                  [(store D16:$val, ADDRspii:$mem)]>;
+
+// Pseudo-instructions for storing AnyCC register to a stack slot.
+// Replaced with D=CC + STORE byte
+def STORE8fi: Pseudo<(outs), (ins AnyCC:$val, MEMii:$mem),
+                      "${:comment}FI b[$mem] = $val;",
+                      [(store AnyCC:$val, ADDRspii:$mem)]>;
+
+def STORE32p_uimm6m4: F1<(outs), (ins DP:$val, P:$ptr, i32imm:$off),
+                 "[$ptr + $off] = $val;",
+                 [(store DP:$val, (add P:$ptr, uimm6m4:$off))]>;
+
+def STORE32p_imm18m4: F1<(outs), (ins DP:$val, P:$ptr, i32imm:$off),
+                 "[$ptr + $off] = $val;",
+                 [(store DP:$val, (add P:$ptr, imm18m4:$off))]>;
+
+def STORE16pi: F1<(outs), (ins D16:$val, PI:$ptr),
+                  "w[$ptr] = $val;",
+                  [(store D16:$val, PI:$ptr)]>;
+
+def STORE8p: F1<(outs), (ins D:$val, P:$ptr),
+                "b[$ptr] = $val;",
+                [(truncstorei8 D:$val, P:$ptr)]>;
+
+def STORE8p_imm16: F1<(outs), (ins D:$val, P:$ptr, i32imm:$off),
+                 "b[$ptr + $off] = $val;",
+                 [(truncstorei8 D:$val, (add P:$ptr, imm16:$off))]>;
+
+let Constraints = "$ptr = $ptr_wb" in {
+
+multiclass STORE_incdec<RegisterClass drc, RegisterClass prc,
+                        int off=4, string pre=""> {
+  def _inc : F1<(outs prc:$ptr_wb), (ins drc:$val, prc:$ptr),
+                !strconcat(pre, "[$ptr++] = $val;"),
+                [(set prc:$ptr_wb, (post_store drc:$val, prc:$ptr, off))]>;
+  def _dec : F1<(outs prc:$ptr_wb), (ins drc:$val, prc:$ptr),
+                !strconcat(pre, "[$ptr--] = $val;"),
+                [(set prc:$ptr_wb, (post_store drc:$val, prc:$ptr,
+                                               (ineg off)))]>;
+}
+
+defm STORE32p: STORE_incdec<DP, P>;
+defm STORE16i: STORE_incdec<D16, I, 2, "w">;
+defm STORE8p:  STORE_incdec<D, P, 1, "b">;
+
+def STORE32p_post: F1<(outs P:$ptr_wb), (ins D:$val, P:$ptr, P:$off),
+                      "[$ptr ++ $off] = $val;",
+                      [(set P:$ptr_wb, (post_store D:$val, P:$ptr, P:$off))]>;
+
+def STORE16p_post: F1<(outs P:$ptr_wb), (ins D16:$val, P:$ptr, P:$off),
+                      "w[$ptr ++ $off] = $val;",
+                      [(set P:$ptr_wb, (post_store D16:$val, P:$ptr, P:$off))]>;
+}
+
+// Memory stores without patterns
+
+// These stores have no ISel patterns (selected manually, e.g. during frame
+// lowering), so mark them mayStore explicitly.
+let mayStore = 1 in {
+
+// Note: only works for $fp == FP
+def STORE32fp_nimm7m4: F1<(outs), (ins DP:$val, P:$fp, i32imm:$off),
+                         "[$fp - $off] = $val;", []>;
+
+// 32-bit stores through an I register, plain and post-modify forms.
+def STORE32i: F1<(outs), (ins D:$val, I:$ptr),
+                 "[$ptr] = $val;", []>;
+
+def STORE32i_inc: F1<(outs I:$ptr_wb), (ins D:$val, I:$ptr),
+                 "[$ptr++] = $val;", []>;
+
+def STORE32i_dec: F1<(outs I:$ptr_wb), (ins D:$val, I:$ptr),
+                 "[$ptr--] = $val;", []>;
+
+def STORE32i_post: F1<(outs I:$ptr_wb), (ins D:$val, I:$ptr, M:$off),
+                      "[$ptr ++ $off] = $val;", []>;
+}
+
+// i16 truncating store: store the low half of the D register via STORE16pi.
+def : Pat<(truncstorei16 D:$val, PI:$ptr),
+          (STORE16pi (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS D:$val, D)),
+                                     bfin_subreg_lo16), PI:$ptr)>;
+
+// (srl x, 16) followed by an i16 store is just a store of the high half.
+def : Pat<(truncstorei16 (srl D:$val, (i16 16)), PI:$ptr),
+          (STORE16pi (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS D:$val, D)),
+                                     bfin_subreg_hi16), PI:$ptr)>;
+
+// i8 store of a 16-bit value: widen to a D register first, then use the
+// byte store; only the low 8 bits are written so the IMPLICIT_DEF high
+// half is harmless.
+def : Pat<(truncstorei8 D16L:$val, P:$ptr),
+          (STORE8p (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                  (i16 (COPY_TO_REGCLASS D16L:$val, D16L)),
+                                  bfin_subreg_lo16),
+                   P:$ptr)>;
+
+//===----------------------------------------------------------------------===//
+// Table C-11. Move Instructions.
+//===----------------------------------------------------------------------===//
+
+// Generic register-to-register move over the ALL class; no pattern,
+// used by copyRegToReg-style code.
+def MOVE: F1<(outs ALL:$dst), (ins ALL:$src),
+             "$dst = $src;",
+             []>;
+
+// Conditional move; two-address: $dst starts as $src1 and is overwritten
+// with $src2 only when $cc is true, which matches the select pattern.
+let isTwoAddress = 1 in
+def MOVEcc: F1<(outs DP:$dst), (ins DP:$src1, DP:$src2, AnyCC:$cc),
+               "if $cc $dst = $src2;",
+               [(set DP:$dst, (select AnyCC:$cc, DP:$src2, DP:$src1))]>;
+
+// Extending moves; these forms update the arithmetic status flags.
+let Defs = [AZ, AN, AC0, V] in {
+// Zero-extend a 16-bit low-half register to 32 bits ("(z)").
+def MOVEzext: F1<(outs D:$dst), (ins D16L:$src),
+                 "$dst = $src (z);",
+                 [(set D:$dst, (zext D16L:$src))]>;
+
+// Sign-extend a 16-bit low-half register to 32 bits ("(x)").
+def MOVEsext: F1<(outs D:$dst), (ins D16L:$src),
+                 "$dst = $src (x);",
+                 [(set D:$dst, (sext D16L:$src))]>;
+
+// Zero-extend the low byte; matches (and x, 0xff).
+def MOVEzext8: F1<(outs D:$dst), (ins D:$src),
+                  "$dst = $src.b (z);",
+                  [(set D:$dst, (and D:$src, 0xff))]>;
+
+// Sign-extend from the low byte in place.
+def MOVEsext8: F1<(outs D:$dst), (ins D:$src),
+                  "$dst = $src.b (x);",
+                  [(set D:$dst, (sext_inreg D:$src, i8))]>;
+
+}
+
+// i8 sign-extension of a 16-bit value: widen into a D register, run
+// MOVEsext8, then take the low half back out.
+def : Pat<(sext_inreg D16L:$src, i8),
+          (EXTRACT_SUBREG (MOVEsext8
+                           (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                          D16L:$src,
+                                          bfin_subreg_lo16)),
+                          bfin_subreg_lo16)>;
+
+// sext_inreg from i16 and (and x, 0xffff) both reduce to a move of the
+// low half with the appropriate extension.
+def : Pat<(sext_inreg D:$src, i16),
+          (MOVEsext (EXTRACT_SUBREG D:$src, bfin_subreg_lo16))>;
+
+def : Pat<(and D:$src, 0xffff),
+          (MOVEzext (EXTRACT_SUBREG D:$src, bfin_subreg_lo16))>;
+
+// anyext: the high half may be garbage, so IMPLICIT_DEF is fine.
+def : Pat<(i32 (anyext D16L:$src)),
+          (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                         (i16 (COPY_TO_REGCLASS D16L:$src, D16L)),
+                         bfin_subreg_lo16)>;
+
+// TODO Dreg = Dreg_byte (X/Z)
+
+// TODO Accumulator moves
+
+//===----------------------------------------------------------------------===//
+// Table C-12. Stack Control Instructions
+//===----------------------------------------------------------------------===//
+
+// Stack push/pop implicitly read and write SP.
+let Uses = [SP], Defs = [SP] in {
+def PUSH: F1<(outs), (ins ALL:$src),
+             "[--sp] = $src;", []> { let mayStore = 1; }
+
+// NOTE: POP does not work for DP regs, use LOAD instead
+def POP:  F1<(outs ALL:$dst), (ins),
+             "$dst = [sp++];", []> { let mayLoad = 1; }
+}
+
+// TODO: push/pop multiple
+
+// Frame link/unlink used in prologue/epilogue; selected manually.
+def LINK: F2<(outs), (ins i32imm:$amount),
+             "link $amount;", []>;
+
+def UNLINK: F2<(outs), (ins),
+               "unlink;", []>;
+
+//===----------------------------------------------------------------------===//
+// Table C-13. Control Code Bit Management Instructions
+//===----------------------------------------------------------------------===//
+
+// Emit a family of compare instructions for one comparison operator:
+//   dd     - D reg vs D reg, result in CC (JustCC)
+//   ri     - D/P reg vs small immediate (imm3)
+//   pp     - P reg vs P reg (no pattern; selected manually)
+//   ri_not - matches the *inverse* DAG condition (invnode) while emitting
+//            the same asm; the result is modeled as NotCC so the inverted
+//            CC bit is consumed correctly.
+// 'cond' is the asm operator text substituted for "XX"; 'suf' lets the
+// unsigned variants append an " (iu);" modifier instead of plain ";".
+multiclass SETCC<PatFrag opnode, PatFrag invnode, string cond, string suf=";"> {
+  def dd : F1<(outs JustCC:$cc), (ins D:$a, D:$b),
+              !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+              [(set JustCC:$cc, (opnode  D:$a, D:$b))]>;
+
+  def ri : F1<(outs JustCC:$cc), (ins DP:$a, i32imm:$b),
+              !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+              [(set JustCC:$cc, (opnode  DP:$a, imm3:$b))]>;
+
+  def pp : F1<(outs JustCC:$cc), (ins P:$a, P:$b),
+              !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+              []>;
+
+  def ri_not : F1<(outs NotCC:$cc), (ins DP:$a, i32imm:$b),
+                  !strconcat(!subst("XX", cond, "cc = $a XX $b"), suf),
+                  [(set NotCC:$cc, (invnode  DP:$a, imm3:$b))]>;
+}
+
+// Instantiate the compare families; the second PatFrag is the inverse
+// condition used by the _ri_not variants.
+defm SETEQ  : SETCC<seteq,  setne,  "==">;
+defm SETLT  : SETCC<setlt,  setge,  "<">;
+defm SETLE  : SETCC<setle,  setgt,  "<=">;
+defm SETULT : SETCC<setult, setuge, "<",  " (iu);">;
+defm SETULE : SETCC<setule, setugt, "<=", " (iu);">;
+
+// setne is an equality compare whose CC result is interpreted inverted
+// (NotCC), since the hardware only has "==".
+def SETNEdd : F1<(outs NotCC:$cc), (ins D:$a, D:$b),
+                 "cc = $a == $b;",
+                 [(set NotCC:$cc, (setne  D:$a, D:$b))]>;
+
+// gt/ge/ugt/uge are lt/le/ult/ule with the operands swapped.
+def : Pat<(setgt  D:$a, D:$b), (SETLTdd  D:$b, D:$a)>;
+def : Pat<(setge  D:$a, D:$b), (SETLEdd  D:$b, D:$a)>;
+def : Pat<(setugt D:$a, D:$b), (SETULTdd D:$b, D:$a)>;
+def : Pat<(setuge D:$a, D:$b), (SETULEdd D:$b, D:$a)>;
+
+// TODO: compare pointer for P-P comparisons
+// TODO: compare accumulator
+
+// Transfers between CC and AC0 / D registers. Instructions operating on
+// NotCC model the logical inversion in the register class rather than in
+// the asm (the printed text is the same).
+let Defs = [AC0] in
+def OR_ac0_cc : F1<(outs), (ins JustCC:$cc),
+                   "ac0 \\|= cc;", []>;
+
+let Uses = [AC0] in
+def MOVE_cc_ac0 : F1<(outs JustCC:$cc), (ins),
+                   "cc = ac0;", []>;
+
+// Invert CC: converting between JustCC and NotCC views of the same bit.
+def MOVE_ccncc : F1<(outs JustCC:$cc), (ins NotCC:$sb),
+                    "cc = !cc;", []>;
+
+def MOVE_ncccc : F1<(outs NotCC:$cc), (ins JustCC:$sb),
+                    "cc = !cc;", []>;
+
+// D = CC gives 0/1, i.e. a zero-extension of the CC bit.
+def MOVECC_zext : F1<(outs D:$dst), (ins JustCC:$cc),
+                      "$dst = $cc;",
+                      [(set D:$dst, (zext JustCC:$cc))]>;
+
+def MOVENCC_z : F1<(outs D:$dst), (ins NotCC:$cc),
+                   "$dst = cc;", []>;
+
+// CC = (src != 0); used to materialize a CC bit from a register.
+def MOVECC_nz : F1<(outs AnyCC:$cc), (ins D:$src),
+                   "cc = $src;",
+                   [(set AnyCC:$cc, (setne D:$src, 0))]>;
+
+//===----------------------------------------------------------------------===//
+// Table C-14. Logical Operations Instructions
+//===----------------------------------------------------------------------===//
+
+// 32-bit bitwise logical operations on D registers.
+def AND: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+            "$dst = $src1 & $src2;",
+            [(set D:$dst, (and D:$src1, D:$src2))]>;
+
+def NOT: F1<(outs D:$dst), (ins D:$src),
+            "$dst = ~$src;",
+            [(set D:$dst, (not D:$src))]>;
+
+// "\\|" escapes '|' in the asm string (it is a syntax metacharacter).
+def OR: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+           "$dst = $src1 \\| $src2;",
+           [(set D:$dst, (or D:$src1, D:$src2))]>;
+
+def XOR: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+            "$dst = $src1 ^ $src2;",
+            [(set D:$dst, (xor D:$src1, D:$src2))]>;
+
+// missing: BXOR, BXORSHIFT
+
+//===----------------------------------------------------------------------===//
+// Table C-15. Bit Operations Instructions
+//===----------------------------------------------------------------------===//
+
+// Single-bit operations; two-address because $dst is modified in place.
+// uimm5imask/uimm5mask are 5-bit bit-number operands printed as masks.
+let isTwoAddress = 1 in {
+def BITCLR: F1<(outs D:$dst), (ins D:$src1, uimm5imask:$src2),
+              "bitclr($dst, $src2);",
+              [(set D:$dst, (and D:$src1, uimm5imask:$src2))]>;
+
+def BITSET: F1<(outs D:$dst), (ins D:$src1, uimm5mask:$src2),
+              "bitset($dst, $src2);",
+              [(set D:$dst, (or D:$src1, uimm5mask:$src2))]>;
+
+def BITTGL: F1<(outs D:$dst), (ins D:$src1, uimm5mask:$src2),
+              "bittgl($dst, $src2);",
+              [(set D:$dst, (xor D:$src1, uimm5mask:$src2))]>;
+}
+
+// Bit test: CC = ((src1 & mask) != 0); NBITTST matches the negated test.
+def BITTST: F1<(outs JustCC:$cc), (ins D:$src1, uimm5mask:$src2),
+              "cc = bittst($src1, $src2);",
+              [(set JustCC:$cc, (setne (and D:$src1, uimm5mask:$src2),
+                                       (i32 0)))]>;
+
+def NBITTST: F1<(outs JustCC:$cc), (ins D:$src1, uimm5mask:$src2),
+               "cc = !bittst($src1, $src2);",
+               [(set JustCC:$cc, (seteq (and D:$src1, uimm5mask:$src2),
+                                        (i32 0)))]>;
+
+// TODO: DEPOSIT, EXTRACT, BITMUX
+
+// Population count; the hardware result is 16-bit, so ctpop on i32 is the
+// truncated form, and the full i32 ctpop is ONES + zero-extend (below).
+def ONES: F2<(outs D16L:$dst), (ins D:$src),
+              "$dst = ones $src;",
+              [(set D16L:$dst, (trunc (ctpop D:$src)))]>;
+
+def : Pat<(ctpop D:$src), (MOVEzext (ONES D:$src))>;
+
+//===----------------------------------------------------------------------===//
+// Table C-16. Shift / Rotate Instructions
+//===----------------------------------------------------------------------===//
+
+// 32-bit shift family: an immediate form (5-bit amount) and a register
+// form. 'ops' is the asm operator (">>>", ">>", "<<") substituted for XX.
+multiclass SHIFT32<SDNode opnode, string ops> {
+  def i : F1<(outs D:$dst), (ins D:$src, i16imm:$amount),
+             !subst("XX", ops, "$dst XX= $amount;"),
+             [(set D:$dst, (opnode D:$src, (i16 uimm5:$amount)))]>;
+  def r : F1<(outs D:$dst), (ins D:$src, D:$amount),
+             !subst("XX", ops, "$dst XX= $amount;"),
+             [(set D:$dst, (opnode D:$src, D:$amount))]>;
+}
+
+// The compound-assignment asm forms are two-address and set the ALU flags.
+let Defs = [AZ, AN, V, VS],
+    isTwoAddress = 1 in {
+defm SRA : SHIFT32<sra, ">>>">;
+defm SRL : SHIFT32<srl, ">>">;
+defm SLL : SHIFT32<shl, "<<">;
+}
+
+// TODO: automatic switching between 2-addr and 3-addr (?)
+
+// Three-address shift forms (lshift/ashift); all update the ALU flags.
+let Defs = [AZ, AN, V, VS] in {
+def SLLr16: F2<(outs D:$dst), (ins D:$src, D16L:$amount),
+             "$dst = lshift $src by $amount;",
+             [(set D:$dst, (shl D:$src, D16L:$amount))]>;
+
+// Arithmetic left-shift = saturating overflow. A *negated* ashift amount
+// shifts right, which is how sra is matched here.
+def SLAr16: F2<(outs D:$dst), (ins D:$src, D16L:$amount),
+             "$dst = ashift $src by $amount;",
+             [(set D:$dst, (sra D:$src, (ineg D16L:$amount)))]>;
+
+// 16-bit immediate shifts (4-bit amount).
+def SRA16i: F1<(outs D16:$dst), (ins D16:$src, i16imm:$amount),
+              "$dst = $src >>> $amount;",
+              [(set D16:$dst, (sra D16:$src, (i16 uimm4:$amount)))]>;
+
+def SRL16i: F1<(outs D16:$dst), (ins D16:$src, i16imm:$amount),
+              "$dst = $src >> $amount;",
+              [(set D16:$dst, (srl D16:$src, (i16 uimm4:$amount)))]>;
+
+// Arithmetic left-shift = saturating overflow.
+// NOTE(review): this matches srl via a negated *ashift* amount, while the
+// 32-bit SLAr16 above matches sra the same way; one would expect sra here
+// too - confirm against the Blackfin PRM before relying on it.
+def SLA16r: F1<(outs D16:$dst), (ins D16:$src, D16L:$amount),
+              "$dst = ashift $src BY $amount;",
+              [(set D16:$dst, (srl D16:$src, (ineg D16L:$amount)))]>;
+
+def SLL16i: F1<(outs D16:$dst), (ins D16:$src, i16imm:$amount),
+              "$dst = $src << $amount;",
+              [(set D16:$dst, (shl D16:$src, (i16 uimm4:$amount)))]>;
+
+def SLL16r: F1<(outs D16:$dst), (ins D16:$src, D16L:$amount),
+              "$dst = lshift $src by $amount;",
+              [(set D16:$dst, (shl D16:$src, D16L:$amount))]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Table C-17. Arithmetic Operations Instructions
+//===----------------------------------------------------------------------===//
+
+// TODO: ABS
+
+// 32- and 16-bit add/subtract; these set all the arithmetic flags
+// including carry (AC0), which also lets them implement addc/subc below.
+let Defs = [AZ, AN, AC0, V, VS] in {
+
+def ADD: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+            "$dst = $src1 + $src2;",
+            [(set D:$dst, (add D:$src1, D:$src2))]>;
+
+def ADD16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+              "$dst = $src1 + $src2;",
+              [(set D16:$dst, (add D16:$src1, D16:$src2))]>;
+
+// Two-address add of a 7-bit signed immediate.
+let isTwoAddress = 1 in
+def ADDimm7: F1<(outs D:$dst), (ins D:$src1, i32imm:$src2),
+                "$dst += $src2;",
+                [(set D:$dst, (add D:$src1, imm7:$src2))]>;
+
+def SUB: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+            "$dst = $src1 - $src2;",
+            [(set D:$dst, (sub D:$src1, D:$src2))]>;
+
+def SUB16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+              "$dst = $src1 - $src2;",
+              [(set D16:$dst, (sub D16:$src1, D16:$src2))]>;
+
+}
+
+// Carry-producing add/sub are the same instructions (AC0 is the carry).
+def : Pat<(addc D:$src1, D:$src2), (ADD D:$src1, D:$src2)>;
+def : Pat<(subc D:$src1, D:$src2), (SUB D:$src1, D:$src2)>;
+
+let Defs = [AZ, AN, V, VS] in
+def NEG: F1<(outs D:$dst), (ins D:$src),
+            "$dst = -$src;",
+            [(set D:$dst, (ineg D:$src))]>;
+
+// No pattern, it would confuse isel to have two i32 = i32+i32 patterns
+def ADDpp: F1<(outs P:$dst), (ins P:$src1, P:$src2),
+              "$dst = $src1 + $src2;", []>;
+
+let isTwoAddress = 1 in
+def ADDpp_imm7: F1<(outs P:$dst), (ins P:$src1, i32imm:$src2),
+                "$dst += $src2;", []>;
+
+// Add with rounding into a 16-bit result; selected manually.
+let Defs = [AZ, AN, V] in
+def ADD_RND20: F2<(outs D16:$dst), (ins D:$src1, D:$src2),
+                  "$dst = $src1 + $src2 (rnd20);", []>;
+
+// 16x16 multiplies; only the overflow flags are affected.
+let Defs = [V, VS] in {
+// 16x16 -> low 16 bits, signed integer mode "(is)".
+def MUL16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+              "$dst = $src1 * $src2 (is);",
+              [(set D16:$dst, (mul D16:$src1, D16:$src2))]>;
+
+// 16x16 -> high 16 bits of the signed product "(ih)".
+def MULHS16: F2<(outs D16:$dst), (ins D16:$src1, D16:$src2),
+                "$dst = $src1 * $src2 (ih);",
+                [(set D16:$dst, (mulhs D16:$src1, D16:$src2))]>;
+
+// 16x16 -> full 32-bit signed product.
+def MULhh32s: F2<(outs D:$dst), (ins D16:$src1, D16:$src2),
+                "$dst = $src1 * $src2 (is);",
+                [(set D:$dst, (mul (sext D16:$src1), (sext D16:$src2)))]>;
+
+// 16x16 -> full 32-bit unsigned product.
+// NOTE(review): the operands are zero-extended but the asm modifier is
+// "(is)" (signed), same as MULhh32s; presumably the unsigned modifier is
+// intended here - confirm against the Blackfin PRM.
+def MULhh32u: F2<(outs D:$dst), (ins D16:$src1, D16:$src2),
+                "$dst = $src1 * $src2 (is);",
+                [(set D:$dst, (mul (zext D16:$src1), (zext D16:$src2)))]>;
+}
+
+
+// Full 32x32 -> 32 multiply; two-address ("$dst *= $src2").
+let isTwoAddress = 1 in
+def MUL32: F1<(outs D:$dst), (ins D:$src1, D:$src2),
+            "$dst *= $src2;",
+            [(set D:$dst, (mul D:$src1, D:$src2))]>;
+
+//===----------------------------------------------------------------------===//
+// Table C-18. External Event Management Instructions
+//===----------------------------------------------------------------------===//
+
+// System/event-management instructions. idle/csync/ssync are reachable
+// from IR via the int_bfin_* intrinsics; the rest have no patterns.
+def IDLE : F1<(outs), (ins), "idle;", [(int_bfin_idle)]>;
+def CSYNC : F1<(outs), (ins), "csync;", [(int_bfin_csync)]>;
+def SSYNC : F1<(outs), (ins), "ssync;", [(int_bfin_ssync)]>;
+def EMUEXCPT : F1<(outs), (ins), "emuexcpt;", []>;
+def CLI : F1<(outs D:$mask), (ins), "cli $mask;", []>;
+def STI : F1<(outs), (ins D:$mask), "sti $mask;", []>;
+def RAISE : F1<(outs), (ins i32imm:$itr), "raise $itr;", []>;
+def EXCPT : F1<(outs), (ins i32imm:$exc), "excpt $exc;", []>;
+def NOP : F1<(outs), (ins), "nop;", []>;
+def MNOP : F2<(outs), (ins), "mnop;", []>;
+def ABORT : F1<(outs), (ins), "abort;", []>;
+
+//===----------------------------------------------------------------------===//
+// Table C-19. Cache Control Instructions
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Table C-20. Video Pixel Operations Instructions
+//===----------------------------------------------------------------------===//
+
+// Byte-alignment instructions: concatenate src1:src2 and extract a
+// 32-bit window, expressed as (src1 << (32-8k)) | (src2 >> 8k).
+def ALIGN8 : F2<(outs D:$dst), (ins D:$src1, D:$src2),
+                "$dst = align8($src1, $src2);",
+                [(set D:$dst, (or (shl D:$src1, (i32 24)),
+                                  (srl D:$src2, (i32 8))))]>;
+
+def ALIGN16 : F2<(outs D:$dst), (ins D:$src1, D:$src2),
+                 "$dst = align16($src1, $src2);",
+                 [(set D:$dst, (or (shl D:$src1, (i32 16)),
+                                   (srl D:$src2, (i32 16))))]>;
+
+// align24: (src1 << 8) | (src2 >> 24). The asm string previously said
+// "align16" (copy-paste from ALIGN16), which would have assembled to the
+// wrong instruction.
+def ALIGN24 : F2<(outs D:$dst), (ins D:$src1, D:$src2),
+                 "$dst = align24($src1, $src2);",
+                 [(set D:$dst, (or (shl D:$src1, (i32 8)),
+                                   (srl D:$src2, (i32 24))))]>;
+
+def DISALGNEXCPT : F2<(outs), (ins), "disalignexcpt;", []>;
+
+// TODO: BYTEOP3P, BYTEOP16P, BYTEOP1P, BYTEOP2P, BYTEOP16M, SAA,
+//       BYTEPACK, BYTEUNPACK
+
+// Table C-21. Vector Operations Instructions
+
+// Patterns
+// Patterns
+// Direct calls to globals and external symbols both use CALLa.
+def : Pat<(BfinCall (i32 tglobaladdr:$dst)),
+          (CALLa tglobaladdr:$dst)>;
+def : Pat<(BfinCall (i32 texternalsym:$dst)),
+          (CALLa texternalsym:$dst)>;
+
+// Extensions of the CC bit: zext gives 0/1 via MOVECC_zext; sext is the
+// negation of that (0/-1); anyext can reuse the zext form. The i16 forms
+// extract the low half of the 32-bit result.
+def : Pat<(sext JustCC:$cc),
+          (NEG (MOVECC_zext JustCC:$cc))>;
+def : Pat<(anyext JustCC:$cc),
+          (MOVECC_zext JustCC:$cc)>;
+def : Pat<(i16 (zext JustCC:$cc)),
+          (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>;
+def : Pat<(i16 (sext JustCC:$cc)),
+          (EXTRACT_SUBREG (NEG (MOVECC_zext JustCC:$cc)), bfin_subreg_lo16)>;
+def : Pat<(i16 (anyext JustCC:$cc)),
+          (EXTRACT_SUBREG (MOVECC_zext JustCC:$cc), bfin_subreg_lo16)>;
+
+// i32 -> i16 truncation is just taking the low subregister.
+def : Pat<(i16 (trunc D:$src)),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS D:$src, D)), bfin_subreg_lo16)>;
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
new file mode 100644
index 0000000..ea9480d
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.cpp
@@ -0,0 +1,100 @@
+//===- BlackfinIntrinsicInfo.cpp - Intrinsic Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of TargetIntrinsicInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinIntrinsicInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstring>
+
+using namespace llvm;
+
+namespace bfinIntrinsic {
+
+  enum ID {
+    last_non_bfin_intrinsic = Intrinsic::num_intrinsics-1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+    , num_bfin_intrinsics
+  };
+
+}
+
+/// getName - Return the llvm.bfin.* name for a Blackfin intrinsic ID, or an
+/// empty string for IDs in the standard LLVM intrinsic range. Tys/numTys are
+/// unused because Blackfin intrinsics are never overloaded.
+///
+/// The previous code did "return 0;" for non-Blackfin IDs, which constructs
+/// a std::string from a null const char* - undefined behavior.
+std::string BlackfinIntrinsicInfo::getName(unsigned IntrID, const Type **Tys,
+                                           unsigned numTys) const {
+  static const char *const names[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+
+  assert(!isOverloaded(IntrID) && "Blackfin intrinsics are not overloaded");
+  if (IntrID < Intrinsic::num_intrinsics)
+    return std::string();
+  assert(IntrID < bfinIntrinsic::num_bfin_intrinsics && "Invalid intrinsic ID");
+
+  // Blackfin IDs are numbered immediately after the standard intrinsics.
+  return std::string(names[IntrID - Intrinsic::num_intrinsics]);
+}
+
+/// lookupName - Map a function name (Len bytes at Name) to a Blackfin
+/// intrinsic ID. The tablegen-generated recognizer returns from inside the
+/// include when it matches; falling through means "not a bfin intrinsic"
+/// and yields 0.
+unsigned
+BlackfinIntrinsicInfo::lookupName(const char *Name, unsigned Len) const {
+#define GET_FUNCTION_RECOGNIZER
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_FUNCTION_RECOGNIZER
+  return 0;
+}
+
+/// isOverloaded - Return true if the given Blackfin intrinsic ID takes
+/// overloaded types. The table only covers the Blackfin range; standard
+/// LLVM intrinsic IDs (and the invalid ID 0) are answered with false.
+///
+/// Previously only IntrID == 0 was rejected, so any standard intrinsic ID
+/// indexed OTable with a negative offset (out-of-bounds read). The table is
+/// also made static so it is not rebuilt on every call.
+bool BlackfinIntrinsicInfo::isOverloaded(unsigned IntrID) const {
+  // Overload Table
+  static const bool OTable[] = {
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+  };
+  if (IntrID < Intrinsic::num_intrinsics)
+    return false;
+  return OTable[IntrID - Intrinsic::num_intrinsics];
+}
+
+/// This defines the "getAttributes(ID id)" method.
+#define GET_INTRINSIC_ATTRIBUTES
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_INTRINSIC_ATTRIBUTES
+
+/// getType - Build the FunctionType for the given Blackfin intrinsic ID.
+/// The generated GET_INTRINSIC_GENERATOR code switches on 'id' and fills
+/// in ResultTy/ArgTys/IsVarArg.
+static const FunctionType *getType(LLVMContext &Context, unsigned id) {
+  const Type *ResultTy = NULL;
+  std::vector<const Type*> ArgTys;
+  bool IsVarArg = false;
+  
+#define GET_INTRINSIC_GENERATOR
+#include "BlackfinGenIntrinsics.inc"
+#undef GET_INTRINSIC_GENERATOR
+
+  return FunctionType::get(ResultTy, ArgTys, IsVarArg); 
+}
+
+/// getDeclaration - Create or find the declaration for a Blackfin intrinsic
+/// in module M, with its generated attributes attached. Tys/numTy are unused
+/// since Blackfin intrinsics are not overloaded.
+Function *BlackfinIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                                const Type **Tys,
+                                                unsigned numTy) const {
+  assert(!isOverloaded(IntrID) && "Blackfin intrinsics are not overloaded");
+  // getAttributes is defined above by GET_INTRINSIC_ATTRIBUTES.
+  AttrListPtr AList = getAttributes((bfinIntrinsic::ID) IntrID);
+  return cast<Function>(M->getOrInsertFunction(getName(IntrID),
+                                               getType(M->getContext(), IntrID),
+                                               AList));
+}
diff --git a/lib/Target/Blackfin/BlackfinIntrinsicInfo.h b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
new file mode 100644
index 0000000..7c4b5a9
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinIntrinsicInfo.h
@@ -0,0 +1,32 @@
+//===- BlackfinIntrinsicInfo.h - Blackfin Intrinsic Information -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of TargetIntrinsicInfo.
+//
+//===----------------------------------------------------------------------===//
+#ifndef BLACKFININTRINSICS_H
+#define BLACKFININTRINSICS_H
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+
+  /// BlackfinIntrinsicInfo - TargetIntrinsicInfo implementation for the
+  /// llvm.bfin.* intrinsics; see BlackfinIntrinsicInfo.cpp for the
+  /// tablegen-backed implementations.
+  class BlackfinIntrinsicInfo : public TargetIntrinsicInfo {
+  public:
+    // Name of the intrinsic with the given ID ("" for non-bfin IDs).
+    std::string getName(unsigned IntrID, const Type **Tys = 0,
+                        unsigned numTys = 0) const;
+    // Map a function name to an intrinsic ID (0 if not recognized).
+    unsigned lookupName(const char *Name, unsigned Len) const;
+    // Blackfin intrinsics are never overloaded.
+    bool isOverloaded(unsigned IID) const;
+    // Get or create the Function declaration for an intrinsic ID.
+    Function *getDeclaration(Module *M, unsigned ID, const Type **Tys = 0,
+                             unsigned numTys = 0) const;
+  };
+
+}
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinIntrinsics.td b/lib/Target/Blackfin/BlackfinIntrinsics.td
new file mode 100644
index 0000000..bf02cfe
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinIntrinsics.td
@@ -0,0 +1,34 @@
+//===- BlackfinIntrinsics.td - Defines Blackfin intrinsics -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines all of the blackfin-specific intrinsics.
+//
+//===----------------------------------------------------------------------===//
+
+// All Blackfin intrinsics are named llvm.bfin.* and marked as
+// target-specific.
+let TargetPrefix = "bfin", isTarget = 1 in {
+
+//===----------------------------------------------------------------------===//
+// Core synchronisation etc.
+//
+// These intrinsics have side effects. Each represents a single instruction,
+// but workarounds are sometimes required depending on the cpu.
+
+// Execute csync instruction with workarounds
+def int_bfin_csync : GCCBuiltin<"__builtin_bfin_csync">,
+        Intrinsic<[llvm_void_ty]>;
+
+// Execute ssync instruction with workarounds
+def int_bfin_ssync : GCCBuiltin<"__builtin_bfin_ssync">,
+        Intrinsic<[llvm_void_ty]>;
+
+// Execute idle instruction with workarounds
+def int_bfin_idle : GCCBuiltin<"__builtin_bfin_idle">,
+        Intrinsic<[llvm_void_ty]>;
+
+}
diff --git a/lib/Target/Blackfin/BlackfinMCAsmInfo.cpp b/lib/Target/Blackfin/BlackfinMCAsmInfo.cpp
new file mode 100644
index 0000000..31470fb
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinMCAsmInfo.cpp
@@ -0,0 +1,22 @@
+//===-- BlackfinMCAsmInfo.cpp - Blackfin asm properties -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the BlackfinMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinMCAsmInfo.h"
+
+using namespace llvm;
+
+// Blackfin asm dialect: symbols carry a leading underscore, comments use
+// "//", and there is no .set directive.
+BlackfinMCAsmInfo::BlackfinMCAsmInfo(const Target &T, const StringRef &TT) {
+  GlobalPrefix = "_";
+  CommentString = "//";
+  HasSetDirective = false;
+}
diff --git a/lib/Target/Blackfin/BlackfinMCAsmInfo.h b/lib/Target/Blackfin/BlackfinMCAsmInfo.h
new file mode 100644
index 0000000..0efc295
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinMCAsmInfo.h
@@ -0,0 +1,29 @@
+//===-- BlackfinMCAsmInfo.h - Blackfin asm properties ---------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the BlackfinMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFINTARGETASMINFO_H
+#define BLACKFINTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+
+  /// BlackfinMCAsmInfo - asm syntax properties for the Blackfin target;
+  /// all customization happens in the constructor.
+  struct BlackfinMCAsmInfo : public MCAsmInfo {
+    explicit BlackfinMCAsmInfo(const Target &T, const StringRef &TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.cpp b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
new file mode 100644
index 0000000..224165b
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.cpp
@@ -0,0 +1,474 @@
+//===- BlackfinRegisterInfo.cpp - Blackfin Register Information -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Blackfin.h"
+#include "BlackfinRegisterInfo.h"
+#include "BlackfinSubtarget.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+// The two opcodes passed to the generated base class identify the call
+// frame setup/destroy pseudo instructions.
+BlackfinRegisterInfo::BlackfinRegisterInfo(BlackfinSubtarget &st,
+                                           const TargetInstrInfo &tii)
+  : BlackfinGenRegisterInfo(BF::ADJCALLSTACKDOWN, BF::ADJCALLSTACKUP),
+    Subtarget(st),
+    TII(tii) {}
+
+/// getCalleeSavedRegs - Return the null-terminated list of callee-saved
+/// registers. Order must stay in sync with getCalleeSavedRegClasses below.
+const unsigned*
+BlackfinRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  using namespace BF;
+  static const unsigned CalleeSavedRegs[] = {
+    FP,
+    R4, R5, R6, R7,
+    P3, P4, P5,
+    0 };
+  return  CalleeSavedRegs;
+}
+
+/// getCalleeSavedRegClasses - Register classes matching, entry for entry,
+/// the list returned by getCalleeSavedRegs (FP/P3-P5 are P regs, R4-R7 are
+/// D regs); also null-terminated.
+const TargetRegisterClass* const *BlackfinRegisterInfo::
+getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  using namespace BF;
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &PRegClass,
+    &DRegClass, &DRegClass, &DRegClass, &DRegClass,
+    &PRegClass, &PRegClass, &PRegClass,
+    0 };
+  return CalleeSavedRegClasses;
+}
+
+/// getReservedRegs - Registers the allocator must not use: the status/flag
+/// bits, cycle counters, the hardware loop L registers, the stack pointer,
+/// the return address register, and the frame pointer when one is needed.
+BitVector
+BlackfinRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  using namespace BF;
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AZ);
+  Reserved.set(AN);
+  Reserved.set(AQ);
+  Reserved.set(AC0);
+  Reserved.set(AC1);
+  Reserved.set(AV0);
+  Reserved.set(AV0S);
+  Reserved.set(AV1);
+  Reserved.set(AV1S);
+  Reserved.set(V);
+  Reserved.set(VS);
+  Reserved.set(CYCLES).set(CYCLES2);
+  Reserved.set(L0);
+  Reserved.set(L1);
+  Reserved.set(L2);
+  Reserved.set(L3);
+  Reserved.set(SP);
+  Reserved.set(RETS);
+  if (hasFP(MF))
+    Reserved.set(FP);
+  return Reserved;
+}
+
+/// getPhysicalRegisterRegClass - Return the smallest register class of the
+/// requested type that contains the given physical register (linear scan
+/// over all register classes; MVT::Other matches any type).
+const TargetRegisterClass*
+BlackfinRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, EVT VT) const {
+  assert(isPhysicalRegister(reg) && "reg must be a physical register");
+
+  // Pick the smallest register class of the right type that contains
+  // this physreg.
+  const TargetRegisterClass* BestRC = 0;
+  for (regclass_iterator I = regclass_begin(), E = regclass_end();
+       I != E; ++I) {
+    const TargetRegisterClass* RC = *I;
+    if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
+        (!BestRC || RC->getNumRegs() < BestRC->getNumRegs()))
+      BestRC = RC;
+  }
+
+  assert(BestRC && "Couldn't find the register class");
+  return BestRC;
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+bool BlackfinRegisterInfo::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // FP is also required when the function makes calls (no red zone).
+  return NoFramePointerElim || MFI->hasCalls() || MFI->hasVarSizedObjects();
+}
+
+// Always scavenge: eliminateFrameIndex and adjustRegister may need a
+// scratch register for out-of-range offsets.
+bool BlackfinRegisterInfo::
+requiresRegisterScavenging(const MachineFunction &MF) const {
+  return true;
+}
+
+// Emit instructions to add delta to D/P register. ScratchReg must be of the
+// same class as Reg (P).
+// Emit instructions to add delta to D/P register. ScratchReg must be of the
+// same class as Reg (P).
+void BlackfinRegisterInfo::adjustRegister(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          DebugLoc DL,
+                                          unsigned Reg,
+                                          unsigned ScratchReg,
+                                          int delta) const {
+  if (!delta)
+    return;
+  // Small deltas fit the 7-bit immediate add and need no scratch register.
+  if (isInt<7>(delta)) {
+    BuildMI(MBB, I, DL, TII.get(BF::ADDpp_imm7), Reg)
+      .addReg(Reg)              // No kill on two-addr operand
+      .addImm(delta);
+    return;
+  }
+
+  // We must load delta into ScratchReg and add that.
+  loadConstant(MBB, I, DL, ScratchReg, delta);
+  if (BF::PRegClass.contains(Reg)) {
+    assert(BF::PRegClass.contains(ScratchReg) &&
+           "ScratchReg must be a P register");
+    BuildMI(MBB, I, DL, TII.get(BF::ADDpp), Reg)
+      .addReg(Reg, RegState::Kill)
+      .addReg(ScratchReg, RegState::Kill);
+  } else {
+    assert(BF::DRegClass.contains(Reg) && "Reg must be a D or P register");
+    assert(BF::DRegClass.contains(ScratchReg) &&
+           "ScratchReg must be a D register");
+    BuildMI(MBB, I, DL, TII.get(BF::ADD), Reg)
+      .addReg(Reg, RegState::Kill)
+      .addReg(ScratchReg, RegState::Kill);
+  }
+}
+
+// Emit instructions to load a constant into D/P register
+// Emit instructions to load a constant into D/P register, choosing the
+// smallest encoding: 7-bit signed, 16-bit unsigned, 16-bit signed, or two
+// 16-bit half loads for the general case.
+void BlackfinRegisterInfo::loadConstant(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator I,
+                                        DebugLoc DL,
+                                        unsigned Reg,
+                                        int value) const {
+  if (isInt<7>(value)) {
+    BuildMI(MBB, I, DL, TII.get(BF::LOADimm7), Reg).addImm(value);
+    return;
+  }
+
+  if (isUint<16>(value)) {
+    BuildMI(MBB, I, DL, TII.get(BF::LOADuimm16), Reg).addImm(value);
+    return;
+  }
+
+  if (isInt<16>(value)) {
+    BuildMI(MBB, I, DL, TII.get(BF::LOADimm16), Reg).addImm(value);
+    return;
+  }
+
+  // We must split into halves. The implicit defs/kills keep the liveness
+  // of the full register correct across the two subregister writes.
+  BuildMI(MBB, I, DL,
+          TII.get(BF::LOAD16i), getSubReg(Reg, bfin_subreg_hi16))
+    .addImm((value >> 16) & 0xffff)
+    .addReg(Reg, RegState::ImplicitDefine);
+  BuildMI(MBB, I, DL,
+          TII.get(BF::LOAD16i), getSubReg(Reg, bfin_subreg_lo16))
+    .addImm(value & 0xffff)
+    .addReg(Reg, RegState::ImplicitKill)
+    .addReg(Reg, RegState::ImplicitDefine);
+}
+
+/// eliminateCallFramePseudoInstr - Replace ADJCALLSTACKDOWN/UP pseudos with
+/// real SP adjustments (only needed when the call frame is not folded into
+/// the fixed frame); P1 serves as the scratch register for large amounts.
+void BlackfinRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF,
+                              MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    int64_t Amount = I->getOperand(0).getImm();
+    if (Amount != 0) {
+      assert(Amount%4 == 0 && "Unaligned call frame size");
+      if (I->getOpcode() == BF::ADJCALLSTACKDOWN) {
+        adjustRegister(MBB, I, I->getDebugLoc(), BF::SP, BF::P1, -Amount);
+      } else {
+        assert(I->getOpcode() == BF::ADJCALLSTACKUP &&
+               "Unknown call frame pseudo instruction");
+        adjustRegister(MBB, I, I->getDebugLoc(), BF::SP, BF::P1, Amount);
+      }
+    }
+  }
+  // The pseudo instruction is always removed.
+  MBB.erase(I);
+}
+
+/// findScratchRegister - Find a 'free' register. Try for a call-clobbered
+/// register first and then a spilled callee-saved register if that fails.
+/// Returns 0 only if the scavenger itself returns 0; callers assert on that.
+static unsigned findScratchRegister(MachineBasicBlock::iterator II,
+                                    RegScavenger *RS,
+                                    const TargetRegisterClass *RC,
+                                    int SPAdj) {
+  assert(RS && "Register scavenging must be on");
+  // FindUnusedReg is cheap (no spilling); fall back to a full scavenge
+  // that may insert spill code around II.
+  unsigned Reg = RS->FindUnusedReg(RC);
+  if (Reg == 0)
+    Reg = RS->scavengeRegister(RC, II, SPAdj);
+  return Reg;
+}
+
+// Rewrite a frame-index operand into a [BaseReg + Offset] addressing form.
+// BaseReg is FP when the function has a frame pointer, otherwise SP (with
+// the offset adjusted by the stack size and any SP adjustment in flight).
+// Each opcode case below selects the tightest real-instruction encoding
+// for the final offset, scavenging a scratch register where needed.
+unsigned
+BlackfinRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                          int SPAdj, int *Value,
+                                          RegScavenger *RS) const {
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  DebugLoc DL = MI.getDebugLoc();
+
+  // Locate the FrameIndex operand; the immediate offset follows it.
+  unsigned FIPos;
+  for (FIPos=0; !MI.getOperand(FIPos).isFI(); ++FIPos) {
+    assert(FIPos < MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+  int FrameIndex = MI.getOperand(FIPos).getIndex();
+  assert(FIPos+1 < MI.getNumOperands() && MI.getOperand(FIPos+1).isImm());
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex)
+    + MI.getOperand(FIPos+1).getImm();
+  unsigned BaseReg = BF::FP;
+  if (hasFP(MF)) {
+    assert(SPAdj==0 && "Unexpected SP adjust in function with frame pointer");
+  } else {
+    BaseReg = BF::SP;
+    Offset += MF.getFrameInfo()->getStackSize() + SPAdj;
+  }
+
+  bool isStore = false;
+
+  switch (MI.getOpcode()) {
+  case BF::STORE32fi:
+    isStore = true;
+    // FALL THROUGH: store shares the i32 offset-encoding logic below.
+  case BF::LOAD32fi: {
+    assert(Offset%4 == 0 && "Unaligned i32 stack access");
+    assert(FIPos==1 && "Bad frame index operand");
+    MI.getOperand(FIPos).ChangeToRegister(BaseReg, false);
+    MI.getOperand(FIPos+1).setImm(Offset);
+    // Tightest encoding first: 6-bit unsigned (scaled by 4)...
+    if (isUint<6>(Offset)) {
+      MI.setDesc(TII.get(isStore
+                         ? BF::STORE32p_uimm6m4
+                         : BF::LOAD32p_uimm6m4));
+      return 0;
+    }
+    // ...then FP-relative negative 7-bit (scaled by 4), stored negated...
+    if (BaseReg == BF::FP && isUint<7>(-Offset)) {
+      MI.setDesc(TII.get(isStore
+                         ? BF::STORE32fp_nimm7m4
+                         : BF::LOAD32fp_nimm7m4));
+      MI.getOperand(FIPos+1).setImm(-Offset);
+      return 0;
+    }
+    // ...then the general 18-bit signed form.
+    if (isInt<18>(Offset)) {
+      MI.setDesc(TII.get(isStore
+                         ? BF::STORE32p_imm18m4
+                         : BF::LOAD32p_imm18m4));
+      return 0;
+    }
+    // Use RegScavenger to calculate proper offset...
+    MI.dump();
+    llvm_unreachable("Stack frame offset too big");
+    break;
+  }
+  case BF::ADDpp: {
+    assert(MI.getOperand(0).isReg() && "ADD instruction needs a register");
+    unsigned DestReg = MI.getOperand(0).getReg();
+    // We need to produce a stack offset in a P register. We emit:
+    // P0 = offset;
+    // P0 = BR + P0;
+    assert(FIPos==1 && "Bad frame index operand");
+    loadConstant(MBB, II, DL, DestReg, Offset);
+    // Rewrite ADDpp to DestReg = DestReg (killed) + BaseReg.
+    MI.getOperand(1).ChangeToRegister(DestReg, false, false, true);
+    MI.getOperand(2).ChangeToRegister(BaseReg, false);
+    break;
+  }
+  case BF::STORE16fi:
+    isStore = true;
+    // FALL THROUGH: store shares the i16 handling below.
+  case BF::LOAD16fi: {
+    assert(Offset%2 == 0 && "Unaligned i16 stack access");
+    assert(FIPos==1 && "Bad frame index operand");
+    // We need a P register to use as an address
+    // (no 16-bit load/store takes an immediate offset directly).
+    unsigned ScratchReg = findScratchRegister(II, RS, &BF::PRegClass, SPAdj);
+    assert(ScratchReg && "Could not scavenge register");
+    // ScratchReg = BaseReg + Offset, then use it as the sole address operand.
+    loadConstant(MBB, II, DL, ScratchReg, Offset);
+    BuildMI(MBB, II, DL, TII.get(BF::ADDpp), ScratchReg)
+      .addReg(ScratchReg, RegState::Kill)
+      .addReg(BaseReg);
+    MI.setDesc(TII.get(isStore ? BF::STORE16pi : BF::LOAD16pi));
+    MI.getOperand(1).ChangeToRegister(ScratchReg, false, false, true);
+    MI.RemoveOperand(2);
+    break;
+  }
+  case BF::STORE8fi: {
+    // This is an AnyCC spill, we need a scratch register.
+    assert(FIPos==1 && "Bad frame index operand");
+    MachineOperand SpillReg = MI.getOperand(0);
+    unsigned ScratchReg = findScratchRegister(II, RS, &BF::DRegClass, SPAdj);
+    assert(ScratchReg && "Could not scavenge register");
+    if (SpillReg.getReg()==BF::NCC) {
+      // Materialize NCC as a D register: zero-extend CC, then toggle bit 0.
+      BuildMI(MBB, II, DL, TII.get(BF::MOVENCC_z), ScratchReg)
+        .addOperand(SpillReg);
+      BuildMI(MBB, II, DL, TII.get(BF::BITTGL), ScratchReg)
+        .addReg(ScratchReg).addImm(0);
+    } else {
+      BuildMI(MBB, II, DL, TII.get(BF::MOVECC_zext), ScratchReg)
+        .addOperand(SpillReg);
+    }
+    // STORE D
+    MI.setDesc(TII.get(BF::STORE8p_imm16));
+    MI.getOperand(0).ChangeToRegister(ScratchReg, false, false, true);
+    MI.getOperand(FIPos).ChangeToRegister(BaseReg, false);
+    MI.getOperand(FIPos+1).setImm(Offset);
+    break;
+  }
+  case BF::LOAD8fi: {
+    // This is an restore, we need a scratch register.
+    assert(FIPos==1 && "Bad frame index operand");
+    MachineOperand SpillReg = MI.getOperand(0);
+    unsigned ScratchReg = findScratchRegister(II, RS, &BF::DRegClass, SPAdj);
+    assert(ScratchReg && "Could not scavenge register");
+    MI.setDesc(TII.get(BF::LOAD32p_imm16_8z));
+    MI.getOperand(0).ChangeToRegister(ScratchReg, true);
+    MI.getOperand(FIPos).ChangeToRegister(BaseReg, false);
+    MI.getOperand(FIPos+1).setImm(Offset);
+    // Insert the CC/NCC re-materialization after the rewritten load.
+    ++II;
+    if (SpillReg.getReg()==BF::CC) {
+      // CC = D
+      BuildMI(MBB, II, DL, TII.get(BF::MOVECC_nz), BF::CC)
+        .addReg(ScratchReg, RegState::Kill);
+    } else {
+      // Restore NCC (CC = D==0)
+      BuildMI(MBB, II, DL, TII.get(BF::SETEQri_not), BF::NCC)
+        .addReg(ScratchReg, RegState::Kill)
+        .addImm(0);
+    }
+    break;
+  }
+  default:
+    llvm_unreachable("Cannot eliminate frame index");
+    break;
+  }
+  return 0;
+}
+
+// If this function will need register scavenging (see
+// requiresRegisterScavenging), reserve a spill slot for the scavenger's
+// emergency spills before callee-saved registers are scanned.
+void BlackfinRegisterInfo::
+processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                     RegScavenger *RS) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Slot is sized/aligned for a DP register (the widest class scavenged).
+  const TargetRegisterClass *RC = BF::DPRegisterClass;
+  if (requiresRegisterScavenging(MF)) {
+    // Reserve a slot close to SP or frame pointer.
+    RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+  }
+}
+
+// No per-function frame fixups are needed before frame finalization.
+void BlackfinRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+}
+
+// Emit a prologue that sets up a stack frame.
+// On function entry, R0-R2 and P0 may hold arguments.
+// R3, P1, and P2 may be used as scratch registers
+void BlackfinRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  DebugLoc dl = (MBBI != MBB.end()
+                 ? MBBI->getDebugLoc()
+                 : DebugLoc::getUnknownLoc());
+
+  // Round the frame size up to a multiple of 4; emitEpilogue asserts this.
+  int FrameSize = MFI->getStackSize();
+  if (FrameSize%4) {
+    FrameSize = (FrameSize+3) & ~3;
+    MFI->setStackSize(FrameSize);
+  }
+
+  // Leaf function without a frame pointer: just drop SP.
+  if (!hasFP(MF)) {
+    assert(!MFI->hasCalls() &&
+           "FP elimination on a non-leaf function is not supported");
+    adjustRegister(MBB, MBBI, dl, BF::SP, BF::P1, -FrameSize);
+    return;
+  }
+
+  // emit a LINK instruction
+  // 0x3ffff is presumably the LINK immediate's encoding limit — confirm
+  // against the Blackfin ISA reference.
+  if (FrameSize <= 0x3ffff) {
+    BuildMI(MBB, MBBI, dl, TII.get(BF::LINK)).addImm(FrameSize);
+    return;
+  }
+
+  // Frame is too big, do a manual LINK:
+  // [--SP] = RETS;
+  // [--SP] = FP;
+  // FP = SP;
+  // P1 = -FrameSize;
+  // SP = SP + P1;
+  BuildMI(MBB, MBBI, dl, TII.get(BF::PUSH))
+    .addReg(BF::RETS, RegState::Kill);
+  BuildMI(MBB, MBBI, dl, TII.get(BF::PUSH))
+    .addReg(BF::FP, RegState::Kill);
+  BuildMI(MBB, MBBI, dl, TII.get(BF::MOVE), BF::FP)
+    .addReg(BF::SP);
+  loadConstant(MBB, MBBI, dl, BF::P1, -FrameSize);
+  BuildMI(MBB, MBBI, dl, TII.get(BF::ADDpp), BF::SP)
+    .addReg(BF::SP, RegState::Kill)
+    .addReg(BF::P1, RegState::Kill);
+
+}
+
+// Emit the matching epilogue: restore SP (leaf, no-FP case) or UNLINK,
+// which undoes both the implicit LINK and the manual LINK sequence from
+// emitPrologue (it restores FP and RETS from the frame).
+void BlackfinRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  DebugLoc dl = MBBI->getDebugLoc();
+
+  // emitPrologue rounded the frame size up to a multiple of 4.
+  int FrameSize = MFI->getStackSize();
+  assert(FrameSize%4 == 0 && "Misaligned frame size");
+
+  if (!hasFP(MF)) {
+    assert(!MFI->hasCalls() &&
+           "FP elimination on a non-leaf function is not supported");
+    adjustRegister(MBB, MBBI, dl, BF::SP, BF::P1, FrameSize);
+    return;
+  }
+
+  // emit an UNLINK instruction
+  BuildMI(MBB, MBBI, dl, TII.get(BF::UNLINK));
+}
+
+// RETS holds the return address on Blackfin.
+unsigned BlackfinRegisterInfo::getRARegister() const {
+  return BF::RETS;
+}
+
+// Frame base register: FP when a frame pointer exists, otherwise SP.
+unsigned
+BlackfinRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return hasFP(MF) ? BF::FP : BF::SP;
+}
+
+// Exception handling is not implemented for this target yet.
+unsigned BlackfinRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+  return 0;
+}
+
+// Exception handling is not implemented for this target yet.
+unsigned BlackfinRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+  return 0;
+}
+
+// DWARF register numbering is not wired up yet (the .td file does assign
+// DwarfRegNum values, but this hook deliberately aborts).
+int BlackfinRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  llvm_unreachable("What is the dwarf register number");
+  return -1;
+}
+
+#include "BlackfinGenRegisterInfo.inc"
+
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.h b/lib/Target/Blackfin/BlackfinRegisterInfo.h
new file mode 100644
index 0000000..68ef08a
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.h
@@ -0,0 +1,104 @@
+//===- BlackfinRegisterInfo.h - Blackfin Register Information ..-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Blackfin implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFINREGISTERINFO_H
+#define BLACKFINREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "BlackfinGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+  class BlackfinSubtarget;
+  class TargetInstrInfo;
+  class Type;
+
+  // Subregister indices, keep in sync with BlackfinRegisterInfo.td
+  // (the .td PatLeafs use the same values but name index 3
+  // "bfin_subreg_32bit" — the low 32 bits of a 40-bit accumulator).
+  enum BfinSubregIdx {
+    bfin_subreg_lo16 = 1,
+    bfin_subreg_hi16 = 2,
+    bfin_subreg_lo32 = 3
+  };
+
+  // Blackfin implementation of TargetRegisterInfo: register reservation,
+  // frame-index elimination, prologue/epilogue emission, and small
+  // code-emission utilities shared by those.
+  struct BlackfinRegisterInfo : public BlackfinGenRegisterInfo {
+    BlackfinSubtarget &Subtarget;
+    const TargetInstrInfo &TII;
+
+    BlackfinRegisterInfo(BlackfinSubtarget &st, const TargetInstrInfo &tii);
+
+    /// Code Generation virtual methods...
+    const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+    const TargetRegisterClass* const*
+    getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+    BitVector getReservedRegs(const MachineFunction &MF) const;
+
+    // getSubReg implemented by tablegen
+
+    const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const {
+      return &BF::PRegClass;
+    }
+
+    const TargetRegisterClass *getPhysicalRegisterRegClass(unsigned reg,
+                                                           EVT VT) const;
+
+    bool hasFP(const MachineFunction &MF) const;
+
+    // bool hasReservedCallFrame(MachineFunction &MF) const;
+
+    bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+    void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I) const;
+
+    unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                 int SPAdj, int *Value = NULL,
+                                 RegScavenger *RS = NULL) const;
+
+    void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                              RegScavenger *RS) const;
+
+    void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+    void emitPrologue(MachineFunction &MF) const;
+    void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+    unsigned getFrameRegister(const MachineFunction &MF) const;
+    unsigned getRARegister() const;
+
+    // Exception handling queries.
+    unsigned getEHExceptionRegister() const;
+    unsigned getEHHandlerRegister() const;
+
+    int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+    // Utility functions
+    // adjustRegister: Reg += delta, using ScratchReg when delta does not
+    // fit an immediate form.
+    void adjustRegister(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator I,
+                        DebugLoc DL,
+                        unsigned Reg,
+                        unsigned ScratchReg,
+                        int delta) const;
+    // loadConstant: materialize `value` in Reg with the smallest encoding.
+    void loadConstant(MachineBasicBlock &MBB,
+                      MachineBasicBlock::iterator I,
+                      DebugLoc DL,
+                      unsigned Reg,
+                      int value) const;
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinRegisterInfo.td b/lib/Target/Blackfin/BlackfinRegisterInfo.td
new file mode 100644
index 0000000..d396cc8
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinRegisterInfo.td
@@ -0,0 +1,385 @@
+//===- BlackfinRegisterInfo.td - Blackfin Register defs ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Blackfin register file
+//===----------------------------------------------------------------------===//
+
+// Registers are identified with 3-bit group and 3-bit ID numbers.
+
+// Base class for all Blackfin registers; Group/Num give the 3+3-bit
+// hardware encoding.
+class BlackfinReg<string n> : Register<n> {
+  field bits<3> Group;
+  field bits<3> Num;
+  let Namespace = "BF";
+}
+
+// Rc - 1-bit registers
+// BitNum is the bit's position within ASTAT.
+class Rc<bits<5> bitno, string n> : BlackfinReg<n> {
+  field bits<5> BitNum = bitno;
+}
+
+// Rs - 16-bit integer registers
+// High selects the high (1) or low (0) half of the 32-bit parent.
+class Rs<bits<3> group, bits<3> num, bits<1> hi, string n> : BlackfinReg<n> {
+  let Group = group;
+  let Num = num;
+  field bits<1> High = hi;
+}
+
+// Ri - 32-bit integer registers with subregs
+class Ri<bits<3> group, bits<3> num, string n> : BlackfinReg<n> {
+  let Group = group;
+  let Num = num;
+}
+
+// Ra 40-bit accumulator registers
+// Composed of an 8-bit .x extension and a 32-bit .w word (the subs list).
+class Ra<bits<3> num, string n, list<Register> subs> : BlackfinReg<n> {
+  let SubRegs = subs;
+  let Group = 4;
+  let Num = num;
+}
+
+// Two halves of 32-bit register
+// Defines <name>H and <name>L as "<n>.h" / "<n>.l".
+multiclass Rss<bits<3> group, bits<3> num, string n> {
+  def H : Rs<group, num, 1, !strconcat(n, ".h")>;
+  def L : Rs<group, num, 0, !strconcat(n, ".l")>;
+}
+
+// Rii - 32-bit integer registers with subregs
+// Like Ri but with an explicit [hi, lo] subregister list.
+class Rii<bits<3> group, bits<3> num, string n, list<Register> subs>
+      : BlackfinReg<n> {
+  let SubRegs = subs;
+  let Group = group;
+  let Num = num;
+}
+
+// Status bits are all part of ASTAT
+// NCC is a pseudo-register modeling the negated CC condition; it aliases
+// CC (same BitNum 5) so the allocator treats them as conflicting.
+def AZ   : Rc<0,  "az">;
+def AN   : Rc<1,  "an">;
+def CC   : Rc<5,  "cc">, DwarfRegNum<[34]>;
+def NCC  : Rc<5,  "!cc"> { let Aliases = [CC]; }
+def AQ   : Rc<6,  "aq">;
+def AC0  : Rc<12, "ac0">;
+def AC1  : Rc<13, "ac1">;
+def AV0  : Rc<16, "av0">;
+def AV0S : Rc<17, "av0s">;
+def AV1  : Rc<18, "av1">;
+def AV1S : Rc<19, "av1s">;
+def V    : Rc<24, "v">;
+def VS   : Rc<25, "vs">;
+// Skipped non-status bits: AC0_COPY, V_COPY, RND_MOD
+
+// Group 0: Integer registers
+defm R0 : Rss<0, 0, "r0">;
+def  R0 : Rii<0, 0, "r0", [R0H, R0L]>, DwarfRegNum<[0]>;
+defm R1 : Rss<0, 1, "r1">;
+def  R1 : Rii<0, 1, "r1", [R1H, R1L]>, DwarfRegNum<[1]>;
+defm R2 : Rss<0, 2, "r2">;
+def  R2 : Rii<0, 2, "r2", [R2H, R2L]>, DwarfRegNum<[2]>;
+defm R3 : Rss<0, 3, "r3">;
+def  R3 : Rii<0, 3, "r3", [R3H, R3L]>, DwarfRegNum<[3]>;
+defm R4 : Rss<0, 4, "r4">;
+def  R4 : Rii<0, 4, "r4", [R4H, R4L]>, DwarfRegNum<[4]>;
+defm R5 : Rss<0, 5, "r5">;
+def  R5 : Rii<0, 5, "r5", [R5H, R5L]>, DwarfRegNum<[5]>;
+defm R6 : Rss<0, 6, "r6">;
+def  R6 : Rii<0, 6, "r6", [R6H, R6L]>, DwarfRegNum<[6]>;
+defm R7 : Rss<0, 7, "r7">;
+def  R7 : Rii<0, 7, "r7", [R7H, R7L]>, DwarfRegNum<[7]>;
+
+// Group 1: Pointer registers
+defm P0 : Rss<1, 0, "p0">;
+def  P0 : Rii<1, 0, "p0", [P0H, P0L]>, DwarfRegNum<[8]>;
+defm P1 : Rss<1, 1, "p1">;
+def  P1 : Rii<1, 1, "p1", [P1H, P1L]>, DwarfRegNum<[9]>;
+defm P2 : Rss<1, 2, "p2">;
+def  P2 : Rii<1, 2, "p2", [P2H, P2L]>, DwarfRegNum<[10]>;
+defm P3 : Rss<1, 3, "p3">;
+def  P3 : Rii<1, 3, "p3", [P3H, P3L]>, DwarfRegNum<[11]>;
+defm P4 : Rss<1, 4, "p4">;
+def  P4 : Rii<1, 4, "p4", [P4H, P4L]>, DwarfRegNum<[12]>;
+defm P5 : Rss<1, 5, "p5">;
+def  P5 : Rii<1, 5, "p5", [P5H, P5L]>, DwarfRegNum<[13]>;
+defm SP : Rss<1, 6, "sp">;
+def  SP : Rii<1, 6, "sp", [SPH, SPL]>, DwarfRegNum<[14]>;
+defm FP : Rss<1, 7, "fp">;
+def  FP : Rii<1, 7, "fp", [FPH, FPL]>, DwarfRegNum<[15]>;
+
+// Group 2: Index registers
+defm I0 : Rss<2, 0, "i0">;
+def  I0 : Rii<2, 0, "i0", [I0H, I0L]>, DwarfRegNum<[16]>;
+defm I1 : Rss<2, 1, "i1">;
+def  I1 : Rii<2, 1, "i1", [I1H, I1L]>, DwarfRegNum<[17]>;
+defm I2 : Rss<2, 2, "i2">;
+def  I2 : Rii<2, 2, "i2", [I2H, I2L]>, DwarfRegNum<[18]>;
+defm I3 : Rss<2, 3, "i3">;
+def  I3 : Rii<2, 3, "i3", [I3H, I3L]>, DwarfRegNum<[19]>;
+defm M0 : Rss<2, 4, "m0">;
+def  M0 : Rii<2, 4, "m0", [M0H, M0L]>, DwarfRegNum<[20]>;
+defm M1 : Rss<2, 5, "m1">;
+def  M1 : Rii<2, 5, "m1", [M1H, M1L]>, DwarfRegNum<[21]>;
+defm M2 : Rss<2, 6, "m2">;
+def  M2 : Rii<2, 6, "m2", [M2H, M2L]>, DwarfRegNum<[22]>;
+defm M3 : Rss<2, 7, "m3">;
+def  M3 : Rii<2, 7, "m3", [M3H, M3L]>, DwarfRegNum<[23]>;
+
+// Group 3: Cyclic indexing registers
+defm B0 : Rss<3, 0, "b0">;
+def  B0 : Rii<3, 0, "b0", [B0H, B0L]>, DwarfRegNum<[24]>;
+defm B1 : Rss<3, 1, "b1">;
+def  B1 : Rii<3, 1, "b1", [B1H, B1L]>, DwarfRegNum<[25]>;
+defm B2 : Rss<3, 2, "b2">;
+def  B2 : Rii<3, 2, "b2", [B2H, B2L]>, DwarfRegNum<[26]>;
+defm B3 : Rss<3, 3, "b3">;
+def  B3 : Rii<3, 3, "b3", [B3H, B3L]>, DwarfRegNum<[27]>;
+defm L0 : Rss<3, 4, "l0">;
+def  L0 : Rii<3, 4, "l0", [L0H, L0L]>, DwarfRegNum<[28]>;
+defm L1 : Rss<3, 5, "l1">;
+def  L1 : Rii<3, 5, "l1", [L1H, L1L]>, DwarfRegNum<[29]>;
+defm L2 : Rss<3, 6, "l2">;
+def  L2 : Rii<3, 6, "l2", [L2H, L2L]>, DwarfRegNum<[30]>;
+defm L3 : Rss<3, 7, "l3">;
+def  L3 : Rii<3, 7, "l3", [L3H, L3L]>, DwarfRegNum<[31]>;
+
+// Accumulators
+def  A0X : Ri <4, 0, "a0.x">;
+defm A0  : Rss<4, 1, "a0">;
+def  A0W : Rii<4, 1, "a0.w", [A0H, A0L]>, DwarfRegNum<[32]>;
+def  A0  : Ra <0, "a0", [A0X, A0W]>;
+
+def  A1X : Ri <4, 2, "a1.x">;
+defm A1  : Rss<4, 3, "a1">;
+def  A1W : Rii<4, 3, "a1.w", [A1H, A1L]>, DwarfRegNum<[33]>;
+def  A1  : Ra <2, "a1", [A1X, A1W]>;
+
+def RETS : Ri<4, 7, "rets">,  DwarfRegNum<[35]>;
+def RETI : Ri<7, 3, "reti">,  DwarfRegNum<[36]>;
+def RETX : Ri<7, 4, "retx">,  DwarfRegNum<[37]>;
+def RETN : Ri<7, 5, "retn">,  DwarfRegNum<[38]>;
+def RETE : Ri<7, 6, "rete">,  DwarfRegNum<[39]>;
+
+def ASTAT   : Ri<4, 6, "astat">,   DwarfRegNum<[40]> {
+  let SubRegs = [AZ, AN, CC, NCC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS];
+}
+
+def SEQSTAT : Ri<7, 1, "seqstat">, DwarfRegNum<[41]>;
+def USP     : Ri<7, 0, "usp">,     DwarfRegNum<[42]>;
+def EMUDAT  : Ri<7, 7, "emudat">,  DwarfRegNum<[43]>;
+def SYSCFG  : Ri<7, 2, "syscfg">;
+def CYCLES  : Ri<6, 6, "cycles">;
+def CYCLES2 : Ri<6, 7, "cycles2">;
+
+// Hardware loops
+def LT0 : Ri<6, 1, "lt0">, DwarfRegNum<[44]>;
+def LT1 : Ri<6, 4, "lt1">, DwarfRegNum<[45]>;
+def LC0 : Ri<6, 0, "lc0">, DwarfRegNum<[46]>;
+def LC1 : Ri<6, 3, "lc1">, DwarfRegNum<[47]>;
+def LB0 : Ri<6, 2, "lb0">, DwarfRegNum<[48]>;
+def LB1 : Ri<6, 5, "lb1">, DwarfRegNum<[49]>;
+
+// Subregs are:
+// 1: .L
+// 2: .H
+// 3: .W (32 low bits of 40-bit accu)
+// Keep in sync with enum in BlackfinRegisterInfo.h
+def bfin_subreg_lo16  : PatLeaf<(i32 1)>;
+def bfin_subreg_hi16  : PatLeaf<(i32 2)>;
+def bfin_subreg_32bit : PatLeaf<(i32 3)>;
+
+def : SubRegSet<1,
+    [R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,
+     P0,  P1,  P2,  P3,  P4,  P5,  SP,  FP,
+     I0,  I1,  I2,  I3,  M0,  M1,  M2,  M3,
+     B0,  B1,  B2,  B3,  L0,  L1,  L2,  L3],
+    [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L,
+     P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL,
+     I0L, I1L, I2L, I3L, M0L, M1L, M2L, M3L,
+     B0L, B1L, B2L, B3L, L0L, L1L, L2L, L3L]>;
+
+def : SubRegSet<2,
+    [R0,  R1,  R2,  R3,  R4,  R5,  R6,  R7,
+     P0,  P1,  P2,  P3,  P4,  P5,  SP,  FP,
+     I0,  I1,  I2,  I3,  M0,  M1,  M2,  M3,
+     B0,  B1,  B2,  B3,  L0,  L1,  L2,  L3],
+    [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H,
+     P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH,
+     I0H, I1H, I2H, I3H, M0H, M1H, M2H, M3H,
+     B0H, B1H, B2H, B3H, L0H, L1H, L2H, L3H]>;
+
+def : SubRegSet<1, [A0, A0W, A1, A1W], [A0L, A0L, A1L, A1L]>;
+def : SubRegSet<2, [A0, A0W, A1, A1W], [A0H, A0H, A1H, A1H]>;
+
+// Register classes.
+def D16 : RegisterClass<"BF", [i16], 16,
+    [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
+     R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L]>;
+
+def D16L : RegisterClass<"BF", [i16], 16,
+    [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L]>;
+
+def D16H : RegisterClass<"BF", [i16], 16,
+    [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H]>;
+
+def P16 : RegisterClass<"BF", [i16], 16,
+    [P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
+     P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>;
+
+def P16L : RegisterClass<"BF", [i16], 16,
+    [P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>;
+
+def P16H : RegisterClass<"BF", [i16], 16,
+    [P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>;
+
+def DP16 : RegisterClass<"BF", [i16], 16,
+    [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
+     R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L,
+     P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
+     P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL]>;
+
+def DP16L : RegisterClass<"BF", [i16], 16,
+    [R0L, R1L, R2L, R3L, R4L, R5L, R6L, R7L,
+     P0L, P1L, P2L, P3L, P4L, P5L, SPL, FPL]>;
+
+def DP16H : RegisterClass<"BF", [i16], 16,
+    [R0H, R1H, R2H, R3H, R4H, R5H, R6H, R7H,
+     P0H, P1H, P2H, P3H, P4H, P5H, SPH, FPH]>;
+
+def GR16 : RegisterClass<"BF", [i16], 16,
+    [R0H, R0L, R1H, R1L, R2H, R2L, R3H, R3L,
+     R4H, R4L, R5H, R5L, R6H, R6L, R7H, R7L,
+     P0H, P0L, P1H, P1L, P2H, P2L, P3H, P3L,
+     P4H, P4L, P5H, P5L, SPH, SPL, FPH, FPL,
+     I0H, I0L, I1H, I1L, I2H, I2L, I3H, I3L,
+     M0H, M0L, M1H, M1L, M2H, M2L, M3H, M3L,
+     B0H, B0L, B1H, B1L, B2H, B2L, B3H, B3L,
+     L0H, L0L, L1H, L1L, L2H, L2L, L3H, L3L]>;
+
+def D : RegisterClass<"BF", [i32], 32, [R0, R1, R2, R3, R4, R5, R6, R7]> {
+  let SubRegClassList = [D16L, D16H];
+}
+
+// Pointer registers. The allocation order ends before SP always, and
+// before FP too when the function keeps a frame pointer (7 vs 6 of the
+// 8 members are allocatable).
+def P : RegisterClass<"BF", [i32], 32, [P0, P1, P2, P3, P4, P5, FP, SP]> {
+  let SubRegClassList = [P16L, P16H];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    PClass::iterator
+    PClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      return allocation_order_begin(MF)
+             + (RI->hasFP(MF) ? 7 : 6);
+    }
+  }];
+}
+
+def I : RegisterClass<"BF", [i32], 32, [I0, I1, I2, I3]>;
+def M : RegisterClass<"BF", [i32], 32, [M0, M1, M2, M3]>;
+def B : RegisterClass<"BF", [i32], 32, [B0, B1, B2, B3]>;
+def L : RegisterClass<"BF", [i32], 32, [L0, L1, L2, L3]>;
+
+def DP : RegisterClass<"BF", [i32], 32,
+    [R0, R1, R2, R3, R4, R5, R6, R7,
+     P0, P1, P2, P3, P4, P5, FP, SP]> {
+  let SubRegClassList = [DP16L, DP16H];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    DPClass::iterator
+    DPClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      return allocation_order_begin(MF)
+             + (RI->hasFP(MF) ? 15 : 14);
+    }
+  }];
+}
+
+def GR : RegisterClass<"BF", [i32], 32,
+    [R0, R1, R2, R3, R4, R5, R6, R7,
+     P0, P1, P2, P3, P4, P5,
+     I0, I1, I2, I3, M0, M1, M2, M3,
+     B0, B1, B2, B3, L0, L1, L2, L3,
+     FP, SP]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GRClass::iterator
+    GRClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      return allocation_order_begin(MF)
+             + (RI->hasFP(MF) ? 31 : 30);
+    }
+  }];
+}
+
+def ALL : RegisterClass<"BF", [i32], 32,
+    [R0, R1, R2, R3, R4, R5, R6, R7,
+     P0, P1, P2, P3, P4, P5,
+     I0, I1, I2, I3, M0, M1, M2, M3,
+     B0, B1, B2, B3, L0, L1, L2, L3,
+     FP, SP,
+     A0X, A0W, A1X, A1W, ASTAT, RETS,
+     LC0, LT0, LB0, LC1, LT1, LB1, CYCLES, CYCLES2,
+     USP, SEQSTAT, SYSCFG, RETI, RETX, RETN, RETE, EMUDAT]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    ALLClass::iterator
+    ALLClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      return allocation_order_begin(MF)
+             + (RI->hasFP(MF) ? 31 : 30);
+    }
+  }];
+}
+
+def PI : RegisterClass<"BF", [i32], 32,
+    [P0, P1, P2, P3, P4, P5, I0, I1, I2, I3, FP, SP]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    PIClass::iterator
+    PIClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      return allocation_order_begin(MF)
+             + (RI->hasFP(MF) ? 11 : 10);
+    }
+  }];
+}
+
+// We are going to pretend that CC and !CC are 32-bit registers, even though
+// they only can hold 1 bit.
+let CopyCost = -1, Size = 8 in {
+def JustCC  : RegisterClass<"BF", [i32], 8, [CC]>;
+def NotCC   : RegisterClass<"BF", [i32], 8, [NCC]>;
+// AnyCC only ever allocates CC itself: allocation_order_end is begin+1,
+// so NCC is present for aliasing but never handed out.
+def AnyCC   : RegisterClass<"BF", [i32], 8, [CC, NCC]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    AnyCCClass::iterator
+    AnyCCClass::allocation_order_end(const MachineFunction &MF) const {
+      return allocation_order_begin(MF)+1;
+    }
+  }];
+}
+// Individual ASTAT status bits as 1-bit values.
+def StatBit : RegisterClass<"BF", [i1], 8,
+    [AZ, AN, CC, AQ, AC0, AC1, AV0, AV0S, AV1, AV1S, V, VS]>;
+}
+
+// Should be i40, but that isn't defined. It is not a legal type yet anyway.
+def Accu : RegisterClass<"BF", [i64], 64, [A0, A1]>;
diff --git a/lib/Target/Blackfin/BlackfinSubtarget.cpp b/lib/Target/Blackfin/BlackfinSubtarget.cpp
new file mode 100644
index 0000000..e104c52
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinSubtarget.cpp
@@ -0,0 +1,36 @@
+//===- BlackfinSubtarget.cpp - BLACKFIN Subtarget Information -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the blackfin specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinSubtarget.h"
+#include "BlackfinGenSubtarget.inc"
+
+using namespace llvm;
+
+// Construct the subtarget with every workaround/feature flag off, then
+// let the tablegen-generated ParseSubtargetFeatures enable flags named
+// in the feature string FS for the "generic" CPU.
+BlackfinSubtarget::BlackfinSubtarget(const std::string &TT,
+                                     const std::string &FS)
+  : sdram(false),
+    icplb(false),
+    wa_mi_shift(false),
+    wa_csync(false),
+    wa_specld(false),
+    wa_mmr_stall(false),
+    wa_lcregs(false),
+    wa_hwloop(false),
+    wa_ind_call(false),
+    wa_killed_mmr(false),
+    wa_rets(false)
+{
+  std::string CPU = "generic";
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/Blackfin/BlackfinSubtarget.h b/lib/Target/Blackfin/BlackfinSubtarget.h
new file mode 100644
index 0000000..d667fe2
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinSubtarget.h
@@ -0,0 +1,45 @@
+//===- BlackfinSubtarget.h - Define Subtarget for the Blackfin -*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the BLACKFIN specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFIN_SUBTARGET_H
+#define BLACKFIN_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+
+  // Blackfin subtarget: a set of boolean feature/hardware-workaround
+  // flags (wa_* presumably correspond to silicon-anomaly workarounds —
+  // confirm against the .td feature definitions), populated from the
+  // target feature string.
+  class BlackfinSubtarget : public TargetSubtarget {
+    bool sdram;
+    bool icplb;
+    bool wa_mi_shift;
+    bool wa_csync;
+    bool wa_specld;
+    bool wa_mmr_stall;
+    bool wa_lcregs;
+    bool wa_hwloop;
+    bool wa_ind_call;
+    bool wa_killed_mmr;
+    bool wa_rets;
+  public:
+    BlackfinSubtarget(const std::string &TT, const std::string &FS);
+
+    /// ParseSubtargetFeatures - Parses features string setting specified
+    /// subtarget options.  Definition of function is auto generated by tblgen.
+    std::string ParseSubtargetFeatures(const std::string &FS,
+                                       const std::string &CPU);
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.cpp b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
new file mode 100644
index 0000000..45d7c35
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.cpp
@@ -0,0 +1,42 @@
+//===-- BlackfinTargetMachine.cpp - Define TargetMachine for Blackfin -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "BlackfinTargetMachine.h"
+#include "Blackfin.h"
+#include "BlackfinMCAsmInfo.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetRegistry.h"
+
+using namespace llvm;
+
+// Register the Blackfin target machine and asm info with the global
+// target registry; called by LLVM's target-initialization machinery.
+extern "C" void LLVMInitializeBlackfinTarget() {
+  RegisterTargetMachine<BlackfinTargetMachine> X(TheBlackfinTarget);
+  RegisterAsmInfo<BlackfinMCAsmInfo> Y(TheBlackfinTarget);
+
+}
+
+// Construct the target machine. The data layout string declares a
+// little-endian target with 32-bit pointers, 32-bit alignment for
+// i64/f64, and 32-bit native integer width. Stack grows down with
+// 4-byte alignment.
+BlackfinTargetMachine::BlackfinTargetMachine(const Target &T,
+                                             const std::string &TT,
+                                             const std::string &FS)
+  : LLVMTargetMachine(T, TT),
+    DataLayout("e-p:32:32-i64:32-f64:32-n32"),
+    Subtarget(TT, FS),
+    TLInfo(*this),
+    InstrInfo(Subtarget),
+    FrameInfo(TargetFrameInfo::StackGrowsDown, 4, 0) {
+}
+
+// Install the Blackfin DAG instruction selector pass.
+// Returns false to indicate no error (LLVMTargetMachine convention).
+bool BlackfinTargetMachine::addInstSelector(PassManagerBase &PM,
+                                            CodeGenOpt::Level OptLevel) {
+  PM.add(createBlackfinISelDag(*this, OptLevel));
+  return false;
+}
diff --git a/lib/Target/Blackfin/BlackfinTargetMachine.h b/lib/Target/Blackfin/BlackfinTargetMachine.h
new file mode 100644
index 0000000..a14052b
--- /dev/null
+++ b/lib/Target/Blackfin/BlackfinTargetMachine.h
@@ -0,0 +1,59 @@
+//===-- BlackfinTargetMachine.h - TargetMachine for Blackfin ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Blackfin specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef BLACKFINTARGETMACHINE_H
+#define BLACKFINTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "BlackfinInstrInfo.h"
+#include "BlackfinSubtarget.h"
+#include "BlackfinISelLowering.h"
+#include "BlackfinIntrinsicInfo.h"
+
+namespace llvm {
+
+  class BlackfinTargetMachine : public LLVMTargetMachine {
+    const TargetData DataLayout;
+    BlackfinSubtarget Subtarget;
+    BlackfinTargetLowering TLInfo;
+    BlackfinInstrInfo InstrInfo;
+    TargetFrameInfo FrameInfo;
+    BlackfinIntrinsicInfo IntrinsicInfo;
+  public:
+    BlackfinTargetMachine(const Target &T, const std::string &TT,
+                          const std::string &FS);
+
+    virtual const BlackfinInstrInfo *getInstrInfo() const { return &InstrInfo; }
+    virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+    virtual const BlackfinSubtarget *getSubtargetImpl() const {
+      return &Subtarget;
+    }
+    virtual const BlackfinRegisterInfo *getRegisterInfo() const {
+      return &InstrInfo.getRegisterInfo();
+    }
+    virtual BlackfinTargetLowering* getTargetLowering() const {
+      return const_cast<BlackfinTargetLowering*>(&TLInfo);
+    }
+    virtual const TargetData *getTargetData() const { return &DataLayout; }
+    virtual bool addInstSelector(PassManagerBase &PM,
+                                 CodeGenOpt::Level OptLevel);
+    const TargetIntrinsicInfo *getIntrinsicInfo() const {
+      return &IntrinsicInfo;
+    }
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Blackfin/CMakeLists.txt b/lib/Target/Blackfin/CMakeLists.txt
new file mode 100644
index 0000000..deb005d
--- /dev/null
+++ b/lib/Target/Blackfin/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(LLVM_TARGET_DEFINITIONS Blackfin.td)
+
+tablegen(BlackfinGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(BlackfinGenRegisterNames.inc -gen-register-enums)
+tablegen(BlackfinGenRegisterInfo.inc -gen-register-desc)
+tablegen(BlackfinGenInstrNames.inc -gen-instr-enums)
+tablegen(BlackfinGenInstrInfo.inc -gen-instr-desc)
+tablegen(BlackfinGenAsmWriter.inc -gen-asm-writer)
+tablegen(BlackfinGenDAGISel.inc -gen-dag-isel)
+tablegen(BlackfinGenSubtarget.inc -gen-subtarget)
+tablegen(BlackfinGenCallingConv.inc -gen-callingconv)
+tablegen(BlackfinGenIntrinsics.inc -gen-tgt-intrinsic)
+
+add_llvm_target(BlackfinCodeGen
+  BlackfinInstrInfo.cpp
+  BlackfinIntrinsicInfo.cpp
+  BlackfinISelDAGToDAG.cpp
+  BlackfinISelLowering.cpp
+  BlackfinMCAsmInfo.cpp
+  BlackfinRegisterInfo.cpp
+  BlackfinSubtarget.cpp
+  BlackfinTargetMachine.cpp
+  )
diff --git a/lib/Target/Blackfin/Makefile b/lib/Target/Blackfin/Makefile
new file mode 100644
index 0000000..339bef9
--- /dev/null
+++ b/lib/Target/Blackfin/Makefile
@@ -0,0 +1,24 @@
+##===- lib/Target/Blackfin/Makefile ------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMBlackfinCodeGen
+TARGET = Blackfin
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = BlackfinGenRegisterInfo.h.inc BlackfinGenRegisterNames.inc \
+                BlackfinGenRegisterInfo.inc BlackfinGenInstrNames.inc \
+                BlackfinGenInstrInfo.inc BlackfinGenAsmWriter.inc \
+                BlackfinGenDAGISel.inc BlackfinGenSubtarget.inc \
+		BlackfinGenCallingConv.inc BlackfinGenIntrinsics.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Blackfin/README.txt b/lib/Target/Blackfin/README.txt
new file mode 100644
index 0000000..b4c8227
--- /dev/null
+++ b/lib/Target/Blackfin/README.txt
@@ -0,0 +1,244 @@
+//===-- README.txt - Notes for Blackfin Target ------------------*- org -*-===//
+
+* Condition codes
+** DONE Problem with asymmetric SETCC operations
+The instruction
+
+  CC = R0 < 2
+
+is not symmetric - there is no R0 > 2 instruction. On the other hand, IF CC
+JUMP can take both CC and !CC as a condition. We cannot pattern-match (brcond
+(not cc), target), the DAG optimizer removes that kind of thing.
+
+This is handled by creating a pseudo-register NCC that aliases CC. Register
+classes JustCC and NotCC are used to control the inversion of CC.
+
+** DONE CC as an i32 register
+The AnyCC register class pretends to hold i32 values. It can only represent the
+values 0 and 1, but we can copy to and from the D class. This hack makes it
+possible to represent the setcc instruction without having i1 as a legal type.
+
+In most cases, the CC register is set by a "CC = .." or BITTST instruction, and
+then used in a conditional branch or move. The code generator thinks it is
+moving 32 bits, but the value stays in CC. In other cases, the result of a
+comparison is actually used as an i32 number, and CC will be copied to a D
+register.
+
+* Stack frames
+** TODO Use Push/Pop instructions
+We should use the push/pop instructions when saving callee-saved
+registers. They are smaller, and we may even use push multiple instructions.
+
+** TODO requiresRegisterScavenging
+We need more intelligence in determining when the scavenger is needed. We
+should keep track of:
+- Spilling D16 registers
+- Spilling AnyCC registers
+
+* Assembler
+** TODO Implement PrintGlobalVariable
+** TODO Remove LOAD32sym
+It's a hack combining two instructions by concatenation.
+
+* Inline Assembly
+
+These are the GCC constraints from bfin/constraints.md:
+
+| Code  | Register class                            | LLVM |
+|-------+-------------------------------------------+------|
+| a     | P                                         | C    |
+| d     | D                                         | C    |
+| z     | Call clobbered P (P0, P1, P2)             | X    |
+| D     | EvenD                                     | X    |
+| W     | OddD                                      | X    |
+| e     | Accu                                      | C    |
+| A     | A0                                        | S    |
+| B     | A1                                        | S    |
+| b     | I                                         | C    |
+| v     | B                                         | C    |
+| f     | M                                         | C    |
+| c     | Circular I, B, L                          | X    |
+| C     | JustCC                                    | S    |
+| t     | LoopTop                                   | X    |
+| u     | LoopBottom                                | X    |
+| k     | LoopCount                                 | X    |
+| x     | GR                                        | C    |
+| y     | RET*, ASTAT, SEQSTAT, USP                 | X    |
+| w     | ALL                                       | C    |
+| Z     | The FD-PIC GOT pointer (P3)               | S    |
+| Y     | The FD-PIC function pointer register (P1) | S    |
+| q0-q7 | R0-R7 individually                        |      |
+| qA    | P0                                        |      |
+|-------+-------------------------------------------+------|
+| Code  | Constant                                  |      |
+|-------+-------------------------------------------+------|
+| J     | 1<<N, N<32                                |      |
+| Ks3   | imm3                                      |      |
+| Ku3   | uimm3                                     |      |
+| Ks4   | imm4                                      |      |
+| Ku4   | uimm4                                     |      |
+| Ks5   | imm5                                      |      |
+| Ku5   | uimm5                                     |      |
+| Ks7   | imm7                                      |      |
+| KN7   | -imm7                                     |      |
+| Ksh   | imm16                                     |      |
+| Kuh   | uimm16                                    |      |
+| L     | ~(1<<N)                                   |      |
+| M1    | 0xff                                      |      |
+| M2    | 0xffff                                    |      |
+| P0-P4 | 0-4                                       |      |
+| PA    | Macflag, not M                            |      |
+| PB    | Macflag, only M                           |      |
+| Q     | Symbol                                    |      |
+
+** TODO Support all register classes
+* DAG combiner
+** Create test case for each Illegal SETCC case
+The DAG combiner may sometimes produce illegal i16 SETCC instructions.
+
+*** TODO SETCC (ctlz x), 5) == const
+*** TODO SETCC (and load, const) == const
+*** DONE SETCC (zext x) == const
+*** TODO SETCC (sext x) == const
+
+* Instruction selection
+** TODO Better immediate constants
+Like ARM, build constants as small imm + shift.
+
+** TODO Implement cycle counter
+We have CYCLES and CYCLES2 registers, but the readcyclecounter intrinsic wants
+to return i64, and the code generator doesn't know how to legalize that.
+
+** TODO Instruction alternatives
+Some instructions come in different variants for example:
+
+  D = D + D
+  P = P + P
+
+Cross combinations are not allowed:
+
+  P = D + D (bad)
+
+Similarly for the subreg pseudo-instructions:
+
+ D16L = EXTRACT_SUBREG D16, bfin_subreg_lo16
+ P16L = EXTRACT_SUBREG P16, bfin_subreg_lo16
+
+We want to take advantage of the alternative instructions. This could be done by
+changing the DAG after instruction selection.
+
+
+** Multipatterns for load/store
+We should try to identify multipatterns for load and store instructions. The
+available instruction matrix is a bit irregular.
+
+Loads:
+
+| Addr       | D | P | D 16z | D 16s | D16 | D 8z | D 8s |
+|------------+---+---+-------+-------+-----+------+------|
+| P          | * | * | *     | *     | *   | *    | *    |
+| P++        | * | * | *     | *     |     | *    | *    |
+| P--        | * | * | *     | *     |     | *    | *    |
+| P+uimm5m2  |   |   | *     | *     |     |      |      |
+| P+uimm6m4  | * | * |       |       |     |      |      |
+| P+imm16    |   |   |       |       |     | *    | *    |
+| P+imm17m2  |   |   | *     | *     |     |      |      |
+| P+imm18m4  | * | * |       |       |     |      |      |
+| P++P       | * |   | *     | *     | *   |      |      |
+| FP-uimm7m4 | * | * |       |       |     |      |      |
+| I          | * |   |       |       | *   |      |      |
+| I++        | * |   |       |       | *   |      |      |
+| I--        | * |   |       |       | *   |      |      |
+| I++M       | * |   |       |       |     |      |      |
+
+Stores:
+
+| Addr       | D | P | D16H | D16L | D 8 |
+|------------+---+---+------+------+-----|
+| P          | * | * | *    | *    | *   |
+| P++        | * | * |      | *    | *   |
+| P--        | * | * |      | *    | *   |
+| P+uimm5m2  |   |   |      | *    |     |
+| P+uimm6m4  | * | * |      |      |     |
+| P+imm16    |   |   |      |      | *   |
+| P+imm17m2  |   |   |      | *    |     |
+| P+imm18m4  | * | * |      |      |     |
+| P++P       | * |   | *    | *    |     |
+| FP-uimm7m4 | * | * |      |      |     |
+| I          | * |   | *    | *    |     |
+| I++        | * |   | *    | *    |     |
+| I--        | * |   | *    | *    |     |
+| I++M       | * |   |      |      |     |
+
+* Workarounds and features
+Blackfin CPUs have bugs. Each model comes in a number of silicon revisions with
+different bugs. We learn about the CPU model from the -mcpu switch.
+
+** Interpretation of -mcpu value
+- -mcpu=bf527 refers to the latest known BF527 revision
+- -mcpu=bf527-0.2 refers to silicon rev. 0.2
+- -mcpu=bf527-any refers to all known revisions
+- -mcpu=bf527-none disables all workarounds
+
+The -mcpu setting affects the __SILICON_REVISION__ macro and enabled workarounds:
+
+| -mcpu      | __SILICON_REVISION__ | Workarounds        |
+|------------+----------------------+--------------------|
+| bf527      | Def Latest           | Specific to latest |
+| bf527-1.3  | Def 0x0103           | Specific to 1.3    |
+| bf527-any  | Def 0xffff           | All bf527-x.y      |
+| bf527-none | Undefined            | None               |
+
+These are the known cores and revisions:
+
+| Core        | Silicon            | Processors              |
+|-------------+--------------------+-------------------------|
+| Edinburgh   | 0.3, 0.4, 0.5, 0.6 | BF531 BF532 BF533       |
+| Braemar     | 0.2, 0.3           | BF534 BF536 BF537       |
+| Stirling    | 0.3, 0.4, 0.5      | BF538 BF539             |
+| Moab        | 0.0, 0.1, 0.2      | BF542 BF544 BF548 BF549 |
+| Teton       | 0.3, 0.5           | BF561                   |
+| Kookaburra  | 0.0, 0.1, 0.2      | BF523 BF525 BF527       |
+| Mockingbird | 0.0, 0.1           | BF522 BF524 BF526       |
+| Brodie      | 0.0, 0.1           | BF512 BF514 BF516 BF518 |
+
+
+** Compiler implemented workarounds
+Most workarounds are implemented in header files and source code using the
+__ADSPBF527__ macros. A few workarounds require compiler support.
+
+|  Anomaly | Macro                          | GCC Switch       |
+|----------+--------------------------------+------------------|
+|      Any | __WORKAROUNDS_ENABLED          |                  |
+| 05000074 | WA_05000074                    |                  |
+| 05000244 | __WORKAROUND_SPECULATIVE_SYNCS | -mcsync-anomaly  |
+| 05000245 | __WORKAROUND_SPECULATIVE_LOADS | -mspecld-anomaly |
+| 05000257 | WA_05000257                    |                  |
+| 05000283 | WA_05000283                    |                  |
+| 05000312 | WA_LOAD_LCREGS                 |                  |
+| 05000315 | WA_05000315                    |                  |
+| 05000371 | __WORKAROUND_RETS              |                  |
+| 05000426 | __WORKAROUND_INDIRECT_CALLS    | Not -micplb      |
+
+** GCC feature switches
+| Switch                    | Description                            |
+|---------------------------+----------------------------------------|
+| -msim                     | Use simulator runtime                  |
+| -momit-leaf-frame-pointer | Omit frame pointer for leaf functions  |
+| -mlow64k                  |                                        |
+| -mcsync-anomaly           |                                        |
+| -mspecld-anomaly          |                                        |
+| -mid-shared-library       |                                        |
+| -mleaf-id-shared-library  |                                        |
+| -mshared-library-id=      |                                        |
+| -msep-data                | Enable separate data segment           |
+| -mlong-calls              | Use indirect calls                     |
+| -mfast-fp                 |                                        |
+| -mfdpic                   |                                        |
+| -minline-plt              |                                        |
+| -mstack-check-l1          | Do stack checking in L1 scratch memory |
+| -mmulticore               | Enable multicore support               |
+| -mcorea                   | Build for Core A                       |
+| -mcoreb                   | Build for Core B                       |
+| -msdram                   | Build for SDRAM                        |
+| -micplb                   | Assume ICPLBs are enabled at runtime.  |
diff --git a/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
new file mode 100644
index 0000000..402e0af
--- /dev/null
+++ b/lib/Target/Blackfin/TargetInfo/BlackfinTargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- BlackfinTargetInfo.cpp - Blackfin Target Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Blackfin.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+
+using namespace llvm;
+
+Target llvm::TheBlackfinTarget;
+
+extern "C" void LLVMInitializeBlackfinTargetInfo() {
+  RegisterTarget<Triple::bfin> X(TheBlackfinTarget, "bfin",
+                                 "Analog Devices Blackfin [experimental]");
+}
diff --git a/lib/Target/Blackfin/TargetInfo/CMakeLists.txt b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..5ca8060
--- /dev/null
+++ b/lib/Target/Blackfin/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMBlackfinInfo
+  BlackfinTargetInfo.cpp
+  )
+
+add_dependencies(LLVMBlackfinInfo BlackfinCodeGenTable_gen)
diff --git a/lib/Target/Blackfin/TargetInfo/Makefile b/lib/Target/Blackfin/TargetInfo/Makefile
new file mode 100644
index 0000000..c49cfbe
--- /dev/null
+++ b/lib/Target/Blackfin/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Blackfin/TargetInfo/Makefile -------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMBlackfinInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CBackend/CBackend.cpp b/lib/Target/CBackend/CBackend.cpp
new file mode 100644
index 0000000..fd4c4e7
--- /dev/null
+++ b/lib/Target/CBackend/CBackend.cpp
@@ -0,0 +1,3718 @@
+//===-- CBackend.cpp - Library for converting LLVM code to C --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library converts LLVM code to C code, compilable by GCC and other C
+// compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/System/Host.h"
+#include "llvm/Config/config.h"
+#include <algorithm>
+#include <sstream>
+using namespace llvm;
+
+extern "C" void LLVMInitializeCBackendTarget() { 
+  // Register the target.
+  RegisterTargetMachine<CTargetMachine> X(TheCBackendTarget);
+}
+
+namespace {
+  class CBEMCAsmInfo : public MCAsmInfo {
+  public:
+    CBEMCAsmInfo() {
+      GlobalPrefix = "";
+      PrivateGlobalPrefix = "";
+    }
+  };
+  /// CBackendNameAllUsedStructsAndMergeFunctions - This pass inserts names for
+  /// any unnamed structure types that are used by the program, and merges
+  /// external functions with the same name.
+  ///
+  class CBackendNameAllUsedStructsAndMergeFunctions : public ModulePass {
+  public:
+    static char ID;
+    CBackendNameAllUsedStructsAndMergeFunctions() 
+      : ModulePass(&ID) {}
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<FindUsedTypes>();
+    }
+
+    virtual const char *getPassName() const {
+      return "C backend type canonicalizer";
+    }
+
+    virtual bool runOnModule(Module &M);
+  };
+
+  char CBackendNameAllUsedStructsAndMergeFunctions::ID = 0;
+
+  /// CWriter - This class is the main chunk of code that converts an LLVM
+  /// module to a C translation unit.
+  class CWriter : public FunctionPass, public InstVisitor<CWriter> {
+    formatted_raw_ostream &Out;
+    IntrinsicLowering *IL;
+    Mangler *Mang;
+    LoopInfo *LI;
+    const Module *TheModule;
+    const MCAsmInfo* TAsm;
+    const TargetData* TD;
+    std::map<const Type *, std::string> TypeNames;
+    std::map<const ConstantFP *, unsigned> FPConstantMap;
+    std::set<Function*> intrinsicPrototypesAlreadyGenerated;
+    std::set<const Argument*> ByValParams;
+    unsigned FPCounter;
+    unsigned OpaqueCounter;
+    DenseMap<const Value*, unsigned> AnonValueNumbers;
+    unsigned NextAnonValueNumber;
+
+  public:
+    static char ID;
+    explicit CWriter(formatted_raw_ostream &o)
+      : FunctionPass(&ID), Out(o), IL(0), Mang(0), LI(0), 
+        TheModule(0), TAsm(0), TD(0), OpaqueCounter(0), NextAnonValueNumber(0) {
+      FPCounter = 0;
+    }
+
+    virtual const char *getPassName() const { return "C backend"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.setPreservesAll();
+    }
+
+    virtual bool doInitialization(Module &M);
+
+    bool runOnFunction(Function &F) {
+     // Do not codegen any 'available_externally' functions at all, they have
+     // definitions outside the translation unit.
+     if (F.hasAvailableExternallyLinkage())
+       return false;
+
+      LI = &getAnalysis<LoopInfo>();
+
+      // Get rid of intrinsics we can't handle.
+      lowerIntrinsics(F);
+
+      // Output all floating point constants that cannot be printed accurately.
+      printFloatingPointConstants(F);
+
+      printFunction(F);
+      return false;
+    }
+
+    virtual bool doFinalization(Module &M) {
+      // Free memory...
+      delete IL;
+      delete TD;
+      delete Mang;
+      FPConstantMap.clear();
+      TypeNames.clear();
+      ByValParams.clear();
+      intrinsicPrototypesAlreadyGenerated.clear();
+      return false;
+    }
+
+    raw_ostream &printType(formatted_raw_ostream &Out,
+                           const Type *Ty, 
+                           bool isSigned = false,
+                           const std::string &VariableName = "",
+                           bool IgnoreName = false,
+                           const AttrListPtr &PAL = AttrListPtr());
+    std::ostream &printType(std::ostream &Out, const Type *Ty, 
+                           bool isSigned = false,
+                           const std::string &VariableName = "",
+                           bool IgnoreName = false,
+                           const AttrListPtr &PAL = AttrListPtr());
+    raw_ostream &printSimpleType(formatted_raw_ostream &Out,
+                                 const Type *Ty, 
+                                 bool isSigned, 
+                                 const std::string &NameSoFar = "");
+    std::ostream &printSimpleType(std::ostream &Out, const Type *Ty, 
+                                 bool isSigned, 
+                                 const std::string &NameSoFar = "");
+
+    void printStructReturnPointerFunctionType(formatted_raw_ostream &Out,
+                                              const AttrListPtr &PAL,
+                                              const PointerType *Ty);
+
+    /// writeOperandDeref - Print the result of dereferencing the specified
+    /// operand with '*'.  This is equivalent to printing '*' then using
+    /// writeOperand, but avoids excess syntax in some cases.
+    void writeOperandDeref(Value *Operand) {
+      if (isAddressExposed(Operand)) {
+        // Already something with an address exposed.
+        writeOperandInternal(Operand);
+      } else {
+        Out << "*(";
+        writeOperand(Operand);
+        Out << ")";
+      }
+    }
+    
+    void writeOperand(Value *Operand, bool Static = false);
+    void writeInstComputationInline(Instruction &I);
+    void writeOperandInternal(Value *Operand, bool Static = false);
+    void writeOperandWithCast(Value* Operand, unsigned Opcode);
+    void writeOperandWithCast(Value* Operand, const ICmpInst &I);
+    bool writeInstructionCast(const Instruction &I);
+
+    void writeMemoryAccess(Value *Operand, const Type *OperandType,
+                           bool IsVolatile, unsigned Alignment);
+
+  private :
+    std::string InterpretASMConstraint(InlineAsm::ConstraintInfo& c);
+
+    void lowerIntrinsics(Function &F);
+
+    void printModule(Module *M);
+    void printModuleTypes(const TypeSymbolTable &ST);
+    void printContainedStructs(const Type *Ty, std::set<const Type *> &);
+    void printFloatingPointConstants(Function &F);
+    void printFloatingPointConstants(const Constant *C);
+    void printFunctionSignature(const Function *F, bool Prototype);
+
+    void printFunction(Function &);
+    void printBasicBlock(BasicBlock *BB);
+    void printLoop(Loop *L);
+
+    void printCast(unsigned opcode, const Type *SrcTy, const Type *DstTy);
+    void printConstant(Constant *CPV, bool Static);
+    void printConstantWithCast(Constant *CPV, unsigned Opcode);
+    bool printConstExprCast(const ConstantExpr *CE, bool Static);
+    void printConstantArray(ConstantArray *CPA, bool Static);
+    void printConstantVector(ConstantVector *CV, bool Static);
+
+    /// isAddressExposed - Return true if the specified value's name needs to
+    /// have its address taken in order to get a C value of the correct type.
+    /// This happens for global variables, byval parameters, and direct allocas.
+    bool isAddressExposed(const Value *V) const {
+      if (const Argument *A = dyn_cast<Argument>(V))
+        return ByValParams.count(A);
+      return isa<GlobalVariable>(V) || isDirectAlloca(V);
+    }
+    
+    // isInlinableInst - Attempt to inline instructions into their uses to build
+    // trees as much as possible.  To do this, we have to consistently decide
+    // what is acceptable to inline, so that variable declarations don't get
+    // printed and an extra copy of the expr is not emitted.
+    //
+    static bool isInlinableInst(const Instruction &I) {
+      // Always inline cmp instructions, even if they are shared by multiple
+      // expressions.  GCC generates horrible code if we don't.
+      if (isa<CmpInst>(I)) 
+        return true;
+
+      // Must be an expression, must be used exactly once.  If it is dead, we
+      // emit it inline where it would go.
+      if (I.getType() == Type::getVoidTy(I.getContext()) || !I.hasOneUse() ||
+          isa<TerminatorInst>(I) || isa<CallInst>(I) || isa<PHINode>(I) ||
+          isa<LoadInst>(I) || isa<VAArgInst>(I) || isa<InsertElementInst>(I) ||
+          isa<InsertValueInst>(I))
+        // Don't inline a load across a store or other bad things!
+        return false;
+
+      // Must not be used in inline asm, extractelement, or shufflevector.
+      if (I.hasOneUse()) {
+        const Instruction &User = cast<Instruction>(*I.use_back());
+        if (isInlineAsm(User) || isa<ExtractElementInst>(User) ||
+            isa<ShuffleVectorInst>(User))
+          return false;
+      }
+
+      // Only inline instruction it if it's use is in the same BB as the inst.
+      return I.getParent() == cast<Instruction>(I.use_back())->getParent();
+    }
+
+    // isDirectAlloca - Define fixed sized allocas in the entry block as direct
+    // variables which are accessed with the & operator.  This causes GCC to
+    // generate significantly better code than to emit alloca calls directly.
+    //
+    static const AllocaInst *isDirectAlloca(const Value *V) {
+      const AllocaInst *AI = dyn_cast<AllocaInst>(V);
+      if (!AI) return false;
+      if (AI->isArrayAllocation())
+        return 0;   // FIXME: we can also inline fixed size array allocas!
+      if (AI->getParent() != &AI->getParent()->getParent()->getEntryBlock())
+        return 0;
+      return AI;
+    }
+    
+    // isInlineAsm - Check if the instruction is a call to an inline asm chunk
+    static bool isInlineAsm(const Instruction& I) {
+      if (isa<CallInst>(&I) && isa<InlineAsm>(I.getOperand(0)))
+        return true;
+      return false;
+    }
+    
+    // Instruction visitation functions
+    friend class InstVisitor<CWriter>;
+
+    void visitReturnInst(ReturnInst &I);
+    void visitBranchInst(BranchInst &I);
+    void visitSwitchInst(SwitchInst &I);
+    void visitIndirectBrInst(IndirectBrInst &I);
+    void visitInvokeInst(InvokeInst &I) {
+      llvm_unreachable("Lowerinvoke pass didn't work!");
+    }
+
+    void visitUnwindInst(UnwindInst &I) {
+      llvm_unreachable("Lowerinvoke pass didn't work!");
+    }
+    void visitUnreachableInst(UnreachableInst &I);
+
+    void visitPHINode(PHINode &I);
+    void visitBinaryOperator(Instruction &I);
+    void visitICmpInst(ICmpInst &I);
+    void visitFCmpInst(FCmpInst &I);
+
+    void visitCastInst (CastInst &I);
+    void visitSelectInst(SelectInst &I);
+    void visitCallInst (CallInst &I);
+    void visitInlineAsm(CallInst &I);
+    bool visitBuiltinCall(CallInst &I, Intrinsic::ID ID, bool &WroteCallee);
+
+    void visitAllocaInst(AllocaInst &I);
+    void visitLoadInst  (LoadInst   &I);
+    void visitStoreInst (StoreInst  &I);
+    void visitGetElementPtrInst(GetElementPtrInst &I);
+    void visitVAArgInst (VAArgInst &I);
+    
+    void visitInsertElementInst(InsertElementInst &I);
+    void visitExtractElementInst(ExtractElementInst &I);
+    void visitShuffleVectorInst(ShuffleVectorInst &SVI);
+
+    void visitInsertValueInst(InsertValueInst &I);
+    void visitExtractValueInst(ExtractValueInst &I);
+
+    void visitInstruction(Instruction &I) {
+#ifndef NDEBUG
+      errs() << "C Writer does not know about " << I;
+#endif
+      llvm_unreachable(0);
+    }
+
+    void outputLValue(Instruction *I) {
+      Out << "  " << GetValueName(I) << " = ";
+    }
+
+    bool isGotoCodeNecessary(BasicBlock *From, BasicBlock *To);
+    void printPHICopiesForSuccessor(BasicBlock *CurBlock,
+                                    BasicBlock *Successor, unsigned Indent);
+    void printBranchToBlock(BasicBlock *CurBlock, BasicBlock *SuccBlock,
+                            unsigned Indent);
+    void printGEPExpression(Value *Ptr, gep_type_iterator I,
+                            gep_type_iterator E, bool Static);
+
+    std::string GetValueName(const Value *Operand);
+  };
+}
+
+char CWriter::ID = 0;
+
+
/// CBEMangle - Turn an arbitrary symbol name into a valid C identifier:
/// [a-zA-Z0-9_] bytes pass through unchanged; every other byte is encoded
/// as "_XY_" where X and Y are letters derived from its low and high nibble.
static std::string CBEMangle(const std::string &S) {
  std::string Result;
  
  for (unsigned i = 0, e = S.size(); i != e; ++i) {
    // Work on the byte as unsigned: passing a negative char to isalnum is
    // undefined behavior, and '>>' on a negative value is
    // implementation-defined.  On bytes >= 0x80 the old signed-char code
    // produced garbage nibbles; this keeps the encoding well defined.
    unsigned char C = static_cast<unsigned char>(S[i]);
    if (isalnum(C) || C == '_') {
      Result += C;
    } else {
      Result += '_';
      Result += 'A' + (C & 15);
      Result += 'A' + ((C >> 4) & 15);
      Result += '_';
    }
  }
  return Result;
}
+
+
/// This method inserts names for any unnamed structure types that are used by
/// the program, and removes names from structure types that are not used by the
/// program.  It also merges external functions/globals that share a name,
/// keeping the first and bitcasting uses of the rest.  Returns true if the
/// module was changed.
///
bool CBackendNameAllUsedStructsAndMergeFunctions::runOnModule(Module &M) {
  // Get a set of types that are used by the program...
  std::set<const Type *> UT = getAnalysis<FindUsedTypes>().getTypes();

  // Loop over the module symbol table, removing types from UT that are
  // already named, and removing names for types that are not used.
  //
  TypeSymbolTable &TST = M.getTypeSymbolTable();
  for (TypeSymbolTable::iterator TI = TST.begin(), TE = TST.end();
       TI != TE; ) {
    TypeSymbolTable::iterator I = TI++;  // Advance first: remove() invalidates I.
    
    // If this isn't a struct or array type, remove it from our set of types
    // to name. This simplifies emission later.
    if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second) &&
        !isa<ArrayType>(I->second)) {
      TST.remove(I);
    } else {
      // If this is not used, remove it from the symbol table.
      std::set<const Type *>::iterator UTI = UT.find(I->second);
      if (UTI == UT.end())
        TST.remove(I);
      else
        UT.erase(UTI);    // Only keep one name for this type.
    }
  }

  // UT now contains types that are not named.  Loop over it, naming
  // structure types.
  //
  bool Changed = false;
  unsigned RenameCounter = 0;
  for (std::set<const Type *>::const_iterator I = UT.begin(), E = UT.end();
       I != E; ++I)
    if (isa<StructType>(*I) || isa<ArrayType>(*I)) {
      // addTypeName returns true on a name collision; bump the counter until
      // a free "unnamedN" slot is found.
      while (M.addTypeName("unnamed"+utostr(RenameCounter), *I))
        ++RenameCounter;
      Changed = true;
    }
      
      
  // Loop over all external functions and globals.  If we have two with
  // identical names, merge them.
  // FIXME: This code should disappear when we don't allow values with the same
  // names when they have different types!
  std::map<std::string, GlobalValue*> ExtSymbols;
  for (Module::iterator I = M.begin(), E = M.end(); I != E;) {
    Function *GV = I++;  // Advance first: eraseFromParent() invalidates I.
    if (GV->isDeclaration() && GV->hasName()) {
      std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
        = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
      if (!X.second) {
        // Found a conflict, replace this global with the previous one.
        GlobalValue *OldGV = X.first->second;
        GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
        GV->eraseFromParent();
        Changed = true;
      }
    }
  }
  // Do the same for globals.
  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
       I != E;) {
    GlobalVariable *GV = I++;  // Advance first: eraseFromParent() invalidates I.
    if (GV->isDeclaration() && GV->hasName()) {
      std::pair<std::map<std::string, GlobalValue*>::iterator, bool> X
        = ExtSymbols.insert(std::make_pair(GV->getName(), GV));
      if (!X.second) {
        // Found a conflict, replace this global with the previous one.
        GlobalValue *OldGV = X.first->second;
        GV->replaceAllUsesWith(ConstantExpr::getBitCast(OldGV, GV->getType()));
        GV->eraseFromParent();
        Changed = true;
      }
    }
  }
  
  return Changed;
}
+
/// printStructReturnPointerFunctionType - This is like printType for a struct
/// return type, except, instead of printing the type as void (*)(Struct*, ...)
/// print it as "Struct (*)(...)", for struct return functions.
void CWriter::printStructReturnPointerFunctionType(formatted_raw_ostream &Out,
                                                   const AttrListPtr &PAL,
                                                   const PointerType *TheTy) {
  const FunctionType *FTy = cast<FunctionType>(TheTy->getElementType());
  std::stringstream FunctionInnards;
  FunctionInnards << " (*) (";
  bool PrintedType = false;

  // The first IR parameter is the sret pointer; its pointee becomes the C
  // return type and the parameter itself is skipped in the loop below.
  FunctionType::param_iterator I = FTy->param_begin(), E = FTy->param_end();
  const Type *RetTy = cast<PointerType>(I->get())->getElementType();
  unsigned Idx = 1;
  // Attribute indices are 1-based; start at 2 since param 1 (sret) is skipped.
  for (++I, ++Idx; I != E; ++I, ++Idx) {
    if (PrintedType)
      FunctionInnards << ", ";
    const Type *ArgTy = *I;
    if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
      // byval pointer parameters are emitted as the pointee value in C.
      assert(isa<PointerType>(ArgTy));
      ArgTy = cast<PointerType>(ArgTy)->getElementType();
    }
    printType(FunctionInnards, ArgTy,
        /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), "");
    PrintedType = true;
  }
  if (FTy->isVarArg()) {
    if (PrintedType)
      FunctionInnards << ", ...";
  } else if (!PrintedType) {
    FunctionInnards << "void";
  }
  FunctionInnards << ')';
  std::string tstr = FunctionInnards.str();
  // Attribute index 0 holds the return-value attributes.
  printType(Out, RetTy, 
      /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr);
}
+
+raw_ostream &
+CWriter::printSimpleType(formatted_raw_ostream &Out, const Type *Ty,
+                         bool isSigned,
+                         const std::string &NameSoFar) {
+  assert((Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) && 
+         "Invalid type for printSimpleType");
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:   return Out << "void " << NameSoFar;
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1) 
+      return Out << "bool " << NameSoFar;
+    else if (NumBits <= 8)
+      return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar;
+    else if (NumBits <= 16)
+      return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar;
+    else if (NumBits <= 32)
+      return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar;
+    else if (NumBits <= 64)
+      return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar;
+    else { 
+      assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+      return Out << (isSigned?"llvmInt128":"llvmUInt128") << " " << NameSoFar;
+    }
+  }
+  case Type::FloatTyID:  return Out << "float "   << NameSoFar;
+  case Type::DoubleTyID: return Out << "double "  << NameSoFar;
+  // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
+  // present matches host 'long double'.
+  case Type::X86_FP80TyID:
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID:  return Out << "long double " << NameSoFar;
+      
+  case Type::VectorTyID: {
+    const VectorType *VTy = cast<VectorType>(Ty);
+    return printSimpleType(Out, VTy->getElementType(), isSigned,
+                     " __attribute__((vector_size(" +
+                     utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar);
+  }
+    
+  default:
+#ifndef NDEBUG
+    errs() << "Unknown primitive type: " << *Ty << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+}
+
+std::ostream &
+CWriter::printSimpleType(std::ostream &Out, const Type *Ty, bool isSigned,
+                         const std::string &NameSoFar) {
+  assert((Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) && 
+         "Invalid type for printSimpleType");
+  switch (Ty->getTypeID()) {
+  case Type::VoidTyID:   return Out << "void " << NameSoFar;
+  case Type::IntegerTyID: {
+    unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+    if (NumBits == 1) 
+      return Out << "bool " << NameSoFar;
+    else if (NumBits <= 8)
+      return Out << (isSigned?"signed":"unsigned") << " char " << NameSoFar;
+    else if (NumBits <= 16)
+      return Out << (isSigned?"signed":"unsigned") << " short " << NameSoFar;
+    else if (NumBits <= 32)
+      return Out << (isSigned?"signed":"unsigned") << " int " << NameSoFar;
+    else if (NumBits <= 64)
+      return Out << (isSigned?"signed":"unsigned") << " long long "<< NameSoFar;
+    else { 
+      assert(NumBits <= 128 && "Bit widths > 128 not implemented yet");
+      return Out << (isSigned?"llvmInt128":"llvmUInt128") << " " << NameSoFar;
+    }
+  }
+  case Type::FloatTyID:  return Out << "float "   << NameSoFar;
+  case Type::DoubleTyID: return Out << "double "  << NameSoFar;
+  // Lacking emulation of FP80 on PPC, etc., we assume whichever of these is
+  // present matches host 'long double'.
+  case Type::X86_FP80TyID:
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID:  return Out << "long double " << NameSoFar;
+      
+  case Type::VectorTyID: {
+    const VectorType *VTy = cast<VectorType>(Ty);
+    return printSimpleType(Out, VTy->getElementType(), isSigned,
+                     " __attribute__((vector_size(" +
+                     utostr(TD->getTypeAllocSize(VTy)) + " ))) " + NameSoFar);
+  }
+    
+  default:
+#ifndef NDEBUG
+    errs() << "Unknown primitive type: " << *Ty << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+}
+
// Pass the Type* and the variable name and this prints out the variable
// declaration.  NameSoFar is the declarator built up so far (e.g. "*x" for a
// pointer); IgnoreName forces structural emission even for named types; PAL
// supplies parameter attributes for function(-pointer) types.
//
raw_ostream &CWriter::printType(formatted_raw_ostream &Out,
                                const Type *Ty,
                                bool isSigned, const std::string &NameSoFar,
                                bool IgnoreName, const AttrListPtr &PAL) {
  // Primitive/integer/vector types are delegated to printSimpleType.
  if (Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) {
    printSimpleType(Out, Ty, isSigned, NameSoFar);
    return Out;
  }

  // Check to see if the type is named.
  if (!IgnoreName || isa<OpaqueType>(Ty)) {
    std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty);
    if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar;
  }

  switch (Ty->getTypeID()) {
  case Type::FunctionTyID: {
    const FunctionType *FTy = cast<FunctionType>(Ty);
    std::stringstream FunctionInnards;
    FunctionInnards << " (" << NameSoFar << ") (";
    // Attribute indices are 1-based (0 is the return value).
    unsigned Idx = 1;
    for (FunctionType::param_iterator I = FTy->param_begin(),
           E = FTy->param_end(); I != E; ++I) {
      const Type *ArgTy = *I;
      if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
        // byval pointer parameters are emitted as the pointee value.
        assert(isa<PointerType>(ArgTy));
        ArgTy = cast<PointerType>(ArgTy)->getElementType();
      }
      if (I != FTy->param_begin())
        FunctionInnards << ", ";
      printType(FunctionInnards, ArgTy,
        /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), "");
      ++Idx;
    }
    if (FTy->isVarArg()) {
      if (FTy->getNumParams())
        FunctionInnards << ", ...";
    } else if (!FTy->getNumParams()) {
      FunctionInnards << "void";
    }
    FunctionInnards << ')';
    std::string tstr = FunctionInnards.str();
    printType(Out, FTy->getReturnType(), 
      /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr);
    return Out;
  }
  case Type::StructTyID: {
    const StructType *STy = cast<StructType>(Ty);
    Out << NameSoFar + " {\n";
    unsigned Idx = 0;
    for (StructType::element_iterator I = STy->element_begin(),
           E = STy->element_end(); I != E; ++I) {
      Out << "  ";
      printType(Out, *I, false, "field" + utostr(Idx++));
      Out << ";\n";
    }
    Out << '}';
    if (STy->isPacked())
      Out << " __attribute__ ((packed))";
    return Out;
  }

  case Type::PointerTyID: {
    const PointerType *PTy = cast<PointerType>(Ty);
    std::string ptrName = "*" + NameSoFar;

    // Parenthesize so the '*' binds to the name, not the element type.
    if (isa<ArrayType>(PTy->getElementType()) ||
        isa<VectorType>(PTy->getElementType()))
      ptrName = "(" + ptrName + ")";

    if (!PAL.isEmpty())
      // Must be a function ptr cast!
      return printType(Out, PTy->getElementType(), false, ptrName, true, PAL);
    return printType(Out, PTy->getElementType(), false, ptrName);
  }

  case Type::ArrayTyID: {
    const ArrayType *ATy = cast<ArrayType>(Ty);
    unsigned NumElements = ATy->getNumElements();
    if (NumElements == 0) NumElements = 1;  // C forbids zero-length arrays.
    // Arrays are wrapped in structs to allow them to have normal
    // value semantics (avoiding the array "decay").
    Out << NameSoFar << " { ";
    printType(Out, ATy->getElementType(), false,
              "array[" + utostr(NumElements) + "]");
    return Out << "; }";
  }

  case Type::OpaqueTyID: {
    // Invent a fresh struct tag for this opaque type and remember it.
    std::string TyName = "struct opaque_" + itostr(OpaqueCounter++);
    assert(TypeNames.find(Ty) == TypeNames.end());
    TypeNames[Ty] = TyName;
    return Out << TyName << ' ' << NameSoFar;
  }
  default:
    llvm_unreachable("Unhandled case in getTypeProps!");
  }

  return Out;
}
+
// Pass the Type* and the variable name and this prints out the variable
// declaration.  std::ostream twin of the formatted_raw_ostream overload;
// NameSoFar/IgnoreName/PAL have the same meaning.
//
std::ostream &CWriter::printType(std::ostream &Out, const Type *Ty,
                                 bool isSigned, const std::string &NameSoFar,
                                 bool IgnoreName, const AttrListPtr &PAL) {
  // Primitive/integer/vector types are delegated to printSimpleType.
  if (Ty->isPrimitiveType() || Ty->isInteger() || isa<VectorType>(Ty)) {
    printSimpleType(Out, Ty, isSigned, NameSoFar);
    return Out;
  }

  // Check to see if the type is named.
  if (!IgnoreName || isa<OpaqueType>(Ty)) {
    std::map<const Type *, std::string>::iterator I = TypeNames.find(Ty);
    if (I != TypeNames.end()) return Out << I->second << ' ' << NameSoFar;
  }

  switch (Ty->getTypeID()) {
  case Type::FunctionTyID: {
    const FunctionType *FTy = cast<FunctionType>(Ty);
    std::stringstream FunctionInnards;
    FunctionInnards << " (" << NameSoFar << ") (";
    // Attribute indices are 1-based (0 is the return value).
    unsigned Idx = 1;
    for (FunctionType::param_iterator I = FTy->param_begin(),
           E = FTy->param_end(); I != E; ++I) {
      const Type *ArgTy = *I;
      if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
        // byval pointer parameters are emitted as the pointee value.
        assert(isa<PointerType>(ArgTy));
        ArgTy = cast<PointerType>(ArgTy)->getElementType();
      }
      if (I != FTy->param_begin())
        FunctionInnards << ", ";
      printType(FunctionInnards, ArgTy,
        /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt), "");
      ++Idx;
    }
    if (FTy->isVarArg()) {
      if (FTy->getNumParams())
        FunctionInnards << ", ...";
    } else if (!FTy->getNumParams()) {
      FunctionInnards << "void";
    }
    FunctionInnards << ')';
    std::string tstr = FunctionInnards.str();
    printType(Out, FTy->getReturnType(), 
      /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt), tstr);
    return Out;
  }
  case Type::StructTyID: {
    const StructType *STy = cast<StructType>(Ty);
    Out << NameSoFar + " {\n";
    unsigned Idx = 0;
    for (StructType::element_iterator I = STy->element_begin(),
           E = STy->element_end(); I != E; ++I) {
      Out << "  ";
      printType(Out, *I, false, "field" + utostr(Idx++));
      Out << ";\n";
    }
    Out << '}';
    if (STy->isPacked())
      Out << " __attribute__ ((packed))";
    return Out;
  }

  case Type::PointerTyID: {
    const PointerType *PTy = cast<PointerType>(Ty);
    std::string ptrName = "*" + NameSoFar;

    // Parenthesize so the '*' binds to the name, not the element type.
    if (isa<ArrayType>(PTy->getElementType()) ||
        isa<VectorType>(PTy->getElementType()))
      ptrName = "(" + ptrName + ")";

    if (!PAL.isEmpty())
      // Must be a function ptr cast!
      return printType(Out, PTy->getElementType(), false, ptrName, true, PAL);
    return printType(Out, PTy->getElementType(), false, ptrName);
  }

  case Type::ArrayTyID: {
    const ArrayType *ATy = cast<ArrayType>(Ty);
    unsigned NumElements = ATy->getNumElements();
    if (NumElements == 0) NumElements = 1;  // C forbids zero-length arrays.
    // Arrays are wrapped in structs to allow them to have normal
    // value semantics (avoiding the array "decay").
    Out << NameSoFar << " { ";
    printType(Out, ATy->getElementType(), false,
              "array[" + utostr(NumElements) + "]");
    return Out << "; }";
  }

  case Type::OpaqueTyID: {
    // Invent a fresh struct tag for this opaque type and remember it.
    std::string TyName = "struct opaque_" + itostr(OpaqueCounter++);
    assert(TypeNames.find(Ty) == TypeNames.end());
    TypeNames[Ty] = TyName;
    return Out << TyName << ' ' << NameSoFar;
  }
  default:
    llvm_unreachable("Unhandled case in getTypeProps!");
  }

  return Out;
}
+
+void CWriter::printConstantArray(ConstantArray *CPA, bool Static) {
+
+  // As a special case, print the array as a string if it is an array of
+  // ubytes or an array of sbytes with positive values.
+  //
+  const Type *ETy = CPA->getType()->getElementType();
+  bool isString = (ETy == Type::getInt8Ty(CPA->getContext()) ||
+                   ETy == Type::getInt8Ty(CPA->getContext()));
+
+  // Make sure the last character is a null char, as automatically added by C
+  if (isString && (CPA->getNumOperands() == 0 ||
+                   !cast<Constant>(*(CPA->op_end()-1))->isNullValue()))
+    isString = false;
+
+  if (isString) {
+    Out << '\"';
+    // Keep track of whether the last number was a hexadecimal escape
+    bool LastWasHex = false;
+
+    // Do not include the last character, which we know is null
+    for (unsigned i = 0, e = CPA->getNumOperands()-1; i != e; ++i) {
+      unsigned char C = cast<ConstantInt>(CPA->getOperand(i))->getZExtValue();
+
+      // Print it out literally if it is a printable character.  The only thing
+      // to be careful about is when the last letter output was a hex escape
+      // code, in which case we have to be careful not to print out hex digits
+      // explicitly (the C compiler thinks it is a continuation of the previous
+      // character, sheesh...)
+      //
+      if (isprint(C) && (!LastWasHex || !isxdigit(C))) {
+        LastWasHex = false;
+        if (C == '"' || C == '\\')
+          Out << "\\" << (char)C;
+        else
+          Out << (char)C;
+      } else {
+        LastWasHex = false;
+        switch (C) {
+        case '\n': Out << "\\n"; break;
+        case '\t': Out << "\\t"; break;
+        case '\r': Out << "\\r"; break;
+        case '\v': Out << "\\v"; break;
+        case '\a': Out << "\\a"; break;
+        case '\"': Out << "\\\""; break;
+        case '\'': Out << "\\\'"; break;
+        default:
+          Out << "\\x";
+          Out << (char)(( C/16  < 10) ? ( C/16 +'0') : ( C/16 -10+'A'));
+          Out << (char)(((C&15) < 10) ? ((C&15)+'0') : ((C&15)-10+'A'));
+          LastWasHex = true;
+          break;
+        }
+      }
+    }
+    Out << '\"';
+  } else {
+    Out << '{';
+    if (CPA->getNumOperands()) {
+      Out << ' ';
+      printConstant(cast<Constant>(CPA->getOperand(0)), Static);
+      for (unsigned i = 1, e = CPA->getNumOperands(); i != e; ++i) {
+        Out << ", ";
+        printConstant(cast<Constant>(CPA->getOperand(i)), Static);
+      }
+    }
+    Out << " }";
+  }
+}
+
+void CWriter::printConstantVector(ConstantVector *CP, bool Static) {
+  Out << '{';
+  if (CP->getNumOperands()) {
+    Out << ' ';
+    printConstant(cast<Constant>(CP->getOperand(0)), Static);
+    for (unsigned i = 1, e = CP->getNumOperands(); i != e; ++i) {
+      Out << ", ";
+      printConstant(cast<Constant>(CP->getOperand(i)), Static);
+    }
+  }
+  Out << " }";
+}
+
// isFPCSafeToPrint - Returns true if we may assume that CFP may be written out
// textually as a double (rather than as a reference to a stack-allocated
// variable). We decide this by converting CFP to a string and back into a
// double, and then checking whether the conversion results in a bit-equal
// double to the original value of CFP. This depends on us and the target C
// compiler agreeing on the conversion process (which is pretty likely since we
// only deal in IEEE FP).
//
static bool isFPCSafeToPrint(const ConstantFP *CFP) {
  bool ignored;
  // Do long doubles in hex for now.
  if (CFP->getType() != Type::getFloatTy(CFP->getContext()) &&
      CFP->getType() != Type::getDoubleTy(CFP->getContext()))
    return false;
  APFloat APF = APFloat(CFP->getValueAPF());  // copy
  // Widen float to double so the round-trip below is done at double precision.
  if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
    APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
  // Hex-float ("%a") round-trip: exact whenever the printed form parses back
  // to the identical bit pattern.
  char Buffer[100];
  sprintf(Buffer, "%a", APF.convertToDouble());
  if (!strncmp(Buffer, "0x", 2) ||
      !strncmp(Buffer, "-0x", 3) ||
      !strncmp(Buffer, "+0x", 3))
    return APF.bitwiseIsEqual(APFloat(atof(Buffer)));
  return false;
#else
  // No hex-float support in printf: round-trip through decimal text instead.
  std::string StrVal = ftostr(APF);

  while (StrVal[0] == ' ')
    StrVal.erase(StrVal.begin());

  // Check to make sure that the stringized number is not some string like "Inf"
  // or NaN.  Check that the string matches the "[-+]?[0-9]" regex.
  if ((StrVal[0] >= '0' && StrVal[0] <= '9') ||
      ((StrVal[0] == '-' || StrVal[0] == '+') &&
       (StrVal[1] >= '0' && StrVal[1] <= '9')))
    // Reparse stringized version!
    return APF.bitwiseIsEqual(APFloat(atof(StrVal.c_str())));
  return false;
#endif
}
+
+/// Print out the casting for a cast operation. This does the double casting
+/// necessary for conversion to the destination type, if necessary. 
+/// @brief Print a cast
+void CWriter::printCast(unsigned opc, const Type *SrcTy, const Type *DstTy) {
+  // Print the destination type cast
+  switch (opc) {
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::IntToPtr:
+    case Instruction::Trunc:
+    case Instruction::BitCast:
+    case Instruction::FPExt:
+    case Instruction::FPTrunc: // For these the DstTy sign doesn't matter
+      Out << '(';
+      printType(Out, DstTy);
+      Out << ')';
+      break;
+    case Instruction::ZExt:
+    case Instruction::PtrToInt:
+    case Instruction::FPToUI: // For these, make sure we get an unsigned dest
+      Out << '(';
+      printSimpleType(Out, DstTy, false);
+      Out << ')';
+      break;
+    case Instruction::SExt: 
+    case Instruction::FPToSI: // For these, make sure we get a signed dest
+      Out << '(';
+      printSimpleType(Out, DstTy, true);
+      Out << ')';
+      break;
+    default:
+      llvm_unreachable("Invalid cast opcode");
+  }
+
+  // Print the source type cast
+  switch (opc) {
+    case Instruction::UIToFP:
+    case Instruction::ZExt:
+      Out << '(';
+      printSimpleType(Out, SrcTy, false);
+      Out << ')';
+      break;
+    case Instruction::SIToFP:
+    case Instruction::SExt:
+      Out << '(';
+      printSimpleType(Out, SrcTy, true); 
+      Out << ')';
+      break;
+    case Instruction::IntToPtr:
+    case Instruction::PtrToInt:
+      // Avoid "cast to pointer from integer of different size" warnings
+      Out << "(unsigned long)";
+      break;
+    case Instruction::Trunc:
+    case Instruction::BitCast:
+    case Instruction::FPExt:
+    case Instruction::FPTrunc:
+    case Instruction::FPToSI:
+    case Instruction::FPToUI:
+      break; // These don't need a source cast.
+    default:
+      llvm_unreachable("Invalid cast opcode");
+      break;
+  }
+}
+
+// printConstant - The LLVM Constant to C Constant converter.
+void CWriter::printConstant(Constant *CPV, bool Static) {
+  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CPV)) {
+    switch (CE->getOpcode()) {
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast:
+      Out << "(";
+      printCast(CE->getOpcode(), CE->getOperand(0)->getType(), CE->getType());
+      if (CE->getOpcode() == Instruction::SExt &&
+          CE->getOperand(0)->getType() == Type::getInt1Ty(CPV->getContext())) {
+        // Make sure we really sext from bool here by subtracting from 0
+        Out << "0-";
+      }
+      printConstant(CE->getOperand(0), Static);
+      if (CE->getType() == Type::getInt1Ty(CPV->getContext()) &&
+          (CE->getOpcode() == Instruction::Trunc ||
+           CE->getOpcode() == Instruction::FPToUI ||
+           CE->getOpcode() == Instruction::FPToSI ||
+           CE->getOpcode() == Instruction::PtrToInt)) {
+        // Make sure we really truncate to bool here by anding with 1
+        Out << "&1u";
+      }
+      Out << ')';
+      return;
+
+    case Instruction::GetElementPtr:
+      Out << "(";
+      printGEPExpression(CE->getOperand(0), gep_type_begin(CPV),
+                         gep_type_end(CPV), Static);
+      Out << ")";
+      return;
+    case Instruction::Select:
+      Out << '(';
+      printConstant(CE->getOperand(0), Static);
+      Out << '?';
+      printConstant(CE->getOperand(1), Static);
+      Out << ':';
+      printConstant(CE->getOperand(2), Static);
+      Out << ')';
+      return;
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::ICmp:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    {
+      Out << '(';
+      bool NeedsClosingParens = printConstExprCast(CE, Static); 
+      printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+      switch (CE->getOpcode()) {
+      case Instruction::Add:
+      case Instruction::FAdd: Out << " + "; break;
+      case Instruction::Sub:
+      case Instruction::FSub: Out << " - "; break;
+      case Instruction::Mul:
+      case Instruction::FMul: Out << " * "; break;
+      case Instruction::URem:
+      case Instruction::SRem: 
+      case Instruction::FRem: Out << " % "; break;
+      case Instruction::UDiv: 
+      case Instruction::SDiv: 
+      case Instruction::FDiv: Out << " / "; break;
+      case Instruction::And: Out << " & "; break;
+      case Instruction::Or:  Out << " | "; break;
+      case Instruction::Xor: Out << " ^ "; break;
+      case Instruction::Shl: Out << " << "; break;
+      case Instruction::LShr:
+      case Instruction::AShr: Out << " >> "; break;
+      case Instruction::ICmp:
+        switch (CE->getPredicate()) {
+          case ICmpInst::ICMP_EQ: Out << " == "; break;
+          case ICmpInst::ICMP_NE: Out << " != "; break;
+          case ICmpInst::ICMP_SLT: 
+          case ICmpInst::ICMP_ULT: Out << " < "; break;
+          case ICmpInst::ICMP_SLE:
+          case ICmpInst::ICMP_ULE: Out << " <= "; break;
+          case ICmpInst::ICMP_SGT:
+          case ICmpInst::ICMP_UGT: Out << " > "; break;
+          case ICmpInst::ICMP_SGE:
+          case ICmpInst::ICMP_UGE: Out << " >= "; break;
+          default: llvm_unreachable("Illegal ICmp predicate");
+        }
+        break;
+      default: llvm_unreachable("Illegal opcode here!");
+      }
+      printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    case Instruction::FCmp: {
+      Out << '('; 
+      bool NeedsClosingParens = printConstExprCast(CE, Static); 
+      if (CE->getPredicate() == FCmpInst::FCMP_FALSE)
+        Out << "0";
+      else if (CE->getPredicate() == FCmpInst::FCMP_TRUE)
+        Out << "1";
+      else {
+        const char* op = 0;
+        switch (CE->getPredicate()) {
+        default: llvm_unreachable("Illegal FCmp predicate");
+        case FCmpInst::FCMP_ORD: op = "ord"; break;
+        case FCmpInst::FCMP_UNO: op = "uno"; break;
+        case FCmpInst::FCMP_UEQ: op = "ueq"; break;
+        case FCmpInst::FCMP_UNE: op = "une"; break;
+        case FCmpInst::FCMP_ULT: op = "ult"; break;
+        case FCmpInst::FCMP_ULE: op = "ule"; break;
+        case FCmpInst::FCMP_UGT: op = "ugt"; break;
+        case FCmpInst::FCMP_UGE: op = "uge"; break;
+        case FCmpInst::FCMP_OEQ: op = "oeq"; break;
+        case FCmpInst::FCMP_ONE: op = "one"; break;
+        case FCmpInst::FCMP_OLT: op = "olt"; break;
+        case FCmpInst::FCMP_OLE: op = "ole"; break;
+        case FCmpInst::FCMP_OGT: op = "ogt"; break;
+        case FCmpInst::FCMP_OGE: op = "oge"; break;
+        }
+        Out << "llvm_fcmp_" << op << "(";
+        printConstantWithCast(CE->getOperand(0), CE->getOpcode());
+        Out << ", ";
+        printConstantWithCast(CE->getOperand(1), CE->getOpcode());
+        Out << ")";
+      }
+      if (NeedsClosingParens)
+        Out << "))";
+      Out << ')';
+      return;
+    }
+    default:
+#ifndef NDEBUG
+      errs() << "CWriter Error: Unhandled constant expression: "
+           << *CE << "\n";
+#endif
+      llvm_unreachable(0);
+    }
+  } else if (isa<UndefValue>(CPV) && CPV->getType()->isSingleValueType()) {
+    Out << "((";
+    printType(Out, CPV->getType()); // sign doesn't matter
+    Out << ")/*UNDEF*/";
+    if (!isa<VectorType>(CPV->getType())) {
+      Out << "0)";
+    } else {
+      Out << "{})";
+    }
+    return;
+  }
+
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+    const Type* Ty = CI->getType();
+    if (Ty == Type::getInt1Ty(CPV->getContext()))
+      Out << (CI->getZExtValue() ? '1' : '0');
+    else if (Ty == Type::getInt32Ty(CPV->getContext()))
+      Out << CI->getZExtValue() << 'u';
+    else if (Ty->getPrimitiveSizeInBits() > 32)
+      Out << CI->getZExtValue() << "ull";
+    else {
+      Out << "((";
+      printSimpleType(Out, Ty, false) << ')';
+      if (CI->isMinValue(true)) 
+        Out << CI->getZExtValue() << 'u';
+      else
+        Out << CI->getSExtValue();
+      Out << ')';
+    }
+    return;
+  } 
+
+  switch (CPV->getType()->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID: 
+  case Type::X86_FP80TyID:
+  case Type::PPC_FP128TyID:
+  case Type::FP128TyID: {
+    ConstantFP *FPC = cast<ConstantFP>(CPV);
+    std::map<const ConstantFP*, unsigned>::iterator I = FPConstantMap.find(FPC);
+    if (I != FPConstantMap.end()) {
+      // Because of FP precision problems we must load from a stack allocated
+      // value that holds the value in hex.
+      Out << "(*(" << (FPC->getType() == Type::getFloatTy(CPV->getContext()) ?
+                       "float" : 
+                       FPC->getType() == Type::getDoubleTy(CPV->getContext()) ? 
+                       "double" :
+                       "long double")
+          << "*)&FPConstant" << I->second << ')';
+    } else {
+      double V;
+      if (FPC->getType() == Type::getFloatTy(CPV->getContext()))
+        V = FPC->getValueAPF().convertToFloat();
+      else if (FPC->getType() == Type::getDoubleTy(CPV->getContext()))
+        V = FPC->getValueAPF().convertToDouble();
+      else {
+        // Long double.  Convert the number to double, discarding precision.
+        // This is not awesome, but it at least makes the CBE output somewhat
+        // useful.
+        APFloat Tmp = FPC->getValueAPF();
+        bool LosesInfo;
+        Tmp.convert(APFloat::IEEEdouble, APFloat::rmTowardZero, &LosesInfo);
+        V = Tmp.convertToDouble();
+      }
+      
+      if (IsNAN(V)) {
+        // The value is NaN
+
+        // FIXME the actual NaN bits should be emitted.
+        // The prefix for a quiet NaN is 0x7FF8. For a signalling NaN,
+        // it's 0x7ff4.
+        const unsigned long QuietNaN = 0x7ff8UL;
+        //const unsigned long SignalNaN = 0x7ff4UL;
+
+        // We need to grab the first part of the FP #
+        char Buffer[100];
+
+        uint64_t ll = DoubleToBits(V);
+        sprintf(Buffer, "0x%llx", static_cast<long long>(ll));
+
+        std::string Num(&Buffer[0], &Buffer[6]);
+        unsigned long Val = strtoul(Num.c_str(), 0, 16);
+
+        if (FPC->getType() == Type::getFloatTy(FPC->getContext()))
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "F(\""
+              << Buffer << "\") /*nan*/ ";
+        else
+          Out << "LLVM_NAN" << (Val == QuietNaN ? "" : "S") << "(\""
+              << Buffer << "\") /*nan*/ ";
+      } else if (IsInf(V)) {
+        // The value is Inf
+        if (V < 0) Out << '-';
+        Out << "LLVM_INF" <<
+            (FPC->getType() == Type::getFloatTy(FPC->getContext()) ? "F" : "")
+            << " /*inf*/ ";
+      } else {
+        std::string Num;
+#if HAVE_PRINTF_A && ENABLE_CBE_PRINTF_A
+        // Print out the constant as a floating point number.
+        char Buffer[100];
+        sprintf(Buffer, "%a", V);
+        Num = Buffer;
+#else
+        Num = ftostr(FPC->getValueAPF());
+#endif
+       Out << Num;
+      }
+    }
+    break;
+  }
+
+  case Type::ArrayTyID:
+    // Use C99 compound expression literal initializer syntax.
+    if (!Static) {
+      Out << "(";
+      printType(Out, CPV->getType());
+      Out << ")";
+    }
+    Out << "{ "; // Arrays are wrapped in struct types.
+    if (ConstantArray *CA = dyn_cast<ConstantArray>(CPV)) {
+      printConstantArray(CA, Static);
+    } else {
+      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+      const ArrayType *AT = cast<ArrayType>(CPV->getType());
+      Out << '{';
+      if (AT->getNumElements()) {
+        Out << ' ';
+        Constant *CZ = Constant::getNullValue(AT->getElementType());
+        printConstant(CZ, Static);
+        for (unsigned i = 1, e = AT->getNumElements(); i != e; ++i) {
+          Out << ", ";
+          printConstant(CZ, Static);
+        }
+      }
+      Out << " }";
+    }
+    Out << " }"; // Arrays are wrapped in struct types.
+    break;
+
+  case Type::VectorTyID:
+    // Use C99 compound expression literal initializer syntax.
+    if (!Static) {
+      Out << "(";
+      printType(Out, CPV->getType());
+      Out << ")";
+    }
+    if (ConstantVector *CV = dyn_cast<ConstantVector>(CPV)) {
+      printConstantVector(CV, Static);
+    } else {
+      assert(isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV));
+      const VectorType *VT = cast<VectorType>(CPV->getType());
+      Out << "{ ";
+      Constant *CZ = Constant::getNullValue(VT->getElementType());
+      printConstant(CZ, Static);
+      for (unsigned i = 1, e = VT->getNumElements(); i != e; ++i) {
+        Out << ", ";
+        printConstant(CZ, Static);
+      }
+      Out << " }";
+    }
+    break;
+
+  case Type::StructTyID:
+    // Use C99 compound expression literal initializer syntax.
+    if (!Static) {
+      Out << "(";
+      printType(Out, CPV->getType());
+      Out << ")";
+    }
+    if (isa<ConstantAggregateZero>(CPV) || isa<UndefValue>(CPV)) {
+      const StructType *ST = cast<StructType>(CPV->getType());
+      Out << '{';
+      if (ST->getNumElements()) {
+        Out << ' ';
+        printConstant(Constant::getNullValue(ST->getElementType(0)), Static);
+        for (unsigned i = 1, e = ST->getNumElements(); i != e; ++i) {
+          Out << ", ";
+          printConstant(Constant::getNullValue(ST->getElementType(i)), Static);
+        }
+      }
+      Out << " }";
+    } else {
+      Out << '{';
+      if (CPV->getNumOperands()) {
+        Out << ' ';
+        printConstant(cast<Constant>(CPV->getOperand(0)), Static);
+        for (unsigned i = 1, e = CPV->getNumOperands(); i != e; ++i) {
+          Out << ", ";
+          printConstant(cast<Constant>(CPV->getOperand(i)), Static);
+        }
+      }
+      Out << " }";
+    }
+    break;
+
+  case Type::PointerTyID:
+    if (isa<ConstantPointerNull>(CPV)) {
+      Out << "((";
+      printType(Out, CPV->getType()); // sign doesn't matter
+      Out << ")/*NULL*/0)";
+      break;
+    } else if (GlobalValue *GV = dyn_cast<GlobalValue>(CPV)) {
+      writeOperand(GV, Static);
+      break;
+    }
+    // FALL THROUGH
+  default:
+#ifndef NDEBUG
+    errs() << "Unknown constant type: " << *CPV << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+}
+
+// Some constant expressions need to be casted back to the original types
+// because their operands were casted to the expected type. This function takes
+// care of detecting that case and printing the cast for the ConstantExpr.
+bool CWriter::printConstExprCast(const ConstantExpr* CE, bool Static) {
+  bool NeedsExplicitCast = false;
+  const Type *Ty = CE->getOperand(0)->getType();
+  bool TypeIsSigned = false;
+  switch (CE->getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::URem: 
+  case Instruction::UDiv: NeedsExplicitCast = true; break;
+  case Instruction::AShr:
+  case Instruction::SRem: 
+  case Instruction::SDiv: NeedsExplicitCast = true; TypeIsSigned = true; break;
+  case Instruction::SExt:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    TypeIsSigned = true;
+    break;
+  case Instruction::ZExt:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    Ty = CE->getType();
+    NeedsExplicitCast = true;
+    break;
+  default: break;
+  }
+  if (NeedsExplicitCast) {
+    Out << "((";
+    if (Ty->isInteger() && Ty != Type::getInt1Ty(Ty->getContext()))
+      printSimpleType(Out, Ty, TypeIsSigned);
+    else
+      printType(Out, Ty); // not integer, sign doesn't matter
+    Out << ")(";
+  }
+  return NeedsExplicitCast;
+}
+
+//  Print a constant assuming that it is the operand for a given Opcode. The
+//  opcodes that care about sign need to cast their operands to the expected
+//  type before the operation proceeds. This function does the casting.
+void CWriter::printConstantWithCast(Constant* CPV, unsigned Opcode) {
+
+  // Extract the operand's type, we'll need it.
+  const Type* OpTy = CPV->getType();
+
+  // Indicate whether to do the cast or not.
+  bool shouldCast = false;
+  bool typeIsSigned = false;
+
+  // Based on the Opcode for which this Constant is being written, determine
+  // the new type to which the operand should be casted by setting the value
+  // of OpTy. If we change OpTy, also set shouldCast to true so it gets
+  // casted below.
+  switch (Opcode) {
+    default:
+      // for most instructions, it doesn't matter
+      break; 
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+      // We need to cast integer arithmetic so that it is always performed
+      // as unsigned, to avoid undefined behavior on overflow.
+    case Instruction::LShr:
+    case Instruction::UDiv:
+    case Instruction::URem:
+      shouldCast = true;
+      break;
+    case Instruction::AShr:
+    case Instruction::SDiv:
+    case Instruction::SRem:
+      shouldCast = true;
+      typeIsSigned = true;
+      break;
+  }
+
+  // Write out the casted constant if we should, otherwise just write the
+  // operand.
+  if (shouldCast) {
+    Out << "((";
+    printSimpleType(Out, OpTy, typeIsSigned);
+    Out << ")";
+    printConstant(CPV, false);
+    Out << ")";
+  } else 
+    printConstant(CPV, false);
+}
+
+std::string CWriter::GetValueName(const Value *Operand) {
+  // Mangle globals with the standard mangler interface for LLC compatibility.
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(Operand)) {
+    SmallString<128> Str;
+    Mang->getNameWithPrefix(Str, GV, false);
+    return CBEMangle(Str.str().str());
+  }
+    
+  std::string Name = Operand->getName();
+    
+  if (Name.empty()) { // Assign unique names to local temporaries.
+    unsigned &No = AnonValueNumbers[Operand];
+    if (No == 0)
+      No = ++NextAnonValueNumber;
+    Name = "tmp__" + utostr(No);
+  }
+    
+  std::string VarName;
+  VarName.reserve(Name.capacity());
+
+  for (std::string::iterator I = Name.begin(), E = Name.end();
+       I != E; ++I) {
+    char ch = *I;
+
+    if (!((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') ||
+          (ch >= '0' && ch <= '9') || ch == '_')) {
+      char buffer[5];
+      sprintf(buffer, "_%x_", ch);
+      VarName += buffer;
+    } else
+      VarName += ch;
+  }
+
+  return "llvm_cbe_" + VarName;
+}
+
+/// writeInstComputationInline - Emit the computation for the specified
+/// instruction inline, with no destination provided.
+void CWriter::writeInstComputationInline(Instruction &I) {
+  // We can't currently support integer types other than 1, 8, 16, 32, 64.
+  // Validate this.
+  const Type *Ty = I.getType();
+  if (Ty->isInteger() && (Ty!=Type::getInt1Ty(I.getContext()) &&
+        Ty!=Type::getInt8Ty(I.getContext()) && 
+        Ty!=Type::getInt16Ty(I.getContext()) &&
+        Ty!=Type::getInt32Ty(I.getContext()) &&
+        Ty!=Type::getInt64Ty(I.getContext()))) {
+      llvm_report_error("The C backend does not currently support integer "
+                        "types of widths other than 1, 8, 16, 32, 64.\n"
+                        "This is being tracked as PR 4158.");
+  }
+
+  // If this is a non-trivial bool computation, make sure to truncate down to
+  // a 1 bit value.  This is important because we want "add i1 x, y" to return
+  // "0" when x and y are true, not "2" for example.
+  bool NeedBoolTrunc = false;
+  if (I.getType() == Type::getInt1Ty(I.getContext()) &&
+      !isa<ICmpInst>(I) && !isa<FCmpInst>(I))
+    NeedBoolTrunc = true;
+  
+  if (NeedBoolTrunc)
+    Out << "((";
+  
+  visit(I);
+  
+  if (NeedBoolTrunc)
+    Out << ")&1)";
+}
+
+
+void CWriter::writeOperandInternal(Value *Operand, bool Static) {
+  if (Instruction *I = dyn_cast<Instruction>(Operand))
+    // Should we inline this instruction to build a tree?
+    if (isInlinableInst(*I) && !isDirectAlloca(I)) {
+      Out << '(';
+      writeInstComputationInline(*I);
+      Out << ')';
+      return;
+    }
+
+  Constant* CPV = dyn_cast<Constant>(Operand);
+
+  if (CPV && !isa<GlobalValue>(CPV))
+    printConstant(CPV, Static);
+  else
+    Out << GetValueName(Operand);
+}
+
+void CWriter::writeOperand(Value *Operand, bool Static) {
+  bool isAddressImplicit = isAddressExposed(Operand);
+  if (isAddressImplicit)
+    Out << "(&";  // Global variables are referenced as their addresses by llvm
+
+  writeOperandInternal(Operand, Static);
+
+  if (isAddressImplicit)
+    Out << ')';
+}
+
+// Some instructions need to have their result value casted back to the 
+// original types because their operands were casted to the expected type. 
+// This function takes care of detecting that case and printing the cast 
+// for the Instruction.
+bool CWriter::writeInstructionCast(const Instruction &I) {
+  const Type *Ty = I.getOperand(0)->getType();
+  switch (I.getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // We need to cast integer arithmetic so that it is always performed
+    // as unsigned, to avoid undefined behavior on overflow.
+  case Instruction::LShr:
+  case Instruction::URem: 
+  case Instruction::UDiv: 
+    Out << "((";
+    printSimpleType(Out, Ty, false);
+    Out << ")(";
+    return true;
+  case Instruction::AShr:
+  case Instruction::SRem: 
+  case Instruction::SDiv: 
+    Out << "((";
+    printSimpleType(Out, Ty, true);
+    Out << ")(";
+    return true;
+  default: break;
+  }
+  return false;
+}
+
+// Write the operand with a cast to another type based on the Opcode being used.
+// This will be used in cases where an instruction has specific type
+// requirements (usually signedness) for its operands. 
+void CWriter::writeOperandWithCast(Value* Operand, unsigned Opcode) {
+
+  // Extract the operand's type, we'll need it.
+  const Type* OpTy = Operand->getType();
+
+  // Indicate whether to do the cast or not.
+  bool shouldCast = false;
+
+  // Indicate whether the cast should be to a signed type or not.
+  bool castIsSigned = false;
+
+  // Based on the Opcode for which this Operand is being written, determine
+  // the new type to which the operand should be casted by setting the value
+  // of OpTy. If we change OpTy, also set shouldCast to true.
+  switch (Opcode) {
+    default:
+      // for most instructions, it doesn't matter
+      break; 
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+      // We need to cast integer arithmetic so that it is always performed
+      // as unsigned, to avoid undefined behavior on overflow.
+    case Instruction::LShr:
+    case Instruction::UDiv:
+    case Instruction::URem: // Cast to unsigned first
+      shouldCast = true;
+      castIsSigned = false;
+      break;
+    case Instruction::GetElementPtr:
+    case Instruction::AShr:
+    case Instruction::SDiv:
+    case Instruction::SRem: // Cast to signed first
+      shouldCast = true;
+      castIsSigned = true;
+      break;
+  }
+
+  // Write out the casted operand if we should, otherwise just write the
+  // operand.
+  if (shouldCast) {
+    Out << "((";
+    printSimpleType(Out, OpTy, castIsSigned);
+    Out << ")";
+    writeOperand(Operand);
+    Out << ")";
+  } else 
+    writeOperand(Operand);
+}
+
+// Write the operand with a cast to another type based on the icmp predicate 
+// being used. 
+void CWriter::writeOperandWithCast(Value* Operand, const ICmpInst &Cmp) {
+  // This has to do a cast to ensure the operand has the right signedness. 
+  // Also, if the operand is a pointer, we make sure to cast to an integer when
+  // doing the comparison both for signedness and so that the C compiler doesn't
+  // optimize things like "p < NULL" to false (p may contain an integer value
+  // f.e.).
+  bool shouldCast = Cmp.isRelational();
+
+  // Write out the casted operand if we should, otherwise just write the
+  // operand.
+  if (!shouldCast) {
+    writeOperand(Operand);
+    return;
+  }
+  
+  // Should this be a signed comparison?  If so, convert to signed.
+  bool castIsSigned = Cmp.isSigned();
+
+  // If the operand was a pointer, convert to a large integer type.
+  const Type* OpTy = Operand->getType();
+  if (isa<PointerType>(OpTy))
+    OpTy = TD->getIntPtrType(Operand->getContext());
+  
+  Out << "((";
+  printSimpleType(Out, OpTy, castIsSigned);
+  Out << ")";
+  writeOperand(Operand);
+  Out << ")";
+}
+
// generateCompilerSpecificCode - This is where we add conditional compilation
// directives to cater to specific compilers as need be.
//
// Everything emitted here is preprocessor boilerplate the rest of the
// generated program relies on: an alloca declaration, linkage/visibility
// attribute macros, NaN/Inf helper macros, and 128-bit integer typedefs.
// NOTE(review): the TD parameter is not referenced in this body.
static void generateCompilerSpecificCode(formatted_raw_ostream& Out,
                                         const TargetData *TD) {
  // Alloca is hard to get, and we don't want to include stdlib.h here.
  Out << "/* get a declaration for alloca */\n"
      << "#if defined(__CYGWIN__) || defined(__MINGW32__)\n"
      << "#define  alloca(x) __builtin_alloca((x))\n"
      << "#define _alloca(x) __builtin_alloca((x))\n"
      << "#elif defined(__APPLE__)\n"
      << "extern void *__builtin_alloca(unsigned long);\n"
      << "#define alloca(x) __builtin_alloca(x)\n"
      << "#define longjmp _longjmp\n"
      << "#define setjmp _setjmp\n"
      << "#elif defined(__sun__)\n"
      << "#if defined(__sparcv9)\n"
      << "extern void *__builtin_alloca(unsigned long);\n"
      << "#else\n"
      << "extern void *__builtin_alloca(unsigned int);\n"
      << "#endif\n"
      << "#define alloca(x) __builtin_alloca(x)\n"
      << "#elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__arm__)\n"
      << "#define alloca(x) __builtin_alloca(x)\n"
      << "#elif defined(_MSC_VER)\n"
      << "#define inline _inline\n"
      << "#define alloca(x) _alloca(x)\n"
      << "#else\n"
      << "#include <alloca.h>\n"
      << "#endif\n\n";

  // We output GCC specific attributes to preserve 'linkonce'ness on globals.
  // If we aren't being compiled with GCC, just drop these attributes.
  Out << "#ifndef __GNUC__  /* Can only support \"linkonce\" vars with GCC */\n"
      << "#define __attribute__(X)\n"
      << "#endif\n\n";

  // On Mac OS X, "external weak" is spelled "__attribute__((weak_import))".
  Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
      << "#define __EXTERNAL_WEAK__ __attribute__((weak_import))\n"
      << "#elif defined(__GNUC__)\n"
      << "#define __EXTERNAL_WEAK__ __attribute__((weak))\n"
      << "#else\n"
      << "#define __EXTERNAL_WEAK__\n"
      << "#endif\n\n";

  // For now, turn off the weak linkage attribute on Mac OS X. (See above.)
  Out << "#if defined(__GNUC__) && defined(__APPLE_CC__)\n"
      << "#define __ATTRIBUTE_WEAK__\n"
      << "#elif defined(__GNUC__)\n"
      << "#define __ATTRIBUTE_WEAK__ __attribute__((weak))\n"
      << "#else\n"
      << "#define __ATTRIBUTE_WEAK__\n"
      << "#endif\n\n";

  // Add hidden visibility support. FIXME: APPLE_CC?
  Out << "#if defined(__GNUC__)\n"
      << "#define __HIDDEN__ __attribute__((visibility(\"hidden\")))\n"
      << "#endif\n\n";

  // Define NaN and Inf as GCC builtins if using GCC, as 0 otherwise
  // From the GCC documentation:
  //
  //   double __builtin_nan (const char *str)
  //
  // This is an implementation of the ISO C99 function nan.
  //
  // Since ISO C99 defines this function in terms of strtod, which we do
  // not implement, a description of the parsing is in order. The string is
  // parsed as by strtol; that is, the base is recognized by leading 0 or
  // 0x prefixes. The number parsed is placed in the significand such that
  // the least significant bit of the number is at the least significant
  // bit of the significand. The number is truncated to fit the significand
  // field provided. The significand is forced to be a quiet NaN.
  //
  // This function, if given a string literal, is evaluated early enough
  // that it is considered a compile-time constant.
  //
  //   float __builtin_nanf (const char *str)
  //
  // Similar to __builtin_nan, except the return type is float.
  //
  //   double __builtin_inf (void)
  //
  // Similar to __builtin_huge_val, except a warning is generated if the
  // target floating-point format does not support infinities. This
  // function is suitable for implementing the ISO C99 macro INFINITY.
  //
  //   float __builtin_inff (void)
  //
  // Similar to __builtin_inf, except the return type is float.
  Out << "#ifdef __GNUC__\n"
      << "#define LLVM_NAN(NanStr)   __builtin_nan(NanStr)   /* Double */\n"
      << "#define LLVM_NANF(NanStr)  __builtin_nanf(NanStr)  /* Float */\n"
      << "#define LLVM_NANS(NanStr)  __builtin_nans(NanStr)  /* Double */\n"
      << "#define LLVM_NANSF(NanStr) __builtin_nansf(NanStr) /* Float */\n"
      << "#define LLVM_INF           __builtin_inf()         /* Double */\n"
      << "#define LLVM_INFF          __builtin_inff()        /* Float */\n"
      << "#define LLVM_PREFETCH(addr,rw,locality) "
                              "__builtin_prefetch(addr,rw,locality)\n"
      << "#define __ATTRIBUTE_CTOR__ __attribute__((constructor))\n"
      << "#define __ATTRIBUTE_DTOR__ __attribute__((destructor))\n"
      << "#define LLVM_ASM           __asm__\n"
      << "#else\n"
      << "#define LLVM_NAN(NanStr)   ((double)0.0)           /* Double */\n"
      << "#define LLVM_NANF(NanStr)  0.0F                    /* Float */\n"
      << "#define LLVM_NANS(NanStr)  ((double)0.0)           /* Double */\n"
      << "#define LLVM_NANSF(NanStr) 0.0F                    /* Float */\n"
      << "#define LLVM_INF           ((double)0.0)           /* Double */\n"
      << "#define LLVM_INFF          0.0F                    /* Float */\n"
      << "#define LLVM_PREFETCH(addr,rw,locality)            /* PREFETCH */\n"
      << "#define __ATTRIBUTE_CTOR__\n"
      << "#define __ATTRIBUTE_DTOR__\n"
      << "#define LLVM_ASM(X)\n"
      << "#endif\n\n";

  // Stack save/restore intrinsics degrade to no-ops on old/non-GCC compilers.
  Out << "#if __GNUC__ < 4 /* Old GCC's, or compilers not GCC */ \n"
      << "#define __builtin_stack_save() 0   /* not implemented */\n"
      << "#define __builtin_stack_restore(X) /* noop */\n"
      << "#endif\n\n";

  // Output typedefs for 128-bit integers. If these are needed with a
  // 32-bit target or with a C compiler that doesn't support mode(TI),
  // more drastic measures will be needed.
  Out << "#if __GNUC__ && __LP64__ /* 128-bit integer types */\n"
      << "typedef int __attribute__((mode(TI))) llvmInt128;\n"
      << "typedef unsigned __attribute__((mode(TI))) llvmUInt128;\n"
      << "#endif\n\n";

  // Output target-specific code that should be inserted into main.
  Out << "#define CODE_FOR_MAIN() /* Any target-specific code for main()*/\n";
}
+
+/// FindStaticTors - Given a static ctor/dtor list, unpack its contents into
+/// the StaticTors set.
+static void FindStaticTors(GlobalVariable *GV, std::set<Function*> &StaticTors){
+  ConstantArray *InitList = dyn_cast<ConstantArray>(GV->getInitializer());
+  if (!InitList) return;
+  
+  for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+    if (ConstantStruct *CS = dyn_cast<ConstantStruct>(InitList->getOperand(i))){
+      if (CS->getNumOperands() != 2) return;  // Not array of 2-element structs.
+      
+      if (CS->getOperand(1)->isNullValue())
+        return;  // Found a null terminator, exit printing.
+      Constant *FP = CS->getOperand(1);
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(FP))
+        if (CE->isCast())
+          FP = CE->getOperand(0);
+      if (Function *F = dyn_cast<Function>(FP))
+        StaticTors.insert(F);
+    }
+}
+
// Classification of globals that the C writer treats specially rather than
// emitting as ordinary variables.
enum SpecialGlobalClass {
  NotSpecial = 0,            // Ordinary global: printed normally.
  GlobalCtors, GlobalDtors,  // llvm.global_ctors / llvm.global_dtors lists.
  NotPrinted                 // Metadata (e.g. debug info): not emitted at all.
};
+
+/// getGlobalVariableClass - If this is a global that is specially recognized
+/// by LLVM, return a code that indicates how we should handle it.
+static SpecialGlobalClass getGlobalVariableClass(const GlobalVariable *GV) {
+  // If this is a global ctors/dtors list, handle it now.
+  if (GV->hasAppendingLinkage() && GV->use_empty()) {
+    if (GV->getName() == "llvm.global_ctors")
+      return GlobalCtors;
+    else if (GV->getName() == "llvm.global_dtors")
+      return GlobalDtors;
+  }
+  
+  // Otherwise, if it is other metadata, don't print it.  This catches things
+  // like debug information.
+  if (GV->getSection() == "llvm.metadata")
+    return NotPrinted;
+  
+  return NotSpecial;
+}
+
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+static void PrintEscapedString(const char *Str, unsigned Length,
+                               raw_ostream &Out) {
+  for (unsigned i = 0; i != Length; ++i) {
+    unsigned char C = Str[i];
+    if (isprint(C) && C != '\\' && C != '"')
+      Out << C;
+    else if (C == '\\')
+      Out << "\\\\";
+    else if (C == '\"')
+      Out << "\\\"";
+    else if (C == '\t')
+      Out << "\\t";
+    else
+      Out << "\\x" << hexdigit(C >> 4) << hexdigit(C & 0x0F);
+  }
+}
+
+// PrintEscapedString - Print each character of the specified string, escaping
+// it if it is not printable or if it is an escape char.
+static void PrintEscapedString(const std::string &Str, raw_ostream &Out) {
+  PrintEscapedString(Str.c_str(), Str.size(), Out);
+}
+
/// doInitialization - Emit everything that must precede the function bodies:
/// compiler-specific boilerplate, module inline asm, type declarations,
/// global variable declarations/definitions, function prototypes, and the
/// FP-comparison helper functions.  Ordering matters: C requires all
/// declarations before first use.  Always returns false (the module itself
/// is not modified).
bool CWriter::doInitialization(Module &M) {
  FunctionPass::doInitialization(M);
  
  // Initialize
  TheModule = &M;

  // NOTE(review): raw 'new's stored into members; presumably released in
  // doFinalization -- confirm.
  TD = new TargetData(&M);
  IL = new IntrinsicLowering(*TD);
  IL->AddPrototypes(M);

#if 0
  std::string Triple = TheModule->getTargetTriple();
  if (Triple.empty())
    Triple = llvm::sys::getHostTriple();
  
  std::string E;
  if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
    TAsm = Match->createAsmInfo(Triple);
#endif    
  TAsm = new CBEMCAsmInfo();
  Mang = new Mangler(*TAsm);

  // Keep track of which functions are static ctors/dtors so they can have
  // an attribute added to their prototypes.
  std::set<Function*> StaticCtors, StaticDtors;
  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
       I != E; ++I) {
    switch (getGlobalVariableClass(I)) {
    default: break;
    case GlobalCtors:
      FindStaticTors(I, StaticCtors);
      break;
    case GlobalDtors:
      FindStaticTors(I, StaticDtors);
      break;
    }
  }
  
  // get declaration for alloca
  Out << "/* Provide Declarations */\n";
  Out << "#include <stdarg.h>\n";      // Varargs support
  Out << "#include <setjmp.h>\n";      // Unwind support
  generateCompilerSpecificCode(Out, TD);

  // Provide a definition for `bool' if not compiling with a C++ compiler.
  Out << "\n"
      << "#ifndef __cplusplus\ntypedef unsigned char bool;\n#endif\n"

      << "\n\n/* Support for floating point constants */\n"
      << "typedef unsigned long long ConstantDoubleTy;\n"
      << "typedef unsigned int        ConstantFloatTy;\n"
      << "typedef struct { unsigned long long f1; unsigned short f2; "
         "unsigned short pad[3]; } ConstantFP80Ty;\n"
      // This is used for both kinds of 128-bit long double; meaning differs.
      << "typedef struct { unsigned long long f1; unsigned long long f2; }"
         " ConstantFP128Ty;\n"
      << "\n\n/* Global Declarations */\n";

  // First output all the declarations for the program, because C requires
  // Functions & globals to be declared before they are used.
  //
  if (!M.getModuleInlineAsm().empty()) {
    Out << "/* Module asm statements */\n"
        << "asm(";

    // Split the string into lines, to make it easier to read the .ll file.
    std::string Asm = M.getModuleInlineAsm();
    size_t CurPos = 0;
    size_t NewLine = Asm.find_first_of('\n', CurPos);
    while (NewLine != std::string::npos) {
      // We found a newline, print the portion of the asm string from the
      // last newline up to this newline.
      Out << "\"";
      PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.begin()+NewLine),
                         Out);
      Out << "\\n\"\n";
      CurPos = NewLine+1;
      NewLine = Asm.find_first_of('\n', CurPos);
    }
    // Emit the final (or only) line, which has no trailing newline.
    Out << "\"";
    PrintEscapedString(std::string(Asm.begin()+CurPos, Asm.end()), Out);
    Out << "\");\n"
        << "/* End Module asm statements */\n";
  }

  // Loop over the symbol table, emitting all named constants...
  printModuleTypes(M.getTypeSymbolTable());

  // Global variable declarations...
  if (!M.global_empty()) {
    Out << "\n/* External Global Variable Declarations */\n";
    for (Module::global_iterator I = M.global_begin(), E = M.global_end();
         I != E; ++I) {

      if (I->hasExternalLinkage() || I->hasExternalWeakLinkage() || 
          I->hasCommonLinkage())
        Out << "extern ";
      else if (I->hasDLLImportLinkage())
        Out << "__declspec(dllimport) ";
      else
        continue; // Internal Global

      // Thread Local Storage
      if (I->isThreadLocal())
        Out << "__thread ";

      printType(Out, I->getType()->getElementType(), false, GetValueName(I));

      if (I->hasExternalWeakLinkage())
         Out << " __EXTERNAL_WEAK__";
      Out << ";\n";
    }
  }

  // Function declarations
  Out << "\n/* Function Declarations */\n";
  Out << "double fmod(double, double);\n";   // Support for FP rem
  Out << "float fmodf(float, float);\n";
  Out << "long double fmodl(long double, long double);\n";
  
  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
    // Don't print declarations for intrinsic functions.
    // setjmp/longjmp/_setjmp are also skipped; their declarations come from
    // the emitted <setjmp.h> include.
    if (!I->isIntrinsic() && I->getName() != "setjmp" &&
        I->getName() != "longjmp" && I->getName() != "_setjmp") {
      if (I->hasExternalWeakLinkage())
        Out << "extern ";
      printFunctionSignature(I, true);
      if (I->hasWeakLinkage() || I->hasLinkOnceLinkage()) 
        Out << " __ATTRIBUTE_WEAK__";
      if (I->hasExternalWeakLinkage())
        Out << " __EXTERNAL_WEAK__";
      if (StaticCtors.count(I))
        Out << " __ATTRIBUTE_CTOR__";
      if (StaticDtors.count(I))
        Out << " __ATTRIBUTE_DTOR__";
      if (I->hasHiddenVisibility())
        Out << " __HIDDEN__";
      
      // A leading \1 byte in the name means "use this exact assembler name".
      if (I->hasName() && I->getName()[0] == 1)
        Out << " LLVM_ASM(\"" << I->getName().substr(1) << "\")";
          
      Out << ";\n";
    }
  }

  // Output the global variable declarations
  if (!M.global_empty()) {
    Out << "\n\n/* Global Variable Declarations */\n";
    for (Module::global_iterator I = M.global_begin(), E = M.global_end();
         I != E; ++I)
      if (!I->isDeclaration()) {
        // Ignore special globals, such as debug info.
        if (getGlobalVariableClass(I))
          continue;

        if (I->hasLocalLinkage())
          Out << "static ";
        else
          Out << "extern ";

        // Thread Local Storage
        if (I->isThreadLocal())
          Out << "__thread ";

        printType(Out, I->getType()->getElementType(), false, 
                  GetValueName(I));

        if (I->hasLinkOnceLinkage())
          Out << " __attribute__((common))";
        else if (I->hasCommonLinkage())     // FIXME is this right?
          Out << " __ATTRIBUTE_WEAK__";
        else if (I->hasWeakLinkage())
          Out << " __ATTRIBUTE_WEAK__";
        else if (I->hasExternalWeakLinkage())
          Out << " __EXTERNAL_WEAK__";
        if (I->hasHiddenVisibility())
          Out << " __HIDDEN__";
        Out << ";\n";
      }
  }

  // Output the global variable definitions and contents...
  if (!M.global_empty()) {
    Out << "\n\n/* Global Variable Definitions and Initialization */\n";
    for (Module::global_iterator I = M.global_begin(), E = M.global_end(); 
         I != E; ++I)
      if (!I->isDeclaration()) {
        // Ignore special globals, such as debug info.
        if (getGlobalVariableClass(I))
          continue;

        if (I->hasLocalLinkage())
          Out << "static ";
        else if (I->hasDLLImportLinkage())
          Out << "__declspec(dllimport) ";
        else if (I->hasDLLExportLinkage())
          Out << "__declspec(dllexport) ";

        // Thread Local Storage
        if (I->isThreadLocal())
          Out << "__thread ";

        printType(Out, I->getType()->getElementType(), false, 
                  GetValueName(I));
        if (I->hasLinkOnceLinkage())
          Out << " __attribute__((common))";
        else if (I->hasWeakLinkage())
          Out << " __ATTRIBUTE_WEAK__";
        else if (I->hasCommonLinkage())
          Out << " __ATTRIBUTE_WEAK__";

        if (I->hasHiddenVisibility())
          Out << " __HIDDEN__";
        
        // If the initializer is not null, emit the initializer.  If it is null,
        // we try to avoid emitting large amounts of zeros.  The problem with
        // this, however, occurs when the variable has weak linkage.  In this
        // case, the assembler will complain about the variable being both weak
        // and common, so we disable this optimization.
        // FIXME common linkage should avoid this problem.
        if (!I->getInitializer()->isNullValue()) {
          Out << " = " ;
          writeOperand(I->getInitializer(), true);
        } else if (I->hasWeakLinkage()) {
          // We have to specify an initializer, but it doesn't have to be
          // complete.  If the value is an aggregate, print out { 0 }, and let
          // the compiler figure out the rest of the zeros.
          Out << " = " ;
          if (isa<StructType>(I->getInitializer()->getType()) ||
              isa<VectorType>(I->getInitializer()->getType())) {
            Out << "{ 0 }";
          } else if (isa<ArrayType>(I->getInitializer()->getType())) {
            // As with structs and vectors, but with an extra set of braces
            // because arrays are wrapped in structs.
            Out << "{ { 0 } }";
          } else {
            // Just print it out normally.
            writeOperand(I->getInitializer(), true);
          }
        }
        Out << ";\n";
      }
  }

  if (!M.empty())
    Out << "\n\n/* Function Bodies */\n";

  // Emit some helper functions for dealing with FCMP instruction's 
  // predicates
  Out << "static inline int llvm_fcmp_ord(double X, double Y) { ";
  Out << "return X == X && Y == Y; }\n";
  Out << "static inline int llvm_fcmp_uno(double X, double Y) { ";
  Out << "return X != X || Y != Y; }\n";
  Out << "static inline int llvm_fcmp_ueq(double X, double Y) { ";
  Out << "return X == Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "static inline int llvm_fcmp_une(double X, double Y) { ";
  Out << "return X != Y; }\n";
  Out << "static inline int llvm_fcmp_ult(double X, double Y) { ";
  Out << "return X <  Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "static inline int llvm_fcmp_ugt(double X, double Y) { ";
  Out << "return X >  Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "static inline int llvm_fcmp_ule(double X, double Y) { ";
  Out << "return X <= Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "static inline int llvm_fcmp_uge(double X, double Y) { ";
  Out << "return X >= Y || llvm_fcmp_uno(X, Y); }\n";
  Out << "static inline int llvm_fcmp_oeq(double X, double Y) { ";
  Out << "return X == Y ; }\n";
  Out << "static inline int llvm_fcmp_one(double X, double Y) { ";
  Out << "return X != Y && llvm_fcmp_ord(X, Y); }\n";
  Out << "static inline int llvm_fcmp_olt(double X, double Y) { ";
  Out << "return X <  Y ; }\n";
  Out << "static inline int llvm_fcmp_ogt(double X, double Y) { ";
  Out << "return X >  Y ; }\n";
  Out << "static inline int llvm_fcmp_ole(double X, double Y) { ";
  Out << "return X <= Y ; }\n";
  Out << "static inline int llvm_fcmp_oge(double X, double Y) { ";
  Out << "return X >= Y ; }\n";
  return false;
}
+
+
+/// Output all floating point constants that cannot be printed accurately...
+void CWriter::printFloatingPointConstants(Function &F) {
+  // Scan the module for floating point constants.  If any FP constant is used
+  // in the function, we want to redirect it here so that we do not depend on
+  // the precision of the printed form, unless the printed form preserves
+  // precision.
+  //
+  for (constant_iterator I = constant_begin(&F), E = constant_end(&F);
+       I != E; ++I)
+    printFloatingPointConstants(*I);
+
+  Out << '\n';
+}
+
/// printFloatingPointConstants - Emit a numbered static definition for each
/// FP constant (transitively) referenced by C that cannot be printed
/// accurately in decimal, recording its index in FPConstantMap so later
/// uses can reference the emitted symbol instead of a decimal literal.
void CWriter::printFloatingPointConstants(const Constant *C) {
  // If this is a constant expression, recursively check for constant fp values.
  if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
    for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i)
      printFloatingPointConstants(CE->getOperand(i));
    return;
  }
    
  // Otherwise, check for a FP constant that we need to print.
  const ConstantFP *FPC = dyn_cast<ConstantFP>(C);
  if (FPC == 0 ||
      // Do not put in FPConstantMap if safe.
      isFPCSafeToPrint(FPC) ||
      // Already printed this constant?
      FPConstantMap.count(FPC))
    return;

  FPConstantMap[FPC] = FPCounter;  // Number the FP constants
  
  // Each constant is emitted as its exact bit pattern (via utohexstr) with
  // the approximate decimal value only in a trailing comment, so the
  // generated C does not depend on decimal round-tripping.
  if (FPC->getType() == Type::getDoubleTy(FPC->getContext())) {
    double Val = FPC->getValueAPF().convertToDouble();
    uint64_t i = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
    Out << "static const ConstantDoubleTy FPConstant" << FPCounter++
    << " = 0x" << utohexstr(i)
    << "ULL;    /* " << Val << " */\n";
  } else if (FPC->getType() == Type::getFloatTy(FPC->getContext())) {
    float Val = FPC->getValueAPF().convertToFloat();
    uint32_t i = (uint32_t)FPC->getValueAPF().bitcastToAPInt().
    getZExtValue();
    Out << "static const ConstantFloatTy FPConstant" << FPCounter++
    << " = 0x" << utohexstr(i)
    << "U;    /* " << Val << " */\n";
  } else if (FPC->getType() == Type::getX86_FP80Ty(FPC->getContext())) {
    // api needed to prevent premature destruction
    APInt api = FPC->getValueAPF().bitcastToAPInt();
    const uint64_t *p = api.getRawData();
    // x86 long double: p[0] is the 64-bit mantissa word, the low 16 bits of
    // p[1] are the sign/exponent; the {0,0,0} pads the struct to full size.
    Out << "static const ConstantFP80Ty FPConstant" << FPCounter++
    << " = { 0x" << utohexstr(p[0]) 
    << "ULL, 0x" << utohexstr((uint16_t)p[1]) << ",{0,0,0}"
    << "}; /* Long double constant */\n";
  } else if (FPC->getType() == Type::getPPC_FP128Ty(FPC->getContext()) ||
             FPC->getType() == Type::getFP128Ty(FPC->getContext())) {
    // 128-bit formats: emit both 64-bit words of the bit pattern.
    APInt api = FPC->getValueAPF().bitcastToAPInt();
    const uint64_t *p = api.getRawData();
    Out << "static const ConstantFP128Ty FPConstant" << FPCounter++
    << " = { 0x"
    << utohexstr(p[0]) << ", 0x" << utohexstr(p[1])
    << "}; /* Long double constant */\n";
    
  } else {
    llvm_unreachable("Unknown float type!");
  }
}
+
+
+
+/// printSymbolTable - Run through symbol table looking for type names.  If a
+/// type name is found, emit its declaration...
+///
+void CWriter::printModuleTypes(const TypeSymbolTable &TST) {
+  Out << "/* Helper union for bitcasts */\n";
+  Out << "typedef union {\n";
+  Out << "  unsigned int Int32;\n";
+  Out << "  unsigned long long Int64;\n";
+  Out << "  float Float;\n";
+  Out << "  double Double;\n";
+  Out << "} llvmBitCastUnion;\n";
+
+  // We are only interested in the type plane of the symbol table.
+  TypeSymbolTable::const_iterator I   = TST.begin();
+  TypeSymbolTable::const_iterator End = TST.end();
+
+  // If there are no type names, exit early.
+  if (I == End) return;
+
+  // Print out forward declarations for structure types before anything else!
+  Out << "/* Structure forward decls */\n";
+  for (; I != End; ++I) {
+    std::string Name = "struct " + CBEMangle("l_"+I->first);
+    Out << Name << ";\n";
+    TypeNames.insert(std::make_pair(I->second, Name));
+  }
+
+  Out << '\n';
+
+  // Now we can print out typedefs.  Above, we guaranteed that this can only be
+  // for struct or opaque types.
+  Out << "/* Typedefs */\n";
+  for (I = TST.begin(); I != End; ++I) {
+    std::string Name = CBEMangle("l_"+I->first);
+    Out << "typedef ";
+    printType(Out, I->second, false, Name);
+    Out << ";\n";
+  }
+
+  Out << '\n';
+
+  // Keep track of which structures have been printed so far...
+  std::set<const Type *> StructPrinted;
+
+  // Loop over all structures then push them into the stack so they are
+  // printed in the correct order.
+  //
+  Out << "/* Structure contents */\n";
+  for (I = TST.begin(); I != End; ++I)
+    if (isa<StructType>(I->second) || isa<ArrayType>(I->second))
+      // Only print out used types!
+      printContainedStructs(I->second, StructPrinted);
+}
+
+// Push the struct onto the stack and recursively push all structs
+// this one depends on.
+//
+// TODO:  Make this work properly with vector types
+//
+void CWriter::printContainedStructs(const Type *Ty,
+                                    std::set<const Type*> &StructPrinted) {
+  // Don't walk through pointers.
+  if (isa<PointerType>(Ty) || Ty->isPrimitiveType() || Ty->isInteger()) return;
+  
+  // Print all contained types first.
+  for (Type::subtype_iterator I = Ty->subtype_begin(),
+       E = Ty->subtype_end(); I != E; ++I)
+    printContainedStructs(*I, StructPrinted);
+  
+  if (isa<StructType>(Ty) || isa<ArrayType>(Ty)) {
+    // Check to see if we have already printed this struct.
+    if (StructPrinted.insert(Ty).second) {
+      // Print structure type out.
+      std::string Name = TypeNames[Ty];
+      printType(Out, Ty, false, Name, true);
+      Out << ";\n\n";
+    }
+  }
+}
+
+void CWriter::printFunctionSignature(const Function *F, bool Prototype) {
+  /// isStructReturn - Should this function actually return a struct by-value?
+  bool isStructReturn = F->hasStructRetAttr();
+  
+  if (F->hasLocalLinkage()) Out << "static ";
+  if (F->hasDLLImportLinkage()) Out << "__declspec(dllimport) ";
+  if (F->hasDLLExportLinkage()) Out << "__declspec(dllexport) ";  
+  switch (F->getCallingConv()) {
+   case CallingConv::X86_StdCall:
+    Out << "__attribute__((stdcall)) ";
+    break;
+   case CallingConv::X86_FastCall:
+    Out << "__attribute__((fastcall)) ";
+    break;
+   default:
+    break;
+  }
+  
+  // Loop over the arguments, printing them...
+  const FunctionType *FT = cast<FunctionType>(F->getFunctionType());
+  const AttrListPtr &PAL = F->getAttributes();
+
+  std::stringstream FunctionInnards;
+
+  // Print out the name...
+  FunctionInnards << GetValueName(F) << '(';
+
+  bool PrintedArg = false;
+  if (!F->isDeclaration()) {
+    if (!F->arg_empty()) {
+      Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+      unsigned Idx = 1;
+      
+      // If this is a struct-return function, don't print the hidden
+      // struct-return argument.
+      if (isStructReturn) {
+        assert(I != E && "Invalid struct return function!");
+        ++I;
+        ++Idx;
+      }
+      
+      std::string ArgName;
+      for (; I != E; ++I) {
+        if (PrintedArg) FunctionInnards << ", ";
+        if (I->hasName() || !Prototype)
+          ArgName = GetValueName(I);
+        else
+          ArgName = "";
+        const Type *ArgTy = I->getType();
+        if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
+          ArgTy = cast<PointerType>(ArgTy)->getElementType();
+          ByValParams.insert(I);
+        }
+        printType(FunctionInnards, ArgTy,
+            /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt),
+            ArgName);
+        PrintedArg = true;
+        ++Idx;
+      }
+    }
+  } else {
+    // Loop over the arguments, printing them.
+    FunctionType::param_iterator I = FT->param_begin(), E = FT->param_end();
+    unsigned Idx = 1;
+    
+    // If this is a struct-return function, don't print the hidden
+    // struct-return argument.
+    if (isStructReturn) {
+      assert(I != E && "Invalid struct return function!");
+      ++I;
+      ++Idx;
+    }
+    
+    for (; I != E; ++I) {
+      if (PrintedArg) FunctionInnards << ", ";
+      const Type *ArgTy = *I;
+      if (PAL.paramHasAttr(Idx, Attribute::ByVal)) {
+        assert(isa<PointerType>(ArgTy));
+        ArgTy = cast<PointerType>(ArgTy)->getElementType();
+      }
+      printType(FunctionInnards, ArgTy,
+             /*isSigned=*/PAL.paramHasAttr(Idx, Attribute::SExt));
+      PrintedArg = true;
+      ++Idx;
+    }
+  }
+
+  // Finish printing arguments... if this is a vararg function, print the ...,
+  // unless there are no known types, in which case, we just emit ().
+  //
+  if (FT->isVarArg() && PrintedArg) {
+    if (PrintedArg) FunctionInnards << ", ";
+    FunctionInnards << "...";  // Output varargs portion of signature!
+  } else if (!FT->isVarArg() && !PrintedArg) {
+    FunctionInnards << "void"; // ret() -> ret(void) in C.
+  }
+  FunctionInnards << ')';
+  
+  // Get the return tpe for the function.
+  const Type *RetTy;
+  if (!isStructReturn)
+    RetTy = F->getReturnType();
+  else {
+    // If this is a struct-return function, print the struct-return type.
+    RetTy = cast<PointerType>(FT->getParamType(0))->getElementType();
+  }
+    
+  // Print out the return type and the signature built above.
+  printType(Out, RetTy, 
+            /*isSigned=*/PAL.paramHasAttr(0, Attribute::SExt),
+            FunctionInnards.str());
+}
+
+static inline bool isFPIntBitCast(const Instruction &I) {
+  if (!isa<BitCastInst>(I))
+    return false;
+  const Type *SrcTy = I.getOperand(0)->getType();
+  const Type *DstTy = I.getType();
+  return (SrcTy->isFloatingPoint() && DstTy->isInteger()) ||
+         (DstTy->isFloatingPoint() && SrcTy->isInteger());
+}
+
/// printFunction - Emit the complete C definition of F: signature, local
/// variable declarations (address-exposed allocas, non-inlined results, PHI
/// temporaries, bitcast unions), then each basic block or syntactic loop.
void CWriter::printFunction(Function &F) {
  /// isStructReturn - Should this function actually return a struct by-value?
  bool isStructReturn = F.hasStructRetAttr();

  printFunctionSignature(&F, false);
  Out << " {\n";
  
  // If this is a struct return function, handle the result with magic.
  // Declare a local StructReturn temporary and bind the hidden sret pointer
  // argument to its address; visitReturnInst then returns it by value.
  if (isStructReturn) {
    const Type *StructTy =
      cast<PointerType>(F.arg_begin()->getType())->getElementType();
    Out << "  ";
    printType(Out, StructTy, false, "StructReturn");
    Out << ";  /* Struct return temporary */\n";

    Out << "  ";
    printType(Out, F.arg_begin()->getType(), false, 
              GetValueName(F.arg_begin()));
    Out << " = &StructReturn;\n";
  }

  bool PrintedVar = false;
  
  // print local variable information for the function
  for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
    if (const AllocaInst *AI = isDirectAlloca(&*I)) {
      // A directly-used alloca becomes a plain C local whose address is the
      // alloca's value.
      Out << "  ";
      printType(Out, AI->getAllocatedType(), false, GetValueName(AI));
      Out << ";    /* Address-exposed local */\n";
      PrintedVar = true;
    } else if (I->getType() != Type::getVoidTy(F.getContext()) && 
               !isInlinableInst(*I)) {
      // Any value-producing instruction that is not inlined into its user
      // needs a named local to hold its result.
      Out << "  ";
      printType(Out, I->getType(), false, GetValueName(&*I));
      Out << ";\n";

      if (isa<PHINode>(*I)) {  // Print out PHI node temporaries as well...
        Out << "  ";
        printType(Out, I->getType(), false,
                  GetValueName(&*I)+"__PHI_TEMPORARY");
        Out << ";\n";
      }
      PrintedVar = true;
    }
    // We need a temporary for the BitCast to use so it can pluck a value out
    // of a union to do the BitCast. This is separate from the need for a
    // variable to hold the result of the BitCast. 
    if (isFPIntBitCast(*I)) {
      Out << "  llvmBitCastUnion " << GetValueName(&*I)
          << "__BITCAST_TEMPORARY;\n";
      PrintedVar = true;
    }
  }

  if (PrintedVar)
    Out << '\n';

  // main() gets a hook macro the runtime header can define (e.g. for setup).
  if (F.hasExternalLinkage() && F.getName() == "main")
    Out << "  CODE_FOR_MAIN();\n";

  // print the basic blocks
  // Blocks inside a loop are emitted by printLoop when we reach the header
  // of an outermost loop; all other blocks are emitted directly.
  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
    if (Loop *L = LI->getLoopFor(BB)) {
      if (L->getHeader() == BB && L->getParentLoop() == 0)
        printLoop(L);
    } else {
      printBasicBlock(BB);
    }
  }

  Out << "}\n\n";
}
+
+void CWriter::printLoop(Loop *L) {
+  Out << "  do {     /* Syntactic loop '" << L->getHeader()->getName()
+      << "' to make GCC happy */\n";
+  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
+    BasicBlock *BB = L->getBlocks()[i];
+    Loop *BBLoop = LI->getLoopFor(BB);
+    if (BBLoop == L)
+      printBasicBlock(BB);
+    else if (BB == BBLoop->getHeader() && BBLoop->getParentLoop() == L)
+      printLoop(BBLoop);
+  }
+  Out << "  } while (1); /* end of syntactic loop '"
+      << L->getHeader()->getName() << "' */\n";
+}
+
+void CWriter::printBasicBlock(BasicBlock *BB) {
+
+  // Don't print the label for the basic block if there are no uses, or if
+  // the only terminator use is the predecessor basic block's terminator.
+  // We have to scan the use list because PHI nodes use basic blocks too but
+  // do not require a label to be generated.
+  //
+  bool NeedsLabel = false;
+  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+    if (isGotoCodeNecessary(*PI, BB)) {
+      NeedsLabel = true;
+      break;
+    }
+
+  if (NeedsLabel) Out << GetValueName(BB) << ":\n";
+
+  // Output all of the instructions in the basic block...
+  for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E;
+       ++II) {
+    if (!isInlinableInst(*II) && !isDirectAlloca(II)) {
+      if (II->getType() != Type::getVoidTy(BB->getContext()) &&
+          !isInlineAsm(*II))
+        outputLValue(II);
+      else
+        Out << "  ";
+      writeInstComputationInline(*II);
+      Out << ";\n";
+    }
+  }
+
+  // Don't emit prefix or suffix for the terminator.
+  visit(*BB->getTerminator());
+}
+
+
+// Specific Instruction type classes... note that all of the casts are
+// necessary because we use the instruction classes as opaque types...
+//
+void CWriter::visitReturnInst(ReturnInst &I) {
+  // If this is a struct return function, return the temporary struct.
+  bool isStructReturn = I.getParent()->getParent()->hasStructRetAttr();
+
+  if (isStructReturn) {
+    Out << "  return StructReturn;\n";
+    return;
+  }
+  
+  // Don't output a void return if this is the last basic block in the function
+  if (I.getNumOperands() == 0 &&
+      &*--I.getParent()->getParent()->end() == I.getParent() &&
+      !I.getParent()->size() == 1) {
+    return;
+  }
+
+  if (I.getNumOperands() > 1) {
+    Out << "  {\n";
+    Out << "    ";
+    printType(Out, I.getParent()->getParent()->getReturnType());
+    Out << "   llvm_cbe_mrv_temp = {\n";
+    for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+      Out << "      ";
+      writeOperand(I.getOperand(i));
+      if (i != e - 1)
+        Out << ",";
+      Out << "\n";
+    }
+    Out << "    };\n";
+    Out << "    return llvm_cbe_mrv_temp;\n";
+    Out << "  }\n";
+    return;
+  }
+
+  Out << "  return";
+  if (I.getNumOperands()) {
+    Out << ' ';
+    writeOperand(I.getOperand(0));
+  }
+  Out << ";\n";
+}
+
+void CWriter::visitSwitchInst(SwitchInst &SI) {
+
+  Out << "  switch (";
+  writeOperand(SI.getOperand(0));
+  Out << ") {\n  default:\n";
+  printPHICopiesForSuccessor (SI.getParent(), SI.getDefaultDest(), 2);
+  printBranchToBlock(SI.getParent(), SI.getDefaultDest(), 2);
+  Out << ";\n";
+  for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2) {
+    Out << "  case ";
+    writeOperand(SI.getOperand(i));
+    Out << ":\n";
+    BasicBlock *Succ = cast<BasicBlock>(SI.getOperand(i+1));
+    printPHICopiesForSuccessor (SI.getParent(), Succ, 2);
+    printBranchToBlock(SI.getParent(), Succ, 2);
+    if (Function::iterator(Succ) == llvm::next(Function::iterator(SI.getParent())))
+      Out << "    break;\n";
+  }
+  Out << "  }\n";
+}
+
+void CWriter::visitIndirectBrInst(IndirectBrInst &IBI) {
+  Out << "  goto *(void*)(";
+  writeOperand(IBI.getOperand(0));
+  Out << ");\n";
+}
+
+void CWriter::visitUnreachableInst(UnreachableInst &I) {
+  Out << "  /*UNREACHABLE*/;\n";
+}
+
// isGotoCodeNecessary - Return true if transferring control from 'From' to
// 'To' requires an explicit goto (i.e. fall-through is not safe/possible).
bool CWriter::isGotoCodeNecessary(BasicBlock *From, BasicBlock *To) {
  /// FIXME: This should be reenabled, but loop reordering safe!!
  // The unconditional 'return true' below deliberately disables the
  // fall-through optimization; everything after it is intentionally dead
  // until the FIXME above is resolved.
  return true;

  if (llvm::next(Function::iterator(From)) != Function::iterator(To))
    return true;  // Not the direct successor, we need a goto.

  //isa<SwitchInst>(From->getTerminator())

  // Crossing a loop boundary by fall-through would break the syntactic
  // do/while loops emitted by printLoop, so force a goto.
  if (LI->getLoopFor(From) != LI->getLoopFor(To))
    return true;
  return false;
}
+
+void CWriter::printPHICopiesForSuccessor (BasicBlock *CurBlock,
+                                          BasicBlock *Successor,
+                                          unsigned Indent) {
+  for (BasicBlock::iterator I = Successor->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    // Now we have to do the printing.
+    Value *IV = PN->getIncomingValueForBlock(CurBlock);
+    if (!isa<UndefValue>(IV)) {
+      Out << std::string(Indent, ' ');
+      Out << "  " << GetValueName(I) << "__PHI_TEMPORARY = ";
+      writeOperand(IV);
+      Out << ";   /* for PHI node */\n";
+    }
+  }
+}
+
+void CWriter::printBranchToBlock(BasicBlock *CurBB, BasicBlock *Succ,
+                                 unsigned Indent) {
+  if (isGotoCodeNecessary(CurBB, Succ)) {
+    Out << std::string(Indent, ' ') << "  goto ";
+    writeOperand(Succ);
+    Out << ";\n";
+  }
+}
+
// Branch instruction printing - Avoid printing out a branch to a basic block
// that immediately succeeds the current one.
//
void CWriter::visitBranchInst(BranchInst &I) {

  if (I.isConditional()) {
    if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(0))) {
      // Normal shape: "if (cond) { copies; goto s0; } else { copies; goto s1; }"
      Out << "  if (";
      writeOperand(I.getCondition());
      Out << ") {\n";

      printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 2);
      printBranchToBlock(I.getParent(), I.getSuccessor(0), 2);

      // NOTE(review): when the else arm is skipped, no PHI copies are
      // emitted for successor 1 either.  This is safe today only because
      // isGotoCodeNecessary unconditionally returns true — confirm before
      // re-enabling its fall-through logic.
      if (isGotoCodeNecessary(I.getParent(), I.getSuccessor(1))) {
        Out << "  } else {\n";
        printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
        printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
      }
    } else {
      // First goto not necessary, assume second one is...
      // Invert the condition so the arm needing a goto is the taken one.
      Out << "  if (!";
      writeOperand(I.getCondition());
      Out << ") {\n";

      printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(1), 2);
      printBranchToBlock(I.getParent(), I.getSuccessor(1), 2);
    }

    Out << "  }\n";
  } else {
    // Unconditional branch: PHI edge copies plus an optional goto.
    printPHICopiesForSuccessor (I.getParent(), I.getSuccessor(0), 0);
    printBranchToBlock(I.getParent(), I.getSuccessor(0), 0);
  }
  Out << "\n";
}
+
// PHI nodes get copied into temporary values at the end of predecessor basic
// blocks.  We now need to copy these temporary values into the REAL value for
// the PHI.
void CWriter::visitPHINode(PHINode &I) {
  // Reading a PHI reads the __PHI_TEMPORARY copy assigned on each incoming
  // edge (see printPHICopiesForSuccessor), not the PHI variable itself.
  writeOperand(&I);
  Out << "__PHI_TEMPORARY";
}
+
+
/// visitBinaryOperator - Emit a binary/shift/compare-style instruction as a
/// C infix expression, inserting casts where C's integer promotions or the
/// operator's signedness requirements would otherwise change the result.
void CWriter::visitBinaryOperator(Instruction &I) {
  // binary instructions, shift instructions, setCond instructions.
  assert(!isa<PointerType>(I.getType()));

  // We must cast the results of binary operations which might be promoted.
  // C promotes i8/i16 operands to int and float to double in arithmetic, so
  // the result is cast back to the narrow type to restore LLVM semantics.
  bool needsCast = false;
  if ((I.getType() == Type::getInt8Ty(I.getContext())) ||
      (I.getType() == Type::getInt16Ty(I.getContext())) 
      || (I.getType() == Type::getFloatTy(I.getContext()))) {
    needsCast = true;
    Out << "((";
    printType(Out, I.getType(), false);
    Out << ")(";
  }

  // If this is a negation operation, print it out as such.  For FP, we don't
  // want to print "-0.0 - X".
  if (BinaryOperator::isNeg(&I)) {
    Out << "-(";
    writeOperand(BinaryOperator::getNegArgument(cast<BinaryOperator>(&I)));
    Out << ")";
  } else if (BinaryOperator::isFNeg(&I)) {
    Out << "-(";
    writeOperand(BinaryOperator::getFNegArgument(cast<BinaryOperator>(&I)));
    Out << ")";
  } else if (I.getOpcode() == Instruction::FRem) {
    // Output a call to fmod/fmodf instead of emitting a%b
    if (I.getType() == Type::getFloatTy(I.getContext()))
      Out << "fmodf(";
    else if (I.getType() == Type::getDoubleTy(I.getContext()))
      Out << "fmod(";
    else  // all 3 flavors of long double
      Out << "fmodl(";
    writeOperand(I.getOperand(0));
    Out << ", ";
    writeOperand(I.getOperand(1));
    Out << ")";
  } else {

    // Write out the cast of the instruction's value back to the proper type
    // if necessary.
    bool NeedsClosingParens = writeInstructionCast(I);

    // Certain instructions require the operand to be forced to a specific type
    // so we use writeOperandWithCast here instead of writeOperand. Similarly
    // below for operand 1
    writeOperandWithCast(I.getOperand(0), I.getOpcode());

    // Map the LLVM opcode to its C operator.  Signed/unsigned distinctions
    // (SDiv/UDiv, AShr/LShr, ...) are handled by the operand casts above.
    switch (I.getOpcode()) {
    case Instruction::Add:
    case Instruction::FAdd: Out << " + "; break;
    case Instruction::Sub:
    case Instruction::FSub: Out << " - "; break;
    case Instruction::Mul:
    case Instruction::FMul: Out << " * "; break;
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem: Out << " % "; break;
    case Instruction::UDiv:
    case Instruction::SDiv: 
    case Instruction::FDiv: Out << " / "; break;
    case Instruction::And:  Out << " & "; break;
    case Instruction::Or:   Out << " | "; break;
    case Instruction::Xor:  Out << " ^ "; break;
    case Instruction::Shl : Out << " << "; break;
    case Instruction::LShr:
    case Instruction::AShr: Out << " >> "; break;
    default: 
#ifndef NDEBUG
       errs() << "Invalid operator type!" << I;
#endif
       llvm_unreachable(0);
    }

    writeOperandWithCast(I.getOperand(1), I.getOpcode());
    if (NeedsClosingParens)
      Out << "))";
  }

  // Close the result-narrowing cast opened at the top, if any.
  if (needsCast) {
    Out << "))";
  }
}
+
+void CWriter::visitICmpInst(ICmpInst &I) {
+  // We must cast the results of icmp which might be promoted.
+  bool needsCast = false;
+
+  // Write out the cast of the instruction's value back to the proper type
+  // if necessary.
+  bool NeedsClosingParens = writeInstructionCast(I);
+
+  // Certain icmp predicate require the operand to be forced to a specific type
+  // so we use writeOperandWithCast here instead of writeOperand. Similarly
+  // below for operand 1
+  writeOperandWithCast(I.getOperand(0), I);
+
+  switch (I.getPredicate()) {
+  case ICmpInst::ICMP_EQ:  Out << " == "; break;
+  case ICmpInst::ICMP_NE:  Out << " != "; break;
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE: Out << " <= "; break;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE: Out << " >= "; break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT: Out << " < "; break;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT: Out << " > "; break;
+  default:
+#ifndef NDEBUG
+    errs() << "Invalid icmp predicate!" << I; 
+#endif
+    llvm_unreachable(0);
+  }
+
+  writeOperandWithCast(I.getOperand(1), I);
+  if (NeedsClosingParens)
+    Out << "))";
+
+  if (needsCast) {
+    Out << "))";
+  }
+}
+
+void CWriter::visitFCmpInst(FCmpInst &I) {
+  if (I.getPredicate() == FCmpInst::FCMP_FALSE) {
+    Out << "0";
+    return;
+  }
+  if (I.getPredicate() == FCmpInst::FCMP_TRUE) {
+    Out << "1";
+    return;
+  }
+
+  const char* op = 0;
+  switch (I.getPredicate()) {
+  default: llvm_unreachable("Illegal FCmp predicate");
+  case FCmpInst::FCMP_ORD: op = "ord"; break;
+  case FCmpInst::FCMP_UNO: op = "uno"; break;
+  case FCmpInst::FCMP_UEQ: op = "ueq"; break;
+  case FCmpInst::FCMP_UNE: op = "une"; break;
+  case FCmpInst::FCMP_ULT: op = "ult"; break;
+  case FCmpInst::FCMP_ULE: op = "ule"; break;
+  case FCmpInst::FCMP_UGT: op = "ugt"; break;
+  case FCmpInst::FCMP_UGE: op = "uge"; break;
+  case FCmpInst::FCMP_OEQ: op = "oeq"; break;
+  case FCmpInst::FCMP_ONE: op = "one"; break;
+  case FCmpInst::FCMP_OLT: op = "olt"; break;
+  case FCmpInst::FCMP_OLE: op = "ole"; break;
+  case FCmpInst::FCMP_OGT: op = "ogt"; break;
+  case FCmpInst::FCMP_OGE: op = "oge"; break;
+  }
+
+  Out << "llvm_fcmp_" << op << "(";
+  // Write the first operand
+  writeOperand(I.getOperand(0));
+  Out << ", ";
+  // Write the second operand
+  writeOperand(I.getOperand(1));
+  Out << ")";
+}
+
+static const char * getFloatBitCastField(const Type *Ty) {
+  switch (Ty->getTypeID()) {
+    default: llvm_unreachable("Invalid Type");
+    case Type::FloatTyID:  return "Float";
+    case Type::DoubleTyID: return "Double";
+    case Type::IntegerTyID: {
+      unsigned NumBits = cast<IntegerType>(Ty)->getBitWidth();
+      if (NumBits <= 32)
+        return "Int32";
+      else
+        return "Int64";
+    }
+  }
+}
+
/// visitCastInst - Emit a cast instruction.  Plain casts become C casts;
/// FP<->integer bitcasts, which C cannot express as a cast, go through the
/// per-instruction llvmBitCastUnion temporary declared in printFunction.
void CWriter::visitCastInst(CastInst &I) {
  const Type *DstTy = I.getType();
  const Type *SrcTy = I.getOperand(0)->getType();
  if (isFPIntBitCast(I)) {
    Out << '(';
    // These int<->float and long<->double casts need to be handled specially
    // Emit a C comma expression: write the source bits into the union member
    // matching the source type, then read them back out through the member
    // matching the destination type.
    Out << GetValueName(&I) << "__BITCAST_TEMPORARY." 
        << getFloatBitCastField(I.getOperand(0)->getType()) << " = ";
    writeOperand(I.getOperand(0));
    Out << ", " << GetValueName(&I) << "__BITCAST_TEMPORARY."
        << getFloatBitCastField(I.getType());
    Out << ')';
    return;
  }
  
  Out << '(';
  printCast(I.getOpcode(), SrcTy, DstTy);

  // Make a sext from i1 work by subtracting the i1 from 0 (an int).
  if (SrcTy == Type::getInt1Ty(I.getContext()) &&
      I.getOpcode() == Instruction::SExt)
    Out << "0-";
  
  writeOperand(I.getOperand(0));
    
  if (DstTy == Type::getInt1Ty(I.getContext()) && 
      (I.getOpcode() == Instruction::Trunc ||
       I.getOpcode() == Instruction::FPToUI ||
       I.getOpcode() == Instruction::FPToSI ||
       I.getOpcode() == Instruction::PtrToInt)) {
    // Make sure we really get a trunc to bool by anding the operand with 1 
    Out << "&1u";
  }
  Out << ')';
}
+
+void CWriter::visitSelectInst(SelectInst &I) {
+  Out << "((";
+  writeOperand(I.getCondition());
+  Out << ") ? (";
+  writeOperand(I.getTrueValue());
+  Out << ") : (";
+  writeOperand(I.getFalseValue());
+  Out << "))";
+}
+
+
/// lowerIntrinsics - Rewrite intrinsic calls in F that the C backend cannot
/// emit directly into ordinary library/function calls (via LowerIntrinsicCall),
/// then emit prototypes for any newly referenced functions.
void CWriter::lowerIntrinsics(Function &F) {
  // This is used to keep track of intrinsics that get generated to a lowered
  // function. We must generate the prototypes before the function body which
  // will only be expanded on first use (by the loop below).
  std::vector<Function*> prototypesToGen;

  // Examine all the instructions in this function to find the intrinsics that
  // need to be lowered.
  // Note: I is advanced before the dyn_cast (I++) so the iterator stays
  // valid when LowerIntrinsicCall replaces/erases the current call.
  for (Function::iterator BB = F.begin(), EE = F.end(); BB != EE; ++BB)
    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
      if (CallInst *CI = dyn_cast<CallInst>(I++))
        if (Function *F = CI->getCalledFunction())
          switch (F->getIntrinsicID()) {
          case Intrinsic::not_intrinsic:
          case Intrinsic::memory_barrier:
          case Intrinsic::vastart:
          case Intrinsic::vacopy:
          case Intrinsic::vaend:
          case Intrinsic::returnaddress:
          case Intrinsic::frameaddress:
          case Intrinsic::setjmp:
          case Intrinsic::longjmp:
          case Intrinsic::prefetch:
          case Intrinsic::powi:
          case Intrinsic::x86_sse_cmp_ss:
          case Intrinsic::x86_sse_cmp_ps:
          case Intrinsic::x86_sse2_cmp_sd:
          case Intrinsic::x86_sse2_cmp_pd:
          case Intrinsic::ppc_altivec_lvsl:
              // We directly implement these intrinsics
            break;
          default:
            // If this is an intrinsic that directly corresponds to a GCC
            // builtin, we handle it.
            const char *BuiltinName = "";
            // The generated table sets BuiltinName for intrinsics that map
            // to a GCC builtin, keyed off the switch value above.
#define GET_GCC_BUILTIN_NAME
#include "llvm/Intrinsics.gen"
#undef GET_GCC_BUILTIN_NAME
            // If we handle it, don't lower it.
            if (BuiltinName[0]) break;
            
            // All other intrinsic calls we must lower.
            // Remember the instruction before the call (if any) so we can
            // re-find our position after LowerIntrinsicCall mutates the block.
            Instruction *Before = 0;
            if (CI != &BB->front())
              Before = prior(BasicBlock::iterator(CI));

            IL->LowerIntrinsicCall(CI);
            if (Before) {        // Move iterator to instruction after call
              I = Before; ++I;
            } else {
              I = BB->begin();
            }
            // If the intrinsic got lowered to another call, and that call has
            // a definition then we need to make sure its prototype is emitted
            // before any calls to it.
            if (CallInst *Call = dyn_cast<CallInst>(I))
              if (Function *NewF = Call->getCalledFunction())
                if (!NewF->isDeclaration())
                  prototypesToGen.push_back(NewF);

            break;
          }

  // We may have collected some prototypes to emit in the loop above. 
  // Emit them now, before the function that uses them is emitted. But,
  // be careful not to emit them twice.
  std::vector<Function*>::iterator I = prototypesToGen.begin();
  std::vector<Function*>::iterator E = prototypesToGen.end();
  for ( ; I != E; ++I) {
    if (intrinsicPrototypesAlreadyGenerated.insert(*I).second) {
      Out << '\n';
      printFunctionSignature(*I, true);
      Out << ";\n";
    }
  }
}
+
+// visitCallInst - Emit a call instruction as a C call expression.  Operand 0
+// of the CallInst is the callee; explicit arguments start at operand 1.
+void CWriter::visitCallInst(CallInst &I) {
+  // Inline asm "calls" have their own printer.
+  if (isa<InlineAsm>(I.getOperand(0)))
+    return visitInlineAsm(I);
+
+  bool WroteCallee = false;
+
+  // Handle intrinsic function calls first...
+  // visitBuiltinCall returns true if it printed the entire call.  It may
+  // instead print only the callee name (e.g. a GCC builtin) and set
+  // WroteCallee, in which case we still emit the argument list below.
+  if (Function *F = I.getCalledFunction())
+    if (Intrinsic::ID ID = (Intrinsic::ID)F->getIntrinsicID())
+      if (visitBuiltinCall(I, ID, WroteCallee))
+        return;
+
+  Value *Callee = I.getCalledValue();
+
+  // Recover the function type through the callee's pointer type.
+  const PointerType  *PTy   = cast<PointerType>(Callee->getType());
+  const FunctionType *FTy   = cast<FunctionType>(PTy->getElementType());
+
+  // If this is a call to a struct-return function, assign to the first
+  // parameter instead of passing it to the call.
+  const AttrListPtr &PAL = I.getAttributes();
+  bool hasByVal = I.hasByValArgument();
+  bool isStructRet = I.hasStructRetAttr();
+  if (isStructRet) {
+    // Emit "*sretptr = callee(...)" rather than passing the pointer through.
+    writeOperandDeref(I.getOperand(1));
+    Out << " = ";
+  }
+  
+  if (I.isTailCall()) Out << " /*tail*/ ";
+  
+  if (!WroteCallee) {
+    // If this is an indirect call to a struct return function, we need to cast
+    // the pointer. Ditto for indirect calls with byval arguments.
+    bool NeedsCast = (hasByVal || isStructRet) && !isa<Function>(Callee);
+
+    // GCC is a real PITA.  It does not permit codegening casts of functions to
+    // function pointers if they are in a call (it generates a trap instruction
+    // instead!).  We work around this by inserting a cast to void* in between
+    // the function and the function pointer cast.  Unfortunately, we can't just
+    // form the constant expression here, because the folder will immediately
+    // nuke it.
+    //
+    // Note finally, that this is completely unsafe.  ANSI C does not guarantee
+    // that void* and function pointers have the same size. :( To deal with this
+    // in the common case, we handle casts where the number of arguments passed
+    // match exactly.
+    //
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Callee))
+      if (CE->isCast())
+        if (Function *RF = dyn_cast<Function>(CE->getOperand(0))) {
+          // Strip the constant cast and cast explicitly in the emitted C.
+          NeedsCast = true;
+          Callee = RF;
+        }
+  
+    if (NeedsCast) {
+      // Ok, just cast the pointer type.
+      Out << "((";
+      if (isStructRet)
+        // sret calls use a rewritten function type that returns void and
+        // takes the result pointer as the first parameter.
+        printStructReturnPointerFunctionType(Out, PAL,
+                             cast<PointerType>(I.getCalledValue()->getType()));
+      else if (hasByVal)
+        printType(Out, I.getCalledValue()->getType(), false, "", true, PAL);
+      else
+        printType(Out, I.getCalledValue()->getType());
+      // The intermediate (void*) defeats GCC's function-cast trap (above).
+      Out << ")(void*)";
+    }
+    writeOperand(Callee);
+    if (NeedsCast) Out << ')';
+  }
+
+  Out << '(';
+
+  unsigned NumDeclaredParams = FTy->getNumParams();
+
+  // Iterate the actual call arguments (operand 0 is the callee).
+  CallSite::arg_iterator AI = I.op_begin()+1, AE = I.op_end();
+  unsigned ArgNo = 0;
+  if (isStructRet) {   // Skip struct return argument.
+    ++AI;
+    ++ArgNo;
+  }
+      
+  bool PrintedArg = false;
+  for (; AI != AE; ++AI, ++ArgNo) {
+    if (PrintedArg) Out << ", ";
+    // If the argument's type differs from the declared parameter type,
+    // emit an explicit cast to the declared type (signedness taken from
+    // the parameter's SExt attribute).  Varargs arguments (ArgNo >=
+    // NumDeclaredParams) are passed uncasted.
+    if (ArgNo < NumDeclaredParams &&
+        (*AI)->getType() != FTy->getParamType(ArgNo)) {
+      Out << '(';
+      printType(Out, FTy->getParamType(ArgNo), 
+            /*isSigned=*/PAL.paramHasAttr(ArgNo+1, Attribute::SExt));
+      Out << ')';
+    }
+    // Check if the argument is expected to be passed by value.
+    if (I.paramHasAttr(ArgNo+1, Attribute::ByVal))
+      writeOperandDeref(*AI);
+    else
+      writeOperand(*AI);
+    PrintedArg = true;
+  }
+  Out << ')';
+}
+
+/// visitBuiltinCall - Handle the call to the specified builtin.  Returns true
+/// if the entire call is handled, return false if it wasn't handled, and
+/// optionally set 'WroteCallee' if the callee has already been printed out.
+///
+/// Note: operand 0 of the CallInst is the callee, so the intrinsic's own
+/// arguments start at operand 1.
+bool CWriter::visitBuiltinCall(CallInst &I, Intrinsic::ID ID,
+                               bool &WroteCallee) {
+  switch (ID) {
+  default: {
+    // If this is an intrinsic that directly corresponds to a GCC
+    // builtin, we emit it here.
+    // GET_GCC_BUILTIN_NAME expands (from the generated Intrinsics.gen) into
+    // code that sets BuiltinName from F's intrinsic ID.
+    const char *BuiltinName = "";
+    Function *F = I.getCalledFunction();
+#define GET_GCC_BUILTIN_NAME
+#include "llvm/Intrinsics.gen"
+#undef GET_GCC_BUILTIN_NAME
+    assert(BuiltinName[0] && "Unknown LLVM intrinsic!");
+    
+    // Only the callee name was printed; return false so the caller still
+    // emits the argument list itself.
+    Out << BuiltinName;
+    WroteCallee = true;
+    return false;
+  }
+  case Intrinsic::memory_barrier:
+    Out << "__sync_synchronize()";
+    return true;
+  case Intrinsic::vastart:
+    // The "0; " terminates the "result = " prefix emitted for this
+    // instruction (same trick as stacksave below).
+    Out << "0; ";
+      
+    Out << "va_start(*(va_list*)";
+    writeOperand(I.getOperand(1));
+    Out << ", ";
+    // Output the last argument to the enclosing function.
+    // va_start requires the last named parameter; a varargs function with
+    // no named parameters cannot be expressed in C, so report a hard error.
+    if (I.getParent()->getParent()->arg_empty()) {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "The C backend does not currently support zero "
+           << "argument varargs functions, such as '"
+           << I.getParent()->getParent()->getName() << "'!";
+      llvm_report_error(Msg.str());
+    }
+    writeOperand(--I.getParent()->getParent()->arg_end());
+    Out << ')';
+    return true;
+  case Intrinsic::vaend:
+    if (!isa<ConstantPointerNull>(I.getOperand(1))) {
+      Out << "0; va_end(*(va_list*)";
+      writeOperand(I.getOperand(1));
+      Out << ')';
+    } else {
+      // Null va_list operand: emit a degenerate va_end directly.
+      Out << "va_end(*(va_list*)0)";
+    }
+    return true;
+  case Intrinsic::vacopy:
+    Out << "0; ";
+    Out << "va_copy(*(va_list*)";
+    writeOperand(I.getOperand(1));
+    Out << ", *(va_list*)";
+    writeOperand(I.getOperand(2));
+    Out << ')';
+    return true;
+  case Intrinsic::returnaddress:
+    Out << "__builtin_return_address(";
+    writeOperand(I.getOperand(1));
+    Out << ')';
+    return true;
+  case Intrinsic::frameaddress:
+    Out << "__builtin_frame_address(";
+    writeOperand(I.getOperand(1));
+    Out << ')';
+    return true;
+  case Intrinsic::powi:
+    Out << "__builtin_powi(";
+    writeOperand(I.getOperand(1));
+    Out << ", ";
+    writeOperand(I.getOperand(2));
+    Out << ')';
+    return true;
+  case Intrinsic::setjmp:
+    Out << "setjmp(*(jmp_buf*)";
+    writeOperand(I.getOperand(1));
+    Out << ')';
+    return true;
+  case Intrinsic::longjmp:
+    Out << "longjmp(*(jmp_buf*)";
+    writeOperand(I.getOperand(1));
+    Out << ", ";
+    writeOperand(I.getOperand(2));
+    Out << ')';
+    return true;
+  case Intrinsic::prefetch:
+    // LLVM_PREFETCH is a macro emitted elsewhere in the generated C
+    // preamble; operands are address, rw, and locality hints.
+    Out << "LLVM_PREFETCH((const void *)";
+    writeOperand(I.getOperand(1));
+    Out << ", ";
+    writeOperand(I.getOperand(2));
+    Out << ", ";
+    writeOperand(I.getOperand(3));
+    Out << ")";
+    return true;
+  case Intrinsic::stacksave:
+    // Emit this as: Val = 0; *((void**)&Val) = __builtin_stack_save()
+    // to work around GCC bugs (see PR1809).
+    Out << "0; *((void**)&" << GetValueName(&I)
+        << ") = __builtin_stack_save()";
+    return true;
+  case Intrinsic::x86_sse_cmp_ss:
+  case Intrinsic::x86_sse_cmp_ps:
+  case Intrinsic::x86_sse2_cmp_sd:
+  case Intrinsic::x86_sse2_cmp_pd:
+    // Cast the builtin's result back to the instruction's LLVM type.
+    Out << '(';
+    printType(Out, I.getType());
+    Out << ')';  
+    // Multiple GCC builtins multiplex onto this intrinsic.
+    // The third operand is an immediate selecting the comparison predicate.
+    switch (cast<ConstantInt>(I.getOperand(3))->getZExtValue()) {
+    default: llvm_unreachable("Invalid llvm.x86.sse.cmp!");
+    case 0: Out << "__builtin_ia32_cmpeq"; break;
+    case 1: Out << "__builtin_ia32_cmplt"; break;
+    case 2: Out << "__builtin_ia32_cmple"; break;
+    case 3: Out << "__builtin_ia32_cmpunord"; break;
+    case 4: Out << "__builtin_ia32_cmpneq"; break;
+    case 5: Out << "__builtin_ia32_cmpnlt"; break;
+    case 6: Out << "__builtin_ia32_cmpnle"; break;
+    case 7: Out << "__builtin_ia32_cmpord"; break;
+    }
+    // Suffix: packed ('p') vs scalar ('s'), then single ('s') vs double ('d'),
+    // forming e.g. "...ps", "...ss", "...pd", "...sd".
+    if (ID == Intrinsic::x86_sse_cmp_ps || ID == Intrinsic::x86_sse2_cmp_pd)
+      Out << 'p';
+    else
+      Out << 's';
+    if (ID == Intrinsic::x86_sse_cmp_ss || ID == Intrinsic::x86_sse_cmp_ps)
+      Out << 's';
+    else
+      Out << 'd';
+      
+    Out << "(";
+    writeOperand(I.getOperand(1));
+    Out << ", ";
+    writeOperand(I.getOperand(2));
+    Out << ")";
+    return true;
+  case Intrinsic::ppc_altivec_lvsl:
+    // Cast the builtin's result back to the instruction's LLVM type.
+    Out << '(';
+    printType(Out, I.getType());
+    Out << ')';  
+    Out << "__builtin_altivec_lvsl(0, (void*)";
+    writeOperand(I.getOperand(1));
+    Out << ")";
+    return true;
+  }
+}
+
+//This converts the llvm constraint string to something gcc is expecting.
+//TODO: work out platform independent constraints and factor those out
+//      of the per target tables
+//      handle multiple constraint codes
+//
+// Returns the translated constraint, or the original code if no target or
+// no table entry is found.  An empty return means the caller should skip
+// the constraint entirely.
+std::string CWriter::InterpretASMConstraint(InlineAsm::ConstraintInfo& c) {
+  assert(c.Codes.size() == 1 && "Too many asm constraint codes to handle");
+
+  // Grab the translation table from MCAsmInfo if it exists.
+  const MCAsmInfo *TargetAsm;
+  std::string Triple = TheModule->getTargetTriple();
+  if (Triple.empty())
+    Triple = llvm::sys::getHostTriple();
+  
+  std::string E;
+  if (const Target *Match = TargetRegistry::lookupTarget(Triple, E))
+    // NOTE(review): createAsmInfo may return null for some targets; if so,
+    // the getAsmCBE() call below would dereference null — confirm upstream.
+    TargetAsm = Match->createAsmInfo(Triple);
+  else
+    return c.Codes[0];
+  
+  const char *const *table = TargetAsm->getAsmCBE();
+
+  // Search the translation table if it exists.
+  // The table is a null-terminated list of (llvm constraint, gcc constraint)
+  // pairs stored at even/odd indices.
+  for (int i = 0; table && table[i]; i += 2)
+    if (c.Codes[0] == table[i]) {
+      delete TargetAsm;
+      return table[i+1];
+    }
+
+  // Default is identity.
+  delete TargetAsm;
+  return c.Codes[0];
+}
+
+//TODO: import logic from AsmPrinter.cpp
+//
+// Rewrite an LLVM inline-asm string into GCC extended-asm syntax:
+//   '\n' -> "\\n",  '\t' -> "\\t",  "$N" -> "%N",
+//   "${N:mod}" -> "%modN",  and '%' is escaped to "%%".
+static std::string gccifyAsm(std::string asmstr) {
+  for (std::string::size_type i = 0; i != asmstr.size(); ++i)
+    if (asmstr[i] == '\n')
+      asmstr.replace(i, 1, "\\n");
+    else if (asmstr[i] == '\t')
+      asmstr.replace(i, 1, "\\t");
+    else if (asmstr[i] == '$') {
+      // NOTE(review): if '$' is the final character, asmstr[i + 1] indexes
+      // one past the end (undefined in C++03) — confirm inputs always have
+      // a character after '$'.
+      if (asmstr[i + 1] == '{') {
+        // "${N:mod}": a is the ':' position, b is the '}' position.
+        // NOTE(review): no guard for a missing ':' or '}' (npos arithmetic).
+        std::string::size_type a = asmstr.find_first_of(':', i + 1);
+        std::string::size_type b = asmstr.find_first_of('}', i + 1);
+        // Build "%<modifier><number>" (modifier precedes operand number in
+        // GCC syntax) and splice it over the whole "${...}" sequence.
+        std::string n = "%" + 
+          asmstr.substr(a + 1, b - a - 1) +
+          asmstr.substr(i + 2, a - i - 2);
+        asmstr.replace(i, b - i + 1, n);
+        i += n.size() - 1;
+      } else
+        asmstr.replace(i, 1, "%");
+    }
+    else if (asmstr[i] == '%')//grr
+      // Escape literal '%' and skip the inserted duplicate.
+      { asmstr.replace(i, 1, "%%"); ++i;}
+  
+  return asmstr;
+}
+
+//TODO: assumptions about what consume arguments from the call are likely wrong
+//      handle communitivity
+//
+// Emit a call to inline asm as a GCC extended-asm statement:
+//   __asm__ volatile ("<str>" : <outputs> : <inputs> : <clobbers>);
+void CWriter::visitInlineAsm(CallInst &CI) {
+  InlineAsm* as = cast<InlineAsm>(CI.getOperand(0));
+  std::vector<InlineAsm::ConstraintInfo> Constraints = as->ParseConstraints();
+  
+  // Collect the (value, struct-field-index) results the asm produces.  A
+  // field index of -1 means the whole value; a struct-typed result yields
+  // one entry per element.
+  std::vector<std::pair<Value*, int> > ResultVals;
+  if (CI.getType() == Type::getVoidTy(CI.getContext()))
+    ;
+  else if (const StructType *ST = dyn_cast<StructType>(CI.getType())) {
+    for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i)
+      ResultVals.push_back(std::make_pair(&CI, (int)i));
+  } else {
+    ResultVals.push_back(std::make_pair(&CI, -1));
+  }
+  
+  // Fix up the asm string for gcc and emit it.
+  Out << "__asm__ volatile (\"" << gccifyAsm(as->getAsmString()) << "\"\n";
+  Out << "        :";
+
+  unsigned ValueCount = 0;
+  bool IsFirst = true;
+  
+  // Convert over all the output constraints.
+  for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(),
+       E = Constraints.end(); I != E; ++I) {
+    
+    if (I->Type != InlineAsm::isOutput) {
+      ++ValueCount;
+      continue;  // Ignore non-output constraints.
+    }
+    
+    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+    std::string C = InterpretASMConstraint(*I);
+    if (C.empty()) continue;
+    
+    // Bug fix: IsFirst must be cleared unconditionally once a constraint is
+    // emitted.  Previously it was cleared only inside the '!IsFirst' branch,
+    // so it stayed true forever and the ", " separator was never printed,
+    // yielding invalid asm for multiple outputs.
+    if (!IsFirst)
+      Out << ", ";
+    IsFirst = false;
+
+    // Unpack the dest.
+    Value *DestVal;
+    int DestValNo = -1;
+    
+    if (ValueCount < ResultVals.size()) {
+      DestVal = ResultVals[ValueCount].first;
+      DestValNo = ResultVals[ValueCount].second;
+    } else
+      // Indirect outputs consume call arguments past the result values.
+      DestVal = CI.getOperand(ValueCount-ResultVals.size()+1);
+
+    if (I->isEarlyClobber)
+      C = "&"+C;
+      
+    Out << "\"=" << C << "\"(" << GetValueName(DestVal);
+    if (DestValNo != -1)
+      Out << ".field" << DestValNo; // Multiple retvals.
+    Out << ")";
+    ++ValueCount;
+  }
+  
+  
+  // Convert over all the input constraints.
+  Out << "\n        :";
+  IsFirst = true;
+  ValueCount = 0;
+  for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(),
+       E = Constraints.end(); I != E; ++I) {
+    if (I->Type != InlineAsm::isInput) {
+      ++ValueCount;
+      continue;  // Ignore non-input constraints.
+    }
+    
+    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+    std::string C = InterpretASMConstraint(*I);
+    if (C.empty()) continue;
+    
+    // Same separator fix as for the output constraints above.
+    if (!IsFirst)
+      Out << ", ";
+    IsFirst = false;
+    
+    assert(ValueCount >= ResultVals.size() && "Input can't refer to result");
+    Value *SrcVal = CI.getOperand(ValueCount-ResultVals.size()+1);
+    
+    Out << "\"" << C << "\"(";
+    if (!I->isIndirect)
+      writeOperand(SrcVal);
+    else
+      writeOperandDeref(SrcVal);
+    Out << ")";
+  }
+  
+  // Convert over the clobber constraints.
+  IsFirst = true;
+  for (std::vector<InlineAsm::ConstraintInfo>::iterator I = Constraints.begin(),
+       E = Constraints.end(); I != E; ++I) {
+    if (I->Type != InlineAsm::isClobber)
+      continue;  // Ignore non-clobber constraints.
+
+    assert(I->Codes.size() == 1 && "Too many asm constraint codes to handle");
+    std::string C = InterpretASMConstraint(*I);
+    if (C.empty()) continue;
+    
+    // Same separator fix as above.
+    if (!IsFirst)
+      Out << ", ";
+    IsFirst = false;
+    
+    Out << '\"' << C << '"';
+  }
+  
+  Out << ")";
+}
+
+// visitAllocaInst - Emit an alloca as a C alloca() call, cast to the
+// instruction's pointer type; array allocations multiply by the count.
+void CWriter::visitAllocaInst(AllocaInst &I) {
+  Out << '(';
+  printType(Out, I.getType());
+  Out << ") alloca(sizeof(";
+  printType(Out, I.getType()->getElementType());
+  Out << ')';
+  if (I.isArrayAllocation()) {
+    Out << " * " ;
+    writeOperand(I.getOperand(0));
+  }
+  Out << ')';
+}
+
+// printGEPExpression - Print a getelementptr as a C address expression
+// ("&base.field[idx]..." style).  'Static' is forwarded to
+// writeOperandInternal for address-exposed base operands.
+void CWriter::printGEPExpression(Value *Ptr, gep_type_iterator I,
+                                 gep_type_iterator E, bool Static) {
+  
+  // If there are no indices, just print out the pointer.
+  if (I == E) {
+    writeOperand(Ptr);
+    return;
+  }
+    
+  // Find out if the last index is into a vector.  If so, we have to print this
+  // specially.  Since vectors can't have elements of indexable type, only the
+  // last index could possibly be of a vector element.
+  const VectorType *LastIndexIsVector = 0;
+  {
+    // Each iteration overwrites, so this ends up as the dyn_cast of the
+    // final indexed type: non-null only if the last index is into a vector.
+    for (gep_type_iterator TmpI = I; TmpI != E; ++TmpI)
+      LastIndexIsVector = dyn_cast<VectorType>(*TmpI);
+  }
+  
+  Out << "(";
+  
+  // If the last index is into a vector, we can't print it as &a[i][j] because
+  // we can't index into a vector with j in GCC.  Instead, emit this as
+  // (((float*)&a[i])+j)
+  if (LastIndexIsVector) {
+    Out << "((";
+    printType(Out, PointerType::getUnqual(LastIndexIsVector->getElementType()));
+    Out << ")(";
+  }
+  
+  Out << '&';
+
+  // If the first index is 0 (very typical) we can do a number of
+  // simplifications to clean up the code.
+  Value *FirstOp = I.getOperand();
+  if (!isa<Constant>(FirstOp) || !cast<Constant>(FirstOp)->isNullValue()) {
+    // First index isn't simple, print it the hard way.
+    writeOperand(Ptr);
+  } else {
+    ++I;  // Skip the zero index.
+
+    // Okay, emit the first operand. If Ptr is something that is already address
+    // exposed, like a global, avoid emitting (&foo)[0], just emit foo instead.
+    if (isAddressExposed(Ptr)) {
+      writeOperandInternal(Ptr, Static);
+    } else if (I != E && isa<StructType>(*I)) {
+      // If we didn't already emit the first operand, see if we can print it as
+      // P->f instead of "P[0].f"
+      writeOperand(Ptr);
+      Out << "->field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
+      ++I;  // eat the struct index as well.
+    } else {
+      // Instead of emitting P[0][1], emit (*P)[1], which is more idiomatic.
+      Out << "(*";
+      writeOperand(Ptr);
+      Out << ")";
+    }
+  }
+
+  // Emit the remaining indices: struct -> ".fieldN", array -> ".array[i]"
+  // (CBackend wraps C arrays in a struct), others -> "[i]", and the
+  // vector-last-index case closes the cast opened above.
+  for (; I != E; ++I) {
+    if (isa<StructType>(*I)) {
+      Out << ".field" << cast<ConstantInt>(I.getOperand())->getZExtValue();
+    } else if (isa<ArrayType>(*I)) {
+      Out << ".array[";
+      writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+      Out << ']';
+    } else if (!isa<VectorType>(*I)) {
+      Out << '[';
+      writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+      Out << ']';
+    } else {
+      // If the last index is into a vector, then print it out as "+j)".  This
+      // works with the 'LastIndexIsVector' code above.
+      if (isa<Constant>(I.getOperand()) &&
+          cast<Constant>(I.getOperand())->isNullValue()) {
+        Out << "))";  // avoid "+0".
+      } else {
+        Out << ")+(";
+        writeOperandWithCast(I.getOperand(), Instruction::GetElementPtr);
+        Out << "))";
+      }
+    }
+  }
+  Out << ")";
+}
+
+// writeMemoryAccess - Emit the lvalue for a load/store through 'Operand'.
+// Volatile accesses go through a volatile-qualified pointer cast; accesses
+// with an alignment below the type's ABI alignment are routed through a
+// packed struct so the compiler emits unaligned-safe code.  An Alignment of
+// 0 (unspecified) is treated as naturally aligned.
+void CWriter::writeMemoryAccess(Value *Operand, const Type *OperandType,
+                                bool IsVolatile, unsigned Alignment) {
+
+  bool IsUnaligned = Alignment &&
+    Alignment < TD->getABITypeAlignment(OperandType);
+
+  // Aligned case: plain dereference.  Unaligned case: no '*', because the
+  // access is expressed as ptr->data on a packed-struct pointer below.
+  if (!IsUnaligned)
+    Out << '*';
+  if (IsVolatile || IsUnaligned) {
+    Out << "((";
+    if (IsUnaligned)
+      Out << "struct __attribute__ ((packed, aligned(" << Alignment << "))) {";
+    printType(Out, OperandType, false, IsUnaligned ? "data" : "volatile*");
+    if (IsUnaligned) {
+      Out << "; } ";
+      if (IsVolatile) Out << "volatile ";
+      Out << "*";
+    }
+    Out << ")";
+  }
+
+  writeOperand(Operand);
+
+  if (IsVolatile || IsUnaligned) {
+    Out << ')';
+    if (IsUnaligned)
+      Out << "->data";
+  }
+}
+
+// visitLoadInst - A load is just the (possibly volatile/unaligned) memory
+// lvalue; the enclosing printer supplies the "result = " assignment.
+void CWriter::visitLoadInst(LoadInst &I) {
+  writeMemoryAccess(I.getOperand(0), I.getType(), I.isVolatile(),
+                    I.getAlignment());
+
+}
+
+// visitStoreInst - Emit "<memory lvalue> = <value>", masking the stored
+// value for integer types whose bit width is not an even power-of-2 byte
+// size so the unused high bits are cleared.
+void CWriter::visitStoreInst(StoreInst &I) {
+  writeMemoryAccess(I.getPointerOperand(), I.getOperand(0)->getType(),
+                    I.isVolatile(), I.getAlignment());
+  Out << " = ";
+  Value *Operand = I.getOperand(0);
+  Constant *BitMask = 0;
+  if (const IntegerType* ITy = dyn_cast<IntegerType>(Operand->getType()))
+    if (!ITy->isPowerOf2ByteWidth())
+      // We have a bit width that doesn't match an even power-of-2 byte
+      // size. Consequently we must & the value with the type's bit mask
+      BitMask = ConstantInt::get(ITy, ITy->getBitMask());
+  if (BitMask)
+    Out << "((";
+  writeOperand(Operand);
+  if (BitMask) {
+    Out << ") & ";
+    printConstant(BitMask, false);
+    Out << ")"; 
+  }
+}
+
+// visitGetElementPtrInst - Delegate to printGEPExpression (non-static form).
+void CWriter::visitGetElementPtrInst(GetElementPtrInst &I) {
+  printGEPExpression(I.getPointerOperand(), gep_type_begin(I),
+                     gep_type_end(I), false);
+}
+
+// visitVAArgInst - Emit as a C va_arg() on the va_list pointed to by
+// operand 0, fetched with the instruction's result type.
+void CWriter::visitVAArgInst(VAArgInst &I) {
+  Out << "va_arg(*(va_list*)";
+  writeOperand(I.getOperand(0));
+  Out << ", ";
+  printType(Out, I.getType());
+  Out << ");\n ";
+}
+
+// visitInsertElementInst - Emit as two statements: copy the source vector
+// into the result variable, then poke the element through an element-typed
+// pointer into that variable.
+void CWriter::visitInsertElementInst(InsertElementInst &I) {
+  const Type *EltTy = I.getType()->getElementType();
+  // First statement: result = source vector (the ";\n  " splits statements).
+  writeOperand(I.getOperand(0));
+  Out << ";\n  ";
+  // Second statement: ((EltTy*)&result)[index] = (element value).
+  Out << "((";
+  printType(Out, PointerType::getUnqual(EltTy));
+  Out << ")(&" << GetValueName(&I) << "))[";
+  writeOperand(I.getOperand(2));
+  Out << "] = (";
+  writeOperand(I.getOperand(1));
+  Out << ")";
+}
+
+// visitExtractElementInst - Emit as ((EltTy*)&vec)[index].
+void CWriter::visitExtractElementInst(ExtractElementInst &I) {
+  // We know that our operand is not inlined.
+  Out << "((";
+  const Type *EltTy = 
+    cast<VectorType>(I.getOperand(0)->getType())->getElementType();
+  printType(Out, PointerType::getUnqual(EltTy));
+  Out << ")(&" << GetValueName(I.getOperand(0)) << "))[";
+  writeOperand(I.getOperand(1));
+  Out << "]";
+}
+
+// visitShuffleVectorInst - Emit as a compound literal "(VecTy){ e0, e1, ... }"
+// where each element is extracted from the appropriate source operand
+// according to the shuffle mask (mask >= 2*NumElts means undef).
+void CWriter::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  Out << "(";
+  printType(Out, SVI.getType());
+  Out << "){ ";
+  const VectorType *VT = SVI.getType();
+  unsigned NumElts = VT->getNumElements();
+  const Type *EltTy = VT->getElementType();
+
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (i) Out << ", ";
+    int SrcVal = SVI.getMaskValue(i);
+    if ((unsigned)SrcVal >= NumElts*2) {
+      Out << " 0/*undef*/ ";
+    } else {
+      // Mask values in [0, NumElts) pick from operand 0; the rest pick from
+      // operand 1.
+      Value *Op = SVI.getOperand((unsigned)SrcVal >= NumElts);
+      // NOTE(review): the "& (NumElts-1)" reductions below assume NumElts is
+      // a power of two — confirm no non-power-of-two vectors reach here.
+      if (isa<Instruction>(Op)) {
+        // Do an extractelement of this value from the appropriate input.
+        Out << "((";
+        printType(Out, PointerType::getUnqual(EltTy));
+        Out << ")(&" << GetValueName(Op)
+            << "))[" << (SrcVal & (NumElts-1)) << "]";
+      } else if (isa<ConstantAggregateZero>(Op) || isa<UndefValue>(Op)) {
+        Out << "0";
+      } else {
+        // Constant vector source: print the selected constant element.
+        printConstant(cast<ConstantVector>(Op)->getOperand(SrcVal &
+                                                           (NumElts-1)),
+                      false);
+      }
+    }
+  }
+  Out << "}";
+}
+
+// visitInsertValueInst - Emit as two statements: copy the whole aggregate
+// into the result, then assign the indexed field/array slot.
+void CWriter::visitInsertValueInst(InsertValueInst &IVI) {
+  // Start by copying the entire aggregate value into the result variable.
+  writeOperand(IVI.getOperand(0));
+  Out << ";\n  ";
+
+  // Then do the insert to update the field.
+  // Walk the index list, using getIndexedType at each step to decide between
+  // the ".array[i]" (C array wrapper struct) and ".fieldN" spellings.
+  Out << GetValueName(&IVI);
+  for (const unsigned *b = IVI.idx_begin(), *i = b, *e = IVI.idx_end();
+       i != e; ++i) {
+    const Type *IndexedTy =
+      ExtractValueInst::getIndexedType(IVI.getOperand(0)->getType(), b, i+1);
+    if (isa<ArrayType>(IndexedTy))
+      Out << ".array[" << *i << "]";
+    else
+      Out << ".field" << *i;
+  }
+  Out << " = ";
+  writeOperand(IVI.getOperand(1));
+}
+
+// visitExtractValueInst - Emit as "(agg.fieldN...)" following the index
+// list; an undef aggregate yields a zero cast to the result type.
+void CWriter::visitExtractValueInst(ExtractValueInst &EVI) {
+  Out << "(";
+  if (isa<UndefValue>(EVI.getOperand(0))) {
+    Out << "(";
+    printType(Out, EVI.getType());
+    Out << ") 0/*UNDEF*/";
+  } else {
+    Out << GetValueName(EVI.getOperand(0));
+    // Same index-walk as visitInsertValueInst: arrays use the ".array[i]"
+    // wrapper-struct spelling, structs use ".fieldN".
+    for (const unsigned *b = EVI.idx_begin(), *i = b, *e = EVI.idx_end();
+         i != e; ++i) {
+      const Type *IndexedTy =
+        ExtractValueInst::getIndexedType(EVI.getOperand(0)->getType(), b, i+1);
+      if (isa<ArrayType>(IndexedTy))
+        Out << ".array[" << *i << "]";
+      else
+        Out << ".field" << *i;
+    }
+  }
+  Out << ")";
+}
+
+//===----------------------------------------------------------------------===//
+//                       External Interface declaration
+//===----------------------------------------------------------------------===//
+
+// addPassesToEmitWholeFile - Set up the pass pipeline that lowers the module
+// and runs the CWriter over it.  Only assembly (i.e. C source) output is
+// supported; returns true on failure per the TargetMachine convention.
+bool CTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
+                                              formatted_raw_ostream &o,
+                                              CodeGenFileType FileType,
+                                              CodeGenOpt::Level OptLevel) {
+  if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
+
+  PM.add(createGCLoweringPass());
+  PM.add(createLowerInvokePass());
+  PM.add(createCFGSimplificationPass());   // clean up after lower invoke.
+  PM.add(new CBackendNameAllUsedStructsAndMergeFunctions());
+  PM.add(new CWriter(o));
+  PM.add(createGCInfoDeleter());
+  return false;
+}
diff --git a/lib/Target/CBackend/CMakeLists.txt b/lib/Target/CBackend/CMakeLists.txt
new file mode 100644
index 0000000..be24336
--- /dev/null
+++ b/lib/Target/CBackend/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_target(CBackend
+  CBackend.cpp
+  )
diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h
new file mode 100644
index 0000000..715bbda
--- /dev/null
+++ b/lib/Target/CBackend/CTargetMachine.h
@@ -0,0 +1,40 @@
+//===-- CTargetMachine.h - TargetMachine for the C backend ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetMachine that is used by the C backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CTARGETMACHINE_H
+#define CTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+// CTargetMachine - TargetMachine for the C backend.  The triple and feature
+// strings are accepted but unused; code is emitted whole-module at once.
+struct CTargetMachine : public TargetMachine {
+  CTargetMachine(const Target &T, const std::string &TT, const std::string &FS)
+    : TargetMachine(T) {}
+
+  // The C backend emits the entire module in one pass (see
+  // addPassesToEmitWholeFile) rather than function-at-a-time.
+  virtual bool WantsWholeFile() const { return true; }
+  virtual bool addPassesToEmitWholeFile(PassManager &PM,
+                                        formatted_raw_ostream &Out,
+                                        CodeGenFileType FileType,
+                                        CodeGenOpt::Level OptLevel);
+  
+  // No TargetData: the C backend has no fixed target data layout.
+  virtual const TargetData *getTargetData() const { return 0; }
+};
+
+extern Target TheCBackendTarget;
+
+} // End llvm namespace
+
+
+#endif
diff --git a/lib/Target/CBackend/Makefile b/lib/Target/CBackend/Makefile
new file mode 100644
index 0000000..621948a
--- /dev/null
+++ b/lib/Target/CBackend/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/CBackend/Makefile ------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMCBackend
+DIRS = TargetInfo
+
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts += -Wno-format
diff --git a/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
new file mode 100644
index 0000000..f7e8ff2
--- /dev/null
+++ b/lib/Target/CBackend/TargetInfo/CBackendTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- CBackendTargetInfo.cpp - CBackend Target Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+// The singleton Target object for the C backend.
+Target llvm::TheCBackendTarget;
+
+// Register the C backend under the name "c" with the target registry.
+// Declared extern "C" so the initializer can be found by name at link time.
+extern "C" void LLVMInitializeCBackendTargetInfo() { 
+  RegisterTarget<> X(TheCBackendTarget, "c", "C backend");
+}
diff --git a/lib/Target/CBackend/TargetInfo/CMakeLists.txt b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..5b35fa7
--- /dev/null
+++ b/lib/Target/CBackend/TargetInfo/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMCBackendInfo
+  CBackendTargetInfo.cpp
+  )
+
diff --git a/lib/Target/CBackend/TargetInfo/Makefile b/lib/Target/CBackend/TargetInfo/Makefile
new file mode 100644
index 0000000..d4d5e15
--- /dev/null
+++ b/lib/Target/CBackend/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/CBackend/TargetInfo/Makefile -------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCBackendInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
new file mode 100644
index 0000000..43ebdac
--- /dev/null
+++ b/lib/Target/CMakeLists.txt
@@ -0,0 +1,15 @@
+add_llvm_library(LLVMTarget
+  Mangler.cpp
+  SubtargetFeature.cpp
+  Target.cpp
+  TargetAsmLexer.cpp
+  TargetData.cpp
+  TargetELFWriterInfo.cpp
+  TargetFrameInfo.cpp
+  TargetInstrInfo.cpp
+  TargetIntrinsicInfo.cpp
+  TargetLoweringObjectFile.cpp
+  TargetMachine.cpp
+  TargetRegisterInfo.cpp
+  TargetSubtarget.cpp
+  )
diff --git a/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..1e508fe
--- /dev/null
+++ b/lib/Target/CellSPU/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}/..
+  ${CMAKE_CURRENT_SOURCE_DIR}/..
+  )
+
+add_llvm_library(LLVMCellSPUAsmPrinter
+  SPUAsmPrinter.cpp
+  )
+add_dependencies(LLVMCellSPUAsmPrinter CellSPUCodeGenTable_gen)
\ No newline at end of file
diff --git a/lib/Target/CellSPU/AsmPrinter/Makefile b/lib/Target/CellSPU/AsmPrinter/Makefile
new file mode 100644
index 0000000..69639ef
--- /dev/null
+++ b/lib/Target/CellSPU/AsmPrinter/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Target/CellSPU/AsmPrinter/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCellSPUAsmPrinter
+
+# Hack: we need to include 'main' CellSPU target directory to grab
+# private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp
new file mode 100644
index 0000000..2ca05c2
--- /dev/null
+++ b/lib/Target/CellSPU/AsmPrinter/SPUAsmPrinter.cpp
@@ -0,0 +1,361 @@
+//===-- SPUAsmPrinter.cpp - Print machine instrs to Cell SPU assembly -------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to Cell SPU assembly language. This printer
+// is the output mechanism used by `llc'.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asmprinter"
+#include "SPU.h"
+#include "SPUTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+namespace {
+  class SPUAsmPrinter : public AsmPrinter {
+  public:
+    explicit SPUAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                           MCContext &Ctx, MCStreamer &Streamer,
+                           const MCAsmInfo *T) :
+      AsmPrinter(O, TM, Ctx, Streamer, T) {}
+
+    virtual const char *getPassName() const {
+      return "STI CBEA SPU Assembly Printer";
+    }
+
+    SPUTargetMachine &getTM() {
+      return static_cast<SPUTargetMachine&>(TM);
+    }
+
+    /// printInstruction - This method is automatically generated by tablegen
+    /// from the instruction set description.
+    void printInstruction(const MachineInstr *MI);
+    static const char *getRegisterName(unsigned RegNo);
+
+
+    void EmitInstruction(const MachineInstr *MI) {
+      printInstruction(MI);
+      OutStreamer.AddBlankLine();
+    }
+    void printOp(const MachineOperand &MO);
+
+    /// printRegister - Print register according to target requirements.
+    ///
+    void printRegister(const MachineOperand &MO, bool R0AsZero) {
+      unsigned RegNo = MO.getReg();
+      assert(TargetRegisterInfo::isPhysicalRegister(RegNo) &&
+             "Not physreg??");
+      O << getRegisterName(RegNo);
+    }
+
+    void printOperand(const MachineInstr *MI, unsigned OpNo) {
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      if (MO.isReg()) {
+        O << getRegisterName(MO.getReg());
+      } else if (MO.isImm()) {
+        O << MO.getImm();
+      } else {
+        printOp(MO);
+      }
+    }
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant, const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                               unsigned AsmVariant, const char *ExtraCode);
+
+
+    void
+    printS7ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      int value = MI->getOperand(OpNo).getImm();
+      value = (value << (32 - 7)) >> (32 - 7);
+
+      assert((value >= -(1 << 8) && value <= (1 << 7) - 1)
+             && "Invalid s7 argument");
+      O << value;
+    }
+
+    void
+    printU7ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      unsigned int value = MI->getOperand(OpNo).getImm();
+      assert(value < (1 << 8) && "Invalid u7 argument");
+      O << value;
+    }
+
+    void
+    printShufAddr(const MachineInstr *MI, unsigned OpNo)
+    {
+      char value = MI->getOperand(OpNo).getImm();
+      O << (int) value;
+      O << "(";
+      printOperand(MI, OpNo+1);
+      O << ")";
+    }
+
+    void
+    printS16ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      O << (short) MI->getOperand(OpNo).getImm();
+    }
+
+    void
+    printU16ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      O << (unsigned short)MI->getOperand(OpNo).getImm();
+    }
+
+    void
+    printU32ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      O << (unsigned)MI->getOperand(OpNo).getImm();
+    }
+
+    void
+    printMemRegReg(const MachineInstr *MI, unsigned OpNo) {
+      // When used as the base register, r0 reads constant zero rather than
+      // the value contained in the register.  For this reason, the darwin
+      // assembler requires that we print r0 as 0 (no r) when used as the base.
+      const MachineOperand &MO = MI->getOperand(OpNo);
+      O << getRegisterName(MO.getReg()) << ", ";
+      printOperand(MI, OpNo+1);
+    }
+
+    void
+    printU18ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      unsigned int value = MI->getOperand(OpNo).getImm();
+      assert(value <= (1 << 19) - 1 && "Invalid u18 argument");
+      O << value;
+    }
+
+    void
+    printS10ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
+                             >> 16);
+      assert((value >= -(1 << 9) && value <= (1 << 9) - 1)
+             && "Invalid s10 argument");
+      O << value;
+    }
+
+    void
+    printU10ImmOperand(const MachineInstr *MI, unsigned OpNo)
+    {
+      short value = (short) (((int) MI->getOperand(OpNo).getImm() << 16)
+                             >> 16);
+      assert((value <= (1 << 10) - 1) && "Invalid u10 argument");
+      O << value;
+    }
+
+    void
+    printDFormAddr(const MachineInstr *MI, unsigned OpNo)
+    {
+      assert(MI->getOperand(OpNo).isImm() &&
+             "printDFormAddr first operand is not immediate");
+      int64_t value = int64_t(MI->getOperand(OpNo).getImm());
+      int16_t value16 = int16_t(value);
+      assert((value16 >= -(1 << (9+4)) && value16 <= (1 << (9+4)) - 1)
+             && "Invalid dform s10 offset argument");
+      O << (value16 & ~0xf) << "(";
+      printOperand(MI, OpNo+1);
+      O << ")";
+    }
+
+    void
+    printAddr256K(const MachineInstr *MI, unsigned OpNo)
+    {
+      /* Note: operand 1 is an offset or symbol name. */
+      if (MI->getOperand(OpNo).isImm()) {
+        printS16ImmOperand(MI, OpNo);
+      } else {
+        printOp(MI->getOperand(OpNo));
+        if (MI->getOperand(OpNo+1).isImm()) {
+          int displ = int(MI->getOperand(OpNo+1).getImm());
+          if (displ > 0)
+            O << "+" << displ;
+          else if (displ < 0)
+            O << displ;
+        }
+      }
+    }
+
+    void printCallOperand(const MachineInstr *MI, unsigned OpNo) {
+      printOp(MI->getOperand(OpNo));
+    }
+
+    void printPCRelativeOperand(const MachineInstr *MI, unsigned OpNo) {
+      // Used to generate a ".-<target>", but it turns out that the assembler
+      // really wants the target.
+      //
+      // N.B.: This operand is used for call targets. Branch hints are another
+      // animal entirely.
+      printOp(MI->getOperand(OpNo));
+    }
+
+    void printHBROperand(const MachineInstr *MI, unsigned OpNo) {
+      // HBR operands are generated in front of branches, hence, the
+      // program counter plus the target.
+      O << ".+";
+      printOp(MI->getOperand(OpNo));
+    }
+
+    void printSymbolHi(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImm()) {
+        printS16ImmOperand(MI, OpNo);
+      } else {
+        printOp(MI->getOperand(OpNo));
+        O << "@h";
+      }
+    }
+
+    void printSymbolLo(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImm()) {
+        printS16ImmOperand(MI, OpNo);
+      } else {
+        printOp(MI->getOperand(OpNo));
+        O << "@l";
+      }
+    }
+
+    /// Print local store address
+    void printSymbolLSA(const MachineInstr *MI, unsigned OpNo) {
+      printOp(MI->getOperand(OpNo));
+    }
+
+    void printROTHNeg7Imm(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImm()) {
+        int value = (int) MI->getOperand(OpNo).getImm();
+        assert((value >= 0 && value < 16)
+               && "Invalid negated immediate rotate 7-bit argument");
+        O << -value;
+      } else {
+        llvm_unreachable("Invalid/non-immediate rotate amount in printRotateNeg7Imm");
+      }
+    }
+
+    void printROTNeg7Imm(const MachineInstr *MI, unsigned OpNo) {
+      if (MI->getOperand(OpNo).isImm()) {
+        int value = (int) MI->getOperand(OpNo).getImm();
+        assert((value >= 0 && value <= 32)
+               && "Invalid negated immediate rotate 7-bit argument");
+        O << -value;
+      } else {
+        llvm_unreachable("Invalid/non-immediate rotate amount in printRotateNeg7Imm");
+      }
+    }
+  };
+} // end of anonymous namespace
+
+// Include the auto-generated portion of the assembly writer
+#include "SPUGenAsmWriter.inc"
+
+void SPUAsmPrinter::printOp(const MachineOperand &MO) {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate:
+    llvm_report_error("printOp() does not handle immediate values");
+    return;
+
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    return;
+  case MachineOperand::MO_ExternalSymbol:
+    // Computing the address of an external symbol, not calling it.
+    if (TM.getRelocationModel() != Reloc::Static) {
+      O << "L" << MAI->getGlobalPrefix() << MO.getSymbolName()
+        << "$non_lazy_ptr";
+      return;
+    }
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    // External or weakly linked global variables need non-lazily-resolved
+    // stubs
+    if (TM.getRelocationModel() != Reloc::Static) {
+      GlobalValue *GV = MO.getGlobal();
+      if (((GV->isDeclaration() || GV->hasWeakLinkage() ||
+            GV->hasLinkOnceLinkage() || GV->hasCommonLinkage()))) {
+        O << *GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+        return;
+      }
+    }
+    O << *GetGlobalValueSymbol(MO.getGlobal());
+    return;
+  default:
+    O << "<unknown operand type: " << MO.getType() << ">";
+    return;
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool SPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                    unsigned AsmVariant,
+                                    const char *ExtraCode) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'L': // Write second word of DImode reference.
+      // Verify that this operand has two consecutive registers.
+      if (!MI->getOperand(OpNo).isReg() ||
+          OpNo+1 == MI->getNumOperands() ||
+          !MI->getOperand(OpNo+1).isReg())
+        return true;
+      ++OpNo;   // Return the high-part.
+      break;
+    }
+  }
+
+  printOperand(MI, OpNo);
+  return false;
+}
+
+bool SPUAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNo,
+                                          unsigned AsmVariant,
+                                          const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  printMemRegReg(MI, OpNo);
+  return false;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeCellSPUAsmPrinter() { 
+  RegisterAsmPrinter<SPUAsmPrinter> X(TheCellSPUTarget);
+}
diff --git a/lib/Target/CellSPU/CMakeLists.txt b/lib/Target/CellSPU/CMakeLists.txt
new file mode 100644
index 0000000..0cb6676
--- /dev/null
+++ b/lib/Target/CellSPU/CMakeLists.txt
@@ -0,0 +1,26 @@
+set(LLVM_TARGET_DEFINITIONS SPU.td)
+
+tablegen(SPUGenInstrNames.inc -gen-instr-enums)
+tablegen(SPUGenRegisterNames.inc -gen-register-enums)
+tablegen(SPUGenAsmWriter.inc -gen-asm-writer)
+tablegen(SPUGenCodeEmitter.inc -gen-emitter)
+tablegen(SPUGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(SPUGenRegisterInfo.inc -gen-register-desc)
+tablegen(SPUGenInstrInfo.inc -gen-instr-desc)
+tablegen(SPUGenDAGISel.inc -gen-dag-isel)
+tablegen(SPUGenSubtarget.inc -gen-subtarget)
+tablegen(SPUGenCallingConv.inc -gen-callingconv)
+
+add_llvm_target(CellSPUCodeGen
+  SPUFrameInfo.cpp
+  SPUHazardRecognizers.cpp
+  SPUInstrInfo.cpp
+  SPUISelDAGToDAG.cpp
+  SPUISelLowering.cpp
+  SPUMCAsmInfo.cpp
+  SPURegisterInfo.cpp
+  SPUSubtarget.cpp
+  SPUTargetMachine.cpp
+  )
+
+target_link_libraries (LLVMCellSPUCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/CellSPU/CellSDKIntrinsics.td b/lib/Target/CellSPU/CellSDKIntrinsics.td
new file mode 100644
index 0000000..5d759a4
--- /dev/null
+++ b/lib/Target/CellSPU/CellSDKIntrinsics.td
@@ -0,0 +1,448 @@
+//===-- CellSDKIntrinsics.td - Cell SDK Intrinsics ---------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+///--==-- Arithmetic ops intrinsics --==--
+def CellSDKah:
+    RR_Int_v8i16<0b00010011000, "ah", IntegerOp, int_spu_si_ah>;
+def CellSDKahi:
+    RI10_Int_v8i16<0b00010011000, "ahi", IntegerOp, int_spu_si_ahi>;
+def CellSDKa:
+    RR_Int_v4i32<0b00000011000, "a", IntegerOp, int_spu_si_a>;
+def CellSDKai:
+    RI10_Int_v4i32<0b00111000, "ai", IntegerOp, int_spu_si_ai>;
+def CellSDKsfh:
+    RR_Int_v8i16<0b00010010000, "sfh", IntegerOp, int_spu_si_sfh>;
+def CellSDKsfhi:
+    RI10_Int_v8i16<0b10110000, "sfhi", IntegerOp, int_spu_si_sfhi>;
+def CellSDKsf:
+    RR_Int_v4i32<0b00000010000, "sf", IntegerOp, int_spu_si_sf>;
+def CellSDKsfi:
+    RI10_Int_v4i32<0b00110000, "sfi", IntegerOp, int_spu_si_sfi>;
+def CellSDKaddx:
+    RR_Int_v4i32<0b00000010110, "addx", IntegerOp, int_spu_si_addx>;
+def CellSDKcg:
+    RR_Int_v4i32<0b0100001100, "cg", IntegerOp, int_spu_si_cg>;
+def CellSDKcgx:
+    RR_Int_v4i32<0b01000010110, "cgx", IntegerOp, int_spu_si_cgx>;
+def CellSDKsfx:
+    RR_Int_v4i32<0b10000010110, "sfx", IntegerOp, int_spu_si_sfx>;
+def CellSDKbg:
+    RR_Int_v4i32<0b01000010000, "bg", IntegerOp, int_spu_si_bg>;
+def CellSDKbgx:
+    RR_Int_v4i32<0b11000010110, "bgx", IntegerOp, int_spu_si_bgx>;
+
+def CellSDKmpy:
+    RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpy $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpy (v8i16 VECREG:$rA),
+                                                (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyu:
+    RRForm<0b00110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpyu $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyu (v8i16 VECREG:$rA),
+                                                 (v8i16 VECREG:$rB)))] >;
+
+def CellSDKmpyi:
+    RI10Form<0b00101110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+      "mpyi $rT, $rA, $val", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyi (v8i16 VECREG:$rA),
+                                                 i16ImmSExt10:$val))]>;
+
+def CellSDKmpyui:
+    RI10Form<0b10101110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+      "mpyui $rT, $rA, $val", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyui (v8i16 VECREG:$rA),
+                                                  i16ImmSExt10:$val))]>;
+
+def CellSDKmpya:
+    RRRForm<0b0011, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+      "mpya $rT, $rA, $rB, $rC", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpya (v8i16 VECREG:$rA),
+                                                 (v8i16 VECREG:$rB),
+                                                 (v8i16 VECREG:$rC)))]>;
+
+def CellSDKmpyh:
+    RRForm<0b10100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpyh $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyh (v4i32 VECREG:$rA),
+                                                 (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpys:
+    RRForm<0b11100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpys $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpys (v8i16 VECREG:$rA),
+                                                 (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyhh:
+    RRForm<0b01100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpyhh $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhh (v8i16 VECREG:$rA),
+                                                  (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyhha:
+    RRForm<0b01100010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpyhha $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhha (v8i16 VECREG:$rA),
+                                                   (v8i16 VECREG:$rB)))]>;
+
+// Not sure how to match a (set $rT, (add $rT (mpyhh $rA, $rB)))... so leave
+// as an intrinsic for the time being
+def CellSDKmpyhhu:
+    RRForm<0b01110011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpyhhu $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhhu (v8i16 VECREG:$rA),
+                                                   (v8i16 VECREG:$rB)))]>;
+
+def CellSDKmpyhhau:
+    RRForm<0b01110010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "mpyhhau $rT, $rA, $rB", IntegerMulDiv,
+      [(set (v4i32 VECREG:$rT), (int_spu_si_mpyhhau (v8i16 VECREG:$rA),
+                                                    (v8i16 VECREG:$rB)))]>;
+
+def CellSDKand:
+        RRForm<0b1000011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          "and\t $rT, $rA, $rB", IntegerOp,
+          [(set (v4i32 VECREG:$rT),
+                (int_spu_si_and (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKandc:
+        RRForm<0b10000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          "andc\t $rT, $rA, $rB", IntegerOp,
+          [(set (v4i32 VECREG:$rT),
+                (int_spu_si_andc (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKandbi:
+     RI10Form<0b01101000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+       "andbi\t $rT, $rA, $val", BranchResolv,
+       [(set (v16i8 VECREG:$rT),
+             (int_spu_si_andbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKandhi:
+     RI10Form<0b10101000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           "andhi\t $rT, $rA, $val", BranchResolv,
+       [(set (v8i16 VECREG:$rT),
+             (int_spu_si_andhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKandi:
+     RI10Form<0b00101000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           "andi\t $rT, $rA, $val", BranchResolv,
+       [(set (v4i32 VECREG:$rT),
+             (int_spu_si_andi (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKor:
+        RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          "or\t $rT, $rA, $rB", IntegerOp,
+          [(set (v4i32 VECREG:$rT),
+                (int_spu_si_or (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKorc:
+        RRForm<0b10010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          "addc\t $rT, $rA, $rB", IntegerOp,
+          [(set (v4i32 VECREG:$rT),
+                (int_spu_si_orc (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKorbi:
+     RI10Form<0b01100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+       "orbi\t $rT, $rA, $val", BranchResolv,
+       [(set (v16i8 VECREG:$rT),
+             (int_spu_si_orbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKorhi:
+     RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           "orhi\t $rT, $rA, $val", BranchResolv,
+       [(set (v8i16 VECREG:$rT),
+             (int_spu_si_orhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKori:
+     RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           "ori\t $rT, $rA, $val", BranchResolv,
+       [(set (v4i32 VECREG:$rT),
+             (int_spu_si_ori (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKxor:
+        RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          "xor\t $rT, $rA, $rB", IntegerOp,
+          [(set (v4i32 VECREG:$rT), 
+                (int_spu_si_xor (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKxorbi:
+     RI10Form<0b01100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+       "xorbi\t $rT, $rA, $val", BranchResolv,
+       [(set (v16i8 VECREG:$rT), (int_spu_si_xorbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKxorhi:
+     RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+       "xorhi\t $rT, $rA, $val", BranchResolv,
+       [(set (v8i16 VECREG:$rT), 
+             (int_spu_si_xorhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKxori:
+     RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           "xori\t $rT, $rA, $val", BranchResolv,
+       [(set (v4i32 VECREG:$rT), 
+             (int_spu_si_xori (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKnor:
+        RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          "nor\t $rT, $rA, $rB", IntegerOp,
+          [(set (v4i32 VECREG:$rT), 
+                (int_spu_si_nor (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKnand:
+        RRForm<0b10000010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          "nand\t $rT, $rA, $rB", IntegerOp,
+          [(set (v4i32 VECREG:$rT), 
+                (int_spu_si_nand (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+//===----------------------------------------------------------------------===//
+// Shift/rotate intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKshli:
+  Pat<(int_spu_si_shli (v4i32 VECREG:$rA), uimm7:$val),
+      (SHLIv4i32 VECREG:$rA, uimm7:$val)>;
+
+def CellSDKshlqbi:
+  Pat<(int_spu_si_shlqbi VECREG:$rA, R32C:$rB),
+      (SHLQBIv16i8 VECREG:$rA, R32C:$rB)>;
+
+def CellSDKshlqii:
+  Pat<(int_spu_si_shlqbii VECREG:$rA, uimm7:$val),
+      (SHLQBIIv16i8 VECREG:$rA, uimm7:$val)>;
+
+def CellSDKshlqby:
+  Pat<(int_spu_si_shlqby VECREG:$rA, R32C:$rB),
+      (SHLQBYv16i8 VECREG:$rA, R32C:$rB)>;
+
+def CellSDKshlqbyi:
+  Pat<(int_spu_si_shlqbyi VECREG:$rA, uimm7:$val),
+      (SHLQBYIv16i8 VECREG:$rA, uimm7:$val)>;
+          
+//===----------------------------------------------------------------------===//
+// Branch/compare intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKceq:
+  RRForm<0b00000011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+        "ceq\t $rT, $rA, $rB", BranchResolv,
+        [(set (v4i32 VECREG:$rT), 
+              (int_spu_si_ceq (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKceqi:
+  RI10Form<0b00111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+        "ceqi\t $rT, $rA, $val", BranchResolv,
+    [(set (v4i32 VECREG:$rT), 
+          (int_spu_si_ceqi (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKceqb:
+  RRForm<0b00001011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+        "ceqb\t $rT, $rA, $rB", BranchResolv,
+        [(set (v16i8 VECREG:$rT), 
+              (int_spu_si_ceqb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>;
+
+def CellSDKceqbi:
+  RI10Form<0b01111110, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+        "ceqbi\t $rT, $rA, $val", BranchResolv,
+    [(set (v16i8 VECREG:$rT), (int_spu_si_ceqbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKceqh:
+  RRForm<0b00010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+        "ceqh\t $rT, $rA, $rB", BranchResolv,
+        [(set (v8i16 VECREG:$rT), 
+              (int_spu_si_ceqh (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
+
+def CellSDKceqhi:
+  RI10Form<0b10111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+        "ceqhi\t $rT, $rA, $val", BranchResolv,
+    [(set (v8i16 VECREG:$rT), 
+          (int_spu_si_ceqhi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+def CellSDKcgth:
+  RRForm<0b00010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "cgth\t $rT, $rA, $rB", BranchResolv,
+        [(set (v8i16 VECREG:$rT),
+              (int_spu_si_cgth (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
+
+def CellSDKcgthi:
+  RI10Form<0b10111110, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+    "cgthi\t $rT, $rA, $val", BranchResolv,
+        [(set (v8i16 VECREG:$rT), 
+              (int_spu_si_cgthi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKcgt:
+  RRForm<0b00000010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "cgt\t $rT, $rA, $rB", BranchResolv,
+        [(set (v4i32 VECREG:$rT), 
+              (int_spu_si_cgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKcgti:
+  RI10Form<0b00110010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+    "cgti\t $rT, $rA, $val", BranchResolv,
+        [(set (v4i32 VECREG:$rT), 
+              (int_spu_si_cgti (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKcgtb:
+  RRForm<0b00001010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "cgtb\t $rT, $rA, $rB", BranchResolv,
+        [(set (v16i8 VECREG:$rT), 
+              (int_spu_si_cgtb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>;
+
+def CellSDKcgtbi:
+  RI10Form<0b01110010, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+    "cgtbi\t $rT, $rA, $val", BranchResolv,
+        [(set (v16i8 VECREG:$rT), (int_spu_si_cgtbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+def CellSDKclgth:
+  RRForm<0b00010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "clgth\t $rT, $rA, $rB", BranchResolv,
+        [(set (v8i16 VECREG:$rT), 
+              (int_spu_si_clgth (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)))]>;
+
+def CellSDKclgthi:
+  RI10Form<0b10111010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+    "clgthi\t $rT, $rA, $val", BranchResolv,
+        [(set (v8i16 VECREG:$rT), 
+              (int_spu_si_clgthi (v8i16 VECREG:$rA), i16ImmSExt10:$val))]>;
+
+def CellSDKclgt:
+  RRForm<0b00000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "clgt\t $rT, $rA, $rB", BranchResolv,
+        [(set (v4i32 VECREG:$rT), 
+              (int_spu_si_clgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+def CellSDKclgti:
+  RI10Form<0b00111010, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+    "clgti\t $rT, $rA, $val", BranchResolv,
+        [(set (v4i32 VECREG:$rT), 
+              (int_spu_si_clgti (v4i32 VECREG:$rA), i32ImmSExt10:$val))]>;
+
+def CellSDKclgtb:
+  RRForm<0b00001011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "clgtb\t $rT, $rA, $rB", BranchResolv,
+    [(set (v16i8 VECREG:$rT),
+          (int_spu_si_clgtb (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)))]>;
+
+def CellSDKclgtbi:
+  RI10Form<0b01111010, (outs VECREG:$rT), (ins VECREG:$rA, u10imm_i8:$val),
+    "clgtbi\t $rT, $rA, $val", BranchResolv,
+    [(set (v16i8 VECREG:$rT),
+          (int_spu_si_clgtbi (v16i8 VECREG:$rA), immU8:$val))]>;
+
+//===----------------------------------------------------------------------===//
+// Floating-point intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKfa:
+  RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "fa\t $rT, $rA, $rB", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fa (v4f32 VECREG:$rA),
+                                                 (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfs:
+  RRForm<0b10100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "fs\t $rT, $rA, $rB", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fs (v4f32 VECREG:$rA),
+                                                 (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfm:
+  RRForm<0b01100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "fm\t $rT, $rA, $rB", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fm (v4f32 VECREG:$rA),
+                                                 (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfceq:
+  RRForm<0b01000011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "fceq\t $rT, $rA, $rB", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fceq (v4f32 VECREG:$rA),
+                                                   (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfcgt:
+  RRForm<0b01000011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "fcgt\t $rT, $rA, $rB", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fcgt (v4f32 VECREG:$rA),
+                                                   (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfcmeq:
+  RRForm<0b01010011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "fcmeq\t $rT, $rA, $rB", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fcmeq (v4f32 VECREG:$rA),
+                                                    (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfcmgt:
+  RRForm<0b01010011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "fcmgt\t $rT, $rA, $rB", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fcmgt (v4f32 VECREG:$rA),
+                                                    (v4f32 VECREG:$rB)))]>;
+
+def CellSDKfma:
+  RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+    "fma\t $rT, $rA, $rB, $rC", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fma (v4f32 VECREG:$rA),
+                                                  (v4f32 VECREG:$rB),
+                                                  (v4f32 VECREG:$rC)))]>;
+
+def CellSDKfnms:
+  RRRForm<0b1011, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+    "fnms\t $rT, $rA, $rB, $rC", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fnms (v4f32 VECREG:$rA),
+                                                   (v4f32 VECREG:$rB),
+                                                   (v4f32 VECREG:$rC)))]>;
+
+def CellSDKfms:
+  RRRForm<0b1111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+    "fms\t $rT, $rA, $rB, $rC", SPrecFP,
+        [(set (v4f32 VECREG:$rT), (int_spu_si_fms (v4f32 VECREG:$rA),
+                                                  (v4f32 VECREG:$rB),
+                                                  (v4f32 VECREG:$rC)))]>;
+
+//===----------------------------------------------------------------------===//
+// Double precision floating-point intrinsics:
+//===----------------------------------------------------------------------===//
+
+def CellSDKdfa:
+  RRForm<0b00110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "dfa\t $rT, $rA, $rB", DPrecFP,
+        [(set (v2f64 VECREG:$rT), (int_spu_si_dfa (v2f64 VECREG:$rA),
+                                                  (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfs:
+  RRForm<0b10110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "dfs\t $rT, $rA, $rB", DPrecFP,
+        [(set (v2f64 VECREG:$rT), (int_spu_si_dfs (v2f64 VECREG:$rA),
+                                                  (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfm:
+  RRForm<0b01110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "dfm\t $rT, $rA, $rB", DPrecFP,
+        [(set (v2f64 VECREG:$rT), (int_spu_si_dfm (v2f64 VECREG:$rA),
+                                                  (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfma:
+  RRForm<0b00111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "dfma\t $rT, $rA, $rB", DPrecFP,
+        [(set (v2f64 VECREG:$rT), (int_spu_si_dfma (v2f64 VECREG:$rA),
+                                                   (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfnma:
+  RRForm<0b11111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "dfnma\t $rT, $rA, $rB", DPrecFP,
+        [(set (v2f64 VECREG:$rT), (int_spu_si_dfnma (v2f64 VECREG:$rA),
+                                                    (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfnms:
+  RRForm<0b01111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "dfnms\t $rT, $rA, $rB", DPrecFP,
+        [(set (v2f64 VECREG:$rT), (int_spu_si_dfnms (v2f64 VECREG:$rA),
+                                                    (v2f64 VECREG:$rB)))]>;
+
+def CellSDKdfms:
+  RRForm<0b10111010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "dfms\t $rT, $rA, $rB", DPrecFP,
+        [(set (v2f64 VECREG:$rT), (int_spu_si_dfms (v2f64 VECREG:$rA),
+                                                   (v2f64 VECREG:$rB)))]>;
diff --git a/lib/Target/CellSPU/Makefile b/lib/Target/CellSPU/Makefile
new file mode 100644
index 0000000..cbdbd3c
--- /dev/null
+++ b/lib/Target/CellSPU/Makefile
@@ -0,0 +1,21 @@
+##===- lib/Target/CellSPU/Makefile -------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMCellSPUCodeGen
+TARGET = SPU
+BUILT_SOURCES = SPUGenInstrNames.inc SPUGenRegisterNames.inc \
+		SPUGenAsmWriter.inc SPUGenCodeEmitter.inc \
+		SPUGenRegisterInfo.h.inc SPUGenRegisterInfo.inc \
+		SPUGenInstrInfo.inc SPUGenDAGISel.inc \
+		SPUGenSubtarget.inc SPUGenCallingConv.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CellSPU/README.txt b/lib/Target/CellSPU/README.txt
new file mode 100644
index 0000000..4783dd5
--- /dev/null
+++ b/lib/Target/CellSPU/README.txt
@@ -0,0 +1,90 @@
+//===- README.txt - Notes for improving CellSPU-specific code gen ---------===//
+
+This code was contributed by a team from the Computer Systems Research
+Department in The Aerospace Corporation:
+
+- Scott Michel (head bottle washer and much of the non-floating point
+  instructions)
+- Mark Thomas (floating point instructions)
+- Michael AuYeung (intrinsics)
+- Chandler Carruth (LLVM expertise)
+- Nehal Desai (debugging, i32 operations, RoadRunner SPU expertise)
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NONINFRINGEMENT, OR
+OTHERWISE.  IN NO EVENT SHALL THE AEROSPACE CORPORATION BE LIABLE FOR DAMAGES
+OF ANY KIND OR NATURE WHETHER BASED IN CONTRACT, TORT, OR OTHERWISE ARISING
+OUT OF OR IN CONNECTION WITH THE USE OF THE SOFTWARE INCLUDING, WITHOUT
+LIMITATION, DAMAGES RESULTING FROM LOST OR CONTAMINATED DATA, LOST PROFITS OR
+REVENUE, COMPUTER MALFUNCTION, OR FOR ANY SPECIAL, INCIDENTAL, CONSEQUENTIAL,
+OR PUNITIVE  DAMAGES, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES OR
+SUCH DAMAGES ARE FORESEEABLE.
+
+---------------------------------------------------------------------------
+--WARNING--:
+--WARNING--: The CellSPU work is work-in-progress and "alpha" quality code.
+--WARNING--:
+
+If you are brave enough to try this code or help to hack on it, be sure
+to add 'spu' to configure's --enable-targets option, e.g.:
+
+        ./configure <your_configure_flags_here> \
+           --enable-targets=x86,x86_64,powerpc,spu
+
+---------------------------------------------------------------------------
+
+TODO:
+* Create a machine pass for performing dual-pipeline scheduling specifically
+  for CellSPU, and insert branch prediction instructions as needed.
+
+* i32 instructions:
+
+  * i32 division (work-in-progress)
+
+* i64 support (see i64operations.c test harness):
+
+  * shifts and comparison operators: done
+  * sign and zero extension: done
+  * addition: done
+  * subtraction: needed
+  * multiplication: done
+
+* i128 support:
+
+  * zero extension, any extension: done
+  * sign extension: needed
+  * arithmetic operators (add, sub, mul, div): needed
+  * logical operations (and, or, shl, srl, sra, xor, nor, nand): needed
+
+    * or: done
+
+* f64 support
+
+  * Comparison operators:
+    SETOEQ              unimplemented
+    SETOGT              unimplemented
+    SETOGE              unimplemented
+    SETOLT              unimplemented
+    SETOLE              unimplemented
+    SETONE              unimplemented
+    SETO                done (lowered)
+    SETUO               done (lowered)
+    SETUEQ              unimplemented
+    SETUGT              unimplemented
+    SETUGE              unimplemented
+    SETULT              unimplemented
+    SETULE              unimplemented
+    SETUNE              unimplemented
+
+* LLVM vector support
+
+  * VSETCC needs to be implemented. It's pretty straightforward to code, but
+    needs implementation.
+
+* Intrinsics
+
+  * spu.h intrinsics added but not tested. Need to have an operational
+    llvm-spu-gcc in order to write a unit test harness.
+
+===-------------------------------------------------------------------------===
diff --git a/lib/Target/CellSPU/SPU.h b/lib/Target/CellSPU/SPU.h
new file mode 100644
index 0000000..c960974
--- /dev/null
+++ b/lib/Target/CellSPU/SPU.h
@@ -0,0 +1,101 @@
+//===-- SPU.h - Top-level interface for Cell SPU Target ----------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Cell SPU back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_IBMCELLSPU_H
+#define LLVM_TARGET_IBMCELLSPU_H
+
+#include "llvm/System/DataTypes.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class SPUTargetMachine;
+  class FunctionPass;
+  class formatted_raw_ostream;
+
+  FunctionPass *createSPUISelDag(SPUTargetMachine &TM);
+
+  /*--== Utility functions/predicates/etc used all over the place: --==*/
+  //! Predicate test for a signed 10-bit value
+  /*!
+    \param Value The input value to be tested
+
+    This predicate tests whether the value fits in a signed 10-bit field
+    ([-512, 511]), returning true if it does.
+   */
+  template<typename T>
+  inline bool isS10Constant(T Value);
+
+  template<>
+  inline bool isS10Constant<short>(short Value) {
+    // Accept the full signed 10-bit range [-512, 511], matching the int and
+    // int64_t specializations. (The previous two-clause form rejected 0.)
+    return (Value >= -(1 << 9) && Value <= (1 << 9) - 1);
+  }
+
+  template<>
+  inline bool isS10Constant<int>(int Value) {
+    return (Value >= -(1 << 9) && Value <= (1 << 9) - 1);
+  }
+
+  template<>
+  inline bool isS10Constant<uint32_t>(uint32_t Value) {
+    return (Value <= ((1 << 9) - 1));
+  }
+
+  template<>
+  inline bool isS10Constant<int64_t>(int64_t Value) {
+    return (Value >= -(1 << 9) && Value <= (1 << 9) - 1);
+  }
+
+  template<>
+  inline bool isS10Constant<uint64_t>(uint64_t Value) {
+    return (Value <= ((1 << 9) - 1));
+  }
+
+  //! Predicate test for an unsigned 10-bit value
+  /*!
+    \param Value The input value to be tested
+
+    This predicate tests whether the value fits in an unsigned 10-bit field
+    ([0, 1023]), returning true if it does.
+   */
+  inline bool isU10Constant(short Value) {
+    return (Value == (Value & 0x3ff));
+  }
+
+  inline bool isU10Constant(int Value) {
+    return (Value == (Value & 0x3ff));
+  }
+
+  inline bool isU10Constant(uint32_t Value) {
+    return (Value == (Value & 0x3ff));
+  }
+
+  inline bool isU10Constant(int64_t Value) {
+    return (Value == (Value & 0x3ff));
+  }
+
+  inline bool isU10Constant(uint64_t Value) {
+    return (Value == (Value & 0x3ff));
+  }
+
+  extern Target TheCellSPUTarget;
+
+}
+
+// Defines symbolic names for the SPU instructions.
+//
+#include "SPUGenInstrNames.inc"
+
+#endif /* LLVM_TARGET_IBMCELLSPU_H */
diff --git a/lib/Target/CellSPU/SPU.td b/lib/Target/CellSPU/SPU.td
new file mode 100644
index 0000000..8327fe0
--- /dev/null
+++ b/lib/Target/CellSPU/SPU.td
@@ -0,0 +1,66 @@
+//===- SPU.td - Describe the STI Cell SPU Target Machine ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the STI Cell SPU target machine.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+// Holder of code fragments (you'd think this'd already be in
+// a td file somewhere... :-)
+
+class CodeFrag<dag frag> {
+  dag Fragment = frag;
+}
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "SPURegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction formats, instructions
+//===----------------------------------------------------------------------===//
+
+include "SPUNodes.td"
+include "SPUOperands.td"
+include "SPUSchedule.td"
+include "SPUInstrFormats.td"
+include "SPUInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget features:
+//===----------------------------------------------------------------------===//
+
+def DefaultProc: SubtargetFeature<"", "ProcDirective", "SPU::DEFAULT_PROC", "">;
+def LargeMemFeature:
+  SubtargetFeature<"large_mem","UseLargeMem", "true",
+                   "Use large (>256) LSA memory addressing [default = false]">;
+
+def SPURev0 : Processor<"v0", SPUItineraries, [DefaultProc]>;
+
+//===----------------------------------------------------------------------===//
+// Calling convention:
+//===----------------------------------------------------------------------===//
+
+include "SPUCallingConv.td"
+
+// Target:
+
+def SPUInstrInfo : InstrInfo {
+  let isLittleEndianEncoding = 1;
+}
+
+def SPU : Target {
+  let InstructionSet = SPUInstrInfo;
+}
diff --git a/lib/Target/CellSPU/SPU128InstrInfo.td b/lib/Target/CellSPU/SPU128InstrInfo.td
new file mode 100644
index 0000000..3031fda
--- /dev/null
+++ b/lib/Target/CellSPU/SPU128InstrInfo.td
@@ -0,0 +1,41 @@
+//===--- SPU128InstrInfo.td - Cell SPU 128-bit operations -*- tablegen -*--===//
+//
+//                     Cell SPU 128-bit operations
+//
+//===----------------------------------------------------------------------===//
+                                  
+// zext 32->128: Zero extend 32-bit to 128-bit
+def : Pat<(i128 (zext R32C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 R32C:$rSrc, 12)>;
+
+// zext 64->128: Zero extend 64-bit to 128-bit
+def : Pat<(i128 (zext R64C:$rSrc)),
+          (ROTQMBYIr128_zext_r64 R64C:$rSrc, 8)>;
+
+// zext 16->128: Zero extend 16-bit to 128-bit
+def : Pat<(i128 (zext R16C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff)), 12)>;
+
+// zext 8->128: Zero extend 8-bit to 128-bit
+// (mask the full byte, 0xff, before rotating into the preferred slot)
+def : Pat<(i128 (zext R8C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xff), 12)>;
+
+// anyext 32->128: Zero extend 32-bit to 128-bit
+def : Pat<(i128 (anyext R32C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 R32C:$rSrc, 12)>;
+
+// anyext 64->128: Zero extend 64-bit to 128-bit
+def : Pat<(i128 (anyext R64C:$rSrc)),
+          (ROTQMBYIr128_zext_r64 R64C:$rSrc, 8)>;
+
+// anyext 16->128: Zero extend 16-bit to 128-bit
+def : Pat<(i128 (anyext R16C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff)), 12)>;
+
+// anyext 8->128: Any extend 8-bit to 128-bit
+// (mask the full byte, 0xff, for consistency with the zext pattern)
+def : Pat<(i128 (anyext R8C:$rSrc)),
+          (ROTQMBYIr128_zext_r32 (ANDIi8i32 R8C:$rSrc, 0xff), 12)>;
+
+// Shift left
+def : Pat<(shl GPRC:$rA, R32C:$rB),
+          (SHLQBYBIr128 (SHLQBIr128 GPRC:$rA, R32C:$rB), R32C:$rB)>;
diff --git a/lib/Target/CellSPU/SPU64InstrInfo.td b/lib/Target/CellSPU/SPU64InstrInfo.td
new file mode 100644
index 0000000..06eb149
--- /dev/null
+++ b/lib/Target/CellSPU/SPU64InstrInfo.td
@@ -0,0 +1,394 @@
+//====--- SPU64InstrInfo.td - Cell SPU 64-bit operations -*- tablegen -*--====//
+//
+//                     Cell SPU 64-bit operations
+//
+//===----------------------------------------------------------------------===//
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// 64-bit comparisons:
+//
+// 1. The instruction sequences for vector vice scalar differ by a
+//    constant. In the scalar case, we're only interested in the
+//    top two 32-bit slots, whereas we're interested in an exact
+//    all-four-slot match in the vector case.
+//
+// 2. There are no "immediate" forms, since loading 64-bit constants
+//    could be a constant pool load.
+//
+// 3. i64 setcc results are i32, which are subsequently converted to a FSM
+//    mask when used in a select pattern.
+//
+// 4. v2i64 setcc results are v4i32, which can be converted to a FSM mask (TODO)
+//    [Note: this may be moot, since gb produces v4i32 or r32.]
+//
+// 5. The code sequences for r64 and v2i64 are probably overly conservative,
+//    compared to the code that gcc produces.
+//
+// M00$E B!tes Kan be Pretty N@sTi!!!!! (apologies to Monty!)
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// selb instruction definition for i64. Note that the selection mask is
+// a vector, produced by various forms of FSM:
+def SELBr64_cond:
+  SELBInst<(outs R64C:$rT), (ins R64C:$rA, R64C:$rB, VECREG:$rC),
+           [/* no pattern */]>;
+
+// The generic i64 select pattern, which assumes that the comparison result
+// is in a 32-bit register that contains a select mask pattern (i.e., gather
+// bits result):
+
+def : Pat<(select R32C:$rCond, R64C:$rFalse, R64C:$rTrue),
+          (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 R32C:$rCond))>;
+
+// select the negative condition:
+class I64SELECTNegCond<PatFrag cond, CodeFrag compare>:
+  Pat<(select (i32 (cond R64C:$rA, R64C:$rB)), R64C:$rTrue, R64C:$rFalse),
+      (SELBr64_cond R64C:$rTrue, R64C:$rFalse, (FSMr32 compare.Fragment))>;
+
+// setcc the negative condition:
+class I64SETCCNegCond<PatFrag cond, CodeFrag compare>:
+  Pat<(cond R64C:$rA, R64C:$rB),
+      (XORIr32 compare.Fragment, -1)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// The i64 seteq fragment that does the scalar->vector conversion and
+// comparison:
+def CEQr64compare:
+    CodeFrag<(CGTIv4i32 (GBv4i32 (CEQv4i32 (ORv2i64_i64 R64C:$rA),
+                                           (ORv2i64_i64 R64C:$rB))), 0xb)>;
+
+// The i64 seteq fragment that does the vector comparison
+def CEQv2i64compare:
+    CodeFrag<(CEQIv4i32 (GBv4i32 (CEQv4i32 VECREG:$rA, VECREG:$rB)), 0xf)>;
+
+// i64 seteq (equality): the setcc result is i32, which is converted to a
+// vector FSM mask when used in a select pattern.
+//
+// v2i64 seteq (equality): the setcc result is v4i32
+multiclass CompareEqual64 {
+  // Plain old comparison, converts back to i32 scalar
+  def r64: CodeFrag<(ORi32_v4i32 CEQr64compare.Fragment)>;
+  def v2i64: CodeFrag<(ORi32_v4i32 CEQv2i64compare.Fragment)>;
+
+  // SELB mask from FSM:
+  def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQr64compare.Fragment))>;
+  def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CEQv2i64compare.Fragment))>;
+}
+
+defm I64EQ: CompareEqual64;
+
+def : Pat<(seteq R64C:$rA, R64C:$rB), I64EQr64.Fragment>;
+def : Pat<(seteq (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)), I64EQv2i64.Fragment>;
+
+// i64 setne:
+def : I64SETCCNegCond<setne, I64EQr64>;
+def : I64SELECTNegCond<setne, I64EQr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setugt/setule:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def CLGTr64ugt:
+    CodeFrag<(CLGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+
+def CLGTr64eq:
+    CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+    
+def CLGTr64compare:
+    CodeFrag<(SELBv2i64 CLGTr64ugt.Fragment,
+                        (XSWDv2i64 CLGTr64ugt.Fragment),
+                        CLGTr64eq.Fragment)>;
+
+def CLGTv2i64ugt:
+    CodeFrag<(CLGTv4i32 VECREG:$rA, VECREG:$rB)>;
+
+def CLGTv2i64eq:
+    CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>;
+    
+def CLGTv2i64compare:
+    // Use the v2i64 ugt fragment in the word swap; referencing the scalar
+    // CLGTr64ugt here would read R64C operands inside a vector pattern.
+    CodeFrag<(SELBv2i64 CLGTv2i64ugt.Fragment,
+                        (XSWDv2i64 CLGTv2i64ugt.Fragment),
+                        CLGTv2i64eq.Fragment)>;
+
+multiclass CompareLogicalGreaterThan64 {
+  // Plain old comparison, converts back to i32 scalar
+  def r64: CodeFrag<(ORi32_v4i32 CLGTr64compare.Fragment)>;
+  def v2i64: CodeFrag<CLGTv2i64compare.Fragment>;
+
+  // SELB mask from FSM:
+  def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTr64compare.Fragment))>;
+  def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGTv2i64compare.Fragment))>;
+}
+
+defm I64LGT: CompareLogicalGreaterThan64;
+
+def : Pat<(setugt R64C:$rA, R64C:$rB), I64LGTr64.Fragment>;
+def : Pat<(setugt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+                  I64LGTv2i64.Fragment>;
+
+// i64 setult:
+def : I64SETCCNegCond<setule, I64LGTr64>;
+def : I64SELECTNegCond<setule, I64LGTr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setuge/setult:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def CLGEr64compare:
+    CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CLGTr64ugt.Fragment,
+                                          CLGTr64eq.Fragment)), 0xb)>;
+
+def CLGEv2i64compare:
+    CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CLGTv2i64ugt.Fragment,
+                                          CLGTv2i64eq.Fragment)), 0xf)>;
+
+multiclass CompareLogicalGreaterEqual64 {
+  // Plain old comparison, converts back to i32 scalar
+  def r64: CodeFrag<(ORi32_v4i32 CLGEr64compare.Fragment)>;
+  def v2i64: CodeFrag<CLGEv2i64compare.Fragment>;
+
+  // SELB mask from FSM:
+  def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEr64compare.Fragment))>;
+  def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CLGEv2i64compare.Fragment))>;
+}
+
+defm I64LGE: CompareLogicalGreaterEqual64;
+
+def : Pat<(setuge R64C:$rA, R64C:$rB), I64LGEr64.Fragment>;
+def : Pat<(setuge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+                  I64LGEv2i64.Fragment>;
+
+// i64 setult:
+def : I64SETCCNegCond<setult, I64LGEr64>;
+def : I64SELECTNegCond<setult, I64LGEr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setgt/setle:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def CGTr64sgt:
+    CodeFrag<(CGTv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+
+def CGTr64eq:
+    CodeFrag<(CEQv4i32 (ORv2i64_i64 R64C:$rA), (ORv2i64_i64 R64C:$rB))>;
+    
+def CGTr64compare:
+    CodeFrag<(SELBv2i64 CGTr64sgt.Fragment,
+                        (XSWDv2i64 CGTr64sgt.Fragment),
+                        CGTr64eq.Fragment)>;
+
+def CGTv2i64sgt:
+    CodeFrag<(CGTv4i32 VECREG:$rA, VECREG:$rB)>;
+
+def CGTv2i64eq:
+    CodeFrag<(CEQv4i32 VECREG:$rA, VECREG:$rB)>;
+    
+def CGTv2i64compare:
+    // Use the v2i64 sgt fragment in the word swap; referencing the scalar
+    // CGTr64sgt here would read R64C operands inside a vector pattern.
+    CodeFrag<(SELBv2i64 CGTv2i64sgt.Fragment,
+                        (XSWDv2i64 CGTv2i64sgt.Fragment),
+                        CGTv2i64eq.Fragment)>;
+
+multiclass CompareGreaterThan64 {
+  // Plain old comparison, converts back to i32 scalar
+  def r64: CodeFrag<(ORi32_v4i32 CGTr64compare.Fragment)>;
+  def v2i64: CodeFrag<CGTv2i64compare.Fragment>;
+
+  // SELB mask from FSM:
+  def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTr64compare.Fragment))>;
+  def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGTv2i64compare.Fragment))>;
+}
+
+defm I64GT: CompareGreaterThan64;
+
+def : Pat<(setgt R64C:$rA, R64C:$rB), I64GTr64.Fragment>;
+def : Pat<(setgt (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+                  I64GTv2i64.Fragment>;
+
+// i64 setult:
+def : I64SETCCNegCond<setle, I64GTr64>;
+def : I64SELECTNegCond<setle, I64GTr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// i64 setge/setlt:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+    
+def CGEr64compare:
+    CodeFrag<(CGTIv4i32 (GBv4i32 (ORv4i32 CGTr64sgt.Fragment,
+                                          CGTr64eq.Fragment)), 0xb)>;
+
+def CGEv2i64compare:
+    CodeFrag<(CEQIv4i32 (GBv4i32 (ORv4i32 CGTv2i64sgt.Fragment,
+                                          CGTv2i64eq.Fragment)), 0xf)>;
+
+multiclass CompareGreaterEqual64 {
+  // Plain old comparison, converts back to i32 scalar
+  def r64: CodeFrag<(ORi32_v4i32 CGEr64compare.Fragment)>;
+  def v2i64: CodeFrag<CGEv2i64compare.Fragment>;
+
+  // SELB mask from FSM:
+  def r64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEr64compare.Fragment))>;
+  def v2i64mask: CodeFrag<(ORi32_v4i32 (FSMv4i32 CGEv2i64compare.Fragment))>;
+}
+
+defm I64GE: CompareGreaterEqual64;
+
+def : Pat<(setge R64C:$rA, R64C:$rB), I64GEr64.Fragment>;
+def : Pat<(setge (v2i64 VECREG:$rA), (v2i64 VECREG:$rB)),
+                  I64GEv2i64.Fragment>;
+
+// i64 setult:
+def : I64SETCCNegCond<setlt, I64GEr64>;
+def : I64SELECTNegCond<setlt, I64GEr64>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 add
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_add_cg<dag lhs, dag rhs>:
+    CodeFrag<(CGv4i32 lhs, rhs)>;
+
+class v2i64_add_1<dag lhs, dag rhs, dag cg, dag cg_mask>:
+    CodeFrag<(ADDXv4i32 lhs, rhs, (SHUFBv4i32 cg, cg, cg_mask))>;
+
+class v2i64_add<dag lhs, dag rhs, dag cg_mask>:
+    v2i64_add_1<lhs, rhs, v2i64_add_cg<lhs, rhs>.Fragment, cg_mask>;
+
+def : Pat<(SPUadd64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+           (ORi64_v2i64 v2i64_add<(ORv2i64_i64 R64C:$rA),
+                                  (ORv2i64_i64 R64C:$rB),
+                                  (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUadd64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)),
+           v2i64_add<(v2i64 VECREG:$rA),
+                     (v2i64 VECREG:$rB),
+                     (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 subtraction
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_sub_bg<dag lhs, dag rhs>: CodeFrag<(BGv4i32 lhs, rhs)>;
+
+class v2i64_sub<dag lhs, dag rhs, dag bg, dag bg_mask>:
+    CodeFrag<(SFXv4i32 lhs, rhs, (SHUFBv4i32 bg, bg, bg_mask))>;
+
+def : Pat<(SPUsub64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+           (ORi64_v2i64 v2i64_sub<(ORv2i64_i64 R64C:$rA),
+                                  (ORv2i64_i64 R64C:$rB),
+                                  v2i64_sub_bg<(ORv2i64_i64 R64C:$rA),
+                                               (ORv2i64_i64 R64C:$rB)>.Fragment,
+                                  (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUsub64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)),
+           v2i64_sub<(v2i64 VECREG:$rA),
+                     (v2i64 VECREG:$rB),
+                     v2i64_sub_bg<(v2i64 VECREG:$rA),
+                                  (v2i64 VECREG:$rB)>.Fragment,
+                     (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v2i64, i64 multiply
+//
+// Note: i64 multiply is simply the vector->scalar conversion of the
+// full-on v2i64 multiply, since the entire vector has to be manipulated
+// anyway.
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class v2i64_mul_ahi64<dag rA> :
+    CodeFrag<(SELBv4i32 rA, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
+
+class v2i64_mul_bhi64<dag rB> :
+    CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0x0f0f))>;
+
+class v2i64_mul_alo64<dag rB> :
+    CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
+
+class v2i64_mul_blo64<dag rB> :
+    CodeFrag<(SELBv4i32 rB, (ILv4i32 0), (FSMBIv4i32 0xf0f0))>;
+
+class v2i64_mul_ashlq2<dag rA>:
+    CodeFrag<(SHLQBYIv4i32 rA, 0x2)>;
+
+class v2i64_mul_ashlq4<dag rA>:
+    CodeFrag<(SHLQBYIv4i32 rA, 0x4)>;
+
+class v2i64_mul_bshlq2<dag rB> :
+    CodeFrag<(SHLQBYIv4i32 rB, 0x2)>;
+
+class v2i64_mul_bshlq4<dag rB> :
+    CodeFrag<(SHLQBYIv4i32 rB, 0x4)>;
+
+class v2i64_highprod<dag rA, dag rB>:
+    CodeFrag<(Av4i32
+                (Av4i32
+                  (MPYUv4i32 v2i64_mul_bshlq4<rB>.Fragment,     // a1 x b3
+                             v2i64_mul_ahi64<rA>.Fragment),
+                  (MPYHv4i32 v2i64_mul_ahi64<rA>.Fragment,      // a0 x b3
+                             v2i64_mul_bshlq4<rB>.Fragment)),
+                (Av4i32
+                  (MPYHv4i32 v2i64_mul_bhi64<rB>.Fragment,
+                             v2i64_mul_ashlq4<rA>.Fragment),
+                  (Av4i32
+                      (MPYHv4i32 v2i64_mul_ashlq4<rA>.Fragment,
+                                 v2i64_mul_bhi64<rB>.Fragment),
+                    (Av4i32
+                      (MPYUv4i32 v2i64_mul_ashlq4<rA>.Fragment,
+                                 v2i64_mul_bhi64<rB>.Fragment),
+                      (Av4i32
+                        (MPYHv4i32 v2i64_mul_ashlq2<rA>.Fragment,
+                                   v2i64_mul_bshlq2<rB>.Fragment),
+                        (MPYUv4i32 v2i64_mul_ashlq2<rA>.Fragment,
+                                   v2i64_mul_bshlq2<rB>.Fragment))))))>;
+
+class v2i64_mul_a3_b3<dag rA, dag rB>:
+    CodeFrag<(MPYUv4i32 v2i64_mul_alo64<rA>.Fragment,
+                        v2i64_mul_blo64<rB>.Fragment)>;
+
+class v2i64_mul_a2_b3<dag rA, dag rB>:
+    CodeFrag<(SELBv4i32 (SHLQBYIv4i32
+                          (MPYHHUv4i32 v2i64_mul_alo64<rA>.Fragment,
+                                       v2i64_mul_bshlq2<rB>.Fragment), 0x2),
+                        (ILv4i32 0),
+                        (FSMBIv4i32 0xc3c3))>;
+
+class v2i64_mul_a3_b2<dag rA, dag rB>:
+    CodeFrag<(SELBv4i32 (SHLQBYIv4i32
+                          (MPYHHUv4i32 v2i64_mul_blo64<rB>.Fragment,
+                                       v2i64_mul_ashlq2<rA>.Fragment), 0x2),
+                        (ILv4i32 0),
+                        (FSMBIv4i32 0xc3c3))>;
+
+class v2i64_lowsum<dag rA, dag rB, dag rCGmask>:
+    v2i64_add<v2i64_add<v2i64_mul_a3_b3<rA, rB>.Fragment,
+                        v2i64_mul_a2_b3<rA, rB>.Fragment, rCGmask>.Fragment,
+              v2i64_mul_a3_b2<rA, rB>.Fragment, rCGmask>;
+
+class v2i64_mul<dag rA, dag rB, dag rCGmask>:
+    v2i64_add<v2i64_lowsum<rA, rB, rCGmask>.Fragment,
+              (SELBv4i32 v2i64_highprod<rA, rB>.Fragment,
+                         (ILv4i32 0),
+                         (FSMBIv4i32 0x0f0f)),
+              rCGmask>;
+
+def : Pat<(SPUmul64 R64C:$rA, R64C:$rB, (v4i32 VECREG:$rCGmask)),
+          (ORi64_v2i64 v2i64_mul<(ORv2i64_i64 R64C:$rA),
+                                 (ORv2i64_i64 R64C:$rB),
+                                 (v4i32 VECREG:$rCGmask)>.Fragment)>;
+
+def : Pat<(SPUmul64 (v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)),
+          v2i64_mul<(v2i64 VECREG:$rA), (v2i64 VECREG:$rB),
+                    (v4i32 VECREG:$rCGmask)>.Fragment>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// f64 comparisons
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// selb instruction definition for i64. Note that the selection mask is
+// a vector, produced by various forms of FSM:
+def SELBf64_cond:
+   SELBInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R32C:$rC),
+            [(set R64FP:$rT,
+                  (select R32C:$rC, R64FP:$rB, R64FP:$rA))]>;
diff --git a/lib/Target/CellSPU/SPUCallingConv.td b/lib/Target/CellSPU/SPUCallingConv.td
new file mode 100644
index 0000000..10dc837
--- /dev/null
+++ b/lib/Target/CellSPU/SPUCallingConv.td
@@ -0,0 +1,115 @@
+//===- SPUCallingConv.td - Calling Conventions for CellSPU ------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the STI Cell SPU architecture.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<SPUSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for Cell SPU: Everything can be passed back via $3:
+def RetCC_SPU : CallingConv<[
+  CCIfType<[i8],       CCAssignToReg<[R3]>>,
+  CCIfType<[i16],      CCAssignToReg<[R3]>>,
+  CCIfType<[i32],      CCAssignToReg<[R3]>>,
+  CCIfType<[i64],      CCAssignToReg<[R3]>>,
+  CCIfType<[i128],     CCAssignToReg<[R3]>>,
+  CCIfType<[f32, f64], CCAssignToReg<[R3]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToReg<[R3]>>,
+  CCIfType<[v2i32],                                    CCAssignToReg<[R3]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// CellSPU Argument Calling Conventions
+// (note: this isn't used, but presumably should be at some point when other
+//  targets do.)
+//===----------------------------------------------------------------------===//
+/*
+def CC_SPU : CallingConv<[
+  CCIfType<[i8],  CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
+                                 R12, R13, R14, R15, R16, R17, R18, R19, R20,
+                                 R21, R22, R23, R24, R25, R26, R27, R28, R29,
+                                 R30, R31, R32, R33, R34, R35, R36, R37, R38,
+                                 R39, R40, R41, R42, R43, R44, R45, R46, R47,
+                                 R48, R49, R50, R51, R52, R53, R54, R55, R56,
+                                 R57, R58, R59, R60, R61, R62, R63, R64, R65,
+                                 R66, R67, R68, R69, R70, R71, R72, R73, R74,
+                                 R75, R76, R77, R78, R79]>>,
+  CCIfType<[i16], CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
+                                 R12, R13, R14, R15, R16, R17, R18, R19, R20,
+                                 R21, R22, R23, R24, R25, R26, R27, R28, R29,
+                                 R30, R31, R32, R33, R34, R35, R36, R37, R38,
+                                 R39, R40, R41, R42, R43, R44, R45, R46, R47,
+                                 R48, R49, R50, R51, R52, R53, R54, R55, R56,
+                                 R57, R58, R59, R60, R61, R62, R63, R64, R65,
+                                 R66, R67, R68, R69, R70, R71, R72, R73, R74,
+                                 R75, R76, R77, R78, R79]>>,
+  CCIfType<[i32], CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
+                                 R12, R13, R14, R15, R16, R17, R18, R19, R20,
+                                 R21, R22, R23, R24, R25, R26, R27, R28, R29,
+                                 R30, R31, R32, R33, R34, R35, R36, R37, R38,
+                                 R39, R40, R41, R42, R43, R44, R45, R46, R47,
+                                 R48, R49, R50, R51, R52, R53, R54, R55, R56,
+                                 R57, R58, R59, R60, R61, R62, R63, R64, R65,
+                                 R66, R67, R68, R69, R70, R71, R72, R73, R74,
+                                 R75, R76, R77, R78, R79]>>,
+  CCIfType<[f32], CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
+                                 R12, R13, R14, R15, R16, R17, R18, R19, R20,
+                                 R21, R22, R23, R24, R25, R26, R27, R28, R29,
+                                 R30, R31, R32, R33, R34, R35, R36, R37, R38,
+                                 R39, R40, R41, R42, R43, R44, R45, R46, R47,
+                                 R48, R49, R50, R51, R52, R53, R54, R55, R56,
+                                 R57, R58, R59, R60, R61, R62, R63, R64, R65,
+                                 R66, R67, R68, R69, R70, R71, R72, R73, R74,
+                                 R75, R76, R77, R78, R79]>>,
+  CCIfType<[i64], CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
+                                 R12, R13, R14, R15, R16, R17, R18, R19, R20,
+                                 R21, R22, R23, R24, R25, R26, R27, R28, R29,
+                                 R30, R31, R32, R33, R34, R35, R36, R37, R38,
+                                 R39, R40, R41, R42, R43, R44, R45, R46, R47,
+                                 R48, R49, R50, R51, R52, R53, R54, R55, R56,
+                                 R57, R58, R59, R60, R61, R62, R63, R64, R65,
+                                 R66, R67, R68, R69, R70, R71, R72, R73, R74,
+                                 R75, R76, R77, R78, R79]>>,
+  CCIfType<[f64], CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
+                                 R12, R13, R14, R15, R16, R17, R18, R19, R20,
+                                 R21, R22, R23, R24, R25, R26, R27, R28, R29,
+                                 R30, R31, R32, R33, R34, R35, R36, R37, R38,
+                                 R39, R40, R41, R42, R43, R44, R45, R46, R47,
+                                 R48, R49, R50, R51, R52, R53, R54, R55, R56,
+                                 R57, R58, R59, R60, R61, R62, R63, R64, R65,
+                                 R66, R67, R68, R69, R70, R71, R72, R73, R74,
+                                 R75, R76, R77, R78, R79]>>,
+  CCIfType<[v16i8, v8i16, v4i32, v4f32, v2i64, v2f64],
+                  CCAssignToReg<[R3,   R4,  R5,  R6,  R7,  R8,  R9, R10, R11,
+                                 R12, R13, R14, R15, R16, R17, R18, R19, R20,
+                                 R21, R22, R23, R24, R25, R26, R27, R28, R29,
+                                 R30, R31, R32, R33, R34, R35, R36, R37, R38,
+                                 R39, R40, R41, R42, R43, R44, R45, R46, R47,
+                                 R48, R49, R50, R51, R52, R53, R54, R55, R56,
+                                 R57, R58, R59, R60, R61, R62, R63, R64, R65,
+                                 R66, R67, R68, R69, R70, R71, R72, R73, R74,
+                                 R75, R76, R77, R78, R79]>>,
+  
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+  
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToStack<16, 16>>
+]>;
+*/
diff --git a/lib/Target/CellSPU/SPUFrameInfo.cpp b/lib/Target/CellSPU/SPUFrameInfo.cpp
new file mode 100644
index 0000000..60d7ba7
--- /dev/null
+++ b/lib/Target/CellSPU/SPUFrameInfo.cpp
@@ -0,0 +1,29 @@
+//===-- SPUFrameInfo.cpp - SPU Frame info implementation ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of CellSPU frame information (SPUFrameInfo).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPUFrameInfo.h"
+#include "SPURegisterNames.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// SPUFrameInfo:
+//===----------------------------------------------------------------------===//
+
+// Construct the SPU frame info.  Per the TargetFrameInfo constructor
+// arguments: the SPU stack grows down, stack alignment is 16 bytes, and the
+// local area offset is 0.
+SPUFrameInfo::SPUFrameInfo(const TargetMachine &tm):
+  TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0),
+  TM(tm)
+{
+  // The single callee-saved spill slot is the link register, R0, paired with
+  // 16 (presumably the slot offset returned by getCalleeSaveSpillSlots --
+  // TODO confirm against its implementation).
+  LR[0].first = SPU::R0;
+  LR[0].second = 16;
+}
diff --git a/lib/Target/CellSPU/SPUFrameInfo.h b/lib/Target/CellSPU/SPUFrameInfo.h
new file mode 100644
index 0000000..e8ca333
--- /dev/null
+++ b/lib/Target/CellSPU/SPUFrameInfo.h
@@ -0,0 +1,79 @@
+//===-- SPUFrameInfo.h - Top-level interface for Cell SPU Target -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains CellSPU frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#if !defined(SPUFRAMEINFO_H)
+// Define the guard macro up front so that a recursive inclusion of this
+// header (e.g. via SPURegisterInfo.h) cannot re-enter it.  Previously the
+// macro was only defined at the bottom of the file.
+#define SPUFRAMEINFO_H 1
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "SPURegisterInfo.h"
+
+namespace llvm {
+  //! Cell SPU frame layout information and frame-offset helper constants.
+  class SPUFrameInfo: public TargetFrameInfo {
+    const TargetMachine &TM;
+    // Callee-saved spill slots: (register, offset) pairs.  For SPU this is
+    // just the link register; initialized in SPUFrameInfo.cpp.
+    std::pair<unsigned, int> LR[1];
+
+  public:
+    SPUFrameInfo(const TargetMachine &tm);
+
+    //! Return a function's saved spill slots
+    /*!
+      For CellSPU, a function's saved spill slots is just the link register.
+     */
+    const std::pair<unsigned, int> *
+    getCalleeSaveSpillSlots(unsigned &NumEntries) const;
+
+    //! Stack slot size (16 bytes)
+    static int stackSlotSize() {
+      return 16;
+    }
+    //! Maximum frame offset representable by a signed 10-bit integer
+    /*!
+      This is the maximum frame offset that can be expressed as a 10-bit
+      integer, used in D-form addresses.
+     */
+    static int maxFrameOffset() {
+      return ((1 << 9) - 1) * stackSlotSize();
+    }
+    //! Minimum frame offset representable by a signed 10-bit integer
+    static int minFrameOffset() {
+      return -(1 << 9) * stackSlotSize();
+    }
+    //! Minimum frame size (enough to spill LR + SP)
+    static int minStackSize() {
+      return (2 * stackSlotSize());
+    }
+    //! Frame size required to spill all registers plus frame info
+    static int fullSpillSize() {
+      return (SPURegisterInfo::getNumArgRegs() * stackSlotSize());
+    }
+    //! Convert frame index to stack offset
+    static int FItoStackOffset(int frame_index) {
+      return frame_index * stackSlotSize();
+    }
+    //! Number of instructions required to overcome hint-for-branch latency
+    /*!
+      HBR (hint-for-branch) instructions can be inserted when, for example,
+      we know that a given function is going to be called, such as printf(),
+      in the control flow graph. HBRs are only inserted if a sufficient number
+      of instructions occurs between the HBR and the target. Currently, HBRs
+      take 6 cycles, ergo, the magic number 6.
+     */
+    static int branchHintPenalty() {
+      return 6;
+    }
+  };
+}
+
+#endif
diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.cpp b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
new file mode 100644
index 0000000..9dbab1d
--- /dev/null
+++ b/lib/Target/CellSPU/SPUHazardRecognizers.cpp
@@ -0,0 +1,139 @@
+//===-- SPUHazardRecognizers.cpp - Cell Hazard Recognizer Impls -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on Cell SPU
+// processors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sched"
+
+#include "SPUHazardRecognizers.h"
+#include "SPU.h"
+#include "SPUInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Cell SPU hazard recognizer
+//
+// This is the pipeline hazard recognizer for the Cell SPU processor. It does
+// very little right now.
+//===----------------------------------------------------------------------===//
+
+// Construct the hazard recognizer.  EvenOdd starts at 0; it is only used by
+// the disabled pipeline-parity sketch in getHazardType.
+SPUHazardRecognizer::SPUHazardRecognizer(const TargetInstrInfo &tii) :
+  TII(tii),
+  EvenOdd(0)
+{
+}
+
+/// Return the pipeline hazard type encountered or generated by this
+/// instruction. Currently returns NoHazard.
+///
+/// \return NoHazard
+ScheduleHazardRecognizer::HazardType
+SPUHazardRecognizer::getHazardType(SUnit *SU)
+{
+  // Initial thoughts on how to do this, but this code cannot work unless the
+  // function's prolog and epilog code are also being scheduled so that we can
+  // accurately determine which pipeline is being scheduled.
+  // The disabled sketch below flags loads/stores (and RET) as odd-pipeline
+  // instructions and toggles EvenOdd on each query to model pipeline parity.
+#if 0
+  const SDNode *Node = SU->getNode()->getFlaggedMachineNode();
+  ScheduleHazardRecognizer::HazardType retval = NoHazard;
+  bool mustBeOdd = false;
+
+  switch (Node->getOpcode()) {
+  case SPU::LQDv16i8:
+  case SPU::LQDv8i16:
+  case SPU::LQDv4i32:
+  case SPU::LQDv4f32:
+  case SPU::LQDv2f64:
+  case SPU::LQDr128:
+  case SPU::LQDr64:
+  case SPU::LQDr32:
+  case SPU::LQDr16:
+  case SPU::LQAv16i8:
+  case SPU::LQAv8i16:
+  case SPU::LQAv4i32:
+  case SPU::LQAv4f32:
+  case SPU::LQAv2f64:
+  case SPU::LQAr128:
+  case SPU::LQAr64:
+  case SPU::LQAr32:
+  case SPU::LQXv4i32:
+  case SPU::LQXr128:
+  case SPU::LQXr64:
+  case SPU::LQXr32:
+  case SPU::LQXr16:
+  case SPU::STQDv16i8:
+  case SPU::STQDv8i16:
+  case SPU::STQDv4i32:
+  case SPU::STQDv4f32:
+  case SPU::STQDv2f64:
+  case SPU::STQDr128:
+  case SPU::STQDr64:
+  case SPU::STQDr32:
+  case SPU::STQDr16:
+  case SPU::STQDr8:
+  case SPU::STQAv16i8:
+  case SPU::STQAv8i16:
+  case SPU::STQAv4i32:
+  case SPU::STQAv4f32:
+  case SPU::STQAv2f64:
+  case SPU::STQAr128:
+  case SPU::STQAr64:
+  case SPU::STQAr32:
+  case SPU::STQAr16:
+  case SPU::STQAr8:
+  case SPU::STQXv16i8:
+  case SPU::STQXv8i16:
+  case SPU::STQXv4i32:
+  case SPU::STQXv4f32:
+  case SPU::STQXv2f64:
+  case SPU::STQXr128:
+  case SPU::STQXr64:
+  case SPU::STQXr32:
+  case SPU::STQXr16:
+  case SPU::STQXr8:
+  case SPU::RET:
+    mustBeOdd = true;
+    break;
+  default:
+    // Assume that this instruction can be on the even pipe
+    break;
+  }
+
+  if (mustBeOdd && !EvenOdd)
+    retval = Hazard;
+
+  DEBUG(errs() << "SPUHazardRecognizer EvenOdd " << EvenOdd << " Hazard "
+               << retval << "\n");
+  EvenOdd ^= 1;
+  return retval;
+#else
+  return NoHazard;
+#endif
+}
+
+// No per-instruction bookkeeping is performed; intentionally a no-op.
+void SPUHazardRecognizer::EmitInstruction(SUnit *SU)
+{
+}
+
+// Advance to the next scheduling cycle; currently only emits a debug trace.
+void SPUHazardRecognizer::AdvanceCycle()
+{
+  DEBUG(errs() << "SPUHazardRecognizer::AdvanceCycle\n");
+}
+
+// Emitting a no-op consumes an issue slot like any other instruction, so it
+// is modeled simply as a cycle advance.
+void SPUHazardRecognizer::EmitNoop() {
+  AdvanceCycle();
+}
diff --git a/lib/Target/CellSPU/SPUHazardRecognizers.h b/lib/Target/CellSPU/SPUHazardRecognizers.h
new file mode 100644
index 0000000..d0ae2d8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUHazardRecognizers.h
@@ -0,0 +1,41 @@
+//===-- SPUHazardRecognizers.h - Cell SPU Hazard Recognizer -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on the Cell SPU
+// processor.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPUHAZRECS_H
+#define SPUHAZRECS_H
+
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+
+namespace llvm {
+
+class TargetInstrInfo;
+  
+/// SPUHazardRecognizer - Cell SPU pipeline hazard recognizer.  Currently a
+/// minimal implementation: getHazardType always reports NoHazard (see
+/// SPUHazardRecognizers.cpp).
+class SPUHazardRecognizer : public ScheduleHazardRecognizer
+{
+private:
+  const TargetInstrInfo &TII;
+  int EvenOdd;   // even/odd pipeline parity (only used by disabled code)
+
+public:
+  SPUHazardRecognizer(const TargetInstrInfo &TII);
+  virtual HazardType getHazardType(SUnit *SU);
+  virtual void EmitInstruction(SUnit *SU);
+  virtual void AdvanceCycle();
+  virtual void EmitNoop();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
new file mode 100644
index 0000000..80693e1
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -0,0 +1,1248 @@
+//===-- SPUISelDAGToDAG.cpp - CellSPU pattern matching inst selector ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for the Cell SPU,
+// converting from a legalized dag to a SPU-target dag.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPUTargetMachine.h"
+#include "SPUISelLowering.h"
+#include "SPUHazardRecognizers.h"
+#include "SPUFrameInfo.h"
+#include "SPURegisterNames.h"
+#include "SPUTargetMachine.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Constants.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+  //! ConstantSDNode predicate for i64 sign-extended, 10-bit immediates
+  bool
+  isI64IntS10Immediate(ConstantSDNode *CN)
+  {
+    return isS10Constant(CN->getSExtValue());
+  }
+
+  //! ConstantSDNode predicate for i32 sign-extended, 10-bit immediates
+  bool
+  isI32IntS10Immediate(ConstantSDNode *CN)
+  {
+    // True iff the sign-extended value fits in a signed 10-bit field.
+    return isS10Constant(CN->getSExtValue());
+  }
+
+  //! ConstantSDNode predicate for i32 unsigned 10-bit immediate values
+  bool
+  isI32IntU10Immediate(ConstantSDNode *CN)
+  {
+    // NOTE(review): the *sign-extended* value is tested against the unsigned
+    // 10-bit range; verify this is the intended behavior for negative i32
+    // constants.
+    return isU10Constant(CN->getSExtValue());
+  }
+
+  //! ConstantSDNode predicate for i16 sign-extended, 10-bit immediate values
+  bool
+  isI16IntS10Immediate(ConstantSDNode *CN)
+  {
+    // True iff the sign-extended value fits in a signed 10-bit field.
+    return isS10Constant(CN->getSExtValue());
+  }
+
+  //! SDNode predicate: true iff N is a constant that fits a signed 10-bit
+  //! i16 immediate field.
+  bool
+  isI16IntS10Immediate(SDNode *N)
+  {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N))
+      return isI16IntS10Immediate(CN);
+    return false;
+  }
+
+  //! ConstantSDNode predicate for i16 unsigned 10-bit immediate values
+  bool
+  isI16IntU10Immediate(ConstantSDNode *CN)
+  {
+    // Truncate to 16 bits before testing the unsigned 10-bit range.
+    return isU10Constant((short) CN->getZExtValue());
+  }
+
+  //! SDNode predicate for i16 unsigned, 10-bit immediate values
+  bool
+  isI16IntU10Immediate(SDNode *N)
+  {
+    return (N->getOpcode() == ISD::Constant
+            && isI16IntU10Immediate(cast<ConstantSDNode>(N)));
+  }
+
+  //! ConstantSDNode predicate for signed 16-bit values
+  /*!
+    \arg CN The constant SelectionDAG node holding the value
+    \arg Imm The returned 16-bit value, if returning true
+
+    This predicate tests the value in \a CN to see whether it can be
+    represented as a 16-bit, sign-extended quantity. Returns true if
+    this is the case.
+   */
+  bool
+  isIntS16Immediate(ConstantSDNode *CN, short &Imm)
+  {
+    EVT vt = CN->getValueType(0);
+    // Imm is always set to the low 16 bits of the constant; it is only
+    // meaningful when this predicate returns true.
+    Imm = (short) CN->getZExtValue();
+    if (vt.getSimpleVT() >= MVT::i1 && vt.getSimpleVT() <= MVT::i16) {
+      // Values of width i1..i16 trivially fit.
+      return true;
+    } else if (vt == MVT::i32) {
+      // An i32 fits iff sign-extending its low 16 bits reproduces it.
+      int32_t i_val = (int32_t) CN->getZExtValue();
+      short s_val = (short) i_val;
+      return i_val == s_val;
+    } else {
+      // Same round-trip test for the remaining (wider) integer types.
+      int64_t i_val = (int64_t) CN->getZExtValue();
+      short s_val = (short) i_val;
+      return i_val == s_val;
+    }
+    // NOTE: the original trailing "return false;" was unreachable (every
+    // branch of the if/else chain returns) and has been removed.
+  }
+
+  //! SDNode predicate for signed 16-bit values.
+  /*!
+    Forwards to the ConstantSDNode overload when \a N is an ISD::Constant;
+    otherwise returns false and leaves \a Imm untouched.
+   */
+  bool
+  isIntS16Immediate(SDNode *N, short &Imm)
+  {
+    return (N->getOpcode() == ISD::Constant
+            && isIntS16Immediate(cast<ConstantSDNode>(N), Imm));
+  }
+
+  //! ConstantFPSDNode predicate for representing floats as 16-bit sign ext.
+  /*!
+    Returns true (and sets \a Imm) iff \a FPN is an f32 whose bit pattern
+    survives truncation to 16 bits followed by sign extension.
+   */
+  static bool
+  isFPS16Immediate(ConstantFPSDNode *FPN, short &Imm)
+  {
+    EVT vt = FPN->getValueType(0);
+    if (vt == MVT::f32) {
+      // Compare the full 32-bit pattern with its sign-extended low 16 bits.
+      int val = FloatToBits(FPN->getValueAPF().convertToFloat());
+      int sval = (int) ((val << 16) >> 16);
+      Imm = (short) val;
+      return val == sval;
+    }
+
+    // Only f32 constants are handled.
+    return false;
+  }
+
+  //! True iff Op is a SPU indirect address built from a (Hi, Lo) pair,
+  //! in either operand order.
+  bool
+  isHighLow(const SDValue &Op)
+  {
+    return (Op.getOpcode() == SPUISD::IndirectAddr
+            && ((Op.getOperand(0).getOpcode() == SPUISD::Hi
+                 && Op.getOperand(1).getOpcode() == SPUISD::Lo)
+                || (Op.getOperand(0).getOpcode() == SPUISD::Lo
+                    && Op.getOperand(1).getOpcode() == SPUISD::Hi)));
+  }
+
+  //===------------------------------------------------------------------===//
+  //! EVT to "useful stuff" mapping structure:
+
+  struct valtype_map_s {
+    EVT VT;
+    unsigned ldresult_ins;      //!< LDRESULT instruction (0 = undefined)
+    bool ldresult_imm;          //!< LDRESULT instruction requires immediate?
+    unsigned lrinst;            //!< LR instruction
+  };
+
+  // Table of per-type instruction opcodes; vector types only have an LR
+  // (register move) entry, no LDRESULT form.
+  const valtype_map_s valtype_map[] = {
+    { MVT::i8,    SPU::ORBIr8,  true,  SPU::LRr8 },
+    { MVT::i16,   SPU::ORHIr16, true,  SPU::LRr16 },
+    { MVT::i32,   SPU::ORIr32,  true,  SPU::LRr32 },
+    { MVT::i64,   SPU::ORr64,   false, SPU::LRr64 },
+    { MVT::f32,   SPU::ORf32,   false, SPU::LRf32 },
+    { MVT::f64,   SPU::ORf64,   false, SPU::LRf64 },
+    // vector types... (sigh!)
+    { MVT::v16i8, 0,            false, SPU::LRv16i8 },
+    { MVT::v8i16, 0,            false, SPU::LRv8i16 },
+    { MVT::v4i32, 0,            false, SPU::LRv4i32 },
+    { MVT::v2i64, 0,            false, SPU::LRv2i64 },
+    { MVT::v4f32, 0,            false, SPU::LRv4f32 },
+    { MVT::v2f64, 0,            false, SPU::LRv2f64 }
+  };
+
+  // Number of entries in valtype_map.
+  const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
+
+  //! Look up the valtype_map entry for \a VT.
+  /*!
+    Linear search over valtype_map.  With assertions enabled, a missing
+    entry aborts via llvm_report_error; in NDEBUG builds the caller receives
+    a null pointer and must handle it.
+   */
+  const valtype_map_s *getValueTypeMapEntry(EVT VT)
+  {
+    const valtype_map_s *retval = 0;
+    for (size_t i = 0; i < n_valtype_map; ++i) {
+      if (valtype_map[i].VT == VT) {
+        retval = valtype_map + i;
+        break;
+      }
+    }
+
+
+#ifndef NDEBUG
+    if (retval == 0) {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "SPUISelDAGToDAG.cpp: getValueTypeMapEntry returns NULL for "
+           << VT.getEVTString();
+      llvm_report_error(Msg.str());
+    }
+#endif
+
+    return retval;
+  }
+
+  //! Generate the carry-generate shuffle mask.
+  SDValue getCarryGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
+    SmallVector<SDValue, 16 > ShufBytes;
+
+    // Create the shuffle mask for "rotating" the carry up one register slot
+    // once the carry is generated.
+    ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x80808080, MVT::i32));
+
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                       &ShufBytes[0], ShufBytes.size());
+  }
+
+  //! Generate the borrow-generate shuffle mask
+  SDValue getBorrowGenerateShufMask(SelectionDAG &DAG, DebugLoc dl) {
+    SmallVector<SDValue, 16 > ShufBytes;
+
+    // Create the shuffle mask for "rotating" the borrow up one register slot
+    // once the borrow is generated.  (Differs from the carry mask only in
+    // using 0xc0 rather than 0x80 filler bytes.)
+    ShufBytes.push_back(DAG.getConstant(0x04050607, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0x0c0d0e0f, MVT::i32));
+    ShufBytes.push_back(DAG.getConstant(0xc0c0c0c0, MVT::i32));
+
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                       &ShufBytes[0], ShufBytes.size());
+  }
+
+  //===------------------------------------------------------------------===//
+  /// SPUDAGToDAGISel - Cell SPU-specific code to select SPU machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class SPUDAGToDAGISel :
+    public SelectionDAGISel
+  {
+    SPUTargetMachine &TM;
+    SPUTargetLowering &SPUtli;
+    // Cleared at the start of each function; see runOnMachineFunction.
+    unsigned GlobalBaseReg;
+
+  public:
+    explicit SPUDAGToDAGISel(SPUTargetMachine &tm) :
+      SelectionDAGISel(tm),
+      TM(tm),
+      SPUtli(*tm.getTargetLowering())
+    { }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      SelectionDAGISel::runOnMachineFunction(MF);
+      return true;
+    }
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(uint32_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDValue getSmallIPtrImm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, SPUtli.getPointerTy());
+      }
+
+    //! Select a BUILD_VECTOR node.
+    /*!
+      If the vector can be represented as one of the CellSPU immediate-form
+      instructions (tested via the SPU::get_* selection predicates), select
+      it directly; otherwise spill the constant to the constant pool and
+      select a load from the pool entry.
+     */
+    SDNode *emitBuildVector(SDNode *bvNode) {
+      EVT vecVT = bvNode->getValueType(0);
+      EVT eltVT = vecVT.getVectorElementType();
+      DebugLoc dl = bvNode->getDebugLoc();
+
+      // Check to see if this vector can be represented as a CellSPU immediate
+      // constant by invoking all of the instruction selection predicates:
+      if (((vecVT == MVT::v8i16) &&
+           (SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i16).getNode() != 0)) ||
+          ((vecVT == MVT::v4i32) &&
+           ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+            (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+            (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i32).getNode() != 0) ||
+            (SPU::get_v4i32_imm(bvNode, *CurDAG).getNode() != 0))) ||
+          ((vecVT == MVT::v2i64) &&
+           ((SPU::get_vec_i16imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
+            (SPU::get_ILHUvec_imm(bvNode, *CurDAG, MVT::i64).getNode() != 0) ||
+            (SPU::get_vec_u18imm(bvNode, *CurDAG, MVT::i64).getNode() != 0))))
+        return Select(bvNode);
+
+      // No, need to emit a constant pool spill:
+      std::vector<Constant*> CV;
+
+      for (size_t i = 0; i < bvNode->getNumOperands(); ++i) {
+        ConstantSDNode *V = dyn_cast<ConstantSDNode > (bvNode->getOperand(i));
+        CV.push_back(const_cast<ConstantInt *> (V->getConstantIntValue()));
+      }
+
+      Constant *CP = ConstantVector::get(CV);
+      SDValue CPIdx = CurDAG->getConstantPool(CP, SPUtli.getPointerTy());
+      unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+      SDValue CGPoolOffset =
+              SPU::LowerConstantPool(CPIdx, *CurDAG,
+                                     SPUtli.getSPUTargetMachine());
+      return SelectCode(CurDAG->getLoad(vecVT, dl,
+                                        CurDAG->getEntryNode(), CGPoolOffset,
+                                        PseudoSourceValue::getConstantPool(), 0,
+                                        false, Alignment).getNode());
+    }
+
+    /// Select - Convert the specified operand from a target-independent to a
+    /// target-specific node if it hasn't already been changed.
+    SDNode *Select(SDNode *N);
+
+    //! Emit the instruction sequence for i64 shl
+    SDNode *SelectSHLi64(SDNode *N, EVT OpVT);
+
+    //! Emit the instruction sequence for i64 srl
+    SDNode *SelectSRLi64(SDNode *N, EVT OpVT);
+
+    //! Emit the instruction sequence for i64 sra
+    SDNode *SelectSRAi64(SDNode *N, EVT OpVT);
+
+    //! Emit the necessary sequence for loading i64 constants:
+    SDNode *SelectI64Constant(SDNode *N, EVT OpVT, DebugLoc dl);
+
+    //! Alternate instruction emit sequence for loading i64 constants
+    SDNode *SelectI64Constant(uint64_t i64const, EVT OpVT, DebugLoc dl);
+
+    //! Returns true if the address N is an A-form (local store) address
+    bool SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base,
+                         SDValue &Index);
+
+    //! D-form address predicate
+    bool SelectDFormAddr(SDNode *Op, SDValue N, SDValue &Base,
+                         SDValue &Index);
+
+    /// Alternate D-form address using i7 offset predicate
+    bool SelectDForm2Addr(SDNode *Op, SDValue N, SDValue &Disp,
+                          SDValue &Base);
+
+    /// D-form address selection workhorse
+    bool DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Disp,
+                               SDValue &Base, int minOffset, int maxOffset);
+
+    //! Address predicate if N can be expressed as an indexed [r+r] operation.
+    bool SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base,
+                         SDValue &Index);
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDValue> &OutOps) {
+      SDValue Op0, Op1;
+      switch (ConstraintCode) {
+      default: return true;
+      case 'm':   // memory
+        if (!SelectDFormAddr(Op.getNode(), Op, Op0, Op1)
+            && !SelectAFormAddr(Op.getNode(), Op, Op0, Op1))
+          SelectXFormAddr(Op.getNode(), Op, Op0, Op1);
+        break;
+      case 'o':   // offsetable
+        if (!SelectDFormAddr(Op.getNode(), Op, Op0, Op1)
+            && !SelectAFormAddr(Op.getNode(), Op, Op0, Op1)) {
+          Op0 = Op;
+          Op1 = getSmallIPtrImm(0);
+        }
+        break;
+      case 'v':   // not offsetable
+#if 1
+        llvm_unreachable("InlineAsmMemoryOperand 'v' constraint not handled.");
+#else
+        SelectAddrIdxOnly(Op, Op, Op0, Op1);
+#endif
+        break;
+      }
+
+      OutOps.push_back(Op0);
+      OutOps.push_back(Op1);
+      return false;
+    }
+
+    /// InstructionSelect - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelect();
+
+    virtual const char *getPassName() const {
+      return "Cell SPU DAG->DAG Pattern Instruction Selection";
+    }
+
+    /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+    /// this target when scheduling the DAG.
+    virtual ScheduleHazardRecognizer *CreateTargetHazardRecognizer() {
+      const TargetInstrInfo *II = TM.getInstrInfo();
+      assert(II && "No InstrInfo?");
+      return new SPUHazardRecognizer(*II);
+    }
+
+    // Include the pieces autogenerated from the target description.
+#include "SPUGenDAGISel.inc"
+  };
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void
+SPUDAGToDAGISel::InstructionSelect()
+{
+  // Select target instructions for the DAG.
+  SelectRoot(*CurDAG);
+  // Prune any nodes made dead by instruction selection.
+  CurDAG->RemoveDeadNodes();
+}
+
+/*!
+ \arg Op The ISD instruction operand
+ \arg N The address to be tested
+ \arg Base The base address
+ \arg Index The base address index
+
+ \return true iff \a N is a single-use, wrapped A-form address whose target
+ operand is a constant pool, jump table, or 16-byte-aligned global.
+ */
+bool
+SPUDAGToDAGISel::SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base,
+                    SDValue &Index) {
+  // These match the addr256k operand type:
+  EVT OffsVT = MVT::i16;
+  SDValue Zero = CurDAG->getTargetConstant(0, OffsVT);
+
+  switch (N.getOpcode()) {
+  case ISD::Constant:
+  case ISD::ConstantPool:
+  case ISD::GlobalAddress:
+    llvm_report_error("SPU SelectAFormAddr: Constant/Pool/Global not lowered.");
+    /*NOTREACHED*/
+
+  case ISD::TargetConstant:
+  case ISD::TargetGlobalAddress:
+  case ISD::TargetJumpTable:
+    llvm_report_error("SPUSelectAFormAddr: Target Constant/Pool/Global "
+                      "not wrapped as A-form address.");
+    /*NOTREACHED*/
+
+  case SPUISD::AFormAddr:
+    // Just load from memory if there's only a single use of the location,
+    // otherwise, this will get handled below with D-form offset addresses
+    if (N.hasOneUse()) {
+      SDValue Op0 = N.getOperand(0);
+      switch (Op0.getOpcode()) {
+      case ISD::TargetConstantPool:
+      case ISD::TargetJumpTable:
+        Base = Op0;
+        Index = Zero;
+        return true;
+
+      case ISD::TargetGlobalAddress: {
+        GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op0);
+        GlobalValue *GV = GSDN->getGlobal();
+        // Globals are only directly addressable when 16-byte aligned.
+        if (GV->getAlignment() == 16) {
+          Base = Op0;
+          Index = Zero;
+          return true;
+        }
+        break;
+      }
+      }
+    }
+    break;
+  }
+  return false;
+}
+
+/// Alternate D-form address predicate: accept only displacements that fit
+/// in a signed 7-bit immediate field.
+bool
+SPUDAGToDAGISel::SelectDForm2Addr(SDNode *Op, SDValue N, SDValue &Disp,
+                                  SDValue &Base) {
+  // Signed 7-bit range: [-(1 << 7), (1 << 7) - 1] = [-128, 127].
+  const int lowerBound = -(1 << 7);
+  const int upperBound = (1 << 7) - 1;
+  return DFormAddressPredicate(Op, N, Disp, Base, lowerBound, upperBound);
+}
+
+/*!
+  \arg Op The ISD instruction (ignored)
+  \arg N The address to be tested
+  \arg Base Base address register/pointer
+  \arg Index Base address index
+
+  Examine the input address by a base register plus a signed 10-bit
+  displacement, [r+I10] (D-form address).
+
+  \return true if \a N is a D-form address with \a Base and \a Index set
+  to non-empty SDValue instances.
+*/
+bool
+SPUDAGToDAGISel::SelectDFormAddr(SDNode *Op, SDValue N, SDValue &Base,
+                                 SDValue &Index) {
+  // Use the full signed 10-bit frame offset range (scaled by the 16-byte
+  // stack slot size; see SPUFrameInfo).
+  return DFormAddressPredicate(Op, N, Base, Index,
+                               SPUFrameInfo::minFrameOffset(),
+                               SPUFrameInfo::maxFrameOffset());
+}
+
+/*!
+  Workhorse for the D-form address predicates: returns true -- setting
+  \a Base to a target-constant offset and \a Index to the base pointer --
+  when \a N can be expressed as [r + offset] with offset within
+  [minOffset, maxOffset].  (The exact bound checks vary slightly by case;
+  see the review notes below.)
+*/
+bool
+SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base,
+                                      SDValue &Index, int minOffset,
+                                      int maxOffset) {
+  unsigned Opc = N.getOpcode();
+  EVT PtrTy = SPUtli.getPointerTy();
+
+  if (Opc == ISD::FrameIndex) {
+    // Stack frame index must be less than 512 (divided by 16):
+    FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N);
+    int FI = int(FIN->getIndex());
+    DEBUG(errs() << "SelectDFormAddr: ISD::FrameIndex = "
+               << FI << "\n");
+    if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
+      Base = CurDAG->getTargetConstant(0, PtrTy);
+      Index = CurDAG->getTargetFrameIndex(FI, PtrTy);
+      return true;
+    }
+  } else if (Opc == ISD::ADD) {
+    // Generated by getelementptr
+    const SDValue Op0 = N.getOperand(0);
+    const SDValue Op1 = N.getOperand(1);
+
+    if ((Op0.getOpcode() == SPUISD::Hi && Op1.getOpcode() == SPUISD::Lo)
+        || (Op1.getOpcode() == SPUISD::Hi && Op0.getOpcode() == SPUISD::Lo)) {
+      Base = CurDAG->getTargetConstant(0, PtrTy);
+      Index = N;
+      return true;
+    } else if (Op1.getOpcode() == ISD::Constant
+               || Op1.getOpcode() == ISD::TargetConstant) {
+      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1);
+      int32_t offset = int32_t(CN->getSExtValue());
+
+      if (Op0.getOpcode() == ISD::FrameIndex) {
+        FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op0);
+        int FI = int(FIN->getIndex());
+        DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset
+                   << " frame index = " << FI << "\n");
+
+        if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
+          Base = CurDAG->getTargetConstant(offset, PtrTy);
+          Index = CurDAG->getTargetFrameIndex(FI, PtrTy);
+          return true;
+        }
+      // NOTE(review): this range test is exclusive, while the
+      // SPUISD::IndirectAddr path below uses an inclusive test -- confirm
+      // which is intended at the boundaries.
+      } else if (offset > minOffset && offset < maxOffset) {
+        Base = CurDAG->getTargetConstant(offset, PtrTy);
+        Index = Op0;
+        return true;
+      }
+    } else if (Op0.getOpcode() == ISD::Constant
+               || Op0.getOpcode() == ISD::TargetConstant) {
+      // Mirror image of the case above: constant on the left-hand side.
+      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op0);
+      int32_t offset = int32_t(CN->getSExtValue());
+
+      if (Op1.getOpcode() == ISD::FrameIndex) {
+        FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Op1);
+        int FI = int(FIN->getIndex());
+        DEBUG(errs() << "SelectDFormAddr: ISD::ADD offset = " << offset
+                   << " frame index = " << FI << "\n");
+
+        if (SPUFrameInfo::FItoStackOffset(FI) < maxOffset) {
+          Base = CurDAG->getTargetConstant(offset, PtrTy);
+          Index = CurDAG->getTargetFrameIndex(FI, PtrTy);
+          return true;
+        }
+      } else if (offset > minOffset && offset < maxOffset) {
+        Base = CurDAG->getTargetConstant(offset, PtrTy);
+        Index = Op1;
+        return true;
+      }
+    }
+  } else if (Opc == SPUISD::IndirectAddr) {
+    // Indirect with constant offset -> D-Form address
+    const SDValue Op0 = N.getOperand(0);
+    const SDValue Op1 = N.getOperand(1);
+
+    if (Op0.getOpcode() == SPUISD::Hi
+        && Op1.getOpcode() == SPUISD::Lo) {
+      // (SPUindirect (SPUhi <arg>, 0), (SPUlo <arg>, 0))
+      Base = CurDAG->getTargetConstant(0, PtrTy);
+      Index = N;
+      return true;
+    } else if (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1)) {
+      int32_t offset = 0;
+      SDValue idxOp;
+
+      // Whichever operand is constant supplies the offset; the other is
+      // the index.
+      if (isa<ConstantSDNode>(Op1)) {
+        ConstantSDNode *CN = cast<ConstantSDNode>(Op1);
+        offset = int32_t(CN->getSExtValue());
+        idxOp = Op0;
+      } else if (isa<ConstantSDNode>(Op0)) {
+        ConstantSDNode *CN = cast<ConstantSDNode>(Op0);
+        offset = int32_t(CN->getSExtValue());
+        idxOp = Op1;
+      }
+
+      if (offset >= minOffset && offset <= maxOffset) {
+        Base = CurDAG->getTargetConstant(offset, PtrTy);
+        Index = idxOp;
+        return true;
+      }
+    }
+  } else if (Opc == SPUISD::AFormAddr) {
+    Base = CurDAG->getTargetConstant(0, N.getValueType());
+    Index = N;
+    return true;
+  } else if (Opc == SPUISD::LDRESULT) {
+    Base = CurDAG->getTargetConstant(0, N.getValueType());
+    Index = N;
+    return true;
+  } else if (Opc == ISD::Register || Opc == ISD::CopyFromReg) {
+    unsigned OpOpc = Op->getOpcode();
+
+    if (OpOpc == ISD::STORE || OpOpc == ISD::LOAD) {
+      // Direct load/store without getelementptr
+      SDValue Addr, Offs;
+
+      // Get the register from CopyFromReg
+      if (Opc == ISD::CopyFromReg)
+        Addr = N.getOperand(1);
+      else
+        Addr = N;                       // Register
+
+      // The offset operand position depends on the node kind: operand 3
+      // for stores, operand 2 for loads.
+      Offs = ((OpOpc == ISD::STORE) ? Op->getOperand(3) : Op->getOperand(2));
+
+      if (Offs.getOpcode() == ISD::Constant || Offs.getOpcode() == ISD::UNDEF) {
+        if (Offs.getOpcode() == ISD::UNDEF)
+          Offs = CurDAG->getTargetConstant(0, Offs.getValueType());
+
+        Base = Offs;
+        Index = Addr;
+        return true;
+      }
+    } else {
+      /* If otherwise unadorned, default to D-form address with 0 offset: */
+      if (Opc == ISD::CopyFromReg) {
+        Index = N.getOperand(1);
+      } else {
+        Index = N;
+      }
+
+      Base = CurDAG->getTargetConstant(0, Index.getValueType());
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/*!
+  \arg Op The ISD instruction operand
+  \arg N The address operand
+  \arg Base The base pointer operand
+  \arg Index The offset/index operand
+
+  If the address \a N can be expressed as an A-form or D-form address, returns
+  false.  Otherwise, creates two operands, Base and Index that will become the
+  (r)(r) X-form address.
+*/
+bool
+SPUDAGToDAGISel::SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base,
+                                 SDValue &Index) {
+  // Prefer the simpler A-form or D-form addressing modes when either one
+  // matches; in that case no X-form operands are produced.
+  if (SelectAFormAddr(Op, N, Base, Index)
+      || SelectDFormAddr(Op, N, Base, Index))
+    return false;
+
+  // Neither form matched: fall back to an X-form (reg)(reg) address.
+  Base = N.getOperand(1);
+  Index = N.getOperand(0);
+  return true;
+}
+
+//! Convert the operand from a target-independent to a target-specific node
+/*!
+ Main instruction-selection entry point.  Custom-selects the node kinds
+ that need special CellSPU handling (frame indices, i64 constants and
+ zero/any-extensions, the 64-bit add/sub/mul marker nodes, i64 shifts,
+ f64/v2f64 fneg/fabs, LDRESULT and SPUindirect addresses); every other
+ node falls through to the tblgen-generated SelectCode().
+ */
+SDNode *
+SPUDAGToDAGISel::Select(SDNode *N) {
+  unsigned Opc = N->getOpcode();
+  int n_ops = -1;                  // > 0 => emit NewOpc/Ops at the bottom
+  unsigned NewOpc;
+  EVT OpVT = N->getValueType(0);
+  SDValue Ops[8];
+  DebugLoc dl = N->getDebugLoc();
+
+  if (N->isMachineOpcode()) {
+    return NULL;   // Already selected.
+  }
+
+  if (Opc == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+    SDValue Imm0 = CurDAG->getTargetConstant(0, N->getValueType(0));
+
+    if (FI < 128) {
+      // Small frame index: emit an add-immediate on the frame index.
+      NewOpc = SPU::AIr32;
+      Ops[0] = TFI;
+      Ops[1] = Imm0;
+      n_ops = 2;
+    } else {
+      // Large frame index: materialize the address with ILA, then add it
+      // to the stack pointer (R1).
+      NewOpc = SPU::Ar32;
+      Ops[0] = CurDAG->getRegister(SPU::R1, N->getValueType(0));
+      Ops[1] = SDValue(CurDAG->getMachineNode(SPU::ILAr32, dl,
+                                              N->getValueType(0), TFI, Imm0),
+                       0);
+      n_ops = 2;
+    }
+  } else if (Opc == ISD::Constant && OpVT == MVT::i64) {
+    // Catch the i64 constants that end up here. Note: The backend doesn't
+    // attempt to legalize the constant (it's useless because DAGCombiner
+    // will insert 64-bit constants and we can't stop it).
+    return SelectI64Constant(N, OpVT, N->getDebugLoc());
+  } else if ((Opc == ISD::ZERO_EXTEND || Opc == ISD::ANY_EXTEND)
+             && OpVT == MVT::i64) {
+    SDValue Op0 = N->getOperand(0);
+    EVT Op0VT = Op0.getValueType();
+    EVT Op0VecVT = EVT::getVectorVT(*CurDAG->getContext(),
+                                    Op0VT, (128 / Op0VT.getSizeInBits()));
+    EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(), 
+                                   OpVT, (128 / OpVT.getSizeInBits()));
+    SDValue shufMask;
+
+    // Build a SHUFB control word keyed off the source width; the 0x80
+    // bytes in the masks select zero bytes, which implements the
+    // zero-extension part of the shuffle.
+    switch (Op0VT.getSimpleVT().SimpleTy) {
+    default:
+      llvm_report_error("CellSPU Select: Unhandled zero/any extend EVT");
+      /*NOTREACHED*/
+    case MVT::i32:
+      shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                 CurDAG->getConstant(0x80808080, MVT::i32),
+                                 CurDAG->getConstant(0x00010203, MVT::i32),
+                                 CurDAG->getConstant(0x80808080, MVT::i32),
+                                 CurDAG->getConstant(0x08090a0b, MVT::i32));
+      break;
+
+    case MVT::i16:
+      shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                 CurDAG->getConstant(0x80808080, MVT::i32),
+                                 CurDAG->getConstant(0x80800203, MVT::i32),
+                                 CurDAG->getConstant(0x80808080, MVT::i32),
+                                 CurDAG->getConstant(0x80800a0b, MVT::i32));
+      break;
+
+    case MVT::i8:
+      shufMask = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                 CurDAG->getConstant(0x80808080, MVT::i32),
+                                 CurDAG->getConstant(0x80808003, MVT::i32),
+                                 CurDAG->getConstant(0x80808080, MVT::i32),
+                                 CurDAG->getConstant(0x8080800b, MVT::i32));
+      break;
+    }
+
+    SDNode *shufMaskLoad = emitBuildVector(shufMask.getNode());
+    // Promote the scalar into a vector so SHUFB can operate on it.
+    SDNode *PromoteScalar =
+            SelectCode(CurDAG->getNode(SPUISD::PREFSLOT2VEC, dl,
+                                       Op0VecVT, Op0).getNode());
+
+    SDValue zextShuffle =
+            CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT,
+                            SDValue(PromoteScalar, 0),
+                            SDValue(PromoteScalar, 0),
+                            SDValue(shufMaskLoad, 0));
+
+    // N.B.: BIT_CONVERT replaces and updates the zextShuffle node, so we
+    // re-use it in the VEC2PREFSLOT selection without needing to explicitly
+    // call SelectCode (it's already done for us.)
+    SelectCode(CurDAG->getNode(ISD::BIT_CONVERT, dl, OpVecVT, zextShuffle).getNode());
+    return SelectCode(CurDAG->getNode(SPUISD::VEC2PREFSLOT, dl, OpVT,
+                                      zextShuffle).getNode());
+  } else if (Opc == ISD::ADD && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    // 64-bit add: pre-load the carry-generate shuffle mask and let the
+    // ADD64_MARKER pattern consume it.
+    SDNode *CGLoad =
+            emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl).getNode());
+
+    return SelectCode(CurDAG->getNode(SPUISD::ADD64_MARKER, dl, OpVT,
+                                      N->getOperand(0), N->getOperand(1),
+                                      SDValue(CGLoad, 0)).getNode());
+  } else if (Opc == ISD::SUB && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    // 64-bit subtract: same scheme with the borrow-generate mask.
+    SDNode *CGLoad =
+            emitBuildVector(getBorrowGenerateShufMask(*CurDAG, dl).getNode());
+
+    return SelectCode(CurDAG->getNode(SPUISD::SUB64_MARKER, dl, OpVT,
+                                      N->getOperand(0), N->getOperand(1),
+                                      SDValue(CGLoad, 0)).getNode());
+  } else if (Opc == ISD::MUL && (OpVT == MVT::i64 || OpVT == MVT::v2i64)) {
+    // 64-bit multiply: also uses the carry-generate mask.
+    SDNode *CGLoad =
+            emitBuildVector(getCarryGenerateShufMask(*CurDAG, dl).getNode());
+
+    return SelectCode(CurDAG->getNode(SPUISD::MUL64_MARKER, dl, OpVT,
+                                      N->getOperand(0), N->getOperand(1),
+                                      SDValue(CGLoad, 0)).getNode());
+  } else if (Opc == ISD::TRUNCATE) {
+    SDValue Op0 = N->getOperand(0);
+    if ((Op0.getOpcode() == ISD::SRA || Op0.getOpcode() == ISD::SRL)
+        && OpVT == MVT::i32
+        && Op0.getValueType() == MVT::i64) {
+      // Catch (truncate:i32 ([sra|srl]:i64 arg, c), where c >= 32
+      //
+      // Take advantage of the fact that the upper 32 bits are in the
+      // i32 preferred slot and avoid shuffle gymnastics:
+      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
+      if (CN != 0) {
+        unsigned shift_amt = unsigned(CN->getZExtValue());
+
+        if (shift_amt >= 32) {
+          SDNode *hi32 =
+                  CurDAG->getMachineNode(SPU::ORr32_r64, dl, OpVT,
+                                         Op0.getOperand(0));
+
+          shift_amt -= 32;
+          if (shift_amt > 0) {
+            // Take care of the additional shift, if present:
+            SDValue shift = CurDAG->getTargetConstant(shift_amt, MVT::i32);
+            unsigned Opc = SPU::ROTMAIr32_i32;   // NOTE: shadows the outer Opc
+
+            if (Op0.getOpcode() == ISD::SRL)
+              Opc = SPU::ROTMr32;
+
+            hi32 = CurDAG->getMachineNode(Opc, dl, OpVT, SDValue(hi32, 0),
+                                          shift);
+          }
+
+          return hi32;
+        }
+      }
+    }
+  } else if (Opc == ISD::SHL) {
+    if (OpVT == MVT::i64) {
+      return SelectSHLi64(N, OpVT);
+    }
+  } else if (Opc == ISD::SRL) {
+    if (OpVT == MVT::i64) {
+      return SelectSRLi64(N, OpVT);
+    }
+  } else if (Opc == ISD::SRA) {
+    if (OpVT == MVT::i64) {
+      return SelectSRAi64(N, OpVT);
+    }
+  } else if (Opc == ISD::FNEG
+             && (OpVT == MVT::f64 || OpVT == MVT::v2f64)) {
+    DebugLoc dl = N->getDebugLoc();   // NOTE: shadows the function-level dl
+    // Check if the pattern is a special form of DFNMS:
+    // (fneg (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC))
+    SDValue Op0 = N->getOperand(0);
+    if (Op0.getOpcode() == ISD::FSUB) {
+      SDValue Op00 = Op0.getOperand(0);
+      if (Op00.getOpcode() == ISD::FMUL) {
+        unsigned Opc = SPU::DFNMSf64;
+        if (OpVT == MVT::v2f64)
+          Opc = SPU::DFNMSv2f64;
+
+        return CurDAG->getMachineNode(Opc, dl, OpVT,
+                                      Op00.getOperand(0),
+                                      Op00.getOperand(1),
+                                      Op0.getOperand(1));
+      }
+    }
+
+    // General case: flip the sign bit(s) by XOR-ing with an
+    // 0x8000000000000000 mask (splatted for the vector form).
+    SDValue negConst = CurDAG->getConstant(0x8000000000000000ULL, MVT::i64);
+    SDNode *signMask = 0;
+    unsigned Opc = SPU::XORfneg64;
+
+    if (OpVT == MVT::f64) {
+      signMask = SelectI64Constant(negConst.getNode(), MVT::i64, dl);
+    } else if (OpVT == MVT::v2f64) {
+      Opc = SPU::XORfnegvec;
+      signMask = emitBuildVector(CurDAG->getNode(ISD::BUILD_VECTOR, dl,
+                                                 MVT::v2i64,
+                                                 negConst, negConst).getNode());
+    }
+
+    return CurDAG->getMachineNode(Opc, dl, OpVT,
+                                  N->getOperand(0), SDValue(signMask, 0));
+  } else if (Opc == ISD::FABS) {
+    // fabs: clear the sign bit(s) by AND-ing with 0x7fffffffffffffff.
+    if (OpVT == MVT::f64) {
+      SDNode *signMask = SelectI64Constant(0x7fffffffffffffffULL, MVT::i64, dl);
+      return CurDAG->getMachineNode(SPU::ANDfabs64, dl, OpVT,
+                                    N->getOperand(0), SDValue(signMask, 0));
+    } else if (OpVT == MVT::v2f64) {
+      SDValue absConst = CurDAG->getConstant(0x7fffffffffffffffULL, MVT::i64);
+      SDValue absVec = CurDAG->getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
+                                       absConst, absConst);
+      SDNode *signMask = emitBuildVector(absVec.getNode());
+      return CurDAG->getMachineNode(SPU::ANDfabsvec, dl, OpVT,
+                                    N->getOperand(0), SDValue(signMask, 0));
+    }
+  } else if (Opc == SPUISD::LDRESULT) {
+    // Custom select instructions for LDRESULT
+    EVT VT = N->getValueType(0);
+    SDValue Arg = N->getOperand(0);
+    SDValue Chain = N->getOperand(1);
+    SDNode *Result;
+    const valtype_map_s *vtm = getValueTypeMapEntry(VT);
+
+    if (vtm->ldresult_ins == 0) {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "LDRESULT for unsupported type: "
+           << VT.getEVTString();
+      llvm_report_error(Msg.str());
+    }
+
+    Opc = vtm->ldresult_ins;
+    if (vtm->ldresult_imm) {
+      // Immediate-form instruction: second operand is a zero immediate.
+      SDValue Zero = CurDAG->getTargetConstant(0, VT);
+
+      Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Zero, Chain);
+    } else {
+      Result = CurDAG->getMachineNode(Opc, dl, VT, MVT::Other, Arg, Arg, Chain);
+    }
+
+    return Result;
+  } else if (Opc == SPUISD::IndirectAddr) {
+    // Look at the operands: SelectCode() will catch the cases that aren't
+    // specifically handled here.
+    //
+    // SPUInstrInfo catches the following patterns:
+    // (SPUindirect (SPUhi ...), (SPUlo ...))
+    // (SPUindirect $sp, imm)
+    EVT VT = N->getValueType(0);
+    SDValue Op0 = N->getOperand(0);
+    SDValue Op1 = N->getOperand(1);
+    RegisterSDNode *RN;
+
+    if ((Op0.getOpcode() != SPUISD::Hi && Op1.getOpcode() != SPUISD::Lo)
+        || (Op0.getOpcode() == ISD::Register
+            && ((RN = dyn_cast<RegisterSDNode>(Op0.getNode())) != 0
+                && RN->getReg() != SPU::R1))) {
+      NewOpc = SPU::Ar32;
+      if (Op1.getOpcode() == ISD::Constant) {
+        // Constant offset: use the add-immediate form when it fits a
+        // signed 10-bit field.
+        ConstantSDNode *CN = cast<ConstantSDNode>(Op1);
+        Op1 = CurDAG->getTargetConstant(CN->getSExtValue(), VT);
+        NewOpc = (isI32IntS10Immediate(CN) ? SPU::AIr32 : SPU::Ar32);
+      }
+      Ops[0] = Op0;
+      Ops[1] = Op1;
+      n_ops = 2;
+    }
+  }
+
+  if (n_ops > 0) {
+    // Morph the node in place when it has a single use; otherwise create a
+    // fresh machine node and leave the original intact.
+    if (N->hasOneUse())
+      return CurDAG->SelectNodeTo(N, NewOpc, OpVT, Ops, n_ops);
+    else
+      return CurDAG->getMachineNode(NewOpc, dl, OpVT, Ops, n_ops);
+  } else
+    return SelectCode(N);
+}
+
+/*!
+ * Emit the instruction sequence for i64 left shifts. The basic algorithm
+ * is to fill the bottom two word slots with zeros so that zeros are shifted
+ * in as the entire quadword is shifted left.
+ *
+ * \note This code could also be used to implement v2i64 shl.
+ *
+ * @param N The shl node (operand 0 is the value, operand 1 the shift amount)
+ * @param OpVT Op's machine value value type (doesn't need to be passed, but
+ * makes life easier.)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSHLi64(SDNode *N, EVT OpVT) {
+  SDValue Op0 = N->getOperand(0);
+  EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), 
+                               OpVT, (128 / OpVT.getSizeInBits()));
+  SDValue ShiftAmt = N->getOperand(1);
+  EVT ShiftAmtVT = ShiftAmt.getValueType();
+  SDNode *VecOp0, *SelMask, *ZeroFill, *Shift = 0;
+  SDValue SelMaskVal;
+  DebugLoc dl = N->getDebugLoc();
+
+  // Promote the i64 into a v2i64, then SELB-merge it with a zero vector
+  // (using the FSMBI 0xff00 byte mask) so the quadword bytes outside the
+  // i64 are zero and zeros are shifted in below.
+  VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
+  SelMaskVal = CurDAG->getTargetConstant(0xff00ULL, MVT::i16);
+  SelMask = CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT, SelMaskVal);
+  ZeroFill = CurDAG->getMachineNode(SPU::ILv2i64, dl, VecVT,
+                                    CurDAG->getTargetConstant(0, OpVT));
+  VecOp0 = CurDAG->getMachineNode(SPU::SELBv2i64, dl, VecVT,
+                                  SDValue(ZeroFill, 0),
+                                  SDValue(VecOp0, 0),
+                                  SDValue(SelMask, 0));
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+    // Constant shift amount: split into whole bytes and residual bits.
+    unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
+    unsigned bits = unsigned(CN->getZExtValue()) & 7;
+
+    if (bytes > 0) {
+      Shift =
+        CurDAG->getMachineNode(SPU::SHLQBYIv2i64, dl, VecVT,
+                               SDValue(VecOp0, 0),
+                               CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+    }
+
+    if (bits > 0) {
+      Shift =
+        CurDAG->getMachineNode(SPU::SHLQBIIv2i64, dl, VecVT,
+                               SDValue((Shift != 0 ? Shift : VecOp0), 0),
+                               CurDAG->getTargetConstant(bits, ShiftAmtVT));
+    }
+    // NOTE(review): a constant shift amount of 0 leaves Shift null here;
+    // presumably DAGCombiner folds shl-by-0 away before selection — verify.
+  } else {
+    // Variable shift amount: derive the byte and bit counts at run time,
+    // then shift the quadword by bytes and by bits in sequence.
+    SDNode *Bytes =
+      CurDAG->getMachineNode(SPU::ROTMIr32, dl, ShiftAmtVT,
+                             ShiftAmt,
+                             CurDAG->getTargetConstant(3, ShiftAmtVT));
+    SDNode *Bits =
+      CurDAG->getMachineNode(SPU::ANDIr32, dl, ShiftAmtVT,
+                             ShiftAmt,
+                             CurDAG->getTargetConstant(7, ShiftAmtVT));
+    Shift =
+      CurDAG->getMachineNode(SPU::SHLQBYv2i64, dl, VecVT,
+                             SDValue(VecOp0, 0), SDValue(Bytes, 0));
+    Shift =
+      CurDAG->getMachineNode(SPU::SHLQBIv2i64, dl, VecVT,
+                             SDValue(Shift, 0), SDValue(Bits, 0));
+  }
+
+  // Move the shifted value back into the i64 preferred slot.
+  return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+}
+
+/*!
+ * Emit the instruction sequence for i64 logical right shifts.
+ *
+ * @param N The srl node (operand 0 is the value, operand 1 the shift amount)
+ * @param OpVT Op's machine value value type (doesn't need to be passed, but
+ * makes life easier.)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSRLi64(SDNode *N, EVT OpVT) {
+  SDValue Op0 = N->getOperand(0);
+  EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(),
+                               OpVT, (128 / OpVT.getSizeInBits()));
+  SDValue ShiftAmt = N->getOperand(1);
+  EVT ShiftAmtVT = ShiftAmt.getValueType();
+  SDNode *VecOp0, *Shift = 0;
+  DebugLoc dl = N->getDebugLoc();
+
+  // Promote the scalar into a v2i64 so the quadword rotate-and-mask
+  // instructions can operate on it.
+  VecOp0 = CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, Op0);
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+    // Constant shift amount: split into whole bytes and residual bits.
+    unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
+    unsigned bits = unsigned(CN->getZExtValue()) & 7;
+
+    if (bytes > 0) {
+      Shift =
+        CurDAG->getMachineNode(SPU::ROTQMBYIv2i64, dl, VecVT,
+                               SDValue(VecOp0, 0),
+                               CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+    }
+
+    if (bits > 0) {
+      Shift =
+        CurDAG->getMachineNode(SPU::ROTQMBIIv2i64, dl, VecVT,
+                               SDValue((Shift != 0 ? Shift : VecOp0), 0),
+                               CurDAG->getTargetConstant(bits, ShiftAmtVT));
+    }
+    // NOTE(review): a constant shift amount of 0 leaves Shift null here;
+    // presumably DAGCombiner folds srl-by-0 away before selection — verify.
+  } else {
+    // Variable shift amount: derive byte and bit counts at run time.
+    SDNode *Bytes =
+      CurDAG->getMachineNode(SPU::ROTMIr32, dl, ShiftAmtVT,
+                             ShiftAmt,
+                             CurDAG->getTargetConstant(3, ShiftAmtVT));
+    SDNode *Bits =
+      CurDAG->getMachineNode(SPU::ANDIr32, dl, ShiftAmtVT,
+                             ShiftAmt,
+                             CurDAG->getTargetConstant(7, ShiftAmtVT));
+
+    // Ensure that the shift amounts are negated!
+    // (SFI computes 0 - operand; ROTQMBY/ROTQMBI take negated counts.)
+    Bytes = CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT,
+                                   SDValue(Bytes, 0),
+                                   CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+    Bits = CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT,
+                                  SDValue(Bits, 0),
+                                  CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+    Shift =
+      CurDAG->getMachineNode(SPU::ROTQMBYv2i64, dl, VecVT,
+                             SDValue(VecOp0, 0), SDValue(Bytes, 0));
+    Shift =
+      CurDAG->getMachineNode(SPU::ROTQMBIv2i64, dl, VecVT,
+                             SDValue(Shift, 0), SDValue(Bits, 0));
+  }
+
+  // Move the shifted value back into the i64 preferred slot.
+  return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+}
+
+/*!
+ * Emit the instruction sequence for i64 arithmetic right shifts.
+ *
+ * @param N The sra node (operand 0 is the value, operand 1 the shift amount)
+ * @param OpVT Op's machine value value type (doesn't need to be passed, but
+ * makes life easier.)
+ * @return The SDNode with the entire instruction sequence
+ */
+SDNode *
+SPUDAGToDAGISel::SelectSRAi64(SDNode *N, EVT OpVT) {
+  // Promote Op0 to vector
+  EVT VecVT = EVT::getVectorVT(*CurDAG->getContext(), 
+                               OpVT, (128 / OpVT.getSizeInBits()));
+  SDValue ShiftAmt = N->getOperand(1);
+  EVT ShiftAmtVT = ShiftAmt.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  SDNode *VecOp0 =
+    CurDAG->getMachineNode(SPU::ORv2i64_i64, dl, VecVT, N->getOperand(0));
+
+  // Derive a sign-extension pattern from the value's sign bit (ROTMAI by
+  // 31, extracted as i32, then fanned out with FSM), so that sign bits —
+  // rather than zeros — enter as the quadword is rotated.
+  SDValue SignRotAmt = CurDAG->getTargetConstant(31, ShiftAmtVT);
+  SDNode *SignRot =
+    CurDAG->getMachineNode(SPU::ROTMAIv2i64_i32, dl, MVT::v2i64,
+                           SDValue(VecOp0, 0), SignRotAmt);
+  SDNode *UpperHalfSign =
+    CurDAG->getMachineNode(SPU::ORi32_v4i32, dl, MVT::i32, SDValue(SignRot, 0));
+
+  SDNode *UpperHalfSignMask =
+    CurDAG->getMachineNode(SPU::FSM64r32, dl, VecVT, SDValue(UpperHalfSign, 0));
+  SDNode *UpperLowerMask =
+    CurDAG->getMachineNode(SPU::FSMBIv2i64, dl, VecVT,
+                           CurDAG->getTargetConstant(0xff00ULL, MVT::i16));
+  // Merge the promoted value with the sign-extension pattern via SELB.
+  SDNode *UpperLowerSelect =
+    CurDAG->getMachineNode(SPU::SELBv2i64, dl, VecVT,
+                           SDValue(UpperHalfSignMask, 0),
+                           SDValue(VecOp0, 0),
+                           SDValue(UpperLowerMask, 0));
+
+  SDNode *Shift = 0;
+
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(ShiftAmt)) {
+    // Constant shift amount: split into whole bytes and residual bits.
+    unsigned bytes = unsigned(CN->getZExtValue()) >> 3;
+    unsigned bits = unsigned(CN->getZExtValue()) & 7;
+
+    if (bytes > 0) {
+      // Convert the right-shift byte count into the equivalent quadword
+      // left-rotate count for ROTQBYI.
+      bytes = 31 - bytes;
+      Shift =
+        CurDAG->getMachineNode(SPU::ROTQBYIv2i64, dl, VecVT,
+                               SDValue(UpperLowerSelect, 0),
+                               CurDAG->getTargetConstant(bytes, ShiftAmtVT));
+    }
+
+    if (bits > 0) {
+      // Likewise for the residual bit count.
+      bits = 8 - bits;
+      Shift =
+        CurDAG->getMachineNode(SPU::ROTQBIIv2i64, dl, VecVT,
+                               SDValue((Shift != 0 ? Shift : UpperLowerSelect), 0),
+                               CurDAG->getTargetConstant(bits, ShiftAmtVT));
+    }
+    // NOTE(review): a constant shift amount of 0 leaves Shift null here;
+    // presumably DAGCombiner folds sra-by-0 away before selection — verify.
+  } else {
+    // Variable shift amount: negate it (SFI computes 0 - operand) and
+    // rotate by bytes, then by bits.
+    SDNode *NegShift =
+      CurDAG->getMachineNode(SPU::SFIr32, dl, ShiftAmtVT,
+                             ShiftAmt, CurDAG->getTargetConstant(0, ShiftAmtVT));
+
+    Shift =
+      CurDAG->getMachineNode(SPU::ROTQBYBIv2i64_r32, dl, VecVT,
+                             SDValue(UpperLowerSelect, 0), SDValue(NegShift, 0));
+    Shift =
+      CurDAG->getMachineNode(SPU::ROTQBIv2i64, dl, VecVT,
+                             SDValue(Shift, 0), SDValue(NegShift, 0));
+  }
+
+  // Move the shifted value back into the i64 preferred slot.
+  return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT, SDValue(Shift, 0));
+}
+
+/*!
+ Materialize an i64 constant node: unwrap the ConstantSDNode and delegate
+ to the value-based overload.
+ */
+SDNode *SPUDAGToDAGISel::SelectI64Constant(SDNode *N, EVT OpVT,
+                                           DebugLoc dl) {
+  return SelectI64Constant(cast<ConstantSDNode>(N)->getZExtValue(), OpVT, dl);
+}
+
+//! Materialize a 64-bit constant.
+/*!
+ Lowers \a Value64 as a v2i64 splat (SPU::LowerV2I64Splat), then walks the
+ subtree that comes back — a degenerate BIT_CONVERT, a SHUFB of two splats,
+ or a plain BUILD_VECTOR — emitting machine nodes for each shape and
+ finally moving the value out of the vector with ORi64_v2i64.
+
+ @param Value64 The 64-bit constant to materialize
+ @param OpVT The scalar value type of the result (i64)
+ @param dl Debug location attached to every emitted node
+ @return The machine node producing the i64 value
+ */
+SDNode *SPUDAGToDAGISel::SelectI64Constant(uint64_t Value64, EVT OpVT,
+                                           DebugLoc dl) {
+  EVT OpVecVT = EVT::getVectorVT(*CurDAG->getContext(), OpVT, 2);
+  SDValue i64vec =
+          SPU::LowerV2I64Splat(OpVecVT, *CurDAG, Value64, dl);
+
+  // Here's where it gets interesting, because we have to parse out the
+  // subtree handed back in i64vec:
+
+  if (i64vec.getOpcode() == ISD::BIT_CONVERT) {
+    // The degenerate case where the upper and lower bits in the splat are
+    // identical:
+    SDValue Op0 = i64vec.getOperand(0);
+
+    ReplaceUses(i64vec, Op0);
+    return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT,
+                                  SDValue(emitBuildVector(Op0.getNode()), 0));
+  } else if (i64vec.getOpcode() == SPUISD::SHUFB) {
+    SDValue lhs = i64vec.getOperand(0);
+    SDValue rhs = i64vec.getOperand(1);
+    SDValue shufmask = i64vec.getOperand(2);
+
+    // Strip any BIT_CONVERT wrapper off each SHUFB operand, then make sure
+    // each one is a machine node before re-emitting the SHUFB itself.
+    if (lhs.getOpcode() == ISD::BIT_CONVERT) {
+      ReplaceUses(lhs, lhs.getOperand(0));
+      lhs = lhs.getOperand(0);
+    }
+
+    SDNode *lhsNode = (lhs.getNode()->isMachineOpcode()
+                       ? lhs.getNode()
+                       : emitBuildVector(lhs.getNode()));
+
+    if (rhs.getOpcode() == ISD::BIT_CONVERT) {
+      ReplaceUses(rhs, rhs.getOperand(0));
+      rhs = rhs.getOperand(0);
+    }
+
+    SDNode *rhsNode = (rhs.getNode()->isMachineOpcode()
+                       ? rhs.getNode()
+                       : emitBuildVector(rhs.getNode()));
+
+    if (shufmask.getOpcode() == ISD::BIT_CONVERT) {
+      ReplaceUses(shufmask, shufmask.getOperand(0));
+      shufmask = shufmask.getOperand(0);
+    }
+
+    SDNode *shufMaskNode = (shufmask.getNode()->isMachineOpcode()
+                            ? shufmask.getNode()
+                            : emitBuildVector(shufmask.getNode()));
+
+    SDNode *shufNode =
+            Select(CurDAG->getNode(SPUISD::SHUFB, dl, OpVecVT,
+                                   SDValue(lhsNode, 0), SDValue(rhsNode, 0),
+                                   SDValue(shufMaskNode, 0)).getNode());
+
+    return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT,
+                                  SDValue(shufNode, 0));
+  } else if (i64vec.getOpcode() == ISD::BUILD_VECTOR) {
+    return CurDAG->getMachineNode(SPU::ORi64_v2i64, dl, OpVT,
+                                  SDValue(emitBuildVector(i64vec.getNode()), 0));
+  } else {
+    // Fixed: the adjacent literals previously concatenated into the garbled
+    // message "...Unhandled i64veccondition" (missing space).
+    llvm_report_error("SPUDAGToDAGISel::SelectI64Constant: Unhandled i64vec "
+                      "condition");
+  }
+}
+
+/// createSPUISelDag - This pass converts a legalized DAG into a
+/// SPU-specific DAG, ready for instruction scheduling.
+///
+/// \param TM the SPU target machine the pass selects for
+/// \return a newly-allocated pass; ownership transfers to the caller
+FunctionPass *llvm::createSPUISelDag(SPUTargetMachine &TM) {
+  return new SPUDAGToDAGISel(TM);
+}
diff --git a/lib/Target/CellSPU/SPUISelLowering.cpp b/lib/Target/CellSPU/SPUISelLowering.cpp
new file mode 100644
index 0000000..fe0f019
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelLowering.cpp
@@ -0,0 +1,3101 @@
+//===-- SPUISelLowering.cpp - Cell SPU DAG Lowering Implementation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPUTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPURegisterNames.h"
+#include "SPUISelLowering.h"
+#include "SPUTargetMachine.h"
+#include "SPUFrameInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <map>
+
+using namespace llvm;
+
+// Used in getTargetNodeName() below
+namespace {
+  // File-local cache mapping SPUISD opcode -> printable node name.
+  std::map<unsigned, const char *> node_names;
+
+  //! EVT mapping to useful data for Cell SPU
+  /*!
+   prefslot_byte is the byte offset, within a 16-byte quadword register,
+   at which a scalar of this type lives ("preferred slot").
+   */
+  struct valtype_map_s {
+    EVT   valtype;
+    int   prefslot_byte;
+  };
+
+  const valtype_map_s valtype_map[] = {
+    { MVT::i1,   3 },
+    { MVT::i8,   3 },
+    { MVT::i16,  2 },
+    { MVT::i32,  0 },
+    { MVT::f32,  0 },
+    { MVT::i64,  0 },
+    { MVT::f64,  0 },
+    { MVT::i128, 0 }
+  };
+
+  const size_t n_valtype_map = sizeof(valtype_map) / sizeof(valtype_map[0]);
+
+  //! Look up the valtype_map entry for \a VT.
+  /*!
+   Returns NULL when \a VT has no entry.  In debug (!NDEBUG) builds a
+   missing entry aborts via llvm_report_error; in release builds callers
+   receive NULL and must handle it themselves.
+   */
+  const valtype_map_s *getValueTypeMapEntry(EVT VT) {
+    const valtype_map_s *retval = 0;
+
+    // Linear scan is fine: the table holds only eight entries.
+    for (size_t i = 0; i < n_valtype_map; ++i) {
+      if (valtype_map[i].valtype == VT) {
+        retval = valtype_map + i;
+        break;
+      }
+    }
+
+#ifndef NDEBUG
+    if (retval == 0) {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "getValueTypeMapEntry returns NULL for "
+           << VT.getEVTString();
+      llvm_report_error(Msg.str());
+    }
+#endif
+
+    return retval;
+  }
+
+  //! Expand a library call into an actual call DAG node
+  /*!
+   \note
+   This code is taken from SelectionDAGLegalize, since it is not exposed as
+   part of the LLVM SelectionDAG API.
+   */
+
+  SDValue
+  ExpandLibCall(RTLIB::Libcall LC, SDValue Op, SelectionDAG &DAG,
+                bool isSigned, SDValue &Hi, SPUTargetLowering &TLI) {
+    // The input chain to this libcall is the entry node of the function.
+    // Legalizing the call will automatically add the previous call to the
+    // dependence.
+    SDValue InChain = DAG.getEntryNode();
+
+    // Mirror each operand of Op into the libcall argument list, with the
+    // sign/zero-extension flags derived from isSigned.
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+      EVT ArgVT = Op.getOperand(i).getValueType();
+      const Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+      Entry.Node = Op.getOperand(i);
+      Entry.Ty = ArgTy;
+      Entry.isSExt = isSigned;
+      Entry.isZExt = !isSigned;
+      Args.push_back(Entry);
+    }
+    SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+                                           TLI.getPointerTy());
+
+    // Splice the libcall in wherever FindInputOutputChains tells us to.
+    const Type *RetTy =
+                Op.getNode()->getValueType(0).getTypeForEVT(*DAG.getContext());
+    std::pair<SDValue, SDValue> CallInfo =
+            TLI.LowerCallTo(InChain, RetTy, isSigned, !isSigned, false, false,
+                            0, TLI.getLibcallCallingConv(LC), false,
+                            /*isReturnValueUsed=*/true,
+                            Callee, Args, DAG, Op.getDebugLoc(),
+                            DAG.GetOrdering(InChain.getNode()));
+
+    // NOTE(review): the Hi out-parameter is never written in this function;
+    // verify that callers do not expect it to be populated.
+    return CallInfo.first;
+  }
+}
+
+SPUTargetLowering::SPUTargetLowering(SPUTargetMachine &TM)
+  : TargetLowering(TM, new TargetLoweringObjectFileELF()),
+    SPUTM(TM) {
+  // Fold away setcc operations if possible.
+  setPow2DivIsCheap();
+
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // Set RTLIB libcall names as used by SPU:
+  setLibcallName(RTLIB::DIV_F64, "__fast_divdf3");
+
+  // Set up the SPU's register classes:
+  addRegisterClass(MVT::i8,   SPU::R8CRegisterClass);
+  addRegisterClass(MVT::i16,  SPU::R16CRegisterClass);
+  addRegisterClass(MVT::i32,  SPU::R32CRegisterClass);
+  addRegisterClass(MVT::i64,  SPU::R64CRegisterClass);
+  addRegisterClass(MVT::f32,  SPU::R32FPRegisterClass);
+  addRegisterClass(MVT::f64,  SPU::R64FPRegisterClass);
+  addRegisterClass(MVT::i128, SPU::GPRCRegisterClass);
+
+  // SPU has no sign or zero extended loads for i1, i8, i16:
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+
+  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
+
+  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i32, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i128, MVT::i8, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // SPU constant load actions are custom lowered:
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  setOperationAction(ISD::ConstantFP, MVT::f64, Custom);
+
+  // SPU's loads and stores have to be custom lowered:
+  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::i128;
+       ++sctype) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
+
+    setOperationAction(ISD::LOAD,   VT, Custom);
+    setOperationAction(ISD::STORE,  VT, Custom);
+    setLoadExtAction(ISD::EXTLOAD,  VT, Custom);
+    setLoadExtAction(ISD::ZEXTLOAD, VT, Custom);
+    setLoadExtAction(ISD::SEXTLOAD, VT, Custom);
+
+    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::i8; --stype) {
+      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
+      setTruncStoreAction(VT, StoreVT, Expand);
+    }
+  }
+
+  for (unsigned sctype = (unsigned) MVT::f32; sctype < (unsigned) MVT::f64;
+       ++sctype) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType) sctype;
+
+    setOperationAction(ISD::LOAD,   VT, Custom);
+    setOperationAction(ISD::STORE,  VT, Custom);
+
+    for (unsigned stype = sctype - 1; stype >= (unsigned) MVT::f32; --stype) {
+      MVT::SimpleValueType StoreVT = (MVT::SimpleValueType) stype;
+      setTruncStoreAction(VT, StoreVT, Expand);
+    }
+  }
+
+  // Expand the jumptable branches
+  setOperationAction(ISD::BR_JT,        MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,        MVT::Other, Expand);
+
+  // Custom lower SELECT_CC for most cases, but expand by default
+  setOperationAction(ISD::SELECT_CC,    MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC,    MVT::i8,    Custom);
+  setOperationAction(ISD::SELECT_CC,    MVT::i16,   Custom);
+  setOperationAction(ISD::SELECT_CC,    MVT::i32,   Custom);
+  setOperationAction(ISD::SELECT_CC,    MVT::i64,   Custom);
+
+  // SPU has no intrinsics for these particular operations:
+  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+
+  // SPU has no division/remainder instructions
+  setOperationAction(ISD::SREM,    MVT::i8,   Expand);
+  setOperationAction(ISD::UREM,    MVT::i8,   Expand);
+  setOperationAction(ISD::SDIV,    MVT::i8,   Expand);
+  setOperationAction(ISD::UDIV,    MVT::i8,   Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i8,   Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i8,   Expand);
+  setOperationAction(ISD::SREM,    MVT::i16,  Expand);
+  setOperationAction(ISD::UREM,    MVT::i16,  Expand);
+  setOperationAction(ISD::SDIV,    MVT::i16,  Expand);
+  setOperationAction(ISD::UDIV,    MVT::i16,  Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i16,  Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i16,  Expand);
+  setOperationAction(ISD::SREM,    MVT::i32,  Expand);
+  setOperationAction(ISD::UREM,    MVT::i32,  Expand);
+  setOperationAction(ISD::SDIV,    MVT::i32,  Expand);
+  setOperationAction(ISD::UDIV,    MVT::i32,  Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32,  Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32,  Expand);
+  setOperationAction(ISD::SREM,    MVT::i64,  Expand);
+  setOperationAction(ISD::UREM,    MVT::i64,  Expand);
+  setOperationAction(ISD::SDIV,    MVT::i64,  Expand);
+  setOperationAction(ISD::UDIV,    MVT::i64,  Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64,  Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64,  Expand);
+  setOperationAction(ISD::SREM,    MVT::i128, Expand);
+  setOperationAction(ISD::UREM,    MVT::i128, Expand);
+  setOperationAction(ISD::SDIV,    MVT::i128, Expand);
+  setOperationAction(ISD::UDIV,    MVT::i128, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i128, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i128, Expand);
+
+  // We don't support sin/cos/sqrt/fmod
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+
+  // Expand fsqrt to the appropriate libcall (NOTE: should use h/w fsqrt
+  // for f32!)
+  setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+  // SPU can do rotate right and left, so legalize it... but customize for i8
+  // because instructions don't exist.
+
+  // FIXME: Change from "expand" to appropriate type once ROTR is supported in
+  //        .td files.
+  setOperationAction(ISD::ROTR, MVT::i32,    Expand /*Legal*/);
+  setOperationAction(ISD::ROTR, MVT::i16,    Expand /*Legal*/);
+  setOperationAction(ISD::ROTR, MVT::i8,     Expand /*Custom*/);
+
+  setOperationAction(ISD::ROTL, MVT::i32,    Legal);
+  setOperationAction(ISD::ROTL, MVT::i16,    Legal);
+  setOperationAction(ISD::ROTL, MVT::i8,     Custom);
+
+  // SPU has no native version of shift left/right for i8
+  setOperationAction(ISD::SHL,  MVT::i8,     Custom);
+  setOperationAction(ISD::SRL,  MVT::i8,     Custom);
+  setOperationAction(ISD::SRA,  MVT::i8,     Custom);
+
+  // Make these operations legal and handle them during instruction selection:
+  setOperationAction(ISD::SHL,  MVT::i64,    Legal);
+  setOperationAction(ISD::SRL,  MVT::i64,    Legal);
+  setOperationAction(ISD::SRA,  MVT::i64,    Legal);
+
+  // Custom lower i8, i32 and i64 multiplications
+  setOperationAction(ISD::MUL,  MVT::i8,     Custom);
+  setOperationAction(ISD::MUL,  MVT::i32,    Legal);
+  setOperationAction(ISD::MUL,  MVT::i64,    Legal);
+
+  // Expand double-width multiplication
+  // FIXME: It would probably be reasonable to support some of these operations
+  setOperationAction(ISD::UMUL_LOHI, MVT::i8,  Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i8,  Expand);
+  setOperationAction(ISD::MULHU,     MVT::i8,  Expand);
+  setOperationAction(ISD::MULHS,     MVT::i8,  Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand);
+  setOperationAction(ISD::MULHU,     MVT::i16, Expand);
+  setOperationAction(ISD::MULHS,     MVT::i16, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::MULHU,     MVT::i32, Expand);
+  setOperationAction(ISD::MULHS,     MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::MULHU,     MVT::i64, Expand);
+  setOperationAction(ISD::MULHS,     MVT::i64, Expand);
+
+  // Need to custom handle (some) common i8, i64 math ops
+  setOperationAction(ISD::ADD,  MVT::i8,     Custom);
+  setOperationAction(ISD::ADD,  MVT::i64,    Legal);
+  setOperationAction(ISD::SUB,  MVT::i8,     Custom);
+  setOperationAction(ISD::SUB,  MVT::i64,    Legal);
+
+  // SPU does not have BSWAP. It does have i32 support CTLZ.
+  // CTPOP has to be custom lowered.
+  setOperationAction(ISD::BSWAP, MVT::i32,   Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64,   Expand);
+
+  setOperationAction(ISD::CTPOP, MVT::i8,    Custom);
+  setOperationAction(ISD::CTPOP, MVT::i16,   Custom);
+  setOperationAction(ISD::CTPOP, MVT::i32,   Custom);
+  setOperationAction(ISD::CTPOP, MVT::i64,   Custom);
+  setOperationAction(ISD::CTPOP, MVT::i128,  Expand);
+
+  setOperationAction(ISD::CTTZ , MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ , MVT::i16,   Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32,   Expand);
+  setOperationAction(ISD::CTTZ , MVT::i64,   Expand);
+  setOperationAction(ISD::CTTZ , MVT::i128,  Expand);
+
+  setOperationAction(ISD::CTLZ , MVT::i8,    Promote);
+  setOperationAction(ISD::CTLZ , MVT::i16,   Promote);
+  setOperationAction(ISD::CTLZ , MVT::i32,   Legal);
+  setOperationAction(ISD::CTLZ , MVT::i64,   Expand);
+  setOperationAction(ISD::CTLZ , MVT::i128,  Expand);
+
+  // SPU has a version of select that implements (a&~c)|(b&c), just like
+  // select ought to work:
+  setOperationAction(ISD::SELECT, MVT::i8,   Legal);
+  setOperationAction(ISD::SELECT, MVT::i16,  Legal);
+  setOperationAction(ISD::SELECT, MVT::i32,  Legal);
+  setOperationAction(ISD::SELECT, MVT::i64,  Legal);
+
+  setOperationAction(ISD::SETCC, MVT::i8,    Legal);
+  setOperationAction(ISD::SETCC, MVT::i16,   Legal);
+  setOperationAction(ISD::SETCC, MVT::i32,   Legal);
+  setOperationAction(ISD::SETCC, MVT::i64,   Legal);
+  setOperationAction(ISD::SETCC, MVT::f64,   Custom);
+
+  // Custom lower i128 -> i64 truncates
+  setOperationAction(ISD::TRUNCATE, MVT::i64, Custom);
+
+  // Custom lower i32/i64 -> i128 sign extend
+  setOperationAction(ISD::SIGN_EXTEND, MVT::i128, Custom);
+
+  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+  // SPU has a legal FP -> signed INT instruction for f32, but for f64, need
+  // to expand to a libcall, hence the custom lowering:
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Expand);
+  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Expand);
+
+  // FDIV on SPU requires custom lowering
+  setOperationAction(ISD::FDIV, MVT::f64, Expand);      // to libcall
+
+  // SPU has [U|S]INT_TO_FP for f32->i32, but not for f64->i32, f64->i64:
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i8,  Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i8,  Promote);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+
+  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Legal);
+  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Legal);
+  setOperationAction(ISD::BIT_CONVERT, MVT::i64, Legal);
+  setOperationAction(ISD::BIT_CONVERT, MVT::f64, Legal);
+
+  // We cannot sextinreg(i1).  Expand to shifts.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // We want to legalize GlobalAddress and ConstantPool nodes into the
+  // appropriate instructions to materialize the address.
+  for (unsigned sctype = (unsigned) MVT::i8; sctype < (unsigned) MVT::f128;
+       ++sctype) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)sctype;
+
+    setOperationAction(ISD::GlobalAddress,  VT, Custom);
+    setOperationAction(ISD::ConstantPool,   VT, Custom);
+    setOperationAction(ISD::JumpTable,      VT, Custom);
+  }
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VAARG             , MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Expand);
+
+  // Cell SPU has instructions for converting between i64 and fp.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+
+  // To take advantage of the above i64 FP_TO_SINT, promote i32 FP_TO_UINT
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
+
+  // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+  setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+
+  // First set operation action for all vector types to expand. Then we
+  // will selectively turn on ones that can be effectively codegen'd.
+  addRegisterClass(MVT::v16i8, SPU::VECREGRegisterClass);
+  addRegisterClass(MVT::v8i16, SPU::VECREGRegisterClass);
+  addRegisterClass(MVT::v4i32, SPU::VECREGRegisterClass);
+  addRegisterClass(MVT::v2i64, SPU::VECREGRegisterClass);
+  addRegisterClass(MVT::v4f32, SPU::VECREGRegisterClass);
+  addRegisterClass(MVT::v2f64, SPU::VECREGRegisterClass);
+
+  // "Odd size" vector classes that we're willing to support:
+  addRegisterClass(MVT::v2i32, SPU::VECREGRegisterClass);
+
+  for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
+
+    // add/sub are legal for all supported vector VT's.
+    setOperationAction(ISD::ADD,     VT, Legal);
+    setOperationAction(ISD::SUB,     VT, Legal);
+    // mul has to be custom lowered.
+    setOperationAction(ISD::MUL,     VT, Legal);
+
+    setOperationAction(ISD::AND,     VT, Legal);
+    setOperationAction(ISD::OR,      VT, Legal);
+    setOperationAction(ISD::XOR,     VT, Legal);
+    setOperationAction(ISD::LOAD,    VT, Legal);
+    setOperationAction(ISD::SELECT,  VT, Legal);
+    setOperationAction(ISD::STORE,   VT, Legal);
+
+    // These operations need to be expanded:
+    setOperationAction(ISD::SDIV,    VT, Expand);
+    setOperationAction(ISD::SREM,    VT, Expand);
+    setOperationAction(ISD::UDIV,    VT, Expand);
+    setOperationAction(ISD::UREM,    VT, Expand);
+
+    // Custom lower build_vector, constant pool spills, insert and
+    // extract vector elements:
+    setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+    setOperationAction(ISD::ConstantPool, VT, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+  }
+
+  setOperationAction(ISD::AND, MVT::v16i8, Custom);
+  setOperationAction(ISD::OR,  MVT::v16i8, Custom);
+  setOperationAction(ISD::XOR, MVT::v16i8, Custom);
+  setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+
+  setOperationAction(ISD::FDIV, MVT::v4f32, Legal);
+
+  setShiftAmountType(MVT::i32);
+  setBooleanContents(ZeroOrNegativeOneBooleanContent);
+
+  setStackPointerRegisterToSaveRestore(SPU::R1);
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+
+  computeRegisterProperties();
+
+  // Set pre-RA register scheduler default to BURR, which produces slightly
+  // better code than the default (could also be TDRR, but TargetLowering.h
+  // needs a mod to support that model):
+  setSchedulingPreference(SchedulingForRegPressure);
+}
+
+const char *
+SPUTargetLowering::getTargetNodeName(unsigned Opcode) const
+{
+  if (node_names.empty()) {
+    node_names[(unsigned) SPUISD::RET_FLAG] = "SPUISD::RET_FLAG";
+    node_names[(unsigned) SPUISD::Hi] = "SPUISD::Hi";
+    node_names[(unsigned) SPUISD::Lo] = "SPUISD::Lo";
+    node_names[(unsigned) SPUISD::PCRelAddr] = "SPUISD::PCRelAddr";
+    node_names[(unsigned) SPUISD::AFormAddr] = "SPUISD::AFormAddr";
+    node_names[(unsigned) SPUISD::IndirectAddr] = "SPUISD::IndirectAddr";
+    node_names[(unsigned) SPUISD::LDRESULT] = "SPUISD::LDRESULT";
+    node_names[(unsigned) SPUISD::CALL] = "SPUISD::CALL";
+    node_names[(unsigned) SPUISD::SHUFB] = "SPUISD::SHUFB";
+    node_names[(unsigned) SPUISD::SHUFFLE_MASK] = "SPUISD::SHUFFLE_MASK";
+    node_names[(unsigned) SPUISD::CNTB] = "SPUISD::CNTB";
+    node_names[(unsigned) SPUISD::PREFSLOT2VEC] = "SPUISD::PREFSLOT2VEC";
+    node_names[(unsigned) SPUISD::VEC2PREFSLOT] = "SPUISD::VEC2PREFSLOT";
+    node_names[(unsigned) SPUISD::SHLQUAD_L_BITS] = "SPUISD::SHLQUAD_L_BITS";
+    node_names[(unsigned) SPUISD::SHLQUAD_L_BYTES] = "SPUISD::SHLQUAD_L_BYTES";
+    node_names[(unsigned) SPUISD::VEC_ROTL] = "SPUISD::VEC_ROTL";
+    node_names[(unsigned) SPUISD::VEC_ROTR] = "SPUISD::VEC_ROTR";
+    node_names[(unsigned) SPUISD::ROTBYTES_LEFT] = "SPUISD::ROTBYTES_LEFT";
+    node_names[(unsigned) SPUISD::ROTBYTES_LEFT_BITS] =
+            "SPUISD::ROTBYTES_LEFT_BITS";
+    node_names[(unsigned) SPUISD::SELECT_MASK] = "SPUISD::SELECT_MASK";
+    node_names[(unsigned) SPUISD::SELB] = "SPUISD::SELB";
+    node_names[(unsigned) SPUISD::ADD64_MARKER] = "SPUISD::ADD64_MARKER";
+    node_names[(unsigned) SPUISD::SUB64_MARKER] = "SPUISD::SUB64_MARKER";
+    node_names[(unsigned) SPUISD::MUL64_MARKER] = "SPUISD::MUL64_MARKER";
+  }
+
+  std::map<unsigned, const char *>::iterator i = node_names.find(Opcode);
+
+  return ((i != node_names.end()) ? i->second : 0);
+}
+
/// getFunctionAlignment - Return the Log2 alignment of this function.
unsigned SPUTargetLowering::getFunctionAlignment(const Function *) const {
  // Every SPU function is aligned to 2^3 == 8 bytes regardless of its
  // contents, so the Function parameter is intentionally unnamed/unused.
  return 3;
}
+
+//===----------------------------------------------------------------------===//
+// Return the Cell SPU's SETCC result type
+//===----------------------------------------------------------------------===//
+
+MVT::SimpleValueType SPUTargetLowering::getSetCCResultType(EVT VT) const {
+  // i16 and i32 are valid SETCC result types
+  return ((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) ?
+    VT.getSimpleVT().SimpleTy :
+    MVT::i32);
+}
+
+//===----------------------------------------------------------------------===//
+// Calling convention code:
+//===----------------------------------------------------------------------===//
+
+#include "SPUGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//  LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
/// Custom lower loads for CellSPU
/*!
 All CellSPU loads and stores are aligned to 16-byte boundaries, so for elements
 within a 16-byte block, we have to rotate to extract the requested element.

 For extending loads, we also want to ensure that the following sequence is
 emitted, e.g. for MVT::f32 extending load to MVT::f64:

\verbatim
%1  v16i8,ch = load
%2  v16i8,ch = rotate %1
%3  v4f32,ch = bitconvert %2
%4  f32      = vec2perfslot %3
%5  f64      = fp_extend %4
\endverbatim
*/
static SDValue
LowerLOAD(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
  // NOTE: ST is unused here; the parameter is kept for signature parity with
  // the other Lower* helpers in this file.
  LoadSDNode *LN = cast<LoadSDNode>(Op);
  SDValue the_chain = LN->getChain();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  EVT InVT = LN->getMemoryVT();        // type actually resident in memory
  EVT OutVT = Op.getValueType();       // type the load must produce
  ISD::LoadExtType ExtType = LN->getExtensionType();
  unsigned alignment = LN->getAlignment();
  // Per-type table entry; supplies the preferred slot byte offset for InVT.
  const valtype_map_s *vtm = getValueTypeMapEntry(InVT);
  DebugLoc dl = Op.getDebugLoc();

  switch (LN->getAddressingMode()) {
  case ISD::UNINDEXED: {
    SDValue result;
    SDValue basePtr = LN->getBasePtr();
    SDValue rotate;     // byte-rotate amount into the preferred slot

    if (alignment == 16) {
      ConstantSDNode *CN;

      // Special cases for a known aligned load to simplify the base pointer
      // and the rotation amount:
      if (basePtr.getOpcode() == ISD::ADD
          && (CN = dyn_cast<ConstantSDNode > (basePtr.getOperand(1))) != 0) {
        // Known offset into basePtr
        int64_t offset = CN->getSExtValue();
        int64_t rotamt = int64_t((offset & 0xf) - vtm->prefslot_byte);

        if (rotamt < 0)
          rotamt += 16;  // wrap negative rotations into [0, 16)

        rotate = DAG.getConstant(rotamt, MVT::i16);

        // Simplify the base pointer for this case:
        basePtr = basePtr.getOperand(0);
        if ((offset & ~0xf) > 0) {
          // Fold the 16-byte-aligned part of the offset into the address.
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                                basePtr,
                                DAG.getConstant((offset & ~0xf), PtrVT));
        }
      } else if ((basePtr.getOpcode() == SPUISD::AFormAddr)
                 || (basePtr.getOpcode() == SPUISD::IndirectAddr
                     && basePtr.getOperand(0).getOpcode() == SPUISD::Hi
                     && basePtr.getOperand(1).getOpcode() == SPUISD::Lo)) {
        // Plain aligned a-form address: rotate into preferred slot
        // Same for (SPUindirect (SPUhi ...), (SPUlo ...))
        int64_t rotamt = -vtm->prefslot_byte;
        if (rotamt < 0)
          rotamt += 16;
        rotate = DAG.getConstant(rotamt, MVT::i16);
      } else {
        // Offset the rotate amount by the basePtr and the preferred slot
        // byte offset
        int64_t rotamt = -vtm->prefslot_byte;
        if (rotamt < 0)
          rotamt += 16;
        rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
                             basePtr,
                             DAG.getConstant(rotamt, PtrVT));
      }
    } else {
      // Unaligned load: must be more pessimistic about addressing modes:
      if (basePtr.getOpcode() == ISD::ADD) {
        MachineFunction &MF = DAG.getMachineFunction();
        MachineRegisterInfo &RegInfo = MF.getRegInfo();
        unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
        SDValue Flag;

        SDValue Op0 = basePtr.getOperand(0);
        SDValue Op1 = basePtr.getOperand(1);

        if (isa<ConstantSDNode>(Op1)) {
          // Convert the (add <ptr>, <const>) to an indirect address contained
          // in a register. Note that this is done because we need to avoid
          // creating a 0(reg) d-form address due to the SPU's block loads.
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
          the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
          basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
        } else {
          // Convert the (add <arg1>, <arg2>) to an indirect address, which
          // will likely be lowered as a reg(reg) x-form address.
          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
        }
      } else {
        // Wrap the plain pointer so isel sees a uniform IndirectAddr form.
        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
                              basePtr,
                              DAG.getConstant(0, PtrVT));
      }

      // Offset the rotate amount by the basePtr and the preferred slot
      // byte offset
      rotate = DAG.getNode(ISD::ADD, dl, PtrVT,
                           basePtr,
                           DAG.getConstant(-vtm->prefslot_byte, PtrVT));
    }

    // Re-emit as a v16i8 vector load
    result = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
                         LN->getSrcValue(), LN->getSrcValueOffset(),
                         LN->isVolatile(), 16);

    // Update the chain
    the_chain = result.getValue(1);

    // Rotate into the preferred slot:
    result = DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, MVT::v16i8,
                         result.getValue(0), rotate);

    // Convert the loaded v16i8 vector to the appropriate vector type
    // specified by the operand:
    EVT vecVT = EVT::getVectorVT(*DAG.getContext(), 
                                 InVT, (128 / InVT.getSizeInBits()));
    result = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, InVT,
                         DAG.getNode(ISD::BIT_CONVERT, dl, vecVT, result));

    // Handle extending loads by extending the scalar result:
    if (ExtType == ISD::SEXTLOAD) {
      result = DAG.getNode(ISD::SIGN_EXTEND, dl, OutVT, result);
    } else if (ExtType == ISD::ZEXTLOAD) {
      result = DAG.getNode(ISD::ZERO_EXTEND, dl, OutVT, result);
    } else if (ExtType == ISD::EXTLOAD) {
      unsigned NewOpc = ISD::ANY_EXTEND;

      if (OutVT.isFloatingPoint())
        NewOpc = ISD::FP_EXTEND;  // fp extload extends via FP_EXTEND instead

      result = DAG.getNode(NewOpc, dl, OutVT, result);
    }

    // Wrap the value and the updated chain in an LDRESULT node so both are
    // handed back to the caller as a single node.
    SDVTList retvts = DAG.getVTList(OutVT, MVT::Other);
    SDValue retops[2] = {
      result,
      the_chain
    };

    result = DAG.getNode(SPUISD::LDRESULT, dl, retvts,
                         retops, sizeof(retops) / sizeof(retops[0]));
    return result;
  }
  case ISD::PRE_INC:
  case ISD::PRE_DEC:
  case ISD::POST_INC:
  case ISD::POST_DEC:
  case ISD::LAST_INDEXED_MODE:
    {
      // Indexed addressing modes are never produced for SPU loads; report a
      // fatal error if one slips through.
      std::string msg;
      raw_string_ostream Msg(msg);
      Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
            "UNINDEXED\n";
      Msg << (unsigned) LN->getAddressingMode();
      llvm_report_error(Msg.str());
      /*NOTREACHED*/
    }
  }

  return SDValue();
}
+
+/// Custom lower stores for CellSPU
+/*!
+ All CellSPU stores are aligned to 16-byte boundaries, so for elements
+ within a 16-byte block, we have to generate a shuffle to insert the
+ requested element into its place, then store the resulting block.
+ */
+static SDValue
+LowerSTORE(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+  StoreSDNode *SN = cast<StoreSDNode>(Op);
+  SDValue Value = SN->getValue();
+  EVT VT = Value.getValueType();
+  EVT StVT = (!SN->isTruncatingStore() ? VT : SN->getMemoryVT());
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned alignment = SN->getAlignment();
+
+  switch (SN->getAddressingMode()) {
+  case ISD::UNINDEXED: {
+    // The vector type we really want to load from the 16-byte chunk.
+    EVT vecVT = EVT::getVectorVT(*DAG.getContext(),
+                                 VT, (128 / VT.getSizeInBits()));
+
+    SDValue alignLoadVec;
+    SDValue basePtr = SN->getBasePtr();
+    SDValue the_chain = SN->getChain();
+    SDValue insertEltOffs;
+
+    if (alignment == 16) {
+      ConstantSDNode *CN;
+
+      // Special cases for a known aligned load to simplify the base pointer
+      // and insertion byte:
+      if (basePtr.getOpcode() == ISD::ADD
+          && (CN = dyn_cast<ConstantSDNode>(basePtr.getOperand(1))) != 0) {
+        // Known offset into basePtr
+        int64_t offset = CN->getSExtValue();
+
+        // Simplify the base pointer for this case:
+        basePtr = basePtr.getOperand(0);
+        insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                    basePtr,
+                                    DAG.getConstant((offset & 0xf), PtrVT));
+
+        if ((offset & ~0xf) > 0) {
+          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                basePtr,
+                                DAG.getConstant((offset & ~0xf), PtrVT));
+        }
+      } else {
+        // Otherwise, assume it's at byte 0 of basePtr
+        insertEltOffs = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                    basePtr,
+                                    DAG.getConstant(0, PtrVT));
+      }
+    } else {
+      // Unaligned load: must be more pessimistic about addressing modes:
+      if (basePtr.getOpcode() == ISD::ADD) {
+        MachineFunction &MF = DAG.getMachineFunction();
+        MachineRegisterInfo &RegInfo = MF.getRegInfo();
+        unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+        SDValue Flag;
+
+        SDValue Op0 = basePtr.getOperand(0);
+        SDValue Op1 = basePtr.getOperand(1);
+
+        if (isa<ConstantSDNode>(Op1)) {
+          // Convert the (add <ptr>, <const>) to an indirect address contained
+          // in a register. Note that this is done because we need to avoid
+          // creating a 0(reg) d-form address due to the SPU's block loads.
+          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+          the_chain = DAG.getCopyToReg(the_chain, dl, VReg, basePtr, Flag);
+          basePtr = DAG.getCopyFromReg(the_chain, dl, VReg, PtrVT);
+        } else {
+          // Convert the (add <arg1>, <arg2>) to an indirect address, which
+          // will likely be lowered as a reg(reg) x-form address.
+          basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Op0, Op1);
+        }
+      } else {
+        basePtr = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                              basePtr,
+                              DAG.getConstant(0, PtrVT));
+      }
+
+      // Insertion point is solely determined by basePtr's contents
+      insertEltOffs = DAG.getNode(ISD::ADD, dl, PtrVT,
+                                  basePtr,
+                                  DAG.getConstant(0, PtrVT));
+    }
+
+    // Re-emit as a v16i8 vector load
+    alignLoadVec = DAG.getLoad(MVT::v16i8, dl, the_chain, basePtr,
+                               SN->getSrcValue(), SN->getSrcValueOffset(),
+                               SN->isVolatile(), 16);
+
+    // Update the chain
+    the_chain = alignLoadVec.getValue(1);
+
+    LoadSDNode *LN = cast<LoadSDNode>(alignLoadVec);
+    SDValue theValue = SN->getValue();
+    SDValue result;
+
+    if (StVT != VT
+        && (theValue.getOpcode() == ISD::AssertZext
+            || theValue.getOpcode() == ISD::AssertSext)) {
+      // Drill down and get the value for zero- and sign-extended
+      // quantities
+      theValue = theValue.getOperand(0);
+    }
+
+    // If the base pointer is already a D-form address, then just create
+    // a new D-form address with a slot offset and the orignal base pointer.
+    // Otherwise generate a D-form address with the slot offset relative
+    // to the stack pointer, which is always aligned.
+#if !defined(NDEBUG)
+      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+        errs() << "CellSPU LowerSTORE: basePtr = ";
+        basePtr.getNode()->dump(&DAG);
+        errs() << "\n";
+      }
+#endif
+
+    SDValue insertEltOp =
+            DAG.getNode(SPUISD::SHUFFLE_MASK, dl, vecVT, insertEltOffs);
+    SDValue vectorizeOp =
+            DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, vecVT, theValue);
+
+    result = DAG.getNode(SPUISD::SHUFB, dl, vecVT,
+                         vectorizeOp, alignLoadVec,
+                         DAG.getNode(ISD::BIT_CONVERT, dl,
+                                     MVT::v4i32, insertEltOp));
+
+    result = DAG.getStore(the_chain, dl, result, basePtr,
+                          LN->getSrcValue(), LN->getSrcValueOffset(),
+                          LN->isVolatile(), LN->getAlignment());
+
+#if 0 && !defined(NDEBUG)
+    if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
+      const SDValue &currentRoot = DAG.getRoot();
+
+      DAG.setRoot(result);
+      errs() << "------- CellSPU:LowerStore result:\n";
+      DAG.dump();
+      errs() << "-------\n";
+      DAG.setRoot(currentRoot);
+    }
+#endif
+
+    return result;
+    /*UNREACHED*/
+  }
+  case ISD::PRE_INC:
+  case ISD::PRE_DEC:
+  case ISD::POST_INC:
+  case ISD::POST_DEC:
+  case ISD::LAST_INDEXED_MODE:
+    {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "LowerLOAD: Got a LoadSDNode with an addr mode other than "
+            "UNINDEXED\n";
+      Msg << (unsigned) SN->getAddressingMode();
+      llvm_report_error(Msg.str());
+      /*NOTREACHED*/
+    }
+  }
+
+  return SDValue();
+}
+
+//! Generate the address of a constant pool entry.
+static SDValue
+LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+  EVT PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  Constant *C = CP->getConstVal();
+  SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  const TargetMachine &TM = DAG.getTarget();
+  // FIXME there is no actual debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (TM.getRelocationModel() == Reloc::Static) {
+    if (!ST->usingLargeMem()) {
+      // Just return the SDValue with the constant pool address in it.
+      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, CPI, Zero);
+    } else {
+      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, CPI, Zero);
+      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, CPI, Zero);
+      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
+    }
+  }
+
+  llvm_unreachable("LowerConstantPool: Relocation model other than static"
+                   " not supported.");
+  return SDValue();
+}
+
+//! Alternate entry point for generating the address of a constant pool entry
+SDValue
+SPU::LowerConstantPool(SDValue Op, SelectionDAG &DAG, const SPUTargetMachine &TM) {
+  return ::LowerConstantPool(Op, DAG, TM.getSubtargetImpl());
+}
+
+static SDValue
+LowerJumpTable(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  const TargetMachine &TM = DAG.getTarget();
+  // FIXME there is no actual debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (TM.getRelocationModel() == Reloc::Static) {
+    if (!ST->usingLargeMem()) {
+      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, JTI, Zero);
+    } else {
+      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, JTI, Zero);
+      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, JTI, Zero);
+      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
+    }
+  }
+
+  llvm_unreachable("LowerJumpTable: Relocation model other than static"
+                   " not supported.");
+  return SDValue();
+}
+
+//! Lower an ISD::GlobalAddress node to an SPU address computation.
+//
+// Only the static relocation model is supported: small-memory targets get a
+// single A-form address; large-memory targets get a Hi/Lo pair combined
+// through an indirect-address node.  Any other relocation model is reported
+// as a fatal error.
+static SDValue
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG, const SPUSubtarget *ST) {
+  EVT PtrVT = Op.getValueType();
+  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
+  GlobalValue *GV = GSDN->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
+  const TargetMachine &TM = DAG.getTarget();
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  // FIXME there is no actual debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (TM.getRelocationModel() == Reloc::Static) {
+    if (!ST->usingLargeMem()) {
+      return DAG.getNode(SPUISD::AFormAddr, dl, PtrVT, GA, Zero);
+    } else {
+      SDValue Hi = DAG.getNode(SPUISD::Hi, dl, PtrVT, GA, Zero);
+      SDValue Lo = DAG.getNode(SPUISD::Lo, dl, PtrVT, GA, Zero);
+      return DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, Hi, Lo);
+    }
+  } else {
+    // Bug fix: the two adjacent string literals previously concatenated to
+    // "...other than staticnot supported." (missing space).
+    llvm_report_error("LowerGlobalAddress: Relocation model other than static"
+                      " not supported.");
+    /*NOTREACHED*/
+  }
+
+  return SDValue();
+}
+
+//! Custom lower double precision floating point constants
+//
+// An f64 constant is materialized by splatting its raw 64-bit pattern into a
+// v2i64 vector, bitcasting that to v2f64, and extracting the preferred slot.
+// Any other type is left to the default expansion.
+static SDValue
+LowerConstantFP(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  // FIXME there is no actual debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (VT != MVT::f64)
+    return SDValue();
+
+  ConstantFPSDNode *FP = cast<ConstantFPSDNode>(Op.getNode());
+  assert((FP != 0) &&
+         "LowerConstantFP: Node is not ConstantFPSDNode");
+
+  // Pretend the constant is an integer: LLVM won't load FP constants.
+  uint64_t Bits = DoubleToBits(FP->getValueAPF().convertToDouble());
+  SDValue BitsVal = DAG.getConstant(Bits, MVT::i64);
+  SDValue SplatVec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
+                                 BitsVal, BitsVal);
+  return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
+                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, SplatVec));
+}
+
+//! Lower the incoming (formal) arguments of the function being compiled.
+//
+// Arguments are taken from the SPU argument registers in order; once those
+// run out, the remainder are loaded from fixed stack slots.  For varargs
+// functions, the unused argument registers are additionally spilled to the
+// stack so the va_arg machinery can walk them.
+SDValue
+SPUTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
+  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
+
+  // Offset of the first stack-passed argument slot.
+  unsigned ArgOffset = SPUFrameInfo::minStackSize();
+  unsigned ArgRegIdx = 0;
+  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Add DAG nodes to load the arguments or copy them out of registers.
+  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
+    EVT ObjectVT = Ins[ArgNo].VT;
+    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
+    SDValue ArgVal;
+
+    if (ArgRegIdx < NumArgRegs) {
+      // Argument still fits in a register: pick the register class matching
+      // the value type.
+      const TargetRegisterClass *ArgRegClass;
+
+      switch (ObjectVT.getSimpleVT().SimpleTy) {
+      default: {
+        std::string msg;
+        raw_string_ostream Msg(msg);
+        Msg << "LowerFormalArguments Unhandled argument type: "
+             << ObjectVT.getEVTString();
+        llvm_report_error(Msg.str());
+      }
+      case MVT::i8:
+        ArgRegClass = &SPU::R8CRegClass;
+        break;
+      case MVT::i16:
+        ArgRegClass = &SPU::R16CRegClass;
+        break;
+      case MVT::i32:
+        ArgRegClass = &SPU::R32CRegClass;
+        break;
+      case MVT::i64:
+        ArgRegClass = &SPU::R64CRegClass;
+        break;
+      case MVT::i128:
+        ArgRegClass = &SPU::GPRCRegClass;
+        break;
+      case MVT::f32:
+        ArgRegClass = &SPU::R32FPRegClass;
+        break;
+      case MVT::f64:
+        ArgRegClass = &SPU::R64FPRegClass;
+        break;
+      case MVT::v2f64:
+      case MVT::v4f32:
+      case MVT::v2i64:
+      case MVT::v4i32:
+      case MVT::v8i16:
+      case MVT::v16i8:
+        ArgRegClass = &SPU::VECREGRegClass;
+        break;
+      }
+
+      // Copy the incoming physical register into a fresh virtual register.
+      unsigned VReg = RegInfo.createVirtualRegister(ArgRegClass);
+      RegInfo.addLiveIn(ArgRegs[ArgRegIdx], VReg);
+      ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
+      ++ArgRegIdx;
+    } else {
+      // We need to load the argument to a virtual register if we determined
+      // above that we ran out of physical registers of the appropriate type
+      // or we're forced to do vararg
+      int FI = MFI->CreateFixedObject(ObjSize, ArgOffset, true, false);
+      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0);
+      ArgOffset += StackSlotSize;
+    }
+
+    InVals.push_back(ArgVal);
+    // Update the chain
+    Chain = ArgVal.getOperand(0);
+  }
+
+  // vararg handling:
+  if (isVarArg) {
+    // unsigned int ptr_size = PtrVT.getSizeInBits() / 8;
+    // We will spill (79-3)+1 registers to the stack
+    SmallVector<SDValue, 79-3+1> MemOps;
+
+    // Create the frame slot
+
+    for (; ArgRegIdx != NumArgRegs; ++ArgRegIdx) {
+      // NOTE(review): VarArgsFrameIndex is overwritten on every iteration, so
+      // after the loop it names only the last spilled slot -- confirm this
+      // matches what the VASTART lowering expects.
+      VarArgsFrameIndex = MFI->CreateFixedObject(StackSlotSize, ArgOffset,
+                                                 true, false);
+      SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
+      SDValue ArgVal = DAG.getRegister(ArgRegs[ArgRegIdx], MVT::v16i8);
+      SDValue Store = DAG.getStore(Chain, dl, ArgVal, FIN, NULL, 0);
+      Chain = Store.getOperand(0);
+      MemOps.push_back(Store);
+
+      // Increment address by stack slot size for the next stored argument
+      ArgOffset += StackSlotSize;
+    }
+    // Tie all spill stores together into a single chain.
+    if (!MemOps.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &MemOps[0], MemOps.size());
+  }
+
+  return Chain;
+}
+
+/// isLSAAddress - Return the immediate to use if the specified
+/// value is representable as a LSA address.
+///
+/// Returns null unless the operand is a constant that is 4-byte aligned and
+/// whose value survives a sign-extension from 18 bits.  On success, returns
+/// an i32 constant node holding the address expressed in words (addr >> 2).
+static SDNode *isLSAAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return 0;
+
+  // NOTE(review): the 64-bit zero-extended constant is narrowed to int here;
+  // values above 32 bits would be truncated before the checks below.
+  int Addr = C->getZExtValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      (Addr << 14 >> 14) != Addr)
+    return 0;  // Top 14 bits have to be sext of immediate.
+
+  // Encode the address as a word (4-byte) offset.
+  return DAG.getConstant((int)C->getZExtValue() >> 2, MVT::i32).getNode();
+}
+
+//! Lower an outgoing call.
+//
+// Marshals the outgoing arguments into the SPU argument registers, spilling
+// any overflow into the outgoing-argument stack area, selects the call
+// addressing form based on the callee kind and memory model, emits the
+// CALLSEQ_START/CALL/CALLSEQ_END sequence, and finally copies the callee's
+// result registers into InVals.  Tail calls are not supported.
+SDValue
+SPUTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool &isTailCall,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals) {
+  // CellSPU target does not yet support tail call optimization.
+  isTailCall = false;
+
+  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
+  unsigned NumOps     = Outs.size();
+  unsigned StackSlotSize = SPUFrameInfo::stackSlotSize();
+  const unsigned *ArgRegs = SPURegisterInfo::getArgRegs();
+  const unsigned NumArgRegs = SPURegisterInfo::getNumArgRegs();
+
+  // Handy pointer type
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(SPU::R1, MVT::i32);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.
+  unsigned ArgOffset = SPUFrameInfo::minStackSize(); // Just below [LR]
+  unsigned ArgRegIdx = 0;
+
+  // Keep track of registers passing arguments
+  std::vector<std::pair<unsigned, SDValue> > RegsToPass;
+  // And the arguments passed on the stack
+  SmallVector<SDValue, 8> MemOpChains;
+
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = Outs[i].Val;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // All supported types are handled the same way: use the next argument
+    // register if one remains, otherwise store to the next stack slot.
+    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+    case MVT::i64:
+    case MVT::i128:
+      if (ArgRegIdx != NumArgRegs) {
+        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
+      } else {
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+        ArgOffset += StackSlotSize;
+      }
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (ArgRegIdx != NumArgRegs) {
+        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
+      } else {
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+        ArgOffset += StackSlotSize;
+      }
+      break;
+    case MVT::v2i64:
+    case MVT::v2f64:
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (ArgRegIdx != NumArgRegs) {
+        RegsToPass.push_back(std::make_pair(ArgRegs[ArgRegIdx++], Arg));
+      } else {
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+        ArgOffset += StackSlotSize;
+      }
+      break;
+    }
+  }
+
+  // Accumulate how many bytes are to be pushed on the stack, including the
+  // linkage area, and parameter passing area.  According to the SPU ABI,
+  // we minimally need space for [LR] and [SP].
+  unsigned NumStackBytes = ArgOffset - SPUFrameInfo::minStackSize();
+
+  // Insert a call sequence start
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumStackBytes,
+                                                            true));
+
+  if (!MemOpChains.empty()) {
+    // Adjust the stack pointer for the stack arguments.
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  SmallVector<SDValue, 8> Ops;
+  unsigned CallOpc = SPUISD::CALL;
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    GlobalValue *GV = G->getGlobal();
+    EVT CalleeVT = Callee.getValueType();
+    SDValue Zero = DAG.getConstant(0, PtrVT);
+    SDValue GA = DAG.getTargetGlobalAddress(GV, CalleeVT);
+
+    if (!ST->usingLargeMem()) {
+      // Turn calls to targets that are defined (i.e., have bodies) into BRSL
+      // style calls, otherwise, external symbols are BRASL calls. This assumes
+      // that declared/defined symbols are in the same compilation unit and can
+      // be reached through PC-relative jumps.
+      //
+      // NOTE:
+      // This may be an unsafe assumption for JIT and really large compilation
+      // units.
+      if (GV->isDeclaration()) {
+        Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, GA, Zero);
+      } else {
+        Callee = DAG.getNode(SPUISD::PCRelAddr, dl, CalleeVT, GA, Zero);
+      }
+    } else {
+      // "Large memory" mode: Turn all calls into indirect calls with a X-form
+      // address pairs:
+      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, GA, Zero);
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    EVT CalleeVT = Callee.getValueType();
+    SDValue Zero = DAG.getConstant(0, PtrVT);
+    SDValue ExtSym = DAG.getTargetExternalSymbol(S->getSymbol(),
+        Callee.getValueType());
+
+    if (!ST->usingLargeMem()) {
+      Callee = DAG.getNode(SPUISD::AFormAddr, dl, CalleeVT, ExtSym, Zero);
+    } else {
+      Callee = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT, ExtSym, Zero);
+    }
+  } else if (SDNode *Dest = isLSAAddress(Callee, DAG)) {
+    // If this is an absolute destination address that appears to be a legal
+    // local store address, use the munged value.
+    Callee = SDValue(Dest, 0);
+  }
+
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(CallOpc, dl, DAG.getVTList(MVT::Other, MVT::Flag),
+                      &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumStackBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // If the function returns void, just return the chain.
+  if (Ins.empty())
+    return Chain;
+
+  // If the call has results, copy the values out of the ret val registers.
+  switch (Ins[0].VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected ret value!");
+  case MVT::Other: break;
+  case MVT::i32:
+    if (Ins.size() > 1 && Ins[1].VT == MVT::i32) {
+      // Two i32 results: first from R4, second from R3.
+      // NOTE(review): the second copy's glue is taken from Chain.getValue(2)
+      // of the first CopyFromReg -- confirm that value index is the glue
+      // result here rather than getValue(1) style used elsewhere.
+      Chain = DAG.getCopyFromReg(Chain, dl, SPU::R4,
+                                 MVT::i32, InFlag).getValue(1);
+      InVals.push_back(Chain.getValue(0));
+      Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
+                                 Chain.getValue(2)).getValue(1);
+      InVals.push_back(Chain.getValue(0));
+    } else {
+      Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i32,
+                                 InFlag).getValue(1);
+      InVals.push_back(Chain.getValue(0));
+    }
+    break;
+  case MVT::i64:
+    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i64,
+                               InFlag).getValue(1);
+    InVals.push_back(Chain.getValue(0));
+    break;
+  case MVT::i128:
+    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, MVT::i128,
+                               InFlag).getValue(1);
+    InVals.push_back(Chain.getValue(0));
+    break;
+  case MVT::f32:
+  case MVT::f64:
+    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT,
+                               InFlag).getValue(1);
+    InVals.push_back(Chain.getValue(0));
+    break;
+  case MVT::v2f64:
+  case MVT::v2i64:
+  case MVT::v4f32:
+  case MVT::v4i32:
+  case MVT::v8i16:
+  case MVT::v16i8:
+    Chain = DAG.getCopyFromReg(Chain, dl, SPU::R3, Ins[0].VT,
+                                   InFlag).getValue(1);
+    InVals.push_back(Chain.getValue(0));
+    break;
+  }
+
+  return Chain;
+}
+
+//! Lower a function return.
+//
+// Analyzes the return values with RetCC_SPU, records the return registers
+// as function live-outs (first return lowered only), copies each value into
+// its assigned register, and emits an SPUISD::RET_FLAG glued to the last
+// copy (or un-glued if nothing is returned).
+SDValue
+SPUTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               DebugLoc dl, SelectionDAG &DAG) {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_SPU);
+
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  // Each copy is glued to the previous one so they stay adjacent.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             Outs[i].Val, Flag);
+    Flag = Chain.getValue(1);
+  }
+
+  if (Flag.getNode())
+    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+  else
+    return DAG.getNode(SPUISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering:
+//===----------------------------------------------------------------------===//
+
+//! Return the splatted constant of a build_vector, if any.
+//
+// Scans the operands of \p N, ignoring undef elements.  If every remaining
+// element is the same node and that node is a ConstantSDNode, return it;
+// otherwise return null.
+static ConstantSDNode *
+getVecImm(SDNode *N) {
+  SDValue Splat(0, 0);
+
+  // Find the unique non-undef operand, bailing out if two differ.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    SDValue Elt = N->getOperand(i);
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+    if (Splat.getNode() == 0)
+      Splat = Elt;
+    else if (Splat != Elt)
+      return 0;
+  }
+
+  // dyn_cast yields null for a non-constant (or all-undef) splat.
+  if (Splat.getNode() != 0)
+    return dyn_cast<ConstantSDNode>(Splat);
+
+  return 0;
+}
+
+/// get_vec_u18imm - Test if this vector is a vector filled with the same value
+/// and the value fits into an unsigned 18-bit constant, and if so, return the
+/// constant
+///
+/// (Comment previously misnamed this function "get_vec_i18imm".)  For
+/// MVT::i64, the 64-bit splat must have identical upper and lower 32-bit
+/// halves; the common half is then range-checked.
+SDValue SPU::get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
+                              EVT ValueType) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    uint64_t Value = CN->getZExtValue();
+    if (ValueType == MVT::i64) {
+      // Removed a redundant second CN->getZExtValue() read; Value already
+      // holds the zero-extended splat.
+      uint32_t upper = uint32_t(Value >> 32);
+      uint32_t lower = uint32_t(Value);
+      if (upper != lower)
+        return SDValue();
+      Value = Value >> 32;
+    }
+    if (Value <= 0x3ffff)
+      return DAG.getTargetConstant(Value, ValueType);
+  }
+
+  return SDValue();
+}
+
+/// get_vec_i16imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 16-bit constant, and if so, return the
+/// constant
+SDValue SPU::get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
+                              EVT ValueType) {
+  ConstantSDNode *CN = getVecImm(N);
+  if (!CN)
+    return SDValue();
+
+  int64_t Value = CN->getSExtValue();
+  if (ValueType == MVT::i64) {
+    // A 64-bit splat must have identical 32-bit halves; test one half.
+    uint64_t UValue = CN->getZExtValue();
+    uint32_t upper = uint32_t(UValue >> 32);
+    uint32_t lower = uint32_t(UValue);
+    if (upper != lower)
+      return SDValue();
+    Value = Value >> 32;
+  }
+
+  // Signed 16-bit range: [-32768, 32767].
+  if (Value < -(1 << 15) || Value > ((1 << 15) - 1))
+    return SDValue();
+
+  return DAG.getTargetConstant(Value, ValueType);
+}
+
+/// get_vec_i10imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 10-bit constant, and if so, return the
+/// constant
+SDValue SPU::get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
+                              EVT ValueType) {
+  ConstantSDNode *CN = getVecImm(N);
+  if (!CN)
+    return SDValue();
+
+  int64_t Value = CN->getSExtValue();
+  if (ValueType == MVT::i64) {
+    // A 64-bit splat must have identical 32-bit halves; test one half.
+    uint64_t UValue = CN->getZExtValue();
+    uint32_t upper = uint32_t(UValue >> 32);
+    uint32_t lower = uint32_t(UValue);
+    if (upper != lower)
+      return SDValue();
+    Value = Value >> 32;
+  }
+
+  if (!isS10Constant(Value))
+    return SDValue();
+
+  return DAG.getTargetConstant(Value, ValueType);
+}
+
+/// get_vec_i8imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 8-bit constant, and if so, return the
+/// constant.
+///
+/// @note: The incoming vector is v16i8 because that's the only way we can load
+/// constant vectors. Thus, we test to see if the upper and lower bytes are the
+/// same value.
+SDValue SPU::get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
+                             EVT ValueType) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    int Value = (int) CN->getZExtValue();
+    // NOTE(review): "(short) Value >> 8" sign-extends the high byte while
+    // "(short) Value & 0xff" does not, so halfwords with bit 15 set (e.g.
+    // 0x8080) fail this comparison even when both bytes match -- confirm
+    // that is intended.
+    if (ValueType == MVT::i16
+        && Value <= 0xffff                 /* truncated from uint64_t */
+        && ((short) Value >> 8) == ((short) Value & 0xff))
+      return DAG.getTargetConstant(Value & 0xff, ValueType);
+    else if (ValueType == MVT::i8
+             && (Value & 0xff) == Value)
+      return DAG.getTargetConstant(Value, ValueType);
+  }
+
+  return SDValue();
+}
+
+/// get_ILHUvec_imm - Test if this vector is a vector filled with the same value
+/// and the value fits into a signed 16-bit constant, and if so, return the
+/// constant
+SDValue SPU::get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
+                               EVT ValueType) {
+  ConstantSDNode *CN = getVecImm(N);
+  if (!CN)
+    return SDValue();
+
+  // The splat qualifies when its low 16 bits are clear, i.e. it is exactly
+  // a 16-bit value positioned in the upper halfword (ILHU's shape).
+  uint64_t Value = CN->getZExtValue();
+  bool fitsI32 = ValueType == MVT::i32
+                 && ((unsigned) Value & 0xffff0000) == (unsigned) Value;
+  bool fitsI64 = ValueType == MVT::i64
+                 && (Value & 0xffff0000) == Value;
+
+  if (fitsI32 || fitsI64)
+    return DAG.getTargetConstant(Value >> 16, ValueType);
+
+  return SDValue();
+}
+
+/// get_v4i32_imm - Catch-all for general 32-bit constant vectors
+SDValue SPU::get_v4i32_imm(SDNode *N, SelectionDAG &DAG) {
+  ConstantSDNode *CN = getVecImm(N);
+  if (CN != 0)
+    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i32);
+
+  return SDValue();
+}
+
+/// get_v2i64_imm - Catch-all for general 64-bit constant vectors
+/// (comment previously copy-pasted the name "get_v4i32_imm")
+///
+/// NOTE(review): the splat is narrowed through (unsigned) before being
+/// rebuilt as an i64 target constant, discarding the upper 32 bits --
+/// confirm this truncation is intended.
+SDValue SPU::get_v2i64_imm(SDNode *N, SelectionDAG &DAG) {
+  if (ConstantSDNode *CN = getVecImm(N)) {
+    return DAG.getTargetConstant((unsigned) CN->getZExtValue(), MVT::i64);
+  }
+
+  return SDValue();
+}
+
+//! Lower a BUILD_VECTOR instruction creatively:
+//
+// Only constant splats are handled here; anything else returns an empty
+// SDValue so the default expansion runs.  Each vector type builds the splat
+// from the widest convenient element (SPU constants are loaded as at least
+// halfwords), bitcasting to the requested type where necessary.
+static SDValue
+LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = Op.getDebugLoc();
+  BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerBUILD_VECTOR");
+  unsigned minSplatBits = EltVT.getSizeInBits();
+
+  // Never look for a splat narrower than 16 bits (8-bit constants must be
+  // widened to halfwords anyway).
+  if (minSplatBits < 16)
+    minSplatBits = 16;
+
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+
+  // Reject non-constant vectors, and splats wider than the requested minimum.
+  if (!BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                            HasAnyUndefs, minSplatBits)
+      || minSplatBits < SplatBitSize)
+    return SDValue();   // Wasn't a constant vector or splat exceeded min
+
+  uint64_t SplatBits = APSplatBits.getZExtValue();
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: {
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "CellSPU: Unhandled VT in LowerBUILD_VECTOR, VT = "
+         << VT.getEVTString();
+    llvm_report_error(Msg.str());
+    /*NOTREACHED*/
+  }
+  case MVT::v4f32: {
+    uint32_t Value32 = uint32_t(SplatBits);
+    assert(SplatBitSize == 32
+           && "LowerBUILD_VECTOR: Unexpected floating point vector element.");
+    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
+    SDValue T = DAG.getConstant(Value32, MVT::i32);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32,
+                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, T,T,T,T));
+    break;  // unreachable: follows a return
+  }
+  case MVT::v2f64: {
+    uint64_t f64val = uint64_t(SplatBits);
+    assert(SplatBitSize == 64
+           && "LowerBUILD_VECTOR: 64-bit float vector size > 8 bytes.");
+    // NOTE: pretend the constant is an integer. LLVM won't load FP constants
+    SDValue T = DAG.getConstant(f64val, MVT::i64);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64,
+                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64, T, T));
+    break;  // unreachable: follows a return
+  }
+  case MVT::v16i8: {
+   // 8-bit constants have to be expanded to 16-bits
+   unsigned short Value16 = SplatBits /* | (SplatBits << 8) */;
+   SmallVector<SDValue, 8> Ops;
+
+   Ops.assign(8, DAG.getConstant(Value16, MVT::i16));
+   return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i16, &Ops[0], Ops.size()));
+  }
+  case MVT::v8i16: {
+    unsigned short Value16 = SplatBits;
+    SDValue T = DAG.getConstant(Value16, EltVT);
+    SmallVector<SDValue, 8> Ops;
+
+    Ops.assign(8, T);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Ops[0], Ops.size());
+  }
+  case MVT::v4i32: {
+    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T, T, T);
+  }
+  case MVT::v2i32: {
+    SDValue T = DAG.getConstant(unsigned(SplatBits), VT.getVectorElementType());
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, T, T);
+  }
+  case MVT::v2i64: {
+    // 64-bit splats need shuffle-based synthesis; handled separately.
+    return SPU::LowerV2I64Splat(VT, DAG, SplatBits, dl);
+  }
+  }
+
+  return SDValue();
+}
+
+/*!
+ Lower a v2i64 splat of \p SplatVal.
+
+ If the upper and lower 32-bit halves of the splat value are equal, the
+ vector is built as a v4i32 splat and bitcast.  Halves equal to the "magic"
+ patterns 0, 0xffffffff or 0x80000000 can be synthesized by a SHUFB control
+ word; if both halves are such patterns the value is simply emitted as a
+ constant BUILD_VECTOR (eventually a constant-pool load).  Otherwise a SHUFB
+ is built that combines the non-special halves with generated mask bytes.
+ */
+SDValue
+SPU::LowerV2I64Splat(EVT OpVT, SelectionDAG& DAG, uint64_t SplatVal,
+                     DebugLoc dl) {
+  uint32_t upper = uint32_t(SplatVal >> 32);
+  uint32_t lower = uint32_t(SplatVal);
+
+  if (upper == lower) {
+    // Magic constant that can be matched by IL, ILA, et. al.
+    SDValue Val = DAG.getTargetConstant(upper, MVT::i32);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                   Val, Val, Val, Val));
+  } else {
+    bool upper_special, lower_special;
+
+    // NOTE: This code creates common-case shuffle masks that can be easily
+    // detected as common expressions. It is not attempting to create highly
+    // specialized masks to replace any and all 0's, 0xff's and 0x80's.
+
+    // Detect if the upper or lower half is a special shuffle mask pattern:
+    upper_special = (upper == 0 || upper == 0xffffffff || upper == 0x80000000);
+    lower_special = (lower == 0 || lower == 0xffffffff || lower == 0x80000000);
+
+    // Both upper and lower are special, lower to a constant pool load:
+    if (lower_special && upper_special) {
+      SDValue SplatValCN = DAG.getConstant(SplatVal, MVT::i64);
+      return DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i64,
+                         SplatValCN, SplatValCN);
+    }
+
+    SDValue LO32;
+    SDValue HI32;
+    SmallVector<SDValue, 16> ShufBytes;
+    SDValue Result;
+
+    // Create lower vector if not a special pattern
+    if (!lower_special) {
+      SDValue LO32C = DAG.getConstant(lower, MVT::i32);
+      LO32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                     LO32C, LO32C, LO32C, LO32C));
+    }
+
+    // Create upper vector if not a special pattern
+    if (!upper_special) {
+      SDValue HI32C = DAG.getConstant(upper, MVT::i32);
+      HI32 = DAG.getNode(ISD::BIT_CONVERT, dl, OpVT,
+                         DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                     HI32C, HI32C, HI32C, HI32C));
+    }
+
+    // If either upper or lower are special, then the two input operands are
+    // the same (basically, one of them is a "don't care")
+    if (lower_special)
+      LO32 = HI32;
+    if (upper_special)
+      HI32 = LO32;
+
+    // Build the 16 shuffle-control bytes, one 32-bit word at a time.  Even
+    // words (i & 1) == 0 select from the upper half, odd words from the
+    // lower half; special halves are encoded as generate-0x00 (0x80),
+    // generate-0xff (0xc0) or generate-0x80 (0xe0 in byte 0) patterns.
+    for (int i = 0; i < 4; ++i) {
+      uint64_t val = 0;
+      for (int j = 0; j < 4; ++j) {
+        SDValue V;
+        bool process_upper, process_lower;
+        val <<= 8;
+        process_upper = (upper_special && (i & 1) == 0);
+        process_lower = (lower_special && (i & 1) == 1);
+
+        if (process_upper || process_lower) {
+          if ((process_upper && upper == 0)
+                  || (process_lower && lower == 0))
+            val |= 0x80;
+          else if ((process_upper && upper == 0xffffffff)
+                  || (process_lower && lower == 0xffffffff))
+            val |= 0xc0;
+          else if ((process_upper && upper == 0x80000000)
+                  || (process_lower && lower == 0x80000000))
+            val |= (j == 0 ? 0xe0 : 0x80);
+        } else
+          // Plain byte select from the appropriate source vector.
+          val |= i * 4 + j + ((i & 1) * 16);
+      }
+
+      ShufBytes.push_back(DAG.getConstant(val, MVT::i32));
+    }
+
+    return DAG.getNode(SPUISD::SHUFB, dl, OpVT, HI32, LO32,
+                       DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                   &ShufBytes[0], ShufBytes.size()));
+  }
+}
+
+/// LowerVECTOR_SHUFFLE - Lower a vector shuffle (V1, V2, V3) to something on
+/// which the Cell can operate. The code inspects V3 to ascertain whether the
+/// permutation vector, V3, is monotonically increasing with one "exception"
+/// element, e.g., (0, 1, _, 3). If this is the case, then generate a
+/// SHUFFLE_MASK synthetic instruction. Otherwise, spill V3 to the constant pool.
+/// In either case, the net result is going to eventually invoke SHUFB to
+/// permute/shuffle the bytes from V1 and V2.
+/// \note
+/// SHUFFLE_MASK is eventually selected as one of the C*D instructions, generate
+/// control word for byte/halfword/word insertion. This takes care of a single
+/// element move from V2 into V1.
+/// \note
+/// SPUISD::SHUFB is eventually selected as Cell's <i>shufb</i> instructions.
+static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // An undef second operand behaves like a duplicate of the first.
+  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+
+  // If we have a single element being moved from V1 to V2, this can be handled
+  // using the C*[DX] compute mask instructions, but the vector elements have
+  // to be monotonically increasing with one exception element.
+  EVT VecVT = V1.getValueType();
+  EVT EltVT = VecVT.getVectorElementType();
+  unsigned EltsFromV2 = 0;      // count of mask entries that index into V2
+  unsigned V2Elt = 0;           // byte offset encoding of the V2 element
+  unsigned V2EltIdx0 = 0;       // first mask index that refers to V2
+  unsigned CurrElt = 0;         // expected next index for monotonic check
+  unsigned MaxElts = VecVT.getVectorNumElements();
+  unsigned PrevElt = 0;         // previous index seen, for rotate detection
+  unsigned V0Elt = 0;           // position where index 0 appears (rotate)
+  bool monotonic = true;
+  bool rotate = true;
+
+  if (EltVT == MVT::i8) {
+    V2EltIdx0 = 16;
+  } else if (EltVT == MVT::i16) {
+    V2EltIdx0 = 8;
+  } else if (EltVT == MVT::i32 || EltVT == MVT::f32) {
+    V2EltIdx0 = 4;
+  } else if (EltVT == MVT::i64 || EltVT == MVT::f64) {
+    V2EltIdx0 = 2;
+  } else
+    llvm_unreachable("Unhandled vector type in LowerVECTOR_SHUFFLE");
+
+  // One pass over the mask classifies the shuffle as monotonic-with-one-
+  // insertion, a byte rotation, or neither.
+  for (unsigned i = 0; i != MaxElts; ++i) {
+    if (SVN->getMaskElt(i) < 0)
+      continue;
+    
+    unsigned SrcElt = SVN->getMaskElt(i);
+
+    if (monotonic) {
+      if (SrcElt >= V2EltIdx0) {
+        // Only the first V2 reference is recorded.
+        // NOTE(review): "(V2EltIdx0 - SrcElt) << 2" goes negative (wraps,
+        // since these are unsigned) whenever SrcElt > V2EltIdx0 -- confirm
+        // the operand order is intended vs. (SrcElt - V2EltIdx0).
+        if (1 >= (++EltsFromV2)) {
+          V2Elt = (V2EltIdx0 - SrcElt) << 2;
+        }
+      } else if (CurrElt != SrcElt) {
+        monotonic = false;
+      }
+
+      ++CurrElt;
+    }
+
+    if (rotate) {
+      if (PrevElt > 0 && SrcElt < MaxElts) {
+        // Still a rotation if indices increase by one, wrapping at the end.
+        if ((PrevElt == SrcElt - 1)
+            || (PrevElt == MaxElts - 1 && SrcElt == 0)) {
+          PrevElt = SrcElt;
+          if (SrcElt == 0)
+            V0Elt = i;
+        } else {
+          rotate = false;
+        }
+      } else if (PrevElt == 0) {
+        // First time through, need to keep track of previous element
+        PrevElt = SrcElt;
+      } else {
+        // This isn't a rotation, takes elements from vector 2
+        rotate = false;
+      }
+    }
+  }
+
+  if (EltsFromV2 == 1 && monotonic) {
+    // Compute mask and shuffle
+    MachineFunction &MF = DAG.getMachineFunction();
+    MachineRegisterInfo &RegInfo = MF.getRegInfo();
+    unsigned VReg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    // Initialize temporary register to 0
+    SDValue InitTempReg =
+      DAG.getCopyToReg(DAG.getEntryNode(), dl, VReg, DAG.getConstant(0, PtrVT));
+    // Copy register's contents as index in SHUFFLE_MASK:
+    SDValue ShufMaskOp =
+      DAG.getNode(SPUISD::SHUFFLE_MASK, dl, MVT::v4i32,
+                  DAG.getTargetConstant(V2Elt, MVT::i32),
+                  DAG.getCopyFromReg(InitTempReg, dl, VReg, PtrVT));
+    // Use shuffle mask in SHUFB synthetic instruction:
+    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V2, V1,
+                       ShufMaskOp);
+  } else if (rotate) {
+    // A pure rotation: rotate left by the byte distance of index 0.
+    int rotamt = (MaxElts - V0Elt) * EltVT.getSizeInBits()/8;
+
+    return DAG.getNode(SPUISD::ROTBYTES_LEFT, dl, V1.getValueType(),
+                       V1, DAG.getConstant(rotamt, MVT::i16));
+  } else {
+   // Convert the SHUFFLE_VECTOR mask's input element units to the
+   // actual bytes.
+    unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+    SmallVector<SDValue, 16> ResultMask;
+    for (unsigned i = 0, e = MaxElts; i != e; ++i) {
+      // Undef mask entries default to selecting element 0.
+      unsigned SrcElt = SVN->getMaskElt(i) < 0 ? 0 : SVN->getMaskElt(i);
+
+      for (unsigned j = 0; j < BytesPerElement; ++j)
+        ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,MVT::i8));
+    }
+
+    SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
+                                    &ResultMask[0], ResultMask.size());
+    return DAG.getNode(SPUISD::SHUFB, dl, V1.getValueType(), V1, V2, VPermMask);
+  }
+}
+
+static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0);                     // Op0 = the scalar
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Op0.getNode()->getOpcode() == ISD::Constant) {
+    // For a constant, build the appropriate constant vector, which will
+    // eventually simplify to a vector register load.
+
+    ConstantSDNode *CN = cast<ConstantSDNode>(Op0.getNode());
+    SmallVector<SDValue, 16> ConstVecValues;
+    EVT VT;
+    size_t n_copies;
+
+    // Create a constant vector:
+    switch (Op.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected constant value type in "
+                              "LowerSCALAR_TO_VECTOR");
+    case MVT::v16i8: n_copies = 16; VT = MVT::i8; break;
+    case MVT::v8i16: n_copies = 8; VT = MVT::i16; break;
+    case MVT::v4i32: n_copies = 4; VT = MVT::i32; break;
+    case MVT::v4f32: n_copies = 4; VT = MVT::f32; break;
+    case MVT::v2i64: n_copies = 2; VT = MVT::i64; break;
+    case MVT::v2f64: n_copies = 2; VT = MVT::f64; break;
+    }
+
+    SDValue CValue = DAG.getConstant(CN->getZExtValue(), VT);
+    for (size_t j = 0; j < n_copies; ++j)
+      ConstVecValues.push_back(CValue);
+
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, Op.getValueType(),
+                       &ConstVecValues[0], ConstVecValues.size());
+  } else {
+    // Otherwise, copy the value from one register to another:
+    switch (Op0.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected value type in LowerSCALAR_TO_VECTOR");
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+    case MVT::i64:
+    case MVT::f32:
+    case MVT::f64:
+      return DAG.getNode(SPUISD::PREFSLOT2VEC, dl, Op.getValueType(), Op0, Op0);
+    }
+  }
+
+  return SDValue();
+}
+
//! Lower ISD::EXTRACT_VECTOR_ELT
/*!
  Two strategies, depending on the index operand:

  - Constant index: either read the preferred slot directly (element 0 of
    i32/i64), or build a SHUFB mask that moves the requested element's bytes
    into the preferred slot and extract from there.
  - Variable index: rotate the requested element to byte 0 of the quadword
    (SHLQUAD_L_BYTES), replicate slot 0 across the vector with SHUFB, then
    read the preferred slot.
 */
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  SDValue N = Op.getOperand(0);      // the source vector
  SDValue Elt = Op.getOperand(1);    // the element index
  DebugLoc dl = Op.getDebugLoc();
  SDValue retval;

  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Elt)) {
    // Constant argument:
    int EltNo = (int) C->getZExtValue();

    // sanity checks:
    // NOTE(review): the two messages below say "> 4" / "> 2" but the checks
    // fire for slot >= 4 / >= 2, i.e. "> 3" / "> 1" — the conditions look
    // right, the message text is off by one.
    if (VT == MVT::i8 && EltNo >= 16)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i8 extraction slot > 15");
    else if (VT == MVT::i16 && EltNo >= 8)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i16 extraction slot > 7");
    else if (VT == MVT::i32 && EltNo >= 4)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i32 extraction slot > 4");
    else if (VT == MVT::i64 && EltNo >= 2)
      llvm_unreachable("SPU LowerEXTRACT_VECTOR_ELT: i64 extraction slot > 2");

    if (EltNo == 0 && (VT == MVT::i32 || VT == MVT::i64)) {
      // i32 and i64: Element 0 is the preferred slot, no shuffle needed.
      return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, N);
    }

    // Need to generate shuffle mask and extract:
    int prefslot_begin = -1, prefslot_end = -1;  // byte range of the pref slot
    int elt_byte = EltNo * VT.getSizeInBits() / 8;  // first byte of the element

    // The preferred slot's byte positions within the quadword depend on the
    // element width:
    switch (VT.getSimpleVT().SimpleTy) {
    default:
      assert(false && "Invalid value type!");
    case MVT::i8: {
      prefslot_begin = prefslot_end = 3;
      break;
    }
    case MVT::i16: {
      prefslot_begin = 2; prefslot_end = 3;
      break;
    }
    case MVT::i32:
    case MVT::f32: {
      prefslot_begin = 0; prefslot_end = 3;
      break;
    }
    case MVT::i64:
    case MVT::f64: {
      prefslot_begin = 0; prefslot_end = 7;
      break;
    }
    }

    assert(prefslot_begin != -1 && prefslot_end != -1 &&
           "LowerEXTRACT_VECTOR_ELT: preferred slots uninitialized");

    // Build the 16-byte shuffle control: the preferred-slot bytes pick up
    // the requested element's bytes; bytes before the slot are zero-filled
    // (0x80); bytes past the slot just repeat the pattern (don't-care).
    unsigned int ShufBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    };
    for (int i = 0; i < 16; ++i) {
      // zero fill upper part of preferred slot, don't care about the
      // other slots:
      unsigned int mask_val;
      if (i <= prefslot_end) {
        mask_val =
          ((i < prefslot_begin)
           ? 0x80                               // SHUFB "insert zero" selector
           : elt_byte + (i - prefslot_begin));  // byte of the wanted element

        ShufBytes[i] = mask_val;
      } else
        ShufBytes[i] = ShufBytes[i % (prefslot_end + 1)];
    }

    // Pack the 16 byte selectors into four big-endian i32 words:
    SDValue ShufMask[4];
    for (unsigned i = 0; i < sizeof(ShufMask)/sizeof(ShufMask[0]); ++i) {
      unsigned bidx = i * 4;
      unsigned int bits = ((ShufBytes[bidx] << 24) |
                           (ShufBytes[bidx+1] << 16) |
                           (ShufBytes[bidx+2] << 8) |
                           ShufBytes[bidx+3]);
      ShufMask[i] = DAG.getConstant(bits, MVT::i32);
    }

    SDValue ShufMaskVec =
      DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                  &ShufMask[0], sizeof(ShufMask)/sizeof(ShufMask[0]));

    // Shuffle the element into the preferred slot, then extract it:
    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                         DAG.getNode(SPUISD::SHUFB, dl, N.getValueType(),
                                     N, N, ShufMaskVec));
  } else {
    // Variable index: Rotate the requested element into slot 0, then replicate
    // slot 0 across the vector
    EVT VecVT = N.getValueType();
    if (!VecVT.isSimple() || !VecVT.isVector() || !VecVT.is128BitVector()) {
      llvm_report_error("LowerEXTRACT_VECTOR_ELT: Must have a simple, 128-bit"
                        "vector type!");
    }

    // Make life easier by making sure the index is zero-extended to i32
    if (Elt.getValueType() != MVT::i32)
      Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Elt);

    // Scale the index to a bit/byte shift quantity: bytes-per-element is a
    // power of two, so the multiply becomes a left shift by log2.
    APInt scaleFactor =
            APInt(32, uint64_t(16 / N.getValueType().getVectorNumElements()), false);
    unsigned scaleShift = scaleFactor.logBase2();
    SDValue vecShift;

    if (scaleShift > 0) {
      // Scale the shift factor:
      Elt = DAG.getNode(ISD::SHL, dl, MVT::i32, Elt,
                        DAG.getConstant(scaleShift, MVT::i32));
    }

    // Rotate the requested element's bytes down to byte 0:
    vecShift = DAG.getNode(SPUISD::SHLQUAD_L_BYTES, dl, VecVT, N, Elt);

    // Replicate the bytes starting at byte 0 across the entire vector (for
    // consistency with the notion of a unified register set). The BUILD_VECTOR
    // words below are SHUFB byte selectors (0x00010203 = bytes 0-3, etc.).
    SDValue replicate;

    switch (VT.getSimpleVT().SimpleTy) {
    default:
      llvm_report_error("LowerEXTRACT_VECTOR_ELT(varable): Unhandled vector"
                        "type");
      /*NOTREACHED*/
    case MVT::i8: {
      // Replicate byte 0 everywhere:
      SDValue factor = DAG.getConstant(0x00000000, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i16: {
      // Replicate bytes 0-1 into every halfword:
      SDValue factor = DAG.getConstant(0x00010001, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i32:
    case MVT::f32: {
      // Replicate bytes 0-3 into every word:
      SDValue factor = DAG.getConstant(0x00010203, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              factor, factor, factor, factor);
      break;
    }
    case MVT::i64:
    case MVT::f64: {
      // Replicate bytes 0-7 into both doublewords:
      SDValue loFactor = DAG.getConstant(0x00010203, MVT::i32);
      SDValue hiFactor = DAG.getConstant(0x04050607, MVT::i32);
      replicate = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
                              loFactor, hiFactor, loFactor, hiFactor);
      break;
    }
    }

    retval = DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT,
                         DAG.getNode(SPUISD::SHUFB, dl, VecVT,
                                     vecShift, vecShift, replicate));
  }

  return retval;
}
+
+static SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  SDValue VecOp = Op.getOperand(0);
+  SDValue ValOp = Op.getOperand(1);
+  SDValue IdxOp = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  ConstantSDNode *CN = cast<ConstantSDNode>(IdxOp);
+  assert(CN != 0 && "LowerINSERT_VECTOR_ELT: Index is not constant!");
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Use $sp ($1) because it's always 16-byte aligned and it's available:
+  SDValue Pointer = DAG.getNode(SPUISD::IndirectAddr, dl, PtrVT,
+                                DAG.getRegister(SPU::R1, PtrVT),
+                                DAG.getConstant(CN->getSExtValue(), PtrVT));
+  SDValue ShufMask = DAG.getNode(SPUISD::SHUFFLE_MASK, dl, VT, Pointer);
+
+  SDValue result =
+    DAG.getNode(SPUISD::SHUFB, dl, VT,
+                DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, ValOp),
+                VecOp,
+                DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, ShufMask));
+
+  return result;
+}
+
+static SDValue LowerI8Math(SDValue Op, SelectionDAG &DAG, unsigned Opc,
+                           const TargetLowering &TLI)
+{
+  SDValue N0 = Op.getOperand(0);      // Everything has at least one operand
+  DebugLoc dl = Op.getDebugLoc();
+  EVT ShiftVT = TLI.getShiftAmountTy();
+
+  assert(Op.getValueType() == MVT::i8);
+  switch (Opc) {
+  default:
+    llvm_unreachable("Unhandled i8 math operator");
+    /*NOTREACHED*/
+    break;
+  case ISD::ADD: {
+    // 8-bit addition: Promote the arguments up to 16-bits and truncate
+    // the result:
+    SDValue N1 = Op.getOperand(1);
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+
+  }
+
+  case ISD::SUB: {
+    // 8-bit subtraction: Promote the arguments up to 16-bits and truncate
+    // the result:
+    SDValue N1 = Op.getOperand(1);
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+  }
+  case ISD::ROTR:
+  case ISD::ROTL: {
+    SDValue N1 = Op.getOperand(1);
+    EVT N1VT = N1.getValueType();
+
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
+    if (!N1VT.bitsEq(ShiftVT)) {
+      unsigned N1Opc = N1.getValueType().bitsLT(ShiftVT)
+                       ? ISD::ZERO_EXTEND
+                       : ISD::TRUNCATE;
+      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+    }
+
+    // Replicate lower 8-bits into upper 8:
+    SDValue ExpandArg =
+      DAG.getNode(ISD::OR, dl, MVT::i16, N0,
+                  DAG.getNode(ISD::SHL, dl, MVT::i16,
+                              N0, DAG.getConstant(8, MVT::i32)));
+
+    // Truncate back down to i8
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                       DAG.getNode(Opc, dl, MVT::i16, ExpandArg, N1));
+  }
+  case ISD::SRL:
+  case ISD::SHL: {
+    SDValue N1 = Op.getOperand(1);
+    EVT N1VT = N1.getValueType();
+
+    N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, N0);
+    if (!N1VT.bitsEq(ShiftVT)) {
+      unsigned N1Opc = ISD::ZERO_EXTEND;
+
+      if (N1.getValueType().bitsGT(ShiftVT))
+        N1Opc = ISD::TRUNCATE;
+
+      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+    }
+
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+  }
+  case ISD::SRA: {
+    SDValue N1 = Op.getOperand(1);
+    EVT N1VT = N1.getValueType();
+
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+    if (!N1VT.bitsEq(ShiftVT)) {
+      unsigned N1Opc = ISD::SIGN_EXTEND;
+
+      if (N1VT.bitsGT(ShiftVT))
+        N1Opc = ISD::TRUNCATE;
+      N1 = DAG.getNode(N1Opc, dl, ShiftVT, N1);
+    }
+
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+  }
+  case ISD::MUL: {
+    SDValue N1 = Op.getOperand(1);
+
+    N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N0);
+    N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i16, N1);
+    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                       DAG.getNode(Opc, dl, MVT::i16, N0, N1));
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
+//! Lower byte immediate operations for v16i8 vectors:
+static SDValue
+LowerByteImmed(SDValue Op, SelectionDAG &DAG) {
+  SDValue ConstVec;
+  SDValue Arg;
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  ConstVec = Op.getOperand(0);
+  Arg = Op.getOperand(1);
+  if (ConstVec.getNode()->getOpcode() != ISD::BUILD_VECTOR) {
+    if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
+      ConstVec = ConstVec.getOperand(0);
+    } else {
+      ConstVec = Op.getOperand(1);
+      Arg = Op.getOperand(0);
+      if (ConstVec.getNode()->getOpcode() == ISD::BIT_CONVERT) {
+        ConstVec = ConstVec.getOperand(0);
+      }
+    }
+  }
+
+  if (ConstVec.getNode()->getOpcode() == ISD::BUILD_VECTOR) {
+    BuildVectorSDNode *BCN = dyn_cast<BuildVectorSDNode>(ConstVec.getNode());
+    assert(BCN != 0 && "Expected BuildVectorSDNode in SPU LowerByteImmed");
+
+    APInt APSplatBits, APSplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    unsigned minSplatBits = VT.getVectorElementType().getSizeInBits();
+
+    if (BCN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                              HasAnyUndefs, minSplatBits)
+        && minSplatBits <= SplatBitSize) {
+      uint64_t SplatBits = APSplatBits.getZExtValue();
+      SDValue tc = DAG.getTargetConstant(SplatBits & 0xff, MVT::i8);
+
+      SmallVector<SDValue, 16> tcVec;
+      tcVec.assign(16, tc);
+      return DAG.getNode(Op.getNode()->getOpcode(), dl, VT, Arg,
+                         DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &tcVec[0], tcVec.size()));
+    }
+  }
+
+  // These operations (AND, OR, XOR) are legal, they just couldn't be custom
+  // lowered.  Return the operation, rather than a null SDValue.
+  return Op;
+}
+
//! Custom lowering for CTPOP (count population)
/*!
  Custom lowering code that counts the number ones in the input
  operand. SPU has such an instruction, but it counts the number of
  ones per byte, which then have to be accumulated. The i16/i32 paths
  sum the per-byte counts with shift/add sequences routed through
  explicit virtual registers; i64 is left for default expansion.
*/
static SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  // 128-bit vector type with VT-sized elements: the scalar is promoted into
  // this so the byte-wise CNTB node can be applied to it.
  EVT vecVT = EVT::getVectorVT(*DAG.getContext(), 
                               VT, (128 / VT.getSizeInBits()));
  DebugLoc dl = Op.getDebugLoc();

  switch (VT.getSimpleVT().SimpleTy) {
  default:
    // NOTE(review): assert(false) is a no-op in release builds, so an
    // unexpected type would fall through into the i8 case — confirm whether
    // llvm_unreachable was intended here.
    assert(false && "Invalid value type!");
  case MVT::i8: {
    // One byte: the CNTB result for the preferred slot is already the answer.
    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CNTB, Elt0);
  }

  case MVT::i16: {
    // Two bytes: add the high byte-count to the low one and mask to the
    // possible range (0..16 fits in the low 5 bits, masked with 0x0f after
    // the fold — the upper byte of the sum is discarded by the AND).
    MachineFunction &MF = DAG.getMachineFunction();
    MachineRegisterInfo &RegInfo = MF.getRegInfo();

    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R16CRegClass);

    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i16);
    SDValue Mask0 = DAG.getConstant(0x0f, MVT::i16);
    SDValue Shift1 = DAG.getConstant(8, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDValue CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, CNTB, Elt0);

    SDValue CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);

    SDValue Tmp1 = DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i16);

    // (Tmp1 >> 8) + Tmp1, masked: sum of the two per-byte counts.
    return DAG.getNode(ISD::AND, dl, MVT::i16,
                       DAG.getNode(ISD::ADD, dl, MVT::i16,
                                   DAG.getNode(ISD::SRL, dl, MVT::i16,
                                               Tmp1, Shift1),
                                   Tmp1),
                       Mask0);
  }

  case MVT::i32: {
    // Four bytes: two shift/add reduction steps (fold halves, then bytes),
    // then mask to the final byte-sized count.
    MachineFunction &MF = DAG.getMachineFunction();
    MachineRegisterInfo &RegInfo = MF.getRegInfo();

    unsigned CNTB_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);
    unsigned SUM1_reg = RegInfo.createVirtualRegister(&SPU::R32CRegClass);

    SDValue N = Op.getOperand(0);
    SDValue Elt0 = DAG.getConstant(0, MVT::i32);
    SDValue Mask0 = DAG.getConstant(0xff, MVT::i32);
    SDValue Shift1 = DAG.getConstant(16, MVT::i32);
    SDValue Shift2 = DAG.getConstant(8, MVT::i32);

    SDValue Promote = DAG.getNode(SPUISD::PREFSLOT2VEC, dl, vecVT, N, N);
    SDValue CNTB = DAG.getNode(SPUISD::CNTB, dl, vecVT, Promote);

    // CNTB_result becomes the chain to which all of the virtual registers
    // CNTB_reg, SUM1_reg become associated:
    SDValue CNTB_result =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, CNTB, Elt0);

    SDValue CNTB_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, CNTB_reg, CNTB_result);

    // Step 1: fold the upper halfword's counts onto the lower halfword:
    SDValue Comp1 =
      DAG.getNode(ISD::SRL, dl, MVT::i32,
                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32),
                  Shift1);

    SDValue Sum1 =
      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp1,
                  DAG.getCopyFromReg(CNTB_rescopy, dl, CNTB_reg, MVT::i32));

    SDValue Sum1_rescopy =
      DAG.getCopyToReg(CNTB_result, dl, SUM1_reg, Sum1);

    // Step 2: fold the remaining upper byte onto the low byte:
    SDValue Comp2 =
      DAG.getNode(ISD::SRL, dl, MVT::i32,
                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32),
                  Shift2);
    SDValue Sum2 =
      DAG.getNode(ISD::ADD, dl, MVT::i32, Comp2,
                  DAG.getCopyFromReg(Sum1_rescopy, dl, SUM1_reg, MVT::i32));

    // Only the low byte of the accumulated sum is the population count:
    return DAG.getNode(ISD::AND, dl, MVT::i32, Sum2, Mask0);
  }

  case MVT::i64:
    // Not handled here; fall out and let the caller/legalizer expand it.
    break;
  }

  return SDValue();
}
+
+//! Lower ISD::FP_TO_SINT, ISD::FP_TO_UINT for i32
+/*!
+ f32->i32 passes through unchanged, whereas f64->i32 expands to a libcall.
+ All conversions to i64 are expanded to a libcall.
+ */
+static SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
+                              SPUTargetLowering &TLI) {
+  EVT OpVT = Op.getValueType();
+  SDValue Op0 = Op.getOperand(0);
+  EVT Op0VT = Op0.getValueType();
+
+  if ((OpVT == MVT::i32 && Op0VT == MVT::f64)
+      || OpVT == MVT::i64) {
+    // Convert f32 / f64 to i32 / i64 via libcall.
+    RTLIB::Libcall LC =
+            (Op.getOpcode() == ISD::FP_TO_SINT)
+             ? RTLIB::getFPTOSINT(Op0VT, OpVT)
+             : RTLIB::getFPTOUINT(Op0VT, OpVT);
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd fp-to-int conversion!");
+    SDValue Dummy;
+    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
+  }
+
+  return Op;
+}
+
+//! Lower ISD::SINT_TO_FP, ISD::UINT_TO_FP for i32
+/*!
+ i32->f32 passes through unchanged, whereas i32->f64 is expanded to a libcall.
+ All conversions from i64 are expanded to a libcall.
+ */
+static SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
+                              SPUTargetLowering &TLI) {
+  EVT OpVT = Op.getValueType();
+  SDValue Op0 = Op.getOperand(0);
+  EVT Op0VT = Op0.getValueType();
+
+  if ((OpVT == MVT::f64 && Op0VT == MVT::i32)
+      || Op0VT == MVT::i64) {
+    // Convert i32, i64 to f64 via libcall:
+    RTLIB::Libcall LC =
+            (Op.getOpcode() == ISD::SINT_TO_FP)
+             ? RTLIB::getSINTTOFP(Op0VT, OpVT)
+             : RTLIB::getUINTTOFP(Op0VT, OpVT);
+    assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpectd int-to-fp conversion!");
+    SDValue Dummy;
+    return ExpandLibCall(LC, Op, DAG, false, Dummy, TLI);
+  }
+
+  return Op;
+}
+
+//! Lower ISD::SETCC
+/*!
+ This handles MVT::f64 (double floating point) condition lowering
+ */
+static SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG,
+                          const TargetLowering &TLI) {
+  CondCodeSDNode *CC = dyn_cast<CondCodeSDNode>(Op.getOperand(2));
+  DebugLoc dl = Op.getDebugLoc();
+  assert(CC != 0 && "LowerSETCC: CondCodeSDNode should not be null here!\n");
+
+  SDValue lhs = Op.getOperand(0);
+  SDValue rhs = Op.getOperand(1);
+  EVT lhsVT = lhs.getValueType();
+  assert(lhsVT == MVT::f64 && "LowerSETCC: type other than MVT::64\n");
+
+  EVT ccResultVT = TLI.getSetCCResultType(lhs.getValueType());
+  APInt ccResultOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
+  EVT IntVT(MVT::i64);
+
+  // Take advantage of the fact that (truncate (sra arg, 32)) is efficiently
+  // selected to a NOP:
+  SDValue i64lhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, lhs);
+  SDValue lhsHi32 =
+          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+                      DAG.getNode(ISD::SRL, dl, IntVT,
+                                  i64lhs, DAG.getConstant(32, MVT::i32)));
+  SDValue lhsHi32abs =
+          DAG.getNode(ISD::AND, dl, MVT::i32,
+                      lhsHi32, DAG.getConstant(0x7fffffff, MVT::i32));
+  SDValue lhsLo32 =
+          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, i64lhs);
+
+  // SETO and SETUO only use the lhs operand:
+  if (CC->get() == ISD::SETO) {
+    // Evaluates to true if Op0 is not [SQ]NaN - lowers to the inverse of
+    // SETUO
+    APInt ccResultAllOnes = APInt::getAllOnesValue(ccResultVT.getSizeInBits());
+    return DAG.getNode(ISD::XOR, dl, ccResultVT,
+                       DAG.getSetCC(dl, ccResultVT,
+                                    lhs, DAG.getConstantFP(0.0, lhsVT),
+                                    ISD::SETUO),
+                       DAG.getConstant(ccResultAllOnes, ccResultVT));
+  } else if (CC->get() == ISD::SETUO) {
+    // Evaluates to true if Op0 is [SQ]NaN
+    return DAG.getNode(ISD::AND, dl, ccResultVT,
+                       DAG.getSetCC(dl, ccResultVT,
+                                    lhsHi32abs,
+                                    DAG.getConstant(0x7ff00000, MVT::i32),
+                                    ISD::SETGE),
+                       DAG.getSetCC(dl, ccResultVT,
+                                    lhsLo32,
+                                    DAG.getConstant(0, MVT::i32),
+                                    ISD::SETGT));
+  }
+
+  SDValue i64rhs = DAG.getNode(ISD::BIT_CONVERT, dl, IntVT, rhs);
+  SDValue rhsHi32 =
+          DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
+                      DAG.getNode(ISD::SRL, dl, IntVT,
+                                  i64rhs, DAG.getConstant(32, MVT::i32)));
+
+  // If a value is negative, subtract from the sign magnitude constant:
+  SDValue signMag2TC = DAG.getConstant(0x8000000000000000ULL, IntVT);
+
+  // Convert the sign-magnitude representation into 2's complement:
+  SDValue lhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
+                                      lhsHi32, DAG.getConstant(31, MVT::i32));
+  SDValue lhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64lhs);
+  SDValue lhsSelect =
+          DAG.getNode(ISD::SELECT, dl, IntVT,
+                      lhsSelectMask, lhsSignMag2TC, i64lhs);
+
+  SDValue rhsSelectMask = DAG.getNode(ISD::SRA, dl, ccResultVT,
+                                      rhsHi32, DAG.getConstant(31, MVT::i32));
+  SDValue rhsSignMag2TC = DAG.getNode(ISD::SUB, dl, IntVT, signMag2TC, i64rhs);
+  SDValue rhsSelect =
+          DAG.getNode(ISD::SELECT, dl, IntVT,
+                      rhsSelectMask, rhsSignMag2TC, i64rhs);
+
+  unsigned compareOp;
+
+  switch (CC->get()) {
+  case ISD::SETOEQ:
+  case ISD::SETUEQ:
+    compareOp = ISD::SETEQ; break;
+  case ISD::SETOGT:
+  case ISD::SETUGT:
+    compareOp = ISD::SETGT; break;
+  case ISD::SETOGE:
+  case ISD::SETUGE:
+    compareOp = ISD::SETGE; break;
+  case ISD::SETOLT:
+  case ISD::SETULT:
+    compareOp = ISD::SETLT; break;
+  case ISD::SETOLE:
+  case ISD::SETULE:
+    compareOp = ISD::SETLE; break;
+  case ISD::SETUNE:
+  case ISD::SETONE:
+    compareOp = ISD::SETNE; break;
+  default:
+    llvm_report_error("CellSPU ISel Select: unimplemented f64 condition");
+  }
+
+  SDValue result =
+          DAG.getSetCC(dl, ccResultVT, lhsSelect, rhsSelect,
+                       (ISD::CondCode) compareOp);
+
+  if ((CC->get() & 0x8) == 0) {
+    // Ordered comparison:
+    SDValue lhsNaN = DAG.getSetCC(dl, ccResultVT,
+                                  lhs, DAG.getConstantFP(0.0, MVT::f64),
+                                  ISD::SETO);
+    SDValue rhsNaN = DAG.getSetCC(dl, ccResultVT,
+                                  rhs, DAG.getConstantFP(0.0, MVT::f64),
+                                  ISD::SETO);
+    SDValue ordered = DAG.getNode(ISD::AND, dl, ccResultVT, lhsNaN, rhsNaN);
+
+    result = DAG.getNode(ISD::AND, dl, ccResultVT, ordered, result);
+  }
+
+  return result;
+}
+
+//! Lower ISD::SELECT_CC
+/*!
+  ISD::SELECT_CC can (generally) be implemented directly on the SPU using the
+  SELB instruction.
+
+  \note Need to revisit this in the future: if the code path through the true
+  and false value computations is longer than the latency of a branch (6
+  cycles), then it would be more advantageous to branch and insert a new basic
+  block and branch on the condition. However, this code does not make that
+  assumption, given the simplisitc uses so far.
+ */
+
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG,
+                              const TargetLowering &TLI) {
+  EVT VT = Op.getValueType();
+  SDValue lhs = Op.getOperand(0);
+  SDValue rhs = Op.getOperand(1);
+  SDValue trueval = Op.getOperand(2);
+  SDValue falseval = Op.getOperand(3);
+  SDValue condition = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // NOTE: SELB's arguments: $rA, $rB, $mask
+  //
+  // SELB selects bits from $rA where bits in $mask are 0, bits from $rB
+  // where bits in $mask are 1. CCond will be inverted, having 1s where the
+  // condition was true and 0s where the condition was false. Hence, the
+  // arguments to SELB get reversed.
+
+  // Note: Really should be ISD::SELECT instead of SPUISD::SELB, but LLVM's
+  // legalizer insists on combining SETCC/SELECT into SELECT_CC, so we end up
+  // with another "cannot select select_cc" assert:
+
+  SDValue compare = DAG.getNode(ISD::SETCC, dl,
+                                TLI.getSetCCResultType(Op.getValueType()),
+                                lhs, rhs, condition);
+  return DAG.getNode(SPUISD::SELB, dl, VT, falseval, trueval, compare);
+}
+
+//! Custom lower ISD::TRUNCATE
+static SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG)
+{
+  // Type to truncate to
+  EVT VT = Op.getValueType();
+  MVT simpleVT = VT.getSimpleVT();
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), 
+                               VT, (128 / VT.getSizeInBits()));
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Type to truncate from
+  SDValue Op0 = Op.getOperand(0);
+  EVT Op0VT = Op0.getValueType();
+
+  if (Op0VT.getSimpleVT() == MVT::i128 && simpleVT == MVT::i64) {
+    // Create shuffle mask, least significant doubleword of quadword
+    unsigned maskHigh = 0x08090a0b;
+    unsigned maskLow = 0x0c0d0e0f;
+    // Use a shuffle to perform the truncation
+    SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                   DAG.getConstant(maskHigh, MVT::i32),
+                                   DAG.getConstant(maskLow, MVT::i32),
+                                   DAG.getConstant(maskHigh, MVT::i32),
+                                   DAG.getConstant(maskLow, MVT::i32));
+
+    SDValue truncShuffle = DAG.getNode(SPUISD::SHUFB, dl, VecVT,
+                                       Op0, Op0, shufMask);
+
+    return DAG.getNode(SPUISD::VEC2PREFSLOT, dl, VT, truncShuffle);
+  }
+
+  return SDValue();             // Leave the truncate unmolested
+}
+
+/*!
+ * Emit the instruction sequence for i64/i32 -> i128 sign extend. The basic
+ * algorithm is to duplicate the sign bit using rotmai to generate at
+ * least one byte full of sign bits. Then propagate the "sign-byte" into
+ * the leftmost words and the i64/i32 into the rightmost words using shufb.
+ *
+ * @param Op The sext operand
+ * @param DAG The current DAG
+ * @return The SDValue with the entire instruction sequence
+ */
+static SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG)
+{
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Type to extend to
+  MVT OpVT = Op.getValueType().getSimpleVT();
+
+  // Type to extend from
+  SDValue Op0 = Op.getOperand(0);
+  MVT Op0VT = Op0.getValueType().getSimpleVT();
+
+  // The type to extend to needs to be a i128 and
+  // the type to extend from needs to be i64 or i32.
+  assert((OpVT == MVT::i128 && (Op0VT == MVT::i64 || Op0VT == MVT::i32)) &&
+          "LowerSIGN_EXTEND: input and/or output operand have wrong size");
+
+  // Create shuffle mask
+  unsigned mask1 = 0x10101010; // byte 0 - 3 and 4 - 7
+  unsigned mask2 = Op0VT == MVT::i64 ? 0x00010203 : 0x10101010; // byte  8 - 11
+  unsigned mask3 = Op0VT == MVT::i64 ? 0x04050607 : 0x00010203; // byte 12 - 15
+  SDValue shufMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+                                 DAG.getConstant(mask1, MVT::i32),
+                                 DAG.getConstant(mask1, MVT::i32),
+                                 DAG.getConstant(mask2, MVT::i32),
+                                 DAG.getConstant(mask3, MVT::i32));
+
+  // Word wise arithmetic right shift to generate at least one byte
+  // that contains sign bits.
+  MVT mvt = Op0VT == MVT::i64 ? MVT::v2i64 : MVT::v4i32;
+  SDValue sraVal = DAG.getNode(ISD::SRA,
+                 dl,
+                 mvt,
+                 DAG.getNode(SPUISD::PREFSLOT2VEC, dl, mvt, Op0, Op0),
+                 DAG.getConstant(31, MVT::i32));
+
+  // Shuffle bytes - Copy the sign bits into the upper 64 bits
+  // and the input value into the lower 64 bits.
+  SDValue extShuffle = DAG.getNode(SPUISD::SHUFB, dl, mvt,
+      DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i128, Op0), sraVal, shufMask);
+
+  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i128, extShuffle);
+}
+
+//! Custom (target-specific) lowering entry point
+/*!
+  This is where LLVM's DAG selection process calls to do target-specific
+  lowering of nodes.
+ */
+SDValue
+SPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
+{
+  unsigned Opc = (unsigned) Op.getOpcode();
+  EVT VT = Op.getValueType();
+
+  switch (Opc) {
+  default: {
+#ifndef NDEBUG
+    errs() << "SPUTargetLowering::LowerOperation(): need to lower this!\n";
+    errs() << "Op.getOpcode() = " << Opc << "\n";
+    errs() << "*Op.getNode():\n";
+    Op.getNode()->dump();
+#endif
+    llvm_unreachable(0);
+  }
+  case ISD::LOAD:
+  case ISD::EXTLOAD:
+  case ISD::SEXTLOAD:
+  case ISD::ZEXTLOAD:
+    return LowerLOAD(Op, DAG, SPUTM.getSubtargetImpl());
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG, SPUTM.getSubtargetImpl());
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG, SPUTM.getSubtargetImpl());
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG, SPUTM.getSubtargetImpl());
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG, SPUTM.getSubtargetImpl());
+  case ISD::ConstantFP:
+    return LowerConstantFP(Op, DAG);
+
+  // i8, i64 math ops:
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::ROTR:
+  case ISD::ROTL:
+  case ISD::SRL:
+  case ISD::SHL:
+  case ISD::SRA: {
+    if (VT == MVT::i8)
+      return LowerI8Math(Op, DAG, Opc, *this);
+    break;
+  }
+
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG, *this);
+
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerINT_TO_FP(Op, DAG, *this);
+
+  // Vector-related lowering.
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:
+    return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerINSERT_VECTOR_ELT(Op, DAG);
+
+  // Look for ANDBI, ORBI and XORBI opportunities and lower appropriately:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    return LowerByteImmed(Op, DAG);
+
+  // Vector and i8 multiply:
+  case ISD::MUL:
+    if (VT == MVT::i8)
+      return LowerI8Math(Op, DAG, Opc, *this);
+
+  case ISD::CTPOP:
+    return LowerCTPOP(Op, DAG);
+
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG, *this);
+
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG, *this);
+
+  case ISD::TRUNCATE:
+    return LowerTRUNCATE(Op, DAG);
+
+  case ISD::SIGN_EXTEND:
+    return LowerSIGN_EXTEND(Op, DAG);
+  }
+
+  return SDValue();
+}
+
// ReplaceNodeResults - Hook for lowering nodes whose result types are
// illegal. Currently a no-op for SPU: the body below is preprocessor-disabled
// scaffolding kept as a template for future use, so Results is never filled
// in and the generic legalizer handles everything.
void SPUTargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG)
{
#if 0
  unsigned Opc = (unsigned) N->getOpcode();
  EVT OpVT = N->getValueType(0);

  switch (Opc) {
  default: {
    errs() << "SPUTargetLowering::ReplaceNodeResults(): need to fix this!\n";
    errs() << "Op.getOpcode() = " << Opc << "\n";
    errs() << "*Op.getNode():\n";
    N->dump();
    abort();
    /*NOTREACHED*/
  }
  }
#endif

  /* Otherwise, return unchanged */
}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
//! Target-specific DAG combining.
/*!
  Folds SPU-specific node patterns after generic combining:
  - (add (SPUindirect a, b), 0)            -> (SPUindirect a, b)
  - (add (SPUindirect a, c1), c2)          -> (SPUindirect a, c1+c2)
  - ({any,zero,sign}_extend (vec2prefslot x)) -> (vec2prefslot x) if types match
  - degenerate SPUindirect / quad-shift / prefslot2vec round-trips.
  Returns an empty SDValue when no combine applies.
 */
SDValue
SPUTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const
{
#if 0
  TargetMachine &TM = getTargetMachine();
#endif
  const SPUSubtarget *ST = SPUTM.getSubtargetImpl();
  SelectionDAG &DAG = DCI.DAG;
  SDValue Op0 = N->getOperand(0);       // everything has at least one operand
  EVT NodeVT = N->getValueType(0);      // The node's value type
  EVT Op0VT = Op0.getValueType();       // The first operand's result
  SDValue Result;                       // Initially, empty result
  DebugLoc dl = N->getDebugLoc();

  switch (N->getOpcode()) {
  default: break;
  case ISD::ADD: {
    SDValue Op1 = N->getOperand(1);

    if (Op0.getOpcode() == SPUISD::IndirectAddr
        || Op1.getOpcode() == SPUISD::IndirectAddr) {
      // Normalize the operands to reduce repeated code
      SDValue IndirectArg = Op0, AddArg = Op1;

      // ADD is commutative, so swap so the IndirectAddr is always first.
      if (Op1.getOpcode() == SPUISD::IndirectAddr) {
        IndirectArg = Op1;
        AddArg = Op0;
      }

      if (isa<ConstantSDNode>(AddArg)) {
        ConstantSDNode *CN0 = cast<ConstantSDNode > (AddArg);
        SDValue IndOp1 = IndirectArg.getOperand(1);

        if (CN0->isNullValue()) {
          // (add (SPUindirect <arg>, <arg>), 0) ->
          // (SPUindirect <arg>, <arg>)

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return IndirectArg;
        } else if (isa<ConstantSDNode>(IndOp1)) {
          // (add (SPUindirect <arg>, <const>), <const>) ->
          // (SPUindirect <arg>, <const + const>)
          ConstantSDNode *CN1 = cast<ConstantSDNode > (IndOp1);
          int64_t combinedConst = CN0->getSExtValue() + CN1->getSExtValue();
          SDValue combinedValue = DAG.getConstant(combinedConst, Op0VT);

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (add (SPUindirect <arg>, " << CN1->getSExtValue()
                 << "), " << CN0->getSExtValue() << ")\n"
                 << "With:    (SPUindirect <arg>, "
                 << combinedConst << ")\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             IndirectArg, combinedValue);
        }
      }
    }
    break;
  }
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND: {
    if (Op0.getOpcode() == SPUISD::VEC2PREFSLOT && NodeVT == Op0VT) {
      // (any_extend (SPUextract_elt0 <arg>)) ->
      // (SPUextract_elt0 <arg>)
      // Types must match, however...
#if !defined(NDEBUG)
      if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
        errs() << "\nReplace: ";
        N->dump(&DAG);
        errs() << "\nWith:    ";
        Op0.getNode()->dump(&DAG);
        errs() << "\n";
      }
#endif

      return Op0;
    }
    break;
  }
  case SPUISD::IndirectAddr: {
    // A-form addresses are absolute local-store addresses; only valid when
    // not using large memory, hence the subtarget check.
    if (!ST->usingLargeMem() && Op0.getOpcode() == SPUISD::AFormAddr) {
      ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1));
      if (CN != 0 && CN->getZExtValue() == 0) {
        // (SPUindirect (SPUaform <addr>, 0), 0) ->
        // (SPUaform <addr>, 0)

        DEBUG(errs() << "Replace: ");
        DEBUG(N->dump(&DAG));
        DEBUG(errs() << "\nWith:    ");
        DEBUG(Op0.getNode()->dump(&DAG));
        DEBUG(errs() << "\n");

        return Op0;
      }
    } else if (Op0.getOpcode() == ISD::ADD) {
      SDValue Op1 = N->getOperand(1);
      if (ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(Op1)) {
        // (SPUindirect (add <arg>, <arg>), 0) ->
        // (SPUindirect <arg>, <arg>)
        if (CN1->isNullValue()) {

#if !defined(NDEBUG)
          if (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) {
            errs() << "\n"
                 << "Replace: (SPUindirect (add <arg>, <arg>), 0)\n"
                 << "With:    (SPUindirect <arg>, <arg>)\n";
          }
#endif

          return DAG.getNode(SPUISD::IndirectAddr, dl, Op0VT,
                             Op0.getOperand(0), Op0.getOperand(1));
        }
      }
    }
    break;
  }
  case SPUISD::SHLQUAD_L_BITS:
  case SPUISD::SHLQUAD_L_BYTES:
  case SPUISD::ROTBYTES_LEFT: {
    SDValue Op1 = N->getOperand(1);

    // Kill degenerate vector shifts:
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op1)) {
      if (CN->isNullValue()) {
        Result = Op0;
      }
    }
    break;
  }
  case SPUISD::PREFSLOT2VEC: {
    switch (Op0.getOpcode()) {
    default:
      break;
    case ISD::ANY_EXTEND:
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND: {
      // (SPUprefslot2vec (any|zero|sign_extend (SPUvec2prefslot <arg>))) ->
      // <arg>
      // but only if the SPUprefslot2vec and <arg> types match.
      SDValue Op00 = Op0.getOperand(0);
      if (Op00.getOpcode() == SPUISD::VEC2PREFSLOT) {
        SDValue Op000 = Op00.getOperand(0);
        if (Op000.getValueType() == NodeVT) {
          Result = Op000;
        }
      }
      break;
    }
    case SPUISD::VEC2PREFSLOT: {
      // (SPUprefslot2vec (SPUvec2prefslot <arg>)) ->
      // <arg>
      Result = Op0.getOperand(0);
      break;
    }
    }
    break;
  }
  }

  // Otherwise, return unchanged.
#ifndef NDEBUG
  if (Result.getNode()) {
    DEBUG(errs() << "\nReplace.SPU: ");
    DEBUG(N->dump(&DAG));
    DEBUG(errs() << "\nWith:        ");
    DEBUG(Result.getNode()->dump(&DAG));
    DEBUG(errs() << "\n");
  }
#endif

  return Result;
}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+SPUTargetLowering::ConstraintType
+SPUTargetLowering::getConstraintType(const std::string &ConstraintLetter) const {
+  if (ConstraintLetter.size() == 1) {
+    switch (ConstraintLetter[0]) {
+    default: break;
+    case 'b':
+    case 'r':
+    case 'f':
+    case 'v':
+    case 'y':
+      return C_RegisterClass;
+    }
+  }
+  return TargetLowering::getConstraintType(ConstraintLetter);
+}
+
+std::pair<unsigned, const TargetRegisterClass*>
+SPUTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                EVT VT) const
+{
+  if (Constraint.size() == 1) {
+    // GCC RS6000 Constraint Letters
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+    case 'r':   // R0-R31
+      if (VT == MVT::i64)
+        return std::make_pair(0U, SPU::R64CRegisterClass);
+      return std::make_pair(0U, SPU::R32CRegisterClass);
+    case 'f':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, SPU::R32FPRegisterClass);
+      else if (VT == MVT::f64)
+        return std::make_pair(0U, SPU::R64FPRegisterClass);
+      break;
+    case 'v':
+      return std::make_pair(0U, SPU::GPRCRegisterClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
//! Compute used/known bits for a SPU operand
// Currently a no-op: the switch below is preprocessor-disabled scaffolding
// listing the SPU nodes that could eventually report known bits. KnownZero
// and KnownOne are left untouched, so callers see no target-specific info.
void
SPUTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                  const APInt &Mask,
                                                  APInt &KnownZero,
                                                  APInt &KnownOne,
                                                  const SelectionDAG &DAG,
                                                  unsigned Depth ) const {
#if 0
  const uint64_t uint64_sizebits = sizeof(uint64_t) * CHAR_BIT;

  switch (Op.getOpcode()) {
  default:
    // KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
    break;
  case CALL:
  case SHUFB:
  case SHUFFLE_MASK:
  case CNTB:
  case SPUISD::PREFSLOT2VEC:
  case SPUISD::LDRESULT:
  case SPUISD::VEC2PREFSLOT:
  case SPUISD::SHLQUAD_L_BITS:
  case SPUISD::SHLQUAD_L_BYTES:
  case SPUISD::VEC_ROTL:
  case SPUISD::VEC_ROTR:
  case SPUISD::ROTBYTES_LEFT:
  case SPUISD::SELECT_MASK:
  case SPUISD::SELB:
  }
#endif
}
+
+unsigned
+SPUTargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                   unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    return 1;
+
+  case ISD::SETCC: {
+    EVT VT = Op.getValueType();
+
+    if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32) {
+      VT = MVT::i32;
+    }
+    return VT.getSizeInBits();
+  }
+  }
+}
+
// LowerAsmOperandForConstraint - Lower an inline-asm operand according to its
// constraint letter. SPU has no letter-specific handling yet, so this simply
// delegates to the generic TargetLowering implementation.
void
SPUTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                char ConstraintLetter,
                                                bool hasMemory,
                                                std::vector<SDValue> &Ops,
                                                SelectionDAG &DAG) const {
  // Default, for the time being, to the base class handler
  TargetLowering::LowerAsmOperandForConstraint(Op, ConstraintLetter, hasMemory,
                                               Ops, DAG);
}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode.
+bool SPUTargetLowering::isLegalAddressImmediate(int64_t V,
+                                                const Type *Ty) const {
+  // SPU's addresses are 256K:
+  return (V > -(1 << 18) && V < (1 << 18) - 1);
+}
+
// Global values are never legal as addressing-mode immediates on SPU; they
// must be materialized into a register first.
bool SPUTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
  return false;
}
+
// isOffsetFoldingLegal - Whether a constant offset can be folded into a
// global address reference.
bool
SPUTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
  // The SPU target isn't yet aware of offsets.
  return false;
}
diff --git a/lib/Target/CellSPU/SPUISelLowering.h b/lib/Target/CellSPU/SPUISelLowering.h
new file mode 100644
index 0000000..3c51177
--- /dev/null
+++ b/lib/Target/CellSPU/SPUISelLowering.h
@@ -0,0 +1,175 @@
+//===-- SPUISelLowering.h - Cell SPU DAG Lowering Interface -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Cell SPU uses to lower LLVM code into
+// a selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_ISELLOWERING_H
+#define SPU_ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "SPU.h"
+
+namespace llvm {
  //! SPU-specific SelectionDAG node opcodes.
  namespace SPUISD {
    enum NodeType {
      // Start the numbering where the built-in ops and target ops leave off.
      FIRST_NUMBER = ISD::BUILTIN_OP_END,

      // Pseudo instructions:
      RET_FLAG,                 ///< Return with flag, matched by bi instruction

      Hi,                       ///< High address component (upper 16)
      Lo,                       ///< Low address component (lower 16)
      PCRelAddr,                ///< Program counter relative address
      AFormAddr,                ///< A-form address (local store)
      IndirectAddr,             ///< D-Form "imm($r)" and X-form "$r($r)"

      LDRESULT,                 ///< Load result (value, chain)
      CALL,                     ///< CALL instruction
      SHUFB,                    ///< Vector shuffle (permute)
      SHUFFLE_MASK,             ///< Shuffle mask
      CNTB,                     ///< Count ones in bytes (SPU CNTB instruction)
      PREFSLOT2VEC,             ///< Promote scalar->vector
      VEC2PREFSLOT,             ///< Extract element 0
      SHLQUAD_L_BITS,           ///< Rotate quad left, by bits
      SHLQUAD_L_BYTES,          ///< Rotate quad left, by bytes
      VEC_ROTL,                 ///< Vector rotate left
      VEC_ROTR,                 ///< Vector rotate right
      ROTBYTES_LEFT,            ///< Rotate bytes (loads -> ROTQBYI)
      ROTBYTES_LEFT_BITS,       ///< Rotate bytes left by bit shift count
      SELECT_MASK,              ///< Select Mask (FSM, FSMB, FSMH, FSMBI)
      SELB,                     ///< Select bits -> (b & mask) | (a & ~mask)
      // Markers: These aren't used to generate target-dependent nodes, but
      // are used during instruction selection.
      ADD64_MARKER,             ///< i64 addition marker
      SUB64_MARKER,             ///< i64 subtraction marker
      MUL64_MARKER,             ///< i64 multiply marker
      LAST_SPUISD               ///< Last user-defined instruction
    };
  }
+
  //! Utility functions specific to CellSPU:
  // These helpers test/build vector immediates in the specific encodings the
  // SPU instruction set supports (u18, s16, s10, s8, ILHU patterns, etc.).
  // Each returns an empty SDValue when the constant can't be encoded.
  namespace SPU {
    SDValue get_vec_u18imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType);
    SDValue get_vec_i16imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType);
    SDValue get_vec_i10imm(SDNode *N, SelectionDAG &DAG,
                             EVT ValueType);
    SDValue get_vec_i8imm(SDNode *N, SelectionDAG &DAG,
                            EVT ValueType);
    SDValue get_ILHUvec_imm(SDNode *N, SelectionDAG &DAG,
                              EVT ValueType);
    SDValue get_v4i32_imm(SDNode *N, SelectionDAG &DAG);
    SDValue get_v2i64_imm(SDNode *N, SelectionDAG &DAG);

    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG,
                              const SPUTargetMachine &TM);
    //! Simplify a EVT::v2i64 constant splat to CellSPU-ready form
    SDValue LowerV2I64Splat(EVT OpVT, SelectionDAG &DAG, uint64_t splat,
                             DebugLoc dl);
  }
+
  class SPUTargetMachine;            // forward dec'l.

  //! Cell SPU implementation of the TargetLowering interface.
  class SPUTargetLowering :
    public TargetLowering
  {
    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
    SPUTargetMachine &SPUTM;          // Owning target machine.

  public:
    //! The venerable constructor
    /*!
     This is where the CellSPU backend sets operation handling (i.e., legal,
     custom, expand or promote.)
     */
    SPUTargetLowering(SPUTargetMachine &TM);

    //! Get the target machine
    SPUTargetMachine &getSPUTargetMachine() {
      return SPUTM;
    }

    /// getTargetNodeName() - This method returns the name of a target specific
    /// DAG node.
    virtual const char *getTargetNodeName(unsigned Opcode) const;

    /// getSetCCResultType - Return the ValueType for ISD::SETCC
    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;

    //! Custom lowering hooks
    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);

    //! Custom lowering hook for nodes with illegal result types.
    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
                                    SelectionDAG &DAG);

    //! Target-specific DAG combining (peephole folds on SPU nodes).
    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;

    //! Report known-zero/known-one bits for SPU nodes (currently a no-op).
    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
                                                const APInt &Mask,
                                                APInt &KnownZero,
                                                APInt &KnownOne,
                                                const SelectionDAG &DAG,
                                                unsigned Depth = 0) const;

    //! Number of known sign bits in a target node's result.
    virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
                                                   unsigned Depth = 0) const;

    //! Classify an inline-asm constraint letter for this target.
    ConstraintType getConstraintType(const std::string &ConstraintLetter) const;

    //! Map an inline-asm constraint letter + type to an SPU register class.
    std::pair<unsigned, const TargetRegisterClass*>
      getRegForInlineAsmConstraint(const std::string &Constraint,
                                   EVT VT) const;

    //! Lower an inline-asm operand (delegates to the base class).
    void LowerAsmOperandForConstraint(SDValue Op, char ConstraintLetter,
                                      bool hasMemory,
                                      std::vector<SDValue> &Ops,
                                      SelectionDAG &DAG) const;

    /// isLegalAddressImmediate - Return true if the integer value can be used
    /// as the offset of the target addressing mode.
    virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
    virtual bool isLegalAddressImmediate(GlobalValue *) const;

    //! Offsets cannot yet be folded into SPU global addresses.
    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;

    /// getFunctionAlignment - Return the Log2 alignment of this function.
    virtual unsigned getFunctionAlignment(const Function *F) const;

    // Calling-convention lowering hooks (see TargetLowering for contracts):
    virtual SDValue
      LowerFormalArguments(SDValue Chain,
                           CallingConv::ID CallConv, bool isVarArg,
                           const SmallVectorImpl<ISD::InputArg> &Ins,
                           DebugLoc dl, SelectionDAG &DAG,
                           SmallVectorImpl<SDValue> &InVals);

    virtual SDValue
      LowerCall(SDValue Chain, SDValue Callee,
                CallingConv::ID CallConv, bool isVarArg,
                bool &isTailCall,
                const SmallVectorImpl<ISD::OutputArg> &Outs,
                const SmallVectorImpl<ISD::InputArg> &Ins,
                DebugLoc dl, SelectionDAG &DAG,
                SmallVectorImpl<SDValue> &InVals);

    virtual SDValue
      LowerReturn(SDValue Chain,
                  CallingConv::ID CallConv, bool isVarArg,
                  const SmallVectorImpl<ISD::OutputArg> &Outs,
                  DebugLoc dl, SelectionDAG &DAG);
  };
+}
+
+#endif
diff --git a/lib/Target/CellSPU/SPUInstrBuilder.h b/lib/Target/CellSPU/SPUInstrBuilder.h
new file mode 100644
index 0000000..5e268f8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrBuilder.h
@@ -0,0 +1,43 @@
+//==-- SPUInstrBuilder.h - Aides for building Cell SPU insts -----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_INSTRBUILDER_H
+#define SPU_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/CellSPU/SPUInstrFormats.td b/lib/Target/CellSPU/SPUInstrFormats.td
new file mode 100644
index 0000000..21bc275
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrFormats.td
@@ -0,0 +1,298 @@
+//==== SPUInstrFormats.td - Cell SPU Instruction Formats ---*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// Cell SPU instruction formats. Note that these are notationally similar to
+// PowerPC, like "A-Form". But the sizes of operands and fields differ.
+
// This was adapted from the PPC instruction formats (seemed like a good idea...)
+
// SPUInstr - Common base class for all 32-bit SPU instructions: declares the
// 32-bit encoding field and wires the operand lists, asm string and
// itinerary through to the generic Instruction class.
class SPUInstr<dag OOL, dag IOL, string asmstr, InstrItinClass itin>
        : Instruction {
  field bits<32> Inst;

  let Namespace = "SPU";
  let OutOperandList = OOL;
  let InOperandList = IOL;
  let AsmString = asmstr;
  let Itinerary = itin;
}
+
// RR Format: 11-bit opcode, three 7-bit register fields (RB, RA, RT).
class RRForm<bits<11> opcode, dag OOL, dag IOL, string asmstr,
              InstrItinClass itin, list<dag> pattern>
         : SPUInstr<OOL, IOL, asmstr, itin> {
  bits<7> RA;
  bits<7> RB;
  bits<7> RT;

  let Pattern = pattern;

  let Inst{0-10} = opcode;
  let Inst{11-17} = RB;
  let Inst{18-24} = RA;
  let Inst{25-31} = RT;
}

let RB = 0 in {
  // RR Format, where RB is zeroed (dont care):
  class RRForm_1<bits<11> opcode, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin, list<dag> pattern>
           : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
  { }

  let RA = 0 in {
    // RR Format, where RA and RB are zeroed (dont care):
    // Used for reads from status control registers (see FPSCRRr32)
    class RRForm_2<bits<11> opcode, dag OOL, dag IOL, string asmstr,
                   InstrItinClass itin, list<dag> pattern>
             : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
    { }
  }
}

let RT = 0 in {
  // RR Format, where RT is zeroed (don't care), or as the instruction handbook
  // says, "RT is a false target." Used in "Halt if" instructions
  class RRForm_3<bits<11> opcode, dag OOL, dag IOL, string asmstr,
                 InstrItinClass itin, list<dag> pattern>
      : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
  { }
}
+
// RRR Format: 4-bit opcode and four 7-bit register fields (RT, RB, RA, RC);
// note RT sits in the high bits and RC in the low bits.
class RRRForm<bits<4> opcode, dag OOL, dag IOL, string asmstr,
              InstrItinClass itin, list<dag> pattern>
        : SPUInstr<OOL, IOL, asmstr, itin>
{
  bits<7> RA;
  bits<7> RB;
  bits<7> RC;
  bits<7> RT;

  let Pattern = pattern;

  let Inst{0-3} = opcode;
  let Inst{4-10} = RT;
  let Inst{11-17} = RB;
  let Inst{18-24} = RA;
  let Inst{25-31} = RC;
}
+
// RI7 Format: 11-bit opcode, 7-bit immediate (i7) in place of RB, plus RA/RT.
class RI7Form<bits<11> opcode, dag OOL, dag IOL, string asmstr,
              InstrItinClass itin, list<dag> pattern>
        : SPUInstr<OOL, IOL, asmstr, itin>
{
  bits<7> i7;
  bits<7> RA;
  bits<7> RT;

  let Pattern = pattern;

  let Inst{0-10} = opcode;
  let Inst{11-17} = i7;
  let Inst{18-24} = RA;
  let Inst{25-31} = RT;
}
+
// CVTIntFp Format: 10-bit opcode, 8 zero bits, then RA/RT. Used by the
// int<->float conversion instructions.
class CVTIntFPForm<bits<10> opcode, dag OOL, dag IOL, string asmstr,
                   InstrItinClass itin, list<dag> pattern>
        : SPUInstr<OOL, IOL, asmstr, itin>
{
  bits<7> RA;
  bits<7> RT;

  let Pattern = pattern;

  let Inst{0-9} = opcode;
  let Inst{10-17} = 0;
  let Inst{18-24} = RA;
  let Inst{25-31} = RT;
}
+
// Branch-related RR-form specializations. Each level of `let` zeroes one
// more register field: RA for conditional branch-indirect, RT for plain
// branches, and RB as well for the bare return instruction.
let RA = 0 in {
  class BICondForm<bits<11> opcode, dag OOL, dag IOL, string asmstr, list<dag> pattern>
           : RRForm<opcode, OOL, IOL, asmstr, BranchResolv, pattern>
  { }

  let RT = 0 in {
    // Branch instruction format (without D/E flag settings)
    class BRForm<bits<11> opcode, dag OOL, dag IOL, string asmstr,
               InstrItinClass itin, list<dag> pattern>
          : RRForm<opcode, OOL, IOL, asmstr, itin, pattern>
    { }

    class BIForm<bits<11> opcode, string asmstr, list<dag> pattern>
             : RRForm<opcode, (outs), (ins R32C:$func), asmstr, BranchResolv,
                      pattern>
    { }

    let RB = 0 in {
      // Return instruction (bi, branch indirect), RA is zero (LR):
      class RETForm<string asmstr, list<dag> pattern>
             : BRForm<0b00010101100, (outs), (ins), asmstr, BranchResolv,
                      pattern>
      { }
    }
  }
}
+
// Branch indirect external data forms: fixed bisled opcode with a 2-bit
// D/E (disable/enable interrupt) flag field and a 7-bit call destination.
class BISLEDForm<bits<2> DE_flag, string asmstr, list<dag> pattern>
         : SPUInstr<(outs), (ins indcalltarget:$func), asmstr, BranchResolv>
{
  bits<7> Rcalldest;

  let Pattern = pattern;

  let Inst{0-10} = 0b11010101100;
  let Inst{11} = 0;
  let Inst{12-13} = DE_flag;
  let Inst{14-17} = 0b0000;
  let Inst{18-24} = Rcalldest;
  let Inst{25-31} = 0b0000000;
}
+
// RI10 Format: 8-bit opcode, 10-bit immediate (i10), then RA/RT.
class RI10Form<bits<8> opcode, dag OOL, dag IOL, string asmstr,
              InstrItinClass itin, list<dag> pattern>
        : SPUInstr<OOL, IOL, asmstr, itin>
{
  bits<10> i10;
  bits<7> RA;
  bits<7> RT;

  let Pattern = pattern;

  let Inst{0-7} = opcode;
  let Inst{8-17} = i10;
  let Inst{18-24} = RA;
  let Inst{25-31} = RT;
}

// RI10 Format, where the constant is zero (or effectively ignored by the
// SPU)
let i10 = 0 in {
  class RI10Form_1<bits<8> opcode, dag OOL, dag IOL, string asmstr,
                   InstrItinClass itin, list<dag> pattern>
          : RI10Form<opcode, OOL, IOL, asmstr, itin, pattern>
  { }
}

// RI10 Format, where RT is ignored.
// This format is used primarily by the Halt If ... Immediate set of
// instructions
let RT = 0 in {
  class RI10Form_2<bits<8> opcode, dag OOL, dag IOL, string asmstr,
                   InstrItinClass itin, list<dag> pattern>
        : RI10Form<opcode, OOL, IOL, asmstr, itin, pattern>
  { }
}
+
// RI16 Format: 9-bit opcode, 16-bit immediate (i16), then RT (no RA field).
class RI16Form<bits<9> opcode, dag OOL, dag IOL, string asmstr,
              InstrItinClass itin, list<dag> pattern>
        : SPUInstr<OOL, IOL, asmstr, itin>
{
  bits<16> i16;
  bits<7> RT;

  let Pattern = pattern;

  let Inst{0-8} = opcode;
  let Inst{9-24} = i16;
  let Inst{25-31} = RT;
}

// Specialized version of the RI16 Format for unconditional branch relative and
// branch absolute, branch and set link. Note that for branch and set link, the
// link register doesn't have to be $lr, but this is actually hard coded into
// the instruction pattern.

let RT = 0 in {
  class UncondBranch<bits<9> opcode, dag OOL, dag IOL, string asmstr,
                     list<dag> pattern>
    : RI16Form<opcode, OOL, IOL, asmstr, BranchResolv, pattern>
  { }

  class BranchSetLink<bits<9> opcode, dag OOL, dag IOL, string asmstr,
                      list<dag> pattern>
        : RI16Form<opcode, OOL, IOL, asmstr, BranchResolv, pattern>
  { }
}
+
+//===----------------------------------------------------------------------===//
+// Specialized versions of RI16:
+//===----------------------------------------------------------------------===//
+
// RI18 Format: 7-bit opcode, 18-bit immediate (i18), then RT.
class RI18Form<bits<7> opcode, dag OOL, dag IOL, string asmstr,
              InstrItinClass itin, list<dag> pattern>
        : SPUInstr<OOL, IOL, asmstr, itin>
{
  bits<18> i18;
  bits<7> RT;

  let Pattern = pattern;

  let Inst{0-6} = opcode;
  let Inst{7-24} = i18;
  let Inst{25-31} = RT;
}
+
+//===----------------------------------------------------------------------===//
+// Instruction formats for intrinsics:
+//===----------------------------------------------------------------------===//
+
// RI10 Format for v8i16 intrinsics: one vector register operand plus a
// signed 10-bit immediate, matched against the given intrinsic.
class RI10_Int_v8i16<bits<8> opcode, string opc, InstrItinClass itin,
                     Intrinsic IntID> :
  RI10Form<opcode, (outs VECREG:$rT), (ins s10imm:$val, VECREG:$rA),
           !strconcat(opc, " $rT, $rA, $val"), itin,
           [(set (v8i16 VECREG:$rT), (IntID (v8i16 VECREG:$rA),
                                            i16ImmSExt10:$val))] >;

// RI10 Format for v4i32 intrinsics (same shape, 32-bit element type).
class RI10_Int_v4i32<bits<8> opcode, string opc, InstrItinClass itin,
                     Intrinsic IntID> :
  RI10Form<opcode, (outs VECREG:$rT), (ins s10imm:$val, VECREG:$rA),
           !strconcat(opc, " $rT, $rA, $val"), itin,
           [(set (v4i32 VECREG:$rT), (IntID (v4i32 VECREG:$rA),
                                            i32ImmSExt10:$val))] >;

// RR Format for v8i16 intrinsics
class RR_Int_v8i16<bits<11> opcode, string opc, InstrItinClass itin,
                   Intrinsic IntID> :
  RRForm<opcode, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
         !strconcat(opc, " $rT, $rA, $rB"), itin,
         [(set (v8i16 VECREG:$rT), (IntID (v8i16 VECREG:$rA),
                                          (v8i16 VECREG:$rB)))] >;

// RR Format for v4i32 intrinsics
class RR_Int_v4i32<bits<11> opcode, string opc, InstrItinClass itin,
                   Intrinsic IntID> :
  RRForm<opcode, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
         !strconcat(opc, " $rT, $rA, $rB"), itin,
         [(set (v4i32 VECREG:$rT), (IntID (v4i32 VECREG:$rA),
                                          (v4i32 VECREG:$rB)))] >;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions, like call frames:
+//===----------------------------------------------------------------------===//
+
// Pseudo - Pseudo-instruction base (e.g. call-frame markers): no itinerary
// and an all-zero encoding, since these never reach the emitter as real
// instructions. NOTE(review): the bit range is written {31-0}, reversed
// relative to the {0-31}-style used elsewhere in this file; for an all-zero
// value the encoding is identical either way.
class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
    : SPUInstr<OOL, IOL, asmstr, NoItinerary> {
  let OutOperandList = OOL;
  let InOperandList = IOL;
  let AsmString   = asmstr;
  let Pattern = pattern;
  let Inst{31-0} = 0;
}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.cpp b/lib/Target/CellSPU/SPUInstrInfo.cpp
new file mode 100644
index 0000000..2306665
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrInfo.cpp
@@ -0,0 +1,613 @@
+//===- SPUInstrInfo.cpp - Cell SPU Instruction Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Cell SPU implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPURegisterNames.h"
+#include "SPUInstrInfo.h"
+#include "SPUInstrBuilder.h"
+#include "SPUTargetMachine.h"
+#include "SPUGenInstrInfo.inc"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+  //! Predicate for an unconditional branch instruction
+  inline bool isUncondBranch(const MachineInstr *I) {
+    switch (I->getOpcode()) {
+    case SPU::BR:
+    case SPU::BRA:
+    case SPU::BI:
+      return true;
+    default:
+      return false;
+    }
+  }
+
+  //! Predicate for a conditional branch instruction
+  inline bool isCondBranch(const MachineInstr *I) {
+    switch (I->getOpcode()) {
+    case SPU::BRNZr32:
+    case SPU::BRNZv4i32:
+    case SPU::BRZr32:
+    case SPU::BRZv4i32:
+    case SPU::BRHNZr16:
+    case SPU::BRHNZv8i16:
+    case SPU::BRHZr16:
+    case SPU::BRHZv8i16:
+      return true;
+    default:
+      return false;
+    }
+  }
+}
+
+// Constructor: hand the TableGen-generated instruction descriptor table
+// (SPUInsts, from SPUGenInstrInfo.inc) to the base class, and build the
+// register-info object from the target's subtarget.
+SPUInstrInfo::SPUInstrInfo(SPUTargetMachine &tm)
+  : TargetInstrInfoImpl(SPUInsts, sizeof(SPUInsts)/sizeof(SPUInsts[0])),
+    TM(tm),
+    RI(*TM.getSubtargetImpl(), *this)
+{ /* NOP */ }
+
+// Recognize SPU instructions that behave as plain register-to-register
+// copies, reporting the source and destination registers.  SPU has no
+// dedicated move, so copies show up as OR/ORI/AHI/AI variants with
+// degenerate operands.
+bool
+SPUInstrInfo::isMoveInstr(const MachineInstr& MI,
+                          unsigned& sourceReg,
+                          unsigned& destReg,
+                          unsigned& SrcSR, unsigned& DstSR) const {
+  SrcSR = DstSR = 0;  // No sub-registers.
+
+  switch (MI.getOpcode()) {
+  default:
+    break;
+  // OR-immediate / add-immediate with an immediate of zero is a copy.
+  case SPU::ORIv4i32:
+  case SPU::ORIr32:
+  case SPU::ORHIv8i16:
+  case SPU::ORHIr16:
+  case SPU::ORHIi8i16:
+  case SPU::ORBIv16i8:
+  case SPU::ORBIr8:
+  case SPU::ORIi16i32:
+  case SPU::ORIi8i32:
+  case SPU::AHIvec:
+  case SPU::AHIr16:
+  case SPU::AIv4i32:
+    assert(MI.getNumOperands() == 3 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           MI.getOperand(2).isImm() &&
+           "invalid SPU ORI/ORHI/ORBI/AHI/AI/SFI/SFHI instruction!");
+    if (MI.getOperand(2).getImm() == 0) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+    break;
+  // AIr32's third operand is not guaranteed to be an immediate, so it is
+  // tested rather than asserted.
+  case SPU::AIr32:
+    assert(MI.getNumOperands() == 3 &&
+           "wrong number of operands to AIr32");
+    if (MI.getOperand(0).isReg() &&
+        MI.getOperand(1).isReg() &&
+        (MI.getOperand(2).isImm() &&
+         MI.getOperand(2).getImm() == 0)) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+    break;
+  // LR ("copy") and the two-operand OR type-conversion pseudo-ops.
+  case SPU::LRr8:
+  case SPU::LRr16:
+  case SPU::LRr32:
+  case SPU::LRf32:
+  case SPU::LRr64:
+  case SPU::LRf64:
+  case SPU::LRr128:
+  case SPU::LRv16i8:
+  case SPU::LRv8i16:
+  case SPU::LRv4i32:
+  case SPU::LRv4f32:
+  case SPU::LRv2i64:
+  case SPU::LRv2f64:
+  case SPU::ORv16i8_i8:
+  case SPU::ORv8i16_i16:
+  case SPU::ORv4i32_i32:
+  case SPU::ORv2i64_i64:
+  case SPU::ORv4f32_f32:
+  case SPU::ORv2f64_f64:
+  case SPU::ORi8_v16i8:
+  case SPU::ORi16_v8i16:
+  case SPU::ORi32_v4i32:
+  case SPU::ORi64_v2i64:
+  case SPU::ORf32_v4f32:
+  case SPU::ORf64_v2f64:
+/*
+  case SPU::ORi128_r64:
+  case SPU::ORi128_f64:
+  case SPU::ORi128_r32:
+  case SPU::ORi128_f32:
+  case SPU::ORi128_r16:
+  case SPU::ORi128_r8:
+*/
+  case SPU::ORi128_vec:
+/*
+  case SPU::ORr64_i128:
+  case SPU::ORf64_i128:
+  case SPU::ORr32_i128:
+  case SPU::ORf32_i128:
+  case SPU::ORr16_i128:
+  case SPU::ORr8_i128:
+*/
+  case SPU::ORvec_i128:
+/*
+  case SPU::ORr16_r32:
+  case SPU::ORr8_r32:
+  case SPU::ORf32_r32:
+  case SPU::ORr32_f32:
+  case SPU::ORr32_r16:
+  case SPU::ORr32_r8:
+  case SPU::ORr16_r64:
+  case SPU::ORr8_r64:
+  case SPU::ORr64_r16:
+  case SPU::ORr64_r8:
+*/
+  case SPU::ORr64_r32:
+  case SPU::ORr32_r64:
+  case SPU::ORf32_r32:
+  case SPU::ORr32_f32:
+  case SPU::ORf64_r64:
+  case SPU::ORr64_f64: {
+    assert(MI.getNumOperands() == 2 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           "invalid SPU OR<type>_<vec> or LR instruction!");
+    // NOTE(review): this only reports a move when dest == src, i.e. a no-op
+    // copy, although an LR is presumably always a copy regardless of the
+    // registers involved -- confirm whether the restriction is intentional.
+    if (MI.getOperand(0).getReg() == MI.getOperand(1).getReg()) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+    break;
+  }
+  // A three-operand OR whose two source registers are identical is a copy
+  // ("or rT, rA, rA").
+  case SPU::ORv16i8:
+  case SPU::ORv8i16:
+  case SPU::ORv4i32:
+  case SPU::ORv2i64:
+  case SPU::ORr8:
+  case SPU::ORr16:
+  case SPU::ORr32:
+  case SPU::ORr64:
+  case SPU::ORr128:
+  case SPU::ORf32:
+  case SPU::ORf64:
+    assert(MI.getNumOperands() == 3 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           MI.getOperand(2).isReg() &&
+           "invalid SPU OR(vec|r32|r64|gprc) instruction!");
+    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+    break;
+  }
+
+  return false;
+}
+
+// If MI is a direct (D-form) load from a stack slot, return the destination
+// register and set FrameIndex to the slot; otherwise return 0.
+// NOTE(review): LQDr8 is not listed here although STQDr8 appears in
+// isStoreToStackSlot -- confirm whether byte reloads are intentionally
+// excluded.
+unsigned
+SPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                  int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case SPU::LQDv16i8:
+  case SPU::LQDv8i16:
+  case SPU::LQDv4i32:
+  case SPU::LQDv4f32:
+  case SPU::LQDv2f64:
+  case SPU::LQDr128:
+  case SPU::LQDr64:
+  case SPU::LQDr32:
+  case SPU::LQDr16: {
+    // Bind the operands by reference; the original code copied each
+    // MachineOperand by value for no benefit.
+    const MachineOperand &MOp1 = MI->getOperand(1);
+    const MachineOperand &MOp2 = MI->getOperand(2);
+    // A stack-slot load is "lqd rT, imm(FI)": immediate offset + frame index.
+    if (MOp1.isImm() && MOp2.isFI()) {
+      FrameIndex = MOp2.getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  }
+  return 0;
+}
+
+// If MI is a direct (D-form) store to a stack slot, return the register
+// being stored and set FrameIndex to the slot; otherwise return 0.
+unsigned
+SPUInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                 int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case SPU::STQDv16i8:
+  case SPU::STQDv8i16:
+  case SPU::STQDv4i32:
+  case SPU::STQDv4f32:
+  case SPU::STQDv2f64:
+  case SPU::STQDr128:
+  case SPU::STQDr64:
+  case SPU::STQDr32:
+  case SPU::STQDr16:
+  case SPU::STQDr8: {
+    // Bind the operands by reference; the original code copied each
+    // MachineOperand by value for no benefit.
+    const MachineOperand &MOp1 = MI->getOperand(1);
+    const MachineOperand &MOp2 = MI->getOperand(2);
+    // A stack-slot store is "stqd rT, imm(FI)": immediate offset + frame index.
+    if (MOp1.isImm() && MOp2.isFI()) {
+      FrameIndex = MOp2.getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  }
+  return 0;
+}
+
+bool SPUInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *DestRC,
+                                   const TargetRegisterClass *SrcRC) const
+{
+  // Cross register class moves between aliased classes are supported (such
+  // as R3 in any class to any other class containing R3), because bitconvert
+  // i64 -> f64 is selected as a no-op and the types carry no specific
+  // meaning.  Pick the LR variant for the *destination* class, then emit a
+  // single copy.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  unsigned CopyOpc;
+  if (DestRC == SPU::R8CRegisterClass)
+    CopyOpc = SPU::LRr8;
+  else if (DestRC == SPU::R16CRegisterClass)
+    CopyOpc = SPU::LRr16;
+  else if (DestRC == SPU::R32CRegisterClass)
+    CopyOpc = SPU::LRr32;
+  else if (DestRC == SPU::R32FPRegisterClass)
+    CopyOpc = SPU::LRf32;
+  else if (DestRC == SPU::R64CRegisterClass)
+    CopyOpc = SPU::LRr64;
+  else if (DestRC == SPU::R64FPRegisterClass)
+    CopyOpc = SPU::LRf64;
+  else if (DestRC == SPU::GPRCRegisterClass)
+    CopyOpc = SPU::LRr128;
+  else if (DestRC == SPU::VECREGRegisterClass)
+    CopyOpc = SPU::LRv16i8;
+  else
+    return false;  // Attempt to copy unknown/unsupported register class!
+
+  BuildMI(MBB, MI, DL, get(CopyOpc), DestReg).addReg(SrcReg);
+  return true;
+}
+
+void
+SPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI,
+                                     unsigned SrcReg, bool isKill, int FrameIdx,
+                                     const TargetRegisterClass *RC) const
+{
+  // For each register class, pick both the D-form (immediate displacement)
+  // and X-form (register indexed) store opcodes, then choose between them
+  // based on whether FrameIdx fits a D-form displacement.
+  unsigned DOpc, XOpc;
+  if (RC == SPU::GPRCRegisterClass) {
+    DOpc = SPU::STQDr128; XOpc = SPU::STQXr128;
+  } else if (RC == SPU::R64CRegisterClass || RC == SPU::R64FPRegisterClass) {
+    DOpc = SPU::STQDr64;  XOpc = SPU::STQXr64;
+  } else if (RC == SPU::R32CRegisterClass || RC == SPU::R32FPRegisterClass) {
+    DOpc = SPU::STQDr32;  XOpc = SPU::STQXr32;
+  } else if (RC == SPU::R16CRegisterClass) {
+    DOpc = SPU::STQDr16;  XOpc = SPU::STQXr16;
+  } else if (RC == SPU::R8CRegisterClass) {
+    DOpc = SPU::STQDr8;   XOpc = SPU::STQXr8;
+  } else if (RC == SPU::VECREGRegisterClass) {
+    DOpc = SPU::STQDv16i8; XOpc = SPU::STQXv16i8;
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+
+  const unsigned opc =
+    (FrameIdx < SPUFrameInfo::maxFrameOffset()) ? DOpc : XOpc;
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  addFrameReference(BuildMI(MBB, MI, DL, get(opc))
+                    .addReg(SrcReg, getKillRegState(isKill)), FrameIdx);
+}
+
+void
+SPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC) const
+{
+  // Mirror of storeRegToStackSlot: pick the D-form and X-form load opcodes
+  // for the class, then choose by whether FrameIdx fits a D-form
+  // displacement.
+  unsigned DOpc, XOpc;
+  if (RC == SPU::GPRCRegisterClass) {
+    DOpc = SPU::LQDr128; XOpc = SPU::LQXr128;
+  } else if (RC == SPU::R64CRegisterClass || RC == SPU::R64FPRegisterClass) {
+    DOpc = SPU::LQDr64;  XOpc = SPU::LQXr64;
+  } else if (RC == SPU::R32CRegisterClass || RC == SPU::R32FPRegisterClass) {
+    DOpc = SPU::LQDr32;  XOpc = SPU::LQXr32;
+  } else if (RC == SPU::R16CRegisterClass) {
+    DOpc = SPU::LQDr16;  XOpc = SPU::LQXr16;
+  } else if (RC == SPU::R8CRegisterClass) {
+    DOpc = SPU::LQDr8;   XOpc = SPU::LQXr8;
+  } else if (RC == SPU::VECREGRegisterClass) {
+    DOpc = SPU::LQDv16i8; XOpc = SPU::LQXv16i8;
+  } else {
+    llvm_unreachable("Unknown regclass in loadRegFromStackSlot!");
+  }
+
+  const unsigned opc =
+    (FrameIdx < SPUFrameInfo::maxFrameOffset()) ? DOpc : XOpc;
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  addFrameReference(BuildMI(MBB, MI, DL, get(opc), DestReg), FrameIdx);
+}
+
+//! Return true if the specified load or store can be folded
+bool
+SPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                   const SmallVectorImpl<unsigned> &Ops) const {
+  // Only a single-operand fold of a reg-reg copy is supported; on SPU a copy
+  // is an OR whose two source registers are the same.
+  if (Ops.size() != 1) return false;
+
+  switch (MI->getOpcode()) {
+  case SPU::ORv16i8:
+  case SPU::ORv8i16:
+  case SPU::ORv4i32:
+  case SPU::ORv2i64:
+  case SPU::ORr8:
+  case SPU::ORr16:
+  case SPU::ORr32:
+  case SPU::ORr64:
+  case SPU::ORf32:
+  case SPU::ORf64:
+    return MI->getOperand(1).getReg() == MI->getOperand(2).getReg();
+  default:
+    return false;
+  }
+}
+
+/// foldMemoryOperand - SPU, like PPC, can only fold spills into
+/// copy instructions, turning them into load/store instructions.
+/// Returns the new instruction on success, 0 when the fold is not possible
+/// (e.g. the frame index does not fit a D-form displacement).
+MachineInstr *
+SPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                    MachineInstr *MI,
+                                    const SmallVectorImpl<unsigned> &Ops,
+                                    int FrameIndex) const
+{
+  if (Ops.size() != 1) return 0;
+
+  unsigned OpNum = Ops[0];
+  unsigned Opc = MI->getOpcode();
+  MachineInstr *NewMI = 0;
+
+  switch (Opc) {
+  case SPU::ORv16i8:
+  case SPU::ORv8i16:
+  case SPU::ORv4i32:
+  case SPU::ORv2i64:
+  case SPU::ORr8:
+  case SPU::ORr16:
+  case SPU::ORr32:
+  case SPU::ORr64:
+  case SPU::ORf32:
+  case SPU::ORf64:
+    if (OpNum == 0) {  // move -> store
+      unsigned InReg = MI->getOperand(1).getReg();
+      bool isKill = MI->getOperand(1).isKill();
+      bool isUndef = MI->getOperand(1).isUndef();
+      if (FrameIndex < SPUFrameInfo::maxFrameOffset()) {
+        MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(),
+                                          get(SPU::STQDr32));
+        MIB.addReg(InReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+        NewMI = addFrameReference(MIB, FrameIndex);
+      }
+    } else {           // move -> load
+      unsigned OutReg = MI->getOperand(0).getReg();
+      bool isDead = MI->getOperand(0).isDead();
+      bool isUndef = MI->getOperand(0).isUndef();
+      // BUG FIX: the replacement must be built with a *load* opcode.  The
+      // original code built the new instruction with the OR opcode itself
+      // and only afterwards computed a (store!) opcode into Opc, which was
+      // never used.  Guard on the frame offset exactly like the store path.
+      if (FrameIndex < SPUFrameInfo::maxFrameOffset()) {
+        MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(),
+                                          get(SPU::LQDr32));
+        MIB.addReg(OutReg, RegState::Define | getDeadRegState(isDead) |
+                   getUndefRegState(isUndef));
+        NewMI = addFrameReference(MIB, FrameIndex);
+      }
+    }
+    break;
+  }
+
+  return NewMI;
+}
+
+//! Branch analysis
+/*!
+  \note This code was copied from PPC. There may be more branch analysis for
+  CellSPU than what's currently done here.
+
+  Returns false when the block's terminators were understood (TBB/FBB/Cond
+  filled in per the TargetInstrInfo contract), true when they could not be
+  analyzed.  A condition is encoded as two operands: the conditional branch
+  opcode (as an immediate) and its condition register.
+ */
+bool
+SPUInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                            MachineBasicBlock *&FBB,
+                            SmallVectorImpl<MachineOperand> &Cond,
+                            bool AllowModify) const {
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (isUncondBranch(LastInst)) {
+      // Single unconditional branch: the target is operand 0.
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    } else if (isCondBranch(LastInst)) {
+      // Block ends with fall-through condbranch.
+      TBB = LastInst->getOperand(1).getMBB();
+      DEBUG(errs() << "Pushing LastInst:               ");
+      DEBUG(LastInst->dump());
+      Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+      Cond.push_back(LastInst->getOperand(0));
+      return false;
+    }
+    // Otherwise, don't know what this is.
+    return true;
+  }
+
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with a conditional and unconditional branch, handle it.
+  if (isCondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
+    TBB =  SecondLastInst->getOperand(1).getMBB();
+    DEBUG(errs() << "Pushing SecondLastInst:         ");
+    DEBUG(SecondLastInst->dump());
+    Cond.push_back(MachineOperand::CreateImm(SecondLastInst->getOpcode()));
+    Cond.push_back(SecondLastInst->getOperand(0));
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (isUncondBranch(SecondLastInst) && isUncondBranch(LastInst)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+unsigned
+SPUInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  // Delete up to two trailing branch instructions from the block (a
+  // conditional branch may precede the final unconditional one).  Returns
+  // how many were removed.
+  unsigned NumRemoved = 0;
+  while (NumRemoved < 2) {
+    MachineBasicBlock::iterator I = MBB.end();
+    if (I == MBB.begin())
+      break;
+    --I;
+    if (!isCondBranch(I) && !isUncondBranch(I))
+      break;
+
+    DEBUG(errs() << (NumRemoved == 0 ? "Removing branch:                "
+                                     : "Removing second branch:         "));
+    DEBUG(I->dump());
+    I->eraseFromParent();
+    ++NumRemoved;
+  }
+  return NumRemoved;
+}
+
+unsigned
+SPUInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                           MachineBasicBlock *FBB,
+                           const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME this should probably have a DebugLoc argument
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "SPU branch conditions have two components!");
+
+  if (FBB == 0) {
+    // One-way branch: unconditional, or a fall-through conditional branch.
+    if (Cond.empty()) {
+      MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(SPU::BR)).addMBB(TBB);
+      DEBUG(errs() << "Inserted one-way uncond branch: ");
+      DEBUG((*MIB).dump());
+    } else {
+      MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(Cond[0].getImm()));
+      MIB.addReg(Cond[1].getReg()).addMBB(TBB);
+      DEBUG(errs() << "Inserted one-way cond branch:   ");
+      DEBUG((*MIB).dump());
+    }
+    return 1;
+  }
+
+  // Two-way branch: conditional branch to TBB followed by an unconditional
+  // branch to FBB.
+  MachineInstrBuilder MIB = BuildMI(&MBB, dl, get(Cond[0].getImm()));
+  MIB.addReg(Cond[1].getReg()).addMBB(TBB);
+  MachineInstrBuilder MIB2 = BuildMI(&MBB, dl, get(SPU::BR)).addMBB(FBB);
+
+  DEBUG(errs() << "Inserted conditional branch:    ");
+  DEBUG((*MIB).dump());
+  DEBUG(errs() << "part 2: ");
+  DEBUG((*MIB2).dump());
+  return 2;
+}
+
+//! Reverses a branch's condition, returning false on success.
+/*!
+  The condition is encoded as the conditional-branch opcode itself (see
+  AnalyzeBranch), so reversal is a lookup mapping each branch opcode to its
+  logical opposite.  Returns true (failure) for an unknown opcode.
+ */
+bool
+SPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond)
+  const {
+  // The table is immutable; make it 'static const' so it lives in read-only
+  // data instead of being a mutable function-local static.
+  static const struct {
+    unsigned Opc;               //!< The incoming opcode
+    unsigned RevCondOpc;        //!< The reversed condition opcode
+  } revconds[] = {
+    { SPU::BRNZr32, SPU::BRZr32 },
+    { SPU::BRNZv4i32, SPU::BRZv4i32 },
+    { SPU::BRZr32, SPU::BRNZr32 },
+    { SPU::BRZv4i32, SPU::BRNZv4i32 },
+    { SPU::BRHNZr16, SPU::BRHZr16 },
+    { SPU::BRHNZv8i16, SPU::BRHZv8i16 },
+    { SPU::BRHZr16, SPU::BRHNZr16 },
+    { SPU::BRHZv8i16, SPU::BRHNZv8i16 }
+  };
+
+  const unsigned Opc = unsigned(Cond[0].getImm());
+  for (unsigned i = 0, e = sizeof(revconds)/sizeof(revconds[0]); i != e; ++i) {
+    if (revconds[i].Opc == Opc) {
+      Cond[0].setImm(revconds[i].RevCondOpc);
+      return false;             // success
+    }
+  }
+
+  return true;                  // unknown opcode: cannot reverse
+}
diff --git a/lib/Target/CellSPU/SPUInstrInfo.h b/lib/Target/CellSPU/SPUInstrInfo.h
new file mode 100644
index 0000000..42677fc
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrInfo.h
@@ -0,0 +1,99 @@
+//===- SPUInstrInfo.h - Cell SPU Instruction Information --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the CellSPU implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_INSTRUCTIONINFO_H
+#define SPU_INSTRUCTIONINFO_H
+
+#include "SPU.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "SPURegisterInfo.h"
+
+namespace llvm {
+  //! Cell SPU instruction information class
+  class SPUInstrInfo : public TargetInstrInfoImpl {
+    SPUTargetMachine &TM;       // Owning target machine.
+    const SPURegisterInfo RI;   // Target register info, built in the ctor.
+  protected:
+    //! Fold a spill (FrameIndex) into a reg-reg copy, producing a
+    //! load/store; returns 0 when the fold is not possible.
+    virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                            MachineInstr* MI,
+                                            const SmallVectorImpl<unsigned> &Ops,
+                                            int FrameIndex) const;
+
+    //! Folding against a preceding load instruction is not implemented.
+    virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                                MachineInstr* MI,
+                                                const SmallVectorImpl<unsigned> &Ops,
+                                                MachineInstr* LoadMI) const {
+      return 0;
+    }
+
+  public:
+    explicit SPUInstrInfo(SPUTargetMachine &tm);
+
+    /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+    /// such, whenever a client has an instance of instruction info, it should
+    /// always be able to get register info as well (through this method).
+    ///
+    virtual const SPURegisterInfo &getRegisterInfo() const { return RI; }
+
+    /// Return true if the instruction is a register to register move and return
+    /// the source and dest operands and their sub-register indices by reference.
+    virtual bool isMoveInstr(const MachineInstr &MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+    //! If MI loads from / stores to a stack slot, return the register and
+    //! set FrameIndex; return 0 otherwise.
+    unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                 int &FrameIndex) const;
+    unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                int &FrameIndex) const;
+
+    //! Emit a register-to-register copy; returns false for an unsupported
+    //! destination register class.
+    virtual bool copyRegToReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              unsigned DestReg, unsigned SrcReg,
+                              const TargetRegisterClass *DestRC,
+                              const TargetRegisterClass *SrcRC) const;
+
+    //! Store a register to a stack slot, based on its register class.
+    virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MBBI,
+                                     unsigned SrcReg, bool isKill, int FrameIndex,
+                                     const TargetRegisterClass *RC) const;
+
+    //! Load a register from a stack slot, based on its register class.
+    virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                      MachineBasicBlock::iterator MBBI,
+                                      unsigned DestReg, int FrameIndex,
+                                      const TargetRegisterClass *RC) const;
+
+    //! Return true if the specified load or store can be folded
+    virtual
+    bool canFoldMemoryOperand(const MachineInstr *MI,
+                              const SmallVectorImpl<unsigned> &Ops) const;
+
+    //! Reverses a branch's condition, returning false on success.
+    virtual
+    bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+    //! Analyze / remove / insert block terminators (TargetInstrInfo branch
+    //! interface); Cond is encoded as {branch opcode imm, condition reg}.
+    virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                               MachineBasicBlock *&FBB,
+                               SmallVectorImpl<MachineOperand> &Cond,
+                               bool AllowModify) const;
+
+    virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+    virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                              MachineBasicBlock *FBB,
+                              const SmallVectorImpl<MachineOperand> &Cond) const;
+   };
+}
+
+#endif
diff --git a/lib/Target/CellSPU/SPUInstrInfo.td b/lib/Target/CellSPU/SPUInstrInfo.td
new file mode 100644
index 0000000..f24ffd2
--- /dev/null
+++ b/lib/Target/CellSPU/SPUInstrInfo.td
@@ -0,0 +1,4601 @@
+//==- SPUInstrInfo.td - Describe the Cell SPU Instructions -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Cell SPU Instructions:
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TODO Items (not urgent today, but would be nice, low priority)
+//
+// ANDBI, ORBI: SPU constructs a 4-byte constant for these instructions by
+// concatenating the byte argument b as "bbbb". Could recognize this bit pattern
+// in 16-bit and 32-bit constants and reduce instruction count.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions:
+//===----------------------------------------------------------------------===//
+
+// Call-frame setup/teardown markers.  These pseudos bracket a call sequence
+// and both define and use the stack pointer (R1), which they may adjust.
+let hasCtrlDep = 1, Defs = [R1], Uses = [R1] in {
+  def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm_i32:$amt),
+                                "${:comment} ADJCALLSTACKDOWN",
+                                [(callseq_start timm:$amt)]>;
+  def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm_i32:$amt),
+                                "${:comment} ADJCALLSTACKUP",
+                                [(callseq_end timm:$amt)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Loads:
+// NB: The ordering is actually important, since the instruction selection
+// will try each of the instructions in sequence, i.e., the D-form first with
+// the 10-bit displacement, then the A-form with the 16 bit displacement, and
+// finally the X-form with the register-register.
+//===----------------------------------------------------------------------===//
+
+// All quadword loads can be folded into their use by the scheduler/folder.
+let canFoldAsLoad = 1 in {
+  // D-form load: base register + 10-bit signed displacement.
+  class LoadDFormVec<ValueType vectype>
+    : RI10Form<0b00101100, (outs VECREG:$rT), (ins dformaddr:$src),
+               "lqd\t$rT, $src",
+               LoadStore,
+               [(set (vectype VECREG:$rT), (load dform_addr:$src))]>
+  { }
+
+  class LoadDForm<RegisterClass rclass>
+    : RI10Form<0b00101100, (outs rclass:$rT), (ins dformaddr:$src),
+               "lqd\t$rT, $src",
+               LoadStore,
+               [(set rclass:$rT, (load dform_addr:$src))]>
+  { }
+
+  // One lqd variant per vector type and scalar register class.
+  multiclass LoadDForms
+  {
+    def v16i8: LoadDFormVec<v16i8>;
+    def v8i16: LoadDFormVec<v8i16>;
+    def v4i32: LoadDFormVec<v4i32>;
+    def v2i64: LoadDFormVec<v2i64>;
+    def v4f32: LoadDFormVec<v4f32>;
+    def v2f64: LoadDFormVec<v2f64>;
+
+    def v2i32: LoadDFormVec<v2i32>;
+
+    def r128:  LoadDForm<GPRC>;
+    def r64:   LoadDForm<R64C>;
+    def r32:   LoadDForm<R32C>;
+    def f32:   LoadDForm<R32FP>;
+    def f64:   LoadDForm<R64FP>;
+    def r16:   LoadDForm<R16C>;
+    def r8:    LoadDForm<R8C>;
+  }
+
+  // A-form load: 16-bit absolute address.
+  class LoadAFormVec<ValueType vectype>
+    : RI16Form<0b100001100, (outs VECREG:$rT), (ins addr256k:$src),
+               "lqa\t$rT, $src",
+               LoadStore,
+               [(set (vectype VECREG:$rT), (load aform_addr:$src))]>
+  { }
+
+  class LoadAForm<RegisterClass rclass>
+    : RI16Form<0b100001100, (outs rclass:$rT), (ins addr256k:$src),
+               "lqa\t$rT, $src",
+               LoadStore,
+               [(set rclass:$rT, (load aform_addr:$src))]>
+  { }
+
+  multiclass LoadAForms
+  {
+    def v16i8: LoadAFormVec<v16i8>;
+    def v8i16: LoadAFormVec<v8i16>;
+    def v4i32: LoadAFormVec<v4i32>;
+    def v2i64: LoadAFormVec<v2i64>;
+    def v4f32: LoadAFormVec<v4f32>;
+    def v2f64: LoadAFormVec<v2f64>;
+
+    def v2i32: LoadAFormVec<v2i32>;
+
+    def r128:  LoadAForm<GPRC>;
+    def r64:   LoadAForm<R64C>;
+    def r32:   LoadAForm<R32C>;
+    def f32:   LoadAForm<R32FP>;
+    def f64:   LoadAForm<R64FP>;
+    def r16:   LoadAForm<R16C>;
+    def r8:    LoadAForm<R8C>;
+  }
+
+  // X-form load: register + register addressing.
+  class LoadXFormVec<ValueType vectype>
+    : RRForm<0b00100011100, (outs VECREG:$rT), (ins memrr:$src),
+             "lqx\t$rT, $src",
+             LoadStore,
+             [(set (vectype VECREG:$rT), (load xform_addr:$src))]>
+  { }
+
+  class LoadXForm<RegisterClass rclass>
+    : RRForm<0b00100011100, (outs rclass:$rT), (ins memrr:$src),
+             "lqx\t$rT, $src",
+             LoadStore,
+             [(set rclass:$rT, (load xform_addr:$src))]>
+  { }
+
+  multiclass LoadXForms
+  {
+    def v16i8: LoadXFormVec<v16i8>;
+    def v8i16: LoadXFormVec<v8i16>;
+    def v4i32: LoadXFormVec<v4i32>;
+    def v2i64: LoadXFormVec<v2i64>;
+    def v4f32: LoadXFormVec<v4f32>;
+    def v2f64: LoadXFormVec<v2f64>;
+
+    def v2i32: LoadXFormVec<v2i32>;
+
+    def r128:  LoadXForm<GPRC>;
+    def r64:   LoadXForm<R64C>;
+    def r32:   LoadXForm<R32C>;
+    def f32:   LoadXForm<R32FP>;
+    def f64:   LoadXForm<R64FP>;
+    def r16:   LoadXForm<R16C>;
+    def r8:    LoadXForm<R8C>;
+  }
+
+  defm LQA : LoadAForms;
+  defm LQD : LoadDForms;
+  defm LQX : LoadXForms;
+
+/* Load quadword, PC relative: Not much use at this point in time.
+   Might be of use later for relocatable code. It's effectively the
+   same as LQA, but uses PC-relative addressing.
+  def LQR : RI16Form<0b111001100, (outs VECREG:$rT), (ins s16imm:$disp),
+                     "lqr\t$rT, $disp", LoadStore,
+                     [(set VECREG:$rT, (load iaddr:$disp))]>;
+ */
+}
+
+//===----------------------------------------------------------------------===//
+// Stores:
+//===----------------------------------------------------------------------===//
+// D-form store: base register + 10-bit signed displacement.
+class StoreDFormVec<ValueType vectype>
+  : RI10Form<0b00100100, (outs), (ins VECREG:$rT, dformaddr:$src),
+             "stqd\t$rT, $src",
+             LoadStore,
+             [(store (vectype VECREG:$rT), dform_addr:$src)]>
+{ }
+
+class StoreDForm<RegisterClass rclass>
+  : RI10Form<0b00100100, (outs), (ins rclass:$rT, dformaddr:$src),
+             "stqd\t$rT, $src",
+             LoadStore,
+             [(store rclass:$rT, dform_addr:$src)]>
+{ }
+
+// One stqd variant per vector type and scalar register class.
+multiclass StoreDForms
+{
+  def v16i8: StoreDFormVec<v16i8>;
+  def v8i16: StoreDFormVec<v8i16>;
+  def v4i32: StoreDFormVec<v4i32>;
+  def v2i64: StoreDFormVec<v2i64>;
+  def v4f32: StoreDFormVec<v4f32>;
+  def v2f64: StoreDFormVec<v2f64>;
+
+  def v2i32: StoreDFormVec<v2i32>;
+
+  def r128:  StoreDForm<GPRC>;
+  def r64:   StoreDForm<R64C>;
+  def r32:   StoreDForm<R32C>;
+  def f32:   StoreDForm<R32FP>;
+  def f64:   StoreDForm<R64FP>;
+  def r16:   StoreDForm<R16C>;
+  def r8:    StoreDForm<R8C>;
+}
+
+// A-form store: 16-bit absolute address.
+// NOTE(review): the opcode literals below (0b0010010 vs 0b001001) disagree
+// in width with each other and with the 9-bit load A-form opcodes above;
+// TableGen zero-extends them into the bits<9> field, but confirm both encode
+// stqa correctly.
+class StoreAFormVec<ValueType vectype>
+  : RI16Form<0b0010010, (outs), (ins VECREG:$rT, addr256k:$src),
+             "stqa\t$rT, $src",
+             LoadStore,
+             [(store (vectype VECREG:$rT), aform_addr:$src)]>;
+
+class StoreAForm<RegisterClass rclass>
+  : RI16Form<0b001001, (outs), (ins rclass:$rT, addr256k:$src),
+             "stqa\t$rT, $src",
+             LoadStore,
+             [(store rclass:$rT, aform_addr:$src)]>;
+
+multiclass StoreAForms
+{
+  def v16i8: StoreAFormVec<v16i8>;
+  def v8i16: StoreAFormVec<v8i16>;
+  def v4i32: StoreAFormVec<v4i32>;
+  def v2i64: StoreAFormVec<v2i64>;
+  def v4f32: StoreAFormVec<v4f32>;
+  def v2f64: StoreAFormVec<v2f64>;
+
+  def v2i32: StoreAFormVec<v2i32>;
+
+  def r128:  StoreAForm<GPRC>;
+  def r64:   StoreAForm<R64C>;
+  def r32:   StoreAForm<R32C>;
+  def f32:   StoreAForm<R32FP>;
+  def f64:   StoreAForm<R64FP>;
+  def r16:   StoreAForm<R16C>;
+  def r8:    StoreAForm<R8C>;
+}
+
+// X-form store (stqx): store a quadword to a register + register address.
+class StoreXFormVec<ValueType vectype>
+  : RRForm<0b00100100, (outs), (ins VECREG:$rT, memrr:$src),
+           "stqx\t$rT, $src",
+           LoadStore,
+           [(store (vectype VECREG:$rT), xform_addr:$src)]>
+{ }
+
+// Scalar-register variant of the X-form store.
+class StoreXForm<RegisterClass rclass>
+  : RRForm<0b00100100, (outs), (ins rclass:$rT, memrr:$src),
+           "stqx\t$rT, $src",
+           LoadStore,
+           [(store rclass:$rT, xform_addr:$src)]>
+{ }
+
+// One X-form store per legal vector/scalar type; instantiated as STQX below.
+multiclass StoreXForms
+{
+  def v16i8: StoreXFormVec<v16i8>;
+  def v8i16: StoreXFormVec<v8i16>;
+  def v4i32: StoreXFormVec<v4i32>;
+  def v2i64: StoreXFormVec<v2i64>;
+  def v4f32: StoreXFormVec<v4f32>;
+  def v2f64: StoreXFormVec<v2f64>;
+
+  def v2i32: StoreXFormVec<v2i32>;
+
+  def r128:  StoreXForm<GPRC>;
+  def r64:   StoreXForm<R64C>;
+  def r32:   StoreXForm<R32C>;
+  def f32:   StoreXForm<R32FP>;
+  def f64:   StoreXForm<R64FP>;
+  def r16:   StoreXForm<R16C>;
+  def r8:    StoreXForm<R8C>;
+}
+
+// Instantiate the three store families: STQD (d-form, base + displacement),
+// STQA (a-form, absolute), STQX (x-form, register + register).
+defm STQD : StoreDForms;
+defm STQA : StoreAForms;
+defm STQX : StoreXForms;
+
+/* Store quadword, PC relative: Not much use at this point in time. Might
+   be useful for relocatable code.
+def STQR : RI16Form<0b111000100, (outs), (ins VECREG:$rT, s16imm:$disp),
+                   "stqr\t$rT, $disp", LoadStore,
+                   [(store VECREG:$rT, iaddr:$disp)]>;
+*/
+
+//===----------------------------------------------------------------------===//
+// Generate Controls for Insertion:
+//===----------------------------------------------------------------------===//
+
+// c[bhwd]d (d-form) and c[bhwd]x (x-form) generate the shuffle-control mask
+// used with shufb to insert a byte/halfword/word/doubleword into a quadword
+// at the slot addressed by $src.  The *f32/*f64 variants are the same
+// instructions re-typed for float vectors.
+def CBD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+    "cbd\t$rT, $src", ShuffleOp,
+    [(set (v16i8 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CBX: RRForm<0b00101011100, (outs VECREG:$rT), (ins memrr:$src),
+    "cbx\t$rT, $src", ShuffleOp,
+    [(set (v16i8 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CHD: RI7Form<0b10101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+    "chd\t$rT, $src", ShuffleOp,
+    [(set (v8i16 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CHX: RRForm<0b10101011100, (outs VECREG:$rT), (ins memrr:$src),
+    "chx\t$rT, $src", ShuffleOp,
+    [(set (v8i16 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CWD: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+    "cwd\t$rT, $src", ShuffleOp,
+    [(set (v4i32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CWX: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src),
+    "cwx\t$rT, $src", ShuffleOp,
+    [(set (v4i32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CWDf32: RI7Form<0b01101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+    "cwd\t$rT, $src", ShuffleOp,
+    [(set (v4f32 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CWXf32: RRForm<0b01101011100, (outs VECREG:$rT), (ins memrr:$src),
+    "cwx\t$rT, $src", ShuffleOp,
+    [(set (v4f32 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CDD: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+    "cdd\t$rT, $src", ShuffleOp,
+    [(set (v2i64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CDX: RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src),
+    "cdx\t$rT, $src", ShuffleOp,
+    [(set (v2i64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+def CDDf64: RI7Form<0b11101111100, (outs VECREG:$rT), (ins shufaddr:$src),
+    "cdd\t$rT, $src", ShuffleOp,
+    [(set (v2f64 VECREG:$rT), (SPUshufmask dform2_addr:$src))]>;
+
+def CDXf64: RRForm<0b11101011100, (outs VECREG:$rT), (ins memrr:$src),
+    "cdx\t$rT, $src", ShuffleOp,
+    [(set (v2f64 VECREG:$rT), (SPUshufmask xform_addr:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// Constant formation:
+//===----------------------------------------------------------------------===//
+
+// ilh: immediate load halfword -- replicates a 16-bit immediate into every
+// halfword slot (vector form) or loads it into a scalar register.
+def ILHv8i16:
+  RI16Form<0b110000010, (outs VECREG:$rT), (ins s16imm:$val),
+    "ilh\t$rT, $val", ImmLoad,
+    [(set (v8i16 VECREG:$rT), (v8i16 v8i16SExt16Imm:$val))]>;
+
+def ILHr16:
+  RI16Form<0b110000010, (outs R16C:$rT), (ins s16imm:$val),
+    "ilh\t$rT, $val", ImmLoad,
+    [(set R16C:$rT, immSExt16:$val)]>;
+
+// Cell SPU doesn't have a native 8-bit immediate load, but ILH works ("with
+// the right constant")
+def ILHr8:
+  RI16Form<0b110000010, (outs R8C:$rT), (ins s16imm_i8:$val),
+    "ilh\t$rT, $val", ImmLoad,
+    [(set R8C:$rT, immSExt8:$val)]>;
+
+// IL does sign extension!
+
+// il: immediate load word, 16-bit immediate sign-extended to 32 bits.
+class ILInst<dag OOL, dag IOL, list<dag> pattern>:
+  RI16Form<0b100000010, OOL, IOL, "il\t$rT, $val",
+           ImmLoad, pattern>;
+
+// Vector form: xform is the PatLeaf that matches/transforms the immediate.
+class ILVecInst<ValueType vectype, Operand immtype, PatLeaf xform>:
+  ILInst<(outs VECREG:$rT), (ins immtype:$val),
+         [(set (vectype VECREG:$rT), (vectype xform:$val))]>;
+
+// Scalar form of il.
+class ILRegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>:
+  ILInst<(outs rclass:$rT), (ins immtype:$val),
+         [(set rclass:$rT, xform:$val)]>;
+
+multiclass ImmediateLoad
+{
+  def v2i64: ILVecInst<v2i64, s16imm_i64, v2i64SExt16Imm>;
+  def v4i32: ILVecInst<v4i32, s16imm_i32, v4i32SExt16Imm>;
+
+  // TODO: Need v2f64, v4f32
+
+  def r64: ILRegInst<R64C, s16imm_i64, immSExt16>;
+  def r32: ILRegInst<R32C, s16imm_i32, immSExt16>;
+  def f32: ILRegInst<R32FP, s16imm_f32, fpimmSExt16>;
+  def f64: ILRegInst<R64FP, s16imm_f64, fpimmSExt16>;
+}
+
+defm IL : ImmediateLoad;
+
+// ilhu: immediate load halfword upper -- places a 16-bit immediate in the
+// upper halfword of each word, zeroing the lower halfword.  Paired with
+// iohl to materialize full 32-bit constants.
+class ILHUInst<dag OOL, dag IOL, list<dag> pattern>:
+  RI16Form<0b010000010, OOL, IOL, "ilhu\t$rT, $val",
+           ImmLoad, pattern>;
+
+class ILHUVecInst<ValueType vectype, Operand immtype, PatLeaf xform>:
+  ILHUInst<(outs VECREG:$rT), (ins immtype:$val),
+           [(set (vectype VECREG:$rT), (vectype xform:$val))]>;
+
+class ILHURegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>:
+  ILHUInst<(outs rclass:$rT), (ins immtype:$val),
+           [(set rclass:$rT, xform:$val)]>;
+
+multiclass ImmLoadHalfwordUpper
+{
+  def v2i64: ILHUVecInst<v2i64, u16imm_i64, immILHUvec_i64>;
+  def v4i32: ILHUVecInst<v4i32, u16imm_i32, immILHUvec>;
+
+  def r64: ILHURegInst<R64C, u16imm_i64, hi16>;
+  def r32: ILHURegInst<R32C, u16imm_i32, hi16>;
+
+  // Loads the high portion of an address
+  def hi: ILHURegInst<R32C, symbolHi, hi16>;
+
+  // Used in custom lowering constant SFP loads:
+  def f32: ILHURegInst<R32FP, f16imm, hi16_f32>;
+}
+
+defm ILHU : ImmLoadHalfwordUpper;
+
+// Immediate load address (can also be used to load 18-bit unsigned constants,
+// see the zext 16->32 pattern)
+
+// ila: immediate load address -- loads an 18-bit unsigned immediate
+// (zero-extended); also usable for small unsigned constants.
+class ILAInst<dag OOL, dag IOL, list<dag> pattern>:
+  RI18Form<0b1000010, OOL, IOL, "ila\t$rT, $val",
+           LoadNOP, pattern>;
+
+class ILAVecInst<ValueType vectype, Operand immtype, PatLeaf xform>:
+  ILAInst<(outs VECREG:$rT), (ins immtype:$val),
+          [(set (vectype VECREG:$rT), (vectype xform:$val))]>;
+
+class ILARegInst<RegisterClass rclass, Operand immtype, PatLeaf xform>:
+  ILAInst<(outs rclass:$rT), (ins immtype:$val),
+          [(set rclass:$rT, xform:$val)]>;
+
+multiclass ImmLoadAddress
+{
+  def v2i64: ILAVecInst<v2i64, u18imm, v2i64Uns18Imm>;
+  def v4i32: ILAVecInst<v4i32, u18imm, v4i32Uns18Imm>;
+
+  def r64: ILARegInst<R64C, u18imm_i64, imm18>;
+  def r32: ILARegInst<R32C, u18imm, imm18>;
+  def f32: ILARegInst<R32FP, f18imm, fpimm18>;
+  def f64: ILARegInst<R64FP, f18imm_f64, fpimm18>;
+
+  // hi/lo halves of symbol addresses; lsa has no pattern (used by custom
+  // lowering only).
+  def hi: ILARegInst<R32C, symbolHi, imm18>;
+  def lo: ILARegInst<R32C, symbolLo, imm18>;
+
+  def lsa: ILAInst<(outs R32C:$rT), (ins symbolLSA:$val),
+                   [/* no pattern */]>;
+}
+
+defm ILA : ImmLoadAddress;
+
+// Immediate OR, Halfword Lower: The "other" part of loading large constants
+// into 32-bit registers. See the anonymous pattern Pat<(i32 imm:$imm), ...>
+// Note that these are really two operand instructions, but they're encoded
+// as three operands with the first two arguments tied-to each other.
+
+// iohl: OR a 16-bit immediate into the low halfword of $rS (tied to $rT).
+// Two-operand instruction encoded with $rS tied to $rT via RegConstraint;
+// NoEncode keeps $rS out of the binary encoding.
+class IOHLInst<dag OOL, dag IOL, list<dag> pattern>:
+  RI16Form<0b100000110, OOL, IOL, "iohl\t$rT, $val",
+           ImmLoad, pattern>,
+  RegConstraint<"$rS = $rT">,
+  NoEncode<"$rS">;
+
+class IOHLVecInst<ValueType vectype, Operand immtype /* , PatLeaf xform */>:
+  IOHLInst<(outs VECREG:$rT), (ins VECREG:$rS, immtype:$val),
+           [/* no pattern */]>;
+
+class IOHLRegInst<RegisterClass rclass, Operand immtype /* , PatLeaf xform */>:
+  IOHLInst<(outs rclass:$rT), (ins rclass:$rS, immtype:$val),
+           [/* no pattern */]>;
+
+multiclass ImmOrHalfwordLower
+{
+  def v2i64: IOHLVecInst<v2i64, u16imm_i64>;
+  def v4i32: IOHLVecInst<v4i32, u16imm_i32>;
+
+  def r32: IOHLRegInst<R32C, i32imm>;
+  def f32: IOHLRegInst<R32FP, f32imm>;
+
+  def lo: IOHLRegInst<R32C, symbolLo>;
+}
+
+defm IOHL: ImmOrHalfwordLower;
+
+// Form select mask for bytes using immediate, used in conjunction with the
+// SELB instruction:
+
+// fsmbi: expand each bit of a 16-bit immediate into a byte of the result
+// (select mask for SELB).
+class FSMBIVec<ValueType vectype>:
+  RI16Form<0b101001100, (outs VECREG:$rT), (ins u16imm:$val),
+          "fsmbi\t$rT, $val",
+          SelectOp,
+          [(set (vectype VECREG:$rT), (SPUselmask (i16 immU16:$val)))]>;
+
+multiclass FormSelectMaskBytesImm
+{
+  def v16i8: FSMBIVec<v16i8>;
+  def v8i16: FSMBIVec<v8i16>;
+  def v4i32: FSMBIVec<v4i32>;
+  def v2i64: FSMBIVec<v2i64>;
+}
+
+defm FSMBI : FormSelectMaskBytesImm;
+
+// fsmb: Form select mask for bytes. N.B. Input operand, $rA, is 16-bits
+class FSMBInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b01101101100, OOL, IOL, "fsmb\t$rT, $rA", SelectOp,
+             pattern>;
+
+// Register-input form: mask bits come from a scalar register.
+class FSMBRegInst<RegisterClass rclass, ValueType vectype>:
+    FSMBInst<(outs VECREG:$rT), (ins rclass:$rA),
+             [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>;
+
+// Vector-input form: mask bits come from the vector's preferred slot.
+class FSMBVecInst<ValueType vectype>:
+    FSMBInst<(outs VECREG:$rT), (ins VECREG:$rA),
+             [(set (vectype VECREG:$rT),
+                   (SPUselmask (vectype VECREG:$rA)))]>;
+
+multiclass FormSelectMaskBits {
+  def v16i8_r16: FSMBRegInst<R16C, v16i8>;
+  def v16i8:     FSMBVecInst<v16i8>;
+}
+
+defm FSMB: FormSelectMaskBits;
+
+// fsmh: Form select mask for halfwords. N.B., Input operand, $rA, is
+// only 8-bits wide (even though it's input as 16-bits here)
+
+// fsmh: expand the low 8 bits of $rA into halfword select masks.
+class FSMHInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b10101101100, OOL, IOL, "fsmh\t$rT, $rA", SelectOp,
+             pattern>;
+
+class FSMHRegInst<RegisterClass rclass, ValueType vectype>:
+    FSMHInst<(outs VECREG:$rT), (ins rclass:$rA),
+             [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>;
+
+class FSMHVecInst<ValueType vectype>:
+    FSMHInst<(outs VECREG:$rT), (ins VECREG:$rA),
+             [(set (vectype VECREG:$rT),
+                   (SPUselmask (vectype VECREG:$rA)))]>;
+
+multiclass FormSelectMaskHalfword {
+  def v8i16_r16: FSMHRegInst<R16C, v8i16>;
+  def v8i16:     FSMHVecInst<v8i16>;
+}
+
+defm FSMH: FormSelectMaskHalfword;
+
+// fsm: Form select mask for words. Like the other fsm* instructions,
+// only the lower 4 bits of $rA are significant.
+
+// fsm: expand the low 4 bits of $rA into word select masks.
+class FSMInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b00101101100, OOL, IOL, "fsm\t$rT, $rA", SelectOp,
+             pattern>;
+
+class FSMRegInst<ValueType vectype, RegisterClass rclass>:
+    FSMInst<(outs VECREG:$rT), (ins rclass:$rA),
+            [(set (vectype VECREG:$rT), (SPUselmask rclass:$rA))]>;
+
+class FSMVecInst<ValueType vectype>:
+    FSMInst<(outs VECREG:$rT), (ins VECREG:$rA),
+            [(set (vectype VECREG:$rT), (SPUselmask (vectype VECREG:$rA)))]>;
+
+multiclass FormSelectMaskWord {
+  def v4i32: FSMVecInst<v4i32>;
+
+  def r32 :  FSMRegInst<v4i32, R32C>;
+  def r16 :  FSMRegInst<v4i32, R16C>;
+}
+
+defm FSM : FormSelectMaskWord;
+
+// Special case when used for i64 math operations: same fsm instruction,
+// but the result is typed as v2i64.
+multiclass FormSelectMaskWord64 {
+  def r32 : FSMRegInst<v2i64, R32C>;
+  def r16 : FSMRegInst<v2i64, R16C>;
+}
+
+defm FSM64 : FormSelectMaskWord64;
+
+//===----------------------------------------------------------------------===//
+// Integer and Logical Operations:
+//===----------------------------------------------------------------------===//
+
+// ah/ahi: halfword add (register and 10-bit signed immediate forms).
+// AHv8i16 matches the int_spu_si_ah intrinsic; the generic (add v8i16)
+// pattern is mapped onto it by the anonymous Pat below it.
+def AHv8i16:
+  RRForm<0b00010011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "ah\t$rT, $rA, $rB", IntegerOp,
+    [(set (v8i16 VECREG:$rT), (int_spu_si_ah VECREG:$rA, VECREG:$rB))]>;
+
+def : Pat<(add (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
+          (AHv8i16 VECREG:$rA, VECREG:$rB)>;
+
+def AHr16:
+  RRForm<0b00010011000, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+    "ah\t$rT, $rA, $rB", IntegerOp,
+    [(set R16C:$rT, (add R16C:$rA, R16C:$rB))]>;
+
+def AHIvec:
+    RI10Form<0b10111000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+      "ahi\t$rT, $rA, $val", IntegerOp,
+      [(set (v8i16 VECREG:$rT), (add (v8i16 VECREG:$rA),
+                                     v8i16SExt10Imm:$val))]>;
+
+def AHIr16:
+  RI10Form<0b10111000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+    "ahi\t$rT, $rA, $val", IntegerOp,
+    [(set R16C:$rT, (add R16C:$rA, i16ImmSExt10:$val))]>;
+
+// v4i32, i32 add instruction:
+
+// a: word add (vector and scalar forms), instantiated as A* below.
+class AInst<dag OOL, dag IOL, list<dag> pattern>:
+  RRForm<0b00000011000, OOL, IOL,
+         "a\t$rT, $rA, $rB", IntegerOp,
+         pattern>;
+
+class AVecInst<ValueType vectype>:
+  AInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+        [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA),
+                                         (vectype VECREG:$rB)))]>;
+
+class ARegInst<RegisterClass rclass>:
+  AInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+        [(set rclass:$rT, (add rclass:$rA, rclass:$rB))]>;
+
+multiclass AddInstruction {
+  def v4i32: AVecInst<v4i32>;
+  def v16i8: AVecInst<v16i8>;
+
+  def r32:   ARegInst<R32C>;
+}
+
+defm A : AddInstruction;
+
+// ai: add word with 10-bit signed immediate.
+class AIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI10Form<0b00111000, OOL, IOL,
+             "ai\t$rT, $rA, $val", IntegerOp,
+             pattern>;
+
+class AIVecInst<ValueType vectype, PatLeaf immpred>:
+    AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+            [(set (vectype VECREG:$rT), (add (vectype VECREG:$rA), immpred:$val))]>;
+
+// FP-typed vector form with no pattern (used by custom lowering only).
+class AIFPVecInst<ValueType vectype, PatLeaf immpred>:
+    AIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+            [/* no pattern */]>;
+
+class AIRegInst<RegisterClass rclass, PatLeaf immpred>:
+    AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
+           [(set rclass:$rT, (add rclass:$rA, immpred:$val))]>;
+
+// This is used to add epsilons to floating point numbers in the f32 fdiv code:
+class AIFPInst<RegisterClass rclass, PatLeaf immpred>:
+    AIInst<(outs rclass:$rT), (ins rclass:$rA, s10imm_i32:$val),
+           [/* no pattern */]>;
+
+multiclass AddImmediate {
+  def v4i32: AIVecInst<v4i32, v4i32SExt10Imm>;
+
+  def r32: AIRegInst<R32C, i32ImmSExt10>;
+
+  def v4f32: AIFPVecInst<v4f32, v4i32SExt10Imm>;
+  def f32: AIFPInst<R32FP, i32ImmSExt10>;
+}
+
+defm AI : AddImmediate;
+
+// sfh/sf: "subtract from" -- like PowerPC's subf, the hardware computes
+// rT = rB - rA (second source operand minus first).  The DAG patterns for
+// the register-register forms therefore bind $rB as the minuend and $rA as
+// the subtrahend.  (The previous patterns had (sub $rA, $rB), which selects
+// "x - y" into an instruction that actually computes "y - x".)  Verify
+// against the SPU ISA "Subtract from Halfword/Word" definitions.
+def SFHvec:
+    RRForm<0b00010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "sfh\t$rT, $rA, $rB", IntegerOp,
+      [(set (v8i16 VECREG:$rT), (sub (v8i16 VECREG:$rB),
+                                     (v8i16 VECREG:$rA)))]>;
+
+def SFHr16:
+    RRForm<0b00010010000, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+      "sfh\t$rT, $rA, $rB", IntegerOp,
+      [(set R16C:$rT, (sub R16C:$rB, R16C:$rA))]>;
+
+// Immediate forms compute $val - rA, so (sub imm, rA) is already correct.
+def SFHIvec:
+    RI10Form<0b10110000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+      "sfhi\t$rT, $rA, $val", IntegerOp,
+      [(set (v8i16 VECREG:$rT), (sub v8i16SExt10Imm:$val,
+                                     (v8i16 VECREG:$rA)))]>;
+
+def SFHIr16 : RI10Form<0b10110000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+  "sfhi\t$rT, $rA, $val", IntegerOp,
+  [(set R16C:$rT, (sub i16ImmSExt10:$val, R16C:$rA))]>;
+
+def SFvec : RRForm<0b00000010000, (outs VECREG:$rT),
+                                  (ins VECREG:$rA, VECREG:$rB),
+  "sf\t$rT, $rA, $rB", IntegerOp,
+  [(set (v4i32 VECREG:$rT), (sub (v4i32 VECREG:$rB), (v4i32 VECREG:$rA)))]>;
+
+def SFr32 : RRForm<0b00000010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+  "sf\t$rT, $rA, $rB", IntegerOp,
+  [(set R32C:$rT, (sub R32C:$rB, R32C:$rA))]>;
+
+def SFIvec:
+    RI10Form<0b00110000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+      "sfi\t$rT, $rA, $val", IntegerOp,
+      [(set (v4i32 VECREG:$rT), (sub v4i32SExt10Imm:$val,
+                                     (v4i32 VECREG:$rA)))]>;
+
+def SFIr32 : RI10Form<0b00110000, (outs R32C:$rT),
+                                  (ins R32C:$rA, s10imm_i32:$val),
+  "sfi\t$rT, $rA, $val", IntegerOp,
+  [(set R32C:$rT, (sub i32ImmSExt10:$val, R32C:$rA))]>;
+
+// ADDX: only available in vector form, doesn't match a pattern.
+// Add with carry-in; $rCarry is tied to $rT (read-modify-write) and is not
+// encoded.  Used by custom lowering for wide adds.
+class ADDXInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00000010110, OOL, IOL,
+      "addx\t$rT, $rA, $rB",
+      IntegerOp, pattern>;
+
+class ADDXVecInst<ValueType vectype>:
+    ADDXInst<(outs VECREG:$rT),
+             (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry),
+             [/* no pattern */]>,
+    RegConstraint<"$rCarry = $rT">,
+    NoEncode<"$rCarry">;
+
+class ADDXRegInst<RegisterClass rclass>:
+    ADDXInst<(outs rclass:$rT),
+             (ins rclass:$rA, rclass:$rB, rclass:$rCarry),
+             [/* no pattern */]>,
+    RegConstraint<"$rCarry = $rT">,
+    NoEncode<"$rCarry">;
+
+multiclass AddExtended {
+  def v2i64 : ADDXVecInst<v2i64>;
+  def v4i32 : ADDXVecInst<v4i32>;
+  def r64 : ADDXRegInst<R64C>;
+  def r32 : ADDXRegInst<R32C>;
+}
+
+defm ADDX : AddExtended;
+
+// CG: Generate carry for add
+// Produces the carry-out bits consumed by ADDX; no ISel pattern (used by
+// custom lowering only).
+class CGInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b01000011000, OOL, IOL,
+      "cg\t$rT, $rA, $rB",
+      IntegerOp, pattern>;
+
+class CGVecInst<ValueType vectype>:
+    CGInst<(outs VECREG:$rT),
+           (ins VECREG:$rA, VECREG:$rB),
+           [/* no pattern */]>;
+
+class CGRegInst<RegisterClass rclass>:
+    CGInst<(outs rclass:$rT),
+           (ins rclass:$rA, rclass:$rB),
+           [/* no pattern */]>;
+
+multiclass CarryGenerate {
+  def v2i64 : CGVecInst<v2i64>;
+  def v4i32 : CGVecInst<v4i32>;
+  def r64 : CGRegInst<R64C>;
+  def r32 : CGRegInst<R32C>;
+}
+
+defm CG : CarryGenerate;
+
+// SFX: Subtract from, extended. This is used in conjunction with BG to
+// subtract with carry (borrow, in this case).  $rCarry is tied to $rT and
+// not encoded.
+class SFXInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10000010110, OOL, IOL,
+      "sfx\t$rT, $rA, $rB",
+      IntegerOp, pattern>;
+
+class SFXVecInst<ValueType vectype>:
+    SFXInst<(outs VECREG:$rT),
+            (ins VECREG:$rA, VECREG:$rB, VECREG:$rCarry),
+             [/* no pattern */]>,
+    RegConstraint<"$rCarry = $rT">,
+    NoEncode<"$rCarry">;
+
+class SFXRegInst<RegisterClass rclass>:
+    SFXInst<(outs rclass:$rT),
+            (ins rclass:$rA, rclass:$rB, rclass:$rCarry),
+             [/* no pattern */]>,
+    RegConstraint<"$rCarry = $rT">,
+    NoEncode<"$rCarry">;
+
+multiclass SubtractExtended {
+  def v2i64 : SFXVecInst<v2i64>;
+  def v4i32 : SFXVecInst<v4i32>;
+  def r64 : SFXRegInst<R64C>;
+  def r32 : SFXRegInst<R32C>;
+}
+
+defm SFX : SubtractExtended;
+
+// BG: only available in vector form, doesn't match a pattern.
+// Generates the borrow bits consumed by SFX (analogous to CG for ADDX).
+class BGInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b01000010000, OOL, IOL,
+      "bg\t$rT, $rA, $rB",
+      IntegerOp, pattern>;
+
+class BGVecInst<ValueType vectype>:
+    BGInst<(outs VECREG:$rT),
+           (ins VECREG:$rA, VECREG:$rB),
+           [/* no pattern */]>;
+
+class BGRegInst<RegisterClass rclass>:
+    BGInst<(outs rclass:$rT),
+           (ins rclass:$rA, rclass:$rB),
+           [/* no pattern */]>;
+
+multiclass BorrowGenerate {
+  def v4i32 : BGVecInst<v4i32>;
+  def v2i64 : BGVecInst<v2i64>;
+  def r64 : BGRegInst<R64C>;
+  def r32 : BGRegInst<R32C>;
+}
+
+defm BG : BorrowGenerate;
+
+// BGX: Borrow generate, extended.  $rCarry tied to $rT, not encoded.
+def BGXvec:
+    RRForm<0b11000010110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB,
+                                VECREG:$rCarry),
+      "bgx\t$rT, $rA, $rB", IntegerOp,
+      []>,
+    RegConstraint<"$rCarry = $rT">,
+    NoEncode<"$rCarry">;
+
+// Halfword multiply variants:
+// N.B: These can be used to build up larger quantities (16x16 -> 32)
+
+// Vector form has no pattern (custom lowering composes it).
+def MPYv8i16:
+  RRForm<0b00100011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+    "mpy\t$rT, $rA, $rB", IntegerMulDiv,
+    [/* no pattern */]>;
+
+def MPYr16:
+  RRForm<0b00100011110, (outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+    "mpy\t$rT, $rA, $rB", IntegerMulDiv,
+    [(set R16C:$rT, (mul R16C:$rA, R16C:$rB))]>;
+
+// Unsigned 16-bit multiply:
+
+class MPYUInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00110011110, OOL, IOL,
+      "mpyu\t$rT, $rA, $rB", IntegerMulDiv,
+      pattern>;
+
+def MPYUv4i32:
+  MPYUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+           [/* no pattern */]>;
+
+// 16x16 -> 32 unsigned multiply via zext of both operands.
+def MPYUr16:
+  MPYUInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB),
+           [(set R32C:$rT, (mul (zext R16C:$rA), (zext R16C:$rB)))]>;
+
+def MPYUr32:
+  MPYUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+           [/* no pattern */]>;
+
+// mpyi: multiply 16 x s10imm -> 32 result.
+
+class MPYIInst<dag OOL, dag IOL, list<dag> pattern>:
+  RI10Form<0b00101110, OOL, IOL,
+    "mpyi\t$rT, $rA, $val", IntegerMulDiv,
+    pattern>;
+
+def MPYIvec:
+  MPYIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+           [(set (v8i16 VECREG:$rT),
+                 (mul (v8i16 VECREG:$rA), v8i16SExt10Imm:$val))]>;
+
+def MPYIr16:
+  MPYIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+           [(set R16C:$rT, (mul R16C:$rA, i16ImmSExt10:$val))]>;
+
+// mpyui: same issues as other multiplies, plus, this doesn't match a
+// pattern... but may be used during target DAG selection or lowering
+
+class MPYUIInst<dag OOL, dag IOL, list<dag> pattern>:
+  RI10Form<0b10101110, OOL, IOL,
+           "mpyui\t$rT, $rA, $val", IntegerMulDiv,
+           pattern>;
+
+def MPYUIvec:
+  MPYUIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+            []>;
+
+def MPYUIr16:
+  MPYUIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+            []>;
+
+// mpya: 16 x 16 + 16 -> 32 bit result
+// Multiply-accumulate; the three defs below cover the DAG shapes that a
+// 16x16+32 computation can take after legalization.
+class MPYAInst<dag OOL, dag IOL, list<dag> pattern>:
+  RRRForm<0b0011, OOL, IOL,
+          "mpya\t$rT, $rA, $rB, $rC", IntegerMulDiv,
+          pattern>;
+
+def MPYAv4i32:
+  MPYAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+           [(set (v4i32 VECREG:$rT),
+                 (add (v4i32 (bitconvert (mul (v8i16 VECREG:$rA),
+                                              (v8i16 VECREG:$rB)))),
+                      (v4i32 VECREG:$rC)))]>;
+
+def MPYAr32:
+  MPYAInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB, R32C:$rC),
+           [(set R32C:$rT, (add (sext (mul R16C:$rA, R16C:$rB)),
+                                R32C:$rC))]>;
+
+def MPYAr32_sext:
+  MPYAInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB, R32C:$rC),
+           [(set R32C:$rT, (add (mul (sext R16C:$rA), (sext R16C:$rB)),
+                                R32C:$rC))]>;
+
+def MPYAr32_sextinreg:
+  MPYAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB, R32C:$rC),
+           [(set R32C:$rT, (add (mul (sext_inreg R32C:$rA, i16),
+                                     (sext_inreg R32C:$rB, i16)),
+                                R32C:$rC))]>;
+
+// mpyh: multiply high, used to synthesize 32-bit multiplies
+class MPYHInst<dag OOL, dag IOL, list<dag> pattern>:
+  RRForm<0b10100011110, OOL, IOL,
+         "mpyh\t$rT, $rA, $rB", IntegerMulDiv,
+         pattern>;
+
+def MPYHv4i32:
+    MPYHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [/* no pattern */]>;
+
+def MPYHr32:
+    MPYHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+             [/* no pattern */]>;
+
+// mpys: multiply high and shift right (returns the top half of
+// a 16-bit multiply, sign extended to 32 bits.)
+
+class MPYSInst<dag OOL, dag IOL>:
+    RRForm<0b11100011110, OOL, IOL,
+      "mpys\t$rT, $rA, $rB", IntegerMulDiv,
+      [/* no pattern */]>;
+
+def MPYSv4i32:
+    MPYSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYSr16:
+    MPYSInst<(outs R32C:$rT), (ins R16C:$rA, R16C:$rB)>;
+
+// mpyhh: multiply high-high (returns the 32-bit result from multiplying
+// the top 16 bits of the $rA, $rB)
+
+class MPYHHInst<dag OOL, dag IOL>:
+  RRForm<0b01100011110, OOL, IOL,
+        "mpyhh\t$rT, $rA, $rB", IntegerMulDiv,
+        [/* no pattern */]>;
+
+def MPYHHv8i16:
+    MPYHHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHr32:
+    MPYHHInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+// mpyhha: Multiply high-high, add to $rT:
+
+class MPYHHAInst<dag OOL, dag IOL>:
+    RRForm<0b01100010110, OOL, IOL,
+      "mpyhha\t$rT, $rA, $rB", IntegerMulDiv,
+      [/* no pattern */]>;
+
+def MPYHHAvec:
+    MPYHHAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHAr32:
+    MPYHHAInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+// mpyhhu: Multiply high-high, unsigned, e.g.:
+//
+// +-------+-------+   +-------+-------+   +---------+
+// |  a0   .  a1   | x |  b0   .  b1   | = | a0 x b0 |
+// +-------+-------+   +-------+-------+   +---------+
+//
+// where a0, b0 are the upper 16 bits of the 32-bit word
+
+class MPYHHUInst<dag OOL, dag IOL>:
+    RRForm<0b01110011110, OOL, IOL,
+      "mpyhhu\t$rT, $rA, $rB", IntegerMulDiv,
+      [/* no pattern */]>;
+
+def MPYHHUv4i32:
+    MPYHHUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHUr32:
+    MPYHHUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+// mpyhhau: Multiply high-high, unsigned, accumulating into $rT.
+
+class MPYHHAUInst<dag OOL, dag IOL>:
+    RRForm<0b01110010110, OOL, IOL,
+      "mpyhhau\t$rT, $rA, $rB", IntegerMulDiv,
+      [/* no pattern */]>;
+
+def MPYHHAUvec:
+    MPYHHAUInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB)>;
+
+def MPYHHAUr32:
+    MPYHHAUInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// clz: Count leading zeroes
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// clz: count leading zeroes, mapped to the generic ctlz node.
+class CLZInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b10100101010, OOL, IOL, "clz\t$rT, $rA",
+             IntegerOp, pattern>;
+
+class CLZRegInst<RegisterClass rclass>:
+    CLZInst<(outs rclass:$rT), (ins rclass:$rA),
+            [(set rclass:$rT, (ctlz rclass:$rA))]>;
+
+class CLZVecInst<ValueType vectype>:
+    CLZInst<(outs VECREG:$rT), (ins VECREG:$rA),
+            [(set (vectype VECREG:$rT), (ctlz (vectype VECREG:$rA)))]>;
+
+multiclass CountLeadingZeroes {
+  def v4i32 : CLZVecInst<v4i32>;
+  def r32   : CLZRegInst<R32C>;
+}
+
+defm CLZ : CountLeadingZeroes;
+
+// cntb: Count ones in bytes (aka "population count")
+//
+// NOTE: This instruction is really a vector instruction, but the custom
+// lowering code uses it in unorthodox ways to support CTPOP for other
+// data types!
+
+// Three type-casts of the same instruction/encoding, matched via the
+// target-specific SPUcntb node.
+def CNTBv16i8:
+    RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA),
+      "cntb\t$rT, $rA", IntegerOp,
+      [(set (v16i8 VECREG:$rT), (SPUcntb (v16i8 VECREG:$rA)))]>;
+
+def CNTBv8i16 :
+    RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA),
+      "cntb\t$rT, $rA", IntegerOp,
+      [(set (v8i16 VECREG:$rT), (SPUcntb (v8i16 VECREG:$rA)))]>;
+
+def CNTBv4i32 :
+    RRForm_1<0b00101101010, (outs VECREG:$rT), (ins VECREG:$rA),
+      "cntb\t$rT, $rA", IntegerOp,
+      [(set (v4i32 VECREG:$rT), (SPUcntb (v4i32 VECREG:$rA)))]>;
+
+// gbb: Gather the low order bits from each byte in $rA into a single 16-bit
+// quantity stored into $rT's slot 0, upper 16 bits are zeroed, as are
+// slots 1-3.
+//
+// Note: This instruction "pairs" with the fsmb instruction for all of the
+// various types defined here.
+//
+// Note 2: The "VecInst" and "RegInst" forms refer to the result being either
+// a vector or register.
+
+// gbb: gather the low-order bit of each byte into a 16-bit quantity in
+// slot 0 (see the block comment above).  No ISel patterns; used by custom
+// lowering.
+class GBBInst<dag OOL, dag IOL, list<dag> pattern>:
+  RRForm_1<0b01001101100, OOL, IOL, "gbb\t$rT, $rA", GatherOp, pattern>;
+
+class GBBRegInst<RegisterClass rclass, ValueType vectype>:
+  GBBInst<(outs rclass:$rT), (ins VECREG:$rA),
+          [/* no pattern */]>;
+
+class GBBVecInst<ValueType vectype>:
+  GBBInst<(outs VECREG:$rT), (ins VECREG:$rA),
+          [/* no pattern */]>;
+
+multiclass GatherBitsFromBytes {
+  def v16i8_r32: GBBRegInst<R32C, v16i8>;
+  def v16i8_r16: GBBRegInst<R16C, v16i8>;
+  def v16i8:     GBBVecInst<v16i8>;
+}
+
+defm GBB: GatherBitsFromBytes;
+
+// gbh: Gather all low order bits from each halfword in $rA into a single
+// 8-bit quantity stored in $rT's slot 0, with the upper bits of $rT set to 0
+// and slots 1-3 also set to 0.
+//
+// See notes for GBBInst, above.
+
+// gbh: gather the low-order bit of each halfword into an 8-bit quantity in
+// slot 0 (see the block comment above).  No ISel patterns.
+class GBHInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b10001101100, OOL, IOL, "gbh\t$rT, $rA", GatherOp,
+             pattern>;
+
+class GBHRegInst<RegisterClass rclass, ValueType vectype>:
+    GBHInst<(outs rclass:$rT), (ins VECREG:$rA),
+            [/* no pattern */]>;
+
+class GBHVecInst<ValueType vectype>:
+    GBHInst<(outs VECREG:$rT), (ins VECREG:$rA),
+            [/* no pattern */]>;
+
+multiclass GatherBitsHalfword {
+  def v8i16_r32: GBHRegInst<R32C, v8i16>;
+  def v8i16_r16: GBHRegInst<R16C, v8i16>;
+  def v8i16:     GBHVecInst<v8i16>;
+}
+
+defm GBH: GatherBitsHalfword;
+
+// gb: Gather all low order bits from each word in $rA into a single
+// 4-bit quantity stored in $rT's slot 0, upper bits in $rT set to 0,
+// as well as slots 1-3.
+//
+// See notes for gbb, above.
+
+// gb: gather the low-order bit of each word into a 4-bit quantity in
+// slot 0 (see the block comment above).  No ISel patterns.
+class GBInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b00001101100, OOL, IOL, "gb\t$rT, $rA", GatherOp,
+             pattern>;
+
+class GBRegInst<RegisterClass rclass, ValueType vectype>:
+    GBInst<(outs rclass:$rT), (ins VECREG:$rA),
+           [/* no pattern */]>;
+
+class GBVecInst<ValueType vectype>:
+    GBInst<(outs VECREG:$rT), (ins VECREG:$rA),
+           [/* no pattern */]>;
+
+multiclass GatherBitsWord {
+  def v4i32_r32: GBRegInst<R32C, v4i32>;
+  def v4i32_r16: GBRegInst<R16C, v4i32>;
+  def v4i32:     GBVecInst<v4i32>;
+}
+
+defm GB: GatherBitsWord;
+
+// avgb: average bytes.  No ISel pattern; selectable via intrinsics/custom
+// lowering only.
+def AVGB:
+    RRForm<0b11001011000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "avgb\t$rT, $rA, $rB", ByteOp,
+      []>;
+
+// absdb: absolute difference of bytes
+def ABSDB:
+    RRForm<0b11001010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "absdb\t$rT, $rA, $rB", ByteOp,
+      []>;
+
+// sumb: sum bytes into halfwords
+def SUMB:
+    RRForm<0b11001010010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "sumb\t$rT, $rA, $rB", ByteOp,
+      []>;
+
+// Sign extension operations:
+// xsbh: extend sign of byte to halfword.
+// NOTE(review): this opcode (0b01101101010) is identical to the one used by
+// XSHWInst later in this file; at most one of them can be the correct
+// encoding -- verify against the SPU ISA.
+class XSBHInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b01101101010, OOL, IOL,
+      "xsbh\t$rDst, $rSrc",
+      IntegerOp, pattern>;
+
+// Vector form: result is always v8i16; vectype is the source vector type.
+class XSBHVecInst<ValueType vectype>:
+    XSBHInst<(outs VECREG:$rDst), (ins VECREG:$rSrc),
+      [(set (v8i16 VECREG:$rDst), (sext (vectype VECREG:$rSrc)))]>;
+
+// In-register form (sext_inreg within the same register class).
+class XSBHInRegInst<RegisterClass rclass, list<dag> pattern>:
+    XSBHInst<(outs rclass:$rDst), (ins rclass:$rSrc),
+             pattern>;
+
+// Expands to the XSBH* defs: sign-extend bytes to halfwords.
+multiclass ExtendByteHalfword {
+  // Vector form: xsbh's source is a byte vector, so instantiate with v16i8.
+  // (Previously this was XSBHVecInst<v8i16>, which contradicted both the
+  // def's name and the byte->halfword semantics, yielding a degenerate
+  // v8i16 -> v8i16 sext pattern.)
+  def v16i8:     XSBHVecInst<v16i8>;
+  // i8 -> i16 sign extension into a separate register.
+  def r8:        XSBHInst<(outs R16C:$rDst), (ins R8C:$rSrc),
+                          [(set R16C:$rDst, (sext R8C:$rSrc))]>;
+  // In-register i8 -> i16 sign extension.
+  def r16:       XSBHInRegInst<R16C,
+                               [(set R16C:$rDst, (sext_inreg R16C:$rSrc, i8))]>;
+
+  // 32-bit form for XSBH: used to sign extend 8-bit quantities to 16-bit
+  // quantities to 32-bit quantities via a 32-bit register (see the sext 8->32
+  // pattern below). Intentionally doesn't match a pattern because we want the
+  // sext 8->32 pattern to do the work for us, namely because we need the extra
+  // XSHWr32.
+  def r32:   XSBHInRegInst<R32C, [/* no pattern */]>;
+
+  // Same as the 32-bit version, but for i64
+  def r64:   XSBHInRegInst<R64C, [/* no pattern */]>;
+}
+
+defm XSBH : ExtendByteHalfword;
+
+// Sign extend halfwords to words:
+
+class XSHWInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b01101101010, OOL, IOL, "xshw\t$rDest, $rSrc",
+            IntegerOp, pattern>;
+
+// Vector form: out_vectype types the destination, in_vectype the sext source.
+class XSHWVecInst<ValueType in_vectype, ValueType out_vectype>:
+    XSHWInst<(outs VECREG:$rDest), (ins VECREG:$rSrc),
+             [(set (out_vectype VECREG:$rDest),
+                   (sext (in_vectype VECREG:$rSrc)))]>;
+
+// In-register form: caller supplies the (typically sext_inreg) pattern.
+class XSHWInRegInst<RegisterClass rclass, list<dag> pattern>:
+    XSHWInst<(outs rclass:$rDest), (ins rclass:$rSrc),
+             pattern>;
+             
+// Plain register form: sext an R16C source into the given register class.
+class XSHWRegInst<RegisterClass rclass>:
+    XSHWInst<(outs rclass:$rDest), (ins R16C:$rSrc),
+             [(set rclass:$rDest, (sext R16C:$rSrc))]>;
+
+multiclass ExtendHalfwordWord {
+  // NOTE(review): the arguments here give in_vectype=v4i32, out_vectype=v8i16,
+  // i.e. (set (v8i16 ...), (sext (v4i32 ...))). Since xshw extends halfwords
+  // to words, the intended order may be <v8i16, v4i32> — confirm.
+  def v4i32: XSHWVecInst<v4i32, v8i16>;
+  
+  def r16:   XSHWRegInst<R32C>;
+  
+  def r32:   XSHWInRegInst<R32C,
+                          [(set R32C:$rDest, (sext_inreg R32C:$rSrc, i16))]>;
+  // No pattern: used as a building block by larger sext expansions.
+  def r64:   XSHWInRegInst<R64C, [/* no pattern */]>;
+}
+
+defm XSHW : ExtendHalfwordWord;
+
+// Sign-extend words to doublewords (32->64 bits)
+
+class XSWDInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm_1<0b01100101010, OOL, IOL, "xswd\t$rDst, $rSrc",
+              IntegerOp, pattern>;
+      
+// Vector form.
+// NOTE(review): in_vectype is never used — the sext source is typed with
+// out_vectype, so the pattern reads (sext (out_vectype ...)). Presumably the
+// source was meant to be (in_vectype VECREG:$rSrc); confirm before changing,
+// since altering the dag types affects pattern matching.
+class XSWDVecInst<ValueType in_vectype, ValueType out_vectype>:
+    XSWDInst<(outs VECREG:$rDst), (ins VECREG:$rSrc),
+             [(set (out_vectype VECREG:$rDst),
+                   (sext (out_vectype VECREG:$rSrc)))]>;
+      
+// Scalar form: sext from in_rclass into out_rclass.
+class XSWDRegInst<RegisterClass in_rclass, RegisterClass out_rclass>:
+    XSWDInst<(outs out_rclass:$rDst), (ins in_rclass:$rSrc),
+             [(set out_rclass:$rDst, (sext in_rclass:$rSrc))]>;
+             
+multiclass ExtendWordToDoubleWord {
+  def v2i64: XSWDVecInst<v4i32, v2i64>;
+  def r64:   XSWDRegInst<R32C, R64C>;
+  
+  // In-register variant: sign-extend the low 32 bits of an R64C in place.
+  def r64_inreg: XSWDInst<(outs R64C:$rDst), (ins R64C:$rSrc),
+                          [(set R64C:$rDst, (sext_inreg R64C:$rSrc, i32))]>;
+}
+
+defm XSWD : ExtendWordToDoubleWord;
+
+// AND operations
+
+class ANDInst<dag OOL, dag IOL, list<dag> pattern> :
+    RRForm<0b10000011000, OOL, IOL, "and\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+// Vector AND, element type given by vectype.
+class ANDVecInst<ValueType vectype>:
+    ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [(set (vectype VECREG:$rT), (and (vectype VECREG:$rA),
+                                              (vectype VECREG:$rB)))]>;
+
+// Scalar AND within a single register class.
+class ANDRegInst<RegisterClass rclass>:
+    ANDInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+             [(set rclass:$rT, (and rclass:$rA, rclass:$rB))]>;
+
+multiclass BitwiseAnd
+{
+  def v16i8: ANDVecInst<v16i8>;
+  def v8i16: ANDVecInst<v8i16>;
+  def v4i32: ANDVecInst<v4i32>;
+  def v2i64: ANDVecInst<v2i64>;
+
+  def r128:  ANDRegInst<GPRC>;
+  def r64:   ANDRegInst<R64C>;
+  def r32:   ANDRegInst<R32C>;
+  def r16:   ANDRegInst<R16C>;
+  def r8:    ANDRegInst<R8C>;
+
+  //===---------------------------------------------
+  // Special instructions to perform the fabs instruction
+  // (FP value ANDed with an integer sign-bit mask; emitted manually, hence
+  // no selection patterns).
+  def fabs32: ANDInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
+                      [/* Intentionally does not match a pattern */]>;
+
+  def fabs64: ANDInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB),
+                      [/* Intentionally does not match a pattern */]>;
+
+  def fabsvec: ANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                       [/* Intentionally does not match a pattern */]>;
+
+  //===---------------------------------------------
+
+  // Hacked form of AND to zero-extend 16-bit quantities to 32-bit
+  // quantities -- see 16->32 zext pattern.
+  //
+  // This pattern is somewhat artificial, since it might match some
+  // compiler generated pattern but it is unlikely to do so.
+
+  def i16i32: ANDInst<(outs R32C:$rT), (ins R16C:$rA, R32C:$rB),
+                      [(set R32C:$rT, (and (zext R16C:$rA), R32C:$rB))]>;
+}
+
+defm AND : BitwiseAnd;
+
+// N.B.: vnot_conv is one of those special target selection pattern fragments,
+// in which we expect there to be a bit_convert on the constant. Bear in mind
+// that llvm translates "not <reg>" to "xor <reg>, -1" (or in this case, a
+// constant -1 vector.)
+
+// andc: rT = rA & ~rB
+class ANDCInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10000011010, OOL, IOL, "andc\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+// Vector form; vnot_frag defaults to vnot but can be vnot_conv (see above).
+class ANDCVecInst<ValueType vectype, PatFrag vnot_frag = vnot>:
+    ANDCInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [(set (vectype VECREG:$rT),
+                   (and (vectype VECREG:$rA),
+                        (vnot_frag (vectype VECREG:$rB))))]>;
+
+class ANDCRegInst<RegisterClass rclass>:
+    ANDCInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+             [(set rclass:$rT, (and rclass:$rA, (not rclass:$rB)))]>;
+
+multiclass AndComplement
+{
+  def v16i8: ANDCVecInst<v16i8>;
+  def v8i16: ANDCVecInst<v8i16>;
+  def v4i32: ANDCVecInst<v4i32>;
+  def v2i64: ANDCVecInst<v2i64>;
+
+  def r128: ANDCRegInst<GPRC>;
+  def r64:  ANDCRegInst<R64C>;
+  def r32:  ANDCRegInst<R32C>;
+  def r16:  ANDCRegInst<R16C>;
+  def r8:   ANDCRegInst<R8C>;
+
+  // Sometimes, the xor pattern has a bitcast constant:
+  def v16i8_conv: ANDCVecInst<v16i8, vnot_conv>;
+}
+
+defm ANDC : AndComplement;
+
+// andbi: AND with a byte immediate (replicated into each byte slot).
+class ANDBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI10Form<0b01101000, OOL, IOL, "andbi\t$rT, $rA, $val",
+             ByteOp, pattern>;
+
+multiclass AndByteImm
+{
+  def v16i8: ANDBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+                       [(set (v16i8 VECREG:$rT),
+                             (and (v16i8 VECREG:$rA),
+                                  (v16i8 v16i8U8Imm:$val)))]>;
+
+  def r8: ANDBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val),
+                    [(set R8C:$rT, (and R8C:$rA, immU8:$val))]>;
+}
+
+defm ANDBI : AndByteImm;
+
+// andhi: AND with a halfword immediate.
+class ANDHIInst<dag OOL, dag IOL, list<dag> pattern> :
+    RI10Form<0b10101000, OOL, IOL, "andhi\t$rT, $rA, $val",
+             ByteOp, pattern>;
+
+multiclass AndHalfwordImm
+{
+  def v8i16: ANDHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                       [(set (v8i16 VECREG:$rT),
+                             (and (v8i16 VECREG:$rA), v8i16SExt10Imm:$val))]>;
+
+  def r16: ANDHIInst<(outs R16C:$rT), (ins R16C:$rA, u10imm:$val),
+                     [(set R16C:$rT, (and R16C:$rA, i16ImmUns10:$val))]>;
+
+  // Zero-extend i8 to i16:
+  def i8i16: ANDHIInst<(outs R16C:$rT), (ins R8C:$rA, u10imm:$val),
+                      [(set R16C:$rT, (and (zext R8C:$rA), i16ImmUns10:$val))]>;
+}
+
+defm ANDHI : AndHalfwordImm;
+
+// andi: AND with a (sign-extended) 10-bit word immediate.
+class ANDIInst<dag OOL, dag IOL, list<dag> pattern> :
+    RI10Form<0b00101000, OOL, IOL, "andi\t$rT, $rA, $val",
+             IntegerOp, pattern>;
+
+multiclass AndWordImm
+{
+  def v4i32: ANDIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                      [(set (v4i32 VECREG:$rT),
+                            (and (v4i32 VECREG:$rA), v4i32SExt10Imm:$val))]>;
+
+  def r32: ANDIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+                    [(set R32C:$rT, (and R32C:$rA, i32ImmSExt10:$val))]>;
+
+  // Hacked form of ANDI to zero-extend i8 quantities to i32. See the zext 8->32
+  // pattern below.
+  def i8i32: ANDIInst<(outs R32C:$rT), (ins R8C:$rA, s10imm_i32:$val),
+                      [(set R32C:$rT,
+                            (and (zext R8C:$rA), i32ImmSExt10:$val))]>;
+
+  // Hacked form of ANDI to zero-extend i16 quantities to i32. See the
+  // zext 16->32 pattern below.
+  //
+  // Note that this pattern is somewhat artificial, since it might match
+  // something the compiler generates but is unlikely to occur in practice.
+  def i16i32: ANDIInst<(outs R32C:$rT), (ins R16C:$rA, s10imm_i32:$val),
+                       [(set R32C:$rT,
+                             (and (zext R16C:$rA), i32ImmSExt10:$val))]>;
+}
+
+defm ANDI : AndWordImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Bitwise OR group:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// Bitwise "or" (N.B.: These are also register-register copy instructions...)
+class ORInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10000010000, OOL, IOL, "or\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+class ORVecInst<ValueType vectype>:
+    ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+           [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+                                           (vectype VECREG:$rB)))]>;
+
+class ORRegInst<RegisterClass rclass>:
+    ORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+           [(set rclass:$rT, (or rclass:$rA, rclass:$rB))]>;
+
+// ORCvtForm: OR conversion form
+//
+// This is used to "convert" the preferred slot to its vector equivalent, as
+// well as convert a vector back to its preferred slot.
+//
+// These are effectively no-ops, but need to exist for proper type conversion
+// and type coercion.
+
+// Encodes "or $rT, $rA, $rA" (both source fields carry RA), i.e. a copy
+// whose only purpose is to change the operand's register class/type.
+class ORCvtForm<dag OOL, dag IOL, list<dag> pattern = [/* no pattern */]>
+          : SPUInstr<OOL, IOL, "or\t$rT, $rA, $rA", IntegerOp> {
+  bits<7> RA;
+  bits<7> RT;
+
+  let Pattern = pattern;
+
+  let Inst{0-10} = 0b10000010000;
+  let Inst{11-17} = RA;
+  let Inst{18-24} = RA;
+  let Inst{25-31} = RT;
+}
+
+// Scalar (preferred slot) -> vector.
+class ORPromoteScalar<RegisterClass rclass>:
+    ORCvtForm<(outs VECREG:$rT), (ins rclass:$rA)>;
+
+// Vector -> scalar (extract preferred slot).
+class ORExtractElt<RegisterClass rclass>:
+    ORCvtForm<(outs rclass:$rT), (ins VECREG:$rA)>;
+
+/* class ORCvtRegGPRC<RegisterClass rclass>:
+    ORCvtForm<(outs GPRC:$rT), (ins rclass:$rA)>; */
+
+/* class ORCvtGPRCReg<RegisterClass rclass>:
+    ORCvtForm<(outs rclass:$rT), (ins GPRC:$rA)>; */
+    
+class ORCvtFormR32Reg<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs rclass:$rT), (ins R32C:$rA), pattern>;
+    
+class ORCvtFormRegR32<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs R32C:$rT), (ins rclass:$rA), pattern>;
+
+class ORCvtFormR64Reg<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs rclass:$rT), (ins R64C:$rA), pattern>;
+    
+class ORCvtFormRegR64<RegisterClass rclass, list<dag> pattern = [ ]>:
+    ORCvtForm<(outs R64C:$rT), (ins rclass:$rA), pattern>;
+
+class ORCvtGPRCVec:
+    ORCvtForm<(outs VECREG:$rT), (ins GPRC:$rA)>;
+
+class ORCvtVecGPRC:
+    ORCvtForm<(outs GPRC:$rT), (ins VECREG:$rA)>;
+
+multiclass BitwiseOr
+{
+  def v16i8: ORVecInst<v16i8>;
+  def v8i16: ORVecInst<v8i16>;
+  def v4i32: ORVecInst<v4i32>;
+  def v2i64: ORVecInst<v2i64>;
+
+  // FP vector forms go through a bitconvert of the integer OR.
+  def v4f32: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                    [(set (v4f32 VECREG:$rT),
+                          (v4f32 (bitconvert (or (v4i32 VECREG:$rA),
+                                                 (v4i32 VECREG:$rB)))))]>;
+
+  def v2f64: ORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                    [(set (v2f64 VECREG:$rT),
+                          (v2f64 (bitconvert (or (v2i64 VECREG:$rA),
+                                                 (v2i64 VECREG:$rB)))))]>;
+
+  def r128: ORRegInst<GPRC>;
+  def r64:  ORRegInst<R64C>;
+  def r32:  ORRegInst<R32C>;
+  def r16:  ORRegInst<R16C>;
+  def r8:   ORRegInst<R8C>;
+
+  // OR instructions used to copy f32 and f64 registers.
+  def f32: ORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+                  [/* no pattern */]>;
+
+  def f64: ORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+                  [/* no pattern */]>;
+
+  // scalar->vector promotion, prefslot2vec:
+  def v16i8_i8:  ORPromoteScalar<R8C>;
+  def v8i16_i16: ORPromoteScalar<R16C>;
+  def v4i32_i32: ORPromoteScalar<R32C>;
+  def v2i64_i64: ORPromoteScalar<R64C>;
+  def v4f32_f32: ORPromoteScalar<R32FP>;
+  def v2f64_f64: ORPromoteScalar<R64FP>;
+
+  // vector->scalar demotion, vec2prefslot:
+  def i8_v16i8:  ORExtractElt<R8C>;
+  def i16_v8i16: ORExtractElt<R16C>;
+  def i32_v4i32: ORExtractElt<R32C>;
+  def i64_v2i64: ORExtractElt<R64C>;
+  def f32_v4f32: ORExtractElt<R32FP>;
+  def f64_v2f64: ORExtractElt<R64FP>;
+
+  // Conversion from vector to GPRC
+  def i128_vec:  ORCvtVecGPRC;
+
+  // Conversion from GPRC to vector
+  def vec_i128:  ORCvtGPRCVec;
+
+/*
+  // Conversion from register to GPRC
+  def i128_r64:  ORCvtRegGPRC<R64C>;
+  def i128_f64:  ORCvtRegGPRC<R64FP>;
+  def i128_r32:  ORCvtRegGPRC<R32C>;
+  def i128_f32:  ORCvtRegGPRC<R32FP>;
+  def i128_r16:  ORCvtRegGPRC<R16C>;
+  def i128_r8:   ORCvtRegGPRC<R8C>;
+
+  // Conversion from GPRC to register
+  def r64_i128:  ORCvtGPRCReg<R64C>;
+  def f64_i128:  ORCvtGPRCReg<R64FP>;
+  def r32_i128:  ORCvtGPRCReg<R32C>;
+  def f32_i128:  ORCvtGPRCReg<R32FP>;
+  def r16_i128:  ORCvtGPRCReg<R16C>;
+  def r8_i128:   ORCvtGPRCReg<R8C>;
+*/
+/*
+  // Conversion from register to R32C:
+  def r32_r16:   ORCvtFormRegR32<R16C>;
+  def r32_r8:    ORCvtFormRegR32<R8C>;
+  
+  // Conversion from R32C to register
+  def r32_r16:   ORCvtFormR32Reg<R16C>;
+  def r32_r8:    ORCvtFormR32Reg<R8C>;
+*/
+  
+  // Conversion from R64C to register:
+  def r32_r64:   ORCvtFormR64Reg<R32C>;
+  // def r16_r64:   ORCvtFormR64Reg<R16C>;
+  // def r8_r64:    ORCvtFormR64Reg<R8C>;
+  
+  // Conversion to R64C from register:
+  def r64_r32:   ORCvtFormRegR64<R32C>;
+  // def r64_r16:   ORCvtFormRegR64<R16C>;
+  // def r64_r8:    ORCvtFormRegR64<R8C>;
+
+  // bitconvert patterns: same-width int<->FP reinterpretation via the OR copy.
+  def r32_f32:   ORCvtFormR32Reg<R32FP,
+                                 [(set R32FP:$rT, (bitconvert R32C:$rA))]>;
+  def f32_r32:   ORCvtFormRegR32<R32FP,
+                                 [(set R32C:$rT, (bitconvert R32FP:$rA))]>;
+
+  def r64_f64:   ORCvtFormR64Reg<R64FP,
+                                 [(set R64FP:$rT, (bitconvert R64C:$rA))]>;
+  def f64_r64:   ORCvtFormRegR64<R64FP,
+                                 [(set R64C:$rT, (bitconvert R64FP:$rA))]>;
+}
+
+defm OR : BitwiseOr;
+
+// scalar->vector promotion patterns (preferred slot to vector):
+// Each SPUprefslot2vec node selects to the corresponding ORPromoteScalar def.
+def : Pat<(v16i8 (SPUprefslot2vec R8C:$rA)),
+          (ORv16i8_i8 R8C:$rA)>;
+
+def : Pat<(v8i16 (SPUprefslot2vec R16C:$rA)),
+          (ORv8i16_i16 R16C:$rA)>;
+
+def : Pat<(v4i32 (SPUprefslot2vec R32C:$rA)),
+          (ORv4i32_i32 R32C:$rA)>;
+
+def : Pat<(v2i64 (SPUprefslot2vec R64C:$rA)),
+          (ORv2i64_i64 R64C:$rA)>;
+
+def : Pat<(v4f32 (SPUprefslot2vec R32FP:$rA)),
+          (ORv4f32_f32 R32FP:$rA)>;
+
+def : Pat<(v2f64 (SPUprefslot2vec R64FP:$rA)),
+          (ORv2f64_f64 R64FP:$rA)>;
+
+// ORi*_v*: Used to extract vector element 0 (the preferred slot), otherwise
+// known as converting the vector back to its preferred slot
+
+def : Pat<(SPUvec2prefslot (v16i8 VECREG:$rA)),
+          (ORi8_v16i8 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v8i16 VECREG:$rA)),
+          (ORi16_v8i16 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v4i32 VECREG:$rA)),
+          (ORi32_v4i32 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v2i64 VECREG:$rA)),
+          (ORi64_v2i64 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v4f32 VECREG:$rA)),
+          (ORf32_v4f32 VECREG:$rA)>;
+
+def : Pat<(SPUvec2prefslot (v2f64 VECREG:$rA)),
+          (ORf64_v2f64 VECREG:$rA)>;
+
+// Load Register: This is an assembler alias for a bitwise OR of a register
+// against itself. It's here because it brings some clarity to assembly
+// language output.
+
+let hasCtrlDep = 1 in {
+    // LRInst: "lr $rT, $rA", encoded exactly as "or $rT, $rA, $rA"
+    // (both source register fields carry RA). No selection pattern; emitted
+    // explicitly by the backend.
+    class LRInst<dag OOL, dag IOL>
+              : SPUInstr<OOL, IOL, "lr\t$rT, $rA", IntegerOp> {
+      bits<7> RA;
+      bits<7> RT;
+
+      let Pattern = [/*no pattern*/];
+
+      let Inst{0-10} = 0b10000010000;   /* It's an OR operation */
+      let Inst{11-17} = RA;
+      let Inst{18-24} = RA;
+      let Inst{25-31} = RT;
+    }
+
+    class LRVecInst<ValueType vectype>:
+        LRInst<(outs VECREG:$rT), (ins VECREG:$rA)>;
+
+    class LRRegInst<RegisterClass rclass>:
+        LRInst<(outs rclass:$rT), (ins rclass:$rA)>;
+
+    // One LR def per register class / vector type the backend copies.
+    multiclass LoadRegister {
+      def v2i64: LRVecInst<v2i64>;
+      def v2f64: LRVecInst<v2f64>;
+      def v4i32: LRVecInst<v4i32>;
+      def v4f32: LRVecInst<v4f32>;
+      def v8i16: LRVecInst<v8i16>;
+      def v16i8: LRVecInst<v16i8>;
+
+      def r128:  LRRegInst<GPRC>;
+      def r64:   LRRegInst<R64C>;
+      def f64:   LRRegInst<R64FP>;
+      def r32:   LRRegInst<R32C>;
+      def f32:   LRRegInst<R32FP>;
+      def r16:   LRRegInst<R16C>;
+      def r8:    LRRegInst<R8C>;
+    }
+
+    defm LR: LoadRegister;
+}
+
+// ORC: Bitwise "or" with complement (c = a | ~b)
+
+class ORCInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10010010000, OOL, IOL, "orc\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+class ORCVecInst<ValueType vectype>:
+    ORCInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+            [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+                                            (vnot (vectype VECREG:$rB))))]>;
+
+class ORCRegInst<RegisterClass rclass>:
+  ORCInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+          [(set rclass:$rT, (or rclass:$rA, (not rclass:$rB)))]>;
+
+multiclass BitwiseOrComplement
+{
+  def v16i8: ORCVecInst<v16i8>;
+  def v8i16: ORCVecInst<v8i16>;
+  def v4i32: ORCVecInst<v4i32>;
+  def v2i64: ORCVecInst<v2i64>;
+
+  def r128:  ORCRegInst<GPRC>;
+  def r64:   ORCRegInst<R64C>;
+  def r32:   ORCRegInst<R32C>;
+  def r16:   ORCRegInst<R16C>;
+  def r8:    ORCRegInst<R8C>;
+}
+
+defm ORC : BitwiseOrComplement;
+
+// OR byte immediate
+class ORBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI10Form<0b01100000, OOL, IOL, "orbi\t$rT, $rA, $val",
+             IntegerOp, pattern>;
+
+// Vector form: OR each element of $rA with the immediate. The result operand
+// is typed with the vectype parameter (it was previously hard-coded to v16i8,
+// which only worked because the sole instantiation is v16i8; this matches the
+// parameterization style of ORHIVecInst/ORIVecInst below).
+class ORBIVecInst<ValueType vectype, PatLeaf immpred>:
+    ORBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+             [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+                                             (vectype immpred:$val)))]>;
+
+multiclass BitwiseOrByteImm
+{
+  def v16i8: ORBIVecInst<v16i8, v16i8U8Imm>;
+
+  // Scalar byte form: immediate constrained to an unsigned 8-bit value.
+  def r8: ORBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val),
+                   [(set R8C:$rT, (or R8C:$rA, immU8:$val))]>;
+}
+
+defm ORBI : BitwiseOrByteImm;
+
+// OR halfword immediate
+class ORHIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI10Form<0b10100000, OOL, IOL, "orhi\t$rT, $rA, $val",
+             IntegerOp, pattern>;
+
+class ORHIVecInst<ValueType vectype, PatLeaf immpred>:
+    ORHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+              [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+                                              immpred:$val))]>;
+
+multiclass BitwiseOrHalfwordImm
+{
+  def v8i16: ORHIVecInst<v8i16, v8i16Uns10Imm>;
+
+  def r16: ORHIInst<(outs R16C:$rT), (ins R16C:$rA, u10imm:$val),
+                    [(set R16C:$rT, (or R16C:$rA, i16ImmUns10:$val))]>;
+
+  // Specialized ORHI form used to promote 8-bit registers to 16-bit
+  // (matches anyext of an R8C source).
+  def i8i16: ORHIInst<(outs R16C:$rT), (ins R8C:$rA, s10imm:$val),
+                      [(set R16C:$rT, (or (anyext R8C:$rA),
+                                          i16ImmSExt10:$val))]>;
+}
+
+defm ORHI : BitwiseOrHalfwordImm;
+
+// OR word immediate
+class ORIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI10Form<0b00100000, OOL, IOL, "ori\t$rT, $rA, $val",
+             IntegerOp, pattern>;
+
+class ORIVecInst<ValueType vectype, PatLeaf immpred>:
+    ORIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+            [(set (vectype VECREG:$rT), (or (vectype VECREG:$rA),
+                                            immpred:$val))]>;
+
+// Bitwise "or" with immediate
+multiclass BitwiseOrImm
+{
+  def v4i32: ORIVecInst<v4i32, v4i32Uns10Imm>;
+
+  def r32: ORIInst<(outs R32C:$rT), (ins R32C:$rA, u10imm_i32:$val),
+                   [(set R32C:$rT, (or R32C:$rA, i32ImmUns10:$val))]>;
+
+  // i16i32: hacked version of the ori instruction to extend 16-bit quantities
+  // to 32-bit quantities. used exclusively to match "anyext" conversions (vide
+  // infra "anyext 16->32" pattern.)
+  def i16i32: ORIInst<(outs R32C:$rT), (ins R16C:$rA, s10imm_i32:$val),
+                      [(set R32C:$rT, (or (anyext R16C:$rA),
+                                          i32ImmSExt10:$val))]>;
+
+  // i8i32: Hacked version of the ORI instruction to extend 8-bit quantities
+  // to 32-bit quantities. Used exclusively to match "anyext" conversions (vide
+  // infra "anyext 8->32" pattern.)
+  def i8i32: ORIInst<(outs R32C:$rT), (ins R8C:$rA, s10imm_i32:$val),
+                     [(set R32C:$rT, (or (anyext R8C:$rA),
+                                         i32ImmSExt10:$val))]>;
+}
+
+defm ORI : BitwiseOrImm;
+
+// ORX: "or" across the vector: or's $rA's word slots leaving the result in
+// $rT[0], slots 1-3 are zeroed.
+//
+// FIXME: Needs to match an intrinsic pattern.
+def ORXv4i32:
+    RRForm<0b10010010000, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "orx\t$rT, $rA, $rB", IntegerOp,
+      []>;
+
+// XOR:
+
+class XORInst<dag OOL, dag IOL, list<dag> pattern> :
+    RRForm<0b10010010000, OOL, IOL, "xor\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+class XORVecInst<ValueType vectype>:
+    XORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [(set (vectype VECREG:$rT), (xor (vectype VECREG:$rA),
+                                              (vectype VECREG:$rB)))]>;
+
+class XORRegInst<RegisterClass rclass>:
+    XORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+             [(set rclass:$rT, (xor rclass:$rA, rclass:$rB))]>;
+
+multiclass BitwiseExclusiveOr
+{
+  def v16i8: XORVecInst<v16i8>;
+  def v8i16: XORVecInst<v8i16>;
+  def v4i32: XORVecInst<v4i32>;
+  def v2i64: XORVecInst<v2i64>;
+
+  def r128:  XORRegInst<GPRC>;
+  def r64:   XORRegInst<R64C>;
+  def r32:   XORRegInst<R32C>;
+  def r16:   XORRegInst<R16C>;
+  def r8:    XORRegInst<R8C>;
+
+  // XOR instructions used to negate f32 and f64 quantities (FP value XORed
+  // with an integer sign-bit mask; emitted manually, hence no patterns).
+
+  def fneg32: XORInst<(outs R32FP:$rT), (ins R32FP:$rA, R32C:$rB),
+                     [/* no pattern */]>;
+
+  def fneg64: XORInst<(outs R64FP:$rT), (ins R64FP:$rA, R64C:$rB),
+                     [/* no pattern */]>;
+
+  def fnegvec: XORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                      [/* no pattern, see fneg{32,64} */]>;
+}
+
+defm XOR : BitwiseExclusiveOr;
+
+//==----------------------------------------------------------
+// XOR with immediate forms (byte, halfword, word):
+
+class XORBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI10Form<0b01100000, OOL, IOL, "xorbi\t$rT, $rA, $val",
+             IntegerOp, pattern>;
+
+multiclass XorByteImm
+{
+  def v16i8:
+    XORBIInst<(outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+              [(set (v16i8 VECREG:$rT), (xor (v16i8 VECREG:$rA), v16i8U8Imm:$val))]>;
+
+  def r8:
+    XORBIInst<(outs R8C:$rT), (ins R8C:$rA, u10imm_i8:$val),
+              [(set R8C:$rT, (xor R8C:$rA, immU8:$val))]>;
+}
+
+defm XORBI : XorByteImm;
+
+// xorhi: XOR halfwords with a sign-extended 10-bit immediate.
+def XORHIv8i16:
+    RI10Form<0b10100000, (outs VECREG:$rT), (ins VECREG:$rA, u10imm:$val),
+      "xorhi\t$rT, $rA, $val", IntegerOp,
+      [(set (v8i16 VECREG:$rT), (xor (v8i16 VECREG:$rA),
+                                      v8i16SExt10Imm:$val))]>;
+
+def XORHIr16:
+    RI10Form<0b10100000, (outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+      "xorhi\t$rT, $rA, $val", IntegerOp,
+      [(set R16C:$rT, (xor R16C:$rA, i16ImmSExt10:$val))]>;
+
+// xori: XOR words with a sign-extended 10-bit immediate.
+def XORIv4i32:
+    RI10Form<0b00100000, (outs VECREG:$rT), (ins VECREG:$rA, s10imm_i32:$val),
+      "xori\t$rT, $rA, $val", IntegerOp,
+      [(set (v4i32 VECREG:$rT), (xor (v4i32 VECREG:$rA),
+                                     v4i32SExt10Imm:$val))]>;
+
+def XORIr32:
+    RI10Form<0b00100000, (outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+      "xori\t$rT, $rA, $val", IntegerOp,
+      [(set R32C:$rT, (xor R32C:$rA, i32ImmSExt10:$val))]>;
+
+// NAND: rT = ~(rA & rB)
+
+class NANDInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10010011000, OOL, IOL, "nand\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+class NANDVecInst<ValueType vectype>:
+    NANDInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [(set (vectype VECREG:$rT), (vnot (and (vectype VECREG:$rA),
+                                                    (vectype VECREG:$rB))))]>;
+class NANDRegInst<RegisterClass rclass>:
+    NANDInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+             [(set rclass:$rT, (not (and rclass:$rA, rclass:$rB)))]>;
+
+multiclass BitwiseNand
+{
+  def v16i8: NANDVecInst<v16i8>;
+  def v8i16: NANDVecInst<v8i16>;
+  def v4i32: NANDVecInst<v4i32>;
+  def v2i64: NANDVecInst<v2i64>;
+
+  def r128:  NANDRegInst<GPRC>;
+  def r64:   NANDRegInst<R64C>;
+  def r32:   NANDRegInst<R32C>;
+  def r16:   NANDRegInst<R16C>;
+  def r8:    NANDRegInst<R8C>;
+}
+
+defm NAND : BitwiseNand;
+
+// NOR: rT = ~(rA | rB)
+
+class NORInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10010010000, OOL, IOL, "nor\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+class NORVecInst<ValueType vectype>:
+    NORInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+            [(set (vectype VECREG:$rT), (vnot (or (vectype VECREG:$rA),
+                                                  (vectype VECREG:$rB))))]>;
+class NORRegInst<RegisterClass rclass>:
+    NORInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+            [(set rclass:$rT, (not (or rclass:$rA, rclass:$rB)))]>;
+
+multiclass BitwiseNor
+{
+  def v16i8: NORVecInst<v16i8>;
+  def v8i16: NORVecInst<v8i16>;
+  def v4i32: NORVecInst<v4i32>;
+  def v2i64: NORVecInst<v2i64>;
+
+  def r128:  NORRegInst<GPRC>;
+  def r64:   NORRegInst<R64C>;
+  def r32:   NORRegInst<R32C>;
+  def r16:   NORRegInst<R16C>;
+  def r8:    NORRegInst<R8C>;
+}
+
+defm NOR : BitwiseNor;
+
+// Select bits: rT = (rC & rB) | (~rC & rA), i.e. each result bit comes from
+// rB where the mask rC is 1, from rA where it is 0.
+class SELBInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRRForm<0b1000, OOL, IOL, "selb\t$rT, $rA, $rB, $rC",
+            IntegerOp, pattern>;
+
+// Vector bit-select matched from its explicit and/or/not expansion.
+class SELBVecInst<ValueType vectype, PatFrag vnot_frag = vnot>:
+  SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+           [(set (vectype VECREG:$rT),
+                 (or (and (vectype VECREG:$rC), (vectype VECREG:$rB)),
+                     (and (vnot_frag (vectype VECREG:$rC)),
+                          (vectype VECREG:$rA))))]>;
+
+// Vector select with a *vector* condition operand.
+class SELBVecVCondInst<ValueType vectype>:
+  SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+           [(set (vectype VECREG:$rT),
+                 (select (vectype VECREG:$rC),
+                         (vectype VECREG:$rB),
+                         (vectype VECREG:$rA)))]>;
+
+// Vector select with a *scalar* (R32C) condition operand.
+class SELBVecCondInst<ValueType vectype>:
+  SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, R32C:$rC),
+           [(set (vectype VECREG:$rT),
+                 (select R32C:$rC,
+                         (vectype VECREG:$rB),
+                         (vectype VECREG:$rA)))]>;
+
+class SELBRegInst<RegisterClass rclass>:
+  SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rclass:$rC),
+           [(set rclass:$rT,
+                 (or (and rclass:$rB, rclass:$rC),
+                     (and rclass:$rA, (not rclass:$rC))))]>;
+
+// Scalar select: condition and data may live in different register classes.
+class SELBRegCondInst<RegisterClass rcond, RegisterClass rclass>:
+  SELBInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB, rcond:$rC),
+           [(set rclass:$rT,
+                 (select rcond:$rC, rclass:$rB, rclass:$rA))]>;
+
+multiclass SelectBits
+{
+  // Bit-select matched from the explicit (rC&rB)|(~rC&rA) expansion:
+  def v16i8: SELBVecInst<v16i8>;
+  def v8i16: SELBVecInst<v8i16>;
+  def v4i32: SELBVecInst<v4i32>;
+  def v2i64: SELBVecInst<v2i64, vnot_conv>;
+
+  def r128:  SELBRegInst<GPRC>;
+  def r64:   SELBRegInst<R64C>;
+  def r32:   SELBRegInst<R32C>;
+  def r16:   SELBRegInst<R16C>;
+  def r8:    SELBRegInst<R8C>;
+
+  // Vector data selected by a scalar (R32C) condition:
+  def v16i8_cond: SELBVecCondInst<v16i8>;
+  def v8i16_cond: SELBVecCondInst<v8i16>;
+  def v4i32_cond: SELBVecCondInst<v4i32>;
+  def v2i64_cond: SELBVecCondInst<v2i64>;
+
+  // Vector data selected by a vector condition. These use SELBVecVCondInst
+  // (vector $rC); previously they instantiated SELBVecCondInst, exactly
+  // duplicating the *_cond defs above and leaving SELBVecVCondInst unused.
+  def v16i8_vcond: SELBVecVCondInst<v16i8>;
+  def v8i16_vcond: SELBVecVCondInst<v8i16>;
+  def v4i32_vcond: SELBVecVCondInst<v4i32>;
+  def v2i64_vcond: SELBVecVCondInst<v2i64>;
+
+  // f32 vector select: condition is the corresponding v4i32 mask.
+  def v4f32_cond:
+        SELBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+                 [(set (v4f32 VECREG:$rT),
+                       (select (v4i32 VECREG:$rC),
+                               (v4f32 VECREG:$rB),
+                               (v4f32 VECREG:$rA)))]>;
+
+  // SELBr64_cond is defined in SPU64InstrInfo.td
+  def r32_cond:   SELBRegCondInst<R32C, R32C>;
+  def f32_cond:   SELBRegCondInst<R32C, R32FP>;
+  def r16_cond:   SELBRegCondInst<R16C, R16C>;
+  def r8_cond:    SELBRegCondInst<R8C,  R8C>;
+}
+
+defm SELB : SelectBits;
+
+// Select the target-specific SPUselb node directly to a SELB instruction.
+class SPUselbPatVec<ValueType vectype, SPUInstr inst>:
+   Pat<(SPUselb (vectype VECREG:$rA), (vectype VECREG:$rB), (vectype VECREG:$rC)),
+       (inst VECREG:$rA, VECREG:$rB, VECREG:$rC)>;
+
+def : SPUselbPatVec<v16i8, SELBv16i8>;
+def : SPUselbPatVec<v8i16, SELBv8i16>;
+def : SPUselbPatVec<v4i32, SELBv4i32>;
+def : SPUselbPatVec<v2i64, SELBv2i64>;
+
+class SPUselbPatReg<RegisterClass rclass, SPUInstr inst>:
+   Pat<(SPUselb rclass:$rA, rclass:$rB, rclass:$rC),
+       (inst rclass:$rA, rclass:$rB, rclass:$rC)>;
+
+def : SPUselbPatReg<R8C,   SELBr8>;
+def : SPUselbPatReg<R16C,  SELBr16>;
+def : SPUselbPatReg<R32C,  SELBr32>;
+def : SPUselbPatReg<R64C,  SELBr64>;
+
+// EQV: Equivalence (1 for each same bit, otherwise 0)
+//
+// Note: There are a lot of ways to match this bit operator and these patterns
+// attempt to be as exhaustive as possible.
+
+class EQVInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10010010000, OOL, IOL, "eqv\t$rT, $rA, $rB",
+           IntegerOp, pattern>;
+
+// Canonical form: (a & b) | (~a & ~b)
+class EQVVecInst<ValueType vectype>:
+    EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+            [(set (vectype VECREG:$rT),
+                  (or (and (vectype VECREG:$rA), (vectype VECREG:$rB)),
+                      (and (vnot (vectype VECREG:$rA)),
+                           (vnot (vectype VECREG:$rB)))))]>;
+
+class EQVRegInst<RegisterClass rclass>:
+    EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+            [(set rclass:$rT, (or (and rclass:$rA, rclass:$rB),
+                                  (and (not rclass:$rA), (not rclass:$rB))))]>;
+
+// Variant 1: a ^ ~b
+class EQVVecPattern1<ValueType vectype>:
+  EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          [(set (vectype VECREG:$rT),
+                (xor (vectype VECREG:$rA), (vnot (vectype VECREG:$rB))))]>;
+
+class EQVRegPattern1<RegisterClass rclass>:
+  EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+          [(set rclass:$rT, (xor rclass:$rA, (not rclass:$rB)))]>;
+
+// Variant 2: (a & b) | ~(a | b)
+class EQVVecPattern2<ValueType vectype>:
+  EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          [(set (vectype VECREG:$rT),
+                (or (and (vectype VECREG:$rA), (vectype VECREG:$rB)),
+                    (vnot (or (vectype VECREG:$rA), (vectype VECREG:$rB)))))]>;
+
+class EQVRegPattern2<RegisterClass rclass>:
+  EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+          [(set rclass:$rT,
+                (or (and rclass:$rA, rclass:$rB),
+                    (not (or rclass:$rA, rclass:$rB))))]>;
+
+// Variant 3: ~(a ^ b)
+class EQVVecPattern3<ValueType vectype>:
+  EQVInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+          [(set (vectype VECREG:$rT),
+                (not (xor (vectype VECREG:$rA), (vectype VECREG:$rB))))]>;
+
+class EQVRegPattern3<RegisterClass rclass>:
+  EQVInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+          [(set rclass:$rT, (not (xor rclass:$rA, rclass:$rB)))]>;
+
+multiclass BitEquivalence
+{
+  def v16i8: EQVVecInst<v16i8>;
+  def v8i16: EQVVecInst<v8i16>;
+  def v4i32: EQVVecInst<v4i32>;
+  def v2i64: EQVVecInst<v2i64>;
+
+  def v16i8_1: EQVVecPattern1<v16i8>;
+  def v8i16_1: EQVVecPattern1<v8i16>;
+  def v4i32_1: EQVVecPattern1<v4i32>;
+  def v2i64_1: EQVVecPattern1<v2i64>;
+
+  def v16i8_2: EQVVecPattern2<v16i8>;
+  def v8i16_2: EQVVecPattern2<v8i16>;
+  def v4i32_2: EQVVecPattern2<v4i32>;
+  def v2i64_2: EQVVecPattern2<v2i64>;
+
+  def v16i8_3: EQVVecPattern3<v16i8>;
+  def v8i16_3: EQVVecPattern3<v8i16>;
+  def v4i32_3: EQVVecPattern3<v4i32>;
+  def v2i64_3: EQVVecPattern3<v2i64>;
+
+  def r128:  EQVRegInst<GPRC>;
+  def r64:   EQVRegInst<R64C>;
+  def r32:   EQVRegInst<R32C>;
+  def r16:   EQVRegInst<R16C>;
+  def r8:    EQVRegInst<R8C>;
+
+  def r128_1: EQVRegPattern1<GPRC>;
+  def r64_1:  EQVRegPattern1<R64C>;
+  def r32_1:  EQVRegPattern1<R32C>;
+  def r16_1:  EQVRegPattern1<R16C>;
+  def r8_1:   EQVRegPattern1<R8C>;
+
+  def r128_2: EQVRegPattern2<GPRC>;
+  def r64_2:  EQVRegPattern2<R64C>;
+  def r32_2:  EQVRegPattern2<R32C>;
+  def r16_2:  EQVRegPattern2<R16C>;
+  def r8_2:   EQVRegPattern2<R8C>;
+
+  def r128_3: EQVRegPattern3<GPRC>;
+  def r64_3:  EQVRegPattern3<R64C>;
+  def r32_3:  EQVRegPattern3<R32C>;
+  def r16_3:  EQVRegPattern3<R16C>;
+  def r8_3:   EQVRegPattern3<R8C>;
+}
+
+defm EQV: BitEquivalence;
+
+//===----------------------------------------------------------------------===//
+// Vector shuffle...
+//===----------------------------------------------------------------------===//
+// SPUshuffle is generated in LowerVECTOR_SHUFFLE and gets replaced with SHUFB.
+// See the SPUshuffle SDNode operand above, which sets up the DAG pattern
+// matcher to emit something when the LowerVECTOR_SHUFFLE generates a node with
+// the SPUISD::SHUFB opcode.
+//===----------------------------------------------------------------------===//
+
+class SHUFBInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRRForm<0b1000, OOL, IOL, "shufb\t$rT, $rA, $rB, $rC",
+            IntegerOp, pattern>;
+
+// Byte shuffle: $rC is the byte-selection (mask) vector. resultvec and
+// maskvec may differ; a v4i32-mask variant ("_m32") is provided for every
+// result type so either mask type produced by lowering will match.
+class SHUFBVecInst<ValueType resultvec, ValueType maskvec>:
+    SHUFBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+              [(set (resultvec VECREG:$rT),
+                    (SPUshuffle (resultvec VECREG:$rA),
+                                (resultvec VECREG:$rB),
+                                (maskvec VECREG:$rC)))]>;
+
+// GPRC-source form; selected manually, hence no pattern.
+class SHUFBGPRCInst:
+    SHUFBInst<(outs VECREG:$rT), (ins GPRC:$rA, GPRC:$rB, VECREG:$rC),
+              [/* no pattern */]>;
+
+multiclass ShuffleBytes
+{
+  def v16i8     : SHUFBVecInst<v16i8, v16i8>;
+  def v16i8_m32 : SHUFBVecInst<v16i8, v4i32>;
+  def v8i16     : SHUFBVecInst<v8i16, v16i8>;
+  def v8i16_m32 : SHUFBVecInst<v8i16, v4i32>;
+  def v4i32     : SHUFBVecInst<v4i32, v16i8>;
+  def v4i32_m32 : SHUFBVecInst<v4i32, v4i32>;
+  def v2i64     : SHUFBVecInst<v2i64, v16i8>;
+  def v2i64_m32 : SHUFBVecInst<v2i64, v4i32>;
+
+  def v4f32     : SHUFBVecInst<v4f32, v16i8>;
+  def v4f32_m32 : SHUFBVecInst<v4f32, v4i32>;
+
+  def v2f64     : SHUFBVecInst<v2f64, v16i8>;
+  def v2f64_m32 : SHUFBVecInst<v2f64, v4i32>;
+
+  def gprc      : SHUFBGPRCInst;
+}
+
+defm SHUFB : ShuffleBytes;
+
+//===----------------------------------------------------------------------===//
+// Shift and rotate group:
+//===----------------------------------------------------------------------===//
+
+// NOTE(review): SHLH, SHLHI, SHL and SHLI all carry the same opcode bits
+// (0b11111010000) below — confirm each against the SPU ISA encodings.
+class SHLHInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b11111010000, OOL, IOL, "shlh\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Per-lane shift left of each halfword by the scalar count in $rB.
+class SHLHVecInst<ValueType vectype>:
+    SHLHInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+             [(set (vectype VECREG:$rT),
+                   (SPUvec_shl (vectype VECREG:$rA), R16C:$rB))]>;
+
+// r16_r32 covers DAGs where the shift amount was legalized to i32.
+multiclass ShiftLeftHalfword
+{
+  def v8i16: SHLHVecInst<v8i16>;
+  def r16:   SHLHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+                      [(set R16C:$rT, (shl R16C:$rA, R16C:$rB))]>;
+  def r16_r32: SHLHInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+                        [(set R16C:$rT, (shl R16C:$rA, R32C:$rB))]>;
+}
+
+defm SHLH : ShiftLeftHalfword;
+
+//===----------------------------------------------------------------------===//
+
+class SHLHIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b11111010000, OOL, IOL, "shlhi\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Immediate form: the count is a 7-bit unsigned immediate, typed i16 here.
+class SHLHIVecInst<ValueType vectype>:
+    SHLHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val),
+              [(set (vectype VECREG:$rT),
+                    (SPUvec_shl (vectype VECREG:$rA), (i16 uimm7:$val)))]>;
+
+multiclass ShiftLeftHalfwordImm
+{
+  def v8i16: SHLHIVecInst<v8i16>;
+  def r16: SHLHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm:$val),
+                     [(set R16C:$rT, (shl R16C:$rA, (i16 uimm7:$val)))]>;
+}
+
+defm SHLHI : ShiftLeftHalfwordImm;
+
+// Catch the same immediate shifts when the count node is i32 instead of i16.
+def : Pat<(SPUvec_shl (v8i16 VECREG:$rA), (i32 uimm7:$val)),
+          (SHLHIv8i16 VECREG:$rA, uimm7:$val)>;
+
+def : Pat<(shl R16C:$rA, (i32 uimm7:$val)),
+          (SHLHIr16 R16C:$rA, uimm7:$val)>;
+
+//===----------------------------------------------------------------------===//
+
+class SHLInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b11111010000, OOL, IOL, "shl\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Shift left word: per-lane for v4i32, ordinary shl for scalar r32.
+multiclass ShiftLeftWord
+{
+  def v4i32:
+      SHLInst<(outs VECREG:$rT), (ins VECREG:$rA, R16C:$rB),
+              [(set (v4i32 VECREG:$rT),
+                    (SPUvec_shl (v4i32 VECREG:$rA), R16C:$rB))]>;
+  def r32:
+      SHLInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+              [(set R32C:$rT, (shl R32C:$rA, R32C:$rB))]>;
+}
+
+defm SHL: ShiftLeftWord;
+
+//===----------------------------------------------------------------------===//
+
+class SHLIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b11111010000, OOL, IOL, "shli\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Immediate word shift; the count is an i32-typed 7-bit unsigned immediate.
+multiclass ShiftLeftWordImm
+{
+  def v4i32:
+    SHLIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val),
+             [(set (v4i32 VECREG:$rT),
+                   (SPUvec_shl (v4i32 VECREG:$rA), (i32 uimm7:$val)))]>;
+
+  def r32:
+    SHLIInst<(outs R32C:$rT), (ins R32C:$rA, u7imm_i32:$val),
+             [(set R32C:$rT, (shl R32C:$rA, (i32 uimm7:$val)))]>;
+}
+
+defm SHLI : ShiftLeftWordImm;
+
+//===----------------------------------------------------------------------===//
+// SHLQBI vec form: Note that this will shift the entire vector (the 128-bit
+// register) to the left. Vector form is here to ensure type correctness.
+//
+// The shift count is in the lowest 3 bits (29-31) of $rB, so only a bit shift
+// of 7 bits is actually possible.
+//
+// Note also that SHLQBI/SHLQBII are used in conjunction with SHLQBY/SHLQBYI
+// to shift i64 and i128. SHLQBI is the residual left over after shifting by
+// bytes with SHLQBY.
+
+class SHLQBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b11011011100, OOL, IOL, "shlqbi\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Shift the whole 128-bit quadword left by the bit count in $rB; the vector
+// type parameter exists only so the pattern type-checks (see header comment).
+class SHLQBIVecInst<ValueType vectype>:
+    SHLQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+               [(set (vectype VECREG:$rT),
+                     (SPUshlquad_l_bits (vectype VECREG:$rA), R32C:$rB))]>;
+
+// Register (GPRC) form, selected manually — no pattern.
+class SHLQBIRegInst<RegisterClass rclass>:
+    SHLQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+               [/* no pattern */]>;
+
+multiclass ShiftLeftQuadByBits
+{
+  def v16i8: SHLQBIVecInst<v16i8>;
+  def v8i16: SHLQBIVecInst<v8i16>;
+  def v4i32: SHLQBIVecInst<v4i32>;
+  def v4f32: SHLQBIVecInst<v4f32>;
+  def v2i64: SHLQBIVecInst<v2i64>;
+  def v2f64: SHLQBIVecInst<v2f64>;
+
+  def r128:  SHLQBIRegInst<GPRC>;
+}
+
+defm SHLQBI : ShiftLeftQuadByBits;
+
+// See note above on SHLQBI. In this case, the predicate actually does the
+// enforcement, whereas with SHLQBI, we have to "take it on faith."
+class SHLQBIIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b11011111100, OOL, IOL, "shlqbii\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Immediate form: the bitshift predicate restricts $val to a valid bit count.
+class SHLQBIIVecInst<ValueType vectype>:
+    SHLQBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val),
+                [(set (vectype VECREG:$rT),
+                      (SPUshlquad_l_bits (vectype VECREG:$rA), (i32 bitshift:$val)))]>;
+
+multiclass ShiftLeftQuadByBitsImm
+{
+  def v16i8 : SHLQBIIVecInst<v16i8>;
+  def v8i16 : SHLQBIIVecInst<v8i16>;
+  def v4i32 : SHLQBIIVecInst<v4i32>;
+  def v4f32 : SHLQBIIVecInst<v4f32>;
+  def v2i64 : SHLQBIIVecInst<v2i64>;
+  def v2f64 : SHLQBIIVecInst<v2f64>;
+}
+
+defm SHLQBII : ShiftLeftQuadByBitsImm;
+
+// SHLQBY, SHLQBYI vector forms: Shift the entire vector to the left by bytes,
+// not by bits. See notes above on SHLQBI.
+
+// NOTE(review): SHLQBY takes a register shift count ($rB) yet is declared
+// with RI7Form; other register forms in this file use RRForm — confirm this
+// matches the intended SPU encoding.
+class SHLQBYInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b11111011100, OOL, IOL, "shlqby\t$rT, $rA, $rB",
+            RotateShift, pattern>;
+
+// Shift the whole quadword left by the byte count in $rB (see notes on
+// SHLQBI above: SHLQBY handles the byte part, SHLQBI the residual bits).
+class SHLQBYVecInst<ValueType vectype>:
+    SHLQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+               [(set (vectype VECREG:$rT),
+                     (SPUshlquad_l_bytes (vectype VECREG:$rA), R32C:$rB))]>;
+
+multiclass ShiftLeftQuadBytes
+{
+  def v16i8: SHLQBYVecInst<v16i8>;
+  def v8i16: SHLQBYVecInst<v8i16>;
+  def v4i32: SHLQBYVecInst<v4i32>;
+  def v4f32: SHLQBYVecInst<v4f32>;
+  def v2i64: SHLQBYVecInst<v2i64>;
+  def v2f64: SHLQBYVecInst<v2f64>;
+  def r128: SHLQBYInst<(outs GPRC:$rT), (ins GPRC:$rA, R32C:$rB),
+                       [(set GPRC:$rT, (SPUshlquad_l_bytes GPRC:$rA, R32C:$rB))]>;
+}
+
+defm SHLQBY: ShiftLeftQuadBytes;
+
+class SHLQBYIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b11111111100, OOL, IOL, "shlqbyi\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Immediate byte-shift of the quadword.
+class SHLQBYIVecInst<ValueType vectype>:
+    SHLQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm_i32:$val),
+                [(set (vectype VECREG:$rT),
+                      (SPUshlquad_l_bytes (vectype VECREG:$rA), (i32 uimm7:$val)))]>;
+
+multiclass ShiftLeftQuadBytesImm
+{
+  def v16i8: SHLQBYIVecInst<v16i8>;
+  def v8i16: SHLQBYIVecInst<v8i16>;
+  def v4i32: SHLQBYIVecInst<v4i32>;
+  def v4f32: SHLQBYIVecInst<v4f32>;
+  def v2i64: SHLQBYIVecInst<v2i64>;
+  def v2f64: SHLQBYIVecInst<v2f64>;
+  def r128:  SHLQBYIInst<(outs GPRC:$rT), (ins GPRC:$rA, u7imm_i32:$val),
+                         [(set GPRC:$rT,
+                               (SPUshlquad_l_bytes GPRC:$rA, (i32 uimm7:$val)))]>;
+}
+
+defm SHLQBYI : ShiftLeftQuadBytesImm;
+
+class SHLQBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00111001111, OOL, IOL, "shlqbybi\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// "Bytes from bit count" form — selected manually, hence no patterns.
+class SHLQBYBIVecInst<ValueType vectype>:
+    SHLQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+                [/* no pattern */]>;
+
+class SHLQBYBIRegInst<RegisterClass rclass>:
+    SHLQBYBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+                 [/* no pattern */]>;
+
+multiclass ShiftLeftQuadBytesBitCount
+{
+  def v16i8: SHLQBYBIVecInst<v16i8>;
+  def v8i16: SHLQBYBIVecInst<v8i16>;
+  def v4i32: SHLQBYBIVecInst<v4i32>;
+  def v4f32: SHLQBYBIVecInst<v4f32>;
+  def v2i64: SHLQBYBIVecInst<v2i64>;
+  def v2f64: SHLQBYBIVecInst<v2f64>;
+
+  def r128:  SHLQBYBIRegInst<GPRC>;
+}
+
+defm SHLQBYBI : ShiftLeftQuadBytesBitCount;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate halfword:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+class ROTHInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00111010000, OOL, IOL, "roth\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Per-lane rotate left of each halfword; $rB supplies per-lane counts.
+class ROTHVecInst<ValueType vectype>:
+    ROTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [(set (vectype VECREG:$rT),
+                   (SPUvec_rotl VECREG:$rA, VECREG:$rB))]>;
+
+class ROTHRegInst<RegisterClass rclass>:
+    ROTHInst<(outs rclass:$rT), (ins rclass:$rA, rclass:$rB),
+             [(set rclass:$rT, (rotl rclass:$rA, rclass:$rB))]>;
+
+multiclass RotateLeftHalfword
+{
+  def v8i16: ROTHVecInst<v8i16>;
+  def r16: ROTHRegInst<R16C>;
+}
+
+defm ROTH: RotateLeftHalfword;
+
+// Variant for DAGs where the rotate count was legalized to i32.
+def ROTHr16_r32: ROTHInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+                          [(set R16C:$rT, (rotl R16C:$rA, R32C:$rB))]>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate halfword, immediate:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+class ROTHIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b00111110000, OOL, IOL, "rothi\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+class ROTHIVecInst<ValueType vectype>:
+    ROTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val),
+              [(set (vectype VECREG:$rT),
+                    (SPUvec_rotl VECREG:$rA, (i16 uimm7:$val)))]>;
+
+// r16_r32 handles i32-typed immediate rotate counts.
+multiclass RotateLeftHalfwordImm
+{
+  def v8i16: ROTHIVecInst<v8i16>;
+  def r16: ROTHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm:$val),
+                     [(set R16C:$rT, (rotl R16C:$rA, (i16 uimm7:$val)))]>;
+  def r16_r32: ROTHIInst<(outs R16C:$rT), (ins R16C:$rA, u7imm_i32:$val),
+                         [(set R16C:$rT, (rotl R16C:$rA, (i32 uimm7:$val)))]>;
+}
+
+defm ROTHI: RotateLeftHalfwordImm;
+
+// NOTE(review): the input is guarded by uimm7 but the result operand uses the
+// generic imm transform — presumably equivalent for in-range values; confirm.
+def : Pat<(SPUvec_rotl VECREG:$rA, (i32 uimm7:$val)),
+          (ROTHIv8i16 VECREG:$rA, imm:$val)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate word:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00011010000, OOL, IOL, "rot\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Per-lane rotate left of each word by the scalar count in $rB.
+class ROTVecInst<ValueType vectype>:
+    ROTInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+            [(set (vectype VECREG:$rT),
+                  (SPUvec_rotl (vectype VECREG:$rA), R32C:$rB))]>;
+
+class ROTRegInst<RegisterClass rclass>:
+    ROTInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+            [(set rclass:$rT,
+                  (rotl rclass:$rA, R32C:$rB))]>;
+
+multiclass RotateLeftWord
+{
+  def v4i32: ROTVecInst<v4i32>;
+  def r32:   ROTRegInst<R32C>;
+}
+
+defm ROT: RotateLeftWord;
+
+// The rotate amount is in the same bits whether we've got an 8-bit, 16-bit or
+// 32-bit register
+def ROTr32_r16_anyext:
+    ROTInst<(outs R32C:$rT), (ins R32C:$rA, R16C:$rB),
+            [(set R32C:$rT, (rotl R32C:$rA, (i32 (anyext R16C:$rB))))]>;
+
+// zext/sext of the count fold to the same instruction: only the low bits of
+// the count matter (see comment above), so any extension is acceptable.
+def : Pat<(rotl R32C:$rA, (i32 (zext R16C:$rB))),
+          (ROTr32_r16_anyext R32C:$rA, R16C:$rB)>;
+
+def : Pat<(rotl R32C:$rA, (i32 (sext R16C:$rB))),
+          (ROTr32_r16_anyext R32C:$rA, R16C:$rB)>;
+
+def ROTr32_r8_anyext:
+    ROTInst<(outs R32C:$rT), (ins R32C:$rA, R8C:$rB),
+            [(set R32C:$rT, (rotl R32C:$rA, (i32 (anyext R8C:$rB))))]>;
+
+def : Pat<(rotl R32C:$rA, (i32 (zext R8C:$rB))),
+          (ROTr32_r8_anyext R32C:$rA, R8C:$rB)>;
+
+def : Pat<(rotl R32C:$rA, (i32 (sext R8C:$rB))),
+          (ROTr32_r8_anyext R32C:$rA, R8C:$rB)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate word, immediate
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b00011110000, OOL, IOL, "roti\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Parameterized over operand/integer type so i32-, i16- and i8-typed
+// immediate rotate counts all select to the same instruction.
+class ROTIVecInst<ValueType vectype, Operand optype, ValueType inttype, PatLeaf pred>:
+    ROTIInst<(outs VECREG:$rT), (ins VECREG:$rA, optype:$val),
+             [(set (vectype VECREG:$rT),
+                   (SPUvec_rotl (vectype VECREG:$rA), (inttype pred:$val)))]>;
+
+class ROTIRegInst<RegisterClass rclass, Operand optype, ValueType inttype, PatLeaf pred>:
+    ROTIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
+             [(set rclass:$rT, (rotl rclass:$rA, (inttype pred:$val)))]>;
+
+multiclass RotateLeftWordImm
+{
+  def v4i32: ROTIVecInst<v4i32, u7imm_i32, i32, uimm7>;
+  def v4i32_i16: ROTIVecInst<v4i32, u7imm, i16, uimm7>;
+  def v4i32_i8:  ROTIVecInst<v4i32, u7imm_i8, i8, uimm7>;
+
+  def r32:       ROTIRegInst<R32C, u7imm_i32, i32, uimm7>;
+  def r32_i16:   ROTIRegInst<R32C, u7imm, i16, uimm7>;
+  def r32_i8:    ROTIRegInst<R32C, u7imm_i8, i8, uimm7>;
+}
+
+defm ROTI : RotateLeftWordImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad by byte (count)
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQBYInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00111011100, OOL, IOL, "rotqby\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Rotate the entire quadword left by the byte count in $rB. The vector type
+// parameter exists only so the pattern type-checks; the operation is always
+// on the whole 128-bit register.
+class ROTQBYVecInst<ValueType vectype>:
+    ROTQBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+               [(set (vectype VECREG:$rT),
+                     (SPUrotbytes_left (vectype VECREG:$rA), R32C:$rB))]>;
+
+multiclass RotateQuadLeftByBytes
+{
+  def v16i8: ROTQBYVecInst<v16i8>;
+  def v8i16: ROTQBYVecInst<v8i16>;
+  def v4i32: ROTQBYVecInst<v4i32>;
+  def v4f32: ROTQBYVecInst<v4f32>;
+  def v2i64: ROTQBYVecInst<v2i64>;
+  def v2f64: ROTQBYVecInst<v2f64>;
+}
+
+defm ROTQBY: RotateQuadLeftByBytes;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad by byte (count), immediate
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQBYIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b00111111100, OOL, IOL, "rotqbyi\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Immediate-count variant of ROTQBY (count typed i16, guarded by uimm7).
+class ROTQBYIVecInst<ValueType vectype>:
+    ROTQBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, u7imm:$val),
+                [(set (vectype VECREG:$rT),
+                      (SPUrotbytes_left (vectype VECREG:$rA), (i16 uimm7:$val)))]>;
+
+multiclass RotateQuadByBytesImm
+{
+  def v16i8: ROTQBYIVecInst<v16i8>;
+  def v8i16: ROTQBYIVecInst<v8i16>;
+  def v4i32: ROTQBYIVecInst<v4i32>;
+  def v4f32: ROTQBYIVecInst<v4f32>;
+  def v2i64: ROTQBYIVecInst<v2i64>;
+  // Fixed: this def was named "vfi64", breaking the ROTQBYI<type> naming
+  // convention every sibling follows (record is now ROTQBYIv2f64, matching
+  // e.g. ROTQBYv2f64 and SHLQBYIv2f64 above).
+  def v2f64: ROTQBYIVecInst<v2f64>;
+}
+
+defm ROTQBYI: RotateQuadByBytesImm;
+
+// See ROTQBY note above.
+class ROTQBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b00110011100, OOL, IOL,
+      "rotqbybi\t$rT, $rA, $shift",
+      RotateShift, pattern>;
+
+// Rotate quad by bytes, taking the count from a bit-shift value in $shift.
+class ROTQBYBIVecInst<ValueType vectype, RegisterClass rclass>:
+    ROTQBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, rclass:$shift),
+      [(set (vectype VECREG:$rT),
+            (SPUrotbytes_left_bits (vectype VECREG:$rA), rclass:$shift))]>;
+
+multiclass RotateQuadByBytesByBitshift {
+  def v16i8_r32: ROTQBYBIVecInst<v16i8, R32C>;
+  def v8i16_r32: ROTQBYBIVecInst<v8i16, R32C>;
+  def v4i32_r32: ROTQBYBIVecInst<v4i32, R32C>;
+  def v2i64_r32: ROTQBYBIVecInst<v2i64, R32C>;
+}
+
+defm ROTQBYBI : RotateQuadByBytesByBitshift;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// See ROTQBY note above.
+//
+// Assume that the user of this instruction knows to shift the rotate count
+// into bit 29
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b00011011100, OOL, IOL, "rotqbi\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Rotate quad left by bit count; selected manually (no patterns yet).
+class ROTQBIVecInst<ValueType vectype>:
+    ROTQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+               [/* no pattern yet */]>;
+
+class ROTQBIRegInst<RegisterClass rclass>:
+    ROTQBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+               [/* no pattern yet */]>;
+
+multiclass RotateQuadByBitCount
+{
+  def v16i8: ROTQBIVecInst<v16i8>;
+  def v8i16: ROTQBIVecInst<v8i16>;
+  def v4i32: ROTQBIVecInst<v4i32>;
+  def v2i64: ROTQBIVecInst<v2i64>;
+
+  def r128:  ROTQBIRegInst<GPRC>;
+  def r64:   ROTQBIRegInst<R64C>;
+}
+
+defm ROTQBI: RotateQuadByBitCount;
+
+class ROTQBIIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b00011111100, OOL, IOL, "rotqbii\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// The optype/inttype/pred parameters are currently unused (no pattern yet);
+// they are kept for parity with the other immediate-form classes.
+class ROTQBIIVecInst<ValueType vectype, Operand optype, ValueType inttype,
+                     PatLeaf pred>:
+    ROTQBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, optype:$val),
+                [/* no pattern yet */]>;
+
+class ROTQBIIRegInst<RegisterClass rclass, Operand optype, ValueType inttype,
+                     PatLeaf pred>:
+    ROTQBIIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
+                [/* no pattern yet */]>;
+
+multiclass RotateQuadByBitCountImm
+{
+  def v16i8: ROTQBIIVecInst<v16i8, u7imm_i32, i32, uimm7>;
+  def v8i16: ROTQBIIVecInst<v8i16, u7imm_i32, i32, uimm7>;
+  def v4i32: ROTQBIIVecInst<v4i32, u7imm_i32, i32, uimm7>;
+  def v2i64: ROTQBIIVecInst<v2i64, u7imm_i32, i32, uimm7>;
+
+  def r128:  ROTQBIIRegInst<GPRC, u7imm_i32, i32, uimm7>;
+  def r64:   ROTQBIIRegInst<R64C, u7imm_i32, i32, uimm7>;
+}
+
+defm ROTQBII : RotateQuadByBitCountImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// ROTHM v8i16 form:
+// NOTE(1): No vector rotate is generated by the C/C++ frontend (today),
+//          so this only matches a synthetically generated/lowered code
+//          fragment.
+// NOTE(2): $rB must be negated before the right rotate!
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTHMInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10111010000, OOL, IOL, "rothm\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+def ROTHMv8i16:
+    ROTHMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+              [/* see patterns below - $rB must be negated */]>;
+
+// Logical right shift of halfword lanes is implemented as rothm with a
+// negated count; (SFIr32 $rB, 0) computes 0 - $rB. Smaller count types are
+// first widened to i32 via XSBH/XSHW.
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R32C:$rB),
+          (ROTHMv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R16C:$rB),
+          (ROTHMv8i16 VECREG:$rA,
+                      (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), R8C:$rB),
+          (ROTHMv8i16 VECREG:$rA,
+                      (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB) ), 0))>;
+
+// ROTHM r16 form: Rotate 16-bit quantity to right, zero fill at the left
+// Note: This instruction doesn't match a pattern because rB must be negated
+// for the instruction to work. Thus, the pattern below the instruction!
+
+def ROTHMr16:
+    ROTHMInst<(outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+              [/* see patterns below - $rB must be negated! */]>;
+
+def : Pat<(srl R16C:$rA, R32C:$rB),
+          (ROTHMr16 R16C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(srl R16C:$rA, R16C:$rB),
+          (ROTHMr16 R16C:$rA,
+                    (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(srl R16C:$rA, R8C:$rB),
+          (ROTHMr16 R16C:$rA,
+                    (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB) ), 0))>;
+
+// ROTHMI v8i16 form: See the comment for ROTHM v8i16. The difference here is
+// that the immediate can be complemented, so that the user doesn't have to
+// worry about it.
+
+class ROTHMIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b10111110000, OOL, IOL, "rothmi\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+def ROTHMIv8i16:
+    ROTHMIInst<(outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val),
+               [/* no pattern */]>;
+
+// rothNeg7imm negates the immediate at encode time, so these match all three
+// immediate count types directly.
+def : Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i32 imm:$val)),
+          (ROTHMIv8i16 VECREG:$rA, imm:$val)>;
+
+def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i16 imm:$val)),
+         (ROTHMIv8i16 VECREG:$rA, imm:$val)>;
+
+def: Pat<(SPUvec_srl (v8i16 VECREG:$rA), (i8 imm:$val)),
+         (ROTHMIv8i16 VECREG:$rA, imm:$val)>;
+
+def ROTHMIr16:
+    ROTHMIInst<(outs R16C:$rT), (ins R16C:$rA, rothNeg7imm:$val),
+               [/* no pattern */]>;
+
+def: Pat<(srl R16C:$rA, (i32 uimm7:$val)),
+         (ROTHMIr16 R16C:$rA, uimm7:$val)>;
+
+def: Pat<(srl R16C:$rA, (i16 uimm7:$val)),
+         (ROTHMIr16 R16C:$rA, uimm7:$val)>;
+
+def: Pat<(srl R16C:$rA, (i8 uimm7:$val)),
+         (ROTHMIr16 R16C:$rA, uimm7:$val)>;
+
+// ROTM v4i32 form: See the ROTHM v8i16 comments.
+class ROTMInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10011010000, OOL, IOL, "rotm\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+def ROTMv4i32:
+    ROTMInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+             [/* see patterns below - $rB must be negated */]>;
+
+// Logical right shift of word lanes via rotm with a negated count
+// ((SFIr32 $rB, 0) computes 0 - $rB); narrower counts widened first.
+def : Pat<(SPUvec_srl VECREG:$rA, R32C:$rB),
+          (ROTMv4i32 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, R16C:$rB),
+          (ROTMv4i32 VECREG:$rA,
+                     (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, R8C:$rB),
+          (ROTMv4i32 VECREG:$rA,
+                     (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+def ROTMr32:
+    ROTMInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+             [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(srl R32C:$rA, R32C:$rB),
+          (ROTMr32 R32C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(srl R32C:$rA, R16C:$rB),
+          (ROTMr32 R32C:$rA,
+                   (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(srl R32C:$rA, R8C:$rB),
+          (ROTMr32 R32C:$rA,
+                   (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+// ROTMI v4i32 form: See the comment for ROTHM v8i16.
+def ROTMIv4i32:
+    RI7Form<0b10011110000, (outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
+      "rotmi\t$rT, $rA, $val", RotateShift,
+      [(set (v4i32 VECREG:$rT),
+            (SPUvec_srl VECREG:$rA, (i32 uimm7:$val)))]>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, (i16 uimm7:$val)),
+          (ROTMIv4i32 VECREG:$rA, uimm7:$val)>;
+
+def : Pat<(SPUvec_srl VECREG:$rA, (i8 uimm7:$val)),
+          (ROTMIv4i32 VECREG:$rA, uimm7:$val)>;
+
+// ROTMI r32 form: know how to complement the immediate value.
+def ROTMIr32:
+    RI7Form<0b10011110000, (outs R32C:$rT), (ins R32C:$rA, rotNeg7imm:$val),
+      "rotmi\t$rT, $rA, $val", RotateShift,
+      [(set R32C:$rT, (srl R32C:$rA, (i32 uimm7:$val)))]>;
+
+// NOTE(review): these match the generic imm but emit with the uimm7
+// transform — confirm this is intended (the i32 form above guards on uimm7).
+def : Pat<(srl R32C:$rA, (i16 imm:$val)),
+          (ROTMIr32 R32C:$rA, uimm7:$val)>;
+
+def : Pat<(srl R32C:$rA, (i8 imm:$val)),
+          (ROTMIr32 R32C:$rA, uimm7:$val)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// ROTQMBY: This is a vector form merely so that when used in an
+// instruction pattern, type checking will succeed. This instruction assumes
+// that the user knew to negate $rB.
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBYInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10111011100, OOL, IOL, "rotqmby\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Rotate-and-mask quad by bytes (effectively a right shift of the 128-bit
+// register); selected manually since $rB must be negated by the caller.
+class ROTQMBYVecInst<ValueType vectype>:
+    ROTQMBYInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+                [/* no pattern, $rB must be negated */]>;
+
+class ROTQMBYRegInst<RegisterClass rclass>:
+    ROTQMBYInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+                [/* no pattern */]>;
+
+multiclass RotateQuadBytes
+{
+  def v16i8: ROTQMBYVecInst<v16i8>;
+  def v8i16: ROTQMBYVecInst<v8i16>;
+  def v4i32: ROTQMBYVecInst<v4i32>;
+  def v2i64: ROTQMBYVecInst<v2i64>;
+
+  def r128: ROTQMBYRegInst<GPRC>;
+  def r64:  ROTQMBYRegInst<R64C>;
+}
+
+defm ROTQMBY : RotateQuadBytes;
+
+class ROTQMBYIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b10111111100, OOL, IOL, "rotqmbyi\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Immediate form; rotNeg7imm handles the negation of the count.
+class ROTQMBYIVecInst<ValueType vectype>:
+    ROTQMBYIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
+                 [/* no pattern */]>;
+
+// The optype/inttype/pred parameters are currently unused (no pattern);
+// kept for parity with the other immediate-form classes.
+class ROTQMBYIRegInst<RegisterClass rclass, Operand optype, ValueType inttype,
+                      PatLeaf pred>:
+    ROTQMBYIInst<(outs rclass:$rT), (ins rclass:$rA, optype:$val),
+                 [/* no pattern */]>;
+
+// 128-bit zero extension form:
+class ROTQMBYIZExtInst<RegisterClass rclass, Operand optype, PatLeaf pred>:
+    ROTQMBYIInst<(outs GPRC:$rT), (ins rclass:$rA, optype:$val),
+                 [/* no pattern */]>;
+
+multiclass RotateQuadBytesImm
+{
+  def v16i8: ROTQMBYIVecInst<v16i8>;
+  def v8i16: ROTQMBYIVecInst<v8i16>;
+  def v4i32: ROTQMBYIVecInst<v4i32>;
+  def v2i64: ROTQMBYIVecInst<v2i64>;
+
+  def r128:  ROTQMBYIRegInst<GPRC, rotNeg7imm, i32, uimm7>;
+  def r64:   ROTQMBYIRegInst<R64C, rotNeg7imm, i32, uimm7>;
+
+  def r128_zext_r8:  ROTQMBYIZExtInst<R8C, rotNeg7imm, uimm7>;
+  def r128_zext_r16: ROTQMBYIZExtInst<R16C, rotNeg7imm, uimm7>;
+  def r128_zext_r32: ROTQMBYIZExtInst<R32C, rotNeg7imm, uimm7>;
+  def r128_zext_r64: ROTQMBYIZExtInst<R64C, rotNeg7imm, uimm7>;
+}
+
+defm ROTQMBYI : RotateQuadBytesImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate right and mask by bit count
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBYBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10110011100, OOL, IOL, "rotqmbybi\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Rotate-and-mask quad by bytes from bit count; selected manually.
+class ROTQMBYBIVecInst<ValueType vectype>:
+    ROTQMBYBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+                  [/* no pattern, */]>;
+
+multiclass RotateMaskQuadByBitCount
+{
+  def v16i8: ROTQMBYBIVecInst<v16i8>;
+  def v8i16: ROTQMBYBIVecInst<v8i16>;
+  def v4i32: ROTQMBYBIVecInst<v4i32>;
+  def v2i64: ROTQMBYBIVecInst<v2i64>;
+}
+
+defm ROTQMBYBI: RotateMaskQuadByBitCount;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad and mask by bits
+// Note that the rotate amount has to be negated
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b10011011100, OOL, IOL, "rotqmbi\t$rT, $rA, $rB",
+           RotateShift, pattern>;
+
+// Residual bit portion of a quadword right shift (pairs with ROTQMBY);
+// selected manually since the count must be negated by the caller.
+class ROTQMBIVecInst<ValueType vectype>:
+    ROTQMBIInst<(outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+                [/* no pattern */]>;
+
+class ROTQMBIRegInst<RegisterClass rclass>:
+    ROTQMBIInst<(outs rclass:$rT), (ins rclass:$rA, R32C:$rB),
+                [/* no pattern */]>;
+
+multiclass RotateMaskQuadByBits
+{
+  def v16i8: ROTQMBIVecInst<v16i8>;
+  def v8i16: ROTQMBIVecInst<v8i16>;
+  def v4i32: ROTQMBIVecInst<v4i32>;
+  def v2i64: ROTQMBIVecInst<v2i64>;
+
+  def r128:  ROTQMBIRegInst<GPRC>;
+  def r64:   ROTQMBIRegInst<R64C>;
+}
+
+defm ROTQMBI: RotateMaskQuadByBits;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Rotate quad and mask by bits, immediate
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+class ROTQMBIIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RI7Form<0b10011111100, OOL, IOL, "rotqmbii\t$rT, $rA, $val",
+            RotateShift, pattern>;
+
+// Immediate forms; rotNeg7imm negates the count at encode time.
+class ROTQMBIIVecInst<ValueType vectype>:
+   ROTQMBIIInst<(outs VECREG:$rT), (ins VECREG:$rA, rotNeg7imm:$val),
+                 [/* no pattern */]>;
+
+class ROTQMBIIRegInst<RegisterClass rclass>:
+   ROTQMBIIInst<(outs rclass:$rT), (ins rclass:$rA, rotNeg7imm:$val),
+                 [/* no pattern */]>;
+
+multiclass RotateMaskQuadByBitsImm
+{
+  def v16i8: ROTQMBIIVecInst<v16i8>;
+  def v8i16: ROTQMBIIVecInst<v8i16>;
+  def v4i32: ROTQMBIIVecInst<v4i32>;
+  def v2i64: ROTQMBIIVecInst<v2i64>;
+
+  def r128:  ROTQMBIIRegInst<GPRC>;
+  def r64:   ROTQMBIIRegInst<R64C>;
+}
+
+defm ROTQMBII: RotateMaskQuadByBitsImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// ROTMAH ("rotate and mask algebraic halfword") implements arithmetic
+// shift-right. The instruction takes a negated shift count, so none of the
+// defs below carry a pattern; instead the Pat<>s negate $rB with
+// SFIr32 ($rB subtracted from immediate 0) and widen narrower counts
+// through XSBHr8/XSHWr16 first.
+def ROTMAHv8i16:
+    RRForm<0b01111010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+      "rotmah\t$rT, $rA, $rB", RotateShift,
+      [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R32C:$rB),
+          (ROTMAHv8i16 VECREG:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R16C:$rB),
+          (ROTMAHv8i16 VECREG:$rA,
+                       (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R8C:$rB),
+          (ROTMAHv8i16 VECREG:$rA,
+                       (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+// Scalar i16 arithmetic shift-right, same negated-count convention.
+def ROTMAHr16:
+    RRForm<0b01111010000, (outs R16C:$rT), (ins R16C:$rA, R32C:$rB),
+      "rotmah\t$rT, $rA, $rB", RotateShift,
+      [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(sra R16C:$rA, R32C:$rB),
+          (ROTMAHr16 R16C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(sra R16C:$rA, R16C:$rB),
+          (ROTMAHr16 R16C:$rA,
+                     (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(sra R16C:$rA, R8C:$rB),
+          (ROTMAHr16 R16C:$rA,
+                     (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+// ROTMAHI: immediate-count form of the halfword arithmetic shift-right.
+// The rothNeg7imm operand holds the (already negated) 7-bit count; the
+// extra Pat<>s accept i16/i8-typed immediates in the DAG as well.
+def ROTMAHIv8i16:
+    RRForm<0b01111110000, (outs VECREG:$rT), (ins VECREG:$rA, rothNeg7imm:$val),
+      "rotmahi\t$rT, $rA, $val", RotateShift,
+      [(set (v8i16 VECREG:$rT),
+            (SPUvec_sra (v8i16 VECREG:$rA), (i32 uimm7:$val)))]>;
+
+def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (i16 uimm7:$val)),
+          (ROTMAHIv8i16 (v8i16 VECREG:$rA), (i32 uimm7:$val))>;
+
+def : Pat<(SPUvec_sra (v8i16 VECREG:$rA), (i8 uimm7:$val)),
+          (ROTMAHIv8i16 (v8i16 VECREG:$rA), (i32 uimm7:$val))>;
+
+// Scalar i16 variant of the same immediate shift.
+def ROTMAHIr16:
+    RRForm<0b01111110000, (outs R16C:$rT), (ins R16C:$rA, rothNeg7imm_i16:$val),
+      "rotmahi\t$rT, $rA, $val", RotateShift,
+      [(set R16C:$rT, (sra R16C:$rA, (i16 uimm7:$val)))]>;
+
+def : Pat<(sra R16C:$rA, (i32 imm:$val)),
+          (ROTMAHIr16 R16C:$rA, uimm7:$val)>;
+
+def : Pat<(sra R16C:$rA, (i8 imm:$val)),
+          (ROTMAHIr16 R16C:$rA, uimm7:$val)>;
+
+// ROTMA ("rotate and mask algebraic" word): i32/v4i32 arithmetic
+// shift-right. As with ROTMAH, the hardware count is negated, so the defs
+// are pattern-less and the Pat<>s below do the 0 - $rB negation via SFIr32,
+// widening narrower counts with XSBHr8/XSHWr16.
+def ROTMAv4i32:
+    RRForm<0b01011010000, (outs VECREG:$rT), (ins VECREG:$rA, R32C:$rB),
+      "rotma\t$rT, $rA, $rB", RotateShift,
+      [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R32C:$rB),
+          (ROTMAv4i32 (v4i32 VECREG:$rA), (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R16C:$rB),
+          (ROTMAv4i32 (v4i32 VECREG:$rA),
+                      (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(SPUvec_sra VECREG:$rA, R8C:$rB),
+          (ROTMAv4i32 (v4i32 VECREG:$rA),
+                      (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+// Scalar i32 arithmetic shift-right.
+def ROTMAr32:
+    RRForm<0b01011010000, (outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+      "rotma\t$rT, $rA, $rB", RotateShift,
+      [/* see patterns below - $rB must be negated */]>;
+
+def : Pat<(sra R32C:$rA, R32C:$rB),
+          (ROTMAr32 R32C:$rA, (SFIr32 R32C:$rB, 0))>;
+
+def : Pat<(sra R32C:$rA, R16C:$rB),
+          (ROTMAr32 R32C:$rA,
+                    (SFIr32 (XSHWr16 R16C:$rB), 0))>;
+
+def : Pat<(sra R32C:$rA, R8C:$rB),
+          (ROTMAr32 R32C:$rA,
+                    (SFIr32 (XSHWr16 (XSBHr8 R8C:$rB)), 0))>;
+
+// ROTMAI: immediate-count form of the algebraic rotate-and-mask.
+class ROTMAIInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b01011110000, OOL, IOL,
+      "rotmai\t$rT, $rA, $val",
+      RotateShift, pattern>;
+
+class ROTMAIVecInst<ValueType vectype, Operand intop, ValueType inttype>:
+    ROTMAIInst<(outs VECREG:$rT), (ins VECREG:$rA, intop:$val),
+      [(set (vectype VECREG:$rT),
+            (SPUvec_sra VECREG:$rA, (inttype uimm7:$val)))]>;
+
+class ROTMAIRegInst<RegisterClass rclass, Operand intop, ValueType inttype>:
+    ROTMAIInst<(outs rclass:$rT), (ins rclass:$rA, intop:$val),
+      [(set rclass:$rT, (sra rclass:$rA, (inttype uimm7:$val)))]>;
+
+multiclass RotateMaskAlgebraicImm {
+  def v2i64_i32 : ROTMAIVecInst<v2i64, rotNeg7imm, i32>;
+  def v4i32_i32 : ROTMAIVecInst<v4i32, rotNeg7imm, i32>;
+  def r64_i32 : ROTMAIRegInst<R64C, rotNeg7imm, i32>;
+  def r32_i32 : ROTMAIRegInst<R32C, rotNeg7imm, i32>;
+}
+
+defm ROTMAI : RotateMaskAlgebraicImm;
+
+//===----------------------------------------------------------------------===//
+// Branch and conditionals:
+//===----------------------------------------------------------------------===//
+
+// Halt-if-condition instructions. These trap when the condition holds; they
+// carry no ISel patterns (they are emitted explicitly, not pattern-matched).
+let isTerminator = 1, isBarrier = 1 in {
+  // Halt If Equal (r32 preferred slot only, no vector form)
+  def HEQr32:
+    RRForm_3<0b00011011110, (outs), (ins R32C:$rA, R32C:$rB),
+      "heq\t$rA, $rB", BranchResolv,
+      [/* no pattern to match */]>;
+
+  // Immediate form: halt if $rA == signed 10-bit $val.
+  def HEQIr32 :
+    RI10Form_2<0b11111110, (outs), (ins R32C:$rA, s10imm:$val),
+      "heqi\t$rA, $val", BranchResolv,
+      [/* no pattern to match */]>;
+
+  // HGT/HGTI: These instructions use signed arithmetic for the comparison,
+  // contrasting with HLGT/HLGTI, which use unsigned comparison:
+  def HGTr32:
+    RRForm_3<0b00011010010, (outs), (ins R32C:$rA, R32C:$rB),
+      "hgt\t$rA, $rB", BranchResolv,
+      [/* no pattern to match */]>;
+
+  def HGTIr32:
+    RI10Form_2<0b11110010, (outs), (ins R32C:$rA, s10imm:$val),
+      "hgti\t$rA, $val", BranchResolv,
+      [/* no pattern to match */]>;
+
+  // Unsigned (logical) greater-than halts:
+  def HLGTr32:
+    RRForm_3<0b00011011010, (outs), (ins R32C:$rA, R32C:$rB),
+      "hlgt\t$rA, $rB", BranchResolv,
+      [/* no pattern to match */]>;
+
+  def HLGTIr32:
+    RI10Form_2<0b11111010, (outs), (ins R32C:$rA, s10imm:$val),
+      "hlgti\t$rA, $val", BranchResolv,
+      [/* no pattern to match */]>;
+}
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// Comparison operators for i8, i16 and i32:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// CEQB: compare-equal, byte-wise. Produces an all-ones byte where the
+// corresponding elements compare equal, all-zeros otherwise.
+class CEQBInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00001011110, OOL, IOL, "ceqb\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpEqualByte
+{
+  def v16i8 :
+    CEQBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      // FIX: a byte compare operates on v16i8 operands; the original
+      // pattern mistakenly typed the inputs as v8i16, which is
+      // inconsistent with the v16i8 result and the CEQH halfword variant.
+      [(set (v16i8 VECREG:$rT), (seteq (v16i8 VECREG:$rA),
+                                       (v16i8 VECREG:$rB)))]>;
+
+  def r8 :
+    CEQBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB),
+             [(set R8C:$rT, (seteq R8C:$rA, R8C:$rB))]>;
+}
+
+// CEQBI: compare-equal byte with an 8-bit sign-extended immediate.
+class CEQBIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b01111110, OOL, IOL, "ceqbi\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpEqualByteImm
+{
+  def v16i8 :
+    CEQBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val),
+              [(set (v16i8 VECREG:$rT), (seteq (v16i8 VECREG:$rA),
+                                               v16i8SExt8Imm:$val))]>;
+  def r8:
+    CEQBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val),
+             [(set R8C:$rT, (seteq R8C:$rA, immSExt8:$val))]>;
+}
+
+// CEQH: compare-equal, halfword-wise (register-register form).
+class CEQHInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00010011110, OOL, IOL, "ceqh\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpEqualHalfword
+{
+  def v8i16 : CEQHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                       [(set (v8i16 VECREG:$rT), (seteq (v8i16 VECREG:$rA),
+                                                        (v8i16 VECREG:$rB)))]>;
+
+  def r16 : CEQHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+                     [(set R16C:$rT, (seteq R16C:$rA, R16C:$rB))]>;
+}
+
+// CEQHI: compare-equal halfword against a signed 10-bit immediate.
+class CEQHIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b10111110, OOL, IOL, "ceqhi\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpEqualHalfwordImm
+{
+  def v8i16 : CEQHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                        [(set (v8i16 VECREG:$rT),
+                              (seteq (v8i16 VECREG:$rA),
+                                     (v8i16 v8i16SExt10Imm:$val)))]>;
+  def r16 : CEQHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+                      [(set R16C:$rT, (seteq R16C:$rA, i16ImmSExt10:$val))]>;
+}
+
+// CEQ: compare-equal, word-wise (register-register form).
+class CEQInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00000011110, OOL, IOL, "ceq\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpEqualWord
+{
+  def v4i32 : CEQInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                      [(set (v4i32 VECREG:$rT),
+                            (seteq (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+  def r32 : CEQInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+                    [(set R32C:$rT, (seteq R32C:$rA, R32C:$rB))]>;
+}
+
+// CEQI: compare-equal word against a signed 10-bit immediate.
+class CEQIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b00111110, OOL, IOL, "ceqi\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpEqualWordImm
+{
+  def v4i32 : CEQIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                       [(set (v4i32 VECREG:$rT),
+                             (seteq (v4i32 VECREG:$rA),
+                                    (v4i32 v4i32SExt16Imm:$val)))]>;
+
+  def r32: CEQIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+                    [(set R32C:$rT, (seteq R32C:$rA, i32ImmSExt10:$val))]>;
+}
+
+// CGTB: signed compare greater-than, byte-wise.
+class CGTBInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00001010010, OOL, IOL, "cgtb\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpGtrByte
+{
+  def v16i8 :
+    CGTBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      // FIX: a byte compare operates on v16i8 operands; the original
+      // pattern mistakenly typed the inputs as v8i16, inconsistent with
+      // the v16i8 result and with CGTH's halfword pattern.
+      [(set (v16i8 VECREG:$rT), (setgt (v16i8 VECREG:$rA),
+                                       (v16i8 VECREG:$rB)))]>;
+
+  def r8 :
+    CGTBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB),
+             [(set R8C:$rT, (setgt R8C:$rA, R8C:$rB))]>;
+}
+
+// CGTBI: signed byte greater-than with an 8-bit sign-extended immediate.
+class CGTBIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b01110010, OOL, IOL, "cgtbi\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpGtrByteImm
+{
+  def v16i8 :
+    CGTBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val),
+              [(set (v16i8 VECREG:$rT), (setgt (v16i8 VECREG:$rA),
+                                               v16i8SExt8Imm:$val))]>;
+  def r8:
+    CGTBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val),
+              [(set R8C:$rT, (setgt R8C:$rA, immSExt8:$val))]>;
+}
+
+// CGTH: signed compare greater-than, halfword-wise.
+class CGTHInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00010010010, OOL, IOL, "cgth\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpGtrHalfword
+{
+  def v8i16 : CGTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                       [(set (v8i16 VECREG:$rT), (setgt (v8i16 VECREG:$rA),
+                                                        (v8i16 VECREG:$rB)))]>;
+
+  def r16 : CGTHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+                     [(set R16C:$rT, (setgt R16C:$rA, R16C:$rB))]>;
+}
+
+// CGTHI: signed halfword greater-than against a signed 10-bit immediate.
+class CGTHIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b10110010, OOL, IOL, "cgthi\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpGtrHalfwordImm
+{
+  def v8i16 : CGTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                        [(set (v8i16 VECREG:$rT),
+                              (setgt (v8i16 VECREG:$rA),
+                                     (v8i16 v8i16SExt10Imm:$val)))]>;
+  def r16 : CGTHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+                      [(set R16C:$rT, (setgt R16C:$rA, i16ImmSExt10:$val))]>;
+}
+
+// CGT: signed compare greater-than, word-wise.
+class CGTInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00000010010, OOL, IOL, "cgt\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpGtrWord
+{
+  def v4i32 : CGTInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                      [(set (v4i32 VECREG:$rT),
+                            (setgt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+  def r32 : CGTInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+                    [(set R32C:$rT, (setgt R32C:$rA, R32C:$rB))]>;
+}
+
+// CGTI: signed word greater-than against a signed 10-bit immediate.
+class CGTIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b00110010, OOL, IOL, "cgti\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpGtrWordImm
+{
+  def v4i32 : CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                       [(set (v4i32 VECREG:$rT),
+                             (setgt (v4i32 VECREG:$rA),
+                                    (v4i32 v4i32SExt16Imm:$val)))]>;
+
+  def r32: CGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+                    [(set R32C:$rT, (setgt R32C:$rA, i32ImmSExt10:$val))]>;
+
+  // CGTIv4f32, CGTIf32: These are used in the f32 fdiv instruction sequence:
+  def v4f32: CGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                       [(set (v4i32 VECREG:$rT),
+                             (setgt (v4i32 (bitconvert (v4f32 VECREG:$rA))),
+                                    (v4i32 v4i32SExt16Imm:$val)))]>;
+
+  def f32:   CGTIInst<(outs R32C:$rT), (ins R32FP:$rA, s10imm_i32:$val),
+                      [/* no pattern */]>;
+}
+
+// CLGTB: unsigned (logical) compare greater-than, byte-wise.
+class CLGTBInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00001011010, OOL, IOL, "clgtb\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpLGtrByte
+{
+  def v16i8 :
+    CLGTBInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      // FIX: a byte compare operates on v16i8 operands; the original
+      // pattern mistakenly typed the inputs as v8i16, inconsistent with
+      // the v16i8 result and with CLGTH's halfword pattern.
+      [(set (v16i8 VECREG:$rT), (setugt (v16i8 VECREG:$rA),
+                                        (v16i8 VECREG:$rB)))]>;
+
+  def r8 :
+    CLGTBInst<(outs R8C:$rT), (ins R8C:$rA, R8C:$rB),
+             [(set R8C:$rT, (setugt R8C:$rA, R8C:$rB))]>;
+}
+
+// CLGTBI: unsigned byte greater-than against an 8-bit immediate.
+class CLGTBIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b01111010, OOL, IOL, "clgtbi\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpLGtrByteImm
+{
+  def v16i8 :
+    CLGTBIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm_i8:$val),
+              [(set (v16i8 VECREG:$rT), (setugt (v16i8 VECREG:$rA),
+                                               v16i8SExt8Imm:$val))]>;
+  def r8:
+    CLGTBIInst<(outs R8C:$rT), (ins R8C:$rA, s10imm_i8:$val),
+             [(set R8C:$rT, (setugt R8C:$rA, immSExt8:$val))]>;
+}
+
+// CLGTH: unsigned (logical) compare greater-than, halfword-wise.
+class CLGTHInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00010011010, OOL, IOL, "clgth\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpLGtrHalfword
+{
+  def v8i16 : CLGTHInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                       [(set (v8i16 VECREG:$rT), (setugt (v8i16 VECREG:$rA),
+                                                        (v8i16 VECREG:$rB)))]>;
+
+  def r16 : CLGTHInst<(outs R16C:$rT), (ins R16C:$rA, R16C:$rB),
+                     [(set R16C:$rT, (setugt R16C:$rA, R16C:$rB))]>;
+}
+
+// CLGTHI: unsigned halfword greater-than against a 10-bit immediate.
+class CLGTHIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b10111010, OOL, IOL, "clgthi\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpLGtrHalfwordImm
+{
+  def v8i16 : CLGTHIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                         [(set (v8i16 VECREG:$rT),
+                               (setugt (v8i16 VECREG:$rA),
+                                       (v8i16 v8i16SExt10Imm:$val)))]>;
+  def r16 : CLGTHIInst<(outs R16C:$rT), (ins R16C:$rA, s10imm:$val),
+                       [(set R16C:$rT, (setugt R16C:$rA, i16ImmSExt10:$val))]>;
+}
+
+// CLGT: unsigned (logical) compare greater-than, word-wise.
+class CLGTInst<dag OOL, dag IOL, list<dag> pattern> :
+  RRForm<0b00000011010, OOL, IOL, "clgt\t$rT, $rA, $rB",
+         ByteOp, pattern>;
+
+multiclass CmpLGtrWord
+{
+  def v4i32 : CLGTInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+                      [(set (v4i32 VECREG:$rT),
+                            (setugt (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)))]>;
+
+  def r32 : CLGTInst<(outs R32C:$rT), (ins R32C:$rA, R32C:$rB),
+                     [(set R32C:$rT, (setugt R32C:$rA, R32C:$rB))]>;
+}
+
+// CLGTI: unsigned word greater-than against a signed 10-bit immediate.
+class CLGTIInst<dag OOL, dag IOL, list<dag> pattern> :
+  RI10Form<0b00111010, OOL, IOL, "clgti\t$rT, $rA, $val",
+           ByteOp, pattern>;
+
+multiclass CmpLGtrWordImm
+{
+  def v4i32 : CLGTIInst<(outs VECREG:$rT), (ins VECREG:$rA, s10imm:$val),
+                       [(set (v4i32 VECREG:$rT),
+                             (setugt (v4i32 VECREG:$rA),
+                                    (v4i32 v4i32SExt16Imm:$val)))]>;
+
+  def r32: CLGTIInst<(outs R32C:$rT), (ins R32C:$rA, s10imm_i32:$val),
+                     [(set R32C:$rT, (setugt R32C:$rA, i32ImmSExt10:$val))]>;
+}
+
+// Instantiate all comparison multiclasses. The naming convention is:
+// CEQ* = equal, CGT* = signed greater-than, CLGT* = unsigned greater-than;
+// B/H/(none) = byte/halfword/word width; trailing I = immediate form.
+defm CEQB   : CmpEqualByte;
+defm CEQBI  : CmpEqualByteImm;
+defm CEQH   : CmpEqualHalfword;
+defm CEQHI  : CmpEqualHalfwordImm;
+defm CEQ    : CmpEqualWord;
+defm CEQI   : CmpEqualWordImm;
+defm CGTB   : CmpGtrByte;
+defm CGTBI  : CmpGtrByteImm;
+defm CGTH   : CmpGtrHalfword;
+defm CGTHI  : CmpGtrHalfwordImm;
+defm CGT    : CmpGtrWord;
+defm CGTI   : CmpGtrWordImm;
+defm CLGTB  : CmpLGtrByte;
+defm CLGTBI : CmpLGtrByteImm;
+defm CLGTH  : CmpLGtrHalfword;
+defm CLGTHI : CmpLGtrHalfwordImm;
+defm CLGT   : CmpLGtrWord;
+defm CLGTI  : CmpLGtrWordImm;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// For SETCC primitives not supported above (setlt, setle, setge, etc.)
+// define a pattern to generate the right code, as a binary operator
+// (in a manner of speaking.)
+//
+// Notes:
+// 1. This only matches the setcc set of conditionals. Special pattern
+//    matching is used for select conditionals.
+//
+// 2. The "DAG" versions of these classes are almost exclusively used for
+//    i64 comparisons. See the tblgen fundamentals documentation for what
+//    ".ResultInstrs[0]" means; see TargetSelectionDAG.td and the Pattern
+//    class for where ResultInstrs originates.
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// Negated-condition setcc: synthesize `cond` as NOT(complementary compare),
+// i.e. xor the compare result with all-ones (-1). Used below for setne,
+// which is NOT(seteq).
+class SETCCNegCondReg<PatFrag cond, RegisterClass rclass, ValueType inttype,
+                      SPUInstr xorinst, SPUInstr cmpare>:
+  Pat<(cond rclass:$rA, rclass:$rB),
+      (xorinst (cmpare rclass:$rA, rclass:$rB), (inttype -1))>;
+
+// Immediate-operand counterpart of SETCCNegCondReg.
+class SETCCNegCondImm<PatFrag cond, RegisterClass rclass, ValueType inttype,
+                      PatLeaf immpred, SPUInstr xorinst, SPUInstr cmpare>:
+  Pat<(cond rclass:$rA, (inttype immpred:$imm)),
+      (xorinst (cmpare rclass:$rA, (inttype immpred:$imm)), (inttype -1))>;
+
+// setne = NOT(ceq) at each scalar width:
+def : SETCCNegCondReg<setne, R8C, i8, XORBIr8,  CEQBr8>;
+def : SETCCNegCondImm<setne, R8C, i8, immSExt8, XORBIr8, CEQBIr8>;
+
+def : SETCCNegCondReg<setne, R16C, i16, XORHIr16,     CEQHr16>;
+def : SETCCNegCondImm<setne, R16C, i16, i16ImmSExt10, XORHIr16, CEQHIr16>;
+
+def : SETCCNegCondReg<setne, R32C, i32, XORIr32, CEQr32>;
+def : SETCCNegCondImm<setne, R32C, i32, i32ImmSExt10, XORIr32, CEQIr32>;
+
+// Two-compare setcc: synthesize `cond` by combining two primitive compares
+// with a binary op (e.g. setge = OR(cgt, ceq); setlt = NOR(cgt, ceq)).
+class SETCCBinOpReg<PatFrag cond, RegisterClass rclass,
+                    SPUInstr binop, SPUInstr cmpOp1, SPUInstr cmpOp2>:
+    Pat<(cond rclass:$rA, rclass:$rB),
+        (binop (cmpOp1 rclass:$rA, rclass:$rB),
+               (cmpOp2 rclass:$rA, rclass:$rB))>;
+
+// Immediate-operand counterpart of SETCCBinOpReg.
+class SETCCBinOpImm<PatFrag cond, RegisterClass rclass, PatLeaf immpred,
+                    ValueType immtype,
+                    SPUInstr binop, SPUInstr cmpOp1, SPUInstr cmpOp2>:
+    Pat<(cond rclass:$rA, (immtype immpred:$imm)),
+        (binop (cmpOp1 rclass:$rA, (immtype immpred:$imm)),
+               (cmpOp2 rclass:$rA, (immtype immpred:$imm)))>;
+
+// Signed orderings per width:
+//   setge = OR(cgt, ceq); setlt = NOR(cgt, ceq);
+//   setle = NOT(cgt), done by xor-ing the compare with all-ones.
+def : SETCCBinOpReg<setge, R8C, ORr8, CGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setge, R8C, immSExt8, i8, ORr8, CGTBIr8, CEQBIr8>;
+def : SETCCBinOpReg<setlt, R8C, NORr8, CGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setlt, R8C, immSExt8, i8, NORr8, CGTBIr8, CEQBIr8>;
+def : Pat<(setle R8C:$rA, R8C:$rB),
+          (XORBIr8 (CGTBr8 R8C:$rA, R8C:$rB), 0xff)>;
+def :  Pat<(setle R8C:$rA, immU8:$imm),
+           (XORBIr8 (CGTBIr8 R8C:$rA, immU8:$imm), 0xff)>;
+
+def : SETCCBinOpReg<setge, R16C, ORr16, CGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setge, R16C, i16ImmSExt10, i16,
+                    ORr16, CGTHIr16, CEQHIr16>;
+def : SETCCBinOpReg<setlt, R16C, NORr16, CGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setlt, R16C, i16ImmSExt10, i16, NORr16, CGTHIr16, CEQHIr16>;
+def : Pat<(setle R16C:$rA, R16C:$rB),
+          (XORHIr16 (CGTHr16 R16C:$rA, R16C:$rB), 0xffff)>;
+def : Pat<(setle R16C:$rA, i16ImmSExt10:$imm),
+          (XORHIr16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$imm), 0xffff)>;
+
+def : SETCCBinOpReg<setge, R32C, ORr32, CGTr32, CEQr32>;
+def : SETCCBinOpImm<setge, R32C, i32ImmSExt10, i32,
+                    ORr32, CGTIr32, CEQIr32>;
+def : SETCCBinOpReg<setlt, R32C, NORr32, CGTr32, CEQr32>;
+def : SETCCBinOpImm<setlt, R32C, i32ImmSExt10, i32, NORr32, CGTIr32, CEQIr32>;
+def : Pat<(setle R32C:$rA, R32C:$rB),
+          (XORIr32 (CGTr32 R32C:$rA, R32C:$rB), 0xffffffff)>;
+def : Pat<(setle R32C:$rA, i32ImmSExt10:$imm),
+          (XORIr32 (CGTIr32 R32C:$rA, i32ImmSExt10:$imm), 0xffffffff)>;
+
+// Unsigned orderings per width, mirroring the signed block above but built
+// on the logical (CLGT*) compares:
+//   setuge = OR(clgt, ceq); setult = NOR(clgt, ceq); setule = NOT(clgt).
+def : SETCCBinOpReg<setuge, R8C, ORr8, CLGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setuge, R8C, immSExt8, i8, ORr8, CLGTBIr8, CEQBIr8>;
+def : SETCCBinOpReg<setult, R8C, NORr8, CLGTBr8, CEQBr8>;
+def : SETCCBinOpImm<setult, R8C, immSExt8, i8, NORr8, CLGTBIr8, CEQBIr8>;
+def : Pat<(setule R8C:$rA, R8C:$rB),
+          (XORBIr8 (CLGTBr8 R8C:$rA, R8C:$rB), 0xff)>;
+def :  Pat<(setule R8C:$rA, immU8:$imm),
+           (XORBIr8 (CLGTBIr8 R8C:$rA, immU8:$imm), 0xff)>;
+
+def : SETCCBinOpReg<setuge, R16C, ORr16, CLGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setuge, R16C, i16ImmSExt10, i16,
+                    ORr16, CLGTHIr16, CEQHIr16>;
+def : SETCCBinOpReg<setult, R16C, NORr16, CLGTHr16, CEQHr16>;
+def : SETCCBinOpImm<setult, R16C, i16ImmSExt10, i16, NORr16,
+                    CLGTHIr16, CEQHIr16>;
+def : Pat<(setule R16C:$rA, R16C:$rB),
+          (XORHIr16 (CLGTHr16 R16C:$rA, R16C:$rB), 0xffff)>;
+def :  Pat<(setule R16C:$rA, i16ImmSExt10:$imm),
+           (XORHIr16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$imm), 0xffff)>;
+
+def : SETCCBinOpReg<setuge, R32C, ORr32, CLGTr32, CEQr32>;
+def : SETCCBinOpImm<setuge, R32C, i32ImmSExt10, i32,
+                    ORr32, CLGTIr32, CEQIr32>;
+def : SETCCBinOpReg<setult, R32C, NORr32, CLGTr32, CEQr32>;
+def : SETCCBinOpImm<setult, R32C, i32ImmSExt10, i32, NORr32, CLGTIr32, CEQIr32>;
+def : Pat<(setule R32C:$rA, R32C:$rB),
+          (XORIr32 (CLGTr32 R32C:$rA, R32C:$rB), 0xffffffff)>;
+def : Pat<(setule R32C:$rA, i32ImmSExt10:$imm),
+          (XORIr32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$imm), 0xffffffff)>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// select conditional patterns:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// select on a negated condition: instead of inverting the compare result,
+// feed SELB the compare directly but with the true/false operands in
+// (rTrue, rFalse) order, so the inversion is absorbed by operand placement.
+class SELECTNegCondReg<PatFrag cond, RegisterClass rclass, ValueType inttype,
+                       SPUInstr selinstr, SPUInstr cmpare>:
+  Pat<(select (inttype (cond rclass:$rA, rclass:$rB)),
+              rclass:$rTrue, rclass:$rFalse),
+      (selinstr rclass:$rTrue, rclass:$rFalse,
+                (cmpare rclass:$rA, rclass:$rB))>;
+
+// Immediate-operand counterpart of SELECTNegCondReg.
+class SELECTNegCondImm<PatFrag cond, RegisterClass rclass, ValueType inttype,
+                       PatLeaf immpred, SPUInstr selinstr, SPUInstr cmpare>:
+  Pat<(select (inttype (cond rclass:$rA, immpred:$imm)),
+              rclass:$rTrue, rclass:$rFalse),
+      (selinstr rclass:$rTrue, rclass:$rFalse,
+                (cmpare rclass:$rA, immpred:$imm))>;
+
+// Negated-condition selects per width: setne via CEQ*, setle via CGT*,
+// setule via CLGT*.
+def : SELECTNegCondReg<setne, R8C, i8, SELBr8, CEQBr8>;
+def : SELECTNegCondImm<setne, R8C, i8, immSExt8, SELBr8, CEQBIr8>;
+def : SELECTNegCondReg<setle, R8C, i8, SELBr8, CGTBr8>;
+// FIX: the Imm pattern feeds an immediate operand to the compare, so it
+// must use the immediate-form CGTBIr8 (the original passed the
+// register-register CGTBr8), matching every other SELECTNegCondImm here.
+def : SELECTNegCondImm<setle, R8C, i8, immSExt8, SELBr8, CGTBIr8>;
+def : SELECTNegCondReg<setule, R8C, i8, SELBr8, CLGTBr8>;
+def : SELECTNegCondImm<setule, R8C, i8, immU8, SELBr8, CLGTBIr8>;
+
+def : SELECTNegCondReg<setne, R16C, i16, SELBr16, CEQHr16>;
+def : SELECTNegCondImm<setne, R16C, i16, i16ImmSExt10, SELBr16, CEQHIr16>;
+def : SELECTNegCondReg<setle, R16C, i16, SELBr16, CGTHr16>;
+def : SELECTNegCondImm<setle, R16C, i16, i16ImmSExt10, SELBr16, CGTHIr16>;
+def : SELECTNegCondReg<setule, R16C, i16, SELBr16, CLGTHr16>;
+def : SELECTNegCondImm<setule, R16C, i16, i16ImmSExt10, SELBr16, CLGTHIr16>;
+
+def : SELECTNegCondReg<setne, R32C, i32, SELBr32, CEQr32>;
+def : SELECTNegCondImm<setne, R32C, i32, i32ImmSExt10, SELBr32, CEQIr32>;
+def : SELECTNegCondReg<setle, R32C, i32, SELBr32, CGTr32>;
+def : SELECTNegCondImm<setle, R32C, i32, i32ImmSExt10, SELBr32, CGTIr32>;
+def : SELECTNegCondReg<setule, R32C, i32, SELBr32, CLGTr32>;
+def : SELECTNegCondImm<setule, R32C, i32, i32ImmSExt10, SELBr32, CLGTIr32>;
+
+// Two-compare selects: the condition is built as binop(cmpOp1, cmpOp2)
+// (e.g. setge = OR(cgt, ceq)); SELB then takes (rFalse, rTrue, mask).
+class SELECTBinOpReg<PatFrag cond, RegisterClass rclass, ValueType inttype,
+                     SPUInstr selinstr, SPUInstr binop, SPUInstr cmpOp1,
+                     SPUInstr cmpOp2>:
+  Pat<(select (inttype (cond rclass:$rA, rclass:$rB)),
+              rclass:$rTrue, rclass:$rFalse),
+      (selinstr rclass:$rFalse, rclass:$rTrue,
+                (binop (cmpOp1 rclass:$rA, rclass:$rB),
+                       (cmpOp2 rclass:$rA, rclass:$rB)))>;
+
+// Immediate-operand counterpart of SELECTBinOpReg.
+class SELECTBinOpImm<PatFrag cond, RegisterClass rclass, PatLeaf immpred,
+                     ValueType inttype,
+                     SPUInstr selinstr, SPUInstr binop, SPUInstr cmpOp1,
+                     SPUInstr cmpOp2>:
+    Pat<(select (inttype (cond rclass:$rA, (inttype immpred:$imm))),
+                rclass:$rTrue, rclass:$rFalse),
+        (selinstr rclass:$rFalse, rclass:$rTrue,
+                  (binop (cmpOp1 rclass:$rA, (inttype immpred:$imm)),
+                         (cmpOp2 rclass:$rA, (inttype immpred:$imm))))>;
+
+// setge selects (signed, via CGT*/CEQ*) per width:
+def : SELECTBinOpReg<setge, R8C, i8, SELBr8, ORr8, CGTBr8, CEQBr8>;
+def : SELECTBinOpImm<setge, R8C, immSExt8, i8,
+                     SELBr8, ORr8, CGTBIr8, CEQBIr8>;
+
+def : SELECTBinOpReg<setge, R16C, i16, SELBr16, ORr16, CGTHr16, CEQHr16>;
+def : SELECTBinOpImm<setge, R16C, i16ImmSExt10, i16,
+                     SELBr16, ORr16, CGTHIr16, CEQHIr16>;
+
+def : SELECTBinOpReg<setge, R32C, i32, SELBr32, ORr32, CGTr32, CEQr32>;
+def : SELECTBinOpImm<setge, R32C, i32ImmSExt10, i32,
+                     SELBr32, ORr32, CGTIr32, CEQIr32>;
+
+// setuge selects (unsigned, via CLGT*/CEQ*) per width:
+def : SELECTBinOpReg<setuge, R8C, i8, SELBr8, ORr8, CLGTBr8, CEQBr8>;
+def : SELECTBinOpImm<setuge, R8C, immSExt8, i8,
+                     SELBr8, ORr8, CLGTBIr8, CEQBIr8>;
+
+def : SELECTBinOpReg<setuge, R16C, i16, SELBr16, ORr16, CLGTHr16, CEQHr16>;
+def : SELECTBinOpImm<setuge, R16C, i16ImmUns10, i16,
+                     SELBr16, ORr16, CLGTHIr16, CEQHIr16>;
+
+def : SELECTBinOpReg<setuge, R32C, i32, SELBr32, ORr32, CLGTr32, CEQr32>;
+def : SELECTBinOpImm<setuge, R32C, i32ImmUns10, i32,
+                     SELBr32, ORr32, CLGTIr32, CEQIr32>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// Call instructions. R0 doubles as the link register ($lr); R0-R79 are
+// listed as clobbered by any call.
+let isCall = 1,
+  // All calls clobber the non-callee-saved registers:
+  Defs = [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9,
+          R10,R11,R12,R13,R14,R15,R16,R17,R18,R19,
+          R20,R21,R22,R23,R24,R25,R26,R27,R28,R29,
+          R30,R31,R32,R33,R34,R35,R36,R37,R38,R39,
+          R40,R41,R42,R43,R44,R45,R46,R47,R48,R49,
+          R50,R51,R52,R53,R54,R55,R56,R57,R58,R59,
+          R60,R61,R62,R63,R64,R65,R66,R67,R68,R69,
+          R70,R71,R72,R73,R74,R75,R76,R77,R78,R79],
+  // All of these instructions use $lr (aka $0)
+  Uses = [R0]  in {
+  // Branch relative and set link: Used if we actually know that the target
+  // is within [-32768, 32767] bytes of the target
+  def BRSL:
+    BranchSetLink<0b011001100, (outs), (ins relcalltarget:$func, variable_ops),
+      "brsl\t$$lr, $func",
+      [(SPUcall (SPUpcrel tglobaladdr:$func, 0))]>;
+
+  // Branch absolute and set link: Used if we actually know that the target
+  // is an absolute address
+  def BRASL:
+    BranchSetLink<0b011001100, (outs), (ins calltarget:$func, variable_ops),
+      "brasl\t$$lr, $func",
+      [(SPUcall (SPUaform tglobaladdr:$func, 0))]>;
+
+  // Branch indirect and set link if external data. These instructions are not
+  // actually generated, matched by an intrinsic:
+  def BISLED_00: BISLEDForm<0b11, "bisled\t$$lr, $func", [/* empty pattern */]>;
+  def BISLED_E0: BISLEDForm<0b10, "bisled\t$$lr, $func", [/* empty pattern */]>;
+  def BISLED_0D: BISLEDForm<0b01, "bisled\t$$lr, $func", [/* empty pattern */]>;
+  def BISLED_ED: BISLEDForm<0b00, "bisled\t$$lr, $func", [/* empty pattern */]>;
+
+  // Branch indirect and set link. This is the "X-form" address version of a
+  // function call
+  def BISL:
+    BIForm<0b10010101100, "bisl\t$$lr, $func", [(SPUcall R32C:$func)]>;
+}
+
+// Support calls to external symbols (same lowering as the tglobaladdr
+// patterns carried by BRSL/BRASL above):
+def : Pat<(SPUcall (SPUpcrel texternalsym:$func, 0)),
+          (BRSL texternalsym:$func)>;
+
+def : Pat<(SPUcall (SPUaform texternalsym:$func, 0)),
+          (BRASL texternalsym:$func)>;
+
+// Unconditional branches:
+// Branch instructions. Conditional branches come in word (BRNZ/BRZ) and
+// halfword (BRHNZ/BRHZ) forms, each with register and vector variants; the
+// *NZ reg forms carry the brcond patterns, the *Z forms are pattern-less
+// and selected by the explicit Pat<>s that follow this block.
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1 in {
+  let isBarrier = 1 in {
+    def BR :
+      UncondBranch<0b001001100, (outs), (ins brtarget:$dest),
+        "br\t$dest",
+        [(br bb:$dest)]>;
+
+    // Unconditional, absolute address branch
+    def BRA:
+      UncondBranch<0b001100000, (outs), (ins brtarget:$dest),
+        "bra\t$dest",
+        [/* no pattern */]>;
+
+    // Indirect branch
+    def BI:
+      BIForm<0b00010101100, "bi\t$func", [(brind R32C:$func)]>;
+  }
+
+  // Conditional branches:
+  class BRNZInst<dag IOL, list<dag> pattern>:
+    RI16Form<0b010000100, (outs), IOL, "brnz\t$rCond,$dest",
+             BranchResolv, pattern>;
+
+  class BRNZRegInst<RegisterClass rclass>:
+    BRNZInst<(ins rclass:$rCond, brtarget:$dest),
+             [(brcond rclass:$rCond, bb:$dest)]>;
+
+  class BRNZVecInst<ValueType vectype>:
+    BRNZInst<(ins VECREG:$rCond, brtarget:$dest),
+             [(brcond (vectype VECREG:$rCond), bb:$dest)]>;
+
+  multiclass BranchNotZero {
+    def v4i32 : BRNZVecInst<v4i32>;
+    def r32   : BRNZRegInst<R32C>;
+  }
+
+  defm BRNZ : BranchNotZero;
+
+  class BRZInst<dag IOL, list<dag> pattern>:
+    RI16Form<0b000000100, (outs), IOL, "brz\t$rT,$dest",
+             BranchResolv, pattern>;
+
+  class BRZRegInst<RegisterClass rclass>:
+    BRZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>;
+
+  class BRZVecInst<ValueType vectype>:
+    BRZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>;
+
+  multiclass BranchZero {
+    def v4i32: BRZVecInst<v4i32>;
+    def r32:   BRZRegInst<R32C>;
+  }
+
+  defm BRZ: BranchZero;
+
+  // Note: LLVM doesn't do branch conditional, indirect. Otherwise these would
+  // be useful:
+  /*
+  class BINZInst<dag IOL, list<dag> pattern>:
+   BICondForm<0b10010100100, (outs), IOL, "binz\t$rA, $dest", pattern>;
+
+  class BINZRegInst<RegisterClass rclass>:
+    BINZInst<(ins rclass:$rA, brtarget:$dest),
+             [(brcond rclass:$rA, R32C:$dest)]>;
+
+  class BINZVecInst<ValueType vectype>:
+    BINZInst<(ins VECREG:$rA, R32C:$dest),
+             [(brcond (vectype VECREG:$rA), R32C:$dest)]>;
+
+  multiclass BranchNotZeroIndirect {
+    def v4i32: BINZVecInst<v4i32>;
+    def r32:   BINZRegInst<R32C>;
+  }
+
+  defm BINZ: BranchNotZeroIndirect;
+
+  class BIZInst<dag IOL, list<dag> pattern>:
+    BICondForm<0b00010100100, (outs), IOL, "biz\t$rA, $func", pattern>;
+
+  class BIZRegInst<RegisterClass rclass>:
+    BIZInst<(ins rclass:$rA, R32C:$func), [/* no pattern */]>;
+
+  class BIZVecInst<ValueType vectype>:
+    BIZInst<(ins VECREG:$rA, R32C:$func), [/* no pattern */]>;
+
+  multiclass BranchZeroIndirect {
+    def v4i32: BIZVecInst<v4i32>;
+    def r32:   BIZRegInst<R32C>;
+  }
+
+  defm BIZ: BranchZeroIndirect;
+  */
+
+  // Halfword (i16) conditional branches:
+  class BRHNZInst<dag IOL, list<dag> pattern>:
+    RI16Form<0b011000100, (outs), IOL, "brhnz\t$rCond,$dest", BranchResolv,
+             pattern>;
+
+  class BRHNZRegInst<RegisterClass rclass>:
+    BRHNZInst<(ins rclass:$rCond, brtarget:$dest),
+              [(brcond rclass:$rCond, bb:$dest)]>;
+
+  class BRHNZVecInst<ValueType vectype>:
+    BRHNZInst<(ins VECREG:$rCond, brtarget:$dest), [/* no pattern */]>;
+
+  multiclass BranchNotZeroHalfword {
+    def v8i16: BRHNZVecInst<v8i16>;
+    def r16:   BRHNZRegInst<R16C>;
+  }
+
+  defm BRHNZ: BranchNotZeroHalfword;
+
+  class BRHZInst<dag IOL, list<dag> pattern>:
+    RI16Form<0b001000100, (outs), IOL, "brhz\t$rT,$dest", BranchResolv,
+             pattern>;
+
+  class BRHZRegInst<RegisterClass rclass>:
+    BRHZInst<(ins rclass:$rT, brtarget:$dest), [/* no pattern */]>;
+
+  class BRHZVecInst<ValueType vectype>:
+    BRHZInst<(ins VECREG:$rT, brtarget:$dest), [/* no pattern */]>;
+
+  multiclass BranchZeroHalfword {
+    def v8i16: BRHZVecInst<v8i16>;
+    def r16:   BRHZRegInst<R16C>;
+  }
+
+  defm BRHZ: BranchZeroHalfword;
+}
+
+//===----------------------------------------------------------------------===//
+// setcc and brcond patterns:
+//===----------------------------------------------------------------------===//
+
+// Fold compare-against-zero directly into the branch-if-zero /
+// branch-if-not-zero instructions, skipping the explicit compare:
+def : Pat<(brcond (i16 (seteq R16C:$rA, 0)), bb:$dest),
+          (BRHZr16 R16C:$rA, bb:$dest)>;
+def : Pat<(brcond (i16 (setne R16C:$rA, 0)), bb:$dest),
+          (BRHNZr16 R16C:$rA, bb:$dest)>;
+
+def : Pat<(brcond (i32 (seteq R32C:$rA, 0)), bb:$dest),
+          (BRZr32 R32C:$rA, bb:$dest)>;
+def : Pat<(brcond (i32 (setne R32C:$rA, 0)), bb:$dest),
+          (BRNZr32 R32C:$rA, bb:$dest)>;
+
+// brcond on (seteq/setne): compare with CEQ*, then branch on the compare
+// result being non-zero (seteq -> BR*NZ) or zero (setne -> BR*Z).
+multiclass BranchCondEQ<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32>
+{
+  def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+                  (brinst16 (CEQHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>;
+
+  // FIX: $rB belongs to register class R16C; the original "R16:$rB" named
+  // the physical register R16 (every sibling pattern here uses R16C).
+  def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+                (brinst16 (CEQHr16 R16C:$rA, R16C:$rB), bb:$dest)>;
+
+  def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+                   (brinst32 (CEQIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>;
+
+  def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+                (brinst32 (CEQr32 R32C:$rA, R32C:$rB), bb:$dest)>;
+}
+
+defm BRCONDeq : BranchCondEQ<seteq, BRHNZr16, BRNZr32>;
+defm BRCONDne : BranchCondEQ<setne, BRHZr16, BRZr32>;
+
+multiclass BranchCondLGT<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32>
+{
+  def r16imm : Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+                   (brinst16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>;
+
+  def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+                (brinst16 (CLGTHr16 R16C:$rA, R16:$rB), bb:$dest)>;
+
+  def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+                   (brinst32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>;
+
+  def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+                (brinst32 (CLGTr32 R32C:$rA, R32C:$rB), bb:$dest)>;
+}
+
+defm BRCONDugt : BranchCondLGT<setugt, BRHNZr16, BRNZr32>;
+defm BRCONDule : BranchCondLGT<setule, BRHZr16, BRZr32>;
+
+multiclass BranchCondLGTEQ<PatFrag cond, SPUInstr orinst16, SPUInstr brinst16,
+                           SPUInstr orinst32, SPUInstr brinst32>
+{
+  def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+                  (brinst16 (orinst16 (CLGTHIr16 R16C:$rA, i16ImmSExt10:$val),
+                                      (CEQHIr16 R16C:$rA, i16ImmSExt10:$val)),
+                            bb:$dest)>;
+
+  def r16: Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+               (brinst16 (orinst16 (CLGTHr16 R16C:$rA, R16:$rB),
+                                   (CEQHr16 R16C:$rA, R16:$rB)),
+                         bb:$dest)>;
+
+  def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+                   (brinst32 (orinst32 (CLGTIr32 R32C:$rA, i32ImmSExt10:$val),
+                                       (CEQIr32 R32C:$rA, i32ImmSExt10:$val)),
+                             bb:$dest)>;
+
+  def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+                (brinst32 (orinst32 (CLGTr32 R32C:$rA, R32C:$rB),
+                                    (CEQr32 R32C:$rA, R32C:$rB)),
+                          bb:$dest)>;
+}
+
+defm BRCONDuge : BranchCondLGTEQ<setuge, ORr16, BRHNZr16, ORr32, BRNZr32>;
+defm BRCONDult : BranchCondLGTEQ<setult, ORr16, BRHZr16, ORr32, BRZr32>;
+
+multiclass BranchCondGT<PatFrag cond, SPUInstr brinst16, SPUInstr brinst32>
+{
+  def r16imm : Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+                   (brinst16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$val), bb:$dest)>;
+
+  def r16 : Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+                (brinst16 (CGTHr16 R16C:$rA, R16:$rB), bb:$dest)>;
+
+  def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+                   (brinst32 (CGTIr32 R32C:$rA, i32ImmSExt10:$val), bb:$dest)>;
+
+  def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+                (brinst32 (CGTr32 R32C:$rA, R32C:$rB), bb:$dest)>;
+}
+
+defm BRCONDgt : BranchCondGT<setgt, BRHNZr16, BRNZr32>;
+defm BRCONDle : BranchCondGT<setle, BRHZr16, BRZr32>;
+
+multiclass BranchCondGTEQ<PatFrag cond, SPUInstr orinst16, SPUInstr brinst16,
+                          SPUInstr orinst32, SPUInstr brinst32>
+{
+  def r16imm: Pat<(brcond (i16 (cond R16C:$rA, i16ImmSExt10:$val)), bb:$dest),
+                  (brinst16 (orinst16 (CGTHIr16 R16C:$rA, i16ImmSExt10:$val),
+                                      (CEQHIr16 R16C:$rA, i16ImmSExt10:$val)),
+                            bb:$dest)>;
+
+  def r16: Pat<(brcond (i16 (cond R16C:$rA, R16C:$rB)), bb:$dest),
+               (brinst16 (orinst16 (CGTHr16 R16C:$rA, R16:$rB),
+                                   (CEQHr16 R16C:$rA, R16:$rB)),
+                         bb:$dest)>;
+
+  def r32imm : Pat<(brcond (i32 (cond R32C:$rA, i32ImmSExt10:$val)), bb:$dest),
+                   (brinst32 (orinst32 (CGTIr32 R32C:$rA, i32ImmSExt10:$val),
+                                       (CEQIr32 R32C:$rA, i32ImmSExt10:$val)),
+                             bb:$dest)>;
+
+  def r32 : Pat<(brcond (i32 (cond R32C:$rA, R32C:$rB)), bb:$dest),
+                (brinst32 (orinst32 (CGTr32 R32C:$rA, R32C:$rB),
+                                    (CEQr32 R32C:$rA, R32C:$rB)),
+                          bb:$dest)>;
+}
+
+defm BRCONDge : BranchCondGTEQ<setge, ORr16, BRHNZr16, ORr32, BRNZr32>;
+defm BRCONDlt : BranchCondGTEQ<setlt, ORr16, BRHZr16, ORr32, BRZr32>;
+
+let isTerminator = 1, isBarrier = 1 in {
+  let isReturn = 1 in {
+    def RET:
+        RETForm<"bi\t$$lr", [(retflag)]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Single precision floating point instructions
+//===----------------------------------------------------------------------===//
+
+class FAInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b01011000100, OOL, IOL, "fa\t$rT, $rA, $rB",
+           SPrecFP, pattern>;
+
+class FAVecInst<ValueType vectype>:
+    FAInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+             [(set (vectype VECREG:$rT),
+                   (fadd (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+
+multiclass SFPAdd
+{
+  def v4f32: FAVecInst<v4f32>;
+  def f32:   FAInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+                    [(set R32FP:$rT, (fadd R32FP:$rA, R32FP:$rB))]>;
+}
+
+defm FA : SFPAdd;
+
+class FSInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b01011000100, OOL, IOL, "fs\t$rT, $rA, $rB",
+           SPrecFP, pattern>;
+
+class FSVecInst<ValueType vectype>:
+    FSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+           [(set (vectype VECREG:$rT),
+                 (fsub (vectype VECREG:$rA), (vectype VECREG:$rB)))]>;
+
+multiclass SFPSub
+{
+  def v4f32: FSVecInst<v4f32>;
+  def f32:   FSInst<(outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+                    [(set R32FP:$rT, (fsub R32FP:$rA, R32FP:$rB))]>;
+}
+
+defm FS : SFPSub;
+
+// Floating point reciprocal estimate
+
+class FRESTInst<dag OOL, dag IOL>:
+  RRForm_1<0b00110111000, OOL, IOL,
+           "frest\t$rT, $rA", SPrecFP,
+           [/* no pattern */]>;
+
+def FRESTv4f32 :
+    FRESTInst<(outs VECREG:$rT), (ins VECREG:$rA)>;
+
+def FRESTf32 :
+    FRESTInst<(outs R32FP:$rT), (ins R32FP:$rA)>;
+
+// Floating point interpolate (used in conjunction with reciprocal estimate)
+def FIv4f32 :
+    RRForm<0b00101011110, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "fi\t$rT, $rA, $rB", SPrecFP,
+      [/* no pattern */]>;
+
+def FIf32 :
+    RRForm<0b00101011110, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+      "fi\t$rT, $rA, $rB", SPrecFP,
+      [/* no pattern */]>;
+
+//--------------------------------------------------------------------------
+// Basic single precision floating point comparisons:
+//
+// Note: There is no support on SPU for single precision NaN. Consequently,
+// ordered and unordered comparisons are the same.
+//--------------------------------------------------------------------------
+
+def FCEQf32 :
+    RRForm<0b01000011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+      "fceq\t$rT, $rA, $rB", SPrecFP,
+      [(set R32C:$rT, (setueq R32FP:$rA, R32FP:$rB))]>;
+
+def : Pat<(setoeq R32FP:$rA, R32FP:$rB),
+          (FCEQf32 R32FP:$rA, R32FP:$rB)>;
+
+def FCMEQf32 :
+    RRForm<0b01010011110, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+      "fcmeq\t$rT, $rA, $rB", SPrecFP,
+      [(set R32C:$rT, (setueq (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+
+def : Pat<(setoeq (fabs R32FP:$rA), (fabs R32FP:$rB)),
+          (FCMEQf32 R32FP:$rA, R32FP:$rB)>;
+
+def FCGTf32 :
+    RRForm<0b01000011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+      "fcgt\t$rT, $rA, $rB", SPrecFP,
+      [(set R32C:$rT, (setugt R32FP:$rA, R32FP:$rB))]>;
+
+def : Pat<(setugt R32FP:$rA, R32FP:$rB),
+          (FCGTf32 R32FP:$rA, R32FP:$rB)>;
+
+def FCMGTf32 :
+    RRForm<0b01010011010, (outs R32C:$rT), (ins R32FP:$rA, R32FP:$rB),
+      "fcmgt\t$rT, $rA, $rB", SPrecFP,
+      [(set R32C:$rT, (setugt (fabs R32FP:$rA), (fabs R32FP:$rB)))]>;
+
+def : Pat<(setugt (fabs R32FP:$rA), (fabs R32FP:$rB)),
+          (FCMGTf32 R32FP:$rA, R32FP:$rB)>;
+
+//--------------------------------------------------------------------------
+// Single precision floating point comparisons and SETCC equivalents:
+//--------------------------------------------------------------------------
+
+def : SETCCNegCondReg<setune, R32FP, i32, XORIr32, FCEQf32>;
+def : SETCCNegCondReg<setone, R32FP, i32, XORIr32, FCEQf32>;
+
+def : SETCCBinOpReg<setuge, R32FP, ORr32, FCGTf32, FCEQf32>;
+def : SETCCBinOpReg<setoge, R32FP, ORr32, FCGTf32, FCEQf32>;
+
+def : SETCCBinOpReg<setult, R32FP, NORr32, FCGTf32, FCEQf32>;
+def : SETCCBinOpReg<setolt, R32FP, NORr32, FCGTf32, FCEQf32>;
+
+def : Pat<(setule R32FP:$rA, R32FP:$rB),
+          (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>;
+def : Pat<(setole R32FP:$rA, R32FP:$rB),
+          (XORIr32 (FCGTf32 R32FP:$rA, R32FP:$rB), 0xffffffff)>;
+
+// FP Status and Control Register Write
+// Why isn't rT a don't care in the ISA?
+// Should we create a special RRForm_3 for this guy and zero out the rT?
+def FSCRWf32 :
+    RRForm_1<0b01011101110, (outs R32FP:$rT), (ins R32FP:$rA),
+      "fscrwr\t$rA", SPrecFP,
+      [/* This instruction requires an intrinsic. Note: rT is unused. */]>;
+
+// FP Status and Control Register Read
+def FSCRRf32 :
+    RRForm_2<0b01011101110, (outs R32FP:$rT), (ins),
+      "fscrrd\t$rT", SPrecFP,
+      [/* This instruction requires an intrinsic */]>;
+
+// llvm instruction space
+// How do these map onto cell instructions?
+// fdiv rA rB
+//   frest rC rB        # c = 1/b (both lines)
+//   fi rC rB rC
+//   fm rD rA rC        # d = a * 1/b
+//   fnms rB rD rB rA # b = - (d * b - a) --should == 0 in a perfect world
+//   fma rB rB rC rD            # b = b * c + d
+//                              = -(d *b -a) * c + d
+//                              = a * c - c ( a *b *c - a)
+
+// fcopysign (???)
+
+// Library calls:
+// These llvm instructions will actually map to library calls.
+// All that's needed, then, is to check that the appropriate library is
+// imported and do a brsl to the proper function name.
+// frem # fmod(x, y): x - (x/y) * y
+// (Note: fmod(double, double), fmodf(float, float))
+// fsqrt?
+// fsin?
+// fcos?
+// Unimplemented SPU instruction space
+// floating reciprocal absolute square root estimate (frsqest)
+
+// The following are probably just intrinsics
+// status and control register write
+// status and control register read
+
+//--------------------------------------
+// Floating point multiply instructions
+//--------------------------------------
+
+def FMv4f32:
+    RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "fm\t$rT, $rA, $rB", SPrecFP,
+      [(set (v4f32 VECREG:$rT), (fmul (v4f32 VECREG:$rA),
+                                      (v4f32 VECREG:$rB)))]>;
+
+def FMf32 :
+    RRForm<0b01100011010, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB),
+      "fm\t$rT, $rA, $rB", SPrecFP,
+      [(set R32FP:$rT, (fmul R32FP:$rA, R32FP:$rB))]>;
+
+// Floating point multiply and add
+// e.g. d = c + (a * b)
+def FMAv4f32:
+    RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+      "fma\t$rT, $rA, $rB, $rC", SPrecFP,
+      [(set (v4f32 VECREG:$rT),
+            (fadd (v4f32 VECREG:$rC),
+                  (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB))))]>;
+
+def FMAf32:
+    RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC),
+      "fma\t$rT, $rA, $rB, $rC", SPrecFP,
+      [(set R32FP:$rT, (fadd R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>;
+
+// FP multiply and subtract
+// Subtracts value in rC from product
+// res = a * b - c
+def FMSv4f32 :
+    RRRForm<0b0111, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+      "fms\t$rT, $rA, $rB, $rC", SPrecFP,
+      [(set (v4f32 VECREG:$rT),
+            (fsub (fmul (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)),
+                  (v4f32 VECREG:$rC)))]>;
+
+def FMSf32 :
+    RRRForm<0b0111, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC),
+      "fms\t$rT, $rA, $rB, $rC", SPrecFP,
+      [(set R32FP:$rT,
+            (fsub (fmul R32FP:$rA, R32FP:$rB), R32FP:$rC))]>;
+
+// Floating Negative Multiply and Subtract
+// Subtracts product from value in rC
+// res = fneg(fms a b c)
+//     = - (a * b - c)
+//     = c - a * b
+// NOTE: subtraction order
+// fsub a b = a - b
+// fs a b = b - a?
+def FNMSf32 :
+    RRRForm<0b1101, (outs R32FP:$rT), (ins R32FP:$rA, R32FP:$rB, R32FP:$rC),
+      "fnms\t$rT, $rA, $rB, $rC", SPrecFP,
+      [(set R32FP:$rT, (fsub R32FP:$rC, (fmul R32FP:$rA, R32FP:$rB)))]>;
+
+def FNMSv4f32 :
+    RRRForm<0b1101, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+      "fnms\t$rT, $rA, $rB, $rC", SPrecFP,
+      [(set (v4f32 VECREG:$rT),
+            (fsub (v4f32 VECREG:$rC),
+                  (fmul (v4f32 VECREG:$rA),
+                        (v4f32 VECREG:$rB))))]>;
+
+//--------------------------------------
+// Floating Point Conversions
+// Signed conversions:
+def CSiFv4f32:
+    CVTIntFPForm<0b0101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+      "csflt\t$rT, $rA, 0", SPrecFP,
+      [(set (v4f32 VECREG:$rT), (sint_to_fp (v4i32 VECREG:$rA)))]>;
+
+// Convert signed integer to floating point
+def CSiFf32 :
+    CVTIntFPForm<0b0101101110, (outs R32FP:$rT), (ins R32C:$rA),
+      "csflt\t$rT, $rA, 0", SPrecFP,
+      [(set R32FP:$rT, (sint_to_fp R32C:$rA))]>;
+
+// Convert unsigned int to float
+def CUiFv4f32 :
+    CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+      "cuflt\t$rT, $rA, 0", SPrecFP,
+      [(set (v4f32 VECREG:$rT), (uint_to_fp (v4i32 VECREG:$rA)))]>;
+
+def CUiFf32 :
+    CVTIntFPForm<0b1101101110, (outs R32FP:$rT), (ins R32C:$rA),
+      "cuflt\t$rT, $rA, 0", SPrecFP,
+      [(set R32FP:$rT, (uint_to_fp R32C:$rA))]>;
+
+// Convert float to unsigned int
+// Assume that scale = 0
+
+def CFUiv4f32 :
+    CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+      "cfltu\t$rT, $rA, 0", SPrecFP,
+      [(set (v4i32 VECREG:$rT), (fp_to_uint (v4f32 VECREG:$rA)))]>;
+
+def CFUif32 :
+    CVTIntFPForm<0b1101101110, (outs R32C:$rT), (ins R32FP:$rA),
+      "cfltu\t$rT, $rA, 0", SPrecFP,
+      [(set R32C:$rT, (fp_to_uint R32FP:$rA))]>;
+
+// Convert float to signed int
+// Assume that scale = 0
+
+def CFSiv4f32 :
+    CVTIntFPForm<0b1101101110, (outs VECREG:$rT), (ins VECREG:$rA),
+      "cflts\t$rT, $rA, 0", SPrecFP,
+      [(set (v4i32 VECREG:$rT), (fp_to_sint (v4f32 VECREG:$rA)))]>;
+
+def CFSif32 :
+    CVTIntFPForm<0b1101101110, (outs R32C:$rT), (ins R32FP:$rA),
+      "cflts\t$rT, $rA, 0", SPrecFP,
+      [(set R32C:$rT, (fp_to_sint R32FP:$rA))]>;
+
+//===----------------------------------------------------------------------==//
+// Single<->Double precision conversions
+//===----------------------------------------------------------------------==//
+
+// NOTE: We use "vec" name suffix here to avoid confusion (e.g. input is a
+// v4f32, output is v2f64--which goes in the name?)
+
+// Floating point extend single to double
+// NOTE: Not sure if passing in v4f32 to FESDvec is correct since it
+// operates on two double-word slots (i.e. 1st and 3rd fp numbers
+// are ignored).
+def FESDvec :
+    RRForm_1<0b00011101110, (outs VECREG:$rT), (ins VECREG:$rA),
+      "fesd\t$rT, $rA", SPrecFP,
+      [(set (v2f64 VECREG:$rT), (fextend (v4f32 VECREG:$rA)))]>;
+
+def FESDf32 :
+    RRForm_1<0b00011101110, (outs R64FP:$rT), (ins R32FP:$rA),
+      "fesd\t$rT, $rA", SPrecFP,
+      [(set R64FP:$rT, (fextend R32FP:$rA))]>;
+
+// Floating point round double to single
+//def FRDSvec :
+//    RRForm_1<0b10011101110, (outs VECREG:$rT), (ins VECREG:$rA),
+//      "frds\t$rT, $rA,", SPrecFP,
+//      [(set (v4f32 R32FP:$rT), (fround (v2f64 R64FP:$rA)))]>;
+
+def FRDSf64 :
+    RRForm_1<0b10011101110, (outs R32FP:$rT), (ins R64FP:$rA),
+      "frds\t$rT, $rA", SPrecFP,
+      [(set R32FP:$rT, (fround R64FP:$rA))]>;
+
+//ToDo include anyextend?
+
+//===----------------------------------------------------------------------==//
+// Double precision floating point instructions
+//===----------------------------------------------------------------------==//
+def FAf64 :
+    RRForm<0b00110011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+      "dfa\t$rT, $rA, $rB", DPrecFP,
+      [(set R64FP:$rT, (fadd R64FP:$rA, R64FP:$rB))]>;
+
+def FAv2f64 :
+    RRForm<0b00110011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "dfa\t$rT, $rA, $rB", DPrecFP,
+      [(set (v2f64 VECREG:$rT), (fadd (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>;
+
+def FSf64 :
+    RRForm<0b10100011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+      "dfs\t$rT, $rA, $rB", DPrecFP,
+      [(set R64FP:$rT, (fsub R64FP:$rA, R64FP:$rB))]>;
+
+def FSv2f64 :
+    RRForm<0b10100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "dfs\t$rT, $rA, $rB", DPrecFP,
+      [(set (v2f64 VECREG:$rT),
+            (fsub (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>;
+
+def FMf64 :
+    RRForm<0b01100011010, (outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB),
+      "dfm\t$rT, $rA, $rB", DPrecFP,
+      [(set R64FP:$rT, (fmul R64FP:$rA, R64FP:$rB))]>;
+
+def FMv2f64:
+    RRForm<0b00100011010, (outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB),
+      "dfm\t$rT, $rA, $rB", DPrecFP,
+      [(set (v2f64 VECREG:$rT),
+            (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)))]>;
+
+def FMAf64:
+    RRForm<0b00111010110, (outs R64FP:$rT),
+                          (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+      "dfma\t$rT, $rA, $rB", DPrecFP,
+      [(set R64FP:$rT, (fadd R64FP:$rC, (fmul R64FP:$rA, R64FP:$rB)))]>,
+    RegConstraint<"$rC = $rT">,
+    NoEncode<"$rC">;
+
+def FMAv2f64:
+    RRForm<0b00111010110, (outs VECREG:$rT),
+                          (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+      "dfma\t$rT, $rA, $rB", DPrecFP,
+      [(set (v2f64 VECREG:$rT),
+            (fadd (v2f64 VECREG:$rC),
+                  (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB))))]>,
+    RegConstraint<"$rC = $rT">,
+    NoEncode<"$rC">;
+
+def FMSf64 :
+    RRForm<0b10111010110, (outs R64FP:$rT),
+                          (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+      "dfms\t$rT, $rA, $rB", DPrecFP,
+      [(set R64FP:$rT, (fsub (fmul R64FP:$rA, R64FP:$rB), R64FP:$rC))]>,
+    RegConstraint<"$rC = $rT">,
+    NoEncode<"$rC">;
+
+def FMSv2f64 :
+    RRForm<0b10111010110, (outs VECREG:$rT),
+                          (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+      "dfms\t$rT, $rA, $rB", DPrecFP,
+      [(set (v2f64 VECREG:$rT),
+            (fsub (fmul (v2f64 VECREG:$rA), (v2f64 VECREG:$rB)),
+                  (v2f64 VECREG:$rC)))]>;
+
+// DFNMS: - (a * b - c)
+// - (a * b) + c => c - (a * b)
+
+class DFNMSInst<dag OOL, dag IOL, list<dag> pattern>:
+    RRForm<0b01111010110, OOL, IOL, "dfnms\t$rT, $rA, $rB",
+           DPrecFP, pattern>,
+    RegConstraint<"$rC = $rT">,
+    NoEncode<"$rC">;
+
+class DFNMSVecInst<list<dag> pattern>:
+    DFNMSInst<(outs VECREG:$rT), (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+              pattern>;
+
+class DFNMSRegInst<list<dag> pattern>:
+    DFNMSInst<(outs R64FP:$rT), (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+             pattern>;
+
+multiclass DFMultiplySubtract
+{
+  def v2f64 : DFNMSVecInst<[(set (v2f64 VECREG:$rT), 
+                                 (fsub (v2f64 VECREG:$rC),
+                                       (fmul (v2f64 VECREG:$rA),
+                                             (v2f64 VECREG:$rB))))]>;
+
+  def f64 : DFNMSRegInst<[(set R64FP:$rT,
+                               (fsub R64FP:$rC,
+                                     (fmul R64FP:$rA, R64FP:$rB)))]>;
+}
+
+defm DFNMS : DFMultiplySubtract;
+
+// - (a * b + c)
+// - (a * b) - c
+def FNMAf64 :
+    RRForm<0b11111010110, (outs R64FP:$rT),
+                          (ins R64FP:$rA, R64FP:$rB, R64FP:$rC),
+      "dfnma\t$rT, $rA, $rB", DPrecFP,
+      [(set R64FP:$rT, (fneg (fadd R64FP:$rC, (fmul R64FP:$rA, R64FP:$rB))))]>,
+    RegConstraint<"$rC = $rT">,
+    NoEncode<"$rC">;
+
+def FNMAv2f64 :
+    RRForm<0b11111010110, (outs VECREG:$rT),
+                          (ins VECREG:$rA, VECREG:$rB, VECREG:$rC),
+      "dfnma\t$rT, $rA, $rB", DPrecFP,
+      [(set (v2f64 VECREG:$rT),
+            (fneg (fadd (v2f64 VECREG:$rC),
+                        (fmul (v2f64 VECREG:$rA),
+                              (v2f64 VECREG:$rB)))))]>,
+    RegConstraint<"$rC = $rT">,
+    NoEncode<"$rC">;
+
+//===----------------------------------------------------------------------==//
+// Floating point negation and absolute value
+//===----------------------------------------------------------------------==//
+
+def : Pat<(fneg (v4f32 VECREG:$rA)),
+          (XORfnegvec (v4f32 VECREG:$rA),
+                      (v4f32 (ILHUv4i32 0x8000)))>;
+
+def : Pat<(fneg R32FP:$rA),
+          (XORfneg32 R32FP:$rA, (ILHUr32 0x8000))>;
+
+// Floating point absolute value
+// Note: f64 fabs is custom-selected.
+
+def : Pat<(fabs R32FP:$rA),
+          (ANDfabs32 R32FP:$rA, (IOHLr32 (ILHUr32 0x7fff), 0xffff))>;
+
+def : Pat<(fabs (v4f32 VECREG:$rA)),
+          (ANDfabsvec (v4f32 VECREG:$rA),
+                      (IOHLv4i32 (ILHUv4i32 0x7fff), 0xffff))>;
+
+//===----------------------------------------------------------------------===//
+// Hint for branch instructions:
+//===----------------------------------------------------------------------===//
+
+/* def HBR : SPUInstr<(outs), (ins), "hbr\t" */
+
+//===----------------------------------------------------------------------===//
+// Execution, Load NOP (execute NOPs belong in even pipeline, load NOPs belong
+// in the odd pipeline)
+//===----------------------------------------------------------------------===//
+
+def ENOP : SPUInstr<(outs), (ins), "enop", ExecNOP> {
+  let Pattern = [];
+
+  let Inst{0-10} = 0b10000000010;
+  let Inst{11-17} = 0;
+  let Inst{18-24} = 0;
+  let Inst{25-31} = 0;
+}
+
+def LNOP : SPUInstr<(outs), (ins), "lnop", LoadNOP> {
+  let Pattern = [];
+
+  let Inst{0-10} = 0b10000000000;
+  let Inst{11-17} = 0;
+  let Inst{18-24} = 0;
+  let Inst{25-31} = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Bit conversions (type conversions between vector/packed types)
+// NOTE: Promotions are handled using the XS* instructions.
+//===----------------------------------------------------------------------===//
+def : Pat<(v16i8 (bitconvert (v8i16 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VECREG:$src))), (v16i8 VECREG:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VECREG:$src))), (v16i8 VECREG:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VECREG:$src))), (v8i16 VECREG:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VECREG:$src))), (v8i16 VECREG:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VECREG:$src))), (v4i32 VECREG:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VECREG:$src))), (v4i32 VECREG:$src)>;
+
+def : Pat<(v2i64 (bitconvert (v16i8 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VECREG:$src))), (v2i64 VECREG:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VECREG:$src))), (v2i64 VECREG:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VECREG:$src))), (v4f32 VECREG:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VECREG:$src))), (v4f32 VECREG:$src)>;
+
+def : Pat<(v2f64 (bitconvert (v16i8 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VECREG:$src))), (v2f64 VECREG:$src)>;
+def : Pat<(v2f64 (bitconvert (v2f64 VECREG:$src))), (v2f64 VECREG:$src)>;
+
+def : Pat<(i128 (bitconvert (v16i8 VECREG:$src))),
+          (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v8i16 VECREG:$src))),
+          (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v4i32 VECREG:$src))),
+          (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v2i64 VECREG:$src))),
+          (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v4f32 VECREG:$src))),
+          (ORi128_vec VECREG:$src)>;
+def : Pat<(i128 (bitconvert (v2f64 VECREG:$src))),
+          (ORi128_vec VECREG:$src)>;
+
+def : Pat<(v16i8 (bitconvert (i128 GPRC:$src))),
+          (v16i8 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v8i16 (bitconvert (i128 GPRC:$src))),
+          (v8i16 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v4i32 (bitconvert (i128 GPRC:$src))),
+          (v4i32 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v2i64 (bitconvert (i128 GPRC:$src))),
+          (v2i64 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v4f32 (bitconvert (i128 GPRC:$src))),
+          (v4f32 (ORvec_i128 GPRC:$src))>;
+def : Pat<(v2f64 (bitconvert (i128 GPRC:$src))),
+          (v2f64 (ORvec_i128 GPRC:$src))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction patterns:
+//===----------------------------------------------------------------------===//
+
+// General 32-bit constants:
+def : Pat<(i32 imm:$imm),
+          (IOHLr32 (ILHUr32 (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Single precision float constants:
+def : Pat<(f32 fpimm:$imm),
+          (IOHLf32 (ILHUf32 (HI16_f32 fpimm:$imm)), (LO16_f32 fpimm:$imm))>;
+
+// General constant 32-bit vectors
+def : Pat<(v4i32 v4i32Imm:$imm),
+          (IOHLv4i32 (v4i32 (ILHUv4i32 (HI16_vec v4i32Imm:$imm))),
+                     (LO16_vec v4i32Imm:$imm))>;
+
+// 8-bit constants
+def : Pat<(i8 imm:$imm),
+          (ILHr8 imm:$imm)>;
+
+//===----------------------------------------------------------------------===//
+// Zero/Any/Sign extensions
+//===----------------------------------------------------------------------===//
+
+// sext 8->32: Sign extend bytes to words
+def : Pat<(sext_inreg R32C:$rSrc, i8),
+          (XSHWr32 (XSBHr32 R32C:$rSrc))>;
+
+def : Pat<(i32 (sext R8C:$rSrc)),
+          (XSHWr16 (XSBHr8 R8C:$rSrc))>;
+
+// sext 8->64: Sign extend bytes to double word
+def : Pat<(sext_inreg R64C:$rSrc, i8),
+          (XSWDr64_inreg (XSHWr64 (XSBHr64 R64C:$rSrc)))>;
+          
+def : Pat<(i64 (sext R8C:$rSrc)),
+          (XSWDr64 (XSHWr16 (XSBHr8 R8C:$rSrc)))>;
+
+// zext 8->16: Zero extend bytes to halfwords
+def : Pat<(i16 (zext R8C:$rSrc)),
+          (ANDHIi8i16 R8C:$rSrc, 0xff)>;
+
+// zext 8->32: Zero extend bytes to words
+def : Pat<(i32 (zext R8C:$rSrc)),
+          (ANDIi8i32 R8C:$rSrc, 0xff)>;
+
+// zext 8->64: Zero extend bytes to double words
+def : Pat<(i64 (zext R8C:$rSrc)),
+          (ORi64_v2i64 (SELBv4i32 (ROTQMBYv4i32
+                                    (ORv4i32_i32 (ANDIi8i32 R8C:$rSrc, 0xff)),
+                                    0x4),
+                                  (ILv4i32 0x0),
+                                  (FSMBIv4i32 0x0f0f)))>;
+
+// anyext 8->16: Extend 8->16 bits, irrespective of sign, preserves high bits
+def : Pat<(i16 (anyext R8C:$rSrc)),
+          (ORHIi8i16 R8C:$rSrc, 0)>;
+
+// anyext 8->32: Extend 8->32 bits, irrespective of sign, preserves high bits
+def : Pat<(i32 (anyext R8C:$rSrc)),
+          (ORIi8i32 R8C:$rSrc, 0)>;
+
+// sext 16->64: Sign extend halfword to double word
+def : Pat<(sext_inreg R64C:$rSrc, i16),
+          (XSWDr64_inreg (XSHWr64 R64C:$rSrc))>;
+          
+def : Pat<(sext R16C:$rSrc),
+          (XSWDr64 (XSHWr16 R16C:$rSrc))>;
+
+// zext 16->32: Zero extend halfwords to words
+def : Pat<(i32 (zext R16C:$rSrc)),
+          (ANDi16i32 R16C:$rSrc, (ILAr32 0xffff))>;
+
+def : Pat<(i32 (zext (and R16C:$rSrc, 0xf))),
+          (ANDIi16i32 R16C:$rSrc, 0xf)>;
+
+def : Pat<(i32 (zext (and R16C:$rSrc, 0xff))),
+          (ANDIi16i32 R16C:$rSrc, 0xff)>;
+
+def : Pat<(i32 (zext (and R16C:$rSrc, 0xfff))),
+          (ANDIi16i32 R16C:$rSrc, 0xfff)>;
+
+// anyext 16->32: Extend 16->32 bits, irrespective of sign
+def : Pat<(i32 (anyext R16C:$rSrc)),
+          (ORIi16i32 R16C:$rSrc, 0)>;
+
+//===----------------------------------------------------------------------===//
+// Truncates:
+// These truncates are for the SPU's supported types (i8, i16, i32). i64 and
+// above are custom lowered.
+//===----------------------------------------------------------------------===//
+
+def : Pat<(i8 (trunc GPRC:$src)),
+          (ORi8_v16i8
+            (SHUFBgprc GPRC:$src, GPRC:$src,
+                       (IOHLv4i32 (ILHUv4i32 0x0f0f), 0x0f0f)))>;
+
+def : Pat<(i8 (trunc R64C:$src)),
+          (ORi8_v16i8
+            (SHUFBv2i64_m32
+              (ORv2i64_i64 R64C:$src),
+              (ORv2i64_i64 R64C:$src),
+              (IOHLv4i32 (ILHUv4i32 0x0707), 0x0707)))>;
+
+def : Pat<(i8 (trunc R32C:$src)),
+          (ORi8_v16i8
+            (SHUFBv4i32_m32
+               (ORv4i32_i32 R32C:$src),
+               (ORv4i32_i32 R32C:$src),
+               (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>;
+
+def : Pat<(i8 (trunc R16C:$src)),
+          (ORi8_v16i8
+            (SHUFBv4i32_m32
+               (ORv8i16_i16 R16C:$src),
+               (ORv8i16_i16 R16C:$src),
+               (IOHLv4i32 (ILHUv4i32 0x0303), 0x0303)))>;
+
+def : Pat<(i16 (trunc GPRC:$src)),
+          (ORi16_v8i16
+            (SHUFBgprc GPRC:$src, GPRC:$src,
+                       (IOHLv4i32 (ILHUv4i32 0x0e0f), 0x0e0f)))>;
+
+def : Pat<(i16 (trunc R64C:$src)),
+          (ORi16_v8i16
+            (SHUFBv2i64_m32
+              (ORv2i64_i64 R64C:$src),
+              (ORv2i64_i64 R64C:$src),
+              (IOHLv4i32 (ILHUv4i32 0x0607), 0x0607)))>;
+
+def : Pat<(i16 (trunc R32C:$src)),
+          (ORi16_v8i16
+            (SHUFBv4i32_m32
+               (ORv4i32_i32 R32C:$src),
+               (ORv4i32_i32 R32C:$src),
+               (IOHLv4i32 (ILHUv4i32 0x0203), 0x0203)))>;
+
+def : Pat<(i32 (trunc GPRC:$src)),
+          (ORi32_v4i32
+            (SHUFBgprc GPRC:$src, GPRC:$src,
+                       (IOHLv4i32 (ILHUv4i32 0x0c0d), 0x0e0f)))>;
+
+def : Pat<(i32 (trunc R64C:$src)),
+          (ORi32_v4i32
+            (SHUFBv2i64_m32
+              (ORv2i64_i64 R64C:$src),
+              (ORv2i64_i64 R64C:$src),
+              (IOHLv4i32 (ILHUv4i32 0x0405), 0x0607)))>;
+
+//===----------------------------------------------------------------------===//
+// Address generation: SPU, like PPC, has to split addresses into high and
+// low parts in order to load them into a register.
+//===----------------------------------------------------------------------===//
+
+def : Pat<(SPUaform tglobaladdr:$in, 0),  (ILAlsa tglobaladdr:$in)>;
+def : Pat<(SPUaform texternalsym:$in, 0), (ILAlsa texternalsym:$in)>;
+def : Pat<(SPUaform tjumptable:$in, 0),   (ILAlsa tjumptable:$in)>;
+def : Pat<(SPUaform tconstpool:$in, 0),   (ILAlsa  tconstpool:$in)>;
+
+def : Pat<(SPUindirect (SPUhi tglobaladdr:$in, 0),
+                       (SPUlo tglobaladdr:$in, 0)),
+          (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>;
+
+def : Pat<(SPUindirect (SPUhi texternalsym:$in, 0),
+                       (SPUlo texternalsym:$in, 0)),
+          (IOHLlo (ILHUhi texternalsym:$in), texternalsym:$in)>;
+
+def : Pat<(SPUindirect (SPUhi tjumptable:$in, 0),
+                       (SPUlo tjumptable:$in, 0)),
+          (IOHLlo (ILHUhi tjumptable:$in), tjumptable:$in)>;
+
+def : Pat<(SPUindirect (SPUhi tconstpool:$in, 0),
+                       (SPUlo tconstpool:$in, 0)),
+          (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>;
+
+def : Pat<(add (SPUhi tglobaladdr:$in, 0), (SPUlo tglobaladdr:$in, 0)),
+          (IOHLlo (ILHUhi tglobaladdr:$in), tglobaladdr:$in)>;
+
+def : Pat<(add (SPUhi texternalsym:$in, 0), (SPUlo texternalsym:$in, 0)),
+          (IOHLlo (ILHUhi texternalsym:$in), texternalsym:$in)>;
+
+def : Pat<(add (SPUhi tjumptable:$in, 0), (SPUlo tjumptable:$in, 0)),
+          (IOHLlo (ILHUhi tjumptable:$in), tjumptable:$in)>;
+
+def : Pat<(add (SPUhi tconstpool:$in, 0), (SPUlo tconstpool:$in, 0)),
+          (IOHLlo (ILHUhi tconstpool:$in), tconstpool:$in)>;
+
+// Intrinsics:
+include "CellSDKIntrinsics.td"
+// Various math operator instruction sequences
+include "SPUMathInstr.td"
+// 64-bit "instructions"/support
+include "SPU64InstrInfo.td"
+// 128-bit "instructions"/support
+include "SPU128InstrInfo.td"
diff --git a/lib/Target/CellSPU/SPUMCAsmInfo.cpp b/lib/Target/CellSPU/SPUMCAsmInfo.cpp
new file mode 100644
index 0000000..5ef3c6b
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMCAsmInfo.cpp
@@ -0,0 +1,38 @@
+//===-- SPUMCAsmInfo.cpp - Cell SPU asm properties ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SPUMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPUMCAsmInfo.h"
+using namespace llvm;
+
+SPULinuxMCAsmInfo::SPULinuxMCAsmInfo(const Target &T, const StringRef &TT) {
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = "\t.quad\t";
+  AlignmentIsInBytes = false;
+  HasLCOMMDirective = true;
+      
+  PCSymbol = ".";
+  CommentString = "#";
+  GlobalPrefix = "";
+  PrivateGlobalPrefix = ".L";
+
+  // Has leb128, .loc and .file
+  HasLEB128 = true;
+  HasDotLocAndDotFile = true;
+
+  SupportsDebugInformation = true;
+
+  // Exception handling is not supported on CellSPU (think about it: you only
+  // have 256K for code+data. Would you support exception handling?)
+  ExceptionsType = ExceptionHandling::None;
+}
+
diff --git a/lib/Target/CellSPU/SPUMCAsmInfo.h b/lib/Target/CellSPU/SPUMCAsmInfo.h
new file mode 100644
index 0000000..8d75ea8
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMCAsmInfo.h
@@ -0,0 +1,28 @@
+//===-- SPUMCAsmInfo.h - Cell SPU asm properties ---------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SPUMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPUTARGETASMINFO_H
+#define SPUTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+  
+  struct SPULinuxMCAsmInfo : public MCAsmInfo {
+    explicit SPULinuxMCAsmInfo(const Target &T, const StringRef &TT);
+  };
+} // namespace llvm
+
+#endif /* SPUTARGETASMINFO_H */
diff --git a/lib/Target/CellSPU/SPUMachineFunction.h b/lib/Target/CellSPU/SPUMachineFunction.h
new file mode 100644
index 0000000..6a66967
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMachineFunction.h
@@ -0,0 +1,43 @@
+//===-- SPUMachineFunctionInfo.h - Private data used for CellSPU --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the IBM Cell SPU specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_MACHINE_FUNCTION_INFO_H
+#define SPU_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// SPUFunctionInfo - Cell SPU target-specific information for each
+/// MachineFunction
+class SPUFunctionInfo : public MachineFunctionInfo {
+private:
+  /// UsesLR - Indicates whether LR is used in the current function.
+  ///
+  bool UsesLR;
+
+public:
+  SPUFunctionInfo(MachineFunction& MF) 
+  : UsesLR(false)
+  {}
+
+  void setUsesLR(bool U) { UsesLR = U; }
+  bool usesLR()          { return UsesLR; }
+
+};
+
+} // end of namespace llvm
+
+
+#endif
+
diff --git a/lib/Target/CellSPU/SPUMathInstr.td b/lib/Target/CellSPU/SPUMathInstr.td
new file mode 100644
index 0000000..80ebde3
--- /dev/null
+++ b/lib/Target/CellSPU/SPUMathInstr.td
@@ -0,0 +1,97 @@
+//======--- SPUMathInstr.td - Cell SPU math operations -*- tablegen -*--======//
+//
+//                     Cell SPU math operations
+//
+// This target description file contains instruction sequences for various
+// math operations, such as vector multiplies, i32 multiply, etc., for the
+// SPU's i32, i16, i8 and corresponding vector types.
+//
+// Any resemblance to libsimdmath or the Cell SDK simdmath library is
+// purely and completely coincidental.
+//===----------------------------------------------------------------------===//
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v16i8 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def : Pat<(mul (v16i8 VECREG:$rA), (v16i8 VECREG:$rB)),
+          (ORv4i32
+           (ANDv4i32
+            (SELBv4i32 (MPYv8i16 VECREG:$rA, VECREG:$rB),
+                       (SHLHIv8i16 (MPYv8i16 (ROTMAHIv8i16 VECREG:$rA, 8),
+                                             (ROTMAHIv8i16 VECREG:$rB, 8)), 8),
+                       (FSMBIv8i16 0x2222)),
+            (ILAv4i32 0x0000ffff)),
+           (SHLIv4i32
+            (SELBv4i32 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 16),
+                                 (ROTMAIv4i32_i32 VECREG:$rB, 16)),
+                       (SHLHIv8i16 (MPYv8i16 (ROTMAIv4i32_i32 VECREG:$rA, 8),
+                                             (ROTMAIv4i32_i32 VECREG:$rB, 8)), 8),
+                       (FSMBIv8i16 0x2222)), 16))>;
+                        
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v8i16 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def : Pat<(mul (v8i16 VECREG:$rA), (v8i16 VECREG:$rB)),
+          (SELBv8i16 (MPYv8i16 VECREG:$rA, VECREG:$rB),
+                     (SHLIv4i32 (MPYHHv8i16 VECREG:$rA, VECREG:$rB), 16),
+                     (FSMBIv8i16 0xcccc))>;
+                 
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// v4i32, i32 multiply instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+def MPYv4i32:
+  Pat<(mul (v4i32 VECREG:$rA), (v4i32 VECREG:$rB)),
+      (Av4i32
+        (Av4i32 (MPYHv4i32 VECREG:$rA, VECREG:$rB),
+                (MPYHv4i32 VECREG:$rB, VECREG:$rA)),
+        (MPYUv4i32 VECREG:$rA, VECREG:$rB))>;
+
+def MPYi32:
+  Pat<(mul R32C:$rA, R32C:$rB),
+      (Ar32
+        (Ar32 (MPYHr32 R32C:$rA, R32C:$rB),
+              (MPYHr32 R32C:$rB, R32C:$rA)),
+        (MPYUr32 R32C:$rA, R32C:$rB))>;
+
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+// f32, v4f32 divide instruction sequence:
+//-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~-~
+
+// Reciprocal estimate and interpolation
+def Interpf32: CodeFrag<(FIf32 R32FP:$rB, (FRESTf32 R32FP:$rB))>;
+// Division estimate
+def DivEstf32: CodeFrag<(FMf32 R32FP:$rA, Interpf32.Fragment)>;
+// Newton-Raphson iteration
+def NRaphf32: CodeFrag<(FMAf32 (FNMSf32 DivEstf32.Fragment, R32FP:$rB, R32FP:$rA),
+                               Interpf32.Fragment,
+                               DivEstf32.Fragment)>;
+// Epsilon addition
+def Epsilonf32: CodeFrag<(AIf32 NRaphf32.Fragment, 1)>;
+
+def : Pat<(fdiv R32FP:$rA, R32FP:$rB),
+          (SELBf32_cond NRaphf32.Fragment,
+                        Epsilonf32.Fragment,
+                        (CGTIf32 (FNMSf32 R32FP:$rB, Epsilonf32.Fragment, R32FP:$rA), -1))>;
+
+// Reciprocal estimate and interpolation
+def Interpv4f32: CodeFrag<(FIv4f32 (v4f32 VECREG:$rB), (FRESTv4f32 (v4f32 VECREG:$rB)))>;
+// Division estimate
+def DivEstv4f32: CodeFrag<(FMv4f32 (v4f32 VECREG:$rA), Interpv4f32.Fragment)>;
+// Newton-Raphson iteration
+def NRaphv4f32: CodeFrag<(FMAv4f32 (FNMSv4f32 DivEstv4f32.Fragment,
+                                              (v4f32 VECREG:$rB),
+                                              (v4f32 VECREG:$rA)),
+                                   Interpv4f32.Fragment,
+                                   DivEstv4f32.Fragment)>;
+// Epsilon addition
+def Epsilonv4f32: CodeFrag<(AIv4f32 NRaphv4f32.Fragment, 1)>;
+
+def : Pat<(fdiv (v4f32 VECREG:$rA), (v4f32 VECREG:$rB)),
+          (SELBv4f32_cond NRaphv4f32.Fragment,
+                        Epsilonv4f32.Fragment,
+                        (CGTIv4f32 (FNMSv4f32 (v4f32 VECREG:$rB),
+                                              Epsilonv4f32.Fragment,
+                                              (v4f32 VECREG:$rA)), -1))>;
diff --git a/lib/Target/CellSPU/SPUNodes.td b/lib/Target/CellSPU/SPUNodes.td
new file mode 100644
index 0000000..c722e4b
--- /dev/null
+++ b/lib/Target/CellSPU/SPUNodes.td
@@ -0,0 +1,156 @@
+//===- SPUNodes.td - Specialized SelectionDAG nodes used for CellSPU ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Type profiles and SelectionDAG nodes used by CellSPU
+//
+//===----------------------------------------------------------------------===//
+
+// Type profile for a call sequence
+def SDT_SPUCallSeq : SDTypeProfile<0, 1, [ SDTCisVT<0, i32> ]>;
+
+// SPU_GenControl: Type profile for generating control words for insertions
+def SPU_GenControl : SDTypeProfile<1, 1, []>;
+def SPUshufmask    : SDNode<"SPUISD::SHUFFLE_MASK", SPU_GenControl, []>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPUCallSeq,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPUCallSeq,
+                           [SDNPHasChain, SDNPOutFlag]>;
+//===----------------------------------------------------------------------===//
+// Operand constraints:
+//===----------------------------------------------------------------------===//
+
+def SDT_SPUCall   : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def SPUcall       : SDNode<"SPUISD::CALL", SDT_SPUCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Operand type constraints for vector shuffle/permute operations
+def SDT_SPUshuffle   : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+]>;
+
+// Vector binary operator type constraints (needs a further constraint to
+// ensure that operand 0 is a vector...):
+
+def SPUVecBinop: SDTypeProfile<1, 2, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+]>;
+
+// Trinary operators, e.g., addx, carry generate
+def SPUIntTrinaryOp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<0>
+]>;
+
+// SELECT_MASK type constraints: There are several variations for the various
+// vector types (this avoids having to bit_convert all over the place.)
+def SPUselmask_type: SDTypeProfile<1, 1, [
+  SDTCisInt<1>
+]>;
+
+// SELB type constraints:
+def SPUselb_type: SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<0, 3> ]>;
+
+// SPU Vector shift pseudo-instruction type constraints
+def SPUvecshift_type: SDTypeProfile<1, 2, [
+  SDTCisSameAs<0, 1>, SDTCisInt<2>]>;
+
+// "marker" type for i64 operators that need a shuffle mask
+// (i.e., uses cg or bg or another instruction that needs to
+// use shufb to get things in the right place.)
+// Op0: The result
+// Op1, 2: LHS, RHS
+// Op3: Carry-generate shuffle mask
+
+def SPUmarker_type : SDTypeProfile<1, 3, [
+  SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2> ]>;
+
+//===----------------------------------------------------------------------===//
+// Synthetic/pseudo-instructions
+//===----------------------------------------------------------------------===//
+
+// SPU CNTB:
+def SPUcntb : SDNode<"SPUISD::CNTB", SDTIntUnaryOp>;
+
+// SPU vector shuffle node, matched by the SPUISD::SHUFB enum (see
+// SPUISelLowering.h):
+def SPUshuffle: SDNode<"SPUISD::SHUFB", SDT_SPUshuffle, []>;
+
+// Shift left quadword by bits and bytes
+def SPUshlquad_l_bits: SDNode<"SPUISD::SHLQUAD_L_BITS", SPUvecshift_type, []>;
+def SPUshlquad_l_bytes: SDNode<"SPUISD::SHLQUAD_L_BYTES", SPUvecshift_type, []>;
+
+// Vector shifts (ISD::SHL,SRL,SRA are for _integers_ only):
+def SPUvec_shl: SDNode<"ISD::SHL", SPUvecshift_type, []>;
+def SPUvec_srl: SDNode<"ISD::SRL", SPUvecshift_type, []>;
+def SPUvec_sra: SDNode<"ISD::SRA", SPUvecshift_type, []>;
+
+def SPUvec_rotl: SDNode<"SPUISD::VEC_ROTL", SPUvecshift_type, []>;
+def SPUvec_rotr: SDNode<"SPUISD::VEC_ROTR", SPUvecshift_type, []>;
+
+// Vector rotate left, bits shifted out of the left are rotated in on the right
+def SPUrotbytes_left: SDNode<"SPUISD::ROTBYTES_LEFT",
+                             SPUvecshift_type, []>;
+
+// Vector rotate left by bytes, but the count is given in bits and the SPU
+// internally converts it to bytes (saves an instruction to mask off lower
+// three bits)
+def SPUrotbytes_left_bits : SDNode<"SPUISD::ROTBYTES_LEFT_BITS",
+                                   SPUvecshift_type>;
+
+// SPU form select mask for bytes, immediate
+def SPUselmask: SDNode<"SPUISD::SELECT_MASK", SPUselmask_type, []>;
+
+// SPU select bits instruction
+def SPUselb: SDNode<"SPUISD::SELB", SPUselb_type, []>;
+
+def SDTprefslot2vec: SDTypeProfile<1, 1, []>;
+def SPUprefslot2vec: SDNode<"SPUISD::PREFSLOT2VEC", SDTprefslot2vec, []>;
+
+def SPU_vec_demote   : SDTypeProfile<1, 1, []>;
+def SPUvec2prefslot: SDNode<"SPUISD::VEC2PREFSLOT", SPU_vec_demote, []>;
+
+// Address high and low components, used for [r+r] type addressing
+def SPUhi : SDNode<"SPUISD::Hi", SDTIntBinOp, []>;
+def SPUlo : SDNode<"SPUISD::Lo", SDTIntBinOp, []>;
+
+// PC-relative address
+def SPUpcrel : SDNode<"SPUISD::PCRelAddr", SDTIntBinOp, []>;
+
+// A-Form local store addresses
+def SPUaform : SDNode<"SPUISD::AFormAddr", SDTIntBinOp, []>;
+
+// Indirect [D-Form "imm($reg)" and X-Form "$reg($reg)"] addresses
+def SPUindirect : SDNode<"SPUISD::IndirectAddr", SDTIntBinOp, []>;
+
+// i64 markers: supplies extra operands used to generate the i64 operator
+// instruction sequences
+def SPUadd64 : SDNode<"SPUISD::ADD64_MARKER", SPUmarker_type, []>;
+def SPUsub64 : SDNode<"SPUISD::SUB64_MARKER", SPUmarker_type, []>;
+def SPUmul64 : SDNode<"SPUISD::MUL64_MARKER", SPUmarker_type, []>;
+
+//===----------------------------------------------------------------------===//
+// Constraints: (taken from PPCInstrInfo.td)
+//===----------------------------------------------------------------------===//
+
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+
+class NoEncode<string E> {
+  string DisableEncoding = E;
+}
+
+//===----------------------------------------------------------------------===//
+// Return (flag isn't quite what it means: the operations are flagged so that
+// instruction scheduling doesn't disassociate them.)
+//===----------------------------------------------------------------------===//
+
+def retflag     : SDNode<"SPUISD::RET_FLAG", SDTNone,
+                         [SDNPHasChain, SDNPOptInFlag]>;
diff --git a/lib/Target/CellSPU/SPUOperands.td b/lib/Target/CellSPU/SPUOperands.td
new file mode 100644
index 0000000..802628f
--- /dev/null
+++ b/lib/Target/CellSPU/SPUOperands.td
@@ -0,0 +1,655 @@
+//===- SPUOperands.td - Cell SPU Instruction Operands ------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+// Cell SPU Instruction Operands:
+//===----------------------------------------------------------------------===//
+
+def LO16 : SDNodeXForm<imm, [{
+  unsigned val = N->getZExtValue();
+  // Transformation function: get the low 16 bits.
+  return getI32Imm(val & 0xffff);
+}]>;
+
+def LO16_vec : SDNodeXForm<scalar_to_vector, [{
+  SDValue OpVal(0, 0);
+
+  // Transformation function: get the low 16 bit immediate from a build_vector
+  // node.
+  assert(N->getOpcode() == ISD::BUILD_VECTOR
+         && "LO16_vec got something other than a BUILD_VECTOR");
+
+  // Get first constant operand...
+  for (unsigned i = 0, e = N->getNumOperands();
+       OpVal.getNode() == 0 && i != e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (OpVal.getNode() == 0)
+      OpVal = N->getOperand(i);
+  }
+  
+  assert(OpVal.getNode() != 0 && "LO16_vec did not locate a <defined> node");
+  ConstantSDNode *CN = cast<ConstantSDNode>(OpVal);
+  return getI32Imm((unsigned)CN->getZExtValue() & 0xffff);
+}]>;
+
+// Transform an immediate, returning the high 16 bits shifted down:
+def HI16 : SDNodeXForm<imm, [{
+  return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
+
+// Transformation function: shift the high 16 bit immediate from a build_vector
+// node into the low 16 bits, and return a 16-bit constant.
+def HI16_vec : SDNodeXForm<scalar_to_vector, [{
+  SDValue OpVal(0, 0);
+
+  assert(N->getOpcode() == ISD::BUILD_VECTOR
+         && "HI16_vec got something other than a BUILD_VECTOR");
+  
+  // Get first constant operand...
+  for (unsigned i = 0, e = N->getNumOperands();
+       OpVal.getNode() == 0 && i != e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (OpVal.getNode() == 0)
+      OpVal = N->getOperand(i);
+  }
+  
+  assert(OpVal.getNode() != 0 && "HI16_vec did not locate a <defined> node");
+  ConstantSDNode *CN = cast<ConstantSDNode>(OpVal);
+  return getI32Imm((unsigned)CN->getZExtValue() >> 16);
+}]>;
+
+// simm7 predicate - True if the immediate fits in a 7-bit signed
+// field.
+def simm7: PatLeaf<(imm), [{
+  int sextVal = int(N->getSExtValue());
+  return (sextVal >= -64 && sextVal <= 63);
+}]>;
+
+// uimm7 predicate - True if the immediate fits in a 7-bit unsigned
+// field.
+def uimm7: PatLeaf<(imm), [{
+  return (N->getZExtValue() <= 0x7f);
+}]>;
+
+// immSExt8 predicate - True if the immediate fits in an 8-bit sign extended
+// field.
+def immSExt8  : PatLeaf<(imm), [{
+  int Value = int(N->getSExtValue());
+  return (Value >= -(1 << 8) && Value <= (1 << 8) - 1);
+}]>;
+
+// immU8: immediate, unsigned 8-bit quantity
+def immU8 : PatLeaf<(imm), [{
+  return (N->getZExtValue() <= 0xff);
+}]>;
+
+// i64ImmSExt10 predicate - True if the i64 immediate fits in a 10-bit sign
+// extended field.  Used by RI10Form instructions like 'ldq'.
+def i64ImmSExt10  : PatLeaf<(imm), [{
+  return isI64IntS10Immediate(N);
+}]>;
+
+// i32ImmSExt10 predicate - True if the i32 immediate fits in a 10-bit sign
+// extended field.  Used by RI10Form instructions like 'ldq'.
+def i32ImmSExt10  : PatLeaf<(imm), [{
+  return isI32IntS10Immediate(N);
+}]>;
+
+// i32ImmUns10 predicate - True if the i32 immediate fits in a 10-bit unsigned
+// field.  Used by RI10Form instructions like 'ldq'.
+def i32ImmUns10  : PatLeaf<(imm), [{
+  return isI32IntU10Immediate(N);
+}]>;
+
+// i16ImmSExt10 predicate - True if the i16 immediate fits in a 10-bit sign
+// extended field.  Used by RI10Form instructions like 'ldq'.
+def i16ImmSExt10  : PatLeaf<(imm), [{
+  return isI16IntS10Immediate(N);
+}]>;
+
+// i16ImmUns10 predicate - True if the i16 immediate fits into a 10-bit unsigned
+// value. Used by RI10Form instructions.
+def i16ImmUns10 : PatLeaf<(imm), [{
+  return isI16IntU10Immediate(N);
+}]>;
+
+def immSExt16  : PatLeaf<(imm), [{
+  // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.
+  short Ignored;
+  return isIntS16Immediate(N, Ignored);
+}]>;
+
+def immZExt16  : PatLeaf<(imm), [{
+  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+  // field.
+  return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+def immU16 : PatLeaf<(imm), [{
+  // immU16 predicate - True if the immediate fits into a 16-bit unsigned field.
+  return (uint64_t)N->getZExtValue() == (N->getZExtValue() & 0xffff);
+}]>;
+
+def imm18  : PatLeaf<(imm), [{
+  // imm18 predicate: True if the immediate fits into an 18-bit unsigned field.
+  int Value = (int) N->getZExtValue();
+  return ((Value & ((1 << 19) - 1)) == Value);
+}]>;
+
+def lo16 : PatLeaf<(imm), [{
+  // lo16 predicate - returns true if the immediate has all zeros in the
+  // low order bits and is a 32-bit constant:
+  if (N->getValueType(0) == MVT::i32) {
+    uint32_t val = N->getZExtValue();
+    return ((val & 0x0000ffff) == val);
+  }
+
+  return false;
+}], LO16>;
+
+def hi16 : PatLeaf<(imm), [{
+  // hi16 predicate - returns true if the immediate has all zeros in the
+  // low order bits and is a 32-bit constant:
+  if (N->getValueType(0) == MVT::i32) {
+    uint32_t val = uint32_t(N->getZExtValue());
+    return ((val & 0xffff0000) == val);
+  } else if (N->getValueType(0) == MVT::i64) {
+    uint64_t val = N->getZExtValue();
+    return ((val & 0xffff0000ULL) == val);
+  }
+
+  return false;
+}], HI16>;
+
+def bitshift : PatLeaf<(imm), [{
+  // bitshift predicate - returns true if 0 < imm <= 7 for SHLQBII
+  // (shift left quadword by bits immediate)
+  int64_t Val = N->getZExtValue();
+  return (Val > 0 && Val <= 7);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Floating point operands:
+//===----------------------------------------------------------------------===//
+
+// Transform a float, returning the high 16 bits shifted down, as if
+// the float was really an unsigned integer:
+def HI16_f32 : SDNodeXForm<fpimm, [{
+  float fval = N->getValueAPF().convertToFloat();
+  return getI32Imm(FloatToBits(fval) >> 16);
+}]>;
+
+// Transformation function on floats: get the low 16 bits as if the float was
+// an unsigned integer.
+def LO16_f32 : SDNodeXForm<fpimm, [{
+  float fval = N->getValueAPF().convertToFloat();
+  return getI32Imm(FloatToBits(fval) & 0xffff);
+}]>;
+
+def FPimm_sext16 : SDNodeXForm<fpimm, [{
+  float fval = N->getValueAPF().convertToFloat();
+  return getI32Imm((int) ((FloatToBits(fval) << 16) >> 16));
+}]>;
+
+def FPimm_u18 : SDNodeXForm<fpimm, [{
+  float fval = N->getValueAPF().convertToFloat();
+  return getI32Imm(FloatToBits(fval) & ((1 << 19) - 1));
+}]>;
+
+def fpimmSExt16 : PatLeaf<(fpimm), [{
+  short Ignored;
+  return isFPS16Immediate(N, Ignored);  
+}], FPimm_sext16>;
+
+// Does the SFP constant only have the upper 16 bits set?
+def hi16_f32 : PatLeaf<(fpimm), [{
+  if (N->getValueType(0) == MVT::f32) {
+    uint32_t val = FloatToBits(N->getValueAPF().convertToFloat());
+    return ((val & 0xffff0000) == val);
+  }
+
+  return false;
+}], HI16_f32>;
+
+// Does the SFP constant fit into 18 bits?
+def fpimm18  : PatLeaf<(fpimm), [{
+  if (N->getValueType(0) == MVT::f32) {
+    uint32_t Value = FloatToBits(N->getValueAPF().convertToFloat());
+    return ((Value & ((1 << 19) - 1)) == Value);
+  }
+
+  return false;
+}], FPimm_u18>;
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands (TODO):
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// build_vector operands:
+//===----------------------------------------------------------------------===//
+
+// v16i8SExt8Imm_xform function: convert build_vector to 8-bit sign extended
+// immediate constant load for v16i8 vectors. N.B.: The incoming constant has
+// to be a 16-bit quantity with the upper and lower bytes equal (e.g., 0x2a2a).
+def v16i8SExt8Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8);
+}]>;
+
+// v16i8SExt8Imm: Predicate test for 8-bit sign extended immediate constant
+// load, works in conjunction with its transform function. N.B.: This relies on
+// the incoming constant being a 16-bit quantity, where the upper and lower bytes
+// are EXACTLY the same (e.g., 0x2a2a)
+def v16i8SExt8Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8).getNode() != 0;
+}], v16i8SExt8Imm_xform>;
+
+// v16i8U8Imm_xform function: convert build_vector to unsigned 8-bit
+// immediate constant load for v16i8 vectors. N.B.: The incoming constant has
+// to be a 16-bit quantity with the upper and lower bytes equal (e.g., 0x2a2a).
+def v16i8U8Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8);
+}]>;
+
+// v16i8U8Imm: Predicate test for unsigned 8-bit immediate constant
+// load, works in conjunction with its transform function. N.B.: This relies on
+// the incoming constant being a 16-bit quantity, where the upper and lower bytes
+// are EXACTLY the same (e.g., 0x2a2a)
+def v16i8U8Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i8imm(N, *CurDAG, MVT::i8).getNode() != 0;
+}], v16i8U8Imm_xform>;
+
+// v8i16SExt8Imm_xform function: convert build_vector to 8-bit sign extended
+// immediate constant load for v8i16 vectors.
+def v8i16SExt8Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i8imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16SExt8Imm: Predicate test for 8-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v8i16SExt8Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i8imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16SExt8Imm_xform>;
+
+// v8i16SExt10Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v8i16 vectors.
+def v8i16SExt10Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16SExt10Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v8i16SExt10Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16SExt10Imm_xform>;
+
+// v8i16Uns10Imm_xform function: convert build_vector to 16-bit unsigned
+// immediate constant load for v8i16 vectors.
+def v8i16Uns10Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16Uns10Imm: Predicate test for 16-bit unsigned immediate constant
+// load, works in conjunction with its transform function.
+def v8i16Uns10Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16Uns10Imm_xform>;
+
+// v8i16Uns16Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v8i16 vectors.
+def v8i16Uns16Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i16imm(N, *CurDAG, MVT::i16);
+}]>;
+
+// v8i16SExt16Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v8i16SExt16Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i16imm(N, *CurDAG, MVT::i16).getNode() != 0;
+}], v8i16Uns16Imm_xform>;
+
+// v4i32SExt10Imm_xform function: convert build_vector to 10-bit sign extended
+// immediate constant load for v4i32 vectors.
+def v4i32SExt10Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32SExt10Imm: Predicate test for 10-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v4i32SExt10Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32SExt10Imm_xform>;
+
+// v4i32Uns10Imm_xform function: convert build_vector to 10-bit unsigned
+// immediate constant load for v4i32 vectors.
+def v4i32Uns10Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32Uns10Imm: Predicate test for 10-bit unsigned immediate constant
+// load, works in conjunction with its transform function.
+def v4i32Uns10Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32Uns10Imm_xform>;
+
+// v4i32SExt16Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v4i32 vectors.
+def v4i32SExt16Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i16imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32SExt16Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v4i32SExt16Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i16imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32SExt16Imm_xform>;
+
+// v4i32Uns18Imm_xform function: convert build_vector to 18-bit unsigned
+// immediate constant load for v4i32 vectors.
+def v4i32Uns18Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_u18imm(N, *CurDAG, MVT::i32);
+}]>;
+
+// v4i32Uns18Imm: Predicate test for 18-bit unsigned immediate constant load,
+// works in conjunction with its transform function.
+def v4i32Uns18Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_u18imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], v4i32Uns18Imm_xform>;
+
+// ILHUvec_get_imm xform function: convert build_vector to ILHUvec imm constant
+// load.
+def ILHUvec_get_imm: SDNodeXForm<build_vector, [{
+  return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i32);
+}]>;
+
+/// immILHUvec: Predicate test for a ILHU constant vector.
+def immILHUvec: PatLeaf<(build_vector), [{
+  return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i32).getNode() != 0;
+}], ILHUvec_get_imm>;
+
+// Catch-all for any other i32 vector constants
+def v4i32_get_imm: SDNodeXForm<build_vector, [{
+  return SPU::get_v4i32_imm(N, *CurDAG);
+}]>;
+
+def v4i32Imm: PatLeaf<(build_vector), [{
+  return SPU::get_v4i32_imm(N, *CurDAG).getNode() != 0;
+}], v4i32_get_imm>;
+
+// v2i64SExt10Imm_xform function: convert build_vector to 10-bit sign extended
+// immediate constant load for v2i64 vectors.
+def v2i64SExt10Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i64);
+}]>;
+
+// v2i64SExt10Imm: Predicate test for 10-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v2i64SExt10Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i10imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], v2i64SExt10Imm_xform>;
+
+// v2i64SExt16Imm_xform function: convert build_vector to 16-bit sign extended
+// immediate constant load for v2i64 vectors.
+def v2i64SExt16Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_i16imm(N, *CurDAG, MVT::i64);
+}]>;
+
+// v2i64SExt16Imm: Predicate test for 16-bit sign extended immediate constant
+// load, works in conjunction with its transform function.
+def v2i64SExt16Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_i16imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], v2i64SExt16Imm_xform>;
+
+// v2i64Uns18Imm_xform function: convert build_vector to 18-bit unsigned
+// immediate constant load for v2i64 vectors.
+def v2i64Uns18Imm_xform: SDNodeXForm<build_vector, [{
+  return SPU::get_vec_u18imm(N, *CurDAG, MVT::i64);
+}]>;
+
+// v2i64Uns18Imm: Predicate test for 18-bit unsigned immediate constant load,
+// works in conjunction with its transform function.
+def v2i64Uns18Imm: PatLeaf<(build_vector), [{
+  return SPU::get_vec_u18imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], v2i64Uns18Imm_xform>;
+
+/// immILHUvec: Predicate test for a ILHU constant vector.
+def immILHUvec_i64: PatLeaf<(build_vector), [{
+  return SPU::get_ILHUvec_imm(N, *CurDAG, MVT::i64).getNode() != 0;
+}], ILHUvec_get_imm>;
+
+// Catch-all for any other i64 vector constants
+def v2i64_get_imm: SDNodeXForm<build_vector, [{
+  return SPU::get_v2i64_imm(N, *CurDAG);
+}]>;
+
+def v2i64Imm: PatLeaf<(build_vector), [{
+  return SPU::get_v2i64_imm(N, *CurDAG).getNode() != 0;
+}], v2i64_get_imm>;
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+
+def s7imm: Operand<i8> {
+  let PrintMethod = "printS7ImmOperand";
+}
+
+def s7imm_i8: Operand<i8> {
+  let PrintMethod = "printS7ImmOperand";
+}
+
+def u7imm: Operand<i16> {
+  let PrintMethod = "printU7ImmOperand";
+}
+
+def u7imm_i8: Operand<i8> {
+  let PrintMethod = "printU7ImmOperand";
+}
+
+def u7imm_i32: Operand<i32> {
+  let PrintMethod = "printU7ImmOperand";
+}
+
+// Halfword, signed 10-bit constant
+def s10imm : Operand<i16> {
+  let PrintMethod = "printS10ImmOperand";
+}
+
+def s10imm_i8: Operand<i8> {
+  let PrintMethod = "printS10ImmOperand";
+}
+
+def s10imm_i32: Operand<i32> {
+  let PrintMethod = "printS10ImmOperand";
+}
+
+def s10imm_i64: Operand<i64> {
+  let PrintMethod = "printS10ImmOperand";
+}
+
+// Unsigned 10-bit integers:
+def u10imm: Operand<i16> {
+  let PrintMethod = "printU10ImmOperand";
+}
+
+def u10imm_i8: Operand<i8> {
+  let PrintMethod = "printU10ImmOperand";
+}
+
+def u10imm_i32: Operand<i32> {
+  let PrintMethod = "printU10ImmOperand";
+}
+
+def s16imm  : Operand<i16> {
+  let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_i8: Operand<i8> {
+  let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_i32: Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_i64: Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_f32: Operand<f32> {
+  let PrintMethod = "printS16ImmOperand";
+}
+
+def s16imm_f64: Operand<f64> {
+  let PrintMethod = "printS16ImmOperand";
+}
+
+def u16imm_i64 : Operand<i64> {
+  let PrintMethod = "printU16ImmOperand";
+}
+
+def u16imm_i32 : Operand<i32> {
+  let PrintMethod = "printU16ImmOperand";
+}
+
+def u16imm : Operand<i16> {
+  let PrintMethod = "printU16ImmOperand";
+}
+
+def f16imm : Operand<f32> {
+  let PrintMethod = "printU16ImmOperand";
+}
+
+def s18imm  : Operand<i32> {
+  let PrintMethod = "printS18ImmOperand";
+}
+
+def u18imm : Operand<i32> {
+  let PrintMethod = "printU18ImmOperand";
+}
+
+def u18imm_i64 : Operand<i64> {
+  let PrintMethod = "printU18ImmOperand";
+}
+
+def f18imm : Operand<f32> {
+  let PrintMethod = "printU18ImmOperand";
+}
+
+def f18imm_f64 : Operand<f64> {
+  let PrintMethod = "printU18ImmOperand";
+}
+
+// Negated 7-bit halfword rotate immediate operands
+def rothNeg7imm : Operand<i32> {
+  let PrintMethod = "printROTHNeg7Imm";
+}
+
+def rothNeg7imm_i16 : Operand<i16> {
+  let PrintMethod = "printROTHNeg7Imm";
+}
+
+// Negated 7-bit word rotate immediate operands
+def rotNeg7imm : Operand<i32> {
+  let PrintMethod = "printROTNeg7Imm";
+}
+
+def rotNeg7imm_i16 : Operand<i16> {
+  let PrintMethod = "printROTNeg7Imm";
+}
+
+def rotNeg7imm_i8 : Operand<i8> {
+  let PrintMethod = "printROTNeg7Imm";
+}
+
+def target : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+}
+
+// Absolute address call target
+def calltarget : Operand<iPTR> {
+  let PrintMethod = "printCallOperand";
+  let MIOperandInfo = (ops u18imm:$calldest);
+}
+
+// PC relative call target
+def relcalltarget : Operand<iPTR> {
+  let PrintMethod = "printPCRelativeOperand";
+  let MIOperandInfo = (ops s16imm:$calldest);
+}
+
+// Branch targets:
+def brtarget : Operand<OtherVT> {
+  let PrintMethod = "printPCRelativeOperand";
+}
+
+// Hint for branch target
+def hbrtarget : Operand<OtherVT> {
+  let PrintMethod = "printHBROperand";
+}
+
+// Indirect call target
+def indcalltarget : Operand<iPTR> {
+  let PrintMethod = "printCallOperand";
+  let MIOperandInfo = (ops ptr_rc:$calldest);
+}
+
+def symbolHi: Operand<i32> {
+  let PrintMethod = "printSymbolHi";
+}
+
+def symbolLo: Operand<i32> {
+  let PrintMethod = "printSymbolLo";
+}
+
+def symbolLSA: Operand<i32> {
+  let PrintMethod = "printSymbolLSA";
+}
+
+// Shuffle address memory operand [s7imm(reg) d-format]
+def shufaddr : Operand<iPTR> {
+  let PrintMethod = "printShufAddr";
+  let MIOperandInfo = (ops s7imm:$imm, ptr_rc:$reg);
+}
+
+// memory s10imm(reg) operand
+def dformaddr : Operand<iPTR> {
+  let PrintMethod = "printDFormAddr";
+  let MIOperandInfo = (ops s10imm:$imm, ptr_rc:$reg);
+}
+
+// 256K local store address
+// N.B.: The tblgen code generator expects to have two operands, an offset
+// and a pointer. Of these, only the immediate is actually used.
+def addr256k : Operand<iPTR> {
+  let PrintMethod = "printAddr256K";
+  let MIOperandInfo = (ops s16imm:$imm, ptr_rc:$reg);
+}
+
+// memory s18imm(reg) operand
+def memri18 : Operand<iPTR> {
+  let PrintMethod = "printMemRegImmS18";
+  let MIOperandInfo = (ops s18imm:$imm, ptr_rc:$reg);
+}
+
+// memory register + register operand
+def memrr : Operand<iPTR> {
+  let PrintMethod = "printMemRegReg";
+  let MIOperandInfo = (ops ptr_rc:$reg_a, ptr_rc:$reg_b);
+}
+
+// Define SPU-specific addressing modes: These come in three basic
+// flavors:
+//
+// D-form   : [r+I10] (10-bit signed offset + reg)
+// X-form   : [r+r]   (reg+reg)
+// A-form   : abs     (256K LSA offset)
+// D-form(2): [r+I7]  (7-bit signed offset + reg)
+
+def dform_addr   : ComplexPattern<iPTR, 2, "SelectDFormAddr",     [], []>;
+def xform_addr   : ComplexPattern<iPTR, 2, "SelectXFormAddr",     [], []>;
+def aform_addr   : ComplexPattern<iPTR, 2, "SelectAFormAddr",     [], []>;
+def dform2_addr  : ComplexPattern<iPTR, 2, "SelectDForm2Addr",    [], []>;
diff --git a/lib/Target/CellSPU/SPURegisterInfo.cpp b/lib/Target/CellSPU/SPURegisterInfo.cpp
new file mode 100644
index 0000000..af94e67
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterInfo.cpp
@@ -0,0 +1,620 @@
+//===- SPURegisterInfo.cpp - Cell SPU Register Information ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Cell implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "SPU.h"
+#include "SPURegisterInfo.h"
+#include "SPURegisterNames.h"
+#include "SPUInstrBuilder.h"
+#include "SPUSubtarget.h"
+#include "SPUMachineFunction.h"
+#include "SPUFrameInfo.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+
+using namespace llvm;
+
+/// getRegisterNumbering - Given the enum value for some register, e.g.
+/// SPU::R14, return the number that it corresponds to (e.g. 14).
+unsigned SPURegisterInfo::getRegisterNumbering(unsigned RegEnum) {
+  using namespace SPU;
+  switch (RegEnum) {
+  case SPU::R0: return 0;
+  case SPU::R1: return 1;
+  case SPU::R2: return 2;
+  case SPU::R3: return 3;
+  case SPU::R4: return 4;
+  case SPU::R5: return 5;
+  case SPU::R6: return 6;
+  case SPU::R7: return 7;
+  case SPU::R8: return 8;
+  case SPU::R9: return 9;
+  case SPU::R10: return 10;
+  case SPU::R11: return 11;
+  case SPU::R12: return 12;
+  case SPU::R13: return 13;
+  case SPU::R14: return 14;
+  case SPU::R15: return 15;
+  case SPU::R16: return 16;
+  case SPU::R17: return 17;
+  case SPU::R18: return 18;
+  case SPU::R19: return 19;
+  case SPU::R20: return 20;
+  case SPU::R21: return 21;
+  case SPU::R22: return 22;
+  case SPU::R23: return 23;
+  case SPU::R24: return 24;
+  case SPU::R25: return 25;
+  case SPU::R26: return 26;
+  case SPU::R27: return 27;
+  case SPU::R28: return 28;
+  case SPU::R29: return 29;
+  case SPU::R30: return 30;
+  case SPU::R31: return 31;
+  case SPU::R32: return 32;
+  case SPU::R33: return 33;
+  case SPU::R34: return 34;
+  case SPU::R35: return 35;
+  case SPU::R36: return 36;
+  case SPU::R37: return 37;
+  case SPU::R38: return 38;
+  case SPU::R39: return 39;
+  case SPU::R40: return 40;
+  case SPU::R41: return 41;
+  case SPU::R42: return 42;
+  case SPU::R43: return 43;
+  case SPU::R44: return 44;
+  case SPU::R45: return 45;
+  case SPU::R46: return 46;
+  case SPU::R47: return 47;
+  case SPU::R48: return 48;
+  case SPU::R49: return 49;
+  case SPU::R50: return 50;
+  case SPU::R51: return 51;
+  case SPU::R52: return 52;
+  case SPU::R53: return 53;
+  case SPU::R54: return 54;
+  case SPU::R55: return 55;
+  case SPU::R56: return 56;
+  case SPU::R57: return 57;
+  case SPU::R58: return 58;
+  case SPU::R59: return 59;
+  case SPU::R60: return 60;
+  case SPU::R61: return 61;
+  case SPU::R62: return 62;
+  case SPU::R63: return 63;
+  case SPU::R64: return 64;
+  case SPU::R65: return 65;
+  case SPU::R66: return 66;
+  case SPU::R67: return 67;
+  case SPU::R68: return 68;
+  case SPU::R69: return 69;
+  case SPU::R70: return 70;
+  case SPU::R71: return 71;
+  case SPU::R72: return 72;
+  case SPU::R73: return 73;
+  case SPU::R74: return 74;
+  case SPU::R75: return 75;
+  case SPU::R76: return 76;
+  case SPU::R77: return 77;
+  case SPU::R78: return 78;
+  case SPU::R79: return 79;
+  case SPU::R80: return 80;
+  case SPU::R81: return 81;
+  case SPU::R82: return 82;
+  case SPU::R83: return 83;
+  case SPU::R84: return 84;
+  case SPU::R85: return 85;
+  case SPU::R86: return 86;
+  case SPU::R87: return 87;
+  case SPU::R88: return 88;
+  case SPU::R89: return 89;
+  case SPU::R90: return 90;
+  case SPU::R91: return 91;
+  case SPU::R92: return 92;
+  case SPU::R93: return 93;
+  case SPU::R94: return 94;
+  case SPU::R95: return 95;
+  case SPU::R96: return 96;
+  case SPU::R97: return 97;
+  case SPU::R98: return 98;
+  case SPU::R99: return 99;
+  case SPU::R100: return 100;
+  case SPU::R101: return 101;
+  case SPU::R102: return 102;
+  case SPU::R103: return 103;
+  case SPU::R104: return 104;
+  case SPU::R105: return 105;
+  case SPU::R106: return 106;
+  case SPU::R107: return 107;
+  case SPU::R108: return 108;
+  case SPU::R109: return 109;
+  case SPU::R110: return 110;
+  case SPU::R111: return 111;
+  case SPU::R112: return 112;
+  case SPU::R113: return 113;
+  case SPU::R114: return 114;
+  case SPU::R115: return 115;
+  case SPU::R116: return 116;
+  case SPU::R117: return 117;
+  case SPU::R118: return 118;
+  case SPU::R119: return 119;
+  case SPU::R120: return 120;
+  case SPU::R121: return 121;
+  case SPU::R122: return 122;
+  case SPU::R123: return 123;
+  case SPU::R124: return 124;
+  case SPU::R125: return 125;
+  case SPU::R126: return 126;
+  case SPU::R127: return 127;
+  default:
+    llvm_report_error("Unhandled reg in SPURegisterInfo::getRegisterNumbering");
+  }
+}
+
+SPURegisterInfo::SPURegisterInfo(const SPUSubtarget &subtarget,
+                                 const TargetInstrInfo &tii) :
+  SPUGenRegisterInfo(SPU::ADJCALLSTACKDOWN, SPU::ADJCALLSTACKUP),
+  Subtarget(subtarget),
+  TII(tii)
+{
+}
+
+// SPU's 128-bit registers used for argument passing:
+static const unsigned SPU_ArgRegs[] = {
+  SPU::R3,  SPU::R4,  SPU::R5,  SPU::R6,  SPU::R7,  SPU::R8,  SPU::R9,
+  SPU::R10, SPU::R11, SPU::R12, SPU::R13, SPU::R14, SPU::R15, SPU::R16,
+  SPU::R17, SPU::R18, SPU::R19, SPU::R20, SPU::R21, SPU::R22, SPU::R23,
+  SPU::R24, SPU::R25, SPU::R26, SPU::R27, SPU::R28, SPU::R29, SPU::R30,
+  SPU::R31, SPU::R32, SPU::R33, SPU::R34, SPU::R35, SPU::R36, SPU::R37,
+  SPU::R38, SPU::R39, SPU::R40, SPU::R41, SPU::R42, SPU::R43, SPU::R44,
+  SPU::R45, SPU::R46, SPU::R47, SPU::R48, SPU::R49, SPU::R50, SPU::R51,
+  SPU::R52, SPU::R53, SPU::R54, SPU::R55, SPU::R56, SPU::R57, SPU::R58,
+  SPU::R59, SPU::R60, SPU::R61, SPU::R62, SPU::R63, SPU::R64, SPU::R65,
+  SPU::R66, SPU::R67, SPU::R68, SPU::R69, SPU::R70, SPU::R71, SPU::R72,
+  SPU::R73, SPU::R74, SPU::R75, SPU::R76, SPU::R77, SPU::R78, SPU::R79
+};
+
+const unsigned *
+SPURegisterInfo::getArgRegs()
+{
+  return SPU_ArgRegs;
+}
+
+unsigned
+SPURegisterInfo::getNumArgRegs()
+{
+  return sizeof(SPU_ArgRegs) / sizeof(SPU_ArgRegs[0]);
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *
+SPURegisterInfo::getPointerRegClass(unsigned Kind) const {
+  return &SPU::R32CRegClass;
+}
+
+const unsigned *
+SPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const
+{
+  // Cell ABI calling convention
+  static const unsigned SPU_CalleeSaveRegs[] = {
+    SPU::R80, SPU::R81, SPU::R82, SPU::R83,
+    SPU::R84, SPU::R85, SPU::R86, SPU::R87,
+    SPU::R88, SPU::R89, SPU::R90, SPU::R91,
+    SPU::R92, SPU::R93, SPU::R94, SPU::R95,
+    SPU::R96, SPU::R97, SPU::R98, SPU::R99,
+    SPU::R100, SPU::R101, SPU::R102, SPU::R103,
+    SPU::R104, SPU::R105, SPU::R106, SPU::R107,
+    SPU::R108, SPU::R109, SPU::R110, SPU::R111,
+    SPU::R112, SPU::R113, SPU::R114, SPU::R115,
+    SPU::R116, SPU::R117, SPU::R118, SPU::R119,
+    SPU::R120, SPU::R121, SPU::R122, SPU::R123,
+    SPU::R124, SPU::R125, SPU::R126, SPU::R127,
+    SPU::R2,    /* environment pointer */
+    SPU::R1,    /* stack pointer */
+    SPU::R0,    /* link register */
+    0 /* end */
+  };
+
+  return SPU_CalleeSaveRegs;
+}
+
+const TargetRegisterClass* const*
+SPURegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const
+{
+  // Cell ABI Calling Convention
+  static const TargetRegisterClass * const SPU_CalleeSaveRegClasses[] = {
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, &SPU::GPRCRegClass, &SPU::GPRCRegClass,
+    &SPU::GPRCRegClass, /* environment pointer */
+    &SPU::GPRCRegClass, /* stack pointer */
+    &SPU::GPRCRegClass, /* link register */
+    0 /* end */
+  };
+
+  return SPU_CalleeSaveRegClasses;
+}
+
+/*!
+ R0 (link register), R1 (stack pointer) and R2 (environment pointer -- this is
+ generally unused) are the Cell's reserved registers
+ */
+BitVector SPURegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(SPU::R0);                // LR
+  Reserved.set(SPU::R1);                // SP
+  Reserved.set(SPU::R2);                // environment pointer
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+static bool needsFP(const MachineFunction &MF) {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+//--------------------------------------------------------------------------
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register.  This is true if the function needs a frame pointer and has
+// a non-zero stack size.
+bool
+SPURegisterInfo::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->getStackSize() && needsFP(MF);
+}
+
+//--------------------------------------------------------------------------
+void
+SPURegisterInfo::eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                               MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator I)
+  const
+{
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+unsigned
+SPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                                     int *Value, RegScavenger *RS) const
+{
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  MachineOperand &SPOp = MI.getOperand(i);
+  int FrameIndex = SPOp.getIndex();
+
+  // Now add the frame object offset to the offset from r1.
+  int Offset = MFI->getObjectOffset(FrameIndex);
+
+  // Most instructions, except for generated FrameIndex additions using AIr32
+  // and ILAr32, have the immediate in operand 1. AIr32 and ILAr32 have the
+  // immediate in operand 2.
+  unsigned OpNo = 1;
+  if (MI.getOpcode() == SPU::AIr32 || MI.getOpcode() == SPU::ILAr32)
+    OpNo = 2;
+
+  MachineOperand &MO = MI.getOperand(OpNo);
+
+  // Offset is biased by $lr's slot at the bottom.
+  Offset += MO.getImm() + MFI->getStackSize() + SPUFrameInfo::minStackSize();
+  assert((Offset & 0xf) == 0
+         && "16-byte alignment violated in eliminateFrameIndex");
+
+  // Replace the FrameIndex with base register with $sp (aka $r1)
+  SPOp.ChangeToRegister(SPU::R1, false);
+  if (Offset > SPUFrameInfo::maxFrameOffset()
+      || Offset < SPUFrameInfo::minFrameOffset()) {
+    errs() << "Large stack adjustment ("
+         << Offset
+         << ") in SPURegisterInfo::eliminateFrameIndex.";
+  } else {
+    MO.ChangeToImmediate(Offset);
+  }
+  return 0;
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+void
+SPURegisterInfo::determineFrameLayout(MachineFunction &MF) const
+{
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned FrameSize = MFI->getStackSize();
+
+  // Get the alignments provided by the target, and the maximum alignment
+  // (if any) of the fixed frame objects.
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned Align = std::max(TargetAlign, MFI->getMaxAlignment());
+  assert(isPowerOf2_32(Align) && "Alignment is not power of 2");
+  unsigned AlignMask = Align - 1;
+
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+
+  // Update maximum call frame size.
+  MFI->setMaxCallFrameSize(maxCallFrameSize);
+
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+  // Update frame info.
+  MFI->setStackSize(FrameSize);
+}
+
+void SPURegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                           RegScavenger *RS)
+  const {
+  // Mark LR and SP unused, since the prolog spills them to stack and
+  // we don't want anyone else to spill them for us.
+  //
+  // Also, unless R2 is really used someday, don't spill it automatically.
+  MF.getRegInfo().setPhysRegUnused(SPU::R0);
+  MF.getRegInfo().setPhysRegUnused(SPU::R1);
+  MF.getRegInfo().setPhysRegUnused(SPU::R2);
+}
+
+void SPURegisterInfo::emitPrologue(MachineFunction &MF) const
+{
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+  // Prepare for debug frame info.
+  bool hasDebugInfo = MMI && MMI->hasDebugInfo();
+  unsigned FrameLabelId = 0;
+
+  // Move MBBI back to the beginning of the function.
+  MBBI = MBB.begin();
+
+  // Work out frame sizes.
+  determineFrameLayout(MF);
+  int FrameSize = MFI->getStackSize();
+
+  assert((FrameSize & 0xf) == 0
+         && "SPURegisterInfo::emitPrologue: FrameSize not aligned");
+
+  if (FrameSize > 0 || MFI->hasCalls()) {
+    FrameSize = -(FrameSize + SPUFrameInfo::minStackSize());
+    if (hasDebugInfo) {
+      // Mark effective beginning of when frame pointer becomes valid.
+      FrameLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addImm(FrameLabelId);
+    }
+
+    // Adjust stack pointer, spilling $lr -> 16($sp) and $sp -> -FrameSize($sp)
+    // for the ABI
+    BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R0).addImm(16)
+      .addReg(SPU::R1);
+    if (isS10Constant(FrameSize)) {
+      // Spill $sp to adjusted $sp
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1).addImm(FrameSize)
+        .addReg(SPU::R1);
+      // Adjust $sp by required amount
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1).addReg(SPU::R1)
+        .addImm(FrameSize);
+    } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) {
+      // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use
+      // $r2 to adjust $sp:
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2)
+        .addImm(-16)
+        .addReg(SPU::R1);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2)
+        .addImm(FrameSize);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr32), SPU::R1)
+        .addReg(SPU::R2)
+        .addReg(SPU::R1);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1)
+        .addReg(SPU::R1)
+        .addReg(SPU::R2);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2)
+        .addReg(SPU::R2)
+        .addImm(16);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2)
+        .addReg(SPU::R2)
+        .addReg(SPU::R1);
+    } else {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "Unhandled frame size: " << FrameSize;
+      llvm_report_error(Msg.str());
+    }
+
+    if (hasDebugInfo) {
+      std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+
+      // Show update of SP.
+      MachineLocation SPDst(MachineLocation::VirtualFP);
+      MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize);
+      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+
+      // Add callee saved registers to move list.
+      const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+      for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+        int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
+        unsigned Reg = CSI[I].getReg();
+        if (Reg == SPU::R0) continue;
+        MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+        MachineLocation CSSrc(Reg);
+        Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+      }
+
+      // Mark effective beginning of when frame pointer is ready.
+      unsigned ReadyLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addImm(ReadyLabelId);
+
+      MachineLocation FPDst(SPU::R1);
+      MachineLocation FPSrc(MachineLocation::VirtualFP);
+      Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
+    }
+  } else {
+    // This is a leaf function -- insert a branch hint iff there is a
+    // sufficient number of instructions in the basic block. Note that
+    // this is just a best guess based on the basic block's size.
+    if (MBB.size() >= (unsigned) SPUFrameInfo::branchHintPenalty()) {
+      MachineBasicBlock::iterator MBBI = prior(MBB.end());
+      dl = MBBI->getDebugLoc();
+
+      // Insert terminator label
+      unsigned BranchLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::DBG_LABEL)).addImm(BranchLabelId);
+    }
+  }
+}
+
+void
+SPURegisterInfo::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int FrameSize = MFI->getStackSize();
+  int LinkSlotOffset = SPUFrameInfo::stackSlotSize();
+  DebugLoc dl = MBBI->getDebugLoc();
+
+  assert(MBBI->getOpcode() == SPU::RET &&
+         "Can only insert epilog into returning blocks");
+  assert((FrameSize & 0xf) == 0
+         && "SPURegisterInfo::emitEpilogue: FrameSize not aligned");
+  if (FrameSize > 0 || MFI->hasCalls()) {
+    FrameSize = FrameSize + SPUFrameInfo::minStackSize();
+    if (isS10Constant(FrameSize + LinkSlotOffset)) {
+      // Reload $lr, adjust $sp by required amount
+      // Note: We do this to slightly improve dual issue -- not by much, but it
+      // is an opportunity for dual issue.
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0)
+        .addImm(FrameSize + LinkSlotOffset)
+        .addReg(SPU::R1);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::AIr32), SPU::R1)
+        .addReg(SPU::R1)
+        .addImm(FrameSize);
+    } else if (FrameSize <= (1 << 16) - 1 && FrameSize >= -(1 << 16)) {
+      // Frame size can be loaded into ILr32n, so temporarily spill $r2 and use
+      // $r2 to adjust $sp:
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::STQDr128), SPU::R2)
+        .addImm(16)
+        .addReg(SPU::R1);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::ILr32), SPU::R2)
+        .addImm(FrameSize);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::Ar32), SPU::R1)
+        .addReg(SPU::R1)
+        .addReg(SPU::R2);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::LQDr128), SPU::R0)
+        .addImm(16)
+        .addReg(SPU::R2);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::SFIr32), SPU::R2).
+        addReg(SPU::R2)
+        .addImm(16);
+      BuildMI(MBB, MBBI, dl, TII.get(SPU::LQXr128), SPU::R2)
+        .addReg(SPU::R2)
+        .addReg(SPU::R1);
+    } else {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "Unhandled frame size: " << FrameSize;
+      llvm_report_error(Msg.str());
+    }
+   }
+}
+
+unsigned
+SPURegisterInfo::getRARegister() const
+{
+  return SPU::R0;
+}
+
+unsigned
+SPURegisterInfo::getFrameRegister(const MachineFunction &MF) const
+{
+  return SPU::R1;
+}
+
+void
+SPURegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const
+{
+  // Initial state of the frame pointer is R1.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(SPU::R1, 0);
+  Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+
+int
+SPURegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  // FIXME: Most probably dwarf numbers differs for Linux and Darwin
+  return SPUGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+#include "SPUGenRegisterInfo.inc"
diff --git a/lib/Target/CellSPU/SPURegisterInfo.h b/lib/Target/CellSPU/SPURegisterInfo.h
new file mode 100644
index 0000000..9691cb6
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterInfo.h
@@ -0,0 +1,103 @@
+//===- SPURegisterInfo.h - Cell SPU Register Information Impl ----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Cell SPU implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_REGISTERINFO_H
+#define SPU_REGISTERINFO_H
+
+#include "SPU.h"
+#include "SPUGenRegisterInfo.h.inc"
+
+namespace llvm {
+  class SPUSubtarget;
+  class TargetInstrInfo;
+  class Type;
+
+  class SPURegisterInfo : public SPUGenRegisterInfo {
+  private:
+    const SPUSubtarget &Subtarget;
+    const TargetInstrInfo &TII;
+
+    //! Predicate: Does the machine function use the link register?
+    bool usesLR(MachineFunction &MF) const;
+
+  public:
+    SPURegisterInfo(const SPUSubtarget &subtarget, const TargetInstrInfo &tii);
+    
+    //! Translate a register's enum value to a register number
+    /*!
+      This method translates a register's enum value to its register number,
+      e.g. SPU::R14 -> 14.
+     */
+    static unsigned getRegisterNumbering(unsigned RegEnum);
+
+    /// getPointerRegClass - Return the register class to use to hold pointers.
+    /// This is used for addressing modes.
+    virtual const TargetRegisterClass *
+    getPointerRegClass(unsigned Kind = 0) const;
+
+    //! Return the array of callee-saved registers
+    virtual const unsigned* getCalleeSavedRegs(const MachineFunction *MF) const;
+
+    //! Return the register class array of the callee-saved registers
+    virtual const TargetRegisterClass* const *
+      getCalleeSavedRegClasses(const MachineFunction *MF) const;
+
+    //! Return the reserved registers
+    BitVector getReservedRegs(const MachineFunction &MF) const;
+
+    //! Predicate: Target has dedicated frame pointer
+    bool hasFP(const MachineFunction &MF) const;
+    //! Eliminate the call frame setup pseudo-instructions
+    void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                       MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator I) const;
+    //! Convert frame indices into machine operands
+    unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                                 int *Value = NULL,
+                                 RegScavenger *RS = NULL) const;
+    //! Determine the frame's layout
+    void determineFrameLayout(MachineFunction &MF) const;
+
+    void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                              RegScavenger *RS = NULL) const;
+    //! Emit the function prologue
+    void emitPrologue(MachineFunction &MF) const;
+    //! Emit the function epilogue
+    void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+    //! Get return address register (LR, aka R0)
+    unsigned getRARegister() const;
+    //! Get the stack frame register (SP, aka R1)
+    unsigned getFrameRegister(const MachineFunction &MF) const;
+    //! Perform target-specific stack frame setup.
+    void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+    //------------------------------------------------------------------------
+    // New methods added:
+    //------------------------------------------------------------------------
+
+    //! Return the array of argument passing registers
+    /*!
+      \note The size of this array is returned by getArgRegsSize().
+     */
+    static const unsigned *getArgRegs();
+
+    //! Return the size of the argument passing register array
+    static unsigned getNumArgRegs();
+
+    //! Get DWARF debugging register number
+    int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/CellSPU/SPURegisterInfo.td b/lib/Target/CellSPU/SPURegisterInfo.td
new file mode 100644
index 0000000..bb88f2b
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterInfo.td
@@ -0,0 +1,429 @@
+//===- SPURegisterInfo.td - The Cell SPU Register File -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+class SPUReg<string n> : Register<n> {
+  let Namespace = "SPU";
+}
+
+// The SPU's registers are all 128 bits wide, which makes specifying the
+// registers relatively easy, if relatively mundane:
+
+class SPUVecReg<bits<7> num, string n> : SPUReg<n> {
+  field bits<7> Num = num;
+}
+
+def R0 : SPUVecReg<0, "$lr">, DwarfRegNum<[0]>;
+def R1 : SPUVecReg<1, "$sp">, DwarfRegNum<[1]>;
+def R2 : SPUVecReg<2, "$2">, DwarfRegNum<[2]>;
+def R3 : SPUVecReg<3, "$3">, DwarfRegNum<[3]>;
+def R4 : SPUVecReg<4, "$4">, DwarfRegNum<[4]>;
+def R5 : SPUVecReg<5, "$5">, DwarfRegNum<[5]>;
+def R6 : SPUVecReg<6, "$6">, DwarfRegNum<[6]>;
+def R7 : SPUVecReg<7, "$7">, DwarfRegNum<[7]>;
+def R8 : SPUVecReg<8, "$8">, DwarfRegNum<[8]>;
+def R9 : SPUVecReg<9, "$9">, DwarfRegNum<[9]>;
+def R10 : SPUVecReg<10, "$10">, DwarfRegNum<[10]>;
+def R11 : SPUVecReg<11, "$11">, DwarfRegNum<[11]>;
+def R12 : SPUVecReg<12, "$12">, DwarfRegNum<[12]>;
+def R13 : SPUVecReg<13, "$13">, DwarfRegNum<[13]>;
+def R14 : SPUVecReg<14, "$14">, DwarfRegNum<[14]>;
+def R15 : SPUVecReg<15, "$15">, DwarfRegNum<[15]>;
+def R16 : SPUVecReg<16, "$16">, DwarfRegNum<[16]>;
+def R17 : SPUVecReg<17, "$17">, DwarfRegNum<[17]>;
+def R18 : SPUVecReg<18, "$18">, DwarfRegNum<[18]>;
+def R19 : SPUVecReg<19, "$19">, DwarfRegNum<[19]>;
+def R20 : SPUVecReg<20, "$20">, DwarfRegNum<[20]>;
+def R21 : SPUVecReg<21, "$21">, DwarfRegNum<[21]>;
+def R22 : SPUVecReg<22, "$22">, DwarfRegNum<[22]>;
+def R23 : SPUVecReg<23, "$23">, DwarfRegNum<[23]>;
+def R24 : SPUVecReg<24, "$24">, DwarfRegNum<[24]>;
+def R25 : SPUVecReg<25, "$25">, DwarfRegNum<[25]>;
+def R26 : SPUVecReg<26, "$26">, DwarfRegNum<[26]>;
+def R27 : SPUVecReg<27, "$27">, DwarfRegNum<[27]>;
+def R28 : SPUVecReg<28, "$28">, DwarfRegNum<[28]>;
+def R29 : SPUVecReg<29, "$29">, DwarfRegNum<[29]>;
+def R30 : SPUVecReg<30, "$30">, DwarfRegNum<[30]>;
+def R31 : SPUVecReg<31, "$31">, DwarfRegNum<[31]>;
+def R32 : SPUVecReg<32, "$32">, DwarfRegNum<[32]>;
+def R33 : SPUVecReg<33, "$33">, DwarfRegNum<[33]>;
+def R34 : SPUVecReg<34, "$34">, DwarfRegNum<[34]>;
+def R35 : SPUVecReg<35, "$35">, DwarfRegNum<[35]>;
+def R36 : SPUVecReg<36, "$36">, DwarfRegNum<[36]>;
+def R37 : SPUVecReg<37, "$37">, DwarfRegNum<[37]>;
+def R38 : SPUVecReg<38, "$38">, DwarfRegNum<[38]>;
+def R39 : SPUVecReg<39, "$39">, DwarfRegNum<[39]>;
+def R40 : SPUVecReg<40, "$40">, DwarfRegNum<[40]>;
+def R41 : SPUVecReg<41, "$41">, DwarfRegNum<[41]>;
+def R42 : SPUVecReg<42, "$42">, DwarfRegNum<[42]>;
+def R43 : SPUVecReg<43, "$43">, DwarfRegNum<[43]>;
+def R44 : SPUVecReg<44, "$44">, DwarfRegNum<[44]>;
+def R45 : SPUVecReg<45, "$45">, DwarfRegNum<[45]>;
+def R46 : SPUVecReg<46, "$46">, DwarfRegNum<[46]>;
+def R47 : SPUVecReg<47, "$47">, DwarfRegNum<[47]>;
+def R48 : SPUVecReg<48, "$48">, DwarfRegNum<[48]>;
+def R49 : SPUVecReg<49, "$49">, DwarfRegNum<[49]>;
+def R50 : SPUVecReg<50, "$50">, DwarfRegNum<[50]>;
+def R51 : SPUVecReg<51, "$51">, DwarfRegNum<[51]>;
+def R52 : SPUVecReg<52, "$52">, DwarfRegNum<[52]>;
+def R53 : SPUVecReg<53, "$53">, DwarfRegNum<[53]>;
+def R54 : SPUVecReg<54, "$54">, DwarfRegNum<[54]>;
+def R55 : SPUVecReg<55, "$55">, DwarfRegNum<[55]>;
+def R56 : SPUVecReg<56, "$56">, DwarfRegNum<[56]>;
+def R57 : SPUVecReg<57, "$57">, DwarfRegNum<[57]>;
+def R58 : SPUVecReg<58, "$58">, DwarfRegNum<[58]>;
+def R59 : SPUVecReg<59, "$59">, DwarfRegNum<[59]>;
+def R60 : SPUVecReg<60, "$60">, DwarfRegNum<[60]>;
+def R61 : SPUVecReg<61, "$61">, DwarfRegNum<[61]>;
+def R62 : SPUVecReg<62, "$62">, DwarfRegNum<[62]>;
+def R63 : SPUVecReg<63, "$63">, DwarfRegNum<[63]>;
+def R64 : SPUVecReg<64, "$64">, DwarfRegNum<[64]>;
+def R65 : SPUVecReg<65, "$65">, DwarfRegNum<[65]>;
+def R66 : SPUVecReg<66, "$66">, DwarfRegNum<[66]>;
+def R67 : SPUVecReg<67, "$67">, DwarfRegNum<[67]>;
+def R68 : SPUVecReg<68, "$68">, DwarfRegNum<[68]>;
+def R69 : SPUVecReg<69, "$69">, DwarfRegNum<[69]>;
+def R70 : SPUVecReg<70, "$70">, DwarfRegNum<[70]>;
+def R71 : SPUVecReg<71, "$71">, DwarfRegNum<[71]>;
+def R72 : SPUVecReg<72, "$72">, DwarfRegNum<[72]>;
+def R73 : SPUVecReg<73, "$73">, DwarfRegNum<[73]>;
+def R74 : SPUVecReg<74, "$74">, DwarfRegNum<[74]>;
+def R75 : SPUVecReg<75, "$75">, DwarfRegNum<[75]>;
+def R76 : SPUVecReg<76, "$76">, DwarfRegNum<[76]>;
+def R77 : SPUVecReg<77, "$77">, DwarfRegNum<[77]>;
+def R78 : SPUVecReg<78, "$78">, DwarfRegNum<[78]>;
+def R79 : SPUVecReg<79, "$79">, DwarfRegNum<[79]>;
+def R80 : SPUVecReg<80, "$80">, DwarfRegNum<[80]>;
+def R81 : SPUVecReg<81, "$81">, DwarfRegNum<[81]>;
+def R82 : SPUVecReg<82, "$82">, DwarfRegNum<[82]>;
+def R83 : SPUVecReg<83, "$83">, DwarfRegNum<[83]>;
+def R84 : SPUVecReg<84, "$84">, DwarfRegNum<[84]>;
+def R85 : SPUVecReg<85, "$85">, DwarfRegNum<[85]>;
+def R86 : SPUVecReg<86, "$86">, DwarfRegNum<[86]>;
+def R87 : SPUVecReg<87, "$87">, DwarfRegNum<[87]>;
+def R88 : SPUVecReg<88, "$88">, DwarfRegNum<[88]>;
+def R89 : SPUVecReg<89, "$89">, DwarfRegNum<[89]>;
+def R90 : SPUVecReg<90, "$90">, DwarfRegNum<[90]>;
+def R91 : SPUVecReg<91, "$91">, DwarfRegNum<[91]>;
+def R92 : SPUVecReg<92, "$92">, DwarfRegNum<[92]>;
+def R93 : SPUVecReg<93, "$93">, DwarfRegNum<[93]>;
+def R94 : SPUVecReg<94, "$94">, DwarfRegNum<[94]>;
+def R95 : SPUVecReg<95, "$95">, DwarfRegNum<[95]>;
+def R96 : SPUVecReg<96, "$96">, DwarfRegNum<[96]>;
+def R97 : SPUVecReg<97, "$97">, DwarfRegNum<[97]>;
+def R98 : SPUVecReg<98, "$98">, DwarfRegNum<[98]>;
+def R99 : SPUVecReg<99, "$99">, DwarfRegNum<[99]>;
+def R100 : SPUVecReg<100, "$100">, DwarfRegNum<[100]>;
+def R101 : SPUVecReg<101, "$101">, DwarfRegNum<[101]>;
+def R102 : SPUVecReg<102, "$102">, DwarfRegNum<[102]>;
+def R103 : SPUVecReg<103, "$103">, DwarfRegNum<[103]>;
+def R104 : SPUVecReg<104, "$104">, DwarfRegNum<[104]>;
+def R105 : SPUVecReg<105, "$105">, DwarfRegNum<[105]>;
+def R106 : SPUVecReg<106, "$106">, DwarfRegNum<[106]>;
+def R107 : SPUVecReg<107, "$107">, DwarfRegNum<[107]>;
+def R108 : SPUVecReg<108, "$108">, DwarfRegNum<[108]>;
+def R109 : SPUVecReg<109, "$109">, DwarfRegNum<[109]>;
+def R110 : SPUVecReg<110, "$110">, DwarfRegNum<[110]>;
+def R111 : SPUVecReg<111, "$111">, DwarfRegNum<[111]>;
+def R112 : SPUVecReg<112, "$112">, DwarfRegNum<[112]>;
+def R113 : SPUVecReg<113, "$113">, DwarfRegNum<[113]>;
+def R114 : SPUVecReg<114, "$114">, DwarfRegNum<[114]>;
+def R115 : SPUVecReg<115, "$115">, DwarfRegNum<[115]>;
+def R116 : SPUVecReg<116, "$116">, DwarfRegNum<[116]>;
+def R117 : SPUVecReg<117, "$117">, DwarfRegNum<[117]>;
+def R118 : SPUVecReg<118, "$118">, DwarfRegNum<[118]>;
+def R119 : SPUVecReg<119, "$119">, DwarfRegNum<[119]>;
+def R120 : SPUVecReg<120, "$120">, DwarfRegNum<[120]>;
+def R121 : SPUVecReg<121, "$121">, DwarfRegNum<[121]>;
+def R122 : SPUVecReg<122, "$122">, DwarfRegNum<[122]>;
+def R123 : SPUVecReg<123, "$123">, DwarfRegNum<[123]>;
+def R124 : SPUVecReg<124, "$124">, DwarfRegNum<[124]>;
+def R125 : SPUVecReg<125, "$125">, DwarfRegNum<[125]>;
+def R126 : SPUVecReg<126, "$126">, DwarfRegNum<[126]>;
+def R127 : SPUVecReg<127, "$127">, DwarfRegNum<[127]>;
+
+/* Need floating point status register here: */
+/* def FPCSR : ... */
+
+// The SPU's registers are 128-bit wide entities that can function as general
+// purpose registers, where the operands are in the "preferred slot":
+def GPRC : RegisterClass<"SPU", [i128], 128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GPRCClass::iterator
+    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    GPRCClass::iterator
+    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
+
+// The SPU's registers as 64-bit wide (double word integer) "preferred slot":
+def R64C : RegisterClass<"SPU", [i64], 128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    R64CClass::iterator
+    R64CClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    R64CClass::iterator
+    R64CClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
+
+// The SPU's registers as 64-bit wide (double word) FP "preferred slot":
+def R64FP : RegisterClass<"SPU", [f64], 128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    R64FPClass::iterator
+    R64FPClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    R64FPClass::iterator
+    R64FPClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
+
+// The SPU's registers as 32-bit wide (word) "preferred slot":
+def R32C : RegisterClass<"SPU", [i32], 128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    R32CClass::iterator
+    R32CClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    R32CClass::iterator
+    R32CClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
+
+// The SPU's registers as single precision floating point "preferred slot":
+def R32FP : RegisterClass<"SPU", [f32], 128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    R32FPClass::iterator
+    R32FPClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    R32FPClass::iterator
+    R32FPClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
+
+// The SPU's registers as 16-bit wide (halfword) "preferred slot":
+def R16C : RegisterClass<"SPU", [i16], 128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    R16CClass::iterator
+    R16CClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    R16CClass::iterator
+    R16CClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
+
+// The SPU's registers as 8-bit wide (byte) "preferred slot":
+def R8C : RegisterClass<"SPU", [i8], 128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    R8CClass::iterator
+    R8CClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    R8CClass::iterator
+    R8CClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
+
+// The SPU's registers as vector registers:
+def VECREG : RegisterClass<"SPU",
+                           [v16i8,v8i16,v2i32,v4i32,v4f32,v2i64,v2f64],
+                           128,
+ [
+   /* volatile register */
+   R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, 
+   R17, R18, R19, R20, R21, R22, R23, R24, R25, R26, R27, R28, R29, R30, R31,
+   R32, R33, R34, R35, R36, R37, R38, R39, R40, R41, R42, R43, R44, R45, R46,
+   R47, R48, R49, R50, R51, R52, R53, R54, R55, R56, R57, R58, R59, R60, R61,
+   R62, R63, R64, R65, R66, R67, R68, R69, R70, R71, R72, R73, R74, R75, R76,
+   R77, R78, R79,
+   /* non-volatile register: take hint from PPC and allocate in reverse order */
+   R127, R126, R125, R124, R123, R122, R121, R120, R119, R118, R117, R116, R115,
+   R114, R113, R112, R111, R110, R109, R108, R107, R106, R105, R104, R103, R102,
+   R101, R100, R99, R98, R97, R96, R95, R94, R93, R92, R91, R90, R89, R88, R87,
+   R86, R85, R84, R83, R82, R81, R80, 
+   /* environment ptr, SP, LR */ 
+   R2, R1, R0 ]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    VECREGClass::iterator
+    VECREGClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    VECREGClass::iterator
+    VECREGClass::allocation_order_end(const MachineFunction &MF) const {
+      return end()-3;  // don't allocate R2, R1, or R0 (envp, sp, lr)
+    }
+  }];
+}
diff --git a/lib/Target/CellSPU/SPURegisterNames.h b/lib/Target/CellSPU/SPURegisterNames.h
new file mode 100644
index 0000000..6c3afdf
--- /dev/null
+++ b/lib/Target/CellSPU/SPURegisterNames.h
@@ -0,0 +1,18 @@
+//===- SPURegisterNames.h - Wrapper header for SPU register names -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_REGISTER_NAMES_H
+#define SPU_REGISTER_NAMES_H
+
+// Define symbolic names for Cell registers.  This defines a mapping from
+// register name to register number.
+//
+#include "SPUGenRegisterNames.inc"
+
+#endif
diff --git a/lib/Target/CellSPU/SPUSchedule.td b/lib/Target/CellSPU/SPUSchedule.td
new file mode 100644
index 0000000..785dc46
--- /dev/null
+++ b/lib/Target/CellSPU/SPUSchedule.td
@@ -0,0 +1,57 @@
+//===- SPUSchedule.td - Cell Scheduling Definitions --------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Even pipeline:
+
+def EVEN_UNIT : FuncUnit;       // Even execution unit: (PC & 0x7 == 000)
+def ODD_UNIT  : FuncUnit;       // Odd execution unit:  (PC & 0x7 == 100)
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Cell SPU
+//===----------------------------------------------------------------------===//
+
+def LoadStore    : InstrItinClass;              // ODD_UNIT
+def BranchHints  : InstrItinClass;              // ODD_UNIT
+def BranchResolv : InstrItinClass;              // ODD_UNIT
+def ChanOpSPR    : InstrItinClass;              // ODD_UNIT
+def ShuffleOp    : InstrItinClass;              // ODD_UNIT
+def SelectOp     : InstrItinClass;              // ODD_UNIT
+def GatherOp     : InstrItinClass;              // ODD_UNIT
+def LoadNOP      : InstrItinClass;              // ODD_UNIT
+def ExecNOP      : InstrItinClass;              // EVEN_UNIT
+def SPrecFP      : InstrItinClass;              // EVEN_UNIT
+def DPrecFP      : InstrItinClass;              // EVEN_UNIT
+def FPInt        : InstrItinClass;              // EVEN_UNIT (FP<->integer)
+def ByteOp       : InstrItinClass;              // EVEN_UNIT
+def IntegerOp    : InstrItinClass;              // EVEN_UNIT
+def IntegerMulDiv: InstrItinClass;              // EVEN_UNIT
+def RotateShift  : InstrItinClass;              // EVEN_UNIT
+def ImmLoad      : InstrItinClass;              // EVEN_UNIT
+
+/* Note: The itinerary for the Cell SPU is somewhat contrived... */
+def SPUItineraries : ProcessorItineraries<[
+  InstrItinData<LoadStore   , [InstrStage<6,  [ODD_UNIT]>]>,
+  InstrItinData<BranchHints , [InstrStage<6,  [ODD_UNIT]>]>,
+  InstrItinData<BranchResolv, [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<ChanOpSPR   , [InstrStage<6,  [ODD_UNIT]>]>,
+  InstrItinData<ShuffleOp   , [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<SelectOp    , [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<GatherOp    , [InstrStage<4,  [ODD_UNIT]>]>,
+  InstrItinData<LoadNOP     , [InstrStage<1,  [ODD_UNIT]>]>,
+  InstrItinData<ExecNOP     , [InstrStage<1,  [EVEN_UNIT]>]>,
+  InstrItinData<SPrecFP     , [InstrStage<6,  [EVEN_UNIT]>]>,
+  InstrItinData<DPrecFP     , [InstrStage<13, [EVEN_UNIT]>]>,
+  InstrItinData<FPInt       , [InstrStage<2,  [EVEN_UNIT]>]>,
+  InstrItinData<ByteOp      , [InstrStage<4,  [EVEN_UNIT]>]>,
+  InstrItinData<IntegerOp   , [InstrStage<2,  [EVEN_UNIT]>]>,
+  InstrItinData<RotateShift , [InstrStage<4,  [EVEN_UNIT]>]>,
+  InstrItinData<IntegerMulDiv,[InstrStage<7,  [EVEN_UNIT]>]>,
+  InstrItinData<ImmLoad     , [InstrStage<2,  [EVEN_UNIT]>]>
+  ]>;
diff --git a/lib/Target/CellSPU/SPUSubtarget.cpp b/lib/Target/CellSPU/SPUSubtarget.cpp
new file mode 100644
index 0000000..0f18b7f
--- /dev/null
+++ b/lib/Target/CellSPU/SPUSubtarget.cpp
@@ -0,0 +1,36 @@
+//===- SPUSubtarget.cpp - STI Cell SPU Subtarget Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CellSPU-specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPUSubtarget.h"
+#include "SPU.h"
+#include "SPUGenSubtarget.inc"
+
+using namespace llvm;
+
+SPUSubtarget::SPUSubtarget(const std::string &TT, const std::string &FS) :
+  StackAlignment(16),
+  ProcDirective(SPU::DEFAULT_PROC),
+  UseLargeMem(false)
+{
+  // Should be the target SPU processor type. For now, since there's only
+  // one, simply default to the current "v0" default:
+  std::string default_cpu("v0");
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, default_cpu);
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void SPUSubtarget::SetJITMode() {
+}
diff --git a/lib/Target/CellSPU/SPUSubtarget.h b/lib/Target/CellSPU/SPUSubtarget.h
new file mode 100644
index 0000000..88201c6
--- /dev/null
+++ b/lib/Target/CellSPU/SPUSubtarget.h
@@ -0,0 +1,90 @@
+//===-- SPUSubtarget.h - Define Subtarget for the Cell SPU ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Cell SPU-specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CELLSUBTARGET_H
+#define CELLSUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+  class GlobalValue;
+
+  namespace SPU {
+    enum {
+      PROC_NONE,
+      DEFAULT_PROC
+    };
+  }
+    
+  class SPUSubtarget : public TargetSubtarget {
+  protected:
+    /// stackAlignment - The minimum alignment known to hold of the stack frame
+    /// on entry to the function and which must be maintained by every function.
+    unsigned StackAlignment;
+    
+    /// Selected instruction itineraries (one entry per itinerary class.)
+    InstrItineraryData InstrItins;
+
+    /// Which SPU processor (this isn't really used, but it's there to keep
+    /// the C compiler happy)
+    unsigned ProcDirective;
+
+    /// Use (assume) large memory -- effectively disables the LQA/STQA
+    /// instructions that assume 256K local store.
+    bool UseLargeMem;
+    
+  public:
+    /// This constructor initializes the data members to match that
+    /// of the specified triple.
+    ///
+    SPUSubtarget(const std::string &TT, const std::string &FS);
+    
+    /// ParseSubtargetFeatures - Parses features string setting specified 
+    /// subtarget options.  Definition of function is auto generated by tblgen.
+    std::string ParseSubtargetFeatures(const std::string &FS,
+                                       const std::string &CPU);
+
+    /// SetJITMode - This is called to inform the subtarget info that we are
+    /// producing code for the JIT.
+    void SetJITMode();
+
+    /// getStackAlignment - Returns the minimum alignment known to hold of the
+    /// stack frame on entry to the function and which must be maintained by
+    /// every function for this subtarget.
+    unsigned getStackAlignment() const { return StackAlignment; }
+    
+    /// getInstrItineraryData - Return the instruction itineraries based on
+    /// subtarget selection.
+    const InstrItineraryData &getInstrItineraryData() const {
+      return InstrItins;
+    }
+
+    /// Use large memory addressing predicate
+    bool usingLargeMem() const {
+      return UseLargeMem;
+    }
+
+    /// getTargetDataString - Return the pointer size and type alignment
+    /// properties of this subtarget.
+    const char *getTargetDataString() const {
+      return "E-p:32:32:128-f64:64:128-f32:32:128-i64:32:128-i32:32:128"
+             "-i16:16:128-i8:8:128-i1:8:128-a:0:128-v64:128:128-v128:128:128"
+             "-s:128:128-n32:64";
+    }
+  };
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp
new file mode 100644
index 0000000..6500067
--- /dev/null
+++ b/lib/Target/CellSPU/SPUTargetMachine.cpp
@@ -0,0 +1,60 @@
+//===-- SPUTargetMachine.cpp - Define TargetMachine for Cell SPU ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the Cell SPU target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "SPURegisterNames.h"
+#include "SPUMCAsmInfo.h"
+#include "SPUTargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/Target/TargetRegistry.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializeCellSPUTarget() { 
+  // Register the target.
+  RegisterTargetMachine<SPUTargetMachine> X(TheCellSPUTarget);
+  RegisterAsmInfo<SPULinuxMCAsmInfo> Y(TheCellSPUTarget);
+}
+
+const std::pair<unsigned, int> *
+SPUFrameInfo::getCalleeSaveSpillSlots(unsigned &NumEntries) const {
+  NumEntries = 1;
+  return &LR[0];
+}
+
+SPUTargetMachine::SPUTargetMachine(const Target &T, const std::string &TT,
+                                   const std::string &FS)
+  : LLVMTargetMachine(T, TT),
+    Subtarget(TT, FS),
+    DataLayout(Subtarget.getTargetDataString()),
+    InstrInfo(*this),
+    FrameInfo(*this),
+    TLInfo(*this),
+    InstrItins(Subtarget.getInstrItineraryData()) {
+  // For the time being, use static relocations, since there's really no
+  // support for PIC yet.
+  setRelocationModel(Reloc::Static);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool SPUTargetMachine::addInstSelector(PassManagerBase &PM,
+                                       CodeGenOpt::Level OptLevel) {
+  // Install an instruction selector.
+  PM.add(createSPUISelDag(*this));
+  return false;
+}
diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h
new file mode 100644
index 0000000..9fdcfe9
--- /dev/null
+++ b/lib/Target/CellSPU/SPUTargetMachine.h
@@ -0,0 +1,83 @@
+//===-- SPUTargetMachine.h - Define TargetMachine for Cell SPU ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the CellSPU-specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPU_TARGETMACHINE_H
+#define SPU_TARGETMACHINE_H
+
+#include "SPUSubtarget.h"
+#include "SPUInstrInfo.h"
+#include "SPUISelLowering.h"
+#include "SPUFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+class PassManager;
+class GlobalValue;
+class TargetFrameInfo;
+
+/// SPUTargetMachine - LLVMTargetMachine subclass describing the STI CBEA
+/// Cell SPU target: its subtarget, data layout, instruction/frame/register
+/// info and lowering tables.
+class SPUTargetMachine : public LLVMTargetMachine {
+  SPUSubtarget        Subtarget;   // Must precede DataLayout (init order).
+  const TargetData    DataLayout;  // Derived from Subtarget's layout string.
+  SPUInstrInfo        InstrInfo;
+  SPUFrameInfo        FrameInfo;
+  SPUTargetLowering   TLInfo;
+  InstrItineraryData  InstrItins;
+public:
+  SPUTargetMachine(const Target &T, const std::string &TT,
+                   const std::string &FS);
+
+  /// Return the subtarget implementation object
+  virtual const SPUSubtarget     *getSubtargetImpl() const {
+    return &Subtarget;
+  }
+  virtual const SPUInstrInfo     *getInstrInfo() const {
+    return &InstrInfo;
+  }
+  virtual const SPUFrameInfo     *getFrameInfo() const {
+    return &FrameInfo;
+  }
+  /*!
+    \note Cell SPU does not support JIT today. It could support JIT at some
+    point.
+   */
+  virtual       TargetJITInfo    *getJITInfo() {
+    return NULL;
+  }
+
+  // const_cast: the base-class contract returns a non-const lowering object
+  // even from this const accessor; TLInfo is logically owned by *this.
+  virtual       SPUTargetLowering *getTargetLowering() const { 
+   return const_cast<SPUTargetLowering*>(&TLInfo); 
+  }
+
+  virtual const SPURegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  
+  virtual const TargetData *getTargetData() const {
+    return &DataLayout;
+  }
+
+  // NOTE(review): returns the itinerary data by value (const-qualified
+  // return) -- presumably cheap to copy; confirm against LLVMTargetMachine.
+  virtual const InstrItineraryData getInstrItineraryData() const {
+    return InstrItins;
+  }
+  
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(PassManagerBase &PM,
+                               CodeGenOpt::Level OptLevel);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/CellSPU/TargetInfo/CMakeLists.txt b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..928d0fe
--- /dev/null
+++ b/lib/Target/CellSPU/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+# Let this TargetInfo library find headers in the parent CellSPU directory
+# (both the source tree and the build tree, for generated files).
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMCellSPUInfo
+  CellSPUTargetInfo.cpp
+  )
+
+# The included SPU headers require the TableGen-erated .inc files to exist.
+add_dependencies(LLVMCellSPUInfo CellSPUCodeGenTable_gen)
diff --git a/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
new file mode 100644
index 0000000..049ea23
--- /dev/null
+++ b/lib/Target/CellSPU/TargetInfo/CellSPUTargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- CellSPUTargetInfo.cpp - CellSPU Target Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SPU.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+// The single global Target object representing the Cell SPU backend.
+Target llvm::TheCellSPUTarget;
+
+// Register the Cell SPU target under the name "cellspu" so tools can look it
+// up by name or via Triple::cellspu.
+extern "C" void LLVMInitializeCellSPUTargetInfo() { 
+  RegisterTarget<Triple::cellspu> 
+    X(TheCellSPUTarget, "cellspu", "STI CBEA Cell SPU [experimental]");
+}
diff --git a/lib/Target/CellSPU/TargetInfo/Makefile b/lib/Target/CellSPU/TargetInfo/Makefile
new file mode 100644
index 0000000..9cb6827
--- /dev/null
+++ b/lib/Target/CellSPU/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/CellSPU/TargetInfo/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCellSPUInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+# (the SPU headers live in lib/Target/CellSPU, one level up, in both the
+# object and the source trees).
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CppBackend/CMakeLists.txt b/lib/Target/CppBackend/CMakeLists.txt
new file mode 100644
index 0000000..f8182b8
--- /dev/null
+++ b/lib/Target/CppBackend/CMakeLists.txt
@@ -0,0 +1,3 @@
+# The C++ backend is a single-file LLVM target library.
+add_llvm_target(CppBackend
+  CPPBackend.cpp
+  )
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
new file mode 100644
index 0000000..3dd8ca7
--- /dev/null
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -0,0 +1,2016 @@
+//===-- CPPBackend.cpp - Library for converting LLVM code to C++ code -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the writing of the LLVM IR as a set of C++ calls to the
+// LLVM IR interface. The input module is assumed to be verified.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Config/config.h"
+#include <algorithm>
+#include <set>
+
+using namespace llvm;
+
+// -cppfname=<name>: the name given to the generated C++ function (used by
+// several of the -cppgen modes below).
+static cl::opt<std::string>
+FuncName("cppfname", cl::desc("Specify the name of the generated function"),
+         cl::value_desc("function name"));
+
+// WhatToGenerate - The output modes selectable via -cppgen (see the
+// GenerationType option below for the user-visible descriptions).
+enum WhatToGenerate {
+  GenProgram,    // A complete, self-contained program.
+  GenModule,     // A module definition.
+  GenContents,   // Code populating an existing module.
+  GenFunction,   // One function definition (selected by -cppfor).
+  GenFunctions,  // All function definitions.
+  GenInline,     // An inline function.
+  GenVariable,   // One global variable definition.
+  GenType        // One type definition.
+};
+
+// -cppgen=<mode>: selects which generator above drives the output; defaults
+// to emitting a complete program.
+static cl::opt<WhatToGenerate> GenerationType("cppgen", cl::Optional,
+  cl::desc("Choose what kind of output to generate"),
+  cl::init(GenProgram),
+  cl::values(
+    clEnumValN(GenProgram,  "program",   "Generate a complete program"),
+    clEnumValN(GenModule,   "module",    "Generate a module definition"),
+    clEnumValN(GenContents, "contents",  "Generate contents of a module"),
+    clEnumValN(GenFunction, "function",  "Generate a function definition"),
+    clEnumValN(GenFunctions,"functions", "Generate all function definitions"),
+    clEnumValN(GenInline,   "inline",    "Generate an inline function"),
+    clEnumValN(GenVariable, "variable",  "Generate a variable definition"),
+    clEnumValN(GenType,     "type",      "Generate a type definition"),
+    clEnumValEnd
+  )
+);
+
+// -cppfor=<name>: the entity the chosen -cppgen mode applies to.  The default
+// is a deliberately invalid identifier so a missing argument is detectable
+// rather than silently matching a real name.
+static cl::opt<std::string> NameToGenerate("cppfor", cl::Optional,
+  cl::desc("Specify the name of the thing to generate"),
+  cl::init("!bad!"));
+
+// Entry point invoked by the target registry: make the C++ backend's target
+// machine available to clients.
+extern "C" void LLVMInitializeCppBackendTarget() {
+  RegisterTargetMachine<CPPTargetMachine> Registrar(TheCppBackendTarget);
+}
+
+namespace {
+  // Shorthand container types used throughout CppWriter.
+  typedef std::vector<const Type*> TypeList;
+  typedef std::map<const Type*,std::string> TypeMap;
+  typedef std::map<const Value*,std::string> ValueMap;
+  typedef std::set<std::string> NameSet;
+  typedef std::set<const Type*> TypeSet;
+  typedef std::set<const Value*> ValueSet;
+  typedef std::map<const Value*,std::string> ForwardRefMap;
+
+  /// CppWriter - This class is the main chunk of code that converts an LLVM
+  /// module to a C++ translation unit.
+  class CppWriter : public ModulePass {
+    formatted_raw_ostream &Out;  // Stream receiving the generated C++ text.
+    const Module *TheModule;     // Module currently being translated.
+    uint64_t uniqueNum;          // Counter used to uniquify generated names.
+    TypeMap TypeNames;           // Memoized C++ names chosen for types.
+    ValueMap ValueNames;         // Memoized C++ names chosen for values.
+    TypeMap UnresolvedTypes;     // Types emitted as opaque forward refs.
+    TypeList TypeStack;          // Guards against recursive type printing.
+    NameSet UsedNames;           // Every name handed out so far.
+    TypeSet DefinedTypes;        // Types whose definitions were emitted.
+    ValueSet DefinedValues;      // Values whose definitions were emitted.
+    ForwardRefMap ForwardRefs;   // Values referenced before their definition.
+    bool is_inline;              // True while generating an inline function.
+
+  public:
+    static char ID;
+    explicit CppWriter(formatted_raw_ostream &o) :
+      ModulePass(&ID), Out(o), uniqueNum(0), is_inline(false) {}
+
+    virtual const char *getPassName() const { return "C++ backend"; }
+
+    bool runOnModule(Module &M);
+
+    // Top-level generators, one per -cppgen mode.
+    void printProgram(const std::string& fname, const std::string& modName );
+    void printModule(const std::string& fname, const std::string& modName );
+    void printContents(const std::string& fname, const std::string& modName );
+    void printFunction(const std::string& fname, const std::string& funcName );
+    void printFunctions();
+    void printInline(const std::string& fname, const std::string& funcName );
+    void printVariable(const std::string& fname, const std::string& varName );
+    void printType(const std::string& fname, const std::string& typeName );
+
+    // Report a fatal translation error.
+    void error(const std::string& msg);
+
+  private:
+    // Emitters for small syntactic fragments of the generated C++.
+    void printLinkageType(GlobalValue::LinkageTypes LT);
+    void printVisibilityType(GlobalValue::VisibilityTypes VisTypes);
+    void printCallingConv(CallingConv::ID cc);
+    void printEscapedString(const std::string& str);
+    void printCFP(const ConstantFP* CFP);
+
+    // Name selection/printing for types and values.
+    std::string getCppName(const Type* val);
+    inline void printCppName(const Type* val);
+
+    std::string getCppName(const Value* val);
+    inline void printCppName(const Value* val);
+
+    void printAttributes(const AttrListPtr &PAL, const std::string &name);
+    bool printTypeInternal(const Type* Ty);
+    inline void printType(const Type* Ty);
+    void printTypes(const Module* M);
+
+    void printConstant(const Constant *CPV);
+    void printConstants(const Module* M);
+
+    void printVariableUses(const GlobalVariable *GV);
+    void printVariableHead(const GlobalVariable *GV);
+    void printVariableBody(const GlobalVariable *GV);
+
+    void printFunctionUses(const Function *F);
+    void printFunctionHead(const Function *F);
+    void printFunctionBody(const Function *F);
+    void printInstruction(const Instruction *I, const std::string& bbname);
+    std::string getOpName(Value*);
+
+    void printModuleBody();
+  };
+
+  // Current indentation depth shared by nl(), in() and out() below.
+  static unsigned indent_level = 0;
+  // nl - Emit a newline, adjust the indent level by 'delta', then emit the
+  // new indentation.  A negative delta that would underflow the (unsigned)
+  // level is ignored rather than wrapping around.
+  inline formatted_raw_ostream& nl(formatted_raw_ostream& Out, int delta = 0) {
+    Out << "\n";
+    if (delta >= 0 || indent_level >= unsigned(-delta))
+      indent_level += delta;
+    for (unsigned i = 0; i < indent_level; ++i)
+      Out << "  ";
+    return Out;
+  }
+
+  // in/out - Manually raise/lower the indentation used by nl().  out()
+  // saturates at zero instead of wrapping the unsigned counter.
+  inline void in() { indent_level++; }
+  inline void out() { if (indent_level >0) indent_level--; }
+
+  /// sanitize - Turn 'str' into a valid C++ identifier fragment by replacing
+  /// every character that is not alphanumeric or '_' with '_'.
+  inline void
+  sanitize(std::string& str) {
+    for (size_t i = 0; i < str.length(); ++i)
+      // Cast to unsigned char: passing a negative plain char (e.g. a
+      // high-bit byte on signed-char platforms) to isalnum is undefined.
+      if (!isalnum(static_cast<unsigned char>(str[i])) && str[i] != '_')
+        str[i] = '_';
+  }
+
+  /// getTypePrefix - Return a short prefix describing Ty, used when
+  /// synthesizing C++ variable names (e.g. "int32_", "struct_", "ptr_").
+  /// The switch's default covers every unlisted TypeID, so the function
+  /// always returns from within the switch (the old trailing
+  /// 'return "unknown_";' was unreachable and has been removed).
+  inline std::string
+  getTypePrefix(const Type* Ty ) {
+    switch (Ty->getTypeID()) {
+    case Type::VoidTyID:     return "void_";
+    case Type::IntegerTyID:
+      // Encode the bit width so each iN integer type gets its own prefix.
+      return std::string("int") + utostr(cast<IntegerType>(Ty)->getBitWidth()) +
+        "_";
+    case Type::FloatTyID:    return "float_";
+    case Type::DoubleTyID:   return "double_";
+    case Type::LabelTyID:    return "label_";
+    case Type::FunctionTyID: return "func_";
+    case Type::StructTyID:   return "struct_";
+    case Type::ArrayTyID:    return "array_";
+    case Type::PointerTyID:  return "ptr_";
+    case Type::VectorTyID:   return "packed_";
+    case Type::OpaqueTyID:   return "opaque_";
+    default:                 return "other_";
+    }
+  }
+
+  // findTypeName - Scan the type symbol table ST for an entry that maps to
+  // Ty and return a pointer to its name, or a null pointer when Ty has no
+  // entry.  Note this differs from Module::getTypeName, which reports
+  // absence with an empty string rather than a null pointer.
+  inline const std::string*
+  findTypeName(const TypeSymbolTable& ST, const Type* Ty) {
+    for (TypeSymbolTable::const_iterator I = ST.begin(), E = ST.end();
+         I != E; ++I) {
+      if (I->second == Ty)
+        return &I->first;
+    }
+    return 0;
+  }
+
+  // error - Report a fatal translation error through the LLVM error handler;
+  // llvm_report_error does not return.
+  void CppWriter::error(const std::string& msg) {
+    llvm_report_error(msg);
+  }
+
+  // printCFP - Print a floating point constant .. very carefully :)
+  // This makes sure that conversion to/from floating yields the same binary
+  // result so that we don't lose precision.
+  void CppWriter::printCFP(const ConstantFP *CFP) {
+    bool ignored;
+    APFloat APF = APFloat(CFP->getValueAPF());  // copy
+    // Widen float constants to double so the textual round-trip checks
+    // below (%A / atof) operate on a common representation.
+    if (CFP->getType() == Type::getFloatTy(CFP->getContext()))
+      APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &ignored);
+    Out << "ConstantFP::get(getGlobalContext(), ";
+    Out << "APFloat(";
+#if HAVE_PRINTF_A
+    // Preferred path: hexadecimal float output, accepted only when parsing
+    // it back yields bit-for-bit the same value.
+    char Buffer[100];
+    sprintf(Buffer, "%A", APF.convertToDouble());
+    if ((!strncmp(Buffer, "0x", 2) ||
+         !strncmp(Buffer, "-0x", 3) ||
+         !strncmp(Buffer, "+0x", 3)) &&
+        APF.bitwiseIsEqual(APFloat(atof(Buffer)))) {
+      if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+        Out << "BitsToDouble(" << Buffer << ")";
+      else
+        Out << "BitsToFloat((float)" << Buffer << ")";
+      Out << ")";
+    } else {
+#endif
+      std::string StrVal = ftostr(CFP->getValueAPF());
+
+      while (StrVal[0] == ' ')
+        StrVal.erase(StrVal.begin());
+
+      // Check to make sure that the stringized number is not some string like
+      // "Inf" or NaN.  Check that the string matches the "[-+]?[0-9]" regex.
+      if (((StrVal[0] >= '0' && StrVal[0] <= '9') ||
+           ((StrVal[0] == '-' || StrVal[0] == '+') &&
+            (StrVal[1] >= '0' && StrVal[1] <= '9'))) &&
+          (CFP->isExactlyValue(atof(StrVal.c_str())))) {
+        // The decimal text reproduces the value exactly; emit it directly.
+        if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+          Out <<  StrVal;
+        else
+          Out << StrVal << "f";
+      } else if (CFP->getType() == Type::getDoubleTy(CFP->getContext()))
+        // Otherwise fall back to the raw bit pattern, keeping the decimal
+        // form as a comment for readability of the generated code.
+        Out << "BitsToDouble(0x"
+            << utohexstr(CFP->getValueAPF().bitcastToAPInt().getZExtValue())
+            << "ULL) /* " << StrVal << " */";
+      else
+        Out << "BitsToFloat(0x"
+            << utohexstr((uint32_t)CFP->getValueAPF().
+                                        bitcastToAPInt().getZExtValue())
+            << "U) /* " << StrVal << " */";
+      Out << ")";
+#if HAVE_PRINTF_A
+    }
+#endif
+    Out << ")";
+  }
+
+  // printCallingConv - Emit the C++ expression naming calling convention
+  // 'cc'; conventions without a named case are emitted as their numeric ID,
+  // which is equally valid in the generated code.
+  void CppWriter::printCallingConv(CallingConv::ID cc){
+    // Print the calling convention.
+    switch (cc) {
+    case CallingConv::C:     Out << "CallingConv::C"; break;
+    case CallingConv::Fast:  Out << "CallingConv::Fast"; break;
+    case CallingConv::Cold:  Out << "CallingConv::Cold"; break;
+    case CallingConv::FirstTargetCC: Out << "CallingConv::FirstTargetCC"; break;
+    default:                 Out << cc; break;
+    }
+  }
+
+  // printLinkageType - Emit the GlobalValue::LinkageTypes enumerator
+  // corresponding to LT.  There is deliberately no default case so that a
+  // newly added linkage kind produces a compiler warning here.
+  // Fix: the AvailableExternally/LinkOnceAny/LinkOnceODR strings carried a
+  // stray trailing space, inconsistent with every other case; trimmed.
+  void CppWriter::printLinkageType(GlobalValue::LinkageTypes LT) {
+    switch (LT) {
+    case GlobalValue::InternalLinkage:
+      Out << "GlobalValue::InternalLinkage"; break;
+    case GlobalValue::PrivateLinkage:
+      Out << "GlobalValue::PrivateLinkage"; break;
+    case GlobalValue::LinkerPrivateLinkage:
+      Out << "GlobalValue::LinkerPrivateLinkage"; break;
+    case GlobalValue::AvailableExternallyLinkage:
+      Out << "GlobalValue::AvailableExternallyLinkage"; break;
+    case GlobalValue::LinkOnceAnyLinkage:
+      Out << "GlobalValue::LinkOnceAnyLinkage"; break;
+    case GlobalValue::LinkOnceODRLinkage:
+      Out << "GlobalValue::LinkOnceODRLinkage"; break;
+    case GlobalValue::WeakAnyLinkage:
+      Out << "GlobalValue::WeakAnyLinkage"; break;
+    case GlobalValue::WeakODRLinkage:
+      Out << "GlobalValue::WeakODRLinkage"; break;
+    case GlobalValue::AppendingLinkage:
+      Out << "GlobalValue::AppendingLinkage"; break;
+    case GlobalValue::ExternalLinkage:
+      Out << "GlobalValue::ExternalLinkage"; break;
+    case GlobalValue::DLLImportLinkage:
+      Out << "GlobalValue::DLLImportLinkage"; break;
+    case GlobalValue::DLLExportLinkage:
+      Out << "GlobalValue::DLLExportLinkage"; break;
+    case GlobalValue::ExternalWeakLinkage:
+      Out << "GlobalValue::ExternalWeakLinkage"; break;
+    case GlobalValue::CommonLinkage:
+      Out << "GlobalValue::CommonLinkage"; break;
+    }
+  }
+
+  // printVisibilityType - Emit the GlobalValue::VisibilityTypes enumerator
+  // corresponding to VisType; any unknown value is a hard error.
+  void CppWriter::printVisibilityType(GlobalValue::VisibilityTypes VisType) {
+    switch (VisType) {
+    default: llvm_unreachable("Unknown GVar visibility");
+    case GlobalValue::DefaultVisibility:
+      Out << "GlobalValue::DefaultVisibility";
+      break;
+    case GlobalValue::HiddenVisibility:
+      Out << "GlobalValue::HiddenVisibility";
+      break;
+    case GlobalValue::ProtectedVisibility:
+      Out << "GlobalValue::ProtectedVisibility";
+      break;
+    }
+  }
+
+  // printEscapedString - Print each character of the specified string,
+  // escaping it if it is not printable or if it is an escape char.
+  // Fix: the previous "\xNN" form was unsafe -- C++ hex escapes consume an
+  // unlimited number of hex digits, so a printable hex digit following the
+  // escape was absorbed into it, corrupting the emitted literal.  Octal
+  // escapes stop after three digits, so a fixed-width three-digit octal
+  // escape can never swallow the next character.
+  void CppWriter::printEscapedString(const std::string &Str) {
+    for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+      unsigned char C = Str[i];
+      if (isprint(C) && C != '"' && C != '\\') {
+        Out << C;
+      } else {
+        Out << '\\'
+            << (char)('0' + ((C >> 6) & 7))
+            << (char)('0' + ((C >> 3) & 7))
+            << (char)('0' + (C & 7));
+      }
+    }
+  }
+
+  // getCppName - Return (and memoize) the C++ expression or identifier used
+  // to refer to type Ty in the generated code.  Primitive and integer types
+  // map directly to LLVM API calls; derived types are assigned a prefixed,
+  // sanitized, unique identifier (reusing the module's type name if any).
+  std::string CppWriter::getCppName(const Type* Ty) {
+    // First, handle the primitive types .. easy
+    if (Ty->isPrimitiveType() || Ty->isInteger()) {
+      switch (Ty->getTypeID()) {
+      case Type::VoidTyID:   return "Type::getVoidTy(getGlobalContext())";
+      case Type::IntegerTyID: {
+        unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+        return "IntegerType::get(getGlobalContext(), " + utostr(BitWidth) + ")";
+      }
+      case Type::X86_FP80TyID: return "Type::getX86_FP80Ty(getGlobalContext())";
+      case Type::FloatTyID:    return "Type::getFloatTy(getGlobalContext())";
+      case Type::DoubleTyID:   return "Type::getDoubleTy(getGlobalContext())";
+      case Type::LabelTyID:    return "Type::getLabelTy(getGlobalContext())";
+      default:
+        error("Invalid primitive type");
+        break;
+      }
+      // shouldn't be returned, but make it sensible
+      return "Type::getVoidTy(getGlobalContext())";
+    }
+
+    // Now, see if we've seen the type before and return that
+    TypeMap::iterator I = TypeNames.find(Ty);
+    if (I != TypeNames.end())
+      return I->second;
+
+    // Okay, let's build a new name for this type. Start with a prefix
+    const char* prefix = 0;
+    switch (Ty->getTypeID()) {
+    case Type::FunctionTyID:    prefix = "FuncTy_"; break;
+    case Type::StructTyID:      prefix = "StructTy_"; break;
+    case Type::ArrayTyID:       prefix = "ArrayTy_"; break;
+    case Type::PointerTyID:     prefix = "PointerTy_"; break;
+    case Type::OpaqueTyID:      prefix = "OpaqueTy_"; break;
+    case Type::VectorTyID:      prefix = "VectorTy_"; break;
+    default:                    prefix = "OtherTy_"; break; // prevent breakage
+    }
+
+    // See if the type has a name in the symboltable and build accordingly
+    const std::string* tName = findTypeName(TheModule->getTypeSymbolTable(), Ty);
+    std::string name;
+    if (tName)
+      name = std::string(prefix) + *tName;
+    else
+      name = std::string(prefix) + utostr(uniqueNum++);
+    sanitize(name);
+
+    // Save the name
+    return TypeNames[Ty] = name;
+  }
+
+  // printCppName - Emit (escaped) the C++ name chosen for type Ty.
+  void CppWriter::printCppName(const Type* Ty) {
+    std::string Name = getCppName(Ty);
+    printEscapedString(Name);
+  }
+
+  // getCppName - Return (and memoize) a unique, sanitized C++ identifier for
+  // 'val'.  The prefix encodes the kind of value (gvar_/func_/const_/arg_ or
+  // a type-derived prefix); the suffix is the value's own name when it has
+  // one, otherwise a fresh counter value, uniquified against UsedNames.
+  std::string CppWriter::getCppName(const Value* val) {
+    std::string name;
+    ValueMap::iterator I = ValueNames.find(val);
+    if (I != ValueNames.end() && I->first == val)
+      return  I->second;
+
+    if (const GlobalVariable* GV = dyn_cast<GlobalVariable>(val)) {
+      name = std::string("gvar_") +
+        getTypePrefix(GV->getType()->getElementType());
+    } else if (isa<Function>(val)) {
+      name = std::string("func_");
+    } else if (const Constant* C = dyn_cast<Constant>(val)) {
+      name = std::string("const_") + getTypePrefix(C->getType());
+    } else if (const Argument* Arg = dyn_cast<Argument>(val)) {
+      if (is_inline) {
+        // Inline mode: name arguments positionally (arg_1, arg_2, ...) so
+        // the generated inline function has stable parameter names.
+        unsigned argNum = std::distance(Arg->getParent()->arg_begin(),
+                                        Function::const_arg_iterator(Arg)) + 1;
+        name = std::string("arg_") + utostr(argNum);
+        NameSet::iterator NI = UsedNames.find(name);
+        if (NI != UsedNames.end())
+          name += std::string("_") + utostr(uniqueNum++);
+        UsedNames.insert(name);
+        return ValueNames[val] = name;
+      } else {
+        name = getTypePrefix(val->getType());
+      }
+    } else {
+      name = getTypePrefix(val->getType());
+    }
+    // Append the value's own name if it has one, else a unique number.
+    if (val->hasName())
+      name += val->getName();
+    else
+      name += utostr(uniqueNum++);
+    sanitize(name);
+    NameSet::iterator NI = UsedNames.find(name);
+    if (NI != UsedNames.end())
+      name += std::string("_") + utostr(uniqueNum++);
+    UsedNames.insert(name);
+    return ValueNames[val] = name;
+  }
+
+  // printCppName - Emit (escaped) the C++ name chosen for value 'val'.
+  void CppWriter::printCppName(const Value* val) {
+    std::string Name = getCppName(val);
+    printEscapedString(Name);
+  }
+
+  // printAttributes - Emit C++ code that rebuilds attribute list PAL into a
+  // variable named "<name>_PAL".  Each known attribute bit is OR'd in via
+  // the HANDLE_ATTR macro, which also clears the bit so the final assert
+  // catches any attribute this writer doesn't know about.
+  void CppWriter::printAttributes(const AttrListPtr &PAL,
+                                  const std::string &name) {
+    Out << "AttrListPtr " << name << "_PAL;";
+    nl(Out);
+    if (!PAL.isEmpty()) {
+      Out << '{'; in(); nl(Out);
+      Out << "SmallVector<AttributeWithIndex, 4> Attrs;"; nl(Out);
+      Out << "AttributeWithIndex PAWI;"; nl(Out);
+      for (unsigned i = 0; i < PAL.getNumSlots(); ++i) {
+        unsigned index = PAL.getSlot(i).Index;
+        Attributes attrs = PAL.getSlot(i).Attrs;
+        Out << "PAWI.Index = " << index << "U; PAWI.Attrs = 0 ";
+#define HANDLE_ATTR(X)                 \
+        if (attrs & Attribute::X)      \
+          Out << " | Attribute::" #X;  \
+        attrs &= ~Attribute::X;
+        
+        HANDLE_ATTR(SExt);
+        HANDLE_ATTR(ZExt);
+        HANDLE_ATTR(NoReturn);
+        HANDLE_ATTR(InReg);
+        HANDLE_ATTR(StructRet);
+        HANDLE_ATTR(NoUnwind);
+        HANDLE_ATTR(NoAlias);
+        HANDLE_ATTR(ByVal);
+        HANDLE_ATTR(Nest);
+        HANDLE_ATTR(ReadNone);
+        HANDLE_ATTR(ReadOnly);
+        HANDLE_ATTR(InlineHint);
+        HANDLE_ATTR(NoInline);
+        HANDLE_ATTR(AlwaysInline);
+        HANDLE_ATTR(OptimizeForSize);
+        HANDLE_ATTR(StackProtect);
+        HANDLE_ATTR(StackProtectReq);
+        HANDLE_ATTR(NoCapture);
+#undef HANDLE_ATTR
+        assert(attrs == 0 && "Unhandled attribute!");
+        Out << ";";
+        nl(Out);
+        Out << "Attrs.push_back(PAWI);";
+        nl(Out);
+      }
+      Out << name << "_PAL = AttrListPtr::get(Attrs.begin(), Attrs.end());";
+      nl(Out);
+      out(); nl(Out);
+      Out << '}'; nl(Out);
+    }
+  }
+
+  // printTypeInternal - Emit the C++ code that constructs type Ty (and,
+  // recursively, the types it contains).  Returns true iff Ty was found on
+  // the TypeStack -- i.e. it participates in a recursive definition and was
+  // emitted as an opaque forward reference named "<name>_fwd", which the
+  // caller must use and which is refined once the full definition is out.
+  bool CppWriter::printTypeInternal(const Type* Ty) {
+    // We don't print definitions for primitive types
+    if (Ty->isPrimitiveType() || Ty->isInteger())
+      return false;
+
+    // If we already defined this type, we don't need to define it again.
+    if (DefinedTypes.find(Ty) != DefinedTypes.end())
+      return false;
+
+    // Everything below needs the name for the type so get it now.
+    std::string typeName(getCppName(Ty));
+
+    // Search the type stack for recursion. If we find it, then generate this
+    // as an OpaqueType, but make sure not to do this multiple times because
+    // the type could appear in multiple places on the stack. Once the opaque
+    // definition is issued, it must not be re-issued. Consequently we have to
+    // check the UnresolvedTypes list as well.
+    TypeList::const_iterator TI = std::find(TypeStack.begin(), TypeStack.end(),
+                                            Ty);
+    if (TI != TypeStack.end()) {
+      TypeMap::const_iterator I = UnresolvedTypes.find(Ty);
+      if (I == UnresolvedTypes.end()) {
+        Out << "PATypeHolder " << typeName;
+        Out << "_fwd = OpaqueType::get(getGlobalContext());";
+        nl(Out);
+        UnresolvedTypes[Ty] = typeName;
+      }
+      return true;
+    }
+
+    // We're going to print a derived type which, by definition, contains other
+    // types. So, push this one we're printing onto the type stack to assist with
+    // recursive definitions.
+    TypeStack.push_back(Ty);
+
+    // Print the type definition
+    switch (Ty->getTypeID()) {
+    case Type::FunctionTyID:  {
+      const FunctionType* FT = cast<FunctionType>(Ty);
+      Out << "std::vector<const Type*>" << typeName << "_args;";
+      nl(Out);
+      FunctionType::param_iterator PI = FT->param_begin();
+      FunctionType::param_iterator PE = FT->param_end();
+      for (; PI != PE; ++PI) {
+        const Type* argTy = static_cast<const Type*>(*PI);
+        bool isForward = printTypeInternal(argTy);
+        std::string argName(getCppName(argTy));
+        Out << typeName << "_args.push_back(" << argName;
+        if (isForward)
+          Out << "_fwd";
+        Out << ");";
+        nl(Out);
+      }
+      bool isForward = printTypeInternal(FT->getReturnType());
+      std::string retTypeName(getCppName(FT->getReturnType()));
+      Out << "FunctionType* " << typeName << " = FunctionType::get(";
+      in(); nl(Out) << "/*Result=*/" << retTypeName;
+      if (isForward)
+        Out << "_fwd";
+      Out << ",";
+      nl(Out) << "/*Params=*/" << typeName << "_args,";
+      nl(Out) << "/*isVarArg=*/" << (FT->isVarArg() ? "true" : "false") << ");";
+      out();
+      nl(Out);
+      break;
+    }
+    case Type::StructTyID: {
+      const StructType* ST = cast<StructType>(Ty);
+      Out << "std::vector<const Type*>" << typeName << "_fields;";
+      nl(Out);
+      StructType::element_iterator EI = ST->element_begin();
+      StructType::element_iterator EE = ST->element_end();
+      for (; EI != EE; ++EI) {
+        const Type* fieldTy = static_cast<const Type*>(*EI);
+        bool isForward = printTypeInternal(fieldTy);
+        std::string fieldName(getCppName(fieldTy));
+        Out << typeName << "_fields.push_back(" << fieldName;
+        if (isForward)
+          Out << "_fwd";
+        Out << ");";
+        nl(Out);
+      }
+      Out << "StructType* " << typeName << " = StructType::get("
+          << "mod->getContext(), "
+          << typeName << "_fields, /*isPacked=*/"
+          << (ST->isPacked() ? "true" : "false") << ");";
+      nl(Out);
+      break;
+    }
+    case Type::ArrayTyID: {
+      const ArrayType* AT = cast<ArrayType>(Ty);
+      const Type* ET = AT->getElementType();
+      bool isForward = printTypeInternal(ET);
+      std::string elemName(getCppName(ET));
+      Out << "ArrayType* " << typeName << " = ArrayType::get("
+          << elemName << (isForward ? "_fwd" : "")
+          << ", " << utostr(AT->getNumElements()) << ");";
+      nl(Out);
+      break;
+    }
+    case Type::PointerTyID: {
+      const PointerType* PT = cast<PointerType>(Ty);
+      const Type* ET = PT->getElementType();
+      bool isForward = printTypeInternal(ET);
+      std::string elemName(getCppName(ET));
+      Out << "PointerType* " << typeName << " = PointerType::get("
+          << elemName << (isForward ? "_fwd" : "")
+          << ", " << utostr(PT->getAddressSpace()) << ");";
+      nl(Out);
+      break;
+    }
+    case Type::VectorTyID: {
+      const VectorType* PT = cast<VectorType>(Ty);
+      const Type* ET = PT->getElementType();
+      bool isForward = printTypeInternal(ET);
+      std::string elemName(getCppName(ET));
+      Out << "VectorType* " << typeName << " = VectorType::get("
+          << elemName << (isForward ? "_fwd" : "")
+          << ", " << utostr(PT->getNumElements()) << ");";
+      nl(Out);
+      break;
+    }
+    case Type::OpaqueTyID: {
+      Out << "OpaqueType* " << typeName;
+      Out << " = OpaqueType::get(getGlobalContext());";
+      nl(Out);
+      break;
+    }
+    default:
+      error("Invalid TypeID");
+    }
+
+    // If the type had a name, make sure we recreate it.
+    const std::string* progTypeName =
+      findTypeName(TheModule->getTypeSymbolTable(),Ty);
+    if (progTypeName) {
+      Out << "mod->addTypeName(\"" << *progTypeName << "\", "
+          << typeName << ");";
+      nl(Out);
+    }
+
+    // Pop us off the type stack
+    TypeStack.pop_back();
+
+    // Indicate that this type is now defined.
+    DefinedTypes.insert(Ty);
+
+    // Early resolve as many unresolved types as possible. Search the unresolved
+    // types map for the type we just printed. Now that its definition is complete
+    // we can resolve any previous references to it. This prevents a cascade of
+    // unresolved types.
+    TypeMap::iterator I = UnresolvedTypes.find(Ty);
+    if (I != UnresolvedTypes.end()) {
+      // Refine the opaque placeholder to the real type, then rebind the
+      // generated variable to the now-concrete type object.
+      Out << "cast<OpaqueType>(" << I->second
+          << "_fwd.get())->refineAbstractTypeTo(" << I->second << ");";
+      nl(Out);
+      Out << I->second << " = cast<";
+      switch (Ty->getTypeID()) {
+      case Type::FunctionTyID: Out << "FunctionType"; break;
+      case Type::ArrayTyID:    Out << "ArrayType"; break;
+      case Type::StructTyID:   Out << "StructType"; break;
+      case Type::VectorTyID:   Out << "VectorType"; break;
+      case Type::PointerTyID:  Out << "PointerType"; break;
+      case Type::OpaqueTyID:   Out << "OpaqueType"; break;
+      default:                 Out << "NoSuchDerivedType"; break;
+      }
+      Out << ">(" << I->second << "_fwd.get());";
+      nl(Out); nl(Out);
+      UnresolvedTypes.erase(I);
+    }
+
+    // Finally, separate the type definition from other with a newline.
+    nl(Out);
+
+    // We weren't a recursive type
+    return false;
+  }
+
+  // printType - Print a type definition at top level.  (Unlike
+  // printTypeInternal, this returns nothing: any forward references created
+  // during printing are fully resolved before printTypeInternal returns to
+  // this frame, so the type stack must be empty on entry and on exit.)
+  void CppWriter::printType(const Type* Ty) {
+    assert(TypeStack.empty());
+    // Defensive reset for release builds, where the assert compiles away.
+    TypeStack.clear();
+    printTypeInternal(Ty);
+    assert(TypeStack.empty());
+  }
+
+  // printTypes - Emit definitions (or addTypeName calls) for every type used
+  // by module M: named symbol-table types, the types of global variables and
+  // their initializers, and every type appearing in function signatures,
+  // arguments, basic blocks, instructions and instruction operands.
+  void CppWriter::printTypes(const Module* M) {
+    // Walk the symbol table and print out all its types
+    const TypeSymbolTable& symtab = M->getTypeSymbolTable();
+    for (TypeSymbolTable::const_iterator TI = symtab.begin(), TE = symtab.end();
+         TI != TE; ++TI) {
+
+      // For primitive types and types already defined, just add a name
+      TypeMap::const_iterator TNI = TypeNames.find(TI->second);
+      if (TI->second->isInteger() || TI->second->isPrimitiveType() ||
+          TNI != TypeNames.end()) {
+        Out << "mod->addTypeName(\"";
+        printEscapedString(TI->first);
+        Out << "\", " << getCppName(TI->second) << ");";
+        nl(Out);
+        // For everything else, define the type
+      } else {
+        printType(TI->second);
+      }
+    }
+
+    // Add all of the global variables to the value table...
+    for (Module::const_global_iterator I = TheModule->global_begin(),
+           E = TheModule->global_end(); I != E; ++I) {
+      if (I->hasInitializer())
+        printType(I->getInitializer()->getType());
+      printType(I->getType());
+    }
+
+    // Add all the functions to the table
+    for (Module::const_iterator FI = TheModule->begin(), FE = TheModule->end();
+         FI != FE; ++FI) {
+      printType(FI->getReturnType());
+      printType(FI->getFunctionType());
+      // Add all the function arguments
+      for (Function::const_arg_iterator AI = FI->arg_begin(),
+             AE = FI->arg_end(); AI != AE; ++AI) {
+        printType(AI->getType());
+      }
+
+      // Add all of the basic blocks and instructions
+      for (Function::const_iterator BB = FI->begin(),
+             E = FI->end(); BB != E; ++BB) {
+        printType(BB->getType());
+        for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+             ++I) {
+          printType(I->getType());
+          for (unsigned i = 0; i < I->getNumOperands(); ++i)
+            printType(I->getOperand(i)->getType());
+        }
+      }
+    }
+  }
+
+
+  // printConstant - Print out a constant pool entry as C++ code that
+  // reconstructs it. Aggregates and constant expressions recurse to print
+  // their operands first. GlobalValues and already-printed constants are
+  // skipped. Fix: removed an unreachable second isa<GlobalValue> check that
+  // followed the early return below.
+  void CppWriter::printConstant(const Constant *CV) {
+    // First, if the constant is actually a GlobalValue (variable or function)
+    // or it's already in the value table, then we've printed it already and
+    // we can just return. Globals are emitted by the variable/function
+    // printers, never here.
+    if (isa<GlobalValue>(CV) || ValueNames.find(CV) != ValueNames.end())
+      return;
+
+    std::string constName(getCppName(CV));
+    std::string typeName(getCppName(CV->getType()));
+
+    if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+      // Emit the value as a decimal string so arbitrary bit widths survive.
+      std::string constValue = CI->getValue().toString(10, true);
+      Out << "ConstantInt* " << constName
+          << " = ConstantInt::get(getGlobalContext(), APInt("
+          << cast<IntegerType>(CI->getType())->getBitWidth()
+          << ", StringRef(\"" <<  constValue << "\"), 10));";
+    } else if (isa<ConstantAggregateZero>(CV)) {
+      Out << "ConstantAggregateZero* " << constName
+          << " = ConstantAggregateZero::get(" << typeName << ");";
+    } else if (isa<ConstantPointerNull>(CV)) {
+      Out << "ConstantPointerNull* " << constName
+          << " = ConstantPointerNull::get(" << typeName << ");";
+    } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV)) {
+      Out << "ConstantFP* " << constName << " = ";
+      printCFP(CFP);
+      Out << ";";
+    } else if (const ConstantArray *CA = dyn_cast<ConstantArray>(CV)) {
+      // i8 string arrays get the compact ConstantArray::get(ctx, str) form.
+      if (CA->isString() &&
+          CA->getType()->getElementType() ==
+              Type::getInt8Ty(CA->getContext())) {
+        Out << "Constant* " << constName <<
+               " = ConstantArray::get(getGlobalContext(), \"";
+        std::string tmp = CA->getAsString();
+        bool nullTerminate = false;
+        if (tmp[tmp.length()-1] == 0) {
+          tmp.erase(tmp.length()-1);
+          nullTerminate = true;
+        }
+        printEscapedString(tmp);
+        // Determine if we want null termination or not.
+        if (nullTerminate)
+          Out << "\", true"; // Indicate that the null terminator should be
+                             // added.
+        else
+          Out << "\", false";// No null terminator
+        Out << ");";
+      } else {
+        Out << "std::vector<Constant*> " << constName << "_elems;";
+        nl(Out);
+        unsigned N = CA->getNumOperands();
+        for (unsigned i = 0; i < N; ++i) {
+          printConstant(CA->getOperand(i)); // recurse to print operands
+          Out << constName << "_elems.push_back("
+              << getCppName(CA->getOperand(i)) << ");";
+          nl(Out);
+        }
+        Out << "Constant* " << constName << " = ConstantArray::get("
+            << typeName << ", " << constName << "_elems);";
+      }
+    } else if (const ConstantStruct *CS = dyn_cast<ConstantStruct>(CV)) {
+      Out << "std::vector<Constant*> " << constName << "_fields;";
+      nl(Out);
+      unsigned N = CS->getNumOperands();
+      for (unsigned i = 0; i < N; i++) {
+        printConstant(CS->getOperand(i)); // recurse to print operands
+        Out << constName << "_fields.push_back("
+            << getCppName(CS->getOperand(i)) << ");";
+        nl(Out);
+      }
+      Out << "Constant* " << constName << " = ConstantStruct::get("
+          << typeName << ", " << constName << "_fields);";
+    } else if (const ConstantVector *CP = dyn_cast<ConstantVector>(CV)) {
+      Out << "std::vector<Constant*> " << constName << "_elems;";
+      nl(Out);
+      unsigned N = CP->getNumOperands();
+      for (unsigned i = 0; i < N; ++i) {
+        printConstant(CP->getOperand(i)); // recurse to print operands
+        Out << constName << "_elems.push_back("
+            << getCppName(CP->getOperand(i)) << ");";
+        nl(Out);
+      }
+      Out << "Constant* " << constName << " = ConstantVector::get("
+          << typeName << ", " << constName << "_elems);";
+    } else if (isa<UndefValue>(CV)) {
+      Out << "UndefValue* " << constName << " = UndefValue::get("
+          << typeName << ");";
+    } else if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+      if (CE->getOpcode() == Instruction::GetElementPtr) {
+        // NOTE(review): &indices[0] is undefined if the GEP has no index
+        // operands; such a degenerate constant GEP should not occur.
+        Out << "std::vector<Constant*> " << constName << "_indices;";
+        nl(Out);
+        printConstant(CE->getOperand(0));
+        for (unsigned i = 1; i < CE->getNumOperands(); ++i ) {
+          printConstant(CE->getOperand(i));
+          Out << constName << "_indices.push_back("
+              << getCppName(CE->getOperand(i)) << ");";
+          nl(Out);
+        }
+        Out << "Constant* " << constName
+            << " = ConstantExpr::getGetElementPtr("
+            << getCppName(CE->getOperand(0)) << ", "
+            << "&" << constName << "_indices[0], "
+            << constName << "_indices.size()"
+            << ");";
+      } else if (CE->isCast()) {
+        printConstant(CE->getOperand(0));
+        Out << "Constant* " << constName << " = ConstantExpr::getCast(";
+        switch (CE->getOpcode()) {
+        default: llvm_unreachable("Invalid cast opcode");
+        case Instruction::Trunc: Out << "Instruction::Trunc"; break;
+        case Instruction::ZExt:  Out << "Instruction::ZExt"; break;
+        case Instruction::SExt:  Out << "Instruction::SExt"; break;
+        case Instruction::FPTrunc:  Out << "Instruction::FPTrunc"; break;
+        case Instruction::FPExt:  Out << "Instruction::FPExt"; break;
+        case Instruction::FPToUI:  Out << "Instruction::FPToUI"; break;
+        case Instruction::FPToSI:  Out << "Instruction::FPToSI"; break;
+        case Instruction::UIToFP:  Out << "Instruction::UIToFP"; break;
+        case Instruction::SIToFP:  Out << "Instruction::SIToFP"; break;
+        case Instruction::PtrToInt:  Out << "Instruction::PtrToInt"; break;
+        case Instruction::IntToPtr:  Out << "Instruction::IntToPtr"; break;
+        case Instruction::BitCast:  Out << "Instruction::BitCast"; break;
+        }
+        Out << ", " << getCppName(CE->getOperand(0)) << ", "
+            << getCppName(CE->getType()) << ");";
+      } else {
+        // Generic constant expression: print operands, then dispatch on the
+        // opcode to pick the ConstantExpr factory function.
+        unsigned N = CE->getNumOperands();
+        for (unsigned i = 0; i < N; ++i ) {
+          printConstant(CE->getOperand(i));
+        }
+        Out << "Constant* " << constName << " = ConstantExpr::";
+        switch (CE->getOpcode()) {
+        case Instruction::Add:    Out << "getAdd(";  break;
+        case Instruction::FAdd:   Out << "getFAdd(";  break;
+        case Instruction::Sub:    Out << "getSub("; break;
+        case Instruction::FSub:   Out << "getFSub("; break;
+        case Instruction::Mul:    Out << "getMul("; break;
+        case Instruction::FMul:   Out << "getFMul("; break;
+        case Instruction::UDiv:   Out << "getUDiv("; break;
+        case Instruction::SDiv:   Out << "getSDiv("; break;
+        case Instruction::FDiv:   Out << "getFDiv("; break;
+        case Instruction::URem:   Out << "getURem("; break;
+        case Instruction::SRem:   Out << "getSRem("; break;
+        case Instruction::FRem:   Out << "getFRem("; break;
+        case Instruction::And:    Out << "getAnd("; break;
+        case Instruction::Or:     Out << "getOr("; break;
+        case Instruction::Xor:    Out << "getXor("; break;
+        case Instruction::ICmp:
+          Out << "getICmp(ICmpInst::ICMP_";
+          switch (CE->getPredicate()) {
+          case ICmpInst::ICMP_EQ:  Out << "EQ"; break;
+          case ICmpInst::ICMP_NE:  Out << "NE"; break;
+          case ICmpInst::ICMP_SLT: Out << "SLT"; break;
+          case ICmpInst::ICMP_ULT: Out << "ULT"; break;
+          case ICmpInst::ICMP_SGT: Out << "SGT"; break;
+          case ICmpInst::ICMP_UGT: Out << "UGT"; break;
+          case ICmpInst::ICMP_SLE: Out << "SLE"; break;
+          case ICmpInst::ICMP_ULE: Out << "ULE"; break;
+          case ICmpInst::ICMP_SGE: Out << "SGE"; break;
+          case ICmpInst::ICMP_UGE: Out << "UGE"; break;
+          default: error("Invalid ICmp Predicate");
+          }
+          break;
+        case Instruction::FCmp:
+          Out << "getFCmp(FCmpInst::FCMP_";
+          switch (CE->getPredicate()) {
+          case FCmpInst::FCMP_FALSE: Out << "FALSE"; break;
+          case FCmpInst::FCMP_ORD:   Out << "ORD"; break;
+          case FCmpInst::FCMP_UNO:   Out << "UNO"; break;
+          case FCmpInst::FCMP_OEQ:   Out << "OEQ"; break;
+          case FCmpInst::FCMP_UEQ:   Out << "UEQ"; break;
+          case FCmpInst::FCMP_ONE:   Out << "ONE"; break;
+          case FCmpInst::FCMP_UNE:   Out << "UNE"; break;
+          case FCmpInst::FCMP_OLT:   Out << "OLT"; break;
+          case FCmpInst::FCMP_ULT:   Out << "ULT"; break;
+          case FCmpInst::FCMP_OGT:   Out << "OGT"; break;
+          case FCmpInst::FCMP_UGT:   Out << "UGT"; break;
+          case FCmpInst::FCMP_OLE:   Out << "OLE"; break;
+          case FCmpInst::FCMP_ULE:   Out << "ULE"; break;
+          case FCmpInst::FCMP_OGE:   Out << "OGE"; break;
+          case FCmpInst::FCMP_UGE:   Out << "UGE"; break;
+          case FCmpInst::FCMP_TRUE:  Out << "TRUE"; break;
+          default: error("Invalid FCmp Predicate");
+          }
+          break;
+        case Instruction::Shl:     Out << "getShl("; break;
+        case Instruction::LShr:    Out << "getLShr("; break;
+        case Instruction::AShr:    Out << "getAShr("; break;
+        case Instruction::Select:  Out << "getSelect("; break;
+        case Instruction::ExtractElement: Out << "getExtractElement("; break;
+        case Instruction::InsertElement:  Out << "getInsertElement("; break;
+        case Instruction::ShuffleVector:  Out << "getShuffleVector("; break;
+        default:
+          error("Invalid constant expression");
+          break;
+        }
+        Out << getCppName(CE->getOperand(0));
+        for (unsigned i = 1; i < CE->getNumOperands(); ++i)
+          Out << ", " << getCppName(CE->getOperand(i));
+        Out << ");";
+      }
+    } else {
+      error("Bad Constant");
+      Out << "Constant* " << constName << " = 0; ";
+    }
+    nl(Out);
+  }
+
+  // printConstants - Emit every constant referenced by module M: global
+  // variable initializers plus every constant operand of every instruction.
+  // Fix: iterate the M parameter rather than the TheModule member, so the
+  // function actually honors its argument.
+  void CppWriter::printConstants(const Module* M) {
+    // Traverse all the global variables looking for constant initializers.
+    for (Module::const_global_iterator I = M->global_begin(),
+           E = M->global_end(); I != E; ++I)
+      if (I->hasInitializer())
+        printConstant(I->getInitializer());
+
+    // Traverse the LLVM functions looking for constants.
+    for (Module::const_iterator FI = M->begin(), FE = M->end();
+         FI != FE; ++FI) {
+      // Walk every operand of every instruction in every basic block.
+      for (Function::const_iterator BB = FI->begin(),
+             E = FI->end(); BB != E; ++BB) {
+        for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E;
+             ++I) {
+          for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+            if (Constant* C = dyn_cast<Constant>(I->getOperand(i))) {
+              printConstant(C);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  // printVariableUses - Emit everything a single global variable depends on:
+  // its type, and (if present) its initializer's type plus the declaration
+  // or definition of whatever the initializer refers to.
+  // Fix: the emitted section header was "/ Function Declarations" (missing
+  // a slash), producing an invalid comment in the generated C++.
+  void CppWriter::printVariableUses(const GlobalVariable *GV) {
+    nl(Out) << "// Type Definitions";
+    nl(Out);
+    printType(GV->getType());
+    if (GV->hasInitializer()) {
+      Constant *Init = GV->getInitializer();
+      printType(Init->getType());
+      if (Function *F = dyn_cast<Function>(Init)) {
+        nl(Out)<< "// Function Declarations"; nl(Out);
+        printFunctionHead(F);
+      } else if (GlobalVariable* gv = dyn_cast<GlobalVariable>(Init)) {
+        nl(Out) << "// Global Variable Declarations"; nl(Out);
+        printVariableHead(gv);
+
+        nl(Out) << "// Global Variable Definitions"; nl(Out);
+        printVariableBody(gv);
+      } else  {
+        // Ordinary constant initializer.
+        nl(Out) << "// Constant Definitions"; nl(Out);
+        printConstant(Init);
+      }
+    }
+  }
+
+  // printVariableHead - Emit the declaration of a global variable. In
+  // is_inline mode the generated code first looks the variable up in the
+  // module and only constructs it when absent; otherwise it is constructed
+  // unconditionally. Section, alignment and visibility attributes are
+  // emitted when they differ from the defaults.
+  // Fix: the is_inline lookup statement was emitted without quotes around
+  // the escaped name and without a terminating semicolon, so the generated
+  // C++ could not compile.
+  void CppWriter::printVariableHead(const GlobalVariable *GV) {
+    nl(Out) << "GlobalVariable* " << getCppName(GV);
+    if (is_inline) {
+      // NOTE(review): passing getGlobalContext() to Module::getGlobalVariable
+      // looks suspect — verify against the Module API of this LLVM revision.
+      Out << " = mod->getGlobalVariable(getGlobalContext(), \"";
+      printEscapedString(GV->getName());
+      Out << "\", " << getCppName(GV->getType()->getElementType())
+          << ", true);";
+      nl(Out) << "if (!" << getCppName(GV) << ") {";
+      in(); nl(Out) << getCppName(GV);
+    }
+    Out << " = new GlobalVariable(/*Module=*/*mod, ";
+    nl(Out) << "/*Type=*/";
+    printCppName(GV->getType()->getElementType());
+    Out << ",";
+    nl(Out) << "/*isConstant=*/" << (GV->isConstant()?"true":"false");
+    Out << ",";
+    nl(Out) << "/*Linkage=*/";
+    printLinkageType(GV->getLinkage());
+    Out << ",";
+    nl(Out) << "/*Initializer=*/0, ";
+    if (GV->hasInitializer()) {
+      Out << "// has initializer, specified below";
+    }
+    nl(Out) << "/*Name=*/\"";
+    printEscapedString(GV->getName());
+    Out << "\");";
+    nl(Out);
+
+    if (GV->hasSection()) {
+      printCppName(GV);
+      Out << "->setSection(\"";
+      printEscapedString(GV->getSection());
+      Out << "\");";
+      nl(Out);
+    }
+    if (GV->getAlignment()) {
+      printCppName(GV);
+      Out << "->setAlignment(" << utostr(GV->getAlignment()) << ");";
+      nl(Out);
+    }
+    if (GV->getVisibility() != GlobalValue::DefaultVisibility) {
+      printCppName(GV);
+      Out << "->setVisibility(";
+      printVisibilityType(GV->getVisibility());
+      Out << ");";
+      nl(Out);
+    }
+    if (is_inline) {
+      // Close the "if (!gvar) {" opened above.
+      out(); Out << "}"; nl(Out);
+    }
+  }
+
+  // printVariableBody - Emit the setInitializer(...) call for GV, if it has
+  // an initializer. The declaration itself is emitted by printVariableHead.
+  void CppWriter::printVariableBody(const GlobalVariable *GV) {
+    if (!GV->hasInitializer())
+      return;
+    printCppName(GV);
+    Out << "->setInitializer(" << getCppName(GV->getInitializer()) << ");";
+    nl(Out);
+  }
+
+  // getOpName - Return the generated-code name for V. Non-instructions and
+  // instructions that have already been emitted use their real name; an
+  // instruction that has not been printed yet gets a uniquely named
+  // Argument placeholder (remembered in ForwardRefs) which is replaced with
+  // the real value once the function body has been emitted.
+  std::string CppWriter::getOpName(Value* V) {
+    bool needsPlaceholder =
+        isa<Instruction>(V) && DefinedValues.find(V) == DefinedValues.end();
+    if (!needsPlaceholder)
+      return getCppName(V);
+
+    // Reuse an existing forward reference if one was already created for V.
+    ForwardRefMap::const_iterator FRI = ForwardRefs.find(V);
+    if (FRI != ForwardRefs.end())
+      return FRI->second;
+
+    // First reference: invent a unique placeholder name.
+    std::string name = "fwdref_" + utostr(uniqueNum++);
+
+    // Yes, this is a hack: an Argument is the smallest instantiable Value we
+    // can emit as a stand-in; these instances are replaced later.
+    Out << "Argument* " << name << " = new Argument("
+        << getCppName(V->getType()) << ");";
+    nl(Out);
+    ForwardRefs[V] = name;
+    return name;
+  }
+
+  // printInstruction - Emit C++ code that reconstructs the single
+  // instruction I into the basic block named by bbname. All operand names
+  // are resolved up front so that any forward-reference placeholders are
+  // emitted before the instruction line itself.
+  // Fix: the FCmp default case emitted "FCmpInst::BAD_ICMP_PREDICATE"
+  // (copy-paste from the ICmp case); it now emits BAD_FCMP_PREDICATE.
+  // Also normalized the mis-indented closing lines.
+  void CppWriter::printInstruction(const Instruction *I,
+                                   const std::string& bbname) {
+    std::string iName(getCppName(I));
+
+    // Before we emit this instruction, we need to take care of generating
+    // any forward references. So, we get the names of all the operands in
+    // advance. NOTE(review): raw new[]/delete[]; a container would be safer,
+    // but every path below reaches the delete at the end.
+    std::string* opNames = new std::string[I->getNumOperands()];
+    for (unsigned i = 0; i < I->getNumOperands(); i++) {
+      opNames[i] = getOpName(I->getOperand(i));
+    }
+
+    switch (I->getOpcode()) {
+    default:
+      error("Invalid instruction");
+      break;
+
+    case Instruction::Ret: {
+      const ReturnInst* ret =  cast<ReturnInst>(I);
+      Out << "ReturnInst::Create(getGlobalContext(), "
+          << (ret->getReturnValue() ? opNames[0] + ", " : "") << bbname << ");";
+      break;
+    }
+    case Instruction::Br: {
+      const BranchInst* br = cast<BranchInst>(I);
+      Out << "BranchInst::Create(" ;
+      if (br->getNumOperands() == 3 ) {
+        // Conditional branch: operands are (cond, false-dest, true-dest).
+        Out << opNames[2] << ", "
+            << opNames[1] << ", "
+            << opNames[0] << ", ";
+
+      } else if (br->getNumOperands() == 1) {
+        Out << opNames[0] << ", ";
+      } else {
+        error("Branch with 2 operands?");
+      }
+      Out << bbname << ");";
+      break;
+    }
+    case Instruction::Switch: {
+      const SwitchInst *SI = cast<SwitchInst>(I);
+      Out << "SwitchInst* " << iName << " = SwitchInst::Create("
+          << opNames[0] << ", "
+          << opNames[1] << ", "
+          << SI->getNumCases() << ", " << bbname << ");";
+      nl(Out);
+      // Case values and destinations are stored pairwise from operand 2 on.
+      for (unsigned i = 2; i != SI->getNumOperands(); i += 2) {
+        Out << iName << "->addCase("
+            << opNames[i] << ", "
+            << opNames[i+1] << ");";
+        nl(Out);
+      }
+      break;
+    }
+    case Instruction::IndirectBr: {
+      const IndirectBrInst *IBI = cast<IndirectBrInst>(I);
+      Out << "IndirectBrInst *" << iName << " = IndirectBrInst::Create("
+          << opNames[0] << ", " << IBI->getNumDestinations() << ");";
+      nl(Out);
+      for (unsigned i = 1; i != IBI->getNumOperands(); ++i) {
+        Out << iName << "->addDestination(" << opNames[i] << ");";
+        nl(Out);
+      }
+      break;
+    }
+    case Instruction::Invoke: {
+      // Operand layout assumed: 0=callee, 1=normal dest, 2=unwind dest,
+      // 3..n=actual arguments (pre-2011 operand order).
+      const InvokeInst* inv = cast<InvokeInst>(I);
+      Out << "std::vector<Value*> " << iName << "_params;";
+      nl(Out);
+      for (unsigned i = 3; i < inv->getNumOperands(); ++i) {
+        Out << iName << "_params.push_back("
+            << opNames[i] << ");";
+        nl(Out);
+      }
+      Out << "InvokeInst *" << iName << " = InvokeInst::Create("
+          << opNames[0] << ", "
+          << opNames[1] << ", "
+          << opNames[2] << ", "
+          << iName << "_params.begin(), " << iName << "_params.end(), \"";
+      printEscapedString(inv->getName());
+      Out << "\", " << bbname << ");";
+      nl(Out) << iName << "->setCallingConv(";
+      printCallingConv(inv->getCallingConv());
+      Out << ");";
+      printAttributes(inv->getAttributes(), iName);
+      Out << iName << "->setAttributes(" << iName << "_PAL);";
+      nl(Out);
+      break;
+    }
+    case Instruction::Unwind: {
+      Out << "new UnwindInst("
+          << bbname << ");";
+      break;
+    }
+    case Instruction::Unreachable: {
+      Out << "new UnreachableInst("
+          << "getGlobalContext(), "
+          << bbname << ");";
+      break;
+    }
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr: {
+      Out << "BinaryOperator* " << iName << " = BinaryOperator::Create(";
+      switch (I->getOpcode()) {
+      case Instruction::Add:  Out << "Instruction::Add"; break;
+      case Instruction::FAdd: Out << "Instruction::FAdd"; break;
+      case Instruction::Sub:  Out << "Instruction::Sub"; break;
+      case Instruction::FSub: Out << "Instruction::FSub"; break;
+      case Instruction::Mul:  Out << "Instruction::Mul"; break;
+      case Instruction::FMul: Out << "Instruction::FMul"; break;
+      case Instruction::UDiv: Out << "Instruction::UDiv"; break;
+      case Instruction::SDiv: Out << "Instruction::SDiv"; break;
+      case Instruction::FDiv: Out << "Instruction::FDiv"; break;
+      case Instruction::URem: Out << "Instruction::URem"; break;
+      case Instruction::SRem: Out << "Instruction::SRem"; break;
+      case Instruction::FRem: Out << "Instruction::FRem"; break;
+      case Instruction::And:  Out << "Instruction::And"; break;
+      case Instruction::Or:   Out << "Instruction::Or";  break;
+      case Instruction::Xor:  Out << "Instruction::Xor"; break;
+      case Instruction::Shl:  Out << "Instruction::Shl"; break;
+      case Instruction::LShr: Out << "Instruction::LShr"; break;
+      case Instruction::AShr: Out << "Instruction::AShr"; break;
+      default: Out << "Instruction::BadOpCode"; break;
+      }
+      Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+      printEscapedString(I->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::FCmp: {
+      Out << "FCmpInst* " << iName << " = new FCmpInst(*" << bbname << ", ";
+      switch (cast<FCmpInst>(I)->getPredicate()) {
+      case FCmpInst::FCMP_FALSE: Out << "FCmpInst::FCMP_FALSE"; break;
+      case FCmpInst::FCMP_OEQ:   Out << "FCmpInst::FCMP_OEQ"; break;
+      case FCmpInst::FCMP_OGT:   Out << "FCmpInst::FCMP_OGT"; break;
+      case FCmpInst::FCMP_OGE:   Out << "FCmpInst::FCMP_OGE"; break;
+      case FCmpInst::FCMP_OLT:   Out << "FCmpInst::FCMP_OLT"; break;
+      case FCmpInst::FCMP_OLE:   Out << "FCmpInst::FCMP_OLE"; break;
+      case FCmpInst::FCMP_ONE:   Out << "FCmpInst::FCMP_ONE"; break;
+      case FCmpInst::FCMP_ORD:   Out << "FCmpInst::FCMP_ORD"; break;
+      case FCmpInst::FCMP_UNO:   Out << "FCmpInst::FCMP_UNO"; break;
+      case FCmpInst::FCMP_UEQ:   Out << "FCmpInst::FCMP_UEQ"; break;
+      case FCmpInst::FCMP_UGT:   Out << "FCmpInst::FCMP_UGT"; break;
+      case FCmpInst::FCMP_UGE:   Out << "FCmpInst::FCMP_UGE"; break;
+      case FCmpInst::FCMP_ULT:   Out << "FCmpInst::FCMP_ULT"; break;
+      case FCmpInst::FCMP_ULE:   Out << "FCmpInst::FCMP_ULE"; break;
+      case FCmpInst::FCMP_UNE:   Out << "FCmpInst::FCMP_UNE"; break;
+      case FCmpInst::FCMP_TRUE:  Out << "FCmpInst::FCMP_TRUE"; break;
+      // Was BAD_ICMP_PREDICATE: copy-paste from the ICmp case below.
+      default: Out << "FCmpInst::BAD_FCMP_PREDICATE"; break;
+      }
+      Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+      printEscapedString(I->getName());
+      Out << "\");";
+      break;
+    }
+    case Instruction::ICmp: {
+      Out << "ICmpInst* " << iName << " = new ICmpInst(*" << bbname << ", ";
+      switch (cast<ICmpInst>(I)->getPredicate()) {
+      case ICmpInst::ICMP_EQ:  Out << "ICmpInst::ICMP_EQ";  break;
+      case ICmpInst::ICMP_NE:  Out << "ICmpInst::ICMP_NE";  break;
+      case ICmpInst::ICMP_ULE: Out << "ICmpInst::ICMP_ULE"; break;
+      case ICmpInst::ICMP_SLE: Out << "ICmpInst::ICMP_SLE"; break;
+      case ICmpInst::ICMP_UGE: Out << "ICmpInst::ICMP_UGE"; break;
+      case ICmpInst::ICMP_SGE: Out << "ICmpInst::ICMP_SGE"; break;
+      case ICmpInst::ICMP_ULT: Out << "ICmpInst::ICMP_ULT"; break;
+      case ICmpInst::ICMP_SLT: Out << "ICmpInst::ICMP_SLT"; break;
+      case ICmpInst::ICMP_UGT: Out << "ICmpInst::ICMP_UGT"; break;
+      case ICmpInst::ICMP_SGT: Out << "ICmpInst::ICMP_SGT"; break;
+      default: Out << "ICmpInst::BAD_ICMP_PREDICATE"; break;
+      }
+      Out << ", " << opNames[0] << ", " << opNames[1] << ", \"";
+      printEscapedString(I->getName());
+      Out << "\");";
+      break;
+    }
+    case Instruction::Alloca: {
+      const AllocaInst* allocaI = cast<AllocaInst>(I);
+      Out << "AllocaInst* " << iName << " = new AllocaInst("
+          << getCppName(allocaI->getAllocatedType()) << ", ";
+      if (allocaI->isArrayAllocation())
+        Out << opNames[0] << ", ";
+      Out << "\"";
+      printEscapedString(allocaI->getName());
+      Out << "\", " << bbname << ");";
+      if (allocaI->getAlignment())
+        nl(Out) << iName << "->setAlignment("
+            << allocaI->getAlignment() << ");";
+      break;
+    }
+    case Instruction::Load: {
+      const LoadInst* load = cast<LoadInst>(I);
+      Out << "LoadInst* " << iName << " = new LoadInst("
+          << opNames[0] << ", \"";
+      printEscapedString(load->getName());
+      Out << "\", " << (load->isVolatile() ? "true" : "false" )
+          << ", " << bbname << ");";
+      break;
+    }
+    case Instruction::Store: {
+      const StoreInst* store = cast<StoreInst>(I);
+      Out << " new StoreInst("
+          << opNames[0] << ", "
+          << opNames[1] << ", "
+          << (store->isVolatile() ? "true" : "false")
+          << ", " << bbname << ");";
+      break;
+    }
+    case Instruction::GetElementPtr: {
+      const GetElementPtrInst* gep = cast<GetElementPtrInst>(I);
+      if (gep->getNumOperands() <= 2) {
+        // Zero or one index: use the direct Create overloads.
+        Out << "GetElementPtrInst* " << iName << " = GetElementPtrInst::Create("
+            << opNames[0];
+        if (gep->getNumOperands() == 2)
+          Out << ", " << opNames[1];
+      } else {
+        // Multiple indices: build a vector and use the iterator overload.
+        Out << "std::vector<Value*> " << iName << "_indices;";
+        nl(Out);
+        for (unsigned i = 1; i < gep->getNumOperands(); ++i ) {
+          Out << iName << "_indices.push_back("
+              << opNames[i] << ");";
+          nl(Out);
+        }
+        Out << "Instruction* " << iName << " = GetElementPtrInst::Create("
+            << opNames[0] << ", " << iName << "_indices.begin(), "
+            << iName << "_indices.end()";
+      }
+      Out << ", \"";
+      printEscapedString(gep->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::PHI: {
+      const PHINode* phi = cast<PHINode>(I);
+
+      Out << "PHINode* " << iName << " = PHINode::Create("
+          << getCppName(phi->getType()) << ", \"";
+      printEscapedString(phi->getName());
+      Out << "\", " << bbname << ");";
+      nl(Out) << iName << "->reserveOperandSpace("
+        << phi->getNumIncomingValues()
+          << ");";
+      nl(Out);
+      // Incoming (value, block) pairs are stored pairwise in the operands.
+      for (unsigned i = 0; i < phi->getNumOperands(); i+=2) {
+        Out << iName << "->addIncoming("
+            << opNames[i] << ", " << opNames[i+1] << ");";
+        nl(Out);
+      }
+      break;
+    }
+    case Instruction::Trunc:
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPTrunc:
+    case Instruction::FPExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::UIToFP:
+    case Instruction::SIToFP:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::BitCast: {
+      const CastInst* cst = cast<CastInst>(I);
+      Out << "CastInst* " << iName << " = new ";
+      switch (I->getOpcode()) {
+      case Instruction::Trunc:    Out << "TruncInst"; break;
+      case Instruction::ZExt:     Out << "ZExtInst"; break;
+      case Instruction::SExt:     Out << "SExtInst"; break;
+      case Instruction::FPTrunc:  Out << "FPTruncInst"; break;
+      case Instruction::FPExt:    Out << "FPExtInst"; break;
+      case Instruction::FPToUI:   Out << "FPToUIInst"; break;
+      case Instruction::FPToSI:   Out << "FPToSIInst"; break;
+      case Instruction::UIToFP:   Out << "UIToFPInst"; break;
+      case Instruction::SIToFP:   Out << "SIToFPInst"; break;
+      case Instruction::PtrToInt: Out << "PtrToIntInst"; break;
+      case Instruction::IntToPtr: Out << "IntToPtrInst"; break;
+      case Instruction::BitCast:  Out << "BitCastInst"; break;
+      default: assert(!"Unreachable"); break;
+      }
+      Out << "(" << opNames[0] << ", "
+          << getCppName(cst->getType()) << ", \"";
+      printEscapedString(cst->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::Call: {
+      const CallInst* call = cast<CallInst>(I);
+      if (const InlineAsm* ila = dyn_cast<InlineAsm>(call->getCalledValue())) {
+        Out << "InlineAsm* " << getCppName(ila) << " = InlineAsm::get("
+            << getCppName(ila->getFunctionType()) << ", \""
+            << ila->getAsmString() << "\", \""
+            << ila->getConstraintString() << "\","
+            << (ila->hasSideEffects() ? "true" : "false") << ");";
+        nl(Out);
+      }
+      // Operand layout assumed: 0=callee, 1..n=arguments (pre-2011 order).
+      if (call->getNumOperands() > 2) {
+        Out << "std::vector<Value*> " << iName << "_params;";
+        nl(Out);
+        for (unsigned i = 1; i < call->getNumOperands(); ++i) {
+          Out << iName << "_params.push_back(" << opNames[i] << ");";
+          nl(Out);
+        }
+        Out << "CallInst* " << iName << " = CallInst::Create("
+            << opNames[0] << ", " << iName << "_params.begin(), "
+            << iName << "_params.end(), \"";
+      } else if (call->getNumOperands() == 2) {
+        Out << "CallInst* " << iName << " = CallInst::Create("
+            << opNames[0] << ", " << opNames[1] << ", \"";
+      } else {
+        Out << "CallInst* " << iName << " = CallInst::Create(" << opNames[0]
+            << ", \"";
+      }
+      printEscapedString(call->getName());
+      Out << "\", " << bbname << ");";
+      nl(Out) << iName << "->setCallingConv(";
+      printCallingConv(call->getCallingConv());
+      Out << ");";
+      nl(Out) << iName << "->setTailCall("
+          << (call->isTailCall() ? "true":"false");
+      Out << ");";
+      printAttributes(call->getAttributes(), iName);
+      Out << iName << "->setAttributes(" << iName << "_PAL);";
+      nl(Out);
+      break;
+    }
+    case Instruction::Select: {
+      const SelectInst* sel = cast<SelectInst>(I);
+      Out << "SelectInst* " << getCppName(sel) << " = SelectInst::Create(";
+      Out << opNames[0] << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+      printEscapedString(sel->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::UserOp1:
+      /// FALL THROUGH
+    case Instruction::UserOp2: {
+      /// FIXME: What should be done here?
+      break;
+    }
+    case Instruction::VAArg: {
+      const VAArgInst* va = cast<VAArgInst>(I);
+      Out << "VAArgInst* " << getCppName(va) << " = new VAArgInst("
+          << opNames[0] << ", " << getCppName(va->getType()) << ", \"";
+      printEscapedString(va->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::ExtractElement: {
+      const ExtractElementInst* eei = cast<ExtractElementInst>(I);
+      Out << "ExtractElementInst* " << getCppName(eei)
+          << " = new ExtractElementInst(" << opNames[0]
+          << ", " << opNames[1] << ", \"";
+      printEscapedString(eei->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::InsertElement: {
+      const InsertElementInst* iei = cast<InsertElementInst>(I);
+      Out << "InsertElementInst* " << getCppName(iei)
+          << " = InsertElementInst::Create(" << opNames[0]
+          << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+      printEscapedString(iei->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::ShuffleVector: {
+      const ShuffleVectorInst* svi = cast<ShuffleVectorInst>(I);
+      Out << "ShuffleVectorInst* " << getCppName(svi)
+          << " = new ShuffleVectorInst(" << opNames[0]
+          << ", " << opNames[1] << ", " << opNames[2] << ", \"";
+      printEscapedString(svi->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::ExtractValue: {
+      const ExtractValueInst *evi = cast<ExtractValueInst>(I);
+      Out << "std::vector<unsigned> " << iName << "_indices;";
+      nl(Out);
+      for (unsigned i = 0; i < evi->getNumIndices(); ++i) {
+        Out << iName << "_indices.push_back("
+            << evi->idx_begin()[i] << ");";
+        nl(Out);
+      }
+      Out << "ExtractValueInst* " << getCppName(evi)
+          << " = ExtractValueInst::Create(" << opNames[0]
+          << ", "
+          << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+      printEscapedString(evi->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    case Instruction::InsertValue: {
+      const InsertValueInst *ivi = cast<InsertValueInst>(I);
+      Out << "std::vector<unsigned> " << iName << "_indices;";
+      nl(Out);
+      for (unsigned i = 0; i < ivi->getNumIndices(); ++i) {
+        Out << iName << "_indices.push_back("
+            << ivi->idx_begin()[i] << ");";
+        nl(Out);
+      }
+      Out << "InsertValueInst* " << getCppName(ivi)
+          << " = InsertValueInst::Create(" << opNames[0]
+          << ", " << opNames[1] << ", "
+          << iName << "_indices.begin(), " << iName << "_indices.end(), \"";
+      printEscapedString(ivi->getName());
+      Out << "\", " << bbname << ");";
+      break;
+    }
+    }
+    DefinedValues.insert(I);
+    nl(Out);
+    delete [] opNames;
+  }
+
+  // Print out the types, constants and declarations needed by one function.
+  // In inline mode (is_inline) the signature types are assumed to have been
+  // emitted already by the surrounding module, so only the body's uses are
+  // collected here.
+  void CppWriter::printFunctionUses(const Function* F) {
+    nl(Out) << "// Type Definitions"; nl(Out);
+    if (!is_inline) {
+      // Print the function's return type
+      printType(F->getReturnType());
+
+      // Print the function's function type
+      printType(F->getFunctionType());
+
+      // Print the types of each of the function's arguments
+      for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+           AI != AE; ++AI) {
+        printType(AI->getType());
+      }
+    }
+
+    // Print type definitions for every type referenced by an instruction and
+    // make a note of any global values or constants that are referenced
+    SmallPtrSet<GlobalValue*,64> gvs;
+    SmallPtrSet<Constant*,64> consts;
+    for (Function::const_iterator BB = F->begin(), BE = F->end();
+         BB != BE; ++BB){
+      for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+           I != E; ++I) {
+        // Print the type of the instruction itself
+        printType(I->getType());
+
+        // Print the type of each of the instruction's operands
+        for (unsigned i = 0; i < I->getNumOperands(); ++i) {
+          Value* operand = I->getOperand(i);
+          printType(operand->getType());
+
+          // If the operand references a GVal or Constant, make a note of it
+          if (GlobalValue* GV = dyn_cast<GlobalValue>(operand)) {
+            gvs.insert(GV);
+            // A global variable's initializer is a constant that must be
+            // emitted too, even though it is not a direct operand here.
+            if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+              if (GVar->hasInitializer())
+                consts.insert(GVar->getInitializer());
+          } else if (Constant* C = dyn_cast<Constant>(operand))
+            consts.insert(C);
+        }
+      }
+    }
+
+    // Print the function declarations for any functions encountered
+    nl(Out) << "// Function Declarations"; nl(Out);
+    for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+         I != E; ++I) {
+      if (Function* Fun = dyn_cast<Function>(*I)) {
+        // In inline mode the function being inlined is already in scope.
+        if (!is_inline || Fun != F)
+          printFunctionHead(Fun);
+      }
+    }
+
+    // Print the global variable declarations for any variables encountered
+    nl(Out) << "// Global Variable Declarations"; nl(Out);
+    for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+         I != E; ++I) {
+      if (GlobalVariable* F = dyn_cast<GlobalVariable>(*I))
+        printVariableHead(F);
+    }
+
+  // Print the constants found
+    nl(Out) << "// Constant Definitions"; nl(Out);
+    for (SmallPtrSet<Constant*,64>::iterator I = consts.begin(),
+           E = consts.end(); I != E; ++I) {
+      printConstant(*I);
+    }
+
+    // Process the global variables definitions now that all the constants have
+    // been emitted. These definitions just couple the gvars with their constant
+    // initializers.
+    nl(Out) << "// Global Variable Definitions"; nl(Out);
+    for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
+         I != E; ++I) {
+      if (GlobalVariable* GV = dyn_cast<GlobalVariable>(*I))
+        printVariableBody(GV);
+    }
+  }
+
+  // Emit the C++ code that declares one Function and sets its attributes
+  // (calling convention, section, alignment, visibility, GC, attribute list).
+  // In inline mode the emitted code first tries mod->getFunction() and only
+  // creates the Function when the lookup fails, wrapping the creation in an
+  // "if (!F) { ... }" block.
+  void CppWriter::printFunctionHead(const Function* F) {
+    nl(Out) << "Function* " << getCppName(F);
+    if (is_inline) {
+      Out << " = mod->getFunction(\"";
+      printEscapedString(F->getName());
+      Out << "\", " << getCppName(F->getFunctionType()) << ");";
+      nl(Out) << "if (!" << getCppName(F) << ") {";
+      nl(Out) << getCppName(F);
+    }
+    Out<< " = Function::Create(";
+    nl(Out,1) << "/*Type=*/" << getCppName(F->getFunctionType()) << ",";
+    nl(Out) << "/*Linkage=*/";
+    printLinkageType(F->getLinkage());
+    Out << ",";
+    nl(Out) << "/*Name=*/\"";
+    printEscapedString(F->getName());
+    Out << "\", mod); " << (F->isDeclaration()? "// (external, no body)" : "");
+    nl(Out,-1);
+    printCppName(F);
+    Out << "->setCallingConv(";
+    printCallingConv(F->getCallingConv());
+    Out << ");";
+    nl(Out);
+    // The following properties are only emitted when they differ from the
+    // defaults Function::Create would give.
+    if (F->hasSection()) {
+      printCppName(F);
+      Out << "->setSection(\"" << F->getSection() << "\");";
+      nl(Out);
+    }
+    if (F->getAlignment()) {
+      printCppName(F);
+      Out << "->setAlignment(" << F->getAlignment() << ");";
+      nl(Out);
+    }
+    if (F->getVisibility() != GlobalValue::DefaultVisibility) {
+      printCppName(F);
+      Out << "->setVisibility(";
+      printVisibilityType(F->getVisibility());
+      Out << ");";
+      nl(Out);
+    }
+    if (F->hasGC()) {
+      printCppName(F);
+      Out << "->setGC(\"" << F->getGC() << "\");";
+      nl(Out);
+    }
+    // Close the "if (!F) {" opened above in inline mode.
+    if (is_inline) {
+      Out << "}";
+      nl(Out);
+    }
+    printAttributes(F->getAttributes(), getCppName(F));
+    printCppName(F);
+    Out << "->setAttributes(" << getCppName(F) << "_PAL);";
+    nl(Out);
+  }
+
+  // Emit the C++ code that reconstructs a function's body: argument values,
+  // all basic blocks (created first so branches can reference them), then the
+  // instructions, and finally patch-up code for any forward references.
+  void CppWriter::printFunctionBody(const Function *F) {
+    if (F->isDeclaration())
+      return; // external functions have no bodies.
+
+    // Clear the DefinedValues and ForwardRefs maps because we can't have
+    // cross-function forward refs
+    ForwardRefs.clear();
+    DefinedValues.clear();
+
+    // Create all the argument values
+    // (skipped in inline mode: the generated wrapper receives the argument
+    // Value*s as parameters instead of walking arg_begin()).
+    if (!is_inline) {
+      if (!F->arg_empty()) {
+        Out << "Function::arg_iterator args = " << getCppName(F)
+            << "->arg_begin();";
+        nl(Out);
+      }
+      for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+           AI != AE; ++AI) {
+        Out << "Value* " << getCppName(AI) << " = args++;";
+        nl(Out);
+        if (AI->hasName()) {
+          Out << getCppName(AI) << "->setName(\"" << AI->getName() << "\");";
+          nl(Out);
+        }
+      }
+    }
+
+    // Create all the basic blocks
+    nl(Out);
+    for (Function::const_iterator BI = F->begin(), BE = F->end();
+         BI != BE; ++BI) {
+      std::string bbname(getCppName(BI));
+      Out << "BasicBlock* " << bbname <<
+             " = BasicBlock::Create(getGlobalContext(), \"";
+      if (BI->hasName())
+        printEscapedString(BI->getName());
+      Out << "\"," << getCppName(BI->getParent()) << ",0);";
+      nl(Out);
+    }
+
+    // Output all of its basic blocks... for the function
+    for (Function::const_iterator BI = F->begin(), BE = F->end();
+         BI != BE; ++BI) {
+      std::string bbname(getCppName(BI));
+      nl(Out) << "// Block " << BI->getName() << " (" << bbname << ")";
+      nl(Out);
+
+      // Output all of the instructions in the basic block...
+      for (BasicBlock::const_iterator I = BI->begin(), E = BI->end();
+           I != E; ++I) {
+        printInstruction(I,bbname);
+      }
+    }
+
+    // Loop over the ForwardRefs and resolve them now that all instructions
+    // are generated.
+    if (!ForwardRefs.empty()) {
+      nl(Out) << "// Resolve Forward References";
+      nl(Out);
+    }
+
+    // Each forward reference was emitted as a placeholder value; replace the
+    // placeholder's uses with the real value and delete the placeholder.
+    while (!ForwardRefs.empty()) {
+      ForwardRefMap::iterator I = ForwardRefs.begin();
+      Out << I->second << "->replaceAllUsesWith("
+          << getCppName(I->first) << "); delete " << I->second << ";";
+      nl(Out);
+      ForwardRefs.erase(I);
+    }
+  }
+
+  // Emit a C++ function named 'fname' that inlines the LLVM function 'func'
+  // into an existing module.  The generated function takes the Module*, the
+  // Function* and one Value* per formal argument, and returns the function's
+  // entry BasicBlock.
+  void CppWriter::printInline(const std::string& fname,
+                              const std::string& func) {
+    const Function* F = TheModule->getFunction(func);
+    if (!F) {
+      error(std::string("Function '") + func + "' not found in input module");
+      return;
+    }
+    if (F->isDeclaration()) {
+      error(std::string("Function '") + func + "' is external!");
+      return;
+    }
+    nl(Out) << "BasicBlock* " << fname << "(Module* mod, Function *"
+            << getCppName(F);
+    unsigned arg_count = 1;
+    for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+         AI != AE; ++AI) {
+      // Number each parameter distinctly.  Previously arg_count was never
+      // incremented, so every parameter was emitted as "arg_1" and the
+      // generated code failed to compile for multi-argument functions.
+      Out << ", Value* arg_" << arg_count++;
+    }
+    Out << ") {";
+    nl(Out);
+    is_inline = true;
+    printFunctionUses(F);
+    printFunctionBody(F);
+    is_inline = false;
+    Out << "return " << getCppName(F->begin()) << ";";
+    nl(Out) << "}";
+    nl(Out);
+  }
+
+  // Emit the body of the module-construction function in dependency order:
+  // types, function declarations, global variable declarations, constants,
+  // global variable initializers, and finally function bodies.
+  void CppWriter::printModuleBody() {
+    // Print out all the type definitions
+    nl(Out) << "// Type Definitions"; nl(Out);
+    printTypes(TheModule);
+
+    // Functions can call each other and global variables can reference them so
+    // define all the functions first before emitting their function bodies.
+    nl(Out) << "// Function Declarations"; nl(Out);
+    for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+         I != E; ++I)
+      printFunctionHead(I);
+
+    // Process the global variables declarations. We can't initialize them
+    // until after the constants are printed so just print a header for each
+    // global.
+    // NOTE(review): the embedded "\n" below differs from the other section
+    // headers (which rely on nl(Out) alone) -- likely unintentional; confirm.
+    nl(Out) << "// Global Variable Declarations\n"; nl(Out);
+    for (Module::const_global_iterator I = TheModule->global_begin(),
+           E = TheModule->global_end(); I != E; ++I) {
+      printVariableHead(I);
+    }
+
+    // Print out all the constants definitions. Constants don't recurse except
+    // through GlobalValues. All GlobalValues have been declared at this point
+    // so we can proceed to generate the constants.
+    nl(Out) << "// Constant Definitions"; nl(Out);
+    printConstants(TheModule);
+
+    // Process the global variables definitions now that all the constants have
+    // been emitted. These definitions just couple the gvars with their constant
+    // initializers.
+    nl(Out) << "// Global Variable Definitions"; nl(Out);
+    for (Module::const_global_iterator I = TheModule->global_begin(),
+           E = TheModule->global_end(); I != E; ++I) {
+      printVariableBody(I);
+    }
+
+    // Finally, we can safely put out all of the function bodies.
+    nl(Out) << "// Function Definitions"; nl(Out);
+    for (Module::const_iterator I = TheModule->begin(), E = TheModule->end();
+         I != E; ++I) {
+      if (!I->isDeclaration()) {
+        nl(Out) << "// Function: " << I->getName() << " (" << getCppName(I)
+                << ")";
+        nl(Out) << "{";
+        nl(Out,1);
+        printFunctionBody(I);
+        nl(Out,-1) << "}";
+        nl(Out);
+      }
+    }
+  }
+
+  // Emit a complete standalone C++ program: the #include boilerplate, a
+  // main() that builds the module via 'fname'(), verifies it and prints it,
+  // followed by the module-construction function itself.
+  void CppWriter::printProgram(const std::string& fname,
+                               const std::string& mName) {
+    Out << "#include <llvm/LLVMContext.h>\n";
+    Out << "#include <llvm/Module.h>\n";
+    Out << "#include <llvm/DerivedTypes.h>\n";
+    Out << "#include <llvm/Constants.h>\n";
+    Out << "#include <llvm/GlobalVariable.h>\n";
+    Out << "#include <llvm/Function.h>\n";
+    Out << "#include <llvm/CallingConv.h>\n";
+    Out << "#include <llvm/BasicBlock.h>\n";
+    Out << "#include <llvm/Instructions.h>\n";
+    Out << "#include <llvm/InlineAsm.h>\n";
+    Out << "#include <llvm/Support/FormattedStream.h>\n";
+    Out << "#include <llvm/Support/MathExtras.h>\n";
+    Out << "#include <llvm/Pass.h>\n";
+    Out << "#include <llvm/PassManager.h>\n";
+    Out << "#include <llvm/ADT/SmallVector.h>\n";
+    Out << "#include <llvm/Analysis/Verifier.h>\n";
+    Out << "#include <llvm/Assembly/PrintModulePass.h>\n";
+    Out << "#include <algorithm>\n";
+    Out << "using namespace llvm;\n\n";
+    Out << "Module* " << fname << "();\n\n";
+    Out << "int main(int argc, char**argv) {\n";
+    Out << "  Module* Mod = " << fname << "();\n";
+    Out << "  verifyModule(*Mod, PrintMessageAction);\n";
+    Out << "  PassManager PM;\n";
+    Out << "  PM.add(createPrintModulePass(&outs()));\n";
+    Out << "  PM.run(*Mod);\n";
+    Out << "  return 0;\n";
+    Out << "}\n\n";
+    // The function main() calls is generated by the module printer.
+    printModule(fname,mName);
+  }
+
+  // Emit a function 'fname' that constructs a new Module named 'mName',
+  // applies the module-level properties (data layout, target triple, module
+  // inline asm, dependent libraries), fills in the body and returns it.
+  void CppWriter::printModule(const std::string& fname,
+                              const std::string& mName) {
+    nl(Out) << "Module* " << fname << "() {";
+    nl(Out,1) << "// Module Construction";
+    nl(Out) << "Module* mod = new Module(\"";
+    printEscapedString(mName);
+    Out << "\", getGlobalContext());";
+    // Guard each property on its own emptiness.  This previously tested
+    // getTargetTriple() before emitting the data layout (copy-paste bug), so
+    // modules with a layout but no triple lost their layout and vice versa.
+    if (!TheModule->getDataLayout().empty()) {
+      nl(Out) << "mod->setDataLayout(\"" << TheModule->getDataLayout() << "\");";
+    }
+    if (!TheModule->getTargetTriple().empty()) {
+      nl(Out) << "mod->setTargetTriple(\"" << TheModule->getTargetTriple()
+              << "\");";
+    }
+
+    if (!TheModule->getModuleInlineAsm().empty()) {
+      nl(Out) << "mod->setModuleInlineAsm(\"";
+      printEscapedString(TheModule->getModuleInlineAsm());
+      Out << "\");";
+    }
+    nl(Out);
+
+    // Loop over the dependent libraries and emit them.
+    Module::lib_iterator LI = TheModule->lib_begin();
+    Module::lib_iterator LE = TheModule->lib_end();
+    while (LI != LE) {
+      Out << "mod->addLibrary(\"" << *LI << "\");";
+      nl(Out);
+      ++LI;
+    }
+    printModuleBody();
+    nl(Out) << "return mod;";
+    nl(Out,-1) << "}";
+    nl(Out);
+  }
+
+  // Emit a function 'fname' that populates an existing Module* 'mod' with
+  // the contents of TheModule (renaming it to 'mName') and returns it.
+  void CppWriter::printContents(const std::string& fname,
+                                const std::string& mName) {
+    // Header: the generated function receives the module to fill in.
+    Out << "\nModule* " << fname << "(Module *mod) {\n"
+        << "\nmod->setModuleIdentifier(\"";
+    // Escape the name so it forms a valid C++ string literal.
+    printEscapedString(mName);
+    Out << "\");\n";
+    // Types, globals, constants and function bodies.
+    printModuleBody();
+    Out << "\nreturn mod;\n"
+        << "\n}\n";
+  }
+
+  // Emit a function 'fname' that adds the single LLVM function 'funcName'
+  // (declaration, uses, and body) to an existing module and returns it.
+  void CppWriter::printFunction(const std::string& fname,
+                                const std::string& funcName) {
+    const Function* F = TheModule->getFunction(funcName);
+    if (!F) {
+      error(std::string("Function '") + funcName + "' not found in input module");
+      return;
+    }
+    Out << "\nFunction* " << fname << "(Module *mod) {\n";
+    // Dependencies (types, referenced globals/constants) must come first.
+    printFunctionUses(F);
+    printFunctionHead(F);
+    printFunctionBody(F);
+    Out << "return " << getCppName(F) << ";\n";
+    Out << "}\n";
+  }
+
+  // Emit one "define_<name>" generator function for every function that has
+  // a body in the module; declarations are skipped since there is nothing to
+  // reproduce for them.
+  void CppWriter::printFunctions() {
+    for (Module::const_iterator I = TheModule->getFunctionList().begin(),
+           IE = TheModule->getFunctionList().end(); I != IE; ++I) {
+      const Function &func = *I;
+      if (func.isDeclaration())
+        continue;
+      // The generated entry point is named after the LLVM function.
+      std::string name("define_");
+      name += func.getName();
+      printFunction(name, func.getName());
+    }
+  }
+
+  // Emit a function 'fname' that adds the single global variable 'varName'
+  // (its type uses, declaration, and initializer) to an existing module and
+  // returns it.
+  void CppWriter::printVariable(const std::string& fname,
+                                const std::string& varName) {
+    const GlobalVariable* GV = TheModule->getNamedGlobal(varName);
+
+    if (!GV) {
+      error(std::string("Variable '") + varName + "' not found in input module");
+      return;
+    }
+    Out << "\nGlobalVariable* " << fname << "(Module *mod) {\n";
+    printVariableUses(GV);
+    printVariableHead(GV);
+    printVariableBody(GV);
+    Out << "return " << getCppName(GV) << ";\n";
+    Out << "}\n";
+  }
+
+  // Emit a function 'fname' that reconstructs the named type 'typeName' from
+  // the module's type symbol table and returns it.
+  void CppWriter::printType(const std::string& fname,
+                            const std::string& typeName) {
+    const Type* Ty = TheModule->getTypeByName(typeName);
+    if (!Ty) {
+      error(std::string("Type '") + typeName + "' not found in input module");
+      return;
+    }
+    Out << "\nType* " << fname << "(Module *mod) {\n";
+    // Overload resolution: this calls printType(const Type*), the emitter.
+    printType(Ty);
+    Out << "return " << getCppName(Ty) << ";\n";
+    Out << "}\n";
+  }
+
+  // Pass entry point: dispatch to the requested generation mode (whole
+  // program, module, contents, single function, all functions, inline
+  // wrapper, variable, or type).  Never modifies the module (returns false).
+  bool CppWriter::runOnModule(Module &M) {
+    TheModule = &M;
+
+    // Emit a header
+    Out << "// Generated by llvm2cpp - DO NOT MODIFY!\n\n";
+
+    // Get the name of the function we're supposed to generate
+    std::string fname = FuncName.getValue();
+
+    // Get the name of the thing we are to generate
+    std::string tgtname = NameToGenerate.getValue();
+    if (GenerationType == GenModule ||
+        GenerationType == GenContents ||
+        GenerationType == GenProgram ||
+        GenerationType == GenFunctions) {
+      // "!bad!" is the sentinel default of the -for option; for whole-module
+      // modes we can fall back to the module identifier instead.
+      if (tgtname == "!bad!") {
+        if (M.getModuleIdentifier() == "-")
+          tgtname = "<stdin>";
+        else
+          tgtname = M.getModuleIdentifier();
+      }
+    } else if (tgtname == "!bad!")
+      error("You must use the -for option with -gen-{function,variable,type}");
+
+    // Each mode supplies a default generator-function name when -funcname
+    // was not given.
+    switch (WhatToGenerate(GenerationType)) {
+     case GenProgram:
+      if (fname.empty())
+        fname = "makeLLVMModule";
+      printProgram(fname,tgtname);
+      break;
+     case GenModule:
+      if (fname.empty())
+        fname = "makeLLVMModule";
+      printModule(fname,tgtname);
+      break;
+     case GenContents:
+      if (fname.empty())
+        fname = "makeLLVMModuleContents";
+      printContents(fname,tgtname);
+      break;
+     case GenFunction:
+      if (fname.empty())
+        fname = "makeLLVMFunction";
+      printFunction(fname,tgtname);
+      break;
+     case GenFunctions:
+      printFunctions();
+      break;
+     case GenInline:
+      if (fname.empty())
+        fname = "makeLLVMInline";
+      printInline(fname,tgtname);
+      break;
+     case GenVariable:
+      if (fname.empty())
+        fname = "makeLLVMVariable";
+      printVariable(fname,tgtname);
+      break;
+     case GenType:
+      if (fname.empty())
+        fname = "makeLLVMType";
+      printType(fname,tgtname);
+      break;
+     default:
+      error("Invalid generation option");
+    }
+
+    return false;
+  }
+}
+
+char CppWriter::ID = 0;
+
+//===----------------------------------------------------------------------===//
+//                       External Interface declaration
+//===----------------------------------------------------------------------===//
+
+// The C++ backend only knows how to emit "assembly" (the generated C++
+// source).  Any other requested file type is rejected by returning true,
+// which signals failure to the TargetMachine framework.
+bool CPPTargetMachine::addPassesToEmitWholeFile(PassManager &PM,
+                                                formatted_raw_ostream &o,
+                                                CodeGenFileType FileType,
+                                                CodeGenOpt::Level OptLevel) {
+  if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
+  PM.add(new CppWriter(o));
+  return false;
+}
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
new file mode 100644
index 0000000..1f74f76
--- /dev/null
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -0,0 +1,43 @@
+//===-- CPPTargetMachine.h - TargetMachine for the C++ backend --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the TargetMachine that is used by the C++ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef CPPTARGETMACHINE_H
+#define CPPTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+
+class formatted_raw_ostream;
+
+// TargetMachine for the C++ backend.  It produces no machine code, so it is
+// a whole-file translator with no TargetData; the actual work happens in the
+// CppWriter pass installed by addPassesToEmitWholeFile (CPPBackend.cpp).
+struct CPPTargetMachine : public TargetMachine {
+  CPPTargetMachine(const Target &T, const std::string &TT,
+                   const std::string &FS)
+    : TargetMachine(T) {}
+
+  // Whole-file mode: the backend wants the entire module at once rather than
+  // the usual per-function codegen pipeline.
+  virtual bool WantsWholeFile() const { return true; }
+  virtual bool addPassesToEmitWholeFile(PassManager &PM,
+                                        formatted_raw_ostream &Out,
+                                        CodeGenFileType FileType,
+                                        CodeGenOpt::Level OptLevel);
+
+  // No target data: generated C++ is layout-neutral.
+  virtual const TargetData *getTargetData() const { return 0; }
+};
+
+extern Target TheCppBackendTarget;
+
+} // End llvm namespace
+
+
+#endif
diff --git a/lib/Target/CppBackend/Makefile b/lib/Target/CppBackend/Makefile
new file mode 100644
index 0000000..d75f4e8
--- /dev/null
+++ b/lib/Target/CppBackend/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/CppBackend/Makefile --- ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMCppBackend
+DIRS = TargetInfo
+
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts += -Wno-format
diff --git a/lib/Target/CppBackend/TargetInfo/CMakeLists.txt b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..edaf5d3
--- /dev/null
+++ b/lib/Target/CppBackend/TargetInfo/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMCppBackendInfo
+  CppBackendTargetInfo.cpp
+  )
+
diff --git a/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
new file mode 100644
index 0000000..d0aeb12
--- /dev/null
+++ b/lib/Target/CppBackend/TargetInfo/CppBackendTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- CppBackendTargetInfo.cpp - CppBackend Target Implementation -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CPPTargetMachine.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheCppBackendTarget;
+
+// Triple-match quality callback for target registration.  The C++ backend
+// can translate any module, so it matches every triple -- but only with the
+// minimum non-zero quality, so a real native backend wins when one exists.
+static unsigned CppBackend_TripleMatchQuality(const std::string &TT) {
+  // This class always works, but shouldn't be the default in most cases.
+  return 1;
+}
+
+// Register the C++ backend with the target registry under the "cpp"
+// architecture name, using the always-matching low-quality triple predicate.
+extern "C" void LLVMInitializeCppBackendTargetInfo() {
+  TargetRegistry::RegisterTarget(TheCppBackendTarget, "cpp", "C++ backend",
+                                 &CppBackend_TripleMatchQuality);
+}
diff --git a/lib/Target/CppBackend/TargetInfo/Makefile b/lib/Target/CppBackend/TargetInfo/Makefile
new file mode 100644
index 0000000..6e68283
--- /dev/null
+++ b/lib/Target/CppBackend/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/CppBackend/TargetInfo/Makefile -----------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMCppBackendInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSIL/CMakeLists.txt b/lib/Target/MSIL/CMakeLists.txt
new file mode 100644
index 0000000..b1d47ef
--- /dev/null
+++ b/lib/Target/MSIL/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_llvm_target(MSIL
+  MSILWriter.cpp
+  )
diff --git a/lib/Target/MSIL/MSILWriter.cpp b/lib/Target/MSIL/MSILWriter.cpp
new file mode 100644
index 0000000..a96ee49
--- /dev/null
+++ b/lib/Target/MSIL/MSILWriter.cpp
@@ -0,0 +1,1703 @@
+//===-- MSILWriter.cpp - Library for converting LLVM code to MSIL ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This library converts LLVM code to MSIL code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSILWriter.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/TypeSymbolTable.h"
+#include "llvm/Analysis/ConstantsScanner.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/Passes.h"
+using namespace llvm;
+
+namespace llvm {
+  // TargetMachine for the MSIL backend.  Like the C/C++ backends it is a
+  // whole-file translator: it emits text rather than machine code, and the
+  // real work happens in the passes added by addPassesToEmitWholeFile.
+  struct MSILTarget : public TargetMachine {
+    MSILTarget(const Target &T, const std::string &TT, const std::string &FS)
+      : TargetMachine(T) {}
+
+    // Request the entire module at once instead of per-function codegen.
+    virtual bool WantsWholeFile() const { return true; }
+    virtual bool addPassesToEmitWholeFile(PassManager &PM,
+                                          formatted_raw_ostream &Out,
+                                          CodeGenFileType FileType,
+                                          CodeGenOpt::Level OptLevel);
+
+    // No target data: MSIL output is layout-neutral.
+    virtual const TargetData *getTargetData() const { return 0; }
+  };
+}
+
+// Hook called by the target-initialization machinery to make the MSIL
+// backend available via TheMSILTarget.
+extern "C" void LLVMInitializeMSILTarget() {
+  // Register the target.
+  RegisterTargetMachine<MSILTarget> X(TheMSILTarget);
+}
+
+// Module preparation pass: prune the type symbol table down to the struct
+// and opaque types that are actually used, and invent "unnamed$N" names for
+// used struct types that have none (MSIL needs every emitted type named).
+bool MSILModule::runOnModule(Module &M) {
+  ModulePtr = &M;
+  TD = &getAnalysis<TargetData>();
+  bool Changed = false;
+  // Find named types.  
+  TypeSymbolTable& Table = M.getTypeSymbolTable();
+  std::set<const Type *> Types = getAnalysis<FindUsedTypes>().getTypes();
+  // Note the remove(I++) idiom throughout: the iterator is advanced before
+  // the entry is removed, so erasure never invalidates the loop iterator.
+  for (TypeSymbolTable::iterator I = Table.begin(), E = Table.end(); I!=E; ) {
+    if (!isa<StructType>(I->second) && !isa<OpaqueType>(I->second))
+      Table.remove(I++);
+    else {
+      std::set<const Type *>::iterator T = Types.find(I->second);
+      if (T==Types.end())
+        Table.remove(I++);
+      else {
+        // Named and used: keep it, and drop it from Types so that only
+        // unnamed used types remain after this loop.
+        Types.erase(T);
+        ++I;
+      }
+    }
+  }
+  // Find unnamed types.
+  unsigned RenameCounter = 0;
+  for (std::set<const Type *>::const_iterator I = Types.begin(),
+       E = Types.end(); I!=E; ++I)
+    if (const StructType *STy = dyn_cast<StructType>(*I)) {
+      // addTypeName fails (returns true) on collision; bump the counter
+      // until a free "unnamed$N" slot is found.
+      while (ModulePtr->addTypeName("unnamed$"+utostr(RenameCounter), STy))
+        ++RenameCounter;
+      Changed = true;
+    }
+  // Pointer for FunctionPass.
+  UsedTypes = &getAnalysis<FindUsedTypes>().getTypes();
+  return Changed;
+}
+
+char MSILModule::ID = 0;
+char MSILWriter::ID = 0;
+
+// Per-function entry point: emit MSIL for each function that has a body.
+// Never modifies the function (always returns false).
+bool MSILWriter::runOnFunction(Function &F) {
+  if (F.isDeclaration()) return false;
+
+  // Do not codegen any 'available_externally' functions at all, they have
+  // definitions outside the translation unit.
+  if (F.hasAvailableExternallyLinkage())
+    return false;
+
+  // Loop info is used during emission (e.g. for block layout decisions).
+  LInfo = &getAnalysis<LoopInfo>();
+  printFunction(F);
+  return false;
+}
+
+
+// Emit the per-module preamble before any function bodies: assembly
+// manifests, external declarations, type declarations, global variable
+// definitions, and the $MSIL_Startup entry-point scaffolding.
+bool MSILWriter::doInitialization(Module &M) {
+  ModulePtr = &M;
+  Out << ".assembly extern mscorlib {}\n";
+  Out << ".assembly MSIL {}\n\n";
+  Out << "// External\n";
+  printExternals();
+  Out << "// Declarations\n";
+  printDeclarations(M.getTypeSymbolTable());
+  Out << "// Definitions\n";
+  printGlobalVariables();
+  Out << "// Startup code\n";
+  printModuleStartup();
+  return false;
+}
+
+
+// Nothing to do at end of module; all output happens during initialization
+// and per-function emission.
+bool MSILWriter::doFinalization(Module &M) {
+  return false;
+}
+
+
+// Emit the $MSIL_Startup entry point.  It marshals the CLR command line
+// (System.Environment::GetCommandLineArgs) into a C-style (argc, argv) pair
+// of native pointers, calls $MSIL_Init, then calls the user's main() with
+// whichever of the supported signatures it has and returns its result.
+void MSILWriter::printModuleStartup() {
+  Out <<
+  ".method static public int32 $MSIL_Startup() {\n"
+  "\t.entrypoint\n"
+  "\t.locals (native int i)\n"
+  "\t.locals (native int argc)\n"
+  "\t.locals (native int ptr)\n"
+  "\t.locals (void* argv)\n"
+  "\t.locals (string[] args)\n"
+  "\tcall\tstring[] [mscorlib]System.Environment::GetCommandLineArgs()\n"
+  "\tdup\n"
+  "\tstloc\targs\n"
+  "\tldlen\n"
+  "\tconv.i4\n"
+  "\tdup\n"
+  "\tstloc\targc\n";
+  // argv buffer size = argc * pointer-size; emit the pointer-size constant.
+  printPtrLoad(TD->getPointerSize());
+  Out <<
+  "\tmul\n"
+  "\tlocalloc\n"
+  "\tstloc\targv\n"
+  "\tldc.i4.0\n"
+  "\tstloc\ti\n"
+  "L_01:\n"
+  "\tldloc\ti\n"
+  "\tldloc\targc\n"
+  "\tceq\n"
+  "\tbrtrue\tL_02\n"
+  "\tldloc\targs\n"
+  "\tldloc\ti\n"
+  "\tldelem.ref\n"
+  "\tcall\tnative int [mscorlib]System.Runtime.InteropServices.Marshal::"
+           "StringToHGlobalAnsi(string)\n"
+  "\tstloc\tptr\n"
+  "\tldloc\targv\n"
+  "\tldloc\ti\n";
+  // argv[i] address = argv + i * pointer-size.
+  printPtrLoad(TD->getPointerSize());
+  Out << 
+  "\tmul\n"
+  "\tadd\n"
+  "\tldloc\tptr\n"
+  "\tstind.i\n"
+  "\tldloc\ti\n"
+  "\tldc.i4.1\n"
+  "\tadd\n"
+  "\tstloc\ti\n"
+  "\tbr\tL_01\n"
+  "L_02:\n"
+  "\tcall void $MSIL_Init()\n";
+
+  // Call user 'main' function.
+  const Function* F = ModulePtr->getFunction("main");
+  if (!F || F->isDeclaration()) {
+    Out << "\tldc.i4.0\n\tret\n}\n";
+    return;
+  }
+  // Supported signatures: main(), main(int), main(int, char**).  Anything
+  // else is treated as a bad signature and main is not called.
+  bool BadSig = true;
+  std::string Args("");
+  Function::const_arg_iterator Arg1,Arg2;
+
+  switch (F->arg_size()) {
+  case 0:
+    BadSig = false;
+    break;
+  case 1:
+    Arg1 = F->arg_begin();
+    if (Arg1->getType()->isInteger()) {
+      Out << "\tldloc\targc\n";
+      Args = getTypeName(Arg1->getType());
+      BadSig = false;
+    }
+    break;
+  case 2:
+    Arg1 = Arg2 = F->arg_begin(); ++Arg2;
+    if (Arg1->getType()->isInteger() && 
+        Arg2->getType()->getTypeID() == Type::PointerTyID) {
+      Out << "\tldloc\targc\n\tldloc\targv\n";
+      Args = getTypeName(Arg1->getType())+","+getTypeName(Arg2->getType());
+      BadSig = false;
+    }
+    break;
+  default:
+    BadSig = true;
+  }
+
+  // The startup stub always returns int32: a void main yields 0, an integer
+  // main's result is narrowed with conv.i4, and a bad signature yields 0.
+  bool RetVoid = (F->getReturnType()->getTypeID() == Type::VoidTyID);
+  if (BadSig || (!F->getReturnType()->isInteger() && !RetVoid)) {
+    Out << "\tldc.i4.0\n";
+  } else {
+    Out << "\tcall\t" << getTypeName(F->getReturnType()) <<
+      getConvModopt(F->getCallingConv()) << "main(" << Args << ")\n";
+    if (RetVoid)
+      Out << "\tldc.i4.0\n";
+    else
+      Out << "\tconv.i4\n";
+  }
+  Out << "\tret\n}\n";
+}
+
+// Return true when V is a constant whose value is all-zero/null.  Anything
+// that is not a Constant is conservatively reported as non-zero.
+bool MSILWriter::isZeroValue(const Value* V) {
+  const Constant *C = dyn_cast<Constant>(V);
+  return C != 0 && C->isNullValue();
+}
+
+
+// Return the MSIL name for a value, wrapped in single quotes.  Globals use
+// their LLVM name; anonymous values get a stable "tmpN" number assigned on
+// first use via AnonValueNumbers.
+std::string MSILWriter::getValueName(const Value* V) {
+  std::string Name;
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+    Name = GV->getName();
+  else {
+    // operator[] default-constructs to 0, so 0 means "not yet numbered".
+    unsigned &No = AnonValueNumbers[V];
+    if (No == 0) No = ++NextAnonValueNumber;
+    Name = "tmp" + utostr(No);
+  }
+  
+  // Name into the quotes allow control and space characters.
+  return "'"+Name+"'";
+}
+
+
+// Sanitize a name for use as an MSIL label: '.' is not acceptable in label
+// names, so every occurrence is rewritten as '@'.  Names without a '.' are
+// returned unchanged without making a copy.
+std::string MSILWriter::getLabelName(const std::string& Name) {
+  if (Name.find('.') == std::string::npos)
+    return Name;
+  std::string Fixed(Name);
+  for (std::string::size_type i = 0, e = Fixed.size(); i != e; ++i)
+    if (Fixed[i] == '.')
+      Fixed[i] = '@';
+  return Fixed;
+}
+
+
+// Return the sanitized MSIL label name for a value.  Naming mirrors
+// getValueName (global name or "tmpN" from AnonValueNumbers) but the result
+// is passed through label sanitization instead of being quoted.
+std::string MSILWriter::getLabelName(const Value* V) {
+  std::string Name;
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(V))
+    Name = GV->getName();
+  else {
+    // Shares the numbering map with getValueName, so a value keeps the same
+    // "tmpN" whether it is referenced as a value or as a label.
+    unsigned &No = AnonValueNumbers[V];
+    if (No == 0) No = ++NextAnonValueNumber;
+    Name = "tmp" + utostr(No);
+  }
+  
+  return getLabelName(Name);
+}
+
+
+// Map an LLVM calling convention to the CLR calling-convention modopt that
+// must decorate the call signature.  C, Cold and Fast all lower to cdecl;
+// unsupported conventions abort.
+std::string MSILWriter::getConvModopt(CallingConv::ID CallingConvID) {
+  switch (CallingConvID) {
+  case CallingConv::C:
+  case CallingConv::Cold:
+  case CallingConv::Fast:
+    return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvCdecl) ";
+  case CallingConv::X86_FastCall:
+    return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvFastcall) ";
+  case CallingConv::X86_StdCall:
+    return "modopt([mscorlib]System.Runtime.CompilerServices.CallConvStdcall) ";
+  default:
+    errs() << "CallingConvID = " << CallingConvID << '\n';
+    llvm_unreachable("Unsupported calling convention");
+  }
+  return ""; // Not reached
+}
+
+
+// Build the MSIL name for an array or vector type, flattening nested
+// sequences of the same TypeID into one multidimensional form, e.g.
+// [2 x [3 x i32]] -> "int32[2,3]".  TyID selects whether arrays or vectors
+// are being flattened; the other kind terminates the walk.
+std::string MSILWriter::getArrayTypeName(Type::TypeID TyID, const Type* Ty) {
+  std::string Tmp = "";
+  const Type* ElemTy = Ty;
+  assert(Ty->getTypeID()==TyID && "Invalid type passed");
+  // Walk through the array element types.
+  for (;;) {
+    // Multidimensional array.
+    if (ElemTy->getTypeID()==TyID) {
+      if (const ArrayType* ATy = dyn_cast<ArrayType>(ElemTy))
+        Tmp += utostr(ATy->getNumElements());
+      else if (const VectorType* VTy = dyn_cast<VectorType>(ElemTy))
+        Tmp += utostr(VTy->getNumElements());
+      ElemTy = cast<SequentialType>(ElemTy)->getElementType();
+    }
+    // Base element type found.
+    if (ElemTy->getTypeID()!=TyID) break;
+    // Another dimension follows: separate the counts with a comma.
+    Tmp += ",";
+  }
+  return getTypeName(ElemTy, false, true)+"["+Tmp+"]";
+}
+
+
// Return the MSIL spelling (with a trailing space) of a primitive LLVM
// type.  i1 maps to "bool"; wider integers map to "[unsigned ]int<N>".
std::string MSILWriter::getPrimitiveTypeName(const Type* Ty, bool isSigned) {
  unsigned NumBits = 0;
  switch (Ty->getTypeID()) {
  case Type::VoidTyID:
    return "void ";
  case Type::IntegerTyID:
    NumBits = getBitWidth(Ty);
    if(NumBits==1)
      return "bool ";
    if (!isSigned)
      return "unsigned int"+utostr(NumBits)+" ";
    return "int"+utostr(NumBits)+" ";
  case Type::FloatTyID:
    return "float32 ";
  case Type::DoubleTyID:
    return "float64 "; 
  default:
    errs() << "Type = " << *Ty << '\n';
    llvm_unreachable("Invalid primitive type");
  }
  return ""; // Not reached
}
+
+
+std::string MSILWriter::getTypeName(const Type* Ty, bool isSigned,
+                                    bool isNested) {
+  if (Ty->isPrimitiveType() || Ty->isInteger())
+    return getPrimitiveTypeName(Ty,isSigned);
+  // FIXME: "OpaqueType" support
+  switch (Ty->getTypeID()) {
+  case Type::PointerTyID:
+    return "void* ";
+  case Type::StructTyID:
+    if (isNested)
+      return ModulePtr->getTypeName(Ty);
+    return "valuetype '"+ModulePtr->getTypeName(Ty)+"' ";
+  case Type::ArrayTyID:
+    if (isNested)
+      return getArrayTypeName(Ty->getTypeID(),Ty);
+    return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' ";
+  case Type::VectorTyID:
+    if (isNested)
+      return getArrayTypeName(Ty->getTypeID(),Ty);
+    return "valuetype '"+getArrayTypeName(Ty->getTypeID(),Ty)+"' ";
+  default:
+    errs() << "Type = " << *Ty << '\n';
+    llvm_unreachable("Invalid type in getTypeName()");
+  }
+  return ""; // Not reached
+}
+
+
+MSILWriter::ValueType MSILWriter::getValueLocation(const Value* V) {
+  // Function argument
+  if (isa<Argument>(V))
+    return ArgumentVT;
+  // Function
+  else if (const Function* F = dyn_cast<Function>(V))
+    return F->hasLocalLinkage() ? InternalVT : GlobalVT;
+  // Variable
+  else if (const GlobalVariable* G = dyn_cast<GlobalVariable>(V))
+    return G->hasLocalLinkage() ? InternalVT : GlobalVT;
+  // Constant
+  else if (isa<Constant>(V))
+    return isa<ConstantExpr>(V) ? ConstExprVT : ConstVT;
+  // Local variable
+  return LocalVT;
+}
+
+
// Return the MSIL instruction postfix ("i4", "u2", "r8", ...) for Ty.
// Expand widens integers to the stack sizes "i4"/"i8"; isSigned selects
// the "i" vs "u" integer spelling.
std::string MSILWriter::getTypePostfix(const Type* Ty, bool Expand,
                                       bool isSigned) {
  unsigned NumBits = 0;
  switch (Ty->getTypeID()) {
  // Integer constant, expanding for stack operations.
  case Type::IntegerTyID:
    NumBits = getBitWidth(Ty);
    // Expand integer value to "int32" or "int64".
    if (Expand) return (NumBits<=32 ? "i4" : "i8");
    if (NumBits==1) return "i1";
    // Byte-sized postfix, e.g. i32 -> "i4" or "u4".
    return (isSigned ? "i" : "u")+utostr(NumBits/8);
  // Float constant.
  case Type::FloatTyID:
    return "r4";
  case Type::DoubleTyID:
    return "r8";
  // Pointers use the target's pointer allocation size.
  case Type::PointerTyID:
    return "i"+utostr(TD->getTypeAllocSize(Ty));
  default:
    errs() << "TypeID = " << Ty->getTypeID() << '\n';
    llvm_unreachable("Invalid type in TypeToPostfix()");
  }
  return ""; // Not reached
}
+
+
// Emit a conversion of the stack top to the module's native pointer
// width ("conv.u4" for 32-bit, "conv.u8" for 64-bit targets).
void MSILWriter::printConvToPtr() {
  switch (ModulePtr->getPointerSize()) {
  case Module::Pointer32:
    printSimpleInstruction("conv.u4");
    break;
  case Module::Pointer64:
    printSimpleInstruction("conv.u8");
    break;
  default:
    llvm_unreachable("Module use not supporting pointer size");
  }
}
+
+
+void MSILWriter::printPtrLoad(uint64_t N) {
+  switch (ModulePtr->getPointerSize()) {
+  case Module::Pointer32:
+    printSimpleInstruction("ldc.i4",utostr(N).c_str());
+    // FIXME: Need overflow test?
+    if (!isUInt32(N)) {
+      errs() << "Value = " << utostr(N) << '\n';
+      llvm_unreachable("32-bit pointer overflowed");
+    }
+    break;
+  case Module::Pointer64:
+    printSimpleInstruction("ldc.i8",utostr(N).c_str());
+    break;
+  default:
+    llvm_unreachable("Module use not supporting pointer size");
+  }
+}
+
+
// Load V onto the stack and convert it to native pointer width.
void MSILWriter::printValuePtrLoad(const Value* V) {
  printValueLoad(V);
  printConvToPtr();
}
+
+
// Emit an "ldc.*" for a simple constant: integers load directly, floats
// load their raw bit pattern in hex, and undef loads a null pointer.
void MSILWriter::printConstLoad(const Constant* C) {
  if (const ConstantInt* CInt = dyn_cast<ConstantInt>(C)) {
    // Integer constant, expanded to the i4/i8 stack width.
    Out << "\tldc." << getTypePostfix(C->getType(),true) << '\t';
    if (CInt->isMinValue(true))
      Out << CInt->getSExtValue();
    else
      Out << CInt->getZExtValue();
  } else if (const ConstantFP* FP = dyn_cast<ConstantFP>(C)) {
    // Float constant: emit the IEEE bit pattern so no precision is lost.
    uint64_t X;
    unsigned Size;
    if (FP->getType()->getTypeID()==Type::FloatTyID) {
      X = (uint32_t)FP->getValueAPF().bitcastToAPInt().getZExtValue();
      Size = 4;  
    } else {
      X = FP->getValueAPF().bitcastToAPInt().getZExtValue();
      Size = 8;  
    }
    Out << "\tldc.r" << Size << "\t( " << utohexstr(X) << ')';
  } else if (isa<UndefValue>(C)) {
    // Undefined constant value = NULL.
    printPtrLoad(0);
  } else {
    errs() << "Constant = " << *C << '\n';
    llvm_unreachable("Invalid constant value");
  }
  Out << '\n';
}
+
+
// Push V onto the evaluation stack, choosing the opcode from where the
// value lives (see getValueLocation).
void MSILWriter::printValueLoad(const Value* V) {
  MSILWriter::ValueType Location = getValueLocation(V);
  switch (Location) {
  // Global variable or function address.
  case GlobalVT:
  case InternalVT:
    if (const Function* F = dyn_cast<Function>(V)) {
      // Function address via "ldftn" with the full call signature.
      std::string Name = getConvModopt(F->getCallingConv())+getValueName(F);
      printSimpleInstruction("ldftn",
        getCallSignature(F->getFunctionType(),NULL,Name).c_str());
    } else {
      std::string Tmp;
      const Type* ElemTy = cast<PointerType>(V->getType())->getElementType();
      // DLL-imported globals hold the pointer value itself ("ldsfld");
      // other globals load their address ("ldsflda").
      if (Location==GlobalVT && cast<GlobalVariable>(V)->hasDLLImportLinkage()) {
        Tmp = "void* "+getValueName(V);
        printSimpleInstruction("ldsfld",Tmp.c_str());
      } else {
        Tmp = getTypeName(ElemTy)+getValueName(V);
        printSimpleInstruction("ldsflda",Tmp.c_str());
      }
    }
    break;
  // Function argument.
  case ArgumentVT:
    printSimpleInstruction("ldarg",getValueName(V).c_str());
    break;
  // Local function variable.
  case LocalVT:
    printSimpleInstruction("ldloc",getValueName(V).c_str());
    break;
  // Constant value.
  case ConstVT:
    if (isa<ConstantPointerNull>(V))
      printPtrLoad(0);
    else
      printConstLoad(cast<Constant>(V));
    break;
  // Constant expression: expand to instructions.
  case ConstExprVT:
    printConstantExpr(cast<ConstantExpr>(V));
    break;
  default:
    errs() << "Value = " << *V << '\n';
    llvm_unreachable("Invalid value location");
  }
}
+
+
+void MSILWriter::printValueSave(const Value* V) {
+  switch (getValueLocation(V)) {
+  case ArgumentVT:
+    printSimpleInstruction("starg",getValueName(V).c_str());
+    break;
+  case LocalVT:
+    printSimpleInstruction("stloc",getValueName(V).c_str());
+    break;
+  default:
+    errs() << "Value  = " << *V << '\n';
+    llvm_unreachable("Invalid value location");
+  }
+}
+
+
// Push both operands (left first) and emit the two-operand opcode Name.
void MSILWriter::printBinaryInstruction(const char* Name, const Value* Left,
                                        const Value* Right) {
  printValueLoad(Left);
  printValueLoad(Right);
  Out << '\t' << Name << '\n';
}
+
+
+void MSILWriter::printSimpleInstruction(const char* Inst, const char* Operand) {
+  if(Operand) 
+    Out << '\t' << Inst << '\t' << Operand << '\n';
+  else
+    Out << '\t' << Inst << '\n';
+}
+
+
// Materialize PHI nodes at a branch: for every PHI at the head of Dst,
// copy the value incoming from Src into the PHI's local slot.
void MSILWriter::printPHICopy(const BasicBlock* Src, const BasicBlock* Dst) {
  for (BasicBlock::const_iterator I = Dst->begin(); isa<PHINode>(I); ++I) {
    const PHINode* Phi = cast<PHINode>(I);
    const Value* Val = Phi->getIncomingValueForBlock(Src);
    // Undef incoming values need no store.
    if (isa<UndefValue>(Val)) continue;
    printValueLoad(Val);
    printValueSave(Phi);
  }
}
+
+
// Emit branch code from CurrBB.  A NULL FalseBB means a conditional
// "brtrue" to TrueBB (condition already on the stack); a NULL TrueBB
// means an unconditional "br" to FalseBB; with both present, emit the
// full two-way conditional with PHI copies on each edge.
void MSILWriter::printBranchToBlock(const BasicBlock* CurrBB,
                                    const BasicBlock* TrueBB,
                                    const BasicBlock* FalseBB) {
  if (TrueBB==FalseBB) {
    // Both edges reach the same block: discard the condition and jump.
    printPHICopy(CurrBB,TrueBB);
    printSimpleInstruction("pop");
    printSimpleInstruction("br",getLabelName(TrueBB).c_str());
  } else if (FalseBB==NULL) {
    // Conditional jump to TrueBB; the condition is on the stack.
    printPHICopy(CurrBB,TrueBB);
    printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str());
  } else if (TrueBB==NULL) {
    // Unconditional jump to FalseBB.
    printPHICopy(CurrBB,FalseBB);
    printSimpleInstruction("br",getLabelName(FalseBB).c_str());
  } else {
    // Two-way branch: PHI copies must run on the taken edge only, so a
    // TrueBB with PHIs gets a synthetic "$phi_" trampoline label.
    std::string TmpLabel;
    // Print PHI instructions for "TrueBB"
    if (isa<PHINode>(TrueBB->begin())) {
      TmpLabel = getLabelName(TrueBB)+"$phi_"+utostr(getUniqID());
      printSimpleInstruction("brtrue",TmpLabel.c_str());
    } else {
      printSimpleInstruction("brtrue",getLabelName(TrueBB).c_str());
    }
    // Fall-through (false) edge: copy FalseBB's PHIs, then jump.
    if (isa<PHINode>(FalseBB->begin())) {
      printPHICopy(CurrBB,FalseBB);
      printSimpleInstruction("br",getLabelName(FalseBB).c_str());
    } else {
      printSimpleInstruction("br",getLabelName(FalseBB).c_str());
    }
    if (isa<PHINode>(TrueBB->begin())) {
      // Trampoline: perform TrueBB's PHI copies, then enter TrueBB.
      Out << TmpLabel << ":\n";
      printPHICopy(CurrBB,TrueBB);
      printSimpleInstruction("br",getLabelName(TrueBB).c_str());
    }
  }
}
+
+
+void MSILWriter::printBranchInstruction(const BranchInst* Inst) {
+  if (Inst->isUnconditional()) {
+    printBranchToBlock(Inst->getParent(),NULL,Inst->getSuccessor(0));
+  } else {
+    printValueLoad(Inst->getCondition());
+    printBranchToBlock(Inst->getParent(),Inst->getSuccessor(0),
+                       Inst->getSuccessor(1));
+  }
+}
+
+
// Lower "select": push VTrue, test Cond; if true, keep VTrue (jump past
// the pop), otherwise pop it and push VFalse instead.
void MSILWriter::printSelectInstruction(const Value* Cond, const Value* VTrue,
                                        const Value* VFalse) {
  std::string TmpLabel = std::string("select$true_")+utostr(getUniqID());
  printValueLoad(VTrue);
  printValueLoad(Cond);
  printSimpleInstruction("brtrue",TmpLabel.c_str());
  printSimpleInstruction("pop");
  printValueLoad(VFalse);
  Out << TmpLabel << ":\n";
}
+
+
// Load through pointer V: push the address, then emit "ldind.*" with the
// postfix of the pointee type.
void MSILWriter::printIndirectLoad(const Value* V) {
  const Type* Ty = V->getType();
  printValueLoad(V);
  if (const PointerType* P = dyn_cast<PointerType>(Ty))
    Ty = P->getElementType();
  std::string Tmp = "ldind."+getTypePostfix(Ty, false);
  printSimpleInstruction(Tmp.c_str());
}
+
+
// Store Val through Ptr: push address, push value, emit "stind.*".
void MSILWriter::printIndirectSave(const Value* Ptr, const Value* Val) {
  printValueLoad(Ptr);
  printValueLoad(Val);
  printIndirectSave(Val->getType());
}
+
+
+void MSILWriter::printIndirectSave(const Type* Ty) {
+  // Instruction need signed postfix for any type.
+  std::string postfix = getTypePostfix(Ty, false);
+  if (*postfix.begin()=='u') *postfix.begin() = 'i';
+  postfix = "stind."+postfix;
+  printSimpleInstruction(postfix.c_str());
+}
+
+
// Lower a cast: load V, then emit the appropriate "conv.*" sequence.
// Op is the LLVM cast opcode; Ty is the destination type; SrcTy (only
// passed for SExt) is the source type.
void MSILWriter::printCastInstruction(unsigned int Op, const Value* V,
                                      const Type* Ty, const Type* SrcTy) {
  std::string Tmp("");
  printValueLoad(V);
  switch (Op) {
  // Signed
  case Instruction::SExt:
    // If sign extending int, convert first from unsigned to signed
    // with the same bit size - because otherwise we will lose the sign.
    if (SrcTy) {
      Tmp = "conv."+getTypePostfix(SrcTy,false,true);
      printSimpleInstruction(Tmp.c_str());
    }
    // FALLTHROUGH
  case Instruction::SIToFP:
  case Instruction::FPToSI:
    // Signed conversion to the destination type.
    Tmp = "conv."+getTypePostfix(Ty,false,true);
    printSimpleInstruction(Tmp.c_str());
    break;
  // Unsigned
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::FPToUI:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
    Tmp = "conv."+getTypePostfix(Ty,false);
    printSimpleInstruction(Tmp.c_str());
    break;
  // Do nothing
  case Instruction::BitCast:
    // FIXME: meaning that ld*/st* instruction do not change data format.
    break;
  default:
    errs() << "Opcode = " << Op << '\n';
    llvm_unreachable("Invalid conversion instruction");
  }
}
+
+
// Lower GEP to explicit pointer arithmetic: load the base address, then
// for each index add a struct-field offset or index*element-size.
void MSILWriter::printGepInstruction(const Value* V, gep_type_iterator I,
                                     gep_type_iterator E) {
  unsigned Size;
  // Load base address as a native-width pointer.
  printValuePtrLoad(V);
  // Calculate element offset.
  for (; I!=E; ++I){
    Size = 0;
    const Value* IndexValue = I.getOperand();
    if (const StructType* StrucTy = dyn_cast<StructType>(*I)) {
      // Struct index must be a constant; offset is the sum of the sizes
      // of all preceding fields.
      uint64_t FieldIndex = cast<ConstantInt>(IndexValue)->getZExtValue();
      for (uint64_t F = 0; F<FieldIndex; ++F)
        Size += TD->getTypeAllocSize(StrucTy->getContainedType((unsigned)F));
      printPtrLoad(Size);
      printSimpleInstruction("add");
      continue;
    } else if (const SequentialType* SeqTy = dyn_cast<SequentialType>(*I)) {
      // Array/pointer/vector: stride is the element size.
      Size = TD->getTypeAllocSize(SeqTy->getElementType());
    } else {
      Size = TD->getTypeAllocSize(*I);
    }
    // Add offset of current element to stack top; zero indices are free.
    if (!isZeroValue(IndexValue)) {
      // Constant index: fold index*size at compile time; negative
      // indices subtract their absolute offset.
      if (const ConstantInt* C = dyn_cast<ConstantInt>(IndexValue)) {
        if (C->getValue().isNegative()) {
          printPtrLoad(C->getValue().abs().getZExtValue()*Size);
          printSimpleInstruction("sub");
          continue;
        } else
          printPtrLoad(C->getZExtValue()*Size);
      } else {
        // Dynamic index: size * index computed at runtime.
        printPtrLoad(Size);
        printValuePtrLoad(IndexValue);
        printSimpleInstruction("mul");
      }
      printSimpleInstruction("add");
    }
  }
}
+
+
// Build the textual MSIL signature "rettype Name(params)" for Ty.  Inst,
// when non-NULL, is the call/invoke site; for vararg calls its actual
// argument types are appended after "..." (CLR requires the exact count
// because the caller cleans the stack).
std::string MSILWriter::getCallSignature(const FunctionType* Ty,
                                         const Instruction* Inst,
                                         std::string Name) {
  std::string Tmp("");
  if (Ty->isVarArg()) Tmp += "vararg ";
  // Name and return type.
  Tmp += getTypeName(Ty->getReturnType())+Name+"(";
  // Function argument type list.
  unsigned NumParams = Ty->getNumParams();
  for (unsigned I = 0; I!=NumParams; ++I) {
    if (I!=0) Tmp += ",";
    Tmp += getTypeName(Ty->getParamType(I));
  }
  // CLR needs to know the exact amount of parameters received by vararg
  // function, because caller cleans the stack.
  if (Ty->isVarArg() && Inst) {
    // Operand index where arguments start: 3 for InvokeInst, 1 for CallInst.
    unsigned Org = isa<InvokeInst>(Inst) ? 3 : 1;
    // Print variable argument types.
    unsigned NumOperands = Inst->getNumOperands()-Org;
    if (NumParams<NumOperands) {
      if (NumParams!=0) Tmp += ", ";
      Tmp += "... , ";
      for (unsigned J = NumParams; J!=NumOperands; ++J) {
        if (J!=NumParams) Tmp += ", ";
        Tmp += getTypeName(Inst->getOperand(J+Org)->getType());
      }
    }
  }
  return Tmp+")";
}
+
+
// Emit the call itself (arguments are already on the stack): direct
// "call" when the callee is a known Function, otherwise load the
// function pointer and emit "calli".
void MSILWriter::printFunctionCall(const Value* FnVal,
                                   const Instruction* Inst) {
  // Get function calling convention.
  std::string Name = "";
  if (const CallInst* Call = dyn_cast<CallInst>(Inst))
    Name = getConvModopt(Call->getCallingConv());
  else if (const InvokeInst* Invoke = dyn_cast<InvokeInst>(Inst))
    Name = getConvModopt(Invoke->getCallingConv());
  else {
    errs() << "Instruction = " << Inst->getName() << '\n';
    llvm_unreachable("Need \"Invoke\" or \"Call\" instruction only");
  }
  if (const Function* F = dyn_cast<Function>(FnVal)) {
    // Direct call.
    Name += getValueName(F);
    printSimpleInstruction("call",
      getCallSignature(F->getFunctionType(),Inst,Name).c_str());
  } else {
    // Indirect function call through a function pointer.
    const PointerType* PTy = cast<PointerType>(FnVal->getType());
    const FunctionType* FTy = cast<FunctionType>(PTy->getElementType());
    // Load function address.
    printValueLoad(FnVal);
    printSimpleInstruction("calli",getCallSignature(FTy,Inst,Name).c_str());
  }
}
+
+
// Lower the supported va_* intrinsics onto System.ArgIterator.  The
// "$valist" local is a companion ArgIterator slot declared by
// printLocalVariables for each va_list value.
void MSILWriter::printIntrinsicCall(const IntrinsicInst* Inst) {
  std::string Name;
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::vastart:
    // "'name'" -> "'name$valist'": insert before the closing quote.
    Name = getValueName(Inst->getOperand(1));
    Name.insert(Name.length()-1,"$valist");
    // Obtain the argument handle and construct the ArgIterator in place.
    printSimpleInstruction("ldloca",Name.c_str());
    printSimpleInstruction("arglist");
    printSimpleInstruction("call",
      "instance void [mscorlib]System.ArgIterator::.ctor"
      "(valuetype [mscorlib]System.RuntimeArgumentHandle)");
    // Store the iterator's address into the user's va_list as "void*".
    printValueLoad(Inst->getOperand(1));
    printSimpleInstruction("ldloca",Name.c_str());
    printIndirectSave(PointerType::getUnqual(
          IntegerType::get(Inst->getContext(), 8)));
    break;
  case Intrinsic::vaend:
    // Close argument list handle.
    printIndirectLoad(Inst->getOperand(1));
    printSimpleInstruction("call","instance void [mscorlib]System.ArgIterator::End()");
    break;
  case Intrinsic::vacopy:
    // Copy the ArgIterator valuetype from source to destination list.
    printIndirectLoad(Inst->getOperand(1));
    printIndirectLoad(Inst->getOperand(2));
    printSimpleInstruction("cpobj","[mscorlib]System.ArgIterator");
    break;        
  default:
    errs() << "Intrinsic ID = " << Inst->getIntrinsicID() << '\n';
    llvm_unreachable("Invalid intrinsic function");
  }
}
+
+
+void MSILWriter::printCallInstruction(const Instruction* Inst) {
+  if (isa<IntrinsicInst>(Inst)) {
+    // Handle intrinsic function.
+    printIntrinsicCall(cast<IntrinsicInst>(Inst));
+  } else {
+    // Load arguments to stack and call function.
+    for (int I = 1, E = Inst->getNumOperands(); I!=E; ++I)
+      printValueLoad(Inst->getOperand(I));
+    printFunctionCall(Inst->getOperand(0),Inst);
+  }
+}
+
+
// Lower an integer compare to MSIL ceq/clt/cgt (".un" variants for the
// unsigned predicates); non-primitive predicates are composed from them.
void MSILWriter::printICmpInstruction(unsigned Predicate, const Value* Left,
                                      const Value* Right) {
  switch (Predicate) {
  case ICmpInst::ICMP_EQ:
    printBinaryInstruction("ceq",Left,Right);
    break;
  case ICmpInst::ICMP_NE:
    // Emulate = not neg (Op1 eq Op2)
    printBinaryInstruction("ceq",Left,Right);
    printSimpleInstruction("neg");
    printSimpleInstruction("not");
    break;
  case ICmpInst::ICMP_ULE:
  case ICmpInst::ICMP_SLE:
    // Emulate = (Op1 eq Op2) or (Op1 lt Op2)
    printBinaryInstruction("ceq",Left,Right);
    if (Predicate==ICmpInst::ICMP_ULE)
      printBinaryInstruction("clt.un",Left,Right);
    else
      printBinaryInstruction("clt",Left,Right);
    printSimpleInstruction("or");
    break;
  case ICmpInst::ICMP_UGE:
  case ICmpInst::ICMP_SGE:
    // Emulate = (Op1 eq Op2) or (Op1 gt Op2)
    printBinaryInstruction("ceq",Left,Right);
    if (Predicate==ICmpInst::ICMP_UGE)
      printBinaryInstruction("cgt.un",Left,Right);
    else
      printBinaryInstruction("cgt",Left,Right);
    printSimpleInstruction("or");
    break;
  case ICmpInst::ICMP_ULT:
    printBinaryInstruction("clt.un",Left,Right);
    break;
  case ICmpInst::ICMP_SLT:
    printBinaryInstruction("clt",Left,Right);
    break;
  case ICmpInst::ICMP_UGT:
    printBinaryInstruction("cgt.un",Left,Right);
    break;
  case ICmpInst::ICMP_SGT:
    printBinaryInstruction("cgt",Left,Right);
    break;
  default:
    errs() << "Predicate = " << Predicate << '\n';
    llvm_unreachable("Invalid icmp predicate");
  }
}
+
+
// Lower a floating-point compare.  Unordered predicates OR in an
// explicit NaN test (FCMP_UNO) since ceq/clt/cgt return false for NaN.
// NOTE(review): per the FIXME, some predicates look incomplete — e.g.
// FCMP_ONE emits only "not(ceq)" despite its "&& ord" comment; confirm
// against the FCmp semantics before relying on NaN behavior here.
void MSILWriter::printFCmpInstruction(unsigned Predicate, const Value* Left,
                                      const Value* Right) {
  // FIXME: Correct comparison
  std::string NanFunc = "bool [mscorlib]System.Double::IsNaN(float64)";
  switch (Predicate) {
  case FCmpInst::FCMP_UGT:
    // X >  Y || llvm_fcmp_uno(X, Y)
    printBinaryInstruction("cgt",Left,Right);
    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_OGT:
    // X >  Y
    printBinaryInstruction("cgt",Left,Right);
    break;
  case FCmpInst::FCMP_UGE:
    // X >= Y || llvm_fcmp_uno(X, Y)
    printBinaryInstruction("ceq",Left,Right);
    printBinaryInstruction("cgt",Left,Right);
    printSimpleInstruction("or");
    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_OGE:
    // X >= Y
    printBinaryInstruction("ceq",Left,Right);
    printBinaryInstruction("cgt",Left,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_ULT:
    // X <  Y || llvm_fcmp_uno(X, Y)
    printBinaryInstruction("clt",Left,Right);
    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_OLT:
    // X <  Y
    printBinaryInstruction("clt",Left,Right);
    break;
  case FCmpInst::FCMP_ULE:
    // X <= Y || llvm_fcmp_uno(X, Y)
    printBinaryInstruction("ceq",Left,Right);
    printBinaryInstruction("clt",Left,Right);
    printSimpleInstruction("or");
    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_OLE:
    // X <= Y
    printBinaryInstruction("ceq",Left,Right);
    printBinaryInstruction("clt",Left,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_UEQ:
    // X == Y || llvm_fcmp_uno(X, Y)
    printBinaryInstruction("ceq",Left,Right);
    printFCmpInstruction(FCmpInst::FCMP_UNO,Left,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_OEQ:
    // X == Y
    printBinaryInstruction("ceq",Left,Right);
    break;
  case FCmpInst::FCMP_UNE:
    // X != Y
    printBinaryInstruction("ceq",Left,Right);
    printSimpleInstruction("neg");
    printSimpleInstruction("not");
    break;
  case FCmpInst::FCMP_ONE:
    // X != Y && llvm_fcmp_ord(X, Y)
    printBinaryInstruction("ceq",Left,Right);
    printSimpleInstruction("not");
    break;
  case FCmpInst::FCMP_ORD:
    // return X == X && Y == Y
    printBinaryInstruction("ceq",Left,Left);
    printBinaryInstruction("ceq",Right,Right);
    printSimpleInstruction("or");
    break;
  case FCmpInst::FCMP_UNO:
    // X != X || Y != Y  (NaN is the only value not equal to itself)
    printBinaryInstruction("ceq",Left,Left);
    printSimpleInstruction("not");
    printBinaryInstruction("ceq",Right,Right);
    printSimpleInstruction("not");
    printSimpleInstruction("or");
    break;
  default:
    llvm_unreachable("Illegal FCmp predicate");
  }
}
+
+
+void MSILWriter::printInvokeInstruction(const InvokeInst* Inst) {
+  std::string Label = "leave$normal_"+utostr(getUniqID());
+  Out << ".try {\n";
+  // Load arguments
+  for (int I = 3, E = Inst->getNumOperands(); I!=E; ++I)
+    printValueLoad(Inst->getOperand(I));
+  // Print call instruction
+  printFunctionCall(Inst->getOperand(0),Inst);
+  // Save function result and leave "try" block
+  printValueSave(Inst);
+  printSimpleInstruction("leave",Label.c_str());
+  Out << "}\n";
+  Out << "catch [mscorlib]System.Exception {\n";
+  // Redirect to unwind block
+  printSimpleInstruction("pop");
+  printBranchToBlock(Inst->getParent(),NULL,Inst->getUnwindDest());
+  Out << "}\n" << Label << ":\n";
+  // Redirect to continue block
+  printBranchToBlock(Inst->getParent(),NULL,Inst->getNormalDest());
+}
+
+
// Lower "switch" as a chain of equality tests, one conditional branch
// per case, falling through to the default block at the end.
void MSILWriter::printSwitchInstruction(const SwitchInst* Inst) {
  // FIXME: Emulate with IL "switch" instruction
  // Emulate = if () else if () else if () else ...
  // Case 0 is the default, so real cases start at index 1.
  for (unsigned int I = 1, E = Inst->getNumCases(); I!=E; ++I) {
    printValueLoad(Inst->getCondition());
    printValueLoad(Inst->getCaseValue(I));
    printSimpleInstruction("ceq");
    // Condition jump to successor block
    printBranchToBlock(Inst->getParent(),Inst->getSuccessor(I),NULL);
  }
  // Jump to default block
  printBranchToBlock(Inst->getParent(),NULL,Inst->getDefaultDest());
}
+
+
// Lower "va_arg": fetch the next argument from the ArgIterator behind
// the va_list, extract its address from the typedref, and load it as a
// pointer-width value.
void MSILWriter::printVAArgInstruction(const VAArgInst* Inst) {
  printIndirectLoad(Inst->getOperand(0));
  printSimpleInstruction("call",
    "instance typedref [mscorlib]System.ArgIterator::GetNextArg()");
  printSimpleInstruction("refanyval","void*");
  std::string Name = 
    "ldind."+getTypePostfix(PointerType::getUnqual(
            IntegerType::get(Inst->getContext(), 8)),false);
  printSimpleInstruction(Name.c_str());
}
+
+
// Lower "alloca": push element-size * count and emit "localloc".
void MSILWriter::printAllocaInstruction(const AllocaInst* Inst) {
  uint64_t Size = TD->getTypeAllocSize(Inst->getAllocatedType());
  // Constant count: fold the multiplication at compile time.
  if (const ConstantInt* CInt = dyn_cast<ConstantInt>(Inst->getOperand(0))) {
    printPtrLoad(CInt->getZExtValue()*Size);
  } else {
    // Dynamic count: compute size*count at runtime.
    printPtrLoad(Size);
    printValueLoad(Inst->getOperand(0));
    printSimpleInstruction("mul");
  }
  printSimpleInstruction("localloc");
}
+
+
// Central dispatch: translate one LLVM instruction into MSIL.  Left and
// Right cache the first two operands for the binary-op cases.
void MSILWriter::printInstruction(const Instruction* Inst) {
  const Value *Left = 0, *Right = 0;
  if (Inst->getNumOperands()>=1) Left = Inst->getOperand(0);
  if (Inst->getNumOperands()>=2) Right = Inst->getOperand(1);
  // Print instruction
  // FIXME: "ShuffleVector","ExtractElement","InsertElement" support.
  switch (Inst->getOpcode()) {
  // Terminator
  case Instruction::Ret:
    if (Inst->getNumOperands()) {
      printValueLoad(Left);
      printSimpleInstruction("ret");
    } else
      printSimpleInstruction("ret");
    break;
  case Instruction::Br:
    printBranchInstruction(cast<BranchInst>(Inst));
    break;
  // Binary
  case Instruction::Add:
  case Instruction::FAdd:
    printBinaryInstruction("add",Left,Right);
    break;
  case Instruction::Sub:
  case Instruction::FSub:
    printBinaryInstruction("sub",Left,Right);
    break;
  case Instruction::Mul:
  case Instruction::FMul:
    printBinaryInstruction("mul",Left,Right);
    break;
  case Instruction::UDiv:
    printBinaryInstruction("div.un",Left,Right);
    break;
  case Instruction::SDiv:
  case Instruction::FDiv:
    printBinaryInstruction("div",Left,Right);
    break;
  case Instruction::URem:
    printBinaryInstruction("rem.un",Left,Right);
    break;
  case Instruction::SRem:
  case Instruction::FRem:
    printBinaryInstruction("rem",Left,Right);
    break;
  // Binary Condition
  case Instruction::ICmp:
    printICmpInstruction(cast<ICmpInst>(Inst)->getPredicate(),Left,Right);
    break;
  case Instruction::FCmp:
    printFCmpInstruction(cast<FCmpInst>(Inst)->getPredicate(),Left,Right);
    break;
  // Bitwise Binary
  case Instruction::And:
    printBinaryInstruction("and",Left,Right);
    break;
  case Instruction::Or:
    printBinaryInstruction("or",Left,Right);
    break;
  case Instruction::Xor:
    printBinaryInstruction("xor",Left,Right);
    break;
  // Shifts: MSIL expects an int32 shift amount, hence the "conv.i4".
  case Instruction::Shl:
    printValueLoad(Left);
    printValueLoad(Right);
    printSimpleInstruction("conv.i4");
    printSimpleInstruction("shl");
    break;
  case Instruction::LShr:
    printValueLoad(Left);
    printValueLoad(Right);
    printSimpleInstruction("conv.i4");
    printSimpleInstruction("shr.un");
    break;
  case Instruction::AShr:
    printValueLoad(Left);
    printValueLoad(Right);
    printSimpleInstruction("conv.i4");
    printSimpleInstruction("shr");
    break;
  case Instruction::Select:
    printSelectInstruction(Inst->getOperand(0),Inst->getOperand(1),Inst->getOperand(2));
    break;
  case Instruction::Load:
    printIndirectLoad(Inst->getOperand(0));
    break;
  case Instruction::Store:
    printIndirectSave(Inst->getOperand(1), Inst->getOperand(0));
    break;
  // SExt also passes the source type so the sign can be preserved.
  case Instruction::SExt:
    printCastInstruction(Inst->getOpcode(),Left,
                         cast<CastInst>(Inst)->getDestTy(),
                         cast<CastInst>(Inst)->getSrcTy());
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::BitCast:
    printCastInstruction(Inst->getOpcode(),Left,
                         cast<CastInst>(Inst)->getDestTy());
    break;
  case Instruction::GetElementPtr:
    printGepInstruction(Inst->getOperand(0),gep_type_begin(Inst),
                        gep_type_end(Inst));
    break;
  case Instruction::Call:
    printCallInstruction(cast<CallInst>(Inst));
    break;
  case Instruction::Invoke:
    printInvokeInstruction(cast<InvokeInst>(Inst));
    break;
  // Unwind/unreachable are modeled by throwing a System.Exception.
  case Instruction::Unwind:
    printSimpleInstruction("newobj",
      "instance void [mscorlib]System.Exception::.ctor()");
    printSimpleInstruction("throw");
    break;
  case Instruction::Switch:
    printSwitchInstruction(cast<SwitchInst>(Inst));
    break;
  case Instruction::Alloca:
    printAllocaInstruction(cast<AllocaInst>(Inst));
    break;
  case Instruction::Unreachable:
    printSimpleInstruction("ldstr", "\"Unreachable instruction\"");
    printSimpleInstruction("newobj",
      "instance void [mscorlib]System.Exception::.ctor(string)");
    printSimpleInstruction("throw");
    break;
  case Instruction::VAArg:
    printVAArgInstruction(cast<VAArgInst>(Inst));
    break;
  default:
    errs() << "Instruction = " << Inst->getName() << '\n';
    llvm_unreachable("Unsupported instruction");
  }
}
+
+
// Print a natural loop: emit its header label, then its blocks in order,
// recursing into directly nested loops at their headers, and close with
// an unconditional branch back to the header.
void MSILWriter::printLoop(const Loop* L) {
  Out << getLabelName(L->getHeader()->getName()) << ":\n";
  const std::vector<BasicBlock*>& blocks = L->getBlocks();
  for (unsigned I = 0, E = blocks.size(); I!=E; I++) {
    BasicBlock* BB = blocks[I];
    Loop* BBLoop = LInfo->getLoopFor(BB);
    if (BBLoop == L)
      printBasicBlock(BB);
    else if (BB==BBLoop->getHeader() && BBLoop->getParentLoop()==L)
      // Directly nested loop: print it once, at its header block.
      printLoop(BBLoop);
  }
  printSimpleInstruction("br",getLabelName(L->getHeader()->getName()).c_str());
}
+
+
// Print one basic block: its label, then each instruction followed by a
// store of its result into the instruction's local slot.
void MSILWriter::printBasicBlock(const BasicBlock* BB) {
  Out << getLabelName(BB) << ":\n";
  for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I!=E; ++I) {
    const Instruction* Inst = I;
    // Comment llvm original instruction
    // Out << "\n//" << *Inst << "\n";
    // PHIs are materialized on the incoming edges (printPHICopy), not here.
    if (Inst->getOpcode()==Instruction::PHI) continue;
    // Print instruction
    printInstruction(Inst);
    // Save result
    if (Inst->getType()!=Type::getVoidTy(BB->getContext())) {
      // Invoke saves its own result inside the "try" block.
      if (Inst->getOpcode()==Instruction::Invoke) continue;
      printValueSave(Inst);
    }
  }
}
+
+
// Declare a ".locals" slot for every value produced in F (allocas and
// instruction results), plus an ArgIterator companion for each va_list,
// and emit a ".maxstack" estimate based on the widest call.
void MSILWriter::printLocalVariables(const Function& F) {
  std::string Name;
  const Type* Ty = NULL;
  std::set<const Value*> Printed;
  const Value* VaList = NULL;
  // Minimum stack depth of 8; widened below by the largest call site.
  unsigned StackDepth = 8;
  // Find local variables
  for (const_inst_iterator I = inst_begin(&F), E = inst_end(&F); I!=E; ++I) {
    if (I->getOpcode()==Instruction::Call ||
        I->getOpcode()==Instruction::Invoke) {
      // Track the deepest argument list pushed for any call.
      if (StackDepth<I->getNumOperands())
        StackDepth = I->getNumOperands();
    }
    const AllocaInst* AI = dyn_cast<AllocaInst>(&*I);
    // NOTE(review): the !isa<GlobalVariable> guard looks vacuous — an
    // AllocaInst cannot be a GlobalVariable; kept as-is.
    if (AI && !isa<GlobalVariable>(AI)) {
      // Alloca result: a local holding a pointer to the allocated type.
      Ty = PointerType::getUnqual(AI->getAllocatedType());
      Name = getValueName(AI);
      Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n";
    } else if (I->getType()!=Type::getVoidTy(F.getContext())) {
      // Any other non-void instruction result gets a local slot.
      Ty = I->getType();
      Name = getValueName(&*I);
      Out << "\t.locals (" << getTypeName(Ty) << Name << ")\n";
    }
    // Test on 'va_list' variable    
    bool isVaList = false;     
    if (const VAArgInst* VaInst = dyn_cast<VAArgInst>(&*I)) {
      // "va_list" as "va_arg" instruction operand.
      isVaList = true;
      VaList = VaInst->getOperand(0);
    } else if (const IntrinsicInst* Inst = dyn_cast<IntrinsicInst>(&*I)) {
      // "va_list" as intrinsic function operand. 
      switch (Inst->getIntrinsicID()) {
      case Intrinsic::vastart:
      case Intrinsic::vaend:
      case Intrinsic::vacopy:
        isVaList = true;
        VaList = Inst->getOperand(1);
        break;
      default:
        isVaList = false;
      }
    }
    // Declare the "$valist" ArgIterator companion once per va_list value.
    if (isVaList && Printed.insert(VaList).second) {
      Name = getValueName(VaList);
      Name.insert(Name.length()-1,"$valist");
      Out << "\t.locals (valuetype [mscorlib]System.ArgIterator "
          << Name << ")\n";
    }
  }
  // Double the estimate as a safety margin for composed operations.
  printSimpleInstruction(".maxstack",utostr(StackDepth*2).c_str());
}
+
+
+// Emit F's body block by block.  Outermost natural loops are emitted through
+// printLoop (which handles their member blocks and nested loops); blocks not
+// in any loop are printed directly.
+void MSILWriter::printFunctionBody(const Function& F) {
+  // Print body
+  for (Function::const_iterator I = F.begin(), E = F.end(); I!=E; ++I) {
+    if (Loop *L = LInfo->getLoopFor(I)) {
+      // Emit a loop only once: when we reach the header of a top-level loop.
+      // Other loop blocks are skipped here and printed inside printLoop.
+      if (L->getHeader()==I && L->getParentLoop()==0)
+        printLoop(L);
+    } else {
+      printBasicBlock(I);
+    }
+  }
+}
+
+
+// Emit code that evaluates a constant expression onto the CLI stack by
+// dispatching on its opcode to the matching instruction printer.  Operands 0
+// and 1 (when present) are forwarded as left/right.
+void MSILWriter::printConstantExpr(const ConstantExpr* CE) {
+  const Value *left = 0, *right = 0;
+  if (CE->getNumOperands()>=1) left = CE->getOperand(0);
+  if (CE->getNumOperands()>=2) right = CE->getOperand(1);
+  // Print instruction
+  switch (CE->getOpcode()) {
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPTrunc:
+  case Instruction::FPExt:
+  case Instruction::UIToFP:
+  case Instruction::SIToFP:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::BitCast:
+    printCastInstruction(CE->getOpcode(),left,CE->getType());
+    break;
+  case Instruction::GetElementPtr:
+    printGepInstruction(CE->getOperand(0),gep_type_begin(CE),gep_type_end(CE));
+    break;
+  case Instruction::ICmp:
+    printICmpInstruction(CE->getPredicate(),left,right);
+    break;
+  case Instruction::FCmp:
+    printFCmpInstruction(CE->getPredicate(),left,right);
+    break;
+  case Instruction::Select:
+    printSelectInstruction(CE->getOperand(0),CE->getOperand(1),CE->getOperand(2));
+    break;
+  // Binary operators map onto CLI arithmetic; unsigned div/rem and logical
+  // shift-right use the ".un" forms.
+  case Instruction::Add:
+  case Instruction::FAdd:
+    printBinaryInstruction("add",left,right);
+    break;
+  case Instruction::Sub:
+  case Instruction::FSub:
+    printBinaryInstruction("sub",left,right);
+    break;
+  case Instruction::Mul:
+  case Instruction::FMul:
+    printBinaryInstruction("mul",left,right);
+    break;
+  case Instruction::UDiv:
+    printBinaryInstruction("div.un",left,right);
+    break;
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+    printBinaryInstruction("div",left,right);
+    break;
+  case Instruction::URem:
+    printBinaryInstruction("rem.un",left,right);
+    break;
+  case Instruction::SRem:
+  case Instruction::FRem:
+    printBinaryInstruction("rem",left,right);
+    break;
+  case Instruction::And:
+    printBinaryInstruction("and",left,right);
+    break;
+  case Instruction::Or:
+    printBinaryInstruction("or",left,right);
+    break;
+  case Instruction::Xor:
+    printBinaryInstruction("xor",left,right);
+    break;
+  case Instruction::Shl:
+    printBinaryInstruction("shl",left,right);
+    break;
+  case Instruction::LShr:
+    printBinaryInstruction("shr.un",left,right);
+    break;
+  case Instruction::AShr:
+    printBinaryInstruction("shr",left,right);
+    break;
+  default:
+    errs() << "Expression = " << *CE << "\n";
+    llvm_unreachable("Invalid constant expression");
+  }
+}
+
+
+// Emit run-time initialization for every global-variable field that could not
+// be encoded statically (collected into StaticInitList by
+// printStaticConstant): load the variable's address, add the field offset,
+// evaluate the constant expression, and store it with the right "stind".
+void MSILWriter::printStaticInitializerList() {
+  // List of global variables with uninitialized fields.
+  for (std::map<const GlobalVariable*,std::vector<StaticInitializer> >::iterator
+       VarI = StaticInitList.begin(), VarE = StaticInitList.end(); VarI!=VarE;
+       ++VarI) {
+    const std::vector<StaticInitializer>& InitList = VarI->second;
+    if (InitList.empty()) continue;
+    // For each uninitialized field.
+    for (std::vector<StaticInitializer>::const_iterator I = InitList.begin(),
+         E = InitList.end(); I!=E; ++I) {
+      if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(I->constant)) {
+        // Out << "\n// Init " << getValueName(VarI->first) << ", offset " <<
+        //  utostr(I->offset) << ", type "<< *I->constant->getType() << "\n\n";
+        // Load variable address
+        printValueLoad(VarI->first);
+        // Add offset
+        if (I->offset!=0) {
+          printPtrLoad(I->offset);
+          printSimpleInstruction("add");
+        }
+        // Load value
+        printConstantExpr(CE);
+        // Save result at offset.  "stind" has no unsigned forms, so map a
+        // leading 'u' postfix to its signed counterpart.
+        std::string postfix = getTypePostfix(CE->getType(),true);
+        if (*postfix.begin()=='u') *postfix.begin() = 'i';
+        postfix = "stind."+postfix;
+        printSimpleInstruction(postfix.c_str());
+      } else {
+        errs() << "Constant = " << *I->constant << '\n';
+        llvm_unreachable("Invalid static initializer");
+      }
+    }
+  }
+}
+
+
+// Emit a complete ".method" definition for F: header (visibility, vararg
+// flag, return type, calling-convention modopt, name), the parameter list,
+// then locals and body inside the braces.
+void MSILWriter::printFunction(const Function& F) {
+  // Attribute index 0 queries the return value's attributes (LLVM 2.x
+  // convention); parameters start at index 1 below.
+  bool isSigned = F.paramHasAttr(0, Attribute::SExt);
+  Out << "\n.method static ";
+  Out << (F.hasLocalLinkage() ? "private " : "public ");
+  if (F.isVarArg()) Out << "vararg ";
+  Out << getTypeName(F.getReturnType(),isSigned) << 
+    getConvModopt(F.getCallingConv()) << getValueName(&F) << '\n';
+  // Arguments
+  Out << "\t(";
+  unsigned ArgIdx = 1;
+  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I!=E;
+       ++I, ++ArgIdx) {
+    isSigned = F.paramHasAttr(ArgIdx, Attribute::SExt);
+    if (I!=F.arg_begin()) Out << ", ";
+    Out << getTypeName(I->getType(),isSigned) << getValueName(I);
+  }
+  Out << ") cil managed\n";
+  // Body
+  Out << "{\n";
+  printLocalVariables(F);
+  printFunctionBody(F);
+  Out << "}\n";
+}
+
+
+// Emit a ".class value" declaration (packed, sized from TargetData) for every
+// aggregate type the module uses.  Scalar types need no declaration.
+// NOTE: the ST parameter is currently unused; types come from UsedTypes.
+void MSILWriter::printDeclarations(const TypeSymbolTable& ST) {
+  std::string Name;
+  std::set<const Type*> Printed;
+  for (std::set<const Type*>::const_iterator
+       UI = UsedTypes->begin(), UE = UsedTypes->end(); UI!=UE; ++UI) {
+    const Type* Ty = *UI;
+    if (isa<ArrayType>(Ty) || isa<VectorType>(Ty) || isa<StructType>(Ty))
+      Name = getTypeName(Ty, false, true);
+    // Type with no need to declare.
+    else continue;
+    // Print not duplicated type
+    if (Printed.insert(Ty).second) {
+      Out << ".class value explicit ansi sealed '" << Name << "'";
+      Out << " { .pack " << 1 << " .size " << TD->getTypeAllocSize(Ty);
+      Out << " }\n\n";
+    }
+  }
+}
+
+
+// Return Ty's primitive bit width, restricted to the widths the writer can
+// map onto CLI types (1/8/16/32/64); aborts on anything else.
+unsigned int MSILWriter::getBitWidth(const Type* Ty) {
+  unsigned int N = Ty->getPrimitiveSizeInBits();
+  assert(N!=0 && "Invalid type in getBitWidth()");
+  switch (N) {
+  case 1:
+  case 8:
+  case 16:
+  case 32:
+  case 64:
+    return N;
+  default:
+    errs() << "Bits = " << N << '\n';
+    llvm_unreachable("Unsupported integer width");
+  }
+  return 0; // Not reached
+}
+
+
+// Emit the ".data" byte encoding for constant C, advancing Offset by the
+// bytes written.  Non-constant-foldable pointer fields are deferred: they are
+// queued on InitListPtr for run-time patching by $MSIL_Init and a zero
+// placeholder is emitted instead.
+// NOTE(review): aggregate members are emitted back-to-back; inter-member
+// padding does not appear to be accounted for here — verify against the
+// TargetData struct layout.
+void MSILWriter::printStaticConstant(const Constant* C, uint64_t& Offset) {
+  uint64_t TySize = 0;
+  const Type* Ty = C->getType();
+  // Print zero initialized constant.
+  if (isa<ConstantAggregateZero>(C) || C->isNullValue()) {
+    TySize = TD->getTypeAllocSize(C->getType());
+    Offset += TySize;
+    Out << "int8 (0) [" << TySize << "]";
+    return;
+  }
+  // Print constant initializer
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    TySize = TD->getTypeAllocSize(Ty);
+    const ConstantInt* Int = cast<ConstantInt>(C);
+    Out << getPrimitiveTypeName(Ty,true) << "(" << Int->getSExtValue() << ")";
+    break;
+  }
+  case Type::FloatTyID:
+  case Type::DoubleTyID: {
+    // Emit floats/doubles as their raw IEEE-754 bit patterns.
+    TySize = TD->getTypeAllocSize(Ty);
+    const ConstantFP* FP = cast<ConstantFP>(C);
+    if (Ty->getTypeID() == Type::FloatTyID)
+      Out << "int32 (" << 
+        (uint32_t)FP->getValueAPF().bitcastToAPInt().getZExtValue() << ')';
+    else
+      Out << "int64 (" << 
+        FP->getValueAPF().bitcastToAPInt().getZExtValue() << ')';
+    break;
+  }
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID:
+    // Recurse over elements; TySize stays 0 because each element advances
+    // Offset itself.
+    for (unsigned I = 0, E = C->getNumOperands(); I<E; I++) {
+      if (I!=0) Out << ",\n";
+      printStaticConstant(cast<Constant>(C->getOperand(I)), Offset);
+    }
+    break;
+  case Type::PointerTyID:
+    TySize = TD->getTypeAllocSize(C->getType());
+    // Initialize with global variable address
+    if (const GlobalVariable *G = dyn_cast<GlobalVariable>(C)) {
+      std::string name = getValueName(G);
+      Out << "&(" << name.insert(name.length()-1,"$data") << ")";
+    } else {
+      // Dynamic initialization
+      if (!isa<ConstantPointerNull>(C) && !C->isNullValue())
+        InitListPtr->push_back(StaticInitializer(C,Offset));
+      // Null pointer initialization
+      if (TySize==4) Out << "int32 (0)";
+      else if (TySize==8) Out << "int64 (0)";
+      else llvm_unreachable("Invalid pointer size");
+    }
+    break;
+  default:
+    errs() << "TypeID = " << Ty->getTypeID() << '\n';
+    llvm_unreachable("Invalid type in printStaticConstant()");
+  }
+  // Increase offset.
+  Offset += TySize;
+}
+
+
+// Emit a global's field declaration mapped onto a ".data" blob: print the
+// field type, bind the field "at" a companion "$data" label, then emit the
+// encoded bytes of C via printStaticConstant.
+void MSILWriter::printStaticInitializer(const Constant* C,
+                                        const std::string& Name) {
+  switch (C->getType()->getTypeID()) {
+  case Type::IntegerTyID:
+  case Type::FloatTyID:
+  case Type::DoubleTyID: 
+    Out << getPrimitiveTypeName(C->getType(), false);
+    break;
+  case Type::ArrayTyID:
+  case Type::VectorTyID:
+  case Type::StructTyID:
+  case Type::PointerTyID:
+    Out << getTypeName(C->getType());
+    break;
+  default:
+    errs() << "Type = " << *C << "\n";
+    llvm_unreachable("Invalid constant type");
+  }
+  // Print initializer.  The "$data" suffix goes before the name's final
+  // character, mirroring the label format used in printStaticConstant.
+  std::string label = Name;
+  label.insert(label.length()-1,"$data");
+  Out << Name << " at " << label << '\n';
+  Out << ".data " << label << " = {\n";
+  uint64_t offset = 0;
+  printStaticConstant(C,offset);
+  Out << "\n}\n\n";
+}
+
+
+// Emit the definition of global G.  Zero/undef initializers need no run-time
+// patch list; otherwise point InitListPtr at G's entry so printStaticConstant
+// can queue dynamic (pointer) fields for $MSIL_Init.
+void MSILWriter::printVariableDefinition(const GlobalVariable* G) {
+  const Constant* C = G->getInitializer();
+  if (C->isNullValue() || isa<ConstantAggregateZero>(C) || isa<UndefValue>(C))
+    InitListPtr = 0;
+  else
+    InitListPtr = &StaticInitList[G];
+  printStaticInitializer(C,getValueName(G));
+}
+
+
+// Emit a ".field static" for every global in the module: declarations get
+// just a typed field, definitions get a full initializer via
+// printVariableDefinition.
+// NOTE(review): declarations are marked "public" and definitions "private" —
+// looks inverted at first glance; confirm this is the intended CLI
+// visibility mapping.
+void MSILWriter::printGlobalVariables() {
+  if (ModulePtr->global_empty()) return;
+  Module::global_iterator I,E;
+  for (I = ModulePtr->global_begin(), E = ModulePtr->global_end(); I!=E; ++I) {
+    // Variable definition
+    Out << ".field static " << (I->isDeclaration() ? "public " :
+                                                     "private ");
+    if (I->isDeclaration()) {
+      Out << getTypeName(I->getType()) << getValueName(&*I) << "\n\n";
+    } else
+      printVariableDefinition(&*I);
+  }
+}
+
+
+// Resolve the DLL an external function should be pinvoke'd from.
+const char* MSILWriter::getLibraryName(const Function* F) {
+  return getLibraryForSymbol(F->getName(), true, F->getCallingConv());
+}
+
+
+// Resolve the DLL an external global variable should be imported from.
+const char* MSILWriter::getLibraryName(const GlobalVariable* GV) {
+  return getLibraryForSymbol(GV->getName(), false, CallingConv::C);
+}
+
+
+// Map a symbol to its hosting library.  Currently a stub that ignores all
+// arguments and assumes everything lives in the C runtime.
+const char* MSILWriter::getLibraryForSymbol(const StringRef &Name, 
+                                            bool isFunction,
+                                            CallingConv::ID CallingConv) {
+  // TODO: Read *.def file with function and libraries definitions.
+  return "MSVCRT.DLL";  
+}
+
+
+// Emit everything needed for external linkage: a pinvoke stub per declared
+// (body-less) function, the LoadLibrary/GetProcAddress imports, the
+// $MSIL_Import helper that resolves a symbol address at run time (throwing
+// System.Exception on failure), and the $MSIL_Init method that performs
+// deferred static initialization and patches dllimport'ed globals.
+void MSILWriter::printExternals() {
+  Module::const_iterator I,E;
+  // Functions.
+  for (I=ModulePtr->begin(),E=ModulePtr->end(); I!=E; ++I) {
+    // Skip intrinsics
+    if (I->isIntrinsic()) continue;
+    if (I->isDeclaration()) {
+      const Function* F = I;
+      std::string Name = getConvModopt(F->getCallingConv())+getValueName(F);
+      std::string Sig =
+        getCallSignature(cast<FunctionType>(F->getFunctionType()), NULL, Name);
+      Out << ".method static hidebysig pinvokeimpl(\""
+          << getLibraryName(F) << "\")\n\t" << Sig << " preservesig {}\n\n";
+    }
+  }
+  // External variables and static initialization.
+  Out <<
+  ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+  "  native int LoadLibrary(string) preservesig {}\n"
+  ".method public hidebysig static pinvokeimpl(\"KERNEL32.DLL\" ansi winapi)"
+  "  native int GetProcAddress(native int, string) preservesig {}\n";
+  Out <<
+  ".method private static void* $MSIL_Import(string lib,string sym)\n"
+  " managed cil\n{\n"
+  "\tldarg\tlib\n"
+  "\tcall\tnative int LoadLibrary(string)\n"
+  "\tldarg\tsym\n"
+  "\tcall\tnative int GetProcAddress(native int,string)\n"
+  "\tdup\n"
+  "\tbrtrue\tL_01\n"
+  "\tldstr\t\"Cannot import variable\"\n"
+  "\tnewobj\tinstance void [mscorlib]System.Exception::.ctor(string)\n"
+  "\tthrow\n"
+  "L_01:\n"
+  "\tret\n"
+  "}\n\n"
+  ".method static private void $MSIL_Init() managed cil\n{\n";
+  printStaticInitializerList();
+  // For each dllimport'ed global variable.
+  for (Module::global_iterator I = ModulePtr->global_begin(),
+       E = ModulePtr->global_end(); I!=E; ++I) {
+    if (!I->isDeclaration() || !I->hasDLLImportLinkage()) continue;
+    // Use "LoadLibrary"/"GetProcAddress" to receive the variable address.
+    std::string Tmp = getTypeName(I->getType())+getValueName(&*I);
+    printSimpleInstruction("ldsflda",Tmp.c_str());
+    Out << "\tldstr\t\"" << getLibraryName(&*I) << "\"\n";
+    Out << "\tldstr\t\"" << I->getName() << "\"\n";
+    printSimpleInstruction("call","void* $MSIL_Import(string,string)");
+    printIndirectSave(I->getType());
+  }
+  printSimpleInstruction("ret");
+  Out << "}\n\n";
+}
+
+
+//===----------------------------------------------------------------------===//
+//                      External Interface declaration
+//===----------------------------------------------------------------------===//
+
+// Build the MSIL emission pipeline.  Only assembly output is supported;
+// returns true (failure) for any other file type.  Lowering passes remove
+// constructs (GC intrinsics, switch) the writer cannot emit directly, then
+// MSILModule publishes used-types/TargetData into the writer before it runs.
+bool MSILTarget::addPassesToEmitWholeFile(PassManager &PM,
+                                          formatted_raw_ostream &o,
+                                          CodeGenFileType FileType,
+                                          CodeGenOpt::Level OptLevel)
+{
+  if (FileType != TargetMachine::CGFT_AssemblyFile) return true;
+  MSILWriter* Writer = new MSILWriter(o);
+  PM.add(createGCLoweringPass());
+  // FIXME: Handle switch through native IL instruction "switch"
+  PM.add(createLowerSwitchPass());
+  PM.add(createCFGSimplificationPass());
+  PM.add(new MSILModule(Writer->UsedTypes,Writer->TD));
+  PM.add(Writer);
+  PM.add(createGCInfoDeleter());
+  return false;
+}
diff --git a/lib/Target/MSIL/MSILWriter.h b/lib/Target/MSIL/MSILWriter.h
new file mode 100644
index 0000000..a95ae23
--- /dev/null
+++ b/lib/Target/MSIL/MSILWriter.h
@@ -0,0 +1,258 @@
+//===-- MSILWriter.h - TargetMachine for the MSIL ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSILWriter pass used by the MSIL backend.
+//
+//===----------------------------------------------------------------------===//
+#ifndef MSILWRITER_H
+#define MSILWRITER_H
+
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/PassManager.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/FindUsedTypes.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  extern Target TheMSILTarget;
+
+  // Module-level setup pass for the MSIL backend.  It runs before MSILWriter
+  // and publishes the module's used-type set and TargetData through the
+  // references handed in by MSILTarget::addPassesToEmitWholeFile (both alias
+  // storage owned by the writer).
+  class MSILModule : public ModulePass {
+    Module *ModulePtr;
+    const std::set<const Type *>*& UsedTypes;
+    const TargetData*& TD;
+
+  public:
+    static char ID;
+    MSILModule(const std::set<const Type *>*& _UsedTypes,
+               const TargetData*& _TD)
+      : ModulePass(&ID), UsedTypes(_UsedTypes), TD(_TD) {}
+
+    // Requires FindUsedTypes and TargetData, whose results runOnModule is
+    // expected to forward through the member references.
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<FindUsedTypes>();
+      AU.addRequired<TargetData>();
+    }
+
+    virtual const char *getPassName() const {
+      return "MSIL backend definitions";
+    }
+
+    virtual bool runOnModule(Module &M);
+
+  };
+
+  // Function pass that translates LLVM IR into textual MSIL (CIL) assembly.
+  // Implementations of the members declared here live in MSILWriter.cpp.
+  class MSILWriter : public FunctionPass {
+    // A constant whose value must be patched into a global at run time
+    // (by $MSIL_Init), together with its byte offset inside the global.
+    struct StaticInitializer {
+      const Constant* constant;
+      uint64_t offset;
+      
+      StaticInitializer()
+        : constant(0), offset(0) {}
+
+      StaticInitializer(const Constant* _constant, uint64_t _offset)
+        : constant(_constant), offset(_offset) {} 
+    };
+
+    uint64_t UniqID;
+
+    // Monotonic counter used to generate unique names/labels.
+    uint64_t getUniqID() {
+      return ++UniqID;
+    }
+
+  public:
+    // Output stream and per-module state shared with MSILModule.
+    formatted_raw_ostream &Out;
+    Module* ModulePtr;
+    const TargetData* TD;
+    LoopInfo *LInfo;
+    // Points at the StaticInitList entry of the global currently being
+    // emitted (0 when none is needed).
+    std::vector<StaticInitializer>* InitListPtr;
+    std::map<const GlobalVariable*,std::vector<StaticInitializer> >
+      StaticInitList;
+    const std::set<const Type *>* UsedTypes;
+    static char ID;
+    // Numbering for values without names.
+    DenseMap<const Value*, unsigned> AnonValueNumbers;
+    unsigned NextAnonValueNumber;
+
+    MSILWriter(formatted_raw_ostream &o) : FunctionPass(&ID), Out(o),
+         NextAnonValueNumber(0) {
+      UniqID = 0;
+    }
+
+    // Classification of where a value lives / how it is loaded.
+    enum ValueType {
+      UndefVT,
+      GlobalVT,
+      InternalVT,
+      ArgumentVT,
+      LocalVT,
+      ConstVT,
+      ConstExprVT
+    };
+
+    bool isVariable(ValueType V) {
+      return V==GlobalVT || V==InternalVT || V==ArgumentVT || V==LocalVT;
+    }
+
+    bool isConstValue(ValueType V) {
+      return V==ConstVT || V==ConstExprVT;
+    }
+
+    virtual const char *getPassName() const { return "MSIL backend"; }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<LoopInfo>();
+      AU.setPreservesAll();
+    }
+
+    bool runOnFunction(Function &F);
+
+    virtual bool doInitialization(Module &M);
+
+    virtual bool doFinalization(Module &M);
+
+    void printModuleStartup();
+
+    bool isZeroValue(const Value* V);
+
+    // Naming and type-name helpers.
+    std::string getValueName(const Value* V);
+
+    std::string getLabelName(const Value* V);
+
+    std::string getLabelName(const std::string& Name);
+
+    std::string getConvModopt(CallingConv::ID CallingConvID);
+
+    std::string getArrayTypeName(Type::TypeID TyID, const Type* Ty);
+
+    std::string getPrimitiveTypeName(const Type* Ty, bool isSigned);
+
+    std::string getFunctionTypeName(const Type* Ty);
+
+    std::string getPointerTypeName(const Type* Ty);
+
+    std::string getTypeName(const Type* Ty, bool isSigned = false,
+                            bool isNested = false);
+
+    ValueType getValueLocation(const Value* V);
+
+    std::string getTypePostfix(const Type* Ty, bool Expand,
+                               bool isSigned = false);
+
+    // Value load/store emission.
+    void printConvToPtr();
+
+    void printPtrLoad(uint64_t N);
+
+    void printValuePtrLoad(const Value* V);
+
+    void printConstLoad(const Constant* C);
+
+    void printValueLoad(const Value* V);
+
+    void printValueSave(const Value* V);
+
+    // Instruction emission.
+    void printBinaryInstruction(const char* Name, const Value* Left,
+                                const Value* Right);
+
+    void printSimpleInstruction(const char* Inst, const char* Operand = NULL);
+
+    void printPHICopy(const BasicBlock* Src, const BasicBlock* Dst);
+
+    void printBranchToBlock(const BasicBlock* CurrBB,
+                            const BasicBlock* TrueBB,
+                            const BasicBlock* FalseBB);
+
+    void printBranchInstruction(const BranchInst* Inst);
+
+    void printSelectInstruction(const Value* Cond, const Value* VTrue,
+                                const Value* VFalse);
+
+    void printIndirectLoad(const Value* V);
+
+    void printIndirectSave(const Value* Ptr, const Value* Val);
+
+    void printIndirectSave(const Type* Ty);
+
+    void printCastInstruction(unsigned int Op, const Value* V,
+                              const Type* Ty, const Type* SrcTy=0);
+
+    void printGepInstruction(const Value* V, gep_type_iterator I,
+                             gep_type_iterator E);
+
+    std::string getCallSignature(const FunctionType* Ty,
+                                 const Instruction* Inst,
+                                 std::string Name);
+
+    void printFunctionCall(const Value* FnVal, const Instruction* Inst);
+
+    void printIntrinsicCall(const IntrinsicInst* Inst);
+
+    void printCallInstruction(const Instruction* Inst);
+
+    void printICmpInstruction(unsigned Predicate, const Value* Left,
+                              const Value* Right);
+
+    void printFCmpInstruction(unsigned Predicate, const Value* Left,
+                              const Value* Right);
+
+    void printInvokeInstruction(const InvokeInst* Inst);
+
+    void printSwitchInstruction(const SwitchInst* Inst);
+
+    void printVAArgInstruction(const VAArgInst* Inst);
+
+    void printAllocaInstruction(const AllocaInst* Inst);
+
+    void printInstruction(const Instruction* Inst);
+
+    // Function/module structure emission.
+    void printLoop(const Loop* L);
+
+    void printBasicBlock(const BasicBlock* BB);
+    
+    void printLocalVariables(const Function& F);
+
+    void printFunctionBody(const Function& F);
+
+    void printConstantExpr(const ConstantExpr* CE);
+
+    void printStaticInitializerList();
+
+    void printFunction(const Function& F);
+
+    void printDeclarations(const TypeSymbolTable& ST);
+
+    unsigned int getBitWidth(const Type* Ty);
+
+    void printStaticConstant(const Constant* C, uint64_t& Offset);
+
+    void printStaticInitializer(const Constant* C, const std::string& Name);
+
+    void printVariableDefinition(const GlobalVariable* G);
+
+    void printGlobalVariables();
+
+    // External symbol / pinvoke handling.
+    const char* getLibraryName(const Function* F);
+
+    const char* getLibraryName(const GlobalVariable* GV); 
+    
+    const char* getLibraryForSymbol(const StringRef &Name, bool isFunction,
+                                    CallingConv::ID CallingConv);
+
+    void printExternals();
+  };
+
+}
+
+#endif
+
diff --git a/lib/Target/MSIL/Makefile b/lib/Target/MSIL/Makefile
new file mode 100644
index 0000000..70eadb3
--- /dev/null
+++ b/lib/Target/MSIL/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/MSIL/Makefile ----------------------------*- Makefile -*-===##
+#
+#		      The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMMSIL
+DIRS = TargetInfo
+
+include $(LEVEL)/Makefile.common
+
+CompileCommonOpts := $(CompileCommonOpts) -Wno-format
diff --git a/lib/Target/MSIL/README.TXT b/lib/Target/MSIL/README.TXT
new file mode 100644
index 0000000..d797c71
--- /dev/null
+++ b/lib/Target/MSIL/README.TXT
@@ -0,0 +1,26 @@
+//===---------------------------------------------------------------------===// 
+
+Vector instructions support.
+
+ShuffleVector
+ExtractElement
+InsertElement
+
+//===---------------------------------------------------------------------===// 
+
+Add "OpaqueType" type.
+
+//===---------------------------------------------------------------------===// 
+
+"switch" instruction emulation with CLI "switch" instruction.
+
+//===---------------------------------------------------------------------===// 
+
+Write a linker for external functions, because exporting a function requires
+knowing which dynamic library the function is located in.
+
+.method static hidebysig pinvokeimpl("msvcrt.dll" cdecl)
+    void free(void*) preservesig {}
+
+
+
diff --git a/lib/Target/MSIL/TargetInfo/CMakeLists.txt b/lib/Target/MSIL/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..9f0c3a0
--- /dev/null
+++ b/lib/Target/MSIL/TargetInfo/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMSILInfo
+  MSILTargetInfo.cpp
+  )
+
diff --git a/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp b/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp
new file mode 100644
index 0000000..dfd4281
--- /dev/null
+++ b/lib/Target/MSIL/TargetInfo/MSILTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- MSILTargetInfo.cpp - MSIL Target Implementation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSILWriter.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMSILTarget;
+
+// Triple-match callback for target registration: the MSIL backend can accept
+// any triple, but with the lowest non-zero quality so it is never chosen as
+// the default target.
+static unsigned MSIL_TripleMatchQuality(const std::string &TT) {
+  // This backend always works, but shouldn't be the default in most cases.
+  return 1;
+}
+
+// Entry point called by LLVM's target-initialization machinery to register
+// the "msil" target with the global registry.
+extern "C" void LLVMInitializeMSILTargetInfo() { 
+  TargetRegistry::RegisterTarget(TheMSILTarget, "msil",    
+                                  "MSIL backend",
+                                  &MSIL_TripleMatchQuality);
+}
diff --git a/lib/Target/MSIL/TargetInfo/Makefile b/lib/Target/MSIL/TargetInfo/Makefile
new file mode 100644
index 0000000..30b0950
--- /dev/null
+++ b/lib/Target/MSIL/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/MSIL/TargetInfo/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMSILInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/AsmPrinter/CMakeLists.txt b/lib/Target/MSP430/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..4b1f4e6
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,8 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMSP430AsmPrinter
+  MSP430AsmPrinter.cpp
+  MSP430InstPrinter.cpp
+  MSP430MCInstLower.cpp
+  )
+add_dependencies(LLVMMSP430AsmPrinter MSP430CodeGenTable_gen)
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp b/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp
new file mode 100644
index 0000000..def5fc6
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430AsmPrinter.cpp
@@ -0,0 +1,203 @@
+//===-- MSP430AsmPrinter.cpp - MSP430 LLVM assembly writer ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the MSP430 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430InstPrinter.h"
+#include "MSP430MCAsmInfo.h"
+#include "MSP430MCInstLower.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+namespace {
+  // AsmPrinter subclass for MSP430: lowers each MachineInstr to an MCInst
+  // (EmitInstruction) and provides the operand printers referenced by the
+  // tablegen'd assembly writer and by inline-asm handling.
+  class MSP430AsmPrinter : public AsmPrinter {
+  public:
+    MSP430AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                     MCContext &Ctx, MCStreamer &Streamer,
+                     const MCAsmInfo *MAI)
+      : AsmPrinter(O, TM, Ctx, Streamer, MAI) {}
+
+    virtual const char *getPassName() const {
+      return "MSP430 Assembly Printer";
+    }
+
+    // Print an already-lowered MCInst via a throwaway instruction printer.
+    void printMCInst(const MCInst *MI) {
+      MSP430InstPrinter(O, *MAI).printInstruction(MI);
+    }
+    void printOperand(const MachineInstr *MI, int OpNum,
+                      const char* Modifier = 0);
+    // PC-relative immediates print like plain operands.
+    void printPCRelImmOperand(const MachineInstr *MI, int OpNum) {
+      printOperand(MI, OpNum);
+    }
+    void printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                            const char* Modifier = 0);
+    void printCCOperand(const MachineInstr *MI, int OpNum);
+    void printMachineInstruction(const MachineInstr * MI);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                         unsigned AsmVariant,
+                         const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI,
+                               unsigned OpNo, unsigned AsmVariant,
+                               const char *ExtraCode);
+    void EmitInstruction(const MachineInstr *MI);
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AsmPrinter::getAnalysisUsage(AU);
+      AU.setPreservesAll();
+    }
+  };
+} // end of anonymous namespace
+
+
+// Print a single machine operand in MSP430 assembly syntax.  Modifiers:
+// "nohash" suppresses the '#' immediate prefix; "mem" prints a symbol as an
+// absolute memory reference ('&') instead of an immediate ('#').
+void MSP430AsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                    const char* Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  default: assert(0 && "Not implemented yet!");
+  case MachineOperand::MO_Register:
+    O << MSP430InstPrinter::getRegisterName(MO.getReg());
+    return;
+  case MachineOperand::MO_Immediate:
+    if (!Modifier || strcmp(Modifier, "nohash"))
+      O << '#';
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    uint64_t Offset = MO.getOffset();
+
+    // A non-zero offset is emitted as "(offset+symbol)".
+    O << (isMemOp ? '&' : '#');
+    if (Offset)
+      O << '(' << Offset << '+';
+
+    O << *GetGlobalValueSymbol(MO.getGlobal());
+    
+    if (Offset)
+      O << ')';
+
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    bool isMemOp  = Modifier && !strcmp(Modifier, "mem");
+    O << (isMemOp ? '&' : '#');
+    O << MAI->getGlobalPrefix() << MO.getSymbolName();
+    return;
+  }
+  }
+}
+
+// Print a source memory operand, encoded as a (base-register, displacement)
+// pair at OpNum/OpNum+1, in "disp(base)" form.  With no base register the
+// displacement is printed as an absolute '&' address.
+void MSP430AsmPrinter::printSrcMemOperand(const MachineInstr *MI, int OpNum,
+                                          const char* Modifier) {
+  const MachineOperand &Base = MI->getOperand(OpNum);
+  const MachineOperand &Disp = MI->getOperand(OpNum+1);
+
+  // Print displacement first
+  if (!Disp.isImm()) {
+    printOperand(MI, OpNum+1, "mem");
+  } else {
+    if (!Base.getReg())
+      O << '&';
+
+    printOperand(MI, OpNum+1, "nohash");
+  }
+
+
+  // Print register base field
+  if (Base.getReg()) {
+    O << '(';
+    printOperand(MI, OpNum);
+    O << ')';
+  }
+}
+
+// Print an MSP430 condition-code operand as its assembly mnemonic suffix.
+void MSP430AsmPrinter::printCCOperand(const MachineInstr *MI, int OpNum) {
+  switch (MI->getOperand(OpNum).getImm()) {
+  default: assert(0 && "Unknown cond");
+  case MSP430CC::COND_E:  O << "eq"; break;
+  case MSP430CC::COND_NE: O << "ne"; break;
+  case MSP430CC::COND_HS: O << "hs"; break;
+  case MSP430CC::COND_LO: O << "lo"; break;
+  case MSP430CC::COND_GE: O << "ge"; break;
+  case MSP430CC::COND_L:  O << 'l';  break;
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+// Inline-asm operand hook.  No operand modifiers are supported; returns true
+// (error) if one is present, otherwise prints the operand normally.
+bool MSP430AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                       unsigned AsmVariant,
+                                       const char *ExtraCode) {
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
+  printOperand(MI, OpNo);
+  return false;
+}
+
+// Inline-asm memory-operand hook: rejects any modifier, then prints the
+// operand pair through printSrcMemOperand.
+bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                             unsigned OpNo, unsigned AsmVariant,
+                                             const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0]) {
+    return true; // Unknown modifier.
+  }
+  printSrcMemOperand(MI, OpNo);
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  MSP430MCInstLower MCInstLowering(OutContext, *Mang, *this);
+
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
+// Factory registered with the target registry; only syntax variant 0 is
+// supported (returns 0 for anything else).
+static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
+                                                unsigned SyntaxVariant,
+                                                const MCAsmInfo &MAI,
+                                                raw_ostream &O) {
+  if (SyntaxVariant == 0)
+    return new MSP430InstPrinter(O, MAI);
+  return 0;
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMSP430AsmPrinter() {
+  RegisterAsmPrinter<MSP430AsmPrinter> X(TheMSP430Target);
+  TargetRegistry::RegisterMCInstPrinter(TheMSP430Target,
+                                        createMSP430MCInstPrinter);
+}
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.cpp
new file mode 100644
index 0000000..f6565bd
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.cpp
@@ -0,0 +1,108 @@
+//===-- MSP430InstPrinter.cpp - Convert MSP430 MCInst to assembly syntax --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430InstPrinter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#include "MSP430GenAsmWriter.inc"
+#undef MachineInstr
+
+/// printInst - Print a fully-lowered MCInst via the tblgen'erated
+/// printInstruction from MSP430GenAsmWriter.inc (included above).
+void MSP430InstPrinter::printInst(const MCInst *MI) {
+  printInstruction(MI);
+}
+
+/// printPCRelImmOperand - Print a PC-relative immediate operand, which is
+/// either a plain constant or a symbolic expression resolved later by the
+/// assembler.
+void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    O << *Op.getExpr();
+  }
+}
+
+/// printOperand - Print a register, immediate, or symbolic operand.
+/// Immediates and expressions get the MSP430 '#' immediate-mode prefix.
+void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     const char *Modifier) {
+  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << '#' << *Op.getExpr();
+  }
+}
+
+/// printSrcMemOperand - Print a source memory operand: a displacement
+/// (operand OpNo+1) optionally followed by an indexed base register
+/// (operand OpNo), e.g. "4(r12)".  Symbolic displacements always carry the
+/// '&' absolute-address prefix; immediate displacements get '&' only when
+/// there is no base register.
+void MSP430InstPrinter::printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                                           const char *Modifier) {
+  const MCOperand &Base = MI->getOperand(OpNo);
+  const MCOperand &Disp = MI->getOperand(OpNo+1);
+
+  // Print displacement first
+  if (Disp.isExpr()) {
+    O << '&' << *Disp.getExpr();
+  } else {
+    assert(Disp.isImm() && "Expected immediate in displacement field");
+    if (!Base.getReg())
+      O << '&';
+
+    O << Disp.getImm();
+  }
+
+
+  // Print register base field
+  if (Base.getReg()) {
+    O << '(' << getRegisterName(Base.getReg()) << ')';
+  }
+}
+
+/// printCCOperand - Print a condition-code operand as the MSP430 conditional
+/// jump mnemonic suffix (jeq, jne, jhs, jlo, jge, jl).
+/// Cleanups vs. the original: removed the dead `break` after
+/// llvm_unreachable and normalized the switch bodies to LLVM's 2-space
+/// indentation.  Behavior is unchanged.
+void MSP430InstPrinter::printCCOperand(const MCInst *MI, unsigned OpNo) {
+  unsigned CC = MI->getOperand(OpNo).getImm();
+
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported CC code");
+  case MSP430CC::COND_E:
+    O << "eq";
+    break;
+  case MSP430CC::COND_NE:
+    O << "ne";
+    break;
+  case MSP430CC::COND_HS:
+    O << "hs";
+    break;
+  case MSP430CC::COND_LO:
+    O << "lo";
+    break;
+  case MSP430CC::COND_GE:
+    O << "ge";
+    break;
+  case MSP430CC::COND_L:
+    O << 'l';
+    break;
+  }
+}
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.h
new file mode 100644
index 0000000..2fac800
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430InstPrinter.h
@@ -0,0 +1,46 @@
+//===-- MSP430InstPrinter.h - Convert MSP430 MCInst to assembly syntax ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a MSP430 MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430INSTPRINTER_H
+#define MSP430INSTPRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm
+{
+
+  class MCOperand;
+
+  /// MSP430InstPrinter - Prints an MSP430 MCInst as textual assembly to the
+  /// raw_ostream supplied at construction.
+  class MSP430InstPrinter : public MCInstPrinter {
+  public:
+    MSP430InstPrinter(raw_ostream &O, const MCAsmInfo &MAI) :
+      MCInstPrinter(O, MAI){
+    }
+
+    // MCInstPrinter interface: print one fully-lowered instruction.
+    virtual void printInst(const MCInst *MI);
+
+    // Autogenerated by tblgen.
+    void printInstruction(const MCInst *MI);
+    static const char *getRegisterName(unsigned RegNo);
+
+    // Operand printers invoked from the tblgen'erated printInstruction.
+    void printOperand(const MCInst *MI, unsigned OpNo,
+                      const char *Modifier = 0);
+    void printPCRelImmOperand(const MCInst *MI, unsigned OpNo);
+    void printSrcMemOperand(const MCInst *MI, unsigned OpNo,
+                            const char *Modifier = 0);
+    void printCCOperand(const MCInst *MI, unsigned OpNo);
+
+  };
+}
+
+#endif
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp
new file mode 100644
index 0000000..4eb7f3d
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.cpp
@@ -0,0 +1,137 @@
+//===-- MSP430MCInstLower.cpp - Convert MSP430 MachineInstr to an MCInst---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower MSP430 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCInstLower.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+/// GetGlobalAddressSymbol - Return the MCSymbol for a global-address operand.
+/// MSP430 defines no symbol target flags yet, so any non-zero flag is an
+/// upstream bug.
+MCSymbol *MSP430MCInstLower::
+GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetGlobalValueSymbol(MO.getGlobal());
+}
+
+/// GetExternalSymbolSymbol - Return the MCSymbol for an external-symbol
+/// operand.  MSP430 defines no symbol target flags yet, so any non-zero flag
+/// is an upstream bug.
+MCSymbol *MSP430MCInstLower::
+GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  switch (MO.getTargetFlags()) {
+  // llvm_unreachable (not assert(0)) for consistency with the sibling
+  // Get*Symbol helpers, and so release (NDEBUG) builds do not silently
+  // accept an unknown flag.
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+/// GetJumpTableSymbol - Return (creating it on first use) the private label
+/// for a jump-table operand, named "<private-prefix>JTI<function#>_<index>".
+MCSymbol *MSP430MCInstLower::
+GetJumpTableSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "JTI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+/// GetConstantPoolIndexSymbol - Return (creating it on first use) the private
+/// label for a constant-pool operand, named
+/// "<private-prefix>CPI<function#>_<index>".
+MCSymbol *MSP430MCInstLower::
+GetConstantPoolIndexSymbol(const MachineOperand &MO) const {
+  SmallString<256> Name;
+  raw_svector_ostream(Name) << Printer.MAI->getPrivateGlobalPrefix() << "CPI"
+                            << Printer.getFunctionNumber() << '_'
+                            << MO.getIndex();
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Create a symbol for the name.
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+/// LowerSymbolOperand - Wrap the given symbol in an MCSymbolRefExpr, folding
+/// in the operand's constant offset (skipped for jump-table operands, which
+/// carry no offset of their own).
+MCOperand MSP430MCInstLower::
+LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case 0: break;
+  }
+
+  // Fold a non-zero offset into the expression: Sym + Offset.
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+/// Lower - Translate an MSP430 MachineInstr to an MCInst, lowering each
+/// operand to its MC form.  Implicit register operands are dropped.
+void MSP430MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      // llvm_unreachable, not assert(0): under NDEBUG a plain assert
+      // compiles away and control would fall through into MO_Register,
+      // emitting an uninitialized operand for an unhandled kind.
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                         MO.getMBB()->getSymbol(Printer.OutContext), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, GetJumpTableSymbol(MO));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, GetConstantPoolIndexSymbol(MO));
+      break;
+    }
+
+    OutMI.addOperand(MCOp);
+  }
+}
diff --git a/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.h b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.h
new file mode 100644
index 0000000..a2b99ae
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/MSP430MCInstLower.h
@@ -0,0 +1,49 @@
+//===-- MSP430MCInstLower.h - Lower MachineInstr to MCInst ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430_MCINSTLOWER_H
+#define MSP430_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class AsmPrinter;
+  class MCAsmInfo;
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+  class Mangler;
+
+  /// MSP430MCInstLower - This class is used to lower an MachineInstr
+  /// into an MCInst.  All references are non-owning borrows from the
+  /// AsmPrinter that constructs it.
+class VISIBILITY_HIDDEN MSP430MCInstLower {
+  MCContext &Ctx;       // MC object context: symbols and expressions.
+  Mangler &Mang;        // NOTE(review): stored but not yet referenced by
+                        // the .cpp — presumably for future symbol mangling.
+
+  AsmPrinter &Printer;  // Owning printer; supplies symbols and counters.
+public:
+  MSP430MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer)
+    : Ctx(ctx), Mang(mang), Printer(printer) {}
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+
+  MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetJumpTableSymbol(const MachineOperand &MO) const;
+  MCSymbol *GetConstantPoolIndexSymbol(const MachineOperand &MO) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/MSP430/AsmPrinter/Makefile b/lib/Target/MSP430/AsmPrinter/Makefile
new file mode 100644
index 0000000..4f340c6
--- /dev/null
+++ b/lib/Target/MSP430/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/MSP430/AsmPrinter/Makefile ---------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMSP430AsmPrinter
+
+# Hack: we need to include 'main' MSP430 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/MSP430/CMakeLists.txt b/lib/Target/MSP430/CMakeLists.txt
new file mode 100644
index 0000000..29abe46
--- /dev/null
+++ b/lib/Target/MSP430/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(LLVM_TARGET_DEFINITIONS MSP430.td)
+
+tablegen(MSP430GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(MSP430GenRegisterNames.inc -gen-register-enums)
+tablegen(MSP430GenRegisterInfo.inc -gen-register-desc)
+tablegen(MSP430GenInstrNames.inc -gen-instr-enums)
+tablegen(MSP430GenInstrInfo.inc -gen-instr-desc)
+tablegen(MSP430GenAsmWriter.inc -gen-asm-writer)
+tablegen(MSP430GenDAGISel.inc -gen-dag-isel)
+tablegen(MSP430GenCallingConv.inc -gen-callingconv)
+tablegen(MSP430GenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(MSP430CodeGen
+  MSP430BranchSelector.cpp
+  MSP430ISelDAGToDAG.cpp
+  MSP430ISelLowering.cpp
+  MSP430InstrInfo.cpp
+  MSP430MCAsmInfo.cpp
+  MSP430RegisterInfo.cpp
+  MSP430Subtarget.cpp
+  MSP430TargetMachine.cpp
+  )
+
+target_link_libraries (LLVMMSP430CodeGen LLVMSelectionDAG)
diff --git a/lib/Target/MSP430/MSP430.h b/lib/Target/MSP430/MSP430.h
new file mode 100644
index 0000000..e742118
--- /dev/null
+++ b/lib/Target/MSP430/MSP430.h
@@ -0,0 +1,55 @@
+//==-- MSP430.h - Top-level interface for MSP430 representation --*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM MSP430 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_H
+#define LLVM_TARGET_MSP430_H
+
+#include "llvm/Target/TargetMachine.h"
+
+// MSP430CC - MSP430-specific condition codes, carried as the immediate
+// predicate operand of conditional branches (printed by printCCOperand).
+namespace MSP430CC {
+  // MSP430 specific condition code.
+  enum CondCodes {
+    COND_E  = 0,  // aka COND_Z  (equal / zero)
+    COND_NE = 1,  // aka COND_NZ (not equal / nonzero)
+    COND_HS = 2,  // aka COND_C  (carry set)
+    COND_LO = 3,  // aka COND_NC (carry clear)
+    COND_GE = 4,
+    COND_L  = 5,
+
+    COND_INVALID = -1
+  };
+}
+
+namespace llvm {
+  class MSP430TargetMachine;
+  class FunctionPass;
+  class formatted_raw_ostream;
+
+  FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
+                                    CodeGenOpt::Level OptLevel);
+
+  FunctionPass *createMSP430BranchSelectionPass();
+
+  extern Target TheMSP430Target;
+
+} // end namespace llvm;
+
+// Defines symbolic names for MSP430 registers.
+// This defines a mapping from register name to register number.
+#include "MSP430GenRegisterNames.inc"
+
+// Defines symbolic names for the MSP430 instructions.
+#include "MSP430GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td
new file mode 100644
index 0000000..fe533d3
--- /dev/null
+++ b/lib/Target/MSP430/MSP430.td
@@ -0,0 +1,72 @@
+//===- MSP430.td - Describe the MSP430 Target Machine ---------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the MSP430 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features. 
+//===----------------------------------------------------------------------===//
+def FeatureX
+ : SubtargetFeature<"ext", "ExtendedInsts", "true",
+                    "Enable MSP430-X extensions">;
+
+//===----------------------------------------------------------------------===//
+// MSP430 supported processors.
+//===----------------------------------------------------------------------===//
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "MSP430CallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrInfo.td"
+
+def MSP430InstrInfo : InstrInfo {
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the MSP430InstrInfo.h file.
+  let TSFlagsFields = ["FormBits",
+                       "Size"];
+  let TSFlagsShifts = [0,
+                       2];
+}
+
+def MSP430InstPrinter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+}
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+def MSP430 : Target {
+  let InstructionSet = MSP430InstrInfo;
+  let AssemblyWriters = [MSP430InstPrinter];
+}
+
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
new file mode 100644
index 0000000..836e425
--- /dev/null
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -0,0 +1,179 @@
+//===-- MSP430BranchSelector.cpp - Emit long conditional branches--*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 10 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch psuedo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-branch-select"
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+  /// MSP430BSel - Machine function pass that rewrites conditional branches
+  /// whose targets are out of the short branch's displacement range into an
+  /// inverted short branch over an unconditional long branch.
+  struct MSP430BSel : public MachineFunctionPass {
+    static char ID; // Pass identification.
+    MSP430BSel() : MachineFunctionPass(&ID) {}
+
+    /// BlockSizes - The sizes of the basic blocks in the function.
+    std::vector<unsigned> BlockSizes;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "MSP430 Branch Selector";
+    }
+  };
+  char MSP430BSel::ID = 0;
+}
+
+/// createMSP430BranchSelectionPass - returns an instance of the Branch
+/// Selection Pass.  Intended to run last, just before the assembly printer
+/// (see the file header), since it depends on final instruction sizes.
+///
+FunctionPass *llvm::createMSP430BranchSelectionPass() {
+  return new MSP430BSel();
+}
+
+/// runOnMachineFunction - Measure every basic block, then iteratively expand
+/// any JCC/JMP whose displacement does not fit the 10-bit branch field into
+/// the long form documented below, until a fixed point is reached.
+bool MSP430BSel::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = 0;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    unsigned BlockSize = 0;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI)
+      BlockSize += TII->GetInstSizeInBytes(MBBI);
+
+    BlockSizes[MBB->getNumber()] = BlockSize;
+    FuncSize += BlockSize;
+  }
+
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 9)) {
+    BlockSizes.clear();
+    return false;
+  }
+
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+6
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    MadeChange = false;
+
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        // Only direct JCC/JMP branches (MBB operand) are candidates; skip
+        // everything else, accumulating its size into the block offset.
+        if ((I->getOpcode() != MSP430::JCC || I->getOperand(0).isImm()) &&
+            I->getOpcode() != MSP430::JMP) {
+          MBBStartOffset += TII->GetInstSizeInBytes(I);
+          continue;
+        }
+
+        // Determine the offset from the current branch to the destination
+        // block.
+        MachineBasicBlock *Dest = I->getOperand(0).getMBB();
+
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt<10>(BranchSize)) {
+          MBBStartOffset += 2; // In-range branches stay in short 2-byte form.
+          continue;
+        }
+
+        // Otherwise, we have to expand it to a long branch.
+        unsigned NewSize;
+        MachineInstr *OldBranch = I;
+        DebugLoc dl = OldBranch->getDebugLoc();
+
+        if (I->getOpcode() == MSP430::JMP) {
+          NewSize = 4;
+        } else {
+          // The BCC operands are:
+          // 0. MSP430 branch predicate
+          // 1. Target MBB
+          SmallVector<MachineOperand, 1> Cond;
+          Cond.push_back(I->getOperand(1));
+
+          // Jump over the uncond branch inst (i.e. $+6) on opposite condition.
+          TII->ReverseBranchCondition(Cond);
+          BuildMI(MBB, I, dl, TII->get(MSP430::JCC))
+            .addImm(4).addOperand(Cond[0]);
+
+          NewSize = 6;
+        }
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(MSP430::B)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch->eraseFromParent();
+
+        // Remember that this instruction is NewSize bytes, increase the size of the
+        // block by NewSize-2, remember to iterate.
+        BlockSizes[MBB.getNumber()] += NewSize-2;
+        MBBStartOffset += NewSize;
+
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+    EverMadeChange |= MadeChange;
+  }
+
+  BlockSizes.clear();
+  // NOTE(review): conservatively reports modification even if no branch was
+  // expanded once the size threshold above is exceeded.
+  return true;
+}
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
new file mode 100644
index 0000000..ad27cc9
--- /dev/null
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -0,0 +1,37 @@
+//==- MSP430CallingConv.td - Calling Conventions for MSP430 -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for MSP430 architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MSP430 Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_MSP430 : CallingConv<[
+  // i8 are returned in registers R15B, R14B, R13B, R12B
+  CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+
+  // i16 are returned in registers R15, R14, R13, R12
+  CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_MSP430 : CallingConv<[
+  // Promote i8 arguments to i16.
+  CCIfType<[i8], CCPromoteToType<i16>>,
+
+  // The first 4 integer arguments of non-varargs functions are passed in
+  // integer registers.
+  CCIfNotVarArg<CCIfType<[i16], CCAssignToReg<[R15W, R14W, R13W, R12W]>>>,
+
+  // Integer values get stored in stack slots that are 2 bytes in
+  // size and 2-byte aligned.
+  CCIfType<[i16], CCAssignToStack<2, 2>>
+]>;
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
new file mode 100644
index 0000000..4eec757
--- /dev/null
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -0,0 +1,821 @@
+//===-- MSP430ISelDAGToDAG.cpp - A dag to dag inst selector for MSP430 ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+#ifndef NDEBUG
+static cl::opt<bool>
+ViewRMWDAGs("view-msp430-rmw-dags", cl::Hidden,
+          cl::desc("Pop up a window to show isel dags after RMW preprocess"));
+#else
+static const bool ViewRMWDAGs = false;
+#endif
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+
+namespace {
+  /// MSP430ISelAddressMode - Accumulates the pieces of an MSP430 address
+  /// (base register or frame index, 16-bit displacement, and at most one
+  /// symbolic component) while MatchAddress walks the DAG.
+  struct MSP430ISelAddressMode {
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDValue Reg;
+      int FrameIndex;
+    } Base;
+
+    int16_t Disp;       // Signed 16-bit displacement.
+    GlobalValue *GV;    // At most one of GV/CP/BlockAddr/ES/JT is set.
+    Constant *CP;
+    BlockAddress *BlockAddr;
+    const char *ES;
+    int JT;
+    unsigned Align;    // CP alignment.
+
+    MSP430ISelAddressMode()
+      : BaseType(RegBase), Disp(0), GV(0), CP(0), BlockAddr(0),
+        ES(0), JT(-1), Align(0) {
+    }
+
+    // True if any symbolic component has already been captured.
+    bool hasSymbolicDisplacement() const {
+      return GV != 0 || CP != 0 || ES != 0 || JT != -1;
+    }
+
+    bool hasBaseReg() const {
+      return Base.Reg.getNode() != 0;
+    }
+
+    void setBaseReg(SDValue Reg) {
+      BaseType = RegBase;
+      Base.Reg = Reg;
+    }
+
+    // Debug dump of the accumulated state.
+    void dump() {
+      errs() << "MSP430ISelAddressMode " << this << '\n';
+      if (BaseType == RegBase && Base.Reg.getNode() != 0) {
+        errs() << "Base.Reg ";
+        Base.Reg.getNode()->dump();
+      } else if (BaseType == FrameIndexBase) {
+        errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
+      }
+      errs() << " Disp " << Disp << '\n';
+      if (GV) {
+        errs() << "GV ";
+        GV->dump();
+      } else if (CP) {
+        errs() << " CP ";
+        CP->dump();
+        errs() << " Align" << Align << '\n';
+      } else if (ES) {
+        errs() << "ES ";
+        errs() << ES << '\n';
+      } else if (JT != -1)
+        errs() << " JT" << JT << " Align" << Align << '\n';
+    }
+  };
+}
+
+/// MSP430DAGToDAGISel - MSP430 specific code to select MSP430 machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class MSP430DAGToDAGISel : public SelectionDAGISel {
+    MSP430TargetLowering &Lowering;
+    const MSP430Subtarget &Subtarget;
+
+  public:
+    MSP430DAGToDAGISel(MSP430TargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel),
+        Lowering(*TM.getTargetLowering()),
+        Subtarget(*TM.getSubtargetImpl()) { }
+
+    virtual void InstructionSelect();
+
+    virtual const char *getPassName() const {
+      return "MSP430 DAG->DAG Pattern Instruction Selection";
+    }
+
+    // Addressing-mode matching helpers; each returns true on FAILURE to
+    // match (see MatchWrapper's comment).
+    bool MatchAddress(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM);
+    bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM);
+
+    bool IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+                                    SDNode *Root) const;
+
+    virtual bool
+    SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                                 std::vector<SDValue> &OutOps);
+
+    // Include the pieces autogenerated from the target description.
+  #include "MSP430GenDAGISel.inc"
+
+  private:
+    // Maps load nodes to the store they can be fused with (RMW patterns).
+    DenseMap<SDNode*, SDNode*> RMWStores;
+    void PreprocessForRMW();
+    SDNode *Select(SDNode *N);
+    SDNode *SelectIndexedLoad(SDNode *Op);
+    SDNode *SelectIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
+                               unsigned Opc8, unsigned Opc16);
+
+    bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Disp);
+
+  #ifndef NDEBUG
+    unsigned Indent;
+  #endif
+  };
+}  // end anonymous namespace
+
+/// createMSP430ISelDag - This pass converts a legalized DAG into a
+/// MSP430-specific DAG, ready for instruction scheduling.  OptLevel is
+/// forwarded to the common SelectionDAGISel infrastructure.
+///
+FunctionPass *llvm::createMSP430ISelDag(MSP430TargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  return new MSP430DAGToDAGISel(TM, OptLevel);
+}
+
+
+/// MatchWrapper - Try to match MSP430ISD::Wrapper node into an addressing mode.
+/// These wrap things that will resolve down into a symbol reference.  If no
+/// match is possible, this returns true, otherwise it returns false.
+/// Symbol target flags are not modeled yet — see the commented-out
+/// SymbolFlags assignments below.
+bool MSP430DAGToDAGISel::MatchWrapper(SDValue N, MSP430ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+    AM.GV = G->getGlobal();
+    AM.Disp += G->getOffset();
+    //AM.SymbolFlags = G->getTargetFlags();
+  } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+    AM.CP = CP->getConstVal();
+    AM.Align = CP->getAlignment();
+    AM.Disp += CP->getOffset();
+    //AM.SymbolFlags = CP->getTargetFlags();
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+    AM.ES = S->getSymbol();
+    //AM.SymbolFlags = S->getTargetFlags();
+  } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+    AM.JT = J->getIndex();
+    //AM.SymbolFlags = J->getTargetFlags();
+  } else {
+    // Last remaining wrapped kind: a block address.
+    AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+    //AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+  }
+  return false;
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+/// Returns true (failure) if the mode's base slot is already taken.
+bool MSP430DAGToDAGISel::MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM) {
+  // Is the base register already occupied?  Either a frame index was picked
+  // as the base, or a base register has already been captured.
+  if (AM.BaseType != MSP430ISelAddressMode::RegBase || AM.Base.Reg.getNode()) {
+    // If so, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = MSP430ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
+
+/// MatchAddress - Recursively fold N into the addressing mode AM.
+/// Returns false on success (AM updated) and true on failure; the break
+/// paths below fall through to MatchAddressBase, which tries to consume N
+/// as a plain base register.
+bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
+  DEBUG({
+      errs() << "MatchAddress: ";
+      AM.dump();
+    });
+
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    // Immediate operands simply accumulate into the displacement.
+    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    AM.Disp += Val;
+    return false;
+  }
+
+  case MSP430ISD::Wrapper:
+    if (!MatchWrapper(N, AM))
+      return false;
+    break;
+
+  case ISD::FrameIndex:
+    // A frame index can serve as the base if the base slot is still free.
+    if (AM.BaseType == MSP430ISelAddressMode::RegBase
+        && AM.Base.Reg.getNode() == 0) {
+      AM.BaseType = MSP430ISelAddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::ADD: {
+    // Try folding both operands, in both orders; each attempt starts from a
+    // clean copy of AM so a partial match cannot leak into the next try.
+    MSP430ISelAddressMode Backup = AM;
+    if (!MatchAddress(N.getNode()->getOperand(0), AM) &&
+        !MatchAddress(N.getNode()->getOperand(1), AM))
+      return false;
+    AM = Backup;
+    if (!MatchAddress(N.getNode()->getOperand(1), AM) &&
+        !MatchAddress(N.getNode()->getOperand(0), AM))
+      return false;
+    AM = Backup;
+
+    // Neither order matched; fall through and treat the whole ADD as a base.
+    break;
+  }
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      MSP430ISelAddressMode Backup = AM;
+      uint64_t Offset = CN->getSExtValue();
+      // Start with the LHS as an addr mode.
+      if (!MatchAddress(N.getOperand(0), AM) &&
+          // Address could not have picked a GV address for the displacement.
+          AM.GV == NULL &&
+          // Check to see if the LHS & C is zero.
+          CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+        AM.Disp += Offset;
+        return false;
+      }
+      AM = Backup;
+    }
+    break;
+  }
+
+  return MatchAddressBase(N, AM);
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+bool MSP430DAGToDAGISel::SelectAddr(SDNode *Op, SDValue N,
+                                    SDValue &Base, SDValue &Disp) {
+  MSP430ISelAddressMode AM;
+
+  // Note the inverted convention: MatchAddress returns true on FAILURE,
+  // so a true result here means no addressing mode could be formed.
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == MSP430ISelAddressMode::RegBase) {
+    // No base register was matched; use the zero register as a placeholder.
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  Base  = (AM.BaseType == MSP430ISelAddressMode::FrameIndexBase) ?
+    CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+    AM.Base.Reg;
+
+  // Emit the displacement as whichever symbolic operand was captured, or as
+  // a plain constant when the mode is purely register+offset.
+  if (AM.GV)
+    Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i16, AM.Disp,
+                                          0/*AM.SymbolFlags*/);
+  else if (AM.CP)
+    Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16,
+                                         AM.Align, AM.Disp, 0/*AM.SymbolFlags*/);
+  else if (AM.ES)
+    Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.JT != -1)
+    Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i16, 0/*AM.SymbolFlags*/);
+  else if (AM.BlockAddr)
+    Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
+                                   true, 0/*AM.SymbolFlags*/);
+  else
+    Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i16);
+
+  return true;
+}
+
+/// SelectInlineAsmMemoryOperand - Lower an inline-asm memory constraint to
+/// the Base/Disp operand pair produced by SelectAddr.  Per the
+/// SelectionDAGISel convention, returns false on success, true on failure.
+bool MSP430DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+    if (!SelectAddr(Op.getNode(), Op, Op0, Op1))
+      return true;
+    break;
+  }
+
+  // Base register/frame index first, then displacement.
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+/// IsLegalAndProfitableToFold - Returns true if N may be folded into the user
+/// U rooted at Root.  Folding is disabled entirely at -O0; otherwise the
+/// RMWStores map is consulted before deferring to the generic cycle check.
+bool MSP430DAGToDAGISel::IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+                                                    SDNode *Root) const {
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  /// RMW preprocessing creates the following code:
+  ///         [Load1]
+  ///         ^     ^
+  ///        /      |
+  ///       /       |
+  ///       [Load2] |
+  ///       ^    ^  |
+  ///       |    |  |
+  ///       |     \-|
+  ///       |       |
+  ///       |     [Op]
+  ///       |       ^
+  ///       |       |
+  ///       \      /
+  ///        \    /
+  ///       [Store]
+  ///
+  /// The path Store => Load2 => Load1 is via chain. Note that in general it is
+  /// not allowed to fold Load1 into Op (and Store) since it will creates a
+  /// cycle. However, this is perfectly legal for the loads moved below the
+  /// TokenFactor by PreprocessForRMW. Query the map Store => Load1 (created
+  /// during preprocessing) to determine whether it's legal to introduce such
+  /// "cycle" for a moment.
+  DenseMap<SDNode*, SDNode*>::const_iterator I = RMWStores.find(Root);
+  if (I != RMWStores.end() && I->second == N)
+    return true;
+
+  // Proceed to 'generic' cycle finder code
+  return SelectionDAGISel::IsLegalAndProfitableToFold(N, U, Root);
+}
+
+
+/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result.
+static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
+                                 SDValue Store, SDValue TF) {
+  // Rebuild the TokenFactor operand list, substituting the load with the
+  // load's own chain input (so the TF no longer depends on the load).
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
+    if (Load.getNode() == TF.getOperand(i).getNode())
+      Ops.push_back(Load.getOperand(0));
+    else
+      Ops.push_back(TF.getOperand(i));
+  SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
+  // Re-chain the load off the TokenFactor...
+  SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF,
+                                               Load.getOperand(1),
+                                               Load.getOperand(2));
+  // ...and the store off the load's output chain (value #1).
+  CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1),
+                             Store.getOperand(2), Store.getOperand(3));
+}
+
+/// MoveBelowTokenFactor2 - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result. This a version which sinks two loads below token factor.
+/// Look into PreprocessForRMW comments for explanation of transform.
+static void MoveBelowTokenFactor2(SelectionDAG *CurDAG,
+                                  SDValue Load1, SDValue Load2,
+                                  SDValue Store, SDValue TF) {
+  // Rebuild the TokenFactor operands: Load2 is replaced by its own chain
+  // input, and Load1 is dropped entirely (it will be chained via Load2).
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i) {
+    SDNode* N = TF.getOperand(i).getNode();
+    if (Load2.getNode() == N)
+      Ops.push_back(Load2.getOperand(0));
+    else if (Load1.getNode() != N)
+      Ops.push_back(TF.getOperand(i));
+  }
+
+  // The operand count may shrink, so morph the node rather than just
+  // updating operands in place.
+  SDValue NewTF = SDValue(CurDAG->MorphNodeTo(TF.getNode(),
+                                  TF.getOpcode(),
+                                  TF.getNode()->getVTList(),
+                                  &Ops[0], Ops.size()), TF.getResNo());
+  // Chain order after the transform: TF -> Load2 -> Load1 -> Store.
+  SDValue NewLoad2 = CurDAG->UpdateNodeOperands(Load2, NewTF,
+                                                Load2.getOperand(1),
+                                                Load2.getOperand(2));
+
+  SDValue NewLoad1 = CurDAG->UpdateNodeOperands(Load1, NewLoad2.getValue(1),
+                                                Load1.getOperand(1),
+                                                Load1.getOperand(2));
+
+  CurDAG->UpdateNodeOperands(Store,
+                             NewLoad1.getValue(1),
+                             Store.getOperand(1),
+                             Store.getOperand(2), Store.getOperand(3));
+}
+
+/// isLoadAllowedToSink - return true if N a load which can be moved below token
+/// factor. Basically, the load should be non-volatile and has single use.
+static bool isLoadAllowedToSink(SDValue N, SDValue Chain) {
+  // Look through a bitconvert to the underlying load.
+  if (N.getOpcode() == ISD::BIT_CONVERT)
+    N = N.getOperand(0);
+
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+  if (!LD || LD->isVolatile())
+    return false;
+  // Pre/post-indexed loads already produce an address result; don't touch.
+  if (LD->getAddressingMode() != ISD::UNINDEXED)
+    return false;
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
+    return false;
+
+  // The loaded value must have a single use, the load's chain result
+  // (value #1) must have exactly one use, and that chain must feed the
+  // TokenFactor (Chain) we intend to sink the load below.
+  return (N.hasOneUse() &&
+          LD->hasNUsesOfValue(1, 1) &&
+          LD->isOperandOf(Chain.getNode()));
+}
+
+
+/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG.
+/// The chain produced by the load must only be used by the store's chain
+/// operand, otherwise this may produce a cycle in the DAG.  On success the
+/// matched load is remembered in Load.
+static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
+                      SDValue &Load) {
+  // The load must be sinkable below the token factor...
+  if (!isLoadAllowedToSink(N, Chain))
+    return false;
+  // ...and must read from exactly the address the store writes to.
+  if (N.getOperand(1) != Address)
+    return false;
+
+  Load = N;
+  return true;
+}
+
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// This is only run if not in -O0 mode.
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
+///
+///     [Load chain]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///      /      \-
+///     /         |
+/// [TokenFactor] [Op]
+///     ^          ^
+///     |          |
+///      \        /
+///       \      /
+///       [Store]
+///
+/// The fact the store's chain operand != load's chain will prevent the
+/// (store (op (load))) instruction from being selected. We can transform it to:
+///
+///     [Load chain]
+///         ^
+///         |
+///    [TokenFactor]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///       |     \-
+///       |       |
+///       |     [Op]
+///       |       ^
+///       |       |
+///       \      /
+///        \    /
+///       [Store]
+///
+/// We also recognize the case where second operand of Op is load as well and
+/// move it below token factor as well creating DAG as follows:
+///
+///       [Load chain]
+///            ^
+///            |
+///      [TokenFactor]
+///            ^
+///            |
+///         [Load1]
+///         ^     ^
+///        /      |
+///       /       |
+///       [Load2] |
+///       ^    ^  |
+///       |    |  |
+///       |     \-|
+///       |       |
+///       |     [Op]
+///       |       ^
+///       |       |
+///       \      /
+///        \    /
+///       [Store]
+///
+/// This allows selection of mem-mem instructions. Yay!
+
+/// PreprocessForRMW - walk every node in the DAG looking for a non-truncating
+/// store whose chain is a TokenFactor and whose stored value is a binop over
+/// a load of the same address; sink such loads below the TokenFactor so the
+/// (store (op (load))) pattern becomes selectable (see diagram above).
+void MSP430DAGToDAGISel::PreprocessForRMW() {
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+         E = CurDAG->allnodes_end(); I != E; ++I) {
+    if (!ISD::isNON_TRUNCStore(I))
+      continue;
+    SDValue Chain = I->getOperand(0);
+
+    // Only the TokenFactor shape drawn above is handled.
+    if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
+      continue;
+
+    // N1 = stored value, N2 = store address.
+    SDValue N1 = I->getOperand(1);
+    SDValue N2 = I->getOperand(2);
+    if ((N1.getValueType().isFloatingPoint() &&
+         !N1.getValueType().isVector()) ||
+        !N1.hasOneUse())
+      continue;
+
+    // RModW: 0 = no match, 1 = one load to sink, 2 = two loads to sink.
+    unsigned RModW = 0;
+    SDValue Load1, Load2;
+    unsigned Opcode = N1.getNode()->getOpcode();
+    switch (Opcode) {
+    // Commutative ops: the RMW load may be either operand.
+    case ISD::ADD:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::ADDC:
+    case ISD::ADDE: {
+      SDValue N10 = N1.getOperand(0);
+      SDValue N11 = N1.getOperand(1);
+      if (isRMWLoad(N10, Chain, N2, Load1)) {
+        if (isLoadAllowedToSink(N11, Chain)) {
+          Load2 = N11;
+          RModW = 2;
+        } else
+          RModW = 1;
+      } else if (isRMWLoad(N11, Chain, N2, Load1)) {
+        if (isLoadAllowedToSink(N10, Chain)) {
+          Load2 = N10;
+          RModW = 2;
+        } else
+          RModW = 1;
+      }
+      break;
+    }
+    // Subtraction is not commutative: only the LHS may be the RMW load.
+    case ISD::SUB:
+    case ISD::SUBC:
+    case ISD::SUBE: {
+      SDValue N10 = N1.getOperand(0);
+      SDValue N11 = N1.getOperand(1);
+      if (isRMWLoad(N10, Chain, N2, Load1)) {
+        if (isLoadAllowedToSink(N11, Chain)) {
+          Load2 = N11;
+          RModW = 2;
+        } else
+          RModW = 1;
+      }
+      break;
+    }
+    }
+
+    NumLoadMoved += RModW;
+    if (RModW == 1)
+      MoveBelowTokenFactor(CurDAG, Load1, SDValue(I, 0), Chain);
+    else if (RModW == 2) {
+      MoveBelowTokenFactor2(CurDAG, Load1, Load2, SDValue(I, 0), Chain);
+      // Remember Store => Load2 so IsLegalAndProfitableToFold can allow the
+      // apparent chain "cycle" this transform introduces.
+      SDNode* Store = I;
+      RMWStores[Store] = Load2.getNode();
+    }
+  }
+}
+
+
+/// isValidIndexedLoad - Return true if LD is a non-extending post-increment
+/// load of i8 or i16 whose increment matches the width of the access
+/// (1 byte or 2 bytes respectively).
+static bool isValidIndexedLoad(const LoadSDNode *LD) {
+  // Only plain (non-extending) post-incremented loads are selectable.
+  if (LD->getAddressingMode() != ISD::POST_INC ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  switch (LD->getMemoryVT().getSimpleVT().SimpleTy) {
+  case MVT::i8:
+    // Sanity check: a byte load must bump the pointer by one.
+    return cast<ConstantSDNode>(LD->getOffset())->getZExtValue() == 1;
+  case MVT::i16:
+    // Sanity check: a word load must bump the pointer by two.
+    return cast<ConstantSDNode>(LD->getOffset())->getZExtValue() == 2;
+  default:
+    return false;
+  }
+}
+
+/// SelectIndexedLoad - Select a post-incremented load into a MOV8rm_POST /
+/// MOV16rm_POST machine node; returns NULL if N is not a valid indexed load.
+SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (!isValidIndexedLoad(LD))
+    return NULL;
+
+  MVT VT = LD->getMemoryVT().getSimpleVT();
+
+  unsigned Opcode = 0;
+  switch (VT.SimpleTy) {
+  case MVT::i8:
+    Opcode = MSP430::MOV8rm_POST;
+    break;
+  case MVT::i16:
+    Opcode = MSP430::MOV16rm_POST;
+    break;
+  default:
+    return NULL;
+  }
+
+   // Results: loaded value (VT), incremented pointer (i16), chain.
+   return CurDAG->getMachineNode(Opcode, N->getDebugLoc(),
+                                 VT, MVT::i16, MVT::Other,
+                                 LD->getBasePtr(), LD->getChain());
+}
+
+/// SelectIndexedBinOp - Try to fold a post-incremented load operand N1 into
+/// a binary op, producing an opcode from {Opc8, Opc16} chosen by the load's
+/// memory width.  N2 is the other (register) operand.  Returns the selected
+/// machine node, or NULL if the fold does not apply.
+SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
+                                               SDValue N1, SDValue N2,
+                                               unsigned Opc8, unsigned Opc16) {
+  if (N1.getOpcode() == ISD::LOAD &&
+      N1.hasOneUse() &&
+      IsLegalAndProfitableToFold(N1.getNode(), Op, Op)) {
+    LoadSDNode *LD = cast<LoadSDNode>(N1);
+    if (!isValidIndexedLoad(LD))
+      return NULL;
+
+    MVT VT = LD->getMemoryVT().getSimpleVT();
+    unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8);
+    // Carry the load's memory operand over to the new machine node.
+    MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
+    MemRefs0[0] = cast<MemSDNode>(N1)->getMemOperand();
+    SDValue Ops0[] = { N2, LD->getBasePtr(), LD->getChain() };
+    SDNode *ResNode =
+      CurDAG->SelectNodeTo(Op, Opc,
+                           VT, MVT::i16, MVT::Other,
+                           Ops0, 3);
+    cast<MachineSDNode>(ResNode)->setMemRefs(MemRefs0, MemRefs0 + 1);
+    // Transfer chain.
+    ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2));
+    // Transfer writeback.
+    ReplaceUses(SDValue(N1.getNode(), 1), SDValue(ResNode, 1));
+    return ResNode;
+  }
+
+  return NULL;
+}
+
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+/// Runs the RMW preprocessing pass, then selects the whole DAG via
+/// SelectRoot, and finally clears the per-block RMWStores map.
+void MSP430DAGToDAGISel::InstructionSelect() {
+  std::string BlockName;
+  if (ViewRMWDAGs)
+    BlockName = MF->getFunction()->getNameStr() + ":" +
+                BB->getBasicBlock()->getNameStr();
+
+  PreprocessForRMW();
+
+  if (ViewRMWDAGs) CurDAG->viewGraph("RMW preprocessed:" + BlockName);
+
+  DEBUG(errs() << "Selection DAG after RMW preprocessing:\n");
+  DEBUG(CurDAG->dump());
+
+  // Codegen the basic block.
+  DEBUG(errs() << "===== Instruction selection begins:\n");
+  DEBUG(Indent = 0);
+  SelectRoot(*CurDAG);
+  DEBUG(errs() << "===== Instruction selection ends:\n");
+
+  CurDAG->RemoveDeadNodes();
+  // RMWStores only makes sense for the block just selected; reset it.
+  RMWStores.clear();
+}
+
+/// Select - Main instruction-selection hook.  Handles a few custom cases
+/// (frame indices and post-increment load folds for binary ops) and defers
+/// everything else to the TableGen-generated SelectCode.  Returns NULL when
+/// the node is already a machine node or is selected in place.
+SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
+  DebugLoc dl = Node->getDebugLoc();
+
+  // Dump information about the Node being selected
+  DEBUG(errs().indent(Indent) << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
+  DEBUG(Indent += 2);
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs().indent(Indent-2) << "== ";
+          Node->dump(CurDAG);
+          errs() << "\n");
+    DEBUG(Indent -= 2);
+    return NULL;
+  }
+
+  // Few custom selection stuff.
+  switch (Node->getOpcode()) {
+  default: break;
+  case ISD::FrameIndex: {
+    // Materialize a frame index as "ADD16ri FI, 0".
+    assert(Node->getValueType(0) == MVT::i16);
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16);
+    if (Node->hasOneUse())
+      return CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16,
+                                  TFI, CurDAG->getTargetConstant(0, MVT::i16));
+    return CurDAG->getMachineNode(MSP430::ADD16ri, dl, MVT::i16,
+                                  TFI, CurDAG->getTargetConstant(0, MVT::i16));
+  }
+  case ISD::LOAD:
+    if (SDNode *ResNode = SelectIndexedLoad(Node))
+      return ResNode;
+    // Other cases are autogenerated.
+    break;
+  // For commutative binops (ADD/AND/OR/XOR) both operand orders are tried;
+  // SUB only tries the LHS since the operation is not commutative.
+  case ISD::ADD:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::ADD8rm_POST, MSP430::ADD16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::SUB:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::SUB8rm_POST, MSP430::SUB16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::AND:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::AND8rm_POST, MSP430::AND16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::OR:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::OR8rm_POST, MSP430::OR16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  case ISD::XOR:
+    if (SDNode *ResNode =
+        SelectIndexedBinOp(Node,
+                           Node->getOperand(0), Node->getOperand(1),
+                           MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return ResNode;
+    else if (SDNode *ResNode =
+             SelectIndexedBinOp(Node, Node->getOperand(1), Node->getOperand(0),
+                                MSP430::XOR8rm_POST, MSP430::XOR16rm_POST))
+      return ResNode;
+
+    // Other cases are autogenerated.
+    break;
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(errs() << std::string(Indent-2, ' ') << "=> ");
+  if (ResNode == NULL || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
+  DEBUG(Indent -= 2);
+
+  return ResNode;
+}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
new file mode 100644
index 0000000..ef81f51
--- /dev/null
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -0,0 +1,1183 @@
+//===-- MSP430ISelLowering.cpp - MSP430 DAG Lowering Implementation  ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-lower"
+
+#include "MSP430ISelLowering.h"
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "MSP430Subtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/VectorExtras.h"
+using namespace llvm;
+
+// HWMultUseMode - selects how the hardware multiplier libcalls are named
+// (see the HWMultMode option below for the user-visible descriptions).
+typedef enum {
+  NoHWMult,      // Do not use the hardware multiplier.
+  HWMultIntr,    // Hardware multiplier usable inside interrupts.
+  HWMultNoIntr   // Hardware multiplier not usable inside interrupts.
+} HWMultUseMode;
+
+// Command-line switch controlling which multiply libcalls are emitted;
+// consulted once in the MSP430TargetLowering constructor below.
+static cl::opt<HWMultUseMode>
+HWMultMode("msp430-hwmult-mode",
+           cl::desc("Hardware multiplier use mode"),
+           cl::init(HWMultNoIntr),
+           cl::values(
+             clEnumValN(NoHWMult, "no",
+                "Do not use hardware multiplier"),
+             clEnumValN(HWMultIntr, "interrupts",
+                "Assume hardware multiplier can be used inside interrupts"),
+             clEnumValN(HWMultNoIntr, "use",
+                "Assume hardware multiplier cannot be used inside interrupts"),
+             clEnumValEnd));
+
+/// MSP430TargetLowering constructor - registers the target's register
+/// classes and declares, for every relevant operation/type pair, whether it
+/// is Legal, Custom-lowered (see LowerOperation) or Expanded by generic code.
+MSP430TargetLowering::MSP430TargetLowering(MSP430TargetMachine &tm) :
+  TargetLowering(tm, new TargetLoweringObjectFileELF()),
+  Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+
+  TD = getTargetData();
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8,  MSP430::GR8RegisterClass);
+  addRegisterClass(MVT::i16, MSP430::GR16RegisterClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Provide all sorts of operation actions
+
+  // Division is expensive
+  setIntDivIsCheap(false);
+
+  // Even if we have only 1 bit shift here, we can perform
+  // shifts of the whole bitwidth 1 bit per step.
+  setShiftAmountType(MVT::i8);
+
+  setStackPointerRegisterToSaveRestore(MSP430::SPW);
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setSchedulingPreference(SchedulingForLatency);
+
+  // We have post-incremented loads / stores.
+  setIndexedLoadAction(ISD::POST_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::POST_INC, MVT::i16, Legal);
+
+  // i1 extending loads are promoted; sign-extending loads are expanded.
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8,  Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Expand);
+
+  // We don't have any truncstores
+  setTruncStoreAction(MVT::i16, MVT::i8, Expand);
+
+  setOperationAction(ISD::SRA,              MVT::i8,    Custom);
+  setOperationAction(ISD::SHL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRL,              MVT::i8,    Custom);
+  setOperationAction(ISD::SRA,              MVT::i16,   Custom);
+  setOperationAction(ISD::SHL,              MVT::i16,   Custom);
+  setOperationAction(ISD::SRL,              MVT::i16,   Custom);
+  setOperationAction(ISD::ROTL,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTR,             MVT::i8,    Expand);
+  setOperationAction(ISD::ROTL,             MVT::i16,   Expand);
+  setOperationAction(ISD::ROTR,             MVT::i16,   Expand);
+  setOperationAction(ISD::GlobalAddress,    MVT::i16,   Custom);
+  setOperationAction(ISD::ExternalSymbol,   MVT::i16,   Custom);
+  setOperationAction(ISD::BR_JT,            MVT::Other, Expand);
+  setOperationAction(ISD::BRIND,            MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,            MVT::i8,    Custom);
+  setOperationAction(ISD::BR_CC,            MVT::i16,   Custom);
+  setOperationAction(ISD::BRCOND,           MVT::Other, Expand);
+  setOperationAction(ISD::SETCC,            MVT::i8,    Custom);
+  setOperationAction(ISD::SETCC,            MVT::i16,   Custom);
+  setOperationAction(ISD::SELECT,           MVT::i8,    Expand);
+  setOperationAction(ISD::SELECT,           MVT::i16,   Expand);
+  setOperationAction(ISD::SELECT_CC,        MVT::i8,    Custom);
+  setOperationAction(ISD::SELECT_CC,        MVT::i16,   Custom);
+  setOperationAction(ISD::SIGN_EXTEND,      MVT::i16,   Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
+
+  // No native bit-counting instructions.
+  setOperationAction(ISD::CTTZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTTZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i8,    Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i16,   Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i8,    Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i16,   Expand);
+
+  setOperationAction(ISD::SHL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SHL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i16,   Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i8,    Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,   Expand);
+
+  // FIXME: Implement efficiently multiplication by a constant
+  setOperationAction(ISD::MUL,              MVT::i8,    Expand);
+  setOperationAction(ISD::MULHS,            MVT::i8,    Expand);
+  setOperationAction(ISD::MULHU,            MVT::i8,    Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i8,    Expand);
+  setOperationAction(ISD::MUL,              MVT::i16,   Expand);
+  setOperationAction(ISD::MULHS,            MVT::i16,   Expand);
+  setOperationAction(ISD::MULHU,            MVT::i16,   Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i16,   Expand);
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i16,   Expand);
+
+  setOperationAction(ISD::UDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::UREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIV,             MVT::i8,    Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i8,    Expand);
+  setOperationAction(ISD::SREM,             MVT::i8,    Expand);
+  setOperationAction(ISD::UDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::UDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::UREM,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIV,             MVT::i16,   Expand);
+  setOperationAction(ISD::SDIVREM,          MVT::i16,   Expand);
+  setOperationAction(ISD::SREM,             MVT::i16,   Expand);
+
+  // Libcalls names.
+  // Pick hardware-multiplier libcall names per the -msp430-hwmult-mode
+  // option; for NoHWMult the default soft-multiply names are kept.
+  if (HWMultMode == HWMultIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw");
+  } else if (HWMultMode == HWMultNoIntr) {
+    setLibcallName(RTLIB::MUL_I8,  "__mulqi3hw_noint");
+    setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint");
+  }
+}
+
+/// LowerOperation - dispatch hook for every operation marked Custom in the
+/// constructor above; each case forwards to its dedicated lowering routine.
+SDValue MSP430TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  case ISD::SHL: // FALLTHROUGH
+  case ISD::SRL:
+  case ISD::SRA:              return LowerShifts(Op, DAG);
+  case ISD::GlobalAddress:    return LowerGlobalAddress(Op, DAG);
+  case ISD::ExternalSymbol:   return LowerExternalSymbol(Op, DAG);
+  case ISD::SETCC:            return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:            return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:        return LowerSELECT_CC(Op, DAG);
+  case ISD::SIGN_EXTEND:      return LowerSIGN_EXTEND(Op, DAG);
+  case ISD::RETURNADDR:       return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+    return SDValue();
+  }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+/// Size-optimized functions get 2-byte alignment, others 4-byte.
+unsigned MSP430TargetLowering::getFunctionAlignment(const Function *F) const {
+  return F->hasFnAttr(Attribute::OptimizeForSize) ? 1 : 2;
+}
+
+//===----------------------------------------------------------------------===//
+//                       MSP430 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+TargetLowering::ConstraintType
+MSP430TargetLowering::getConstraintType(const std::string &Constraint) const {
+  // Only the single-letter 'r' (general register) constraint is handled
+  // specially here; everything else is left to the generic implementation.
+  if (Constraint.size() == 1 && Constraint[0] == 'r')
+    return C_RegisterClass;
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Map a register-class inline-asm constraint
+/// to a concrete register class; 'r' picks GR8 or GR16 by value type.
+std::pair<unsigned, const TargetRegisterClass*>
+MSP430TargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             EVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC Constraint Letters
+    switch (Constraint[0]) {
+    default: break;
+    case 'r':   // GENERAL_REGS
+      if (VT == MVT::i8)
+        return std::make_pair(0U, MSP430::GR8RegisterClass);
+
+      return std::make_pair(0U, MSP430::GR16RegisterClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MSP430GenCallingConv.inc"
+
+/// LowerFormalArguments - lower the incoming-argument portion of a function
+/// body.  C and Fast conventions share LowerCCCArguments; MSP430 interrupt
+/// service routines may not take any arguments at all.
+SDValue
+MSP430TargetLowering::LowerFormalArguments(SDValue Chain,
+                                           CallingConv::ID CallConv,
+                                           bool isVarArg,
+                                           const SmallVectorImpl<ISD::InputArg>
+                                             &Ins,
+                                           DebugLoc dl,
+                                           SelectionDAG &DAG,
+                                           SmallVectorImpl<SDValue> &InVals) {
+
+  switch (CallConv) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
+  case CallingConv::C:
+  case CallingConv::Fast:
+    return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+  case CallingConv::MSP430_INTR:
+   if (Ins.empty())
+     return Chain;
+   else {
+    llvm_report_error("ISRs cannot have arguments");
+    return SDValue();
+   }
+  }
+}
+
+/// LowerCall - Lower an outgoing call.  Only the C and Fast calling
+/// conventions are supported; calling an ISR directly is rejected.
+SDValue
+MSP430TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                                CallingConv::ID CallConv, bool isVarArg,
+                                bool &isTailCall,
+                                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                const SmallVectorImpl<ISD::InputArg> &Ins,
+                                DebugLoc dl, SelectionDAG &DAG,
+                                SmallVectorImpl<SDValue> &InVals) {
+  // MSP430 target does not yet support tail call optimization.
+  isTailCall = false;
+
+  if (CallConv == CallingConv::MSP430_INTR) {
+    // ISRs are entered only via the interrupt mechanism.
+    llvm_report_error("ISRs cannot be called directly");
+    return SDValue();
+  }
+
+  if (CallConv != CallingConv::C && CallConv != CallingConv::Fast)
+    llvm_unreachable("Unsupported calling convention");
+
+  return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                        Outs, Ins, dl, DAG, InVals);
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers and
+/// generate load operations for arguments placed on the stack.
+///
+/// Register arguments (only MVT::i16 is handled) are copied out of their
+/// incoming physregs into fresh virtual registers; stack arguments get a
+/// fixed frame object and a load from it.  Arguments that were promoted for
+/// the calling convention are narrowed back with AssertSext/AssertZext
+/// followed by TRUNCATE.
+// FIXME: struct return stuff
+// FIXME: varargs
+SDValue
+MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
+                                        CallingConv::ID CallConv,
+                                        bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl,
+                                        SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_MSP430);
+
+  assert(!isVarArg && "Varargs not supported yet");
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers.
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default: 
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+               << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+          llvm_unreachable(0);
+        }
+      case MVT::i16:
+        // Materialize the incoming physreg as a virtual register and record
+        // it as live-in to the function.
+        unsigned VReg =
+          RegInfo.createVirtualRegister(MSP430::GR16RegisterClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+
+        // If this is an 8-bit value, it is really passed promoted to 16
+        // bits. Insert an assert[sz]ext to capture this, then truncate to the
+        // right size.
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                                 DAG.getValueType(VA.getValVT()));
+
+        if (VA.getLocInfo() != CCValAssign::Full)
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+        InVals.push_back(ArgValue);
+      }
+    } else {
+      // Sanity check
+      assert(VA.isMemLoc());
+      // Load the argument to a virtual register.  Anything wider than two
+      // bytes is unexpected; warn but continue.
+      unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+      if (ObjSize > 2) {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+             << VA.getLocVT().getSimpleVT().SimpleTy
+             << "\n";
+      }
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(ObjSize, VA.getLocMemOffset(), true, false);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter.
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i16);
+      InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
+                                   PseudoSourceValue::getFixedStack(FI), 0));
+    }
+  }
+
+  return Chain;
+}
+
+/// LowerReturn - Lower an outgoing return.  Return values are copied into
+/// the registers chosen by RetCC_MSP430, chained together via the flag so
+/// the copies stay adjacent, and the function finishes with RET_FLAG (or
+/// RETI_FLAG for interrupt service routines, which may not return values).
+SDValue
+MSP430TargetLowering::LowerReturn(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // ISRs cannot return any value.
+  if (CallConv == CallingConv::MSP430_INTR && !Outs.empty()) {
+    llvm_report_error("ISRs cannot return any value");
+    return SDValue();
+  }
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_MSP430);
+
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             Outs[i].Val, Flag);
+
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad.
+    Flag = Chain.getValue(1);
+  }
+
+  // ISRs return via RETI_FLAG, ordinary functions via RET_FLAG.
+  unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
+                  MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
+
+  if (Flag.getNode())
+    return DAG.getNode(Opc, dl, MVT::Other, Chain, Flag);
+
+  // Return Void
+  return DAG.getNode(Opc, dl, MVT::Other, Chain);
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: sret.
+SDValue
+MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                                     CallingConv::ID CallConv, bool isVarArg,
+                                     bool isTailCall,
+                                     const SmallVectorImpl<ISD::OutputArg>
+                                       &Outs,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                                     DebugLoc dl, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &InVals) {
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_MSP430);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain ,DAG.getConstant(NumBytes,
+                                                      getPointerTy(), true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    SDValue Arg = Outs[i].Val;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      // Lazily materialize the stack pointer copy the first time a
+      // stack-passed argument is seen.
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SPW, getPointerTy());
+
+      SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   StackPtr,
+                                   DAG.getIntPtrConstant(VA.getLocMemOffset()));
+
+
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                         PseudoSourceValue::getStack(),
+                                         VA.getLocMemOffset()));
+    }
+  }
+
+  // Transform all store nodes into one single node because all store nodes are
+  // independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag is
+  // necessary since all emitted instructions must be stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i16);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i16);
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+                         DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+/// For each location chosen by RetCC_MSP430, a CopyFromReg is emitted and
+/// its value pushed onto InVals; the chain and glue are threaded through so
+/// the copies stay adjacent to the call.
+SDValue
+MSP430TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_MSP430);
+
+  // Copy all of the result registers out of their specified physreg.
+  // CopyFromReg produces (value, chain, glue): result 1 becomes the new
+  // chain, result 2 the glue for the next copy, result 0 the value itself.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+/// LowerShifts - Lower SHL/SRA/SRL.  Shifts by a non-constant amount become
+/// target loop nodes (expanded later in EmitShiftInstr); constant shifts are
+/// unrolled into a sequence of single-bit rotate operations.
+SDValue MSP430TargetLowering::LowerShifts(SDValue Op,
+                                          SelectionDAG &DAG) {
+  unsigned Opc = Op.getOpcode();
+  SDNode* N = Op.getNode();
+  EVT VT = Op.getValueType();
+  DebugLoc dl = N->getDebugLoc();
+
+  // Expand non-constant shifts to loops:
+  if (!isa<ConstantSDNode>(N->getOperand(1)))
+    switch (Opc) {
+    default:
+      // Was assert(0 && ...): under NDEBUG that compiled away and control
+      // fell through into the ISD::SHL case, silently emitting a wrong
+      // node.  llvm_unreachable aborts in all build modes, matching the
+      // convention used elsewhere in this file.
+      llvm_unreachable("Invalid shift opcode!");
+    case ISD::SHL:
+      return DAG.getNode(MSP430ISD::SHL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRA:
+      return DAG.getNode(MSP430ISD::SRA, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    case ISD::SRL:
+      return DAG.getNode(MSP430ISD::SRL, dl,
+                         VT, N->getOperand(0), N->getOperand(1));
+    }
+
+  uint64_t ShiftAmount = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+
+  // Expand the stuff into sequence of shifts.
+  // FIXME: for some shift amounts this might be done better!
+  // E.g.: foo >> (8 + N) => sxt(swpb(foo)) >> N
+  SDValue Victim = N->getOperand(0);
+
+  if (Opc == ISD::SRL && ShiftAmount) {
+    // Emit a special goodness here:
+    // srl A, 1 => clrc; rrc A
+    // The first logical-shift-right step is a rotate-right-through-carry,
+    // which shifts a zero into the top bit; after that the top bit stays
+    // zero, so arithmetic right shifts finish the job below.
+    Victim = DAG.getNode(MSP430ISD::RRC, dl, VT, Victim);
+    ShiftAmount -= 1;
+  }
+
+  // Remaining steps: RLA (shift left) for SHL, RRA (arithmetic shift right)
+  // for SRA and the tail of an SRL.
+  while (ShiftAmount--)
+    Victim = DAG.getNode((Opc == ISD::SHL ? MSP430ISD::RLA : MSP430ISD::RRA),
+                         dl, VT, Victim);
+
+  return Victim;
+}
+
+/// LowerGlobalAddress - Lower a GlobalAddress node to a TargetGlobalAddress
+/// wrapped in MSP430ISD::Wrapper, folding any constant offset into the
+/// target node itself.
+SDValue MSP430TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+
+  // Create the TargetGlobalAddress node, folding in the constant offset.
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), getPointerTy(),
+                                           GA->getOffset());
+  return DAG.getNode(MSP430ISD::Wrapper, Op.getDebugLoc(),
+                     getPointerTy(), TGA);
+}
+
+/// LowerExternalSymbol - Lower a reference to an external symbol into a
+/// TargetExternalSymbol wrapped in MSP430ISD::Wrapper.
+SDValue MSP430TargetLowering::LowerExternalSymbol(SDValue Op,
+                                                  SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  SDValue Result = DAG.getTargetExternalSymbol(Sym, getPointerTy());
+
+  // (Removed a stray ';;' double semicolon from the return statement.)
+  return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
+}
+
+/// EmitCMP - Emit the MSP430ISD::CMP node for an integer comparison and
+/// select the matching MSP430 condition code (returned through TargetCC).
+///
+/// LHS/RHS are taken by reference on purpose: several cases canonicalize
+/// the comparison by swapping the operands and/or rewriting a constant
+/// operand, so callers see the updated values.
+static SDValue EmitCMP(SDValue &LHS, SDValue &RHS, SDValue &TargetCC,
+                       ISD::CondCode CC,
+                       DebugLoc dl, SelectionDAG &DAG) {
+  // FIXME: Handle bittests someday
+  assert(!LHS.getValueType().isFloatingPoint() && "We don't handle FP yet");
+
+  // FIXME: Handle jump negative someday
+  MSP430CC::CondCodes TCC = MSP430CC::COND_INVALID;
+  switch (CC) {
+  default: llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:
+    TCC = MSP430CC::COND_E;     // aka COND_Z
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETNE:
+    TCC = MSP430CC::COND_NE;    // aka COND_NZ
+    // Minor optimization: if LHS is a constant, swap operands, then the
+    // constant can be folded into comparison.
+    if (LHS.getOpcode() == ISD::Constant)
+      std::swap(LHS, RHS);
+    break;
+  case ISD::SETULE:
+    // a u<= b  ==  b u>= a, so swap and handle as SETUGE.
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETUGE:
+    // Turn lhs u>= rhs with lhs constant into rhs u< lhs+1, this allows us to
+    // fold constant into instruction.
+    // NOTE(review): C->getSExtValue()+1 wraps for the maximal constant --
+    // presumably such comparisons are folded earlier; confirm.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_LO;
+      break;
+    }
+    TCC = MSP430CC::COND_HS;    // aka COND_C
+    break;
+  case ISD::SETUGT:
+    // a u> b  ==  b u< a, so swap and handle as SETULT.
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETULT:
+    // Turn lhs u< rhs with lhs constant into rhs u>= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_HS;
+      break;
+    }
+    TCC = MSP430CC::COND_LO;    // aka COND_NC
+    break;
+  case ISD::SETLE:
+    // a <= b  ==  b >= a, so swap and handle as SETGE.
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETGE:
+    // Turn lhs >= rhs with lhs constant into rhs < lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_L;
+      break;
+    }
+    TCC = MSP430CC::COND_GE;
+    break;
+  case ISD::SETGT:
+    // a > b  ==  b < a, so swap and handle as SETLT.
+    std::swap(LHS, RHS);        // FALLTHROUGH
+  case ISD::SETLT:
+    // Turn lhs < rhs with lhs constant into rhs >= lhs+1, this allows us to
+    // fold constant into instruction.
+    if (const ConstantSDNode * C = dyn_cast<ConstantSDNode>(LHS)) {
+      LHS = RHS;
+      RHS = DAG.getConstant(C->getSExtValue() + 1, C->getValueType(0));
+      TCC = MSP430CC::COND_GE;
+      break;
+    }
+    TCC = MSP430CC::COND_L;
+    break;
+  }
+
+  TargetCC = DAG.getConstant(TCC, MVT::i8);
+  return DAG.getNode(MSP430ISD::CMP, dl, MVT::Flag, LHS, RHS);
+}
+
+
+/// LowerBR_CC - Lower a conditional branch.  EmitCMP materializes the
+/// comparison (possibly canonicalizing the operands) and chooses the target
+/// condition code; the branch itself becomes an MSP430ISD::BR_CC node.
+SDValue MSP430TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Operands of ISD::BR_CC: chain, condcode, lhs, rhs, destination block.
+  SDValue InChain      = Op.getOperand(0);
+  ISD::CondCode Cond   = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue CmpLHS       = Op.getOperand(2);
+  SDValue CmpRHS       = Op.getOperand(3);
+  SDValue DestBB       = Op.getOperand(4);
+
+  SDValue TargetCC;
+  SDValue CmpFlag = EmitCMP(CmpLHS, CmpRHS, TargetCC, Cond, dl, DAG);
+
+  return DAG.getNode(MSP430ISD::BR_CC, dl, Op.getValueType(),
+                     InChain, DestBB, TargetCC, CmpFlag);
+}
+
+
+/// LowerSETCC - Lower an ISD::SETCC node.  For condition codes whose result
+/// can be read straight out of the status register, the boolean is computed
+/// with shift/and/xor on SRW; otherwise an MSP430ISD::SELECT_CC of 1/0 is
+/// emitted (which later becomes a branch diamond).
+SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDValue LHS   = Op.getOperand(0);
+  SDValue RHS   = Op.getOperand(1);
+  DebugLoc dl   = Op.getDebugLoc();
+
+  // If we are doing an AND and testing against zero, then the CMP
+  // will not be generated.  The AND (or BIT) will generate the condition codes,
+  // but they are different from CMP.
+  // FIXME: since we're doing a post-processing, use a pseudoinstr here, so
+  // lowering & isel wouldn't diverge.
+  bool andCC = false;
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+    if (RHSC->isNullValue() && LHS.hasOneUse() &&
+        (LHS.getOpcode() == ISD::AND ||
+         (LHS.getOpcode() == ISD::TRUNCATE &&
+          LHS.getOperand(0).getOpcode() == ISD::AND))) {
+      andCC = true;
+    }
+  }
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue TargetCC;
+  SDValue Flag = EmitCMP(LHS, RHS, TargetCC, CC, dl, DAG);
+
+  // Get the condition codes directly from the status register, if its easy.
+  // Otherwise a branch will be generated.  Note that the AND and BIT
+  // instructions generate different flags than CMP, the carry bit can be used
+  // for NE/EQ.
+  bool Invert = false;    // XOR the extracted bit with 1
+  bool Shift = false;     // take bit 1 (zero flag) instead of bit 0 (carry)
+  bool Convert = true;    // false => fall back to SELECT_CC of 1/0
+  switch (cast<ConstantSDNode>(TargetCC)->getZExtValue()) {
+   default:
+    Convert = false;
+    break;
+   case MSP430CC::COND_HS:
+     // Res = SRW & 1, no processing is required
+     break;
+   case MSP430CC::COND_LO:
+     // Res = ~(SRW & 1)
+     Invert = true;
+     break;
+   case MSP430CC::COND_NE:
+     if (andCC) {
+       // C = ~Z, thus Res = SRW & 1, no processing is required
+     } else {
+       // Res = (SRW >> 1) & 1
+       Shift = true;
+     }
+     break;
+   case MSP430CC::COND_E:
+     if (andCC) {
+       // C = ~Z, thus Res = ~(SRW & 1)
+     } else {
+       // Res = ~((SRW >> 1) & 1)
+       Shift = true;
+     }
+     Invert = true;
+     break;
+  }
+  EVT VT = Op.getValueType();
+  SDValue One  = DAG.getConstant(1, VT);
+  if (Convert) {
+    // Extract the flag bit from SRW and massage it into the boolean result.
+    SDValue SR = DAG.getCopyFromReg(DAG.getEntryNode(), dl, MSP430::SRW,
+                                    MVT::i16, Flag);
+    if (Shift)
+      // FIXME: somewhere this is turned into a SRL, lower it MSP specific?
+      SR = DAG.getNode(ISD::SRA, dl, MVT::i16, SR, One);
+    SR = DAG.getNode(ISD::AND, dl, MVT::i16, SR, One);
+    if (Invert)
+      SR = DAG.getNode(ISD::XOR, dl, MVT::i16, SR, One);
+    return SR;
+  } else {
+    // Materialize the boolean as select_cc(1, 0, cond, flag).
+    SDValue Zero = DAG.getConstant(0, VT);
+    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+    SmallVector<SDValue, 4> Ops;
+    Ops.push_back(One);
+    Ops.push_back(Zero);
+    Ops.push_back(TargetCC);
+    Ops.push_back(Flag);
+    return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
+  }
+}
+
+/// LowerSELECT_CC - Lower an ISD::SELECT_CC node into an
+/// MSP430ISD::SELECT_CC fed by the flag result of EmitCMP.
+SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Operands of ISD::SELECT_CC: lhs, rhs, true value, false value, condcode.
+  SDValue CmpLHS = Op.getOperand(0);
+  SDValue CmpRHS = Op.getOperand(1);
+  ISD::CondCode Cond = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  SDValue TargetCC;
+  SDValue CmpFlag = EmitCMP(CmpLHS, CmpRHS, TargetCC, Cond, dl, DAG);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+  SDValue Ops[] = { Op.getOperand(2),   // TrueV
+                    Op.getOperand(3),   // FalseV
+                    TargetCC,
+                    CmpFlag };
+
+  return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops, 4);
+}
+
+/// LowerSIGN_EXTEND - Lower a sign extension to i16 as an ANY_EXTEND
+/// followed by an in-register sign extension from the original type.
+SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+                                               SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Val = Op.getOperand(0);
+  EVT VT      = Op.getValueType();
+
+  assert(VT == MVT::i16 && "Only support i16 for now!");
+
+  // Widen first, then re-establish the sign from the narrower source type.
+  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Val);
+  return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, AnyExt,
+                     DAG.getValueType(Val.getValueType()));
+}
+
+/// getReturnAddressFrameIndex - Return a frame index for the slot holding
+/// the return address, creating it lazily on first use and caching the
+/// index in the per-function MSP430MachineFunctionInfo.
+SDValue MSP430TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+  int FI = FuncInfo->getRAIndex();
+
+  if (FI == 0) {
+    // Set up a frame object for the return address: one pointer-sized slot
+    // just below the incoming frame.
+    uint64_t SlotSize = TD->getPointerSize();
+    FI = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize,
+                                              true, false);
+    FuncInfo->setRAIndex(FI);
+  }
+
+  return DAG.getFrameIndex(FI, getPointerTy());
+}
+
+/// LowerRETURNADDR - Lower ISD::RETURNADDR.  Depth 0 loads from this
+/// function's own return-address slot; deeper requests walk the frame chain
+/// and load the word one pointer past the recovered frame address.
+SDValue MSP430TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  if (Depth == 0) {
+    // Just load the return address from its frame slot.
+    SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                       RetAddrFI, NULL, 0);
+  }
+
+  // Walk up Depth frames, then load the saved return address next to the
+  // frame address.
+  SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+  SDValue Offset = DAG.getConstant(TD->getPointerSize(), MVT::i16);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                 FrameAddr, Offset),
+                     NULL, 0);
+}
+
+/// LowerFRAMEADDR - Lower ISD::FRAMEADDR.  Starts from the frame pointer
+/// register FPW and, for each requested level of depth, loads the saved
+/// frame pointer of the parent frame.
+SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+
+  SDValue Result = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+                                      MSP430::FPW, VT);
+  for (; Depth; --Depth)
+    Result = DAG.getLoad(VT, dl, DAG.getEntryNode(), Result, NULL, 0);
+  return Result;
+}
+
+/// getPostIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if this node can be
+/// combined with a load / store to form a post-indexed load / store.
+bool MSP430TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                                      SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+  // Only plain (non-extending) i8/i16 loads are candidates.
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  EVT VT = LD->getMemoryVT();
+  if (VT != MVT::i8 && VT != MVT::i16)
+    return false;
+
+  // The address update must be an ADD of a constant ...
+  if (Op->getOpcode() != ISD::ADD)
+    return false;
+
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+  if (!RHS)
+    return false;
+
+  // ... whose value equals the access size (the post-increment step).
+  uint64_t RHSC = RHS->getZExtValue();
+  if ((VT == MVT::i16) ? (RHSC != 2) : (RHSC != 1))
+    return false;
+
+  Base = Op->getOperand(0);
+  Offset = DAG.getConstant(RHSC, VT);
+  AM = ISD::POST_INC;
+  return true;
+}
+
+
+/// getTargetNodeName - Return a human-readable name for the given
+/// target-specific DAG node opcode (used by debug dumps), or NULL for
+/// opcodes that are not MSP430-specific.
+const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return NULL;
+  case MSP430ISD::RET_FLAG:           return "MSP430ISD::RET_FLAG";
+  case MSP430ISD::RETI_FLAG:          return "MSP430ISD::RETI_FLAG";
+  case MSP430ISD::RRA:                return "MSP430ISD::RRA";
+  case MSP430ISD::RLA:                return "MSP430ISD::RLA";
+  case MSP430ISD::RRC:                return "MSP430ISD::RRC";
+  case MSP430ISD::CALL:               return "MSP430ISD::CALL";
+  case MSP430ISD::Wrapper:            return "MSP430ISD::Wrapper";
+  case MSP430ISD::BR_CC:              return "MSP430ISD::BR_CC";
+  case MSP430ISD::CMP:                return "MSP430ISD::CMP";
+  case MSP430ISD::SELECT_CC:          return "MSP430ISD::SELECT_CC";
+  case MSP430ISD::SHL:                return "MSP430ISD::SHL";
+  case MSP430ISD::SRA:                return "MSP430ISD::SRA";
+  // LowerShifts also creates MSP430ISD::SRL nodes for non-constant logical
+  // shifts; without this case they would dump without a name.
+  case MSP430ISD::SRL:                return "MSP430ISD::SRL";
+  }
+}
+
+/// isTruncateFree - Truncation between integer IR types is considered free
+/// whenever it actually narrows the value.
+bool MSP430TargetLowering::isTruncateFree(const Type *Ty1,
+                                          const Type *Ty2) const {
+  if (Ty1->isInteger() && Ty2->isInteger())
+    return Ty1->getPrimitiveSizeInBits() > Ty2->getPrimitiveSizeInBits();
+  return false;
+}
+
+/// isTruncateFree - Same rule as the Type overload, for value types:
+/// narrowing integer conversions are free.
+bool MSP430TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  return VT1.isInteger() && VT2.isInteger() &&
+         VT1.getSizeInBits() > VT2.getSizeInBits();
+}
+
+bool MSP430TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  // NOTE(review): the leading '0 &&' deliberately disables this hook, so it
+  // always returns false; presumably claiming free zext caused problems
+  // elsewhere -- confirm before re-enabling.
+  return 0 && Ty1->isInteger(8) && Ty2->isInteger(16);
+}
+
+bool MSP430TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // MSP430 implicitly zero-extends 8-bit results in 16-bit registers.
+  // NOTE(review): intentionally disabled via '0 &&' (always false), mirroring
+  // the Type overload above -- confirm before re-enabling.
+  return 0 && VT1 == MVT::i8 && VT2 == MVT::i16;
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// EmitShiftInstr - Expand a pseudo shift-by-register instruction
+/// (Shl/Sra/Srl, 8- or 16-bit) into a shift-one-bit-per-iteration loop:
+///
+///   BB:     if (amount == 0) goto RemBB
+///   LoopBB: value = shift-by-1(value); if (--amount != 0) goto LoopBB
+///   RemBB:  result = phi(original value from BB, shifted value from LoopBB)
+MachineBasicBlock*
+MSP430TargetLowering::EmitShiftInstr(MachineInstr *MI,
+                                     MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  MachineFunction *F = BB->getParent();
+  MachineRegisterInfo &RI = F->getRegInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+
+  // Pick the single-bit shift opcode and register class matching the pseudo.
+  unsigned Opc;
+  const TargetRegisterClass * RC;
+  switch (MI->getOpcode()) {
+  default:
+    assert(0 && "Invalid shift opcode!");
+  case MSP430::Shl8:
+   Opc = MSP430::SHL8r1;
+   RC = MSP430::GR8RegisterClass;
+   break;
+  case MSP430::Shl16:
+   Opc = MSP430::SHL16r1;
+   RC = MSP430::GR16RegisterClass;
+   break;
+  case MSP430::Sra8:
+   Opc = MSP430::SAR8r1;
+   RC = MSP430::GR8RegisterClass;
+   break;
+  case MSP430::Sra16:
+   Opc = MSP430::SAR16r1;
+   RC = MSP430::GR16RegisterClass;
+   break;
+  case MSP430::Srl8:
+   Opc = MSP430::SAR8r1c;
+   RC = MSP430::GR8RegisterClass;
+   break;
+  case MSP430::Srl16:
+   Opc = MSP430::SAR16r1c;
+   RC = MSP430::GR16RegisterClass;
+   break;
+  }
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  // Create loop block
+  MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *RemBB  = F->CreateMachineBasicBlock(LLVM_BB);
+
+  F->insert(I, LoopBB);
+  F->insert(I, RemBB);
+
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the block containing instructions after shift.
+  RemBB->transferSuccessors(BB);
+
+  // Inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+         SE = BB->succ_end(); SI != SE; ++SI)
+    EM->insert(std::make_pair(*SI, RemBB));
+
+  // Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+  BB->addSuccessor(LoopBB);
+  BB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(RemBB);
+  LoopBB->addSuccessor(LoopBB);
+
+  unsigned ShiftAmtReg = RI.createVirtualRegister(MSP430::GR8RegisterClass);
+  unsigned ShiftAmtReg2 = RI.createVirtualRegister(MSP430::GR8RegisterClass);
+  unsigned ShiftReg = RI.createVirtualRegister(RC);
+  unsigned ShiftReg2 = RI.createVirtualRegister(RC);
+  unsigned ShiftAmtSrcReg = MI->getOperand(2).getReg();
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  // BB:
+  // cmp 0, N
+  // je RemBB
+  BuildMI(BB, dl, TII.get(MSP430::CMP8ri))
+    .addReg(ShiftAmtSrcReg).addImm(0);
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(RemBB)
+    .addImm(MSP430CC::COND_E);
+
+  // LoopBB:
+  // ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
+  // ShiftAmt = phi [%N, BB],      [%ShiftAmt2, LoopBB]
+  // ShiftReg2 = shift ShiftReg
+  // ShiftAmt2 = ShiftAmt - 1;
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(MSP430::PHI), ShiftAmtReg)
+    .addReg(ShiftAmtSrcReg).addMBB(BB)
+    .addReg(ShiftAmtReg2).addMBB(LoopBB);
+  BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2)
+    .addReg(ShiftReg);
+  BuildMI(LoopBB, dl, TII.get(MSP430::SUB8ri), ShiftAmtReg2)
+    .addReg(ShiftAmtReg).addImm(1);
+  BuildMI(LoopBB, dl, TII.get(MSP430::JCC))
+    .addMBB(LoopBB)
+    .addImm(MSP430CC::COND_NE);
+
+  // RemBB:
+  // DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
+  BuildMI(RemBB, dl, TII.get(MSP430::PHI), DstReg)
+    .addReg(SrcReg).addMBB(BB)
+    .addReg(ShiftReg2).addMBB(LoopBB);
+
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  return RemBB;
+}
+
+/// EmitInstrWithCustomInserter - Expand pseudo instructions that need
+/// custom basic-block surgery: the shift-by-register pseudos (delegated to
+/// EmitShiftInstr) and Select8/Select16, which become the classic two-way
+/// branch diamond with a PHI merging the true/false values.
+MachineBasicBlock*
+MSP430TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                  MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == MSP430::Shl8 || Opc == MSP430::Shl16 ||
+      Opc == MSP430::Sra8 || Opc == MSP430::Sra16 ||
+      Opc == MSP430::Srl8 || Opc == MSP430::Srl16)
+    return EmitShiftInstr(MI, BB, EM);
+
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+
+  assert((Opc == MSP430::Select16 || Opc == MSP430::Select8) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   jCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  BuildMI(BB, dl, TII.get(MSP430::JCC))
+    .addMBB(copy1MBB)
+    .addImm(MI->getOperand(3).getImm());
+  F->insert(I, copy0MBB);
+  F->insert(I, copy1MBB);
+  // Inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), 
+         SE = BB->succ_end(); SI != SE; ++SI)
+    EM->insert(std::make_pair(*SI, copy1MBB));
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  copy1MBB->transferSuccessors(BB);
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(copy1MBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to copy1MBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(copy1MBB);
+
+  //  copy1MBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = copy1MBB;
+  BuildMI(BB, dl, TII.get(MSP430::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
new file mode 100644
index 0000000..87a790b
--- /dev/null
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -0,0 +1,181 @@
+//==-- MSP430ISelLowering.h - MSP430 DAG Lowering Interface ------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that MSP430 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_ISELLOWERING_H
+#define LLVM_TARGET_MSP430_ISELLOWERING_H
+
+#include "MSP430.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  /// MSP430ISD - Target-specific SelectionDAG node opcodes for the MSP430
+  /// backend.  These start after the generic ISD opcode space.
+  namespace MSP430ISD {
+    enum {
+      /// FIRST_NUMBER - Anchor; target nodes are numbered after the last
+      /// generic ISD opcode.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// Return with a flag operand. Operand 0 is the chain operand.
+      RET_FLAG,
+
+      /// Same as RET_FLAG, but used for returning from ISRs.
+      RETI_FLAG,
+
+      /// Y = R{R,L}A X, rotate right (left) arithmetically
+      RRA, RLA,
+
+      /// Y = RRC X, rotate right via carry
+      RRC,
+
+      /// CALL - These operations represent an abstract call
+      /// instruction, which includes a bunch of information.
+      CALL,
+
+      /// Wrapper - A wrapper node for TargetConstantPool, TargetExternalSymbol,
+      /// and TargetGlobalAddress.
+      Wrapper,
+
+      /// CMP - Compare instruction.
+      CMP,
+
+      /// SetCC - Operand 0 is condition code, and operand 1 is the flag
+      /// operand produced by a CMP instruction.
+      SETCC,
+
+      /// MSP430 conditional branches. Operand 0 is the chain operand, operand 1
+      /// is the block to branch if condition is true, operand 2 is the
+      /// condition code, and operand 3 is the flag operand produced by a CMP
+      /// instruction.
+      BR_CC,
+
+      /// SELECT_CC - Operand 0 and operand 1 are selection variable, operand 3
+      /// is condition code and operand 4 is flag operand.
+      SELECT_CC,
+
+      /// SHL, SRA, SRL - Non-constant shifts.
+      SHL, SRA, SRL
+    };
+  }
+
+  class MSP430Subtarget;
+  class MSP430TargetMachine;
+
+  /// MSP430TargetLowering - MSP430-specific DAG lowering: custom lowering
+  /// hooks, calling-convention lowering, inline-asm constraints, and custom
+  /// instruction insertion for pseudos.
+  class MSP430TargetLowering : public TargetLowering {
+  public:
+    explicit MSP430TargetLowering(MSP430TargetMachine &TM);
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+
+    // Per-opcode custom lowering helpers, dispatched from LowerOperation.
+    SDValue LowerShifts(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+    // Inline-assembly constraint support.
+    TargetLowering::ConstraintType
+    getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of type
+    /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
+    /// register R15W to i8 by referencing its sub-register R15B.
+    virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+    /// isZExtFree - Return true if any actual instruction that defines a value
+    /// of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in unknown
+    /// ways, such as incoming arguments, or copies from unknown virtual
+    /// registers. Also, if isTruncateFree(Ty2, Ty1) is true, this does not
+    /// necessarily apply to truncate instructions. e.g. on msp430, all
+    /// instructions that define 8-bit values implicit zero-extend the result
+    /// out to 16 bits.
+    virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+
+    // Custom insertion for pseudo instructions (e.g. selects, shifts).  EM
+    // maps original to cloned basic blocks so SelectionDAG ISel can update
+    // edges when new blocks are created.
+    MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+    MachineBasicBlock* EmitShiftInstr(MachineInstr *MI,
+                                      MachineBasicBlock *BB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+  private:
+    // Calling-convention helpers shared by LowerCall/LowerFormalArguments.
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    SDValue LowerCCCArguments(SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              DebugLoc dl,
+                              SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals);
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    // Post-indexed (auto-increment) addressing support for loads/stores.
+    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                            SDValue &Base,
+                                            SDValue &Offset,
+                                            ISD::MemIndexedMode &AM,
+                                            SelectionDAG &DAG) const;
+
+    // Cached target handles; set up in the constructor.
+    const MSP430Subtarget &Subtarget;
+    const MSP430TargetMachine &TM;
+    const TargetData *TD;
+  };
+} // namespace llvm
+
+#endif // LLVM_TARGET_MSP430_ISELLOWERING_H
diff --git a/lib/Target/MSP430/MSP430InstrFormats.td b/lib/Target/MSP430/MSP430InstrFormats.td
new file mode 100644
index 0000000..4ccc7df
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrFormats.td
@@ -0,0 +1,209 @@
+//===- MSP430InstrFormats.td - MSP430 Instruction Formats-----*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MSP430 instructions format here
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def PseudoFrm   : Format<0>;
+def SingleOpFrm : Format<1>;
+def DoubleOpFrm : Format<2>;
+def CondJumpFrm : Format<3>;
+
+// SourceMode - Encoding of the source-operand addressing mode (the "As"
+// field of an MSP430 instruction word).
+class SourceMode<bits<2> val> {
+  bits<2> Value = val;
+}
+
+def SrcReg      : SourceMode<0>;
+def SrcMem      : SourceMode<1>;
+def SrcIndReg   : SourceMode<2>;
+def SrcPostInc  : SourceMode<3>;
+// Note: SrcImm deliberately shares encoding 3 with SrcPostInc.  On MSP430 an
+// immediate operand is encoded as post-increment addressing off PC (@PC+).
+def SrcImm      : SourceMode<3>;
+
+// DestMode - Encoding of the destination-operand addressing mode (the single
+// "Ad" bit of an MSP430 instruction word).
+class DestMode<bit val> {
+  bit Value = val;
+}
+
+def DstReg      : DestMode<0>;
+def DstMem      : DestMode<1>;
+
+// SizeVal - Encoded instruction length; consumed when computing instruction
+// sizes (see the matching MSP430II::Size* flags in MSP430InstrInfo.h).
+class SizeVal<bits<3> val> {
+  bits<3> Value = val;
+}
+
+def SizeUnknown : SizeVal<0>; // Unknown / unset size
+def SizeSpecial : SizeVal<1>; // Special instruction, e.g. pseudo
+def Size2Bytes  : SizeVal<2>;
+def Size4Bytes  : SizeVal<3>;
+def Size6Bytes  : SizeVal<4>;
+
+// Generic MSP430 Format
+// MSP430Inst - Base class for all MSP430 instructions.  Carries the 16-bit
+// encoding plus the format and size fields shared by every instruction.
+class MSP430Inst<dag outs, dag ins, SizeVal sz, Format f,
+                 string asmstr> : Instruction {
+  field bits<16> Inst;
+
+  let Namespace = "MSP430";
+
+  dag OutOperandList = outs;
+  dag InOperandList  = ins;
+
+  // Instruction format and size, exposed as bit fields so they can be packed
+  // into target-specific flags (decoded via MSP430II in MSP430InstrInfo.h).
+  Format Form = f;
+  bits<2> FormBits = Form.Value;
+
+  SizeVal Sz = sz;
+  bits<3> Size = Sz.Value;
+
+  let AsmString   = asmstr;
+}
+
+// FIXME: Create different classes for different addressing modes.
+
+// MSP430 Double Operand (Format I) Instructions
+// Field layout: opcode[15:12], Ad[7], B/W[6], As[5:4].
+class IForm<bits<4> opcode, DestMode dest, bit bw, SourceMode src, SizeVal sz,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, DoubleOpFrm, asmstr> {
+  let Pattern = pattern;
+
+  DestMode ad = dest;
+  SourceMode as = src;
+  
+  let Inst{12-15} = opcode;
+  let Inst{7}     = ad.Value;
+  let Inst{6}     = bw;
+  let Inst{4-5}   = as.Value;
+}
+
+// 8 bit IForm instructions (B/W bit set).
+class IForm8<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 1, src, sz, outs, ins, asmstr, pattern>;
+
+// Naming: I8<dst><src>, where r = register, m = memory, i = immediate.
+class I8rr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I8ri<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8rm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mr<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I8mi<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I8mm<bits<4> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm8<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IForm instructions (B/W bit clear).  Naming mirrors the 8-bit
+// variants: I16<dst><src>, where r = register, m = memory, i = immediate.
+class IForm16<bits<4> opcode, DestMode dest, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm<opcode, dest, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class I16rr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class I16ri<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16rm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstReg, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mr<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcReg, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class I16mi<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcImm, Size6Bytes, outs, ins, asmstr, pattern>;
+
+class I16mm<bits<4> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IForm16<opcode, DstMem, SrcMem, Size6Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Single Operand (Format II) Instructions
+// Field layout: opcode[15:7], B/W[6], As[5:4].
+class IIForm<bits<9> opcode, bit bw, SourceMode src, SizeVal sz,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, sz, SingleOpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  SourceMode as = src;
+
+  let Inst{7-15} = opcode;
+  let Inst{6}    = bw;
+  let Inst{4-5}  = as.Value;
+}
+
+// 8 bit IIForm instructions (B/W bit set).
+class IIForm8<bits<9> opcode, SourceMode src, SizeVal sz,
+              dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 1, src, sz, outs, ins, asmstr, pattern>;
+
+// Naming: II8<src>, where r = register, m = memory, i = immediate.
+class II8r<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II8m<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II8i<bits<9> opcode,
+           dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm8<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// 16 bit IIForm instructions (B/W bit clear).
+class IIForm16<bits<9> opcode, SourceMode src, SizeVal sz,
+               dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm<opcode, 0, src, sz, outs, ins, asmstr, pattern>;
+
+class II16r<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcReg, Size2Bytes, outs, ins, asmstr, pattern>;
+
+class II16m<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcMem, Size4Bytes, outs, ins, asmstr, pattern>;
+
+class II16i<bits<9> opcode,
+            dag outs, dag ins, string asmstr, list<dag> pattern>
+  : IIForm16<opcode, SrcImm, Size4Bytes, outs, ins, asmstr, pattern>;
+
+// MSP430 Conditional Jumps Instructions (Format III)
+// Field layout: opcode[15:13], condition[12:10]; always 2 bytes.
+class CJForm<bits<3> opcode, bits<3> cond,
+             dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, Size2Bytes, CondJumpFrm, asmstr> {
+  let Pattern = pattern;
+  
+  let Inst{13-15} = opcode;
+  let Inst{10-12} = cond;
+}
+
+// Pseudo instructions - no machine encoding; expanded or eliminated before
+// emission.  Size is SizeSpecial and the encoding is all-zero.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : MSP430Inst<outs, ins, SizeSpecial, PseudoFrm, asmstr> {
+  let Pattern = pattern;
+  let Inst{15-0} = 0;
+}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
new file mode 100644
index 0000000..6372482
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -0,0 +1,388 @@
+//===- MSP430InstrInfo.cpp - MSP430 Instruction Information ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430InstrInfo.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430TargetMachine.h"
+#include "MSP430GenInstrInfo.inc"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+// Construct the MSP430 instruction info with the tablegen-generated
+// instruction descriptor table and a target-aware register info instance.
+MSP430InstrInfo::MSP430InstrInfo(MSP430TargetMachine &tm)
+  : TargetInstrInfoImpl(MSP430Insts, array_lengthof(MSP430Insts)),
+    RI(tm, *this), TM(tm) {}
+
+// storeRegToStackSlot - Emit a MOV{8,16}mr that spills SrcReg to the stack
+// slot FrameIdx, inserted before MI.  Only GR8/GR16 register classes are
+// supported.
+void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill, int FrameIdx,
+                                    const TargetRegisterClass *RC) const {
+  // Borrow the debug location of the instruction we insert before, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  // Attach a memory operand describing the fixed stack slot being written.
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOStore, 0,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8mr))
+      .addFrameIndex(FrameIdx).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill)).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot store this register to stack slot!");
+}
+
+// loadRegFromStackSlot - Emit a MOV{8,16}rm that reloads DestReg from the
+// stack slot FrameIdx, inserted before MI.  Only GR8/GR16 register classes
+// are supported.
+void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC) const{
+  // Borrow the debug location of the instruction we insert before, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+  // Attach a memory operand describing the fixed stack slot being read.
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOLoad, 0,
+                            MFI.getObjectSize(FrameIdx),
+                            MFI.getObjectAlignment(FrameIdx));
+
+  if (RC == &MSP430::GR16RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
+      .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+  else if (RC == &MSP430::GR8RegClass)
+    BuildMI(MBB, MI, DL, get(MSP430::MOV8rm))
+      .addReg(DestReg).addFrameIndex(FrameIdx).addImm(0).addMemOperand(MMO);
+  else
+    llvm_unreachable("Cannot load this register from stack slot!");
+}
+
+// copyRegToReg - Emit a register-to-register move before I.  Returns true on
+// success; returns false (emitting nothing) for cross-class copies or
+// unsupported register classes, leaving the caller to handle them.
+bool MSP430InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *DestRC,
+                                   const TargetRegisterClass *SrcRC) const {
+  // Borrow the debug location of the instruction we insert before, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (DestRC == SrcRC) {
+    unsigned Opc;
+    if (DestRC == &MSP430::GR16RegClass) {
+      Opc = MSP430::MOV16rr;
+    } else if (DestRC == &MSP430::GR8RegClass) {
+      Opc = MSP430::MOV8rr;
+    } else {
+      return false;
+    }
+
+    BuildMI(MBB, I, DL, get(Opc), DestReg).addReg(SrcReg);
+    return true;
+  }
+
+  return false;
+}
+
+// isMoveInstr - Recognize plain register-to-register moves (MOV8rr/MOV16rr)
+// and report their source/destination registers.  Sub-register indices are
+// always reported as 0 since no sub-register moves are matched here.
+bool
+MSP430InstrInfo::isMoveInstr(const MachineInstr& MI,
+                             unsigned &SrcReg, unsigned &DstReg,
+                             unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+  SrcSubIdx = DstSubIdx = 0; // No sub-registers yet.
+
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case MSP430::MOV8rr:
+  case MSP430::MOV16rr:
+   assert(MI.getNumOperands() >= 2 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           "invalid register-register move instruction");
+    // Operand 0 is the destination, operand 1 the source.
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    return true;
+  }
+}
+
+// spillCalleeSavedRegisters - Save callee-saved registers with PUSH16r.
+// Returns false when there is nothing to spill (letting generic code run),
+// true once the pushes have been emitted.
+bool
+MSP430InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  MSP430MachineFunctionInfo *MFI = MF.getInfo<MSP430MachineFunctionInfo>();
+  // Each 16-bit push occupies 2 bytes of the frame.
+  MFI->setCalleeSavedFrameSize(CSI.size() * 2);
+
+  // Push in reverse CSI order so restores can pop in forward order.
+  for (unsigned i = CSI.size(); i != 0; --i) {
+    unsigned Reg = CSI[i-1].getReg();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    BuildMI(MBB, MI, DL, get(MSP430::PUSH16r))
+      .addReg(Reg, RegState::Kill);
+  }
+  return true;
+}
+
+// restoreCalleeSavedRegisters - Restore callee-saved registers with POP16r,
+// in forward CSI order (mirroring the reverse-order pushes in
+// spillCalleeSavedRegisters).  Returns false when there is nothing to do.
+bool
+MSP430InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    BuildMI(MBB, MI, DL, get(MSP430::POP16r), CSI[i].getReg());
+
+  return true;
+}
+
+// RemoveBranch - Erase the trailing run of JMP/JCC terminators from MBB and
+// return how many were removed.  Scans from the bottom and stops at the
+// first non-branch instruction.
+unsigned MSP430InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->getOpcode() != MSP430::JMP &&
+        I->getOpcode() != MSP430::JCC)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    // Re-anchor at end() since erasing invalidated the iterator.
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+// ReverseBranchCondition - Invert the single MSP430 condition-code operand
+// in place.  Returns false to indicate the condition was reversed
+// successfully (per the TargetInstrInfo contract).
+bool MSP430InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+  MSP430CC::CondCodes CC = static_cast<MSP430CC::CondCodes>(Cond[0].getImm());
+
+  switch (CC) {
+  default:
+    // Use llvm_unreachable for consistency with the other impossible-path
+    // checks in this file (e.g. storeRegToStackSlot).
+    llvm_unreachable("Invalid branch condition!");
+    break;
+  case MSP430CC::COND_E:
+    CC = MSP430CC::COND_NE;
+    break;
+  case MSP430CC::COND_NE:
+    CC = MSP430CC::COND_E;
+    break;
+  case MSP430CC::COND_L:
+    CC = MSP430CC::COND_GE;
+    break;
+  case MSP430CC::COND_GE:
+    CC = MSP430CC::COND_L;
+    break;
+  case MSP430CC::COND_HS:
+    CC = MSP430CC::COND_LO;
+    break;
+  case MSP430CC::COND_LO:
+    CC = MSP430CC::COND_HS;
+    break;
+  }
+
+  Cond[0].setImm(CC);
+  return false;
+}
+
+// isUnpredicatedTerminator - Return true if MI is a terminator that is not
+// guarded by a predicate; such instructions bound AnalyzeBranch's scan.
+bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (TID.isBranch() && !TID.isBarrier())
+    return true;
+  if (!TID.isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+// AnalyzeBranch - Decompose MBB's terminators into (TBB, FBB, Cond) per the
+// TargetInstrInfo contract.  Returns false on success, true when the
+// terminator sequence cannot be analyzed.  With AllowModify set, it may also
+// clean up dead code after an unconditional JMP and delete fall-through JMPs.
+bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                    MachineBasicBlock *&TBB,
+                                    MachineBasicBlock *&FBB,
+                                    SmallVectorImpl<MachineOperand> &Cond,
+                                    bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->getDesc().isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == MSP430::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditional destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    assert(I->getOpcode() == MSP430::JCC && "Invalid conditional branch");
+    MSP430CC::CondCodes BranchCode =
+      static_cast<MSP430CC::CondCodes>(I->getOperand(1).getImm());
+    if (BranchCode == MSP430CC::COND_INVALID)
+      return true;  // Can't handle weird stuff.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    MSP430CC::CondCodes OldBranchCode = (MSP430CC::CondCodes)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    return true;
+  }
+
+  return false;
+}
+
+// InsertBranch - Materialize the branch sequence described by (TBB, FBB,
+// Cond) at the end of MBB, and return the number of instructions inserted:
+// one JMP, one JCC, or JCC + JMP for a two-way conditional branch.
+unsigned
+MSP430InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                              MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME this should probably have a DebugLoc operand
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "MSP430 branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, dl, get(MSP430::JMP)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  BuildMI(&MBB, dl, get(MSP430::JCC)).addMBB(TBB).addImm(Cond[0].getImm());
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, dl, get(MSP430::JMP)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned MSP430InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const TargetInstrDesc &Desc = MI->getDesc();
+
+  // The size is encoded in the TSFlags bits described by MSP430II (set from
+  // the SizeVal field of each instruction's tablegen definition).
+  switch (Desc.TSFlags & MSP430II::SizeMask) {
+  default:
+    // SizeUnknown: only standard zero-size pseudos and inline asm are
+    // expected here.
+    switch (Desc.getOpcode()) {
+    default:
+      assert(0 && "Unknown instruction size!");
+    case TargetOpcode::DBG_LABEL:
+    case TargetOpcode::EH_LABEL:
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      return 0;
+    case TargetOpcode::INLINEASM: {
+      const MachineFunction *MF = MI->getParent()->getParent();
+      const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
+      return TII.getInlineAsmLength(MI->getOperand(0).getSymbolName(),
+                                    *MF->getTarget().getMCAsmInfo());
+    }
+    }
+  case MSP430II::SizeSpecial:
+    switch (MI->getOpcode()) {
+    default:
+      assert(0 && "Unknown instruction size!");
+    case MSP430::SAR8r1c:
+    case MSP430::SAR16r1c:
+      return 4;
+    }
+  case MSP430II::Size2Bytes:
+    return 2;
+  case MSP430II::Size4Bytes:
+    return 4;
+  case MSP430II::Size6Bytes:
+    return 6;
+  }
+
+  // Not reached: every switch path above returns.  Kept to silence
+  // "control reaches end of non-void function" warnings.
+  return 6;
+}
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
new file mode 100644
index 0000000..6ef4b0a
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -0,0 +1,97 @@
+//===- MSP430InstrInfo.h - MSP430 Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430INSTRINFO_H
+#define LLVM_TARGET_MSP430INSTRINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "MSP430RegisterInfo.h"
+
+namespace llvm {
+
+class MSP430TargetMachine;
+
+/// MSP430II - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MSP430II {
+  enum {
+    // Instruction size lives in TSFlags bits [4:2]; the values mirror the
+    // SizeVal defs in MSP430InstrFormats.td and are decoded by
+    // GetInstSizeInBytes.
+    SizeShift   = 2,
+    SizeMask    = 7 << SizeShift,
+
+    SizeUnknown = 0 << SizeShift,
+    SizeSpecial = 1 << SizeShift,
+    Size2Bytes  = 2 << SizeShift,
+    Size4Bytes  = 3 << SizeShift,
+    Size6Bytes  = 4 << SizeShift
+  };
+}
+
+/// MSP430InstrInfo - MSP430 implementation of TargetInstrInfo: register
+/// copies/spills, move recognition, callee-saved register handling, branch
+/// analysis, and instruction size computation.
+class MSP430InstrInfo : public TargetInstrInfoImpl {
+  const MSP430RegisterInfo RI;
+  MSP430TargetMachine &TM;
+public:
+  explicit MSP430InstrInfo(MSP430TargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  // Emit a reg-to-reg move; returns false for unsupported class pairs.
+  bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *DestRC,
+                    const TargetRegisterClass *SrcRC) const;
+
+  // Recognize MOV8rr/MOV16rr and report source/destination registers.
+  bool isMoveInstr(const MachineInstr& MI,
+                   unsigned &SrcReg, unsigned &DstReg,
+                   unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+  // Spill/reload a register to/from a stack slot (GR8/GR16 only).
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned SrcReg, bool isKill,
+                                   int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned DestReg, int FrameIdx,
+                                    const TargetRegisterClass *RC) const;
+
+  // Callee-saved register handling via PUSH16r/POP16r.
+  virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+  virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  // Maximum size in bytes of the given instruction (from MSP430II flags).
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  // Branch folding goodness
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+  bool AnalyzeBranch(MachineBasicBlock &MBB,
+                     MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond) const;
+
+};
+
+}
+
+#endif
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
new file mode 100644
index 0000000..bb06f7b
--- /dev/null
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -0,0 +1,1195 @@
+//===- MSP430InstrInfo.td - MSP430 Instruction defs -----------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the MSP430 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+include "MSP430InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+// Shorthand type-constraint helpers: pin operand OpNum to exactly i8 / i16.
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+// call: no results, variadic operands; operand 0 is the callee address.
+def SDT_MSP430Call         : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+// Call-frame setup/teardown nodes carry i16 byte counts.
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
+// Wrapper around a symbol address: one pointer result, same type as operand.
+def SDT_MSP430Wrapper      : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+// cmp: two same-typed operands, no explicit result (result goes to SRW).
+def SDT_MSP430Cmp          : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+// brcc: branch target (basic block) plus an i8 condition code.
+def SDT_MSP430BrCC         : SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>,
+                                                  SDTCisVT<1, i8>]>;
+// selectcc: choose between two same-typed values on an i8 condition code.
+def SDT_MSP430SelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, 
+                                                  SDTCisVT<3, i8>]>;
+// Shift by a register amount: value/result same type, i8 shift count.
+def SDT_MSP430Shift        : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisI8<2>]>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+// Function return / return-from-interrupt; optionally glued to the node
+// that sets up the return value (SDNPOptInFlag).
+def MSP430retflag  : SDNode<"MSP430ISD::RET_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInFlag]>;
+def MSP430retiflag : SDNode<"MSP430ISD::RETI_FLAG", SDTNone,
+                       [SDNPHasChain, SDNPOptInFlag]>;
+
+// Single-operand rotate/shift nodes (rotate right arithmetically, left
+// arithmetically, and right through carry).
+def MSP430rra     : SDNode<"MSP430ISD::RRA", SDTIntUnaryOp, []>;
+def MSP430rla     : SDNode<"MSP430ISD::RLA", SDTIntUnaryOp, []>;
+def MSP430rrc     : SDNode<"MSP430ISD::RRC", SDTIntUnaryOp, []>;
+
+def MSP430call    : SDNode<"MSP430ISD::CALL", SDT_MSP430Call,
+                     [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+// Standard call-frame bracketing nodes.
+def MSP430callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_MSP430CallSeqStart,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def MSP430callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_MSP430CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def MSP430Wrapper : SDNode<"MSP430ISD::Wrapper", SDT_MSP430Wrapper>;
+// cmp publishes its result as a glue value (the status register); brcc and
+// selectcc consume that glue via SDNPInFlag.
+def MSP430cmp     : SDNode<"MSP430ISD::CMP", SDT_MSP430Cmp, [SDNPOutFlag]>;
+def MSP430brcc    : SDNode<"MSP430ISD::BR_CC", SDT_MSP430BrCC, [SDNPHasChain, SDNPInFlag]>;
+def MSP430selectcc: SDNode<"MSP430ISD::SELECT_CC", SDT_MSP430SelectCC, [SDNPInFlag]>;
+// Multi-bit shifts by a register amount; matched by the Shl/Sra/Srl pseudos.
+def MSP430shl     : SDNode<"MSP430ISD::SHL", SDT_MSP430Shift, []>;
+def MSP430sra     : SDNode<"MSP430ISD::SRA", SDT_MSP430Shift, []>;
+def MSP430srl     : SDNode<"MSP430ISD::SRL", SDT_MSP430Shift, []>;
+
+//===----------------------------------------------------------------------===//
+// MSP430 Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands
+// Memory reference: a base register (GR16) plus a 16-bit displacement.
+def memsrc : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+// NOTE(review): memdst reuses printSrcMemOperand; presumably source and
+// destination memory operands print identically -- confirm in the printer.
+def memdst : Operand<i16> {
+  let PrintMethod = "printSrcMemOperand";
+  let MIOperandInfo = (ops GR16, i16imm);
+}
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT> {
+  let PrintMethod = "printPCRelImmOperand";
+}
+
+// Operand for printing out a condition code.
+def cc : Operand<i8> {
+  let PrintMethod = "printCCOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// MSP430 Complex Pattern Definitions.
+//===----------------------------------------------------------------------===//
+
+// Matched by SelectAddr() in the MSP430 DAG ISel; yields (base reg, disp).
+def addr : ComplexPattern<iPTR, 2, "SelectAddr", [], []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern Fragments
+// 8->16-bit extending loads, plus an 'and' that only matches when the node
+// has a single use.
+def zextloadi16i8 : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def  extloadi16i8 : PatFrag<(ops node:$ptr), (i16 ( extloadi8 node:$ptr))>;
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+//===----------------------------------------------------------------------===//
+// Instruction list..
+
+// ADJCALLSTACKDOWN/UP implicitly use/def SP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber SRW.
+let Defs = [SPW, SRW], Uses = [SPW] in {
+// Call-frame setup/teardown pseudos (see the comment above this block).
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+                              "#ADJCALLSTACKDOWN",
+                              [(MSP430callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
+                              "#ADJCALLSTACKUP",
+                              [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+let usesCustomInserter = 1 in {
+  // Conditional-move pseudos, expanded during custom instruction insertion.
+  def Select8  : Pseudo<(outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cc),
+                        "# Select8 PSEUDO",
+                        [(set GR8:$dst,
+                          (MSP430selectcc GR8:$src1, GR8:$src2, imm:$cc))]>;
+  def Select16 : Pseudo<(outs GR16:$dst), (ins GR16:$src1, GR16:$src2, i8imm:$cc),
+                        "# Select16 PSEUDO",
+                        [(set GR16:$dst,
+                          (MSP430selectcc GR16:$src1, GR16:$src2, imm:$cc))]>;
+  // Variable-amount shift pseudos; their expansion clobbers SRW.
+  let Defs = [SRW] in {
+  def Shl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Shl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430shl GR8:$src, GR8:$cnt))]>;
+  def Shl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Shl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430shl GR16:$src, GR8:$cnt))]>;
+  def Sra8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Sra8 PSEUDO",
+                        [(set GR8:$dst, (MSP430sra GR8:$src, GR8:$cnt))]>;
+  def Sra16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Sra16 PSEUDO",
+                        [(set GR16:$dst, (MSP430sra GR16:$src, GR8:$cnt))]>;
+  def Srl8     : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$cnt),
+                        "# Srl8 PSEUDO",
+                        [(set GR8:$dst, (MSP430srl GR8:$src, GR8:$cnt))]>;
+  def Srl16    : Pseudo<(outs GR16:$dst), (ins GR16:$src, GR8:$cnt),
+                        "# Srl16 PSEUDO",
+                        [(set GR16:$dst, (MSP430srl GR16:$src, GR8:$cnt))]>;
+
+  }
+}
+
+// No-operation; no encoding provided yet.
+let neverHasSideEffects = 1 in
+def NOP : Pseudo<(outs), (ins), "nop", []>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// FIXME: Provide proper encoding!
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+  // ret uses the post-increment source form -- presumably "mov @SP+, PC";
+  // confirm against the ISA. reti returns from an interrupt handler.
+  def RET  : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                     (outs), (ins), "ret",  [(MSP430retflag)]>;
+  def RETI : II16r<0x0, (outs), (ins), "reti", [(MSP430retiflag)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+
+// FIXME: expand opcode & cond field for branches!
+
+// Direct branch
+let isBarrier = 1 in {
+  // Short branch
+  def JMP : CJForm<0, 0,
+                   (outs), (ins brtarget:$dst),
+                   "jmp\t$dst",
+                   [(br bb:$dst)]>;
+  // Long branch; no selection pattern -- emitted directly where needed.
+  def B   : I16ri<0,
+                  (outs), (ins brtarget:$dst),
+                  "br\t$dst",
+                  []>;
+}
+
+// Conditional branches read the condition from the status register.
+let Uses = [SRW] in
+  def JCC : CJForm<0, 0,
+                   (outs), (ins brtarget:$dst, cc:$cc),
+                   "j$cc\t$dst",
+                   [(MSP430brcc bb:$dst, imm:$cc)]>;
+} // isBranch, isTerminator
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. SPW is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Defs = [R12W, R13W, R14W, R15W, SRW],
+      Uses = [SPW] in {
+    // Immediate, register-indirect, and memory-indirect call forms.
+    def CALLi     : II16i<0x0,
+                          (outs), (ins i16imm:$dst, variable_ops),
+                          "call\t$dst", [(MSP430call imm:$dst)]>;
+    def CALLr     : II16r<0x0,
+                          (outs), (ins GR16:$dst, variable_ops),
+                          "call\t$dst", [(MSP430call GR16:$dst)]>;
+    def CALLm     : II16m<0x0,
+                          (outs), (ins memsrc:$dst, variable_ops),
+                          "call\t${dst:mem}", [(MSP430call (load addr:$dst))]>;
+  }
+
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+let Defs = [SPW], Uses = [SPW], neverHasSideEffects=1 in {
+// Stack push/pop; the implicit SP adjustment is modeled by the SPW def/use.
+let mayLoad = 1 in
+def POP16r   : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                       (outs GR16:$reg), (ins), "pop.w\t$reg", []>;
+
+let mayStore = 1 in
+def PUSH16r  : II16r<0x0,
+                     (outs), (ins GR16:$reg), "push.w\t$reg",[]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// FIXME: Provide proper encoding!
+let neverHasSideEffects = 1 in {
+// Register-to-register moves; no selection patterns (copies are emitted by
+// copyRegToReg), so they are marked side-effect free.
+def MOV8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   []>;
+def MOV16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    []>;
+}
+
+// FIXME: Provide proper encoding!
+// Immediate moves are rematerializable and as cheap as a copy.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, imm:$src)]>;
+}
+
+// Plain loads; foldable as loads and rematerializable.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def MOV8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR8:$dst, (load addr:$src))]>;
+def MOV16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(set GR16:$dst, (load addr:$src))]>;
+}
+
+// Byte moves into a 16-bit register double as i8->i16 zero-extension
+// (byte operations zero the high half; see the def8 comment below).
+def MOVZX16rr8 : I8rr<0x0,
+                      (outs GR16:$dst), (ins GR8:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zext GR8:$src))]>;
+def MOVZX16rm8 : I8rm<0x0,
+                      (outs GR16:$dst), (ins memsrc:$src),
+                      "mov.b\t{$src, $dst}",
+                      [(set GR16:$dst, (zextloadi16i8 addr:$src))]>;
+
+// Post-increment loads: also write the incremented base back to $base_wb.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, Constraints = "$base = $base_wb" in {
+def MOV8rm_POST  : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb), (ins GR16:$base),
+                         "mov.b\t{@$base+, $dst}", []>;
+def MOV16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb), (ins GR16:$base),
+                           "mov.w\t{@$base+, $dst}", []>;
+}
+
+// Any instruction that defines an 8-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
+// be copying from a truncate, but any other 8-bit operation will zero-extend
+// up to 16 bits.
+def def8 : PatLeaf<(i8 GR8:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 8-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i16 (zext def8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+// Stores: immediate-to-memory, register-to-memory and memory-to-memory.
+def MOV8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 imm:$src), addr:$dst)]>;
+
+def MOV8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store GR16:$src, addr:$dst)]>;
+
+def MOV8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(store (i8 (load addr:$src)), addr:$dst)]>;
+def MOV16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "mov.w\t{$src, $dst}",
+                    [(store (i16 (load addr:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+let isTwoAddress = 1 in {
+
+let Defs = [SRW] in {
+
+let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
+
+// add also sets the status register (carry/zero/etc.), hence (implicit SRW).
+def ADD8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src1, GR8:$src2)),
+                    (implicit SRW)]>;
+def ADD16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src1, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def ADD8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))),
+                    (implicit SRW)]>;
+def ADD16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+// Post-increment source forms; the incremented base is written to $base_wb.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src1 = $dst" in {
+def ADD8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src1, GR16:$base),
+                         "add.b\t{@$base+, $dst}", []>;
+def ADD16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src1, GR16:$base),
+                          "add.w\t{@$base+, $dst}", []>;
+}
+
+
+def ADD8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "add.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (add GR8:$src1, imm:$src2)),
+                    (implicit SRW)]>;
+def ADD16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "add.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (add GR16:$src1, imm:$src2)),
+                     (implicit SRW)]>;
+
+// Memory-destination forms (read-modify-write); not two-address since the
+// destination is a memory operand, not a register.
+let isTwoAddress = 0 in {
+def ADD8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADD8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADD8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "add.b\t{$src, $dst}",
+                   [(store (add (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def ADD16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "add.w\t{$src, $dst}",
+                    [(store (add (load addr:$dst), 
+                                  (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+let Uses = [SRW] in {
+
+// Add-with-carry: adde consumes the carry in SRW (Uses = [SRW] above) and
+// produces a new one (implicit SRW).
+let isCommutable = 1 in { // X = ADDC Y, Z  == X = ADDC Z, Y
+def ADC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src1, GR8:$src2)),
+                    (implicit SRW)]>;
+def ADC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src1, GR16:$src2)),
+                     (implicit SRW)]>;
+} // isCommutable
+
+def ADC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src1, imm:$src2)),
+                    (implicit SRW)]>;
+def ADC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src1, imm:$src2)),
+                     (implicit SRW)]>;
+
+def ADC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                   "addc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (adde GR8:$src1, (load addr:$src2))),
+                    (implicit SRW)]>;
+def ADC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    "addc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (adde GR16:$src1, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+// Memory-destination add-with-carry forms (read-modify-write); not
+// two-address since the destination is a memory operand.
+let isTwoAddress = 0 in {
+def ADC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def ADC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def ADC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "addc.b\t{$src, $dst}",
+                   [(store (adde (load addr:$dst),
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+// Fixed: this 16-bit word form (addc.w, i16 pattern) was mistakenly
+// declared with the 8-bit format class I8mm; it must use I16mm like every
+// other *16mm record in this file (ADD16mm, AND16mm, XOR16mm, ...).
+def ADC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "addc.w\t{$src, $dst}",
+                    [(store (adde (load addr:$dst),
+                                  (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+// Bitwise AND; sets the status register, hence (implicit SRW).
+let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
+def AND8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src1, GR8:$src2)),
+                    (implicit SRW)]>;
+def AND16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src1, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def AND8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src1, imm:$src2)),
+                    (implicit SRW)]>;
+def AND16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src1, imm:$src2)),
+                     (implicit SRW)]>;
+
+def AND8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                   "and.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src1, (load addr:$src2))),
+                    (implicit SRW)]>;
+def AND16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    "and.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src1, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+// Post-increment source forms; the incremented base is written to $base_wb.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src1 = $dst" in {
+def AND8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src1, GR16:$base),
+                         "and.b\t{@$base+, $dst}", []>;
+def AND16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src1, GR16:$base),
+                           "and.w\t{@$base+, $dst}", []>;
+}
+
+// Memory-destination (read-modify-write) forms.
+let isTwoAddress = 0 in {
+def AND8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def AND8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def AND8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "and.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def AND16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "and.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+// Bitwise OR, implemented with bis ("bit set"). Note: unlike add/and/xor,
+// no (implicit SRW) here -- BIS does not affect the status bits on MSP430.
+let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
+def OR8rr  : I8rr<0x0,
+                  (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src1, GR8:$src2))]>;
+def OR16rr : I16rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src1, GR16:$src2))]>;
+}
+
+def OR8ri  : I8ri<0x0,
+                  (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src1, imm:$src2))]>;
+def OR16ri : I16ri<0x0,
+                   (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src1, imm:$src2))]>;
+
+def OR8rm  : I8rm<0x0,
+                  (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                  "bis.b\t{$src2, $dst}",
+                  [(set GR8:$dst, (or GR8:$src1, (load addr:$src2)))]>;
+def OR16rm : I16rm<0x0,
+                   (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                   "bis.w\t{$src2, $dst}",
+                   [(set GR16:$dst, (or GR16:$src1, (load addr:$src2)))]>;
+
+// Post-increment source forms; the incremented base is written to $base_wb.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src1 = $dst" in {
+def OR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                        (outs GR8:$dst, GR16:$base_wb),
+                        (ins GR8:$src1, GR16:$base),
+                        "bis.b\t{@$base+, $dst}", []>;
+def OR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src1, GR16:$base),
+                          "bis.w\t{@$base+, $dst}", []>;
+}
+
+// Memory-destination (read-modify-write) forms.
+let isTwoAddress = 0 in {
+def OR8mr  : I8mr<0x0,
+                  (outs), (ins memdst:$dst, GR8:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), GR8:$src), addr:$dst)]>;
+def OR16mr : I16mr<0x0,
+                   (outs), (ins memdst:$dst, GR16:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), GR16:$src), addr:$dst)]>;
+
+def OR8mi  : I8mi<0x0, 
+                  (outs), (ins memdst:$dst, i8imm:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (load addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def OR16mi : I16mi<0x0,
+                   (outs), (ins memdst:$dst, i16imm:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (load addr:$dst), (i16 imm:$src)), addr:$dst)]>;
+
+def OR8mm  : I8mm<0x0,
+                  (outs), (ins memdst:$dst, memsrc:$src),
+                  "bis.b\t{$src, $dst}",
+                  [(store (or (i8 (load addr:$dst)),
+                              (i8 (load addr:$src))), addr:$dst)]>;
+def OR16mm : I16mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bis.w\t{$src, $dst}",
+                   [(store (or (i16 (load addr:$dst)),
+                               (i16 (load addr:$src))), addr:$dst)]>;
+}
+
+// bic ("bit clear") computes dst &= ~src; it does not modify condition
+// codes, so no (implicit SRW) and no immediate forms are defined here.
+def BIC8rr :  I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "bic.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (and GR8:$src1, (not GR8:$src2)))]>;
+def BIC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src1, (not GR16:$src2)))]>;
+
+def BIC8rm :  I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                   "bic.b\t{$src2, $dst}",
+                    [(set GR8:$dst, (and GR8:$src1, (not (i8 (load addr:$src2)))))]>;
+def BIC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    "bic.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (and GR16:$src1, (not (i16 (load addr:$src2)))))]>;
+
+// Memory-destination (read-modify-write) forms.
+let isTwoAddress = 0 in {
+def BIC8mr :  I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst), (not GR8:$src)), addr:$dst)]>;
+def BIC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst), (not GR16:$src)), addr:$dst)]>;
+
+def BIC8mm :  I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "bic.b\t{$src, $dst}",
+                   [(store (and (load addr:$dst),
+                                (not (i8 (load addr:$src)))), addr:$dst)]>;
+def BIC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "bic.w\t{$src, $dst}",
+                    [(store (and (load addr:$dst),
+                                 (not (i16 (load addr:$src)))), addr:$dst)]>;
+}
+
+// Bitwise XOR; sets the status register, hence (implicit SRW).
+let isCommutable = 1 in { // X = XOR Y, Z  == X = XOR Z, Y
+def XOR8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)),
+                    (implicit SRW)]>;
+def XOR16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)),
+                     (implicit SRW)]>;
+}
+
+def XOR8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src1, imm:$src2)),
+                    (implicit SRW)]>;
+def XOR16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src1, imm:$src2)),
+                     (implicit SRW)]>;
+
+def XOR8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                   "xor.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))),
+                    (implicit SRW)]>;
+def XOR16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    "xor.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+// Post-increment source forms; the incremented base is written to $base_wb.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src1 = $dst" in {
+def XOR8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src1, GR16:$base),
+                         "xor.b\t{@$base+, $dst}", []>;
+def XOR16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                           (outs GR16:$dst, GR16:$base_wb),
+                           (ins GR16:$src1, GR16:$base),
+                           "xor.w\t{@$base+, $dst}", []>;
+}
+
+// Memory-destination (read-modify-write) forms.
+let isTwoAddress = 0 in {
+def XOR8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def XOR8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def XOR8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "xor.b\t{$src, $dst}",
+                   [(store (xor (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def XOR16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "xor.w\t{$src, $dst}",
+                    [(store (xor (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+
+// SUB: all addressing-mode combinations (rr/ri/rm, post-increment, and
+// memory-destination). Every pattern implicitly defines SRW (flags).
+def SUB8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)),
+                    (implicit SRW)]>;
+def SUB16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)),
+                     (implicit SRW)]>;
+
+def SUB8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src1, imm:$src2)),
+                    (implicit SRW)]>;
+def SUB16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src1, imm:$src2)),
+                     (implicit SRW)]>;
+
+def SUB8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                   "sub.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))),
+                    (implicit SRW)]>;
+def SUB16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    "sub.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+// Post-increment source forms (@Rn+); empty pattern list, selected manually.
+let mayLoad = 1, hasExtraDefRegAllocReq = 1, 
+Constraints = "$base = $base_wb, $src1 = $dst" in {
+def SUB8rm_POST : IForm8<0x0, DstReg, SrcPostInc, Size2Bytes,
+                         (outs GR8:$dst, GR16:$base_wb),
+                         (ins GR8:$src1, GR16:$base),
+                         "sub.b\t{@$base+, $dst}", []>;
+def SUB16rm_POST : IForm16<0x0, DstReg, SrcPostInc, Size2Bytes,
+                          (outs GR16:$dst, GR16:$base_wb),
+                          (ins GR16:$src1, GR16:$base),
+                          "sub.w\t{@$base+, $dst}", []>;
+}
+
+// Memory-destination read-modify-write forms; not two-address.
+let isTwoAddress = 0 in {
+def SUB8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def SUB8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def SUB8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "sub.b\t{$src, $dst}",
+                   [(store (sub (load addr:$dst), 
+                                (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def SUB16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "sub.w\t{$src, $dst}",
+                    [(store (sub (load addr:$dst), 
+                                 (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+// SBC ("subc"): subtract with borrow. These both read SRW (the incoming
+// carry, hence Uses = [SRW]) and write it (implicit SRW in each pattern),
+// matching the 'sube' (subtract-extended) selection DAG node.
+let Uses = [SRW] in {
+def SBC8rr  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src1, GR8:$src2)),
+                    (implicit SRW)]>;
+def SBC16rr : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src1, GR16:$src2)),
+                     (implicit SRW)]>;
+
+def SBC8ri  : I8ri<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src1, imm:$src2)),
+                    (implicit SRW)]>;
+def SBC16ri : I16ri<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src1, imm:$src2)),
+                     (implicit SRW)]>;
+
+def SBC8rm  : I8rm<0x0,
+                   (outs GR8:$dst), (ins GR8:$src1, memsrc:$src2),
+                   "subc.b\t{$src2, $dst}",
+                   [(set GR8:$dst, (sube GR8:$src1, (load addr:$src2))),
+                    (implicit SRW)]>;
+def SBC16rm : I16rm<0x0,
+                    (outs GR16:$dst), (ins GR16:$src1, memsrc:$src2),
+                    "subc.w\t{$src2, $dst}",
+                    [(set GR16:$dst, (sube GR16:$src1, (load addr:$src2))),
+                     (implicit SRW)]>;
+
+// Memory-destination read-modify-write forms; not two-address.
+let isTwoAddress = 0 in {
+def SBC8mr  : I8mr<0x0,
+                   (outs), (ins memdst:$dst, GR8:$src),
+                   "subc.b\t{$src, $dst}",
+                  [(store (sube (load addr:$dst), GR8:$src), addr:$dst),
+                   (implicit SRW)]>;
+def SBC16mr : I16mr<0x0,
+                    (outs), (ins memdst:$dst, GR16:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), GR16:$src), addr:$dst),
+                     (implicit SRW)]>;
+
+def SBC8mi  : I8mi<0x0,
+                   (outs), (ins memdst:$dst, i8imm:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst), (i8 imm:$src)), addr:$dst),
+                    (implicit SRW)]>;
+def SBC16mi : I16mi<0x0,
+                    (outs), (ins memdst:$dst, i16imm:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst), (i16 imm:$src)), addr:$dst),
+                     (implicit SRW)]>;
+
+def SBC8mm  : I8mm<0x0,
+                   (outs), (ins memdst:$dst, memsrc:$src),
+                   "subc.b\t{$src, $dst}",
+                   [(store (sube (load addr:$dst),
+                                 (i8 (load addr:$src))), addr:$dst),
+                    (implicit SRW)]>;
+def SBC16mm : I16mm<0x0,
+                    (outs), (ins memdst:$dst, memsrc:$src),
+                    "subc.w\t{$src, $dst}",
+                    [(store (sube (load addr:$dst),
+                            (i16 (load addr:$src))), addr:$dst),
+                     (implicit SRW)]>;
+}
+
+} // Uses = [SRW]
+
+// FIXME: memory variant!
+// Single-bit shifts. "rra" is an arithmetic shift right by one (matched by
+// the MSP430rra node); "rla" is a shift left by one (MSP430rla).
+def SAR8r1  : II8r<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rra.b\t$dst",
+                   [(set GR8:$dst, (MSP430rra GR8:$src)),
+                    (implicit SRW)]>;
+def SAR16r1 : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rra.w\t$dst",
+                    [(set GR16:$dst, (MSP430rra GR16:$src)),
+                     (implicit SRW)]>;
+
+def SHL8r1  : I8rr<0x0,
+                   (outs GR8:$dst), (ins GR8:$src),
+                   "rla.b\t$dst",
+                   [(set GR8:$dst, (MSP430rla GR8:$src)),
+                    (implicit SRW)]>;
+def SHL16r1 : I16rr<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "rla.w\t$dst",
+                    [(set GR16:$dst, (MSP430rla GR16:$src)),
+                     (implicit SRW)]>;
+
+// Pseudo: "clrc" then "rrc" -- rotate right through a cleared carry, i.e. a
+// logical shift right by one (matched by the MSP430rrc node).
+def SAR8r1c  : Pseudo<(outs GR8:$dst), (ins GR8:$src),
+                      "clrc\n\t"
+                      "rrc.b\t$dst",
+                      [(set GR8:$dst, (MSP430rrc GR8:$src)),
+                       (implicit SRW)]>;
+def SAR16r1c : Pseudo<(outs GR16:$dst), (ins GR16:$src),
+                      "clrc\n\t"
+                      "rrc.w\t$dst",
+                      [(set GR16:$dst, (MSP430rrc GR16:$src)),
+                       (implicit SRW)]>;
+
+// FIXME: Memory sext's ?
+// "sxt" sign-extends the low byte of a register to 16 bits.
+def SEXT16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "sxt\t$dst",
+                    [(set GR16:$dst, (sext_inreg GR16:$src, i8)),
+                     (implicit SRW)]>;
+
+} // Defs = [SRW]
+
+// Zero-extension of the low byte: a byte-sized mov into a 16-bit register
+// clears the high byte. NOTE(review): declared with the I8rr format class
+// despite GR16 operands -- presumably because the encoding is the byte-mov
+// form; verify against the instruction format definitions.
+def ZEXT16r : I8rr<0x0,
+                   (outs GR16:$dst), (ins GR16:$src),
+                   "mov.b\t{$src, $dst}",
+                   [(set GR16:$dst, (zext (trunc GR16:$src)))]>;
+
+// FIXME: Memory bitswaps?
+// "swpb" swaps the two bytes of a word, i.e. a 16-bit bswap.
+def SWPB16r : II16r<0x0,
+                    (outs GR16:$dst), (ins GR16:$src),
+                    "swpb\t$dst",
+                    [(set GR16:$dst, (bswap GR16:$src))]>;
+
+} // isTwoAddress = 1
+
+// Integer comparisons
+// CMP subtracts $src2 from $src1 and discards the result, setting only the
+// condition codes in SRW. BIT does the same with a bitwise AND.
+let Defs = [SRW] in {
+def CMP8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src1, GR8:$src2),
+                   "cmp.b\t{$src2, $src1}",
+                   [(MSP430cmp GR8:$src1, GR8:$src2), (implicit SRW)]>;
+def CMP16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src1, GR16:$src2),
+                    "cmp.w\t{$src2, $src1}",
+                    [(MSP430cmp GR16:$src1, GR16:$src2), (implicit SRW)]>;
+
+def CMP8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src1, i8imm:$src2),
+                   "cmp.b\t{$src2, $src1}",
+                   [(MSP430cmp GR8:$src1, imm:$src2), (implicit SRW)]>;
+def CMP16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src1, i16imm:$src2),
+                    "cmp.w\t{$src2, $src1}",
+                    [(MSP430cmp GR16:$src1, imm:$src2), (implicit SRW)]>;
+
+def CMP8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src1, i8imm:$src2),
+                   "cmp.b\t{$src2, $src1}",
+                   [(MSP430cmp (load addr:$src1),
+                               (i8 imm:$src2)), (implicit SRW)]>;
+def CMP16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src1, i16imm:$src2),
+                    "cmp.w\t{$src2, $src1}",
+                     [(MSP430cmp (load addr:$src1),
+                                 (i16 imm:$src2)), (implicit SRW)]>;
+
+def CMP8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src1, memsrc:$src2),
+                   "cmp.b\t{$src2, $src1}",
+                   [(MSP430cmp GR8:$src1, (load addr:$src2)), 
+                    (implicit SRW)]>;
+def CMP16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src1, memsrc:$src2),
+                    "cmp.w\t{$src2, $src1}",
+                    [(MSP430cmp GR16:$src1, (load addr:$src2)),
+                     (implicit SRW)]>;
+
+def CMP8mr  : I8mr<0x0,
+                   (outs), (ins memsrc:$src1, GR8:$src2),
+                   "cmp.b\t{$src2, $src1}",
+                   [(MSP430cmp (load addr:$src1), GR8:$src2),
+                    (implicit SRW)]>;
+def CMP16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src1, GR16:$src2),
+                    "cmp.w\t{$src2, $src1}",
+                    [(MSP430cmp (load addr:$src1), GR16:$src2), 
+                     (implicit SRW)]>;
+
+
+// BIT TESTS, just sets condition codes
+// Note that the C condition is set differently than when using CMP.
+// Matched as a compare of (and_su lhs, rhs) against zero; and_su presumably
+// restricts the AND to single-use so the flags-only form is safe to pick.
+let isCommutable = 1 in {
+def BIT8rr  : I8rr<0x0,
+                   (outs), (ins GR8:$src1, GR8:$src2),
+                   "bit.b\t{$src2, $src1}",
+                   [(MSP430cmp (and_su GR8:$src1, GR8:$src2), 0),
+                    (implicit SRW)]>;
+def BIT16rr : I16rr<0x0,
+                    (outs), (ins GR16:$src1, GR16:$src2),
+                    "bit.w\t{$src2, $src1}",
+                    [(MSP430cmp (and_su GR16:$src1, GR16:$src2), 0),
+                     (implicit SRW)]>;
+}
+def BIT8ri  : I8ri<0x0,
+                   (outs), (ins GR8:$src1, i8imm:$src2),
+                   "bit.b\t{$src2, $src1}",
+                   [(MSP430cmp (and_su GR8:$src1, imm:$src2), 0),
+                    (implicit SRW)]>;
+def BIT16ri : I16ri<0x0,
+                    (outs), (ins GR16:$src1, i16imm:$src2),
+                    "bit.w\t{$src2, $src1}",
+                    [(MSP430cmp (and_su GR16:$src1, imm:$src2), 0),
+                     (implicit SRW)]>;
+
+// NOTE(review): these two use memdst for a source operand, whereas the
+// sibling CMP*rm definitions use memsrc -- likely should be memsrc; verify.
+def BIT8rm  : I8rm<0x0,
+                   (outs), (ins GR8:$src1, memdst:$src2),
+                   "bit.b\t{$src2, $src1}",
+                   [(MSP430cmp (and_su GR8:$src1,  (load addr:$src2)), 0),
+                    (implicit SRW)]>;
+def BIT16rm : I16rm<0x0,
+                    (outs), (ins GR16:$src1, memdst:$src2),
+                    "bit.w\t{$src2, $src1}",
+                    [(MSP430cmp (and_su GR16:$src1,  (load addr:$src2)), 0),
+                     (implicit SRW)]>;
+
+def BIT8mr  : I8mr<0x0,
+                  (outs), (ins memsrc:$src1, GR8:$src2),
+                  "bit.b\t{$src2, $src1}",
+                  [(MSP430cmp (and_su (load addr:$src1), GR8:$src2), 0),
+                   (implicit SRW)]>;
+def BIT16mr : I16mr<0x0,
+                    (outs), (ins memsrc:$src1, GR16:$src2),
+                    "bit.w\t{$src2, $src1}",
+                    [(MSP430cmp (and_su (load addr:$src1), GR16:$src2), 0),
+                     (implicit SRW)]>;
+
+def BIT8mi  : I8mi<0x0,
+                   (outs), (ins memsrc:$src1, i8imm:$src2),
+                   "bit.b\t{$src2, $src1}",
+                   [(MSP430cmp (and_su (load addr:$src1), (i8 imm:$src2)), 0),
+                    (implicit SRW)]>;
+def BIT16mi : I16mi<0x0,
+                    (outs), (ins memsrc:$src1, i16imm:$src2),
+                    "bit.w\t{$src2, $src1}",
+                    [(MSP430cmp (and_su (load addr:$src1), (i16 imm:$src2)), 0),
+                     (implicit SRW)]>;
+
+def BIT8mm  : I8mm<0x0,
+                   (outs), (ins memsrc:$src1, memsrc:$src2),
+                   "bit.b\t{$src2, $src1}",
+                   [(MSP430cmp (and_su (i8 (load addr:$src1)),
+                                       (load addr:$src2)),
+                                 0),
+                      (implicit SRW)]>;
+def BIT16mm : I16mm<0x0,
+                    (outs), (ins memsrc:$src1, memsrc:$src2),
+                    "bit.w\t{$src2, $src1}",
+                    [(MSP430cmp (and_su (i16 (load addr:$src1)),
+                                        (load addr:$src2)),
+                                 0),
+                     (implicit SRW)]>;
+} // Defs = [SRW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+
+// extload: any-extending i8 load becomes a zero-extending byte load.
+def : Pat<(extloadi16i8 addr:$src), (MOVZX16rm8 addr:$src)>;
+
+// anyext: place the 8-bit value into the low subregister of a 16-bit
+// register; the high byte is left undefined (SUBREG_TO_REG with 0).
+def : Pat<(i16 (anyext GR8:$src)),
+          (SUBREG_TO_REG (i16 0), GR8:$src, subreg_8bit)>;
+
+// truncs: truncation is just taking the low 8-bit subregister.
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, subreg_8bit)>;
+
+// GlobalAddress, ExternalSymbol: wrapped symbol addresses are materialized
+// as 16-bit immediates, folded into adds, or stored directly.
+def : Pat<(i16 (MSP430Wrapper tglobaladdr:$dst)), (MOV16ri tglobaladdr:$dst)>;
+def : Pat<(i16 (MSP430Wrapper texternalsym:$dst)), (MOV16ri texternalsym:$dst)>;
+
+def : Pat<(add GR16:$src1, (MSP430Wrapper tglobaladdr :$src2)),
+          (ADD16ri GR16:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR16:$src1, (MSP430Wrapper texternalsym:$src2)),
+          (ADD16ri GR16:$src1, texternalsym:$src2)>;
+
+def : Pat<(store (i16 (MSP430Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV16mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i16 (MSP430Wrapper texternalsym:$src)), addr:$dst),
+          (MOV16mi addr:$dst, texternalsym:$src)>;
+
+// calls
+def : Pat<(MSP430call (i16 tglobaladdr:$dst)),
+          (CALLi tglobaladdr:$dst)>;
+def : Pat<(MSP430call (i16 texternalsym:$dst)),
+          (CALLi texternalsym:$dst)>;
+
+// add and sub always produce carry, so carry-producing addc/subc can be
+// selected to the plain ADD/SUB instructions in every addressing mode.
+def : Pat<(addc GR16:$src1, GR16:$src2),
+          (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(addc GR16:$src1, (load addr:$src2)),
+          (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(addc GR16:$src1, imm:$src2),
+          (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR16:$src), addr:$dst),
+          (ADD16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (ADD16mm addr:$dst, addr:$src)>;
+
+def : Pat<(addc GR8:$src1, GR8:$src2),
+          (ADD8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(addc GR8:$src1, (load addr:$src2)),
+          (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(addc GR8:$src1, imm:$src2),
+          (ADD8ri GR8:$src1, imm:$src2)>;
+def : Pat<(store (addc (load addr:$dst), GR8:$src), addr:$dst),
+          (ADD8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (addc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (ADD8mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR16:$src1, GR16:$src2),
+          (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(subc GR16:$src1, (load addr:$src2)),
+          (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(subc GR16:$src1, imm:$src2),
+          (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR16:$src), addr:$dst),
+          (SUB16mr addr:$dst, GR16:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i16 (load addr:$src))), addr:$dst),
+          (SUB16mm addr:$dst, addr:$src)>;
+
+def : Pat<(subc GR8:$src1, GR8:$src2),
+          (SUB8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(subc GR8:$src1, (load addr:$src2)),
+          (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(subc GR8:$src1, imm:$src2),
+          (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(store (subc (load addr:$dst), GR8:$src), addr:$dst),
+          (SUB8mr addr:$dst, GR8:$src)>;
+def : Pat<(store (subc (load addr:$dst), (i8 (load addr:$src))), addr:$dst),
+          (SUB8mm addr:$dst, addr:$src)>;
+
+// peephole patterns
+// (and x, 255) is the same as zero-extending the low byte in place.
+def : Pat<(and GR16:$src, 255), (ZEXT16r GR16:$src)>;
+// A truncated single-use AND compared to zero can use the byte-sized BIT
+// on the low subregisters.
+def : Pat<(MSP430cmp (trunc (and_su GR16:$src1, GR16:$src2)), 0),
+          (BIT8rr (EXTRACT_SUBREG GR16:$src1, subreg_8bit),
+                  (EXTRACT_SUBREG GR16:$src2, subreg_8bit))>;
diff --git a/lib/Target/MSP430/MSP430MCAsmInfo.cpp b/lib/Target/MSP430/MSP430MCAsmInfo.cpp
new file mode 100644
index 0000000..cfb499d
--- /dev/null
+++ b/lib/Target/MSP430/MSP430MCAsmInfo.cpp
@@ -0,0 +1,26 @@
+//===-- MSP430MCAsmInfo.cpp - MSP430 asm properties -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MSP430MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430MCAsmInfo.h"
+using namespace llvm;
+
+// Configure MSP430 assembly dialect properties. The target triple TT is
+// currently unused; all MSP430 targets share one dialect here.
+MSP430MCAsmInfo::MSP430MCAsmInfo(const Target &T, const StringRef &TT) {
+  PrivateGlobalPrefix = ".L";   // Assembler-local labels look like ".Lfoo".
+  WeakRefDirective ="\t.weak\t";
+  PCSymbol=".";                 // "." denotes the current PC in expressions.
+  CommentString = ";";          // MSP430 asm comments start with ';'.
+
+  AlignmentIsInBytes = false;   // .align argument is a power-of-two exponent.
+  AllowNameToStartWithDigit = true;
+  UsesELFSectionDirectiveForBSS = true;
+}
diff --git a/lib/Target/MSP430/MSP430MCAsmInfo.h b/lib/Target/MSP430/MSP430MCAsmInfo.h
new file mode 100644
index 0000000..8318029
--- /dev/null
+++ b/lib/Target/MSP430/MSP430MCAsmInfo.h
@@ -0,0 +1,28 @@
+//=====-- MSP430MCAsmInfo.h - MSP430 asm properties -----------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MSP430MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430TARGETASMINFO_H
+#define MSP430TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+
+  /// MSP430MCAsmInfo - Describes the MSP430 assembly dialect (comment
+  /// string, alignment directive semantics, label prefixes). The fields
+  /// inherited from MCAsmInfo are set in the constructor; see
+  /// MSP430MCAsmInfo.cpp.
+  struct MSP430MCAsmInfo : public MCAsmInfo {
+    explicit MSP430MCAsmInfo(const Target &T, const StringRef &TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
new file mode 100644
index 0000000..383fd2e
--- /dev/null
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -0,0 +1,46 @@
+//===- MSP430MachineFuctionInfo.h - MSP430 machine function info -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares MSP430-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MSP430MACHINEFUNCTIONINFO_H
+#define MSP430MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// MSP430MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private MSP430 target-specific information for each MachineFunction.
+class MSP430MachineFunctionInfo : public MachineFunctionInfo {
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+public:
+  // Initialize every member: the default constructor previously left
+  // ReturnAddrIndex uninitialized (the MachineFunction constructor below
+  // already zeroes both).
+  MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+
+  explicit MSP430MachineFunctionInfo(MachineFunction &MF)
+    : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
new file mode 100644
index 0000000..566d902
--- /dev/null
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -0,0 +1,426 @@
+//===- MSP430RegisterInfo.cpp - MSP430 Register Information ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msp430-reg-info"
+
+#include "MSP430.h"
+#include "MSP430MachineFunctionInfo.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+// FIXME: Provide proper call frame setup / destroy opcodes.
+// FIXME: Provide proper call frame setup / destroy opcodes.
+// Construct the register-info object; the ADJCALLSTACK pseudos are passed
+// to the generated base class as the call-frame setup/destroy opcodes.
+// StackAlign is cached from the target's frame info for use when rounding
+// call-frame adjustments.
+MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm,
+                                       const TargetInstrInfo &tii)
+  : MSP430GenRegisterInfo(MSP430::ADJCALLSTACKDOWN, MSP430::ADJCALLSTACKUP),
+    TM(tm), TII(tii) {
+  StackAlign = TM.getFrameInfo()->getStackAlignment();
+}
+
+// Return the null-terminated list of callee-saved registers. Four variants:
+// interrupt handlers (MSP430_INTR) must additionally preserve R12W-R15W, and
+// when a frame pointer is in use FPW is dropped from the list (it is spilled
+// by the prologue instead).
+const unsigned*
+MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  const Function* F = MF->getFunction();
+  static const unsigned CalleeSavedRegs[] = {
+    MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W,
+    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    0
+  };
+  static const unsigned CalleeSavedRegsFP[] = {
+    MSP430::R5W, MSP430::R6W, MSP430::R7W,
+    MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
+    0
+  };
+  static const unsigned CalleeSavedRegsIntr[] = {
+    MSP430::FPW,  MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
+    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
+    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    0
+  };
+  static const unsigned CalleeSavedRegsIntrFP[] = {
+    MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
+    MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
+    MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
+    0
+  };
+
+  if (hasFP(*MF))
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntrFP : CalleeSavedRegsFP);
+  else
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegsIntr : CalleeSavedRegs);
+
+}
+
+// Return the register classes matching getCalleeSavedRegs(), one entry per
+// register plus the null terminator. All saved registers are 16-bit GPRs,
+// so every entry is GR16. The four arrays must stay in sync with the
+// corresponding register lists above (8, 7, 12 and 11 entries).
+const TargetRegisterClass *const *
+MSP430RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  const Function* F = MF->getFunction();
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClassesFP[] = {
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, 0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClassesIntr[] = {
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClassesIntrFP[] = {
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, &MSP430::GR16RegClass,
+    &MSP430::GR16RegClass, 0
+  };
+
+  if (hasFP(*MF))
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegClassesIntrFP : CalleeSavedRegClassesFP);
+  else
+    return (F->getCallingConv() == CallingConv::MSP430_INTR ?
+            CalleeSavedRegClassesIntr : CalleeSavedRegClasses);
+}
+
+// Registers the allocator must never use: the program counter, stack
+// pointer, status register and constant generator, plus the frame pointer
+// when this function needs one.
+BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+
+  // Mark 4 special registers as reserved.
+  Reserved.set(MSP430::PCW);
+  Reserved.set(MSP430::SPW);
+  Reserved.set(MSP430::SRW);
+  Reserved.set(MSP430::CGW);
+
+  // Mark frame pointer as reserved if needed.
+  if (hasFP(MF))
+    Reserved.set(MSP430::FPW);
+
+  return Reserved;
+}
+
+// Pointers are 16-bit on MSP430, so any GR16 register can hold an address.
+const TargetRegisterClass *
+MSP430RegisterInfo::getPointerRegClass(unsigned Kind) const {
+  return &MSP430::GR16RegClass;
+}
+
+
+// hasFP - Return true if this function must use a dedicated frame pointer:
+// when frame-pointer elimination is disabled, when the frame contains
+// variable-sized objects (so SP offsets are not fixed), or when the frame
+// address is taken. Consistently uses the already-fetched MFI instead of
+// calling MF.getFrameInfo() a second time.
+bool MSP430RegisterInfo::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  return (NoFramePointerElim ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+// The call frame can be folded into the fixed stack frame unless the
+// function has variable-sized objects, in which case SP adjustments must be
+// emitted around each call (see eliminateCallFramePseudoInstr).
+bool MSP430RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+// Lower the ADJCALLSTACKDOWN / ADJCALLSTACKUP pseudo at I into real SP
+// arithmetic (or nothing, when the call frame is reserved in the fixed
+// frame), then erase the pseudo. Operand 0 of the pseudo is the frame size;
+// operand 1 of the destroy pseudo is the amount the callee already popped.
+void MSP430RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub SPW, <amt>' and the
+    // adjcallstackdown instruction into 'add SPW, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+
+      MachineInstr *New = 0;
+      if (Old->getOpcode() == getCallFrameSetupOpcode()) {
+        New = BuildMI(MF, Old->getDebugLoc(),
+                      TII.get(MSP430::SUB16ri), MSP430::SPW)
+          .addReg(MSP430::SPW).addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == getCallFrameDestroyOpcode());
+        // factor out the amount the callee already popped.
+        uint64_t CalleeAmt = Old->getOperand(1).getImm();
+        Amount -= CalleeAmt;
+        if (Amount)
+          New = BuildMI(MF, Old->getDebugLoc(),
+                        TII.get(MSP430::ADD16ri), MSP430::SPW)
+            .addReg(MSP430::SPW).addImm(Amount);
+      }
+
+      if (New) {
+        // The SRW implicit def is dead.
+        New->getOperand(3).setIsDead();
+
+        // Replace the pseudo instruction with a new instruction...
+        MBB.insert(I, New);
+      }
+    }
+  } else if (I->getOpcode() == getCallFrameDestroyOpcode()) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.
+    if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+      MachineInstr *Old = I;
+      MachineInstr *New =
+        BuildMI(MF, Old->getDebugLoc(), TII.get(MSP430::SUB16ri),
+                MSP430::SPW).addReg(MSP430::SPW).addImm(CalleeAmt);
+      // The SRW implicit def is dead.
+      New->getOperand(3).setIsDead();
+
+      MBB.insert(I, New);
+    }
+  }
+
+  // The pseudo itself is always removed.
+  MBB.erase(I);
+}
+
+// Replace an abstract frame-index operand in *II with a concrete base
+// register (FPW or SPW) plus immediate offset. ADD16ri ("load effective
+// address" of a stack slot) gets special expansion into MOV + ADD/SUB
+// because MSP430 instructions are two-address.
+unsigned
+MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                        int SPAdj, int *Value,
+                                        RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+  // Locate the frame-index operand within the instruction.
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+
+  unsigned BasePtr = (hasFP(MF) ? MSP430::FPW : MSP430::SPW);
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  // Skip the saved PC
+  Offset += 2;
+
+  if (!hasFP(MF))
+    Offset += MF.getFrameInfo()->getStackSize();
+  else
+    Offset += 2; // Skip the saved FPW
+
+  // Fold imm into offset
+  Offset += MI.getOperand(i+1).getImm();
+
+  if (MI.getOpcode() == MSP430::ADD16ri) {
+    // This is actually "load effective address" of the stack slot
+    // instruction. We have only two-address instructions, thus we need to
+    // expand it into mov + add
+
+    MI.setDesc(TII.get(MSP430::MOV16rr));
+    MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+    if (Offset == 0)
+      return 0;
+
+    // We need to materialize the offset via add instruction.
+    unsigned DstReg = MI.getOperand(0).getReg();
+    if (Offset < 0)
+      BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::SUB16ri), DstReg)
+        .addReg(DstReg).addImm(-Offset);
+    else
+      BuildMI(MBB, llvm::next(II), dl, TII.get(MSP430::ADD16ri), DstReg)
+        .addReg(DstReg).addImm(Offset);
+
+    return 0;
+  }
+
+  // Common case: rewrite the FI operand in place as base + offset.
+  MI.getOperand(i).ChangeToRegister(BasePtr, false);
+  MI.getOperand(i+1).ChangeToImmediate(Offset);
+  return 0;
+}
+
+void
+MSP430RegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
+                                                                         const {
+  // Create a frame entry for the FPW register that must be saved.
+  if (hasFP(MF)) {
+    int ATTRIBUTE_UNUSED FrameIdx =
+      MF.getFrameInfo()->CreateFixedObject(2, -4, true, false); // 2-byte slot at fixed offset -4.
+    assert(FrameIdx == MF.getFrameInfo()->getObjectIndexBegin() &&
+           "Slot for FPW register must be last in order to be found!");
+  }
+}
+
+
+void MSP430RegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+                 DebugLoc::getUnknownLoc());
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  uint64_t StackSize = MFI->getStackSize();
+
+  uint64_t NumBytes = 0;
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2; // Less the 2-byte FPW save slot.
+    NumBytes = FrameSize - MSP430FI->getCalleeSavedFrameSize();
+
+    // The FPW save slot is guaranteed to be the last slot by
+    // processFunctionBeforeFrameFinalized.  Record the frame offset
+    // adjustment introduced by this frame setup.
+    MFI->setOffsetAdjustment(-NumBytes);
+
+    // Save FPW into the appropriate stack slot...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::PUSH16r))
+      .addReg(MSP430::FPW, RegState::Kill);
+
+    // Update FPW with the new base value...
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FPW)
+      .addReg(MSP430::SPW);
+
+    // Mark the FramePtr as live-in in every block except the entry.
+    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+         I != E; ++I)
+      I->addLiveIn(MSP430::FPW);
+
+  } else
+    NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
+
+  // Skip the callee-saved push instructions.
+  while (MBBI != MBB.end() && (MBBI->getOpcode() == MSP430::PUSH16r))
+    ++MBBI;
+
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc(); // Reuse the debug location following the pushes.
+
+  if (NumBytes) { // adjust stack pointer: SPW -= numbytes
+    // If there is an SUB16ri of SPW immediately before this instruction, merge
+    // the two.
+    //NumBytes -= mergeSPUpdates(MBB, MBBI, true);
+    // If there is an ADD16ri or SUB16ri of SPW immediately after this
+    // instruction, merge the two instructions.
+    // mergeSPUpdatesDown(MBB, MBBI, &NumBytes);
+
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::SUB16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+void MSP430RegisterInfo::emitEpilogue(MachineFunction &MF,
+                                      MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  MSP430MachineFunctionInfo *MSP430FI = MF.getInfo<MSP430MachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end()); // The return instruction.
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc DL = MBBI->getDebugLoc();
+
+  switch (RetOpcode) {
+  case MSP430::RET:
+  case MSP430::RETI: break;  // These are ok
+  default:
+    llvm_unreachable("Can only insert epilog into returning blocks");
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo
+  uint64_t StackSize = MFI->getStackSize();
+  unsigned CSSize = MSP430FI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = 0;
+
+  if (hasFP(MF)) {
+    // Calculate required stack adjustment
+    uint64_t FrameSize = StackSize - 2; // Less the 2-byte FPW save slot.
+    NumBytes = FrameSize - CSSize;
+
+    // pop FPW.
+    BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FPW);
+  } else
+    NumBytes = StackSize - CSSize;
+
+  // Skip the callee-saved pop instructions.
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = prior(MBBI);
+    unsigned Opc = PI->getOpcode();
+    if (Opc != MSP430::POP16r && !PI->getDesc().isTerminator())
+      break;
+    --MBBI;
+  }
+
+  DL = MBBI->getDebugLoc(); // Insert SP adjustment before the pops/terminators.
+
+  // If there is an ADD16ri or SUB16ri of SPW immediately before this
+  // instruction, merge the two instructions.
+  //if (NumBytes || MFI->hasVarSizedObjects())
+  //  mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);
+
+  if (MFI->hasVarSizedObjects()) { // Dynamic allocas: restore SPW from FPW.
+    BuildMI(MBB, MBBI, DL,
+            TII.get(MSP430::MOV16rr), MSP430::SPW).addReg(MSP430::FPW);
+    if (CSSize) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL,
+                TII.get(MSP430::SUB16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(CSSize); // Re-carve the callee-saved area.
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  } else {
+    // adjust stack pointer back: SPW += numbytes
+    if (NumBytes) {
+      MachineInstr *MI =
+        BuildMI(MBB, MBBI, DL, TII.get(MSP430::ADD16ri), MSP430::SPW)
+        .addReg(MSP430::SPW).addImm(NumBytes);
+      // The SRW implicit def is dead.
+      MI->getOperand(3).setIsDead();
+    }
+  }
+}
+
+unsigned MSP430RegisterInfo::getRARegister() const {
+  return MSP430::PCW; // Reported return-address register: r0 (PC).
+}
+
+unsigned MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return hasFP(MF) ? MSP430::FPW : MSP430::SPW; // FPW (r4) with a frame pointer, otherwise SPW (r1).
+}
+
+int MSP430RegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  llvm_unreachable("Not implemented yet!"); // No DWARF register mapping exists for MSP430 yet.
+  return 0; // Unreached; keeps the signature well-formed.
+}
+
+#include "MSP430GenRegisterInfo.inc"
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
new file mode 100644
index 0000000..aa08787
--- /dev/null
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -0,0 +1,71 @@
+//===- MSP430RegisterInfo.h - MSP430 Register Information Impl --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MSP430 implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430REGISTERINFO_H
+#define LLVM_TARGET_MSP430REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "MSP430GenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+class MSP430TargetMachine;
+
+struct MSP430RegisterInfo : public MSP430GenRegisterInfo { // MSP430 register info: frame lowering + register queries.
+private:
+  MSP430TargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+  /// StackAlign - Default stack alignment.
+  ///
+  unsigned StackAlign;
+public:
+  MSP430RegisterInfo(MSP430TargetMachine &tm, const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const*
+    getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+  const TargetRegisterClass* getPointerRegClass(unsigned Kind = 0) const;
+
+  bool hasFP(const MachineFunction &MF) const; // True if MF needs a dedicated frame pointer (FPW).
+  bool hasReservedCallFrame(MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF, // Lower ADJCALLSTACK pseudos into SPW arithmetic.
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II, // Rewrite abstract frame indices into FPW/SPW + offset.
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const; // Reserves the fixed FPW spill slot.
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  //! Get DWARF debugging register number
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430REGISTERINFO_H
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.td b/lib/Target/MSP430/MSP430RegisterInfo.td
new file mode 100644
index 0000000..4078626
--- /dev/null
+++ b/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -0,0 +1,122 @@
+//===- MSP430RegisterInfo.td - MSP430 Register defs ----------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MSP430 register file
+//===----------------------------------------------------------------------===//
+
+class MSP430Reg<bits<4> num, string n> : Register<n> { // Plain MSP430 register carrying its 4-bit encoding.
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs> // Register that aliases the listed sub-registers (the 16-bit "W" regs below).
+  : RegisterWithSubRegs<n, subregs> {
+  field bits<4> Num = num;
+  let Namespace = "MSP430";
+}
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+def PCB  : MSP430Reg<0,  "r0">;  // Program counter (8-bit view).
+def SPB  : MSP430Reg<1,  "r1">;  // Stack pointer.
+def SRB  : MSP430Reg<2,  "r2">;  // Status register.
+def CGB  : MSP430Reg<3,  "r3">;  // Constant generator.
+def FPB  : MSP430Reg<4,  "r4">;  // Frame pointer.
+def R5B  : MSP430Reg<5,  "r5">;
+def R6B  : MSP430Reg<6,  "r6">;
+def R7B  : MSP430Reg<7,  "r7">;
+def R8B  : MSP430Reg<8,  "r8">;
+def R9B  : MSP430Reg<9,  "r9">;
+def R10B : MSP430Reg<10, "r10">;
+def R11B : MSP430Reg<11, "r11">;
+def R12B : MSP430Reg<12, "r12">;
+def R13B : MSP430Reg<13, "r13">;
+def R14B : MSP430Reg<14, "r14">;
+def R15B : MSP430Reg<15, "r15">;
+
+def PCW  : MSP430RegWithSubregs<0,  "r0",  [PCB]>;  // 16-bit registers aliasing the 8-bit defs above.
+def SPW  : MSP430RegWithSubregs<1,  "r1",  [SPB]>;
+def SRW  : MSP430RegWithSubregs<2,  "r2",  [SRB]>;
+def CGW  : MSP430RegWithSubregs<3,  "r3",  [CGB]>;
+def FPW  : MSP430RegWithSubregs<4,  "r4",  [FPB]>;
+def R5W  : MSP430RegWithSubregs<5,  "r5",  [R5B]>;
+def R6W  : MSP430RegWithSubregs<6,  "r6",  [R6B]>;
+def R7W  : MSP430RegWithSubregs<7,  "r7",  [R7B]>;
+def R8W  : MSP430RegWithSubregs<8,  "r8",  [R8B]>;
+def R9W  : MSP430RegWithSubregs<9,  "r9",  [R9B]>;
+def R10W : MSP430RegWithSubregs<10, "r10", [R10B]>;
+def R11W : MSP430RegWithSubregs<11, "r11", [R11B]>;
+def R12W : MSP430RegWithSubregs<12, "r12", [R12B]>;
+def R13W : MSP430RegWithSubregs<13, "r13", [R13B]>;
+def R14W : MSP430RegWithSubregs<14, "r14", [R14B]>;
+def R15W : MSP430RegWithSubregs<15, "r15", [R15B]>;
+
+def : SubRegSet<1, [PCW, SPW, SRW, CGW, FPW,  // Sub-register index 1 maps each W reg to its B reg.
+                    R5W, R6W, R7W, R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
+                   [PCB, SPB, SRB, CGB, FPB,
+                    R5B, R6B, R7B, R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def subreg_8bit : PatLeaf<(i32 1)>;  // Pattern leaf for sub-register index 1 (low 8 bits).
+
+def GR8 : RegisterClass<"MSP430", [i8], 8,  // 8-bit general register class; order = allocation preference.
+   // Volatile registers
+  [R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
+   // Frame pointer, sometimes allocable
+   FPB,
+   // Volatile, but not allocable
+   PCB, SPB, SRB, CGB]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GR8Class::iterator
+    GR8Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Depending on whether the function uses frame pointer or not, last 5 or 4
+      // registers on the list above are reserved
+      if (RI->hasFP(MF))
+        return end()-5; // FPB also becomes unallocable.
+      else
+        return end()-4;
+    }
+  }];
+}
+
+def GR16 : RegisterClass<"MSP430", [i16], 16,  // 16-bit general register class; mirrors GR8's ordering.
+   // Volatile registers
+  [R12W, R13W, R14W, R15W, R11W, R10W, R9W, R8W, R7W, R6W, R5W,
+   // Frame pointer, sometimes allocable
+   FPW,
+   // Volatile, but not allocable
+   PCW, SPW, SRW, CGW]>
+{
+  let SubRegClassList = [GR8];  // Each GR16 reg has a GR8 sub-register.
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GR16Class::iterator
+    GR16Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Depending on whether the function uses frame pointer or not, last 5 or 4
+      // registers on the list above are reserved
+      if (RI->hasFP(MF))
+        return end()-5; // FPW also becomes unallocable.
+      else
+        return end()-4;
+    }
+  }];
+}
+
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
new file mode 100644
index 0000000..1346cb9
--- /dev/null
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -0,0 +1,25 @@
+//===- MSP430Subtarget.cpp - MSP430 Subtarget Information ---------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MSP430 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430Subtarget.h"
+#include "MSP430.h"
+#include "MSP430GenSubtarget.inc"
+
+using namespace llvm;
+
+MSP430Subtarget::MSP430Subtarget(const std::string &TT, const std::string &FS) {
+  std::string CPU = "generic"; // Only a generic CPU model is used for now.
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h
new file mode 100644
index 0000000..1070544
--- /dev/null
+++ b/lib/Target/MSP430/MSP430Subtarget.h
@@ -0,0 +1,38 @@
+//====-- MSP430Subtarget.h - Define Subtarget for the MSP430 ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MSP430_SUBTARGET_H
+#define LLVM_TARGET_MSP430_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+
+class MSP430Subtarget : public TargetSubtarget {
+  bool ExtendedInsts; // NOTE(review): presumably set by ParseSubtargetFeatures; not read anywhere visible -- confirm.
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  MSP430Subtarget(const std::string &TT, const std::string &FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+};
+} // End llvm namespace
+
+#endif  // LLVM_TARGET_MSP430_SUBTARGET_H
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
new file mode 100644
index 0000000..a0dbac2
--- /dev/null
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -0,0 +1,52 @@
+//===-- MSP430TargetMachine.cpp - Define TargetMachine for MSP430 ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the MSP430 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "MSP430MCAsmInfo.h"
+#include "MSP430TargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMSP430Target() { // Hook called by the target registry initialization machinery.
+  // Register the target.
+  RegisterTargetMachine<MSP430TargetMachine> X(TheMSP430Target);
+  RegisterAsmInfo<MSP430MCAsmInfo> Z(TheMSP430Target); // Register the asm-info factory too.
+}
+
+MSP430TargetMachine::MSP430TargetMachine(const Target &T,
+                                         const std::string &TT,
+                                         const std::string &FS) :
+  LLVMTargetMachine(T, TT),
+  Subtarget(TT, FS),
+  // FIXME: Check TargetData string.
+  DataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"), // Little-endian, 16-bit pointers.
+  InstrInfo(*this), TLInfo(*this),
+  FrameInfo(TargetFrameInfo::StackGrowsDown, 2, -2) { } // 2-byte stack alignment, local area offset -2.
+
+
+bool MSP430TargetMachine::addInstSelector(PassManagerBase &PM,
+                                          CodeGenOpt::Level OptLevel) {
+  // Install an instruction selector.
+  PM.add(createMSP430ISelDag(*this, OptLevel));
+  return false; // No error.
+}
+
+bool MSP430TargetMachine::addPreEmitPass(PassManagerBase &PM,
+                                         CodeGenOpt::Level OptLevel) {
+  // Must run branch selection immediately preceding the asm printer.
+  PM.add(createMSP430BranchSelectionPass());
+  return false; // No error.
+}
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
new file mode 100644
index 0000000..d93ac5c
--- /dev/null
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -0,0 +1,63 @@
+//==-- MSP430TargetMachine.h - Define TargetMachine for MSP430 ---*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MSP430 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_TARGET_MSP430_TARGETMACHINE_H
+#define LLVM_TARGET_MSP430_TARGETMACHINE_H
+
+#include "MSP430InstrInfo.h"
+#include "MSP430ISelLowering.h"
+#include "MSP430RegisterInfo.h"
+#include "MSP430Subtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// MSP430TargetMachine
+///
+class MSP430TargetMachine : public LLVMTargetMachine {
+  MSP430Subtarget        Subtarget;
+  const TargetData       DataLayout;       // Calculates type size & alignment
+  MSP430InstrInfo        InstrInfo;
+  MSP430TargetLowering   TLInfo;
+
+  // MSP430 needs no target-specific TargetFrameInfo subclass; the generic
+  // one, configured in the constructor, is sufficient.
+  TargetFrameInfo       FrameInfo;
+
+public:
+  MSP430TargetMachine(const Target &T, const std::string &TT,
+                      const std::string &FS);
+
+  virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+  virtual const MSP430InstrInfo *getInstrInfo() const  { return &InstrInfo; }
+  virtual const TargetData *getTargetData() const     { return &DataLayout;}
+  virtual const MSP430Subtarget *getSubtargetImpl() const { return &Subtarget; }
+
+  virtual const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo(); // Register info is owned by InstrInfo.
+  }
+
+  virtual MSP430TargetLowering *getTargetLowering() const {
+    return const_cast<MSP430TargetLowering*>(&TLInfo); // API requires non-const.
+  }
+
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+}; // MSP430TargetMachine.
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_MSP430_TARGETMACHINE_H
diff --git a/lib/Target/MSP430/Makefile b/lib/Target/MSP430/Makefile
new file mode 100644
index 0000000..b1f33d6
--- /dev/null
+++ b/lib/Target/MSP430/Makefile
@@ -0,0 +1,24 @@
+##===- lib/Target/MSP430/Makefile --------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source 
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMMSP430CodeGen
+TARGET = MSP430
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = MSP430GenRegisterInfo.h.inc MSP430GenRegisterNames.inc \
+		MSP430GenRegisterInfo.inc MSP430GenInstrNames.inc \
+		MSP430GenInstrInfo.inc MSP430GenAsmWriter.inc \
+		MSP430GenDAGISel.inc MSP430GenCallingConv.inc \
+		MSP430GenSubtarget.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/MSP430/README.txt b/lib/Target/MSP430/README.txt
new file mode 100644
index 0000000..5b9634b
--- /dev/null
+++ b/lib/Target/MSP430/README.txt
@@ -0,0 +1,40 @@
+//===---------------------------------------------------------------------===//
+// MSP430 backend.
+//===---------------------------------------------------------------------===//
+
+DISCLAIMER: This backend should be considered highly experimental. I have never
+seen nor worked with this MCU; all information was gathered from the datasheet
+only. The original intention of making this backend was to write documentation
+of the form "How to write a backend for dummies" :) These notes hopefully will
+be available pretty soon.
+
+Some things are incomplete / not implemented yet (this list surely is not
+complete as well):
+
+1. Verify how stuff handles implicit zext with 8 bit operands (this might
+currently be modelled in an improper way - do we need to mark the superreg as
+a def for every 8 bit instruction?).
+
+2. Libcalls: multiplication, division, remainder. Note that the calling
+convention for libcalls is incompatible with the libcall calling convention of
+msp430-gcc (these cannot be used anyway due to license restrictions).
+
+3. Implement multiplication / division by constant (dag combiner hook?).
+
+4. Implement non-constant shifts.
+
+5. Implement varargs stuff.
+
+6. Verify and fix (if needed) how stuff plays with i32 / i64.
+
+7. Implement floating point stuff (softfp?)
+
+8. Implement instruction encoding for (possible) direct code emission in the
+future.
+
+9. Since almost all instructions set flags - implement brcond / select in better
+way (currently they emit explicit comparison).
+
+10. Handle imm in comparisons in better way (see comment in MSP430InstrInfo.td)
+
+11. Implement hooks for better memory op folding, etc.
diff --git a/lib/Target/MSP430/TargetInfo/CMakeLists.txt b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..1d408d0
--- /dev/null
+++ b/lib/Target/MSP430/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMSP430Info
+  MSP430TargetInfo.cpp
+  )
+
+add_dependencies(LLVMMSP430Info MSP430Table_gen)
diff --git a/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
new file mode 100644
index 0000000..f9ca5c4
--- /dev/null
+++ b/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp
@@ -0,0 +1,20 @@
+//===-- MSP430TargetInfo.cpp - MSP430 Target Implementation ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MSP430.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMSP430Target; // The singleton Target object for MSP430.
+
+extern "C" void LLVMInitializeMSP430TargetInfo() { // Registers the target under the "msp430" name.
+  RegisterTarget<Triple::msp430> 
+    X(TheMSP430Target, "msp430", "MSP430 [experimental]");
+}
diff --git a/lib/Target/MSP430/TargetInfo/Makefile b/lib/Target/MSP430/TargetInfo/Makefile
new file mode 100644
index 0000000..abb08f2
--- /dev/null
+++ b/lib/Target/MSP430/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/MSP430/TargetInfo/Makefile ---------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMSP430Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Makefile b/lib/Target/Makefile
new file mode 100644
index 0000000..50a360f
--- /dev/null
+++ b/lib/Target/Makefile
@@ -0,0 +1,20 @@
+#===- lib/Target/Makefile ----------------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../..
+LIBRARYNAME = LLVMTarget
+BUILD_ARCHIVE = 1
+
+# We include this early so we can access the value of TARGETS_TO_BUILD as the
+# value for PARALLEL_DIRS which must be set before Makefile.rules is included
+include $(LEVEL)/Makefile.config
+
+PARALLEL_DIRS := $(TARGETS_TO_BUILD)
+
+include $(LLVM_SRC_ROOT)/Makefile.rules
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
new file mode 100644
index 0000000..ef6defc
--- /dev/null
+++ b/lib/Target/Mangler.cpp
@@ -0,0 +1,179 @@
+//===-- Mangler.cpp - Self-contained c/asm llvm name mangler --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Unified name mangler for assembly backends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/Mangler.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+using namespace llvm;
+
+static bool isAcceptableChar(char C) { // True for chars legal in a symbol without escaping: [A-Za-z0-9_$.@].
+  if ((C < 'a' || C > 'z') &&
+      (C < 'A' || C > 'Z') &&
+      (C < '0' || C > '9') &&
+      C != '_' && C != '$' && C != '.' && C != '@')
+    return false;
+  return true;
+}
+
+static char HexDigit(int V) { // Map V in [0,16) to '0'..'9', 'A'..'F'.
+  return V < 10 ? V+'0' : V+'A'-10;
+}
+
+static void MangleLetter(SmallVectorImpl<char> &OutName, unsigned char C) { // Append the escape "_XX_" (XX = hex of C).
+  OutName.push_back('_');
+  OutName.push_back(HexDigit(C >> 4));
+  OutName.push_back(HexDigit(C & 15));
+  OutName.push_back('_');
+}
+
+/// NameNeedsEscaping - Return true if the identifier \arg Str needs quotes
+/// for this assembler.
+static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo &MAI) {
+  assert(!Str.empty() && "Cannot create an empty MCSymbol");
+  
+  // If the first character is a number and the target does not allow this, we
+  // need quotes.
+  if (!MAI.doesAllowNameToStartWithDigit() && Str[0] >= '0' && Str[0] <= '9')
+    return true;
+  
+  // If any of the characters in the string is an unacceptable character, force
+  // quotes.
+  for (unsigned i = 0, e = Str.size(); i != e; ++i)
+    if (!isAcceptableChar(Str[i]))
+      return true;
+  return false; // Plain identifier; safe to emit verbatim.
+}
+
+/// appendMangledName - Add the specified string in mangled form if it uses
+/// any unusual characters.
+static void appendMangledName(SmallVectorImpl<char> &OutName, StringRef Str,
+                              const MCAsmInfo *MAI) { // Escape every unacceptable char as "_XX_".
+  // The first character is not allowed to be a number unless the target
+  // explicitly allows it.
+  if ((MAI == 0 || !MAI->doesAllowNameToStartWithDigit()) &&
+      Str[0] >= '0' && Str[0] <= '9') {
+    MangleLetter(OutName, Str[0]);
+    Str = Str.substr(1); // Leading digit already emitted in escaped form.
+  }
+  
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (!isAcceptableChar(Str[i]))
+      MangleLetter(OutName, Str[i]);
+    else
+      OutName.push_back(Str[i]);
+  }
+}
+
+
+/// appendMangledQuotedName - On systems that support quoted symbols, we still
+/// have to escape some (obscure) characters like " and \n which would break the
+/// assembler's lexing.
+static void appendMangledQuotedName(SmallVectorImpl<char> &OutName,
+                                   StringRef Str) { // Only '"' and '\n' need escaping inside quoted symbols.
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (Str[i] == '"' || Str[i] == '\n')
+      MangleLetter(OutName, Str[i]);
+    else
+      OutName.push_back(Str[i]);
+  }
+}
+
+
+/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
+/// and the specified name as the global variable name.  GVName must not be
+/// empty.
+void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
+                                const Twine &GVName, ManglerPrefixTy PrefixTy) {
+  SmallString<256> TmpData;
+  StringRef Name = GVName.toStringRef(TmpData);
+  assert(!Name.empty() && "getNameWithPrefix requires non-empty name");
+  
+  // If the global name is not led with \1, add the appropriate prefixes.
+  if (Name[0] == '\1') {
+    Name = Name.substr(1); // \1 means "verbatim": strip the marker, add no prefix.
+  } else {
+    if (PrefixTy == Mangler::Private) {
+      const char *Prefix = MAI.getPrivateGlobalPrefix();
+      OutName.append(Prefix, Prefix+strlen(Prefix));
+    } else if (PrefixTy == Mangler::LinkerPrivate) {
+      const char *Prefix = MAI.getLinkerPrivateGlobalPrefix();
+      OutName.append(Prefix, Prefix+strlen(Prefix));
+    }
+
+    const char *Prefix = MAI.getGlobalPrefix(); // Global prefix applies after any private prefix.
+    if (Prefix[0] == 0)
+      ; // Common noop, no prefix.
+    else if (Prefix[1] == 0)
+      OutName.push_back(Prefix[0]);  // Common, one character prefix.
+    else
+      OutName.append(Prefix, Prefix+strlen(Prefix)); // Arbitrary length prefix.
+  }
+  
+  // If this is a simple string that doesn't need escaping, just append it.
+  if (!NameNeedsEscaping(Name, MAI) ||
+      // If quotes are supported, they can be used unless the string contains
+      // a quote or newline.
+      (MAI.doesAllowQuotesInName() &&
+       Name.find_first_of("\n\"") == StringRef::npos)) {
+    OutName.append(Name.begin(), Name.end());
+    return;
+  }
+  
+  // On systems that do not allow quoted names, we need to mangle most
+  // strange characters.
+  if (!MAI.doesAllowQuotesInName())
+    return appendMangledName(OutName, Name, &MAI);
+  
+  // Okay, the system allows quoted strings.  We can quote most anything, the
+  // only characters that need escaping are " and \n.
+  assert(Name.find_first_of("\n\"") != StringRef::npos);
+  return appendMangledQuotedName(OutName, Name);
+}
+
+
+/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
+/// and the specified global variable's name.  If the global variable doesn't
+/// have a name, this fills in a unique name for the global.
+void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
+                                const GlobalValue *GV,
+                                bool isImplicitlyPrivate) {
+  ManglerPrefixTy PrefixTy = Mangler::Default; // Derive prefix kind from the GV's linkage.
+  if (GV->hasPrivateLinkage() || isImplicitlyPrivate)
+    PrefixTy = Mangler::Private;
+  else if (GV->hasLinkerPrivateLinkage())
+    PrefixTy = Mangler::LinkerPrivate;
+  
+  // If this global has a name, handle it simply.
+  if (GV->hasName())
+    return getNameWithPrefix(OutName, GV->getName(), PrefixTy);
+  
+  // Get the ID for the global, assigning a new one if we haven't got one
+  // already.
+  unsigned &ID = AnonGlobalIDs[GV];
+  if (ID == 0) ID = NextAnonGlobalID++; // 0 = not yet assigned (map default).
+  
+  // Must mangle the global into a unique ID.
+  getNameWithPrefix(OutName, "__unnamed_" + Twine(ID), PrefixTy);
+}
+
+/// getNameWithPrefix - Fill OutName with the name of the appropriate prefix
+/// and the specified global variable's name.  If the global variable doesn't
+/// have a name, this fills in a unique name for the global.
+std::string Mangler::getNameWithPrefix(const GlobalValue *GV,
+                                       bool isImplicitlyPrivate) { // std::string convenience wrapper.
+  SmallString<64> Buf;
+  getNameWithPrefix(Buf, GV, isImplicitlyPrivate);
+  return std::string(Buf.begin(), Buf.end());
+}
diff --git a/lib/Target/Mips/AsmPrinter/CMakeLists.txt b/lib/Target/Mips/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..56c68a6
--- /dev/null
+++ b/lib/Target/Mips/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}/..
+  ${CMAKE_CURRENT_SOURCE_DIR}/..
+  )
+
+add_llvm_library(LLVMMipsAsmPrinter
+  MipsAsmPrinter.cpp
+  )
+add_dependencies(LLVMMipsAsmPrinter MipsCodeGenTable_gen)
\ No newline at end of file
diff --git a/lib/Target/Mips/AsmPrinter/Makefile b/lib/Target/Mips/AsmPrinter/Makefile
new file mode 100644
index 0000000..a2fecf4
--- /dev/null
+++ b/lib/Target/Mips/AsmPrinter/Makefile
@@ -0,0 +1,17 @@
+##===- lib/Target/Mips/AsmPrinter/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMipsAsmPrinter
+
+# Hack: we need to include 'main' Mips target directory to grab
+# private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
new file mode 100644
index 0000000..b8641c3
--- /dev/null
+++ b/lib/Target/Mips/AsmPrinter/MipsAsmPrinter.cpp
@@ -0,0 +1,363 @@
+//===-- MipsAsmPrinter.cpp - Mips LLVM assembly writer --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format MIPS assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-asm-printer"
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h" 
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MathExtras.h"
+#include <cctype>
+using namespace llvm;
+
+namespace {
+  class MipsAsmPrinter : public AsmPrinter {
+    const MipsSubtarget *Subtarget;
+  public:
+    explicit MipsAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM, 
+                            MCContext &Ctx, MCStreamer &Streamer,
+                            const MCAsmInfo *T)
+      : AsmPrinter(O, TM, Ctx, Streamer, T) {
+      Subtarget = &TM.getSubtarget<MipsSubtarget>();
+    }
+
+    virtual const char *getPassName() const {
+      return "Mips Assembly Printer";
+    }
+
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, 
+                         unsigned AsmVariant, const char *ExtraCode);
+    void printOperand(const MachineInstr *MI, int opNum);
+    void printUnsignedImm(const MachineInstr *MI, int opNum);
+    void printMemOperand(const MachineInstr *MI, int opNum, 
+                         const char *Modifier = 0);
+    void printFCCOperand(const MachineInstr *MI, int opNum, 
+                         const char *Modifier = 0);
+    void printSavedRegsBitmask();
+    void printHex32(unsigned int Value);
+
+    const char *emitCurrentABIString();
+    void emitFrameDirective();
+
+    void printInstruction(const MachineInstr *MI);  // autogenerated.
+    void EmitInstruction(const MachineInstr *MI) {
+      printInstruction(MI);
+      OutStreamer.AddBlankLine();
+    }
+    virtual void EmitFunctionBodyStart();
+    virtual void EmitFunctionBodyEnd();
+    static const char *getRegisterName(unsigned RegNo);
+
+    virtual void EmitFunctionEntryLabel();
+    void EmitStartOfAsmFile(Module &M);
+  };
+} // end of anonymous namespace
+
+#include "MipsGenAsmWriter.inc"
+
+//===----------------------------------------------------------------------===//
+//
+//  Mips Asm Directives
+//
+//  -- Frame directive "frame Stackpointer, Stacksize, RARegister"
+//  Describe the stack frame.
+//
+//  -- Mask directives "(f)mask  bitmask, offset" 
+//  Tells the assembler which registers are saved and where.
+//  bitmask - contains a little endian bitset indicating which registers are 
+//            saved on function prologue (e.g. with a 0x80000000 mask, the 
+//            assembler knows that register 31 (RA) is saved at prologue).
+//  offset  - the position before stack pointer subtraction indicating where 
+//            the first saved register on prologue is located (see example below).
+//
+//  Consider the following function prologue:
+//
+//    .frame  $fp,48,$ra
+//    .mask   0xc0000000,-8
+//       addiu $sp, $sp, -48
+//       sw $ra, 40($sp)
+//       sw $fp, 36($sp)
+//
+//    With a 0xc0000000 mask, the assembler knows the register 31 (RA) and 
+//    30 (FP) are saved at prologue. As the save order on prologue is from 
+//    left to right, RA is saved first. A -8 offset means that after the 
+//    stack pointer subtraction, the first register in the mask (RA) will be
+//    saved at address 48-8=40.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Mask directives
+//===----------------------------------------------------------------------===//
+
+// Create a bitmask with all callee saved registers for CPU or Floating Point 
+// registers. For CPU registers consider RA, GP and FP for saving if necessary.
+void MipsAsmPrinter::printSavedRegsBitmask() {
+  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+  const MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+             
+  // CPU and FPU Saved Registers Bitmasks
+  unsigned int CPUBitmask = 0;
+  unsigned int FPUBitmask = 0;
+
+  // Set the CPU and FPU Bitmasks
+  const MachineFrameInfo *MFI = MF->getFrameInfo();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned RegNum = MipsRegisterInfo::getRegisterNumbering(CSI[i].getReg());
+    if (CSI[i].getRegClass() == Mips::CPURegsRegisterClass)
+      CPUBitmask |= (1 << RegNum);
+    else
+      FPUBitmask |= (1 << RegNum);
+  }
+
+  // Return Address and Frame registers must also be set in CPUBitmask.
+  if (RI.hasFP(*MF)) 
+    CPUBitmask |= (1 << MipsRegisterInfo::
+                getRegisterNumbering(RI.getFrameRegister(*MF)));
+  
+  if (MFI->hasCalls()) 
+    CPUBitmask |= (1 << MipsRegisterInfo::
+                getRegisterNumbering(RI.getRARegister()));
+
+  // Print CPUBitmask
+  O << "\t.mask \t"; printHex32(CPUBitmask); O << ','
+    << MipsFI->getCPUTopSavedRegOff() << '\n';
+
+  // Print FPUBitmask
+  O << "\t.fmask\t"; printHex32(FPUBitmask); O << ","
+    << MipsFI->getFPUTopSavedRegOff() << '\n';
+}
+
+// Print a 32 bit hex number, zero-padded to all eight nibbles.
+void MipsAsmPrinter::
+printHex32(unsigned int Value) 
+{
+  O << "0x";
+  for (int i = 7; i >= 0; i--) 
+    O << utohexstr( (Value & (0xF << (i*4))) >> (i*4) );
+}
+
+//===----------------------------------------------------------------------===//
+// Frame and Set directives
+//===----------------------------------------------------------------------===//
+
+/// Frame Directive
+void MipsAsmPrinter::emitFrameDirective() {
+  const TargetRegisterInfo &RI = *TM.getRegisterInfo();
+
+  unsigned stackReg  = RI.getFrameRegister(*MF);
+  unsigned returnReg = RI.getRARegister();
+  unsigned stackSize = MF->getFrameInfo()->getStackSize();
+
+
+  O << "\t.frame\t" << '$' << LowercaseString(getRegisterName(stackReg))
+                    << ',' << stackSize << ','
+                    << '$' << LowercaseString(getRegisterName(returnReg))
+                    << '\n';
+}
+
+/// Emit Set directives.
+const char *MipsAsmPrinter::emitCurrentABIString() {  
+  switch(Subtarget->getTargetABI()) {
+    case MipsSubtarget::O32:  return "abi32";  
+    case MipsSubtarget::O64:  return "abiO64";
+    case MipsSubtarget::N32:  return "abiN32";
+    case MipsSubtarget::N64:  return "abi64";
+    case MipsSubtarget::EABI: return "eabi32"; // TODO: handle eabi64
+    default: break;
+  }
+
+  llvm_unreachable("Unknown Mips ABI");
+  return NULL;
+}  
+
+void MipsAsmPrinter::EmitFunctionEntryLabel() {
+  O << "\t.ent\t" << *CurrentFnSym << '\n';
+  OutStreamer.EmitLabel(CurrentFnSym);
+}
+
+/// EmitFunctionBodyStart - Targets can override this to emit stuff before
+/// the first basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyStart() {
+  emitFrameDirective();
+  printSavedRegsBitmask();
+}
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void MipsAsmPrinter::EmitFunctionBodyEnd() {
+  // There are instructions for these macros, but they must
+  // always be at the function end, and we can't emit and
+  // break with BB logic. 
+  O << "\t.set\tmacro\n"; 
+  O << "\t.set\treorder\n"; 
+  
+  O << "\t.end\t" << *CurrentFnSym << '\n';
+}
+
+
+// Print out an operand for an inline asm expression.
+bool MipsAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, 
+                                     unsigned AsmVariant,const char *ExtraCode){
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) 
+    return true; // Unknown modifier.
+
+  printOperand(MI, OpNo);
+  return false;
+}
+
+void MipsAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  bool closeP = false;
+
+  if (MO.getTargetFlags())
+    closeP = true;
+
+  switch(MO.getTargetFlags()) {
+  case MipsII::MO_GPREL:    O << "%gp_rel("; break;
+  case MipsII::MO_GOT_CALL: O << "%call16("; break;
+  case MipsII::MO_GOT:
+    if (MI->getOpcode() == Mips::LW)
+      O << "%got(";
+    else
+      O << "%lo(";
+    break;
+  case MipsII::MO_ABS_HILO:
+    if (MI->getOpcode() == Mips::LUi)
+      O << "%hi(";
+    else
+      O << "%lo(";     
+    break;
+  }
+
+  switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      O << '$' << LowercaseString(getRegisterName(MO.getReg()));
+      break;
+
+    case MachineOperand::MO_Immediate:
+      O << (short int)MO.getImm();
+      break;
+
+    case MachineOperand::MO_MachineBasicBlock:
+      O << *MO.getMBB()->getSymbol(OutContext);
+      return;
+
+    case MachineOperand::MO_GlobalAddress:
+      O << *GetGlobalValueSymbol(MO.getGlobal());
+      break;
+
+    case MachineOperand::MO_ExternalSymbol:
+      O << *GetExternalSymbolSymbol(MO.getSymbolName());
+      break;
+
+    case MachineOperand::MO_JumpTableIndex:
+      O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+        << '_' << MO.getIndex();
+      break;
+
+    case MachineOperand::MO_ConstantPoolIndex:
+      O << MAI->getPrivateGlobalPrefix() << "CPI"
+        << getFunctionNumber() << "_" << MO.getIndex();
+      if (MO.getOffset())
+        O << "+" << MO.getOffset();
+      break;
+  
+    default:
+      llvm_unreachable("<unknown operand type>");
+  }
+
+  if (closeP) O << ")";
+}
+
+void MipsAsmPrinter::printUnsignedImm(const MachineInstr *MI, int opNum) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  if (MO.getType() == MachineOperand::MO_Immediate)
+    O << (unsigned short int)MO.getImm();
+  else 
+    printOperand(MI, opNum);
+}
+
+void MipsAsmPrinter::
+printMemOperand(const MachineInstr *MI, int opNum, const char *Modifier) {
+  // When a stack location is used by an instruction that is not a
+  // load/store, print it the same way as a normal 3-operand instruction.
+  if (Modifier && !strcmp(Modifier, "stackloc")) {
+    printOperand(MI, opNum+1);
+    O << ", ";
+    printOperand(MI, opNum);
+    return;
+  }
+
+  // Load/Store memory operands -- imm($reg) 
+  // If PIC target the target is loaded as the 
+  // pattern lw $25,%call16($28)
+  printOperand(MI, opNum);
+  O << "(";
+  printOperand(MI, opNum+1);
+  O << ")";
+}
+
+void MipsAsmPrinter::
+printFCCOperand(const MachineInstr *MI, int opNum, const char *Modifier) {
+  const MachineOperand& MO = MI->getOperand(opNum);
+  O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); 
+}
+
+void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+  // FIXME: Use SwitchSection.
+  
+  // Tell the assembler which ABI we are using
+  O << "\t.section .mdebug." << emitCurrentABIString() << '\n';
+
+  // TODO: handle O64 ABI
+  if (Subtarget->isABI_EABI())
+    O << "\t.section .gcc_compiled_long" << 
+      (Subtarget->isGP32bit() ? "32" : "64") << '\n';
+
+  // return to previous section
+  O << "\t.previous" << '\n'; 
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeMipsAsmPrinter() { 
+  RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
+  RegisterAsmPrinter<MipsAsmPrinter> Y(TheMipselTarget);
+}
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
new file mode 100644
index 0000000..0e3bf5a
--- /dev/null
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(LLVM_TARGET_DEFINITIONS Mips.td)
+
+tablegen(MipsGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(MipsGenRegisterNames.inc -gen-register-enums)
+tablegen(MipsGenRegisterInfo.inc -gen-register-desc)
+tablegen(MipsGenInstrNames.inc -gen-instr-enums)
+tablegen(MipsGenInstrInfo.inc -gen-instr-desc)
+tablegen(MipsGenAsmWriter.inc -gen-asm-writer)
+tablegen(MipsGenDAGISel.inc -gen-dag-isel)
+tablegen(MipsGenCallingConv.inc -gen-callingconv)
+tablegen(MipsGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(MipsCodeGen
+  MipsDelaySlotFiller.cpp
+  MipsInstrInfo.cpp
+  MipsISelDAGToDAG.cpp
+  MipsISelLowering.cpp
+  MipsMCAsmInfo.cpp
+  MipsRegisterInfo.cpp
+  MipsSubtarget.cpp
+  MipsTargetMachine.cpp
+  MipsTargetObjectFile.cpp
+  )
+
+target_link_libraries (LLVMMipsCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
new file mode 100644
index 0000000..2ed8d77
--- /dev/null
+++ b/lib/Target/Mips/Makefile
@@ -0,0 +1,24 @@
+##===- lib/Target/Mips/Makefile ----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMMipsCodeGen
+TARGET = Mips
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = MipsGenRegisterInfo.h.inc MipsGenRegisterNames.inc \
+                MipsGenRegisterInfo.inc MipsGenInstrNames.inc \
+                MipsGenInstrInfo.inc MipsGenAsmWriter.inc \
+                MipsGenDAGISel.inc MipsGenCallingConv.inc \
+                MipsGenSubtarget.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h
new file mode 100644
index 0000000..a9ab050
--- /dev/null
+++ b/lib/Target/Mips/Mips.h
@@ -0,0 +1,41 @@
+//===-- Mips.h - Top-level interface for Mips representation ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in 
+// the LLVM Mips back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_MIPS_H
+#define TARGET_MIPS_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class MipsTargetMachine;
+  class FunctionPass;
+  class MachineCodeEmitter;
+  class formatted_raw_ostream;
+
+  FunctionPass *createMipsISelDag(MipsTargetMachine &TM);
+  FunctionPass *createMipsDelaySlotFillerPass(MipsTargetMachine &TM);
+
+  extern Target TheMipsTarget;
+  extern Target TheMipselTarget;
+
+} // end namespace llvm;
+
+// Defines symbolic names for Mips registers.  This defines a mapping from
+// register name to register number.
+#include "MipsGenRegisterNames.inc"
+
+// Defines symbolic names for the Mips instructions.
+#include "MipsGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
new file mode 100644
index 0000000..79a78d8
--- /dev/null
+++ b/lib/Target/Mips/Mips.td
@@ -0,0 +1,88 @@
+//===- Mips.td - Describe the Mips Target Machine ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the Mips target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+include "MipsSchedule.td"
+include "MipsInstrInfo.td"
+include "MipsCallingConv.td"
+
+def MipsInstrInfo : InstrInfo {
+  let TSFlagsFields = [];
+  let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// Mips Subtarget features                                                    //
+//===----------------------------------------------------------------------===//
+
+def FeatureGP64Bit     : SubtargetFeature<"gp64", "IsGP64bit", "true",
+                                "General Purpose Registers are 64-bit wide.">;
+def FeatureFP64Bit     : SubtargetFeature<"fp64", "IsFP64bit", "true",
+                                "Support 64-bit FP registers.">;
+def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
+                                "true", "Only supports single precision float">;
+def FeatureMips1       : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
+                                "Mips1 ISA Support">;
+def FeatureMips2       : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
+                                "Mips2 ISA Support">;
+def FeatureO32         : SubtargetFeature<"o32", "MipsABI", "O32",
+                                "Enable o32 ABI">;
+def FeatureEABI        : SubtargetFeature<"eabi", "MipsABI", "EABI",
+                                "Enable eabi ABI">;
+def FeatureVFPU        : SubtargetFeature<"vfpu", "HasVFPU", 
+                                "true", "Enable vector FPU instructions.">;
+def FeatureSEInReg     : SubtargetFeature<"seinreg", "HasSEInReg", "true", 
+                                "Enable 'signext in register' instructions.">;
+def FeatureCondMov     : SubtargetFeature<"condmov", "HasCondMov", "true", 
+                                "Enable 'conditional move' instructions.">;
+def FeatureMulDivAdd   : SubtargetFeature<"muldivadd", "HasMulDivAdd", "true",
+                                "Enable 'multiply add/sub' instructions.">;
+def FeatureMinMax      : SubtargetFeature<"minmax", "HasMinMax", "true",
+                                "Enable 'min/max' instructions.">;
+def FeatureSwap        : SubtargetFeature<"swap", "HasSwap", "true",
+                                "Enable 'byte/half swap' instructions.">;
+def FeatureBitCount    : SubtargetFeature<"bitcount", "HasBitCount", "true",
+                                "Enable 'count leading bits' instructions.">;
+
+//===----------------------------------------------------------------------===//
+// Mips processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, MipsGenericItineraries, Features>;
+
+def : Proc<"mips1", [FeatureMips1]>;
+def : Proc<"r2000", [FeatureMips1]>;
+def : Proc<"r3000", [FeatureMips1]>;
+
+def : Proc<"mips2", [FeatureMips2]>;
+def : Proc<"r6000", [FeatureMips2]>;
+
+// Allegrex is a 32bit subset of r4000, both for integer and fp registers, 
+// but much more similar to Mips2 than Mips3. It also contains some of 
+// Mips32/Mips32r2 instructions and a custom vector fpu processor. 
+def : Proc<"allegrex", [FeatureMips2, FeatureSingleFloat, FeatureEABI, 
+      FeatureVFPU, FeatureSEInReg, FeatureCondMov, FeatureMulDivAdd,
+      FeatureMinMax, FeatureSwap, FeatureBitCount]>;
+
+def Mips : Target {
+  let InstructionSet = MipsInstrInfo;
+}
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
new file mode 100644
index 0000000..c2bfb8f
--- /dev/null
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -0,0 +1,86 @@
+//===- MipsCallingConv.td - Calling Conventions for Mips --------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for Mips architecture.
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>: 
+  CCIf<!strconcat("State.getTarget().getSubtarget<MipsSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Mips O32 Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Only the return rules are defined here for O32. The rules for argument 
+// passing are defined in MipsISelLowering.cpp.
+def RetCC_MipsO32 : CallingConv<[
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+  // f32 are returned in registers F0, F2
+  CCIfType<[f32], CCAssignToReg<[F0, F2]>>,
+
+  // f64 are returned in register D0, D1
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0, D1]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips EABI Calling Convention
+//===----------------------------------------------------------------------===//
+
+def CC_MipsEABI : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // Integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[A0, A1, A2, A3, T0, T1, T2, T3]>>,
+
+  // Single fp arguments are passed in pairs within 32-bit mode 
+  CCIfType<[f32], CCIfSubtarget<"isSingleFloat()", 
+                  CCAssignToReg<[F12, F13, F14, F15, F16, F17, F18, F19]>>>,
+
+  CCIfType<[f32], CCIfSubtarget<"isNotSingleFloat()", 
+                  CCAssignToReg<[F12, F14, F16, F18]>>>,
+
+  // The first 4 double fp arguments are passed in single fp registers.
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", 
+                  CCAssignToReg<[D6, D7, D8, D9]>>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+
+  // Integer values get stored in stack slots that are 8 bytes in
+  // size and 8-byte aligned.
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToStack<8, 8>>>
+]>;
+
+def RetCC_MipsEABI : CallingConv<[
+  // i32 are returned in registers V0, V1
+  CCIfType<[i32], CCAssignToReg<[V0, V1]>>,
+
+  // f32 are returned in registers F0, F1
+  CCIfType<[f32], CCAssignToReg<[F0, F1]>>,
+
+  // f64 are returned in register D0
+  CCIfType<[f64], CCIfSubtarget<"isNotSingleFloat()", CCAssignToReg<[D0]>>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// Mips Calling Convention Dispatch
+//===----------------------------------------------------------------------===//
+
+def CC_Mips : CallingConv<[
+  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<CC_MipsEABI>>
+]>;
+
+def RetCC_Mips : CallingConv<[
+  CCIfSubtarget<"isABI_EABI()", CCDelegateTo<RetCC_MipsEABI>>,
+  CCDelegateTo<RetCC_MipsO32>
+]>;
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
new file mode 100644
index 0000000..a2b615d
--- /dev/null
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -0,0 +1,77 @@
+//===-- DelaySlotFiller.cpp - Mips delay slot filler ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Simple pass to fill delay slots with NOPs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delay-slot-filler"
+
+#include "Mips.h"
+#include "MipsTargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+namespace {
+  struct Filler : public MachineFunctionPass {
+
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    Filler(TargetMachine &tm) 
+      : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "Mips Delay Slot Filler";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F) {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// Currently, we fill delay slots with NOPs. We assume there is only one
+/// delay slot per delayed instruction.
+bool Filler::
+runOnMachineBasicBlock(MachineBasicBlock &MBB) 
+{
+  bool Changed = false;
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I)
+    if (I->getDesc().hasDelaySlot()) {
+      MachineBasicBlock::iterator J = I;
+      ++J;
+      BuildMI(MBB, J, I->getDebugLoc(), TII->get(Mips::NOP));
+      ++FilledSlots;
+      Changed = true;
+    }
+  return Changed;
+}
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+  return new Filler(tm);
+}
+
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
new file mode 100644
index 0000000..f1d4a67
--- /dev/null
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -0,0 +1,565 @@
+//===-- MipsISelDAGToDAG.cpp - A dag to dag inst selector for Mips --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the MIPS target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-isel"
+#include "Mips.h"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSubtarget.h"
+#include "MipsTargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsDAGToDAGISel - MIPS specific code to select MIPS machine
+// instructions for SelectionDAG operations.
+//===----------------------------------------------------------------------===//
+namespace {
+
+class MipsDAGToDAGISel : public SelectionDAGISel {
+
+  /// TM - Keep a reference to MipsTargetMachine.
+  MipsTargetMachine &TM;
+
+  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const MipsSubtarget &Subtarget;
+ 
+public:
+  explicit MipsDAGToDAGISel(MipsTargetMachine &tm) :
+  SelectionDAGISel(tm),
+  TM(tm), Subtarget(tm.getSubtarget<MipsSubtarget>()) {}
+  
+  virtual void InstructionSelect();
+
+  // Pass Name
+  virtual const char *getPassName() const {
+    return "MIPS DAG->DAG Pattern Instruction Selection";
+  } 
+  
+
+private:  
+  // Include the pieces autogenerated from the target description.
+  #include "MipsGenDAGISel.inc"
+
+  /// getTargetMachine - Return a reference to the TargetMachine, casted
+  /// to the target-specific type.
+  const MipsTargetMachine &getTargetMachine() {
+    return static_cast<const MipsTargetMachine &>(TM);
+  }
+
+  /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+  /// to the target-specific type.
+  const MipsInstrInfo *getInstrInfo() {
+    return getTargetMachine().getInstrInfo();
+  }
+
+  SDNode *getGlobalBaseReg();
+  SDNode *Select(SDNode *N);
+
+  // Complex Pattern.
+  bool SelectAddr(SDNode *Op, SDValue N, 
+                  SDValue &Base, SDValue &Offset);
+
+  SDNode *SelectLoadFp64(SDNode *N);
+  SDNode *SelectStoreFp64(SDNode *N);
+
+  // getI32Imm - Return a target constant with the specified
+  // value, of type i32.
+  inline SDValue getI32Imm(unsigned Imm) {
+    return CurDAG->getTargetConstant(Imm, MVT::i32);
+  }
+
+
+  #ifndef NDEBUG
+  unsigned Indent;
+  #endif
+};
+
+}
+
/// InstructionSelect - This callback is invoked by
/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
void MipsDAGToDAGISel::InstructionSelect() {
  // Codegen the basic block.
  DEBUG(errs() << "===== Instruction selection begins:\n");
  DEBUG(Indent = 0);

  // Select target instructions for the DAG.
  SelectRoot(*CurDAG);

  DEBUG(errs() << "===== Instruction selection ends:\n");

  // Drop any nodes made dead by selection so later phases see a clean DAG.
  CurDAG->RemoveDeadNodes();
}
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// GOT address into a register.
+SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
/// ComplexPattern used on MipsInstrInfo
/// Used on Mips Load/Store instructions
/// SelectAddr - Match Addr as an (Offset, Base) pair for a load/store.
/// Always succeeds except for non-PIC target symbols; the final fallback
/// is a plain (0, Addr) register-indirect form.
bool MipsDAGToDAGISel::
SelectAddr(SDNode *Op, SDValue Addr, SDValue &Offset, SDValue &Base)
{
  // if Address is FI, get the TargetFrameIndex.
  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
    Base   = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
    Offset = CurDAG->getTargetConstant(0, MVT::i32);
    return true;
  }

  // on PIC code Load GA
  // PIC: the symbol itself becomes the offset and GP the base, so the
  // access goes through the GOT.
  if (TM.getRelocationModel() == Reloc::PIC_) {
    if ((Addr.getOpcode() == ISD::TargetGlobalAddress) || 
        (Addr.getOpcode() == ISD::TargetConstantPool) || 
        (Addr.getOpcode() == ISD::TargetJumpTable)){
      Base   = CurDAG->getRegister(Mips::GP, MVT::i32);
      Offset = Addr;
      return true;
    }
  } else {
    // Non-PIC: refuse these so the Hi/Lo expansion patterns handle them.
    if ((Addr.getOpcode() == ISD::TargetExternalSymbol ||
        Addr.getOpcode() == ISD::TargetGlobalAddress))
      return false;
  }

  // Operand is a result from an ADD.
  if (Addr.getOpcode() == ISD::ADD) {
    // reg + simm16 immediate: fold the immediate into the offset field.
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
      if (Predicate_immSExt16(CN)) {

        // If the first operand is a FI, get the TargetFI Node
        if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>
                                    (Addr.getOperand(0))) {
          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
        } else {
          Base = Addr.getOperand(0);
        }

        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
        return true;
      }
    }

    // When loading from constant pools, load the lower address part in
    // the instruction itself. Example, instead of:
    //  lui $2, %hi($CPI1_0)
    //  addiu $2, $2, %lo($CPI1_0)
    //  lwc1 $f0, 0($2)
    // Generate:
    //  lui $2, %hi($CPI1_0)
    //  lwc1 $f0, %lo($CPI1_0)($2)
    if ((Addr.getOperand(0).getOpcode() == MipsISD::Hi || 
         Addr.getOperand(0).getOpcode() == ISD::LOAD) &&
        Addr.getOperand(1).getOpcode() == MipsISD::Lo) {
      SDValue LoVal = Addr.getOperand(1); 
      if (dyn_cast<ConstantPoolSDNode>(LoVal.getOperand(0))) {
        Base = Addr.getOperand(0);
        Offset = LoVal.getOperand(0);
        return true;
      }
    }
  }

  // Fallback: whole expression in a register, zero displacement.
  Base   = Addr;
  Offset = CurDAG->getTargetConstant(0, MVT::i32);
  return true;
}
+
/// SelectLoadFp64 - On MIPS-I, an f64 load is split into two LWC1 loads
/// into the even/odd halves of the double register pair, stitched back
/// together with INSERT_SUBREG.  Returns NULL when the node is not an
/// eligible f64 load so normal selection proceeds.
SDNode *MipsDAGToDAGISel::SelectLoadFp64(SDNode *N) {
  MVT::SimpleValueType NVT = 
    N->getValueType(0).getSimpleVT().SimpleTy;

  // Only MIPS-I f64 loads need this manual splitting.
  if (!Subtarget.isMips1() || NVT != MVT::f64)
    return NULL;

  if (!Predicate_unindexedload(N) ||
      !Predicate_load(N))
    return NULL;

  SDValue Chain = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDValue Offset0, Offset1, Base;

  if (!SelectAddr(N, N1, Offset0, Base) ||
      N1.getValueType() != MVT::i32)
    return NULL;

  // Both halves reuse the original load's memory operand.
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  DebugLoc dl = N->getDebugLoc();

  // The second load should start after for 4 bytes. 
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0))
    Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32);
  else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Offset0))
    Offset1 = CurDAG->getTargetConstantPool(CP->getConstVal(), 
                                            MVT::i32, 
                                            CP->getAlignment(), 
                                            CP->getOffset()+4, 
                                            CP->getTargetFlags());
  else
    return NULL;

  // Choose the offsets depending on the endianess
  if (TM.getTargetData()->isBigEndian())
    std::swap(Offset0, Offset1);

  // Instead of:
  //    ldc $f0, X($3)
  // Generate:
  //    lwc $f0, X($3)
  //    lwc $f1, X+4($3)
  SDNode *LD0 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32, 
                                    MVT::Other, Offset0, Base, Chain);
  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                 dl, NVT), 0);
  SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::SUBREG_FPEVEN, dl, 
                            MVT::f64, Undef, SDValue(LD0, 0));

  // Chain LD1 after LD0's output chain to keep the two loads ordered.
  SDNode *LD1 = CurDAG->getMachineNode(Mips::LWC1, dl, MVT::f32,
                          MVT::Other, Offset1, Base, SDValue(LD0, 1));
  SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::SUBREG_FPODD, dl, 
                            MVT::f64, I0, SDValue(LD1, 0));

  // NOTE(review): the old chain result is rewired to the *input* Chain
  // rather than LD1's output chain, so chain users do not order after the
  // new loads -- verify this is intentional.
  ReplaceUses(SDValue(N, 0), I1);
  ReplaceUses(SDValue(N, 1), Chain);
  cast<MachineSDNode>(LD0)->setMemRefs(MemRefs0, MemRefs0 + 1);
  cast<MachineSDNode>(LD1)->setMemRefs(MemRefs0, MemRefs0 + 1);
  return I1.getNode();
}
+
/// SelectStoreFp64 - On MIPS-I, an f64 store is split into two SWC1
/// stores of the even/odd subregisters.  Returns NULL when the node is
/// not an eligible f64 store so normal selection proceeds.
SDNode *MipsDAGToDAGISel::SelectStoreFp64(SDNode *N) {

  if (!Subtarget.isMips1() || 
      N->getOperand(1).getValueType() != MVT::f64)
    return NULL;

  SDValue Chain = N->getOperand(0);

  if (!Predicate_unindexedstore(N) ||
      !Predicate_store(N))
    return NULL;

  SDValue N1 = N->getOperand(1);   // Value being stored.
  SDValue N2 = N->getOperand(2);   // Address.
  SDValue Offset0, Offset1, Base;

  if (!SelectAddr(N, N2, Offset0, Base) ||
      N1.getValueType() != MVT::f64 ||
      N2.getValueType() != MVT::i32)
    return NULL;

  // Both halves reuse the original store's memory operand.
  MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
  MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
  DebugLoc dl = N->getDebugLoc();

  // Get the even and odd part from the f64 register
  SDValue FPOdd = CurDAG->getTargetExtractSubreg(Mips::SUBREG_FPODD, 
                                                 dl, MVT::f32, N1);
  SDValue FPEven = CurDAG->getTargetExtractSubreg(Mips::SUBREG_FPEVEN,
                                                 dl, MVT::f32, N1);

  // The second store should start after for 4 bytes. 
  // Only constant offsets are handled (no constant-pool form for stores).
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Offset0))
    Offset1 = CurDAG->getTargetConstant(C->getSExtValue()+4, MVT::i32);
  else
    return NULL;

  // Choose the offsets depending on the endianess
  if (TM.getTargetData()->isBigEndian())
    std::swap(Offset0, Offset1);

  // Instead of:
  //    sdc $f0, X($3)
  // Generate:
  //    swc $f0, X($3)
  //    swc $f1, X+4($3)
  SDValue Ops0[] = { FPEven, Offset0, Base, Chain };
  Chain = SDValue(CurDAG->getMachineNode(Mips::SWC1, dl,
                                       MVT::Other, Ops0, 4), 0);
  cast<MachineSDNode>(Chain.getNode())->setMemRefs(MemRefs0, MemRefs0 + 1);

  // Second store is chained after the first to keep them ordered.
  SDValue Ops1[] = { FPOdd, Offset1, Base, Chain };
  Chain = SDValue(CurDAG->getMachineNode(Mips::SWC1, dl,
                                       MVT::Other, Ops1, 4), 0);
  cast<MachineSDNode>(Chain.getNode())->setMemRefs(MemRefs0, MemRefs0 + 1);

  ReplaceUses(SDValue(N, 0), Chain);
  return Chain.getNode();
}
+
/// Select instructions not customized! Used for
/// expanded, promoted and normal instructions
SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
  unsigned Opcode = Node->getOpcode();
  DebugLoc dl = Node->getDebugLoc();

  // Dump information about the Node being selected
  DEBUG(errs().indent(Indent) << "Selecting: ";
        Node->dump(CurDAG);
        errs() << "\n");
  DEBUG(Indent += 2);

  // If we have a custom node, we already have selected!
  if (Node->isMachineOpcode()) {
    DEBUG(errs().indent(Indent-2) << "== ";
          Node->dump(CurDAG);
          errs() << "\n");
    DEBUG(Indent -= 2);
    return NULL;
  }

  ///
  // Instruction Selection not handled by the auto-generated 
  // tablegen selection should be handled here.
  /// 
  switch(Opcode) {

    default: break;

    // ADDE/SUBE: materialize the carry with SLTu, add it to the RHS, then
    // emit the final ADDu/SUBu.
    case ISD::SUBE: 
    case ISD::ADDE: {
      SDValue InFlag = Node->getOperand(2), CmpLHS;
      // Self-assignment silences the unused-variable warning in release
      // builds, where the assert below compiles away.
      unsigned Opc = InFlag.getOpcode(); Opc=Opc;
      assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || 
              (Opc == ISD::SUBC || Opc == ISD::SUBE)) &&  
             "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn");

      unsigned MOp;
      if (Opcode == ISD::ADDE) {
        CmpLHS = InFlag.getValue(0);
        MOp = Mips::ADDu;
      } else { 
        CmpLHS = InFlag.getOperand(0);
        MOp = Mips::SUBu;
      }

      SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) };

      SDValue LHS = Node->getOperand(0);
      SDValue RHS = Node->getOperand(1);

      EVT VT = LHS.getValueType();
      SDNode *Carry = CurDAG->getMachineNode(Mips::SLTu, dl, VT, Ops, 2);
      SDNode *AddCarry = CurDAG->getMachineNode(Mips::ADDu, dl, VT, 
                                                SDValue(Carry,0), RHS);

      return CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Flag,
                                  LHS, SDValue(AddCarry,0));
    }

    /// Mul/Div with two results
    // Both results come back through the HI/LO registers: emit the
    // MULT/DIV then read each half with MFLO/MFHI as needed.
    case ISD::SDIVREM:
    case ISD::UDIVREM:
    case ISD::SMUL_LOHI:
    case ISD::UMUL_LOHI: {
      SDValue Op1 = Node->getOperand(0);
      SDValue Op2 = Node->getOperand(1);

      unsigned Op;
      if (Opcode == ISD::UMUL_LOHI || Opcode == ISD::SMUL_LOHI)
        Op = (Opcode == ISD::UMUL_LOHI ? Mips::MULTu : Mips::MULT);
      else
        Op = (Opcode == ISD::UDIVREM ? Mips::DIVu : Mips::DIV);

      SDNode *MulDiv = CurDAG->getMachineNode(Op, dl, MVT::Flag, Op1, Op2);

      SDValue InFlag = SDValue(MulDiv, 0);
      SDNode *Lo = CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, 
                                          MVT::Flag, InFlag);
      InFlag = SDValue(Lo,1);
      SDNode *Hi = CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);

      // Only rewire the results that are actually used.
      if (!SDValue(Node, 0).use_empty()) 
        ReplaceUses(SDValue(Node, 0), SDValue(Lo,0));

      if (!SDValue(Node, 1).use_empty()) 
        ReplaceUses(SDValue(Node, 1), SDValue(Hi,0));

      return NULL;
    }

    /// Special Muls
    // Single-result multiplies: MULT then read MFLO (low product) or
    // MFHI (high product).
    case ISD::MUL: 
    case ISD::MULHS:
    case ISD::MULHU: {
      SDValue MulOp1 = Node->getOperand(0);
      SDValue MulOp2 = Node->getOperand(1);

      unsigned MulOp  = (Opcode == ISD::MULHU ? Mips::MULTu : Mips::MULT);
      SDNode *MulNode = CurDAG->getMachineNode(MulOp, dl, 
                                               MVT::Flag, MulOp1, MulOp2);

      SDValue InFlag = SDValue(MulNode, 0);

      if (Opcode == ISD::MUL)
        return CurDAG->getMachineNode(Mips::MFLO, dl, MVT::i32, InFlag);
      else
        return CurDAG->getMachineNode(Mips::MFHI, dl, MVT::i32, InFlag);
    }

    /// Div/Rem operations
    // Quotient lives in LO, remainder in HI.
    case ISD::SREM:
    case ISD::UREM:
    case ISD::SDIV: 
    case ISD::UDIV: {
      SDValue Op1 = Node->getOperand(0);
      SDValue Op2 = Node->getOperand(1);

      unsigned Op, MOp;
      if (Opcode == ISD::SDIV || Opcode == ISD::UDIV) {
        Op  = (Opcode == ISD::SDIV ? Mips::DIV : Mips::DIVu);
        MOp = Mips::MFLO;
      } else {
        Op  = (Opcode == ISD::SREM ? Mips::DIV : Mips::DIVu);
        MOp = Mips::MFHI;
      }
      SDNode *Node = CurDAG->getMachineNode(Op, dl, MVT::Flag, Op1, Op2);

      SDValue InFlag = SDValue(Node, 0);
      return CurDAG->getMachineNode(MOp, dl, MVT::i32, InFlag);
    }

    // Get target GOT address.
    case ISD::GLOBAL_OFFSET_TABLE:
      return getGlobalBaseReg();

    // f64 +0.0: move $zero into both halves of the register pair via MTC1
    // and INSERT_SUBREG instead of a constant-pool load.
    case ISD::ConstantFP: {
      ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(Node);
      if (Node->getValueType(0) == MVT::f64 && CN->isExactlyValue(+0.0)) { 
        SDValue Zero = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, 
                                        Mips::ZERO, MVT::i32);
        SDValue Undef = SDValue(
          CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, dl, MVT::f64), 0);
        SDNode *MTC = CurDAG->getMachineNode(Mips::MTC1, dl, MVT::f32, Zero);
        SDValue I0 = CurDAG->getTargetInsertSubreg(Mips::SUBREG_FPEVEN, dl, 
                            MVT::f64, Undef, SDValue(MTC, 0));
        SDValue I1 = CurDAG->getTargetInsertSubreg(Mips::SUBREG_FPODD, dl, 
                            MVT::f64, I0, SDValue(MTC, 0));
        ReplaceUses(SDValue(Node, 0), I1);
        return I1.getNode();
      }
      break;
    }

    case ISD::LOAD:
      if (SDNode *ResNode = SelectLoadFp64(Node))
        return ResNode;
      // Other cases are autogenerated.
      break;

    case ISD::STORE:
      if (SDNode *ResNode = SelectStoreFp64(Node))
        return ResNode;
      // Other cases are autogenerated.
      break;

    /// Handle direct and indirect calls when using PIC. On PIC, when 
    /// GOT is smaller than about 64k (small code) the GA target is 
    /// loaded with only one instruction. Otherwise GA's target must 
    /// be loaded with 3 instructions. 
    case MipsISD::JmpLink: {
      if (TM.getRelocationModel() == Reloc::PIC_) {
        unsigned LastOpNum = Node->getNumOperands()-1;

        SDValue Chain  = Node->getOperand(0);
        SDValue Callee = Node->getOperand(1);
        SDValue InFlag;

        // Skip the incomming flag if present
        if (Node->getOperand(LastOpNum).getValueType() == MVT::Flag)
          LastOpNum--;

        if ( (isa<GlobalAddressSDNode>(Callee)) ||
             (isa<ExternalSymbolSDNode>(Callee)) )
        {
          /// Direct call for global addresses and external symbols
          SDValue GPReg = CurDAG->getRegister(Mips::GP, MVT::i32);

          // Use load to get GOT target
          SDValue Ops[] = { Callee, GPReg, Chain };
          SDValue Load = SDValue(CurDAG->getMachineNode(Mips::LW, dl, MVT::i32, 
                                     MVT::Other, Ops, 3), 0);
          Chain = Load.getValue(1);

          // Call target must be on T9
          Chain = CurDAG->getCopyToReg(Chain, dl, Mips::T9, Load, InFlag);
        } else 
          /// Indirect call
          Chain = CurDAG->getCopyToReg(Chain, dl, Mips::T9, Callee, InFlag);

        // Map the JmpLink operands to JALR
        SDVTList NodeTys = CurDAG->getVTList(MVT::Other, MVT::Flag);
        SmallVector<SDValue, 8> Ops;
        Ops.push_back(CurDAG->getRegister(Mips::T9, MVT::i32));

        // Forward the remaining call operands (register uses, etc.),
        // then the chain and the flag from the CopyToReg above.
        for (unsigned i = 2, e = LastOpNum+1; i != e; ++i)
          Ops.push_back(Node->getOperand(i));
        Ops.push_back(Chain);
        Ops.push_back(Chain.getValue(1));

        // Emit Jump and Link Register
        SDNode *ResNode = CurDAG->getMachineNode(Mips::JALR, dl, NodeTys, 
                                  &Ops[0], Ops.size());

        // Replace Chain and InFlag
        ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
        ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 1));
        return ResNode;
      } 
    }
  }

  // Select the default instruction
  SDNode *ResNode = SelectCode(Node);

  DEBUG(errs().indent(Indent-2) << "=> ");
  if (ResNode == NULL || ResNode == Node)
    DEBUG(Node->dump(CurDAG));
  else
    DEBUG(ResNode->dump(CurDAG));
  DEBUG(errs() << "\n");
  DEBUG(Indent -= 2);

  return ResNode;
}
+
+/// createMipsISelDag - This pass converts a legalized DAG into a 
+/// MIPS-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createMipsISelDag(MipsTargetMachine &TM) {
+  return new MipsDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
new file mode 100644
index 0000000..d94944f
--- /dev/null
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -0,0 +1,1330 @@
+//===-- MipsISelLowering.cpp - Mips DAG Lowering Implementation -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-lower"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsTargetMachine.h"
+#include "MipsTargetObjectFile.h"
+#include "MipsSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+    case MipsISD::JmpLink    : return "MipsISD::JmpLink";
+    case MipsISD::Hi         : return "MipsISD::Hi";
+    case MipsISD::Lo         : return "MipsISD::Lo";
+    case MipsISD::GPRel      : return "MipsISD::GPRel";
+    case MipsISD::Ret        : return "MipsISD::Ret";
+    case MipsISD::CMov       : return "MipsISD::CMov";
+    case MipsISD::SelectCC   : return "MipsISD::SelectCC";
+    case MipsISD::FPSelectCC : return "MipsISD::FPSelectCC";
+    case MipsISD::FPBrcond   : return "MipsISD::FPBrcond";
+    case MipsISD::FPCmp      : return "MipsISD::FPCmp";
+    case MipsISD::FPRound    : return "MipsISD::FPRound";
+    default                  : return NULL;
+  }
+}
+
/// MipsTargetLowering constructor - Configure register classes, legal
/// operations and custom/expand lowering actions for the MIPS target.
MipsTargetLowering::
MipsTargetLowering(MipsTargetMachine &TM)
  : TargetLowering(TM, new MipsTargetObjectFile()) {
  Subtarget = &TM.getSubtarget<MipsSubtarget>();

  // Mips does not have i1 type, so use i32 for
  // setcc operations results (slt, sgt, ...). 
  setBooleanContents(ZeroOrOneBooleanContent);

  // Set up the register classes
  addRegisterClass(MVT::i32, Mips::CPURegsRegisterClass);
  addRegisterClass(MVT::f32, Mips::FGR32RegisterClass);

  // When dealing with single precision only, use libcalls
  if (!Subtarget->isSingleFloat())
    if (!Subtarget->isFP64bit())
      addRegisterClass(MVT::f64, Mips::AFGR64RegisterClass);

  // Load extented operations for i1 types must be promoted 
  setLoadExtAction(ISD::EXTLOAD,  MVT::i1,  Promote);
  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1,  Promote);
  setLoadExtAction(ISD::SEXTLOAD, MVT::i1,  Promote);

  // MIPS doesn't have extending float->double load/store
  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // Used by legalize types to correctly generate the setcc result. 
  // Without this, every float setcc comes with a AND/OR with the result, 
  // we don't want this, since the fpcmp result goes to a flag register, 
  // which is used implicitly by brcond and select operations.
  AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);

  // Mips Custom Operations
  setOperationAction(ISD::GlobalAddress,      MVT::i32,   Custom);
  setOperationAction(ISD::GlobalTLSAddress,   MVT::i32,   Custom);
  setOperationAction(ISD::JumpTable,          MVT::i32,   Custom);
  setOperationAction(ISD::ConstantPool,       MVT::i32,   Custom);
  setOperationAction(ISD::SELECT,             MVT::f32,   Custom);
  setOperationAction(ISD::SELECT,             MVT::f64,   Custom);
  setOperationAction(ISD::SELECT,             MVT::i32,   Custom);
  setOperationAction(ISD::SETCC,              MVT::f32,   Custom);
  setOperationAction(ISD::SETCC,              MVT::f64,   Custom);
  setOperationAction(ISD::BRCOND,             MVT::Other, Custom);
  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32,   Custom);
  setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
  setOperationAction(ISD::VASTART,            MVT::Other, Custom);


  // We custom lower AND/OR to handle the case where the DAG contain 'ands/ors' 
  // with operands comming from setcc fp comparions. This is necessary since 
  // the result from these setcc are in a flag registers (FCR31).
  setOperationAction(ISD::AND,              MVT::i32,   Custom);
  setOperationAction(ISD::OR,               MVT::i32,   Custom);

  // Operations not directly supported by Mips.
  setOperationAction(ISD::BR_JT,             MVT::Other, Expand);
  setOperationAction(ISD::BR_CC,             MVT::Other, Expand);
  setOperationAction(ISD::SELECT_CC,         MVT::Other, Expand);
  setOperationAction(ISD::UINT_TO_FP,        MVT::i32,   Expand);
  setOperationAction(ISD::FP_TO_UINT,        MVT::i32,   Expand);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1,    Expand);
  setOperationAction(ISD::CTPOP,             MVT::i32,   Expand);
  setOperationAction(ISD::CTTZ,              MVT::i32,   Expand);
  setOperationAction(ISD::ROTL,              MVT::i32,   Expand);
  setOperationAction(ISD::ROTR,              MVT::i32,   Expand);
  setOperationAction(ISD::SHL_PARTS,         MVT::i32,   Expand);
  setOperationAction(ISD::SRA_PARTS,         MVT::i32,   Expand);
  setOperationAction(ISD::SRL_PARTS,         MVT::i32,   Expand);
  setOperationAction(ISD::FCOPYSIGN,         MVT::f32,   Expand);
  setOperationAction(ISD::FCOPYSIGN,         MVT::f64,   Expand);
  setOperationAction(ISD::FSIN,              MVT::f32,   Expand);
  setOperationAction(ISD::FCOS,              MVT::f32,   Expand);
  setOperationAction(ISD::FPOWI,             MVT::f32,   Expand);
  setOperationAction(ISD::FPOW,              MVT::f32,   Expand);
  setOperationAction(ISD::FLOG,              MVT::f32,   Expand);
  setOperationAction(ISD::FLOG2,             MVT::f32,   Expand);
  setOperationAction(ISD::FLOG10,            MVT::f32,   Expand);
  setOperationAction(ISD::FEXP,              MVT::f32,   Expand);

  setOperationAction(ISD::EH_LABEL,          MVT::Other, Expand);

  // Use the default for now
  setOperationAction(ISD::STACKSAVE,         MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE,      MVT::Other, Expand);
  setOperationAction(ISD::MEMBARRIER,        MVT::Other, Expand);

  if (Subtarget->isSingleFloat())
    setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);

  // Subtargets without the seb/seh instructions expand sign extension.
  if (!Subtarget->hasSEInReg()) {
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
  }

  if (!Subtarget->hasBitCount())
    setOperationAction(ISD::CTLZ, MVT::i32, Expand);

  if (!Subtarget->hasSwap())
    setOperationAction(ISD::BSWAP, MVT::i32, Expand);

  setStackPointerRegisterToSaveRestore(Mips::SP);
  computeRegisterProperties();
}
+
+MVT::SimpleValueType MipsTargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i32;
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned MipsTargetLowering::getFunctionAlignment(const Function *) const {
+  return 2;
+}
+
+SDValue MipsTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) 
+{
+  switch (Op.getOpcode()) 
+  {
+    case ISD::AND:                return LowerANDOR(Op, DAG);
+    case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
+    case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+    case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+    case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+    case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+    case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+    case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+    case ISD::OR:                 return LowerANDOR(Op, DAG);
+    case ISD::SELECT:             return LowerSELECT(Op, DAG);
+    case ISD::SETCC:              return LowerSETCC(Op, DAG);
+    case ISD::VASTART:            return LowerVASTART(Op, DAG);
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Lower helper functions
+//===----------------------------------------------------------------------===//
+
+// AddLiveIn - This helper function adds the specified physical register to the
+// MachineFunction as a live in value.  It also creates a corresponding
+// virtual register for it.
+static unsigned
+AddLiveIn(MachineFunction &MF, unsigned PReg, TargetRegisterClass *RC) 
+{
+  assert(RC->contains(PReg) && "Not the correct regclass!");
+  unsigned VReg = MF.getRegInfo().createVirtualRegister(RC);
+  MF.getRegInfo().addLiveIn(PReg, VReg);
+  return VReg;
+}
+
+// Get fp branch code (not opcode) from condition code.
+static Mips::FPBranchCode GetFPBranchCodeFromCond(Mips::CondCode CC) {
+  if (CC >= Mips::FCOND_F && CC <= Mips::FCOND_NGT)
+    return Mips::BRANCH_T;
+
+  if (CC >= Mips::FCOND_T && CC <= Mips::FCOND_GT)
+    return Mips::BRANCH_F;
+
+  return Mips::BRANCH_INVALID;
+}
+  
+static unsigned FPBranchCodeToOpc(Mips::FPBranchCode BC) {
+  switch(BC) {
+    default:
+      llvm_unreachable("Unknown branch code");
+    case Mips::BRANCH_T  : return Mips::BC1T;
+    case Mips::BRANCH_F  : return Mips::BC1F;
+    case Mips::BRANCH_TL : return Mips::BC1TL;
+    case Mips::BRANCH_FL : return Mips::BC1FL;
+  }
+}
+
// FPCondCCodeToFCC - Map a generic ISD floating-point condition code to
// the corresponding MIPS FP condition code.  Unordered-or-X codes map to
// the matching U* condition; plain SETxx share the ordered mapping.
static Mips::CondCode FPCondCCodeToFCC(ISD::CondCode CC) {
  switch (CC) {
  default: llvm_unreachable("Unknown fp condition code!");
  case ISD::SETEQ:  
  case ISD::SETOEQ: return Mips::FCOND_EQ;
  case ISD::SETUNE: return Mips::FCOND_OGL;
  case ISD::SETLT:  
  case ISD::SETOLT: return Mips::FCOND_OLT;
  case ISD::SETGT:  
  case ISD::SETOGT: return Mips::FCOND_OGT;
  case ISD::SETLE:  
  case ISD::SETOLE: return Mips::FCOND_OLE; 
  case ISD::SETGE:
  case ISD::SETOGE: return Mips::FCOND_OGE;
  case ISD::SETULT: return Mips::FCOND_ULT;
  case ISD::SETULE: return Mips::FCOND_ULE; 
  case ISD::SETUGT: return Mips::FCOND_UGT;
  case ISD::SETUGE: return Mips::FCOND_UGE;
  case ISD::SETUO:  return Mips::FCOND_UN; 
  case ISD::SETO:   return Mips::FCOND_OR;
  case ISD::SETNE:  
  case ISD::SETONE: return Mips::FCOND_NEQ;
  case ISD::SETUEQ: return Mips::FCOND_UEQ;
  }
}
+
/// EmitInstrWithCustomInserter - Expand the Select_CC / Select_FCC pseudo
/// instructions into an explicit diamond CFG: a conditional branch over a
/// fall-through block, joined by a PHI that picks the selected value.
MachineBasicBlock *
MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                MachineBasicBlock *BB,
                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  bool isFPCmp = false;
  DebugLoc dl = MI->getDebugLoc();

  switch (MI->getOpcode()) {
  default: assert(false && "Unexpected instr type to insert");
  case Mips::Select_FCC:
  case Mips::Select_FCC_S32:
  case Mips::Select_FCC_D32:
    isFPCmp = true; // FALL THROUGH
  case Mips::Select_CC:
  case Mips::Select_CC_S32:
  case Mips::Select_CC_D32: {
    // To "insert" a SELECT_CC instruction, we actually have to insert the
    // diamond control-flow pattern.  The incoming instruction knows the
    // destination vreg to set, the condition code register to branch on, the
    // true/false values to select between, and a branch opcode to use.
    const BasicBlock *LLVM_BB = BB->getBasicBlock();
    MachineFunction::iterator It = BB;
    ++It;

    //  thisMBB:
    //  ...
    //   TrueVal = ...
    //   setcc r1, r2, r3
    //   bNE   r1, r0, copy1MBB
    //   fallthrough --> copy0MBB
    MachineBasicBlock *thisMBB  = BB;
    MachineFunction *F = BB->getParent();
    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
    MachineBasicBlock *sinkMBB  = F->CreateMachineBasicBlock(LLVM_BB);

    // Emit the right instruction according to the type of the operands compared
    // NOTE(review): the branch referencing sinkMBB is built before sinkMBB is
    // inserted into the function below -- confirm this ordering is intended.
    if (isFPCmp) {
      // Find the condiction code present in the setcc operation.
      Mips::CondCode CC = (Mips::CondCode)MI->getOperand(4).getImm();
      // Get the branch opcode from the branch code.
      unsigned Opc = FPBranchCodeToOpc(GetFPBranchCodeFromCond(CC));
      BuildMI(BB, dl, TII->get(Opc)).addMBB(sinkMBB);
    } else
      BuildMI(BB, dl, TII->get(Mips::BNE)).addReg(MI->getOperand(1).getReg())
        .addReg(Mips::ZERO).addMBB(sinkMBB);

    F->insert(It, copy0MBB);
    F->insert(It, sinkMBB);
    // Update machine-CFG edges by first adding all successors of the current
    // block to the new block which will contain the Phi node for the select.
    // Also inform sdisel of the edge changes.
    for(MachineBasicBlock::succ_iterator i = BB->succ_begin(),
          e = BB->succ_end(); i != e; ++i) {
      EM->insert(std::make_pair(*i, sinkMBB));
      sinkMBB->addSuccessor(*i);
    }
    // Next, remove all successors of the current block, and add the true
    // and fallthrough blocks as its successors.
    while(!BB->succ_empty())
      BB->removeSuccessor(BB->succ_begin());
    BB->addSuccessor(copy0MBB);
    BB->addSuccessor(sinkMBB);

    //  copy0MBB:
    //   %FalseValue = ...
    //   # fallthrough to sinkMBB
    BB = copy0MBB;

    // Update machine-CFG edges
    BB->addSuccessor(sinkMBB);

    //  sinkMBB:
    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
    //  ...
    BB = sinkMBB;
    BuildMI(BB, dl, TII->get(Mips::PHI), MI->getOperand(0).getReg())
      .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
      .addReg(MI->getOperand(3).getReg()).addMBB(thisMBB);

    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
    return BB;
  }
  }
}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+// Custom lowering of FP_TO_SINT, active only on Mips1 subtargets (all
+// others return Op unchanged and use the default lowering). FCR31 (the FP
+// control register) is read, its two low bits are forced to 01 by the
+// OR 3 / XOR 2 sequence -- presumably selecting the round-toward-zero
+// rounding mode; TODO confirm against the MIPS FCSR specification -- and
+// written back, after which a MipsISD::FPRound performs the conversion and
+// the f32 result is bitcast to i32.
+SDValue MipsTargetLowering::
+LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG)
+{
+  if (!Subtarget->isMips1())
+    return Op;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned CCReg = AddLiveIn(MF, Mips::FCR31, Mips::CCRRegisterClass);
+
+  SDValue Chain = DAG.getEntryNode();
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Src = Op.getOperand(0);
+
+  // Set the condition register
+  // NOTE(review): the copies below start from the entry-node Chain instead
+  // of threading each previous copy's output chain; the ordering appears to
+  // rely on the CondReg value dependency -- verify scheduling is safe.
+  SDValue CondReg = DAG.getCopyFromReg(Chain, dl, CCReg, MVT::i32);
+  CondReg = DAG.getCopyToReg(Chain, dl, Mips::AT, CondReg);
+  CondReg = DAG.getCopyFromReg(CondReg, dl, Mips::AT, MVT::i32);
+
+  SDValue Cst = DAG.getConstant(3, MVT::i32);
+  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i32, CondReg, Cst);
+  Cst = DAG.getConstant(2, MVT::i32);
+  SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i32, Or, Cst);
+
+  SDValue InFlag(0, 0);
+  CondReg = DAG.getCopyToReg(Chain, dl, Mips::FCR31, Xor, InFlag);
+
+  // Emit the round instruction and bit convert to integer
+  SDValue Trunc = DAG.getNode(MipsISD::FPRound, dl, MVT::f32,
+                              Src, CondReg.getValue(1));
+  SDValue BitCvt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Trunc);
+  return BitCvt;
+}
+
+// Lower a dynamic stack allocation: bump SP downwards by the requested
+// size and return the new area's start address together with the chain.
+SDValue MipsTargetLowering::
+LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG)
+{
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue InChain   = Op.getOperand(0);
+  SDValue AllocSize = Op.getOperand(1);
+
+  // Read the current value of the Mips stack pointer.
+  SDValue SP = DAG.getCopyFromReg(InChain, dl, Mips::SP, MVT::i32);
+
+  // The stack grows down: new SP = old SP - allocation size.
+  SDValue NewSP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, AllocSize);
+
+  // Commit the new stack start address back into the stack pointer register.
+  SDValue OutChain = DAG.getCopyToReg(SP.getValue(1), dl, Mips::SP, NewSP);
+
+  // This node always produces two results: the allocated address and a chain.
+  SDValue RetVals[2] = { NewSP, OutChain };
+  return DAG.getMergeValues(RetVals, 2, dl);
+}
+
+// Lower AND/OR whose operands are both fp compares: turn each compare into
+// an i32 0/1 via FPSelectCC, then apply the original logical opcode.
+SDValue MipsTargetLowering::
+LowerANDOR(SDValue Op, SelectionDAG &DAG)
+{
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Lhs = Op.getOperand(0);
+  SDValue Rhs = Op.getOperand(1);
+
+  // Only custom-lowered when both sides are floating point compares.
+  if (Lhs.getOpcode() != MipsISD::FPCmp || Rhs.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  SDValue One  = DAG.getConstant(1, MVT::i32);
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+
+  // Each select forwards its compare's condition-code operand (operand 2).
+  SDValue LhsSel = DAG.getNode(MipsISD::FPSelectCC, dl, One.getValueType(),
+                               Lhs, One, Zero, Lhs.getOperand(2));
+  SDValue RhsSel = DAG.getNode(MipsISD::FPSelectCC, dl, One.getValueType(),
+                               Rhs, One, Zero, Rhs.getOperand(2));
+
+  return DAG.getNode(Op.getOpcode(), dl, MVT::i32, LhsSel, RhsSel);
+}
+
+// Lower BRCOND when its condition is an fp compare: emit a MipsISD::FPBrcond
+// carrying the branch code derived from the compare's condition code.
+SDValue MipsTargetLowering::
+LowerBRCOND(SDValue Op, SelectionDAG &DAG)
+{
+  // Operand 0 is the chain, operand 1 the condition, operand 2 the
+  // destination block taken when the condition holds.
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue InChain = Op.getOperand(0);
+  SDValue Cond    = Op.getOperand(1);
+  SDValue Target  = Op.getOperand(2);
+
+  // Branches on anything but an fp compare use the default lowering.
+  if (Cond.getOpcode() != MipsISD::FPCmp)
+    return Op;
+
+  // Operand 2 of the compare holds its Mips condition code.
+  Mips::CondCode CC =
+    (Mips::CondCode)cast<ConstantSDNode>(Cond.getOperand(2))->getZExtValue();
+  SDValue BrCode = DAG.getConstant(GetFPBranchCodeFromCond(CC), MVT::i32);
+
+  return DAG.getNode(MipsISD::FPBrcond, dl, Op.getValueType(), InChain,
+                     BrCode, Target, Cond);
+}
+
+// Lower SETCC into a target-specific MipsISD::FPCmp node whose third
+// operand is the Mips FP condition code for the requested comparison.
+SDValue MipsTargetLowering::
+LowerSETCC(SDValue Op, SelectionDAG &DAG)
+{
+  // Ops #0 and #1 are the values being compared; op #2 is a CondCodeSDNode.
+  DebugLoc dl = Op.getDebugLoc();
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Translate the generic condition code into the Mips FP condition code.
+  SDValue FCC = DAG.getConstant(FPCondCCodeToFCC(CC), MVT::i32);
+
+  return DAG.getNode(MipsISD::FPCmp, dl, Op.getValueType(),
+                     Op.getOperand(0), Op.getOperand(1), FCC);
+}
+
+// Lower SELECT. A select whose condition comes from an fp compare becomes
+// FPSelectCC (forwarding the compare's condition-code operand); an integer
+// condition uses a conditional move when available, else SelectCC.
+SDValue MipsTargetLowering::
+LowerSELECT(SDValue Op, SelectionDAG &DAG)
+{
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Pred   = Op.getOperand(0);
+  SDValue TrueV  = Op.getOperand(1);
+  SDValue FalseV = Op.getOperand(2);
+
+  if (Pred.getOpcode() == MipsISD::FPCmp) {
+    // fp-compare condition: select via FPSelectCC, carrying the cc operand.
+    return DAG.getNode(MipsISD::FPSelectCC, dl, TrueV.getValueType(),
+                       Pred, TrueV, FalseV, Pred.getOperand(2));
+  }
+
+  // Integer condition: leave the node alone when the subtarget's
+  // conditional move can handle it (non-fp operands only).
+  if (Subtarget->hasCondMov() && !TrueV.getValueType().isFloatingPoint())
+    return Op;
+
+  // Otherwise fall back to the SelectCC pseudo.
+  return DAG.getNode(MipsISD::SelectCC, dl, TrueV.getValueType(),
+                     Pred, TrueV, FalseV);
+}
+
+// Lower a GlobalAddress node. PIC code loads the address from the GOT;
+// static code uses either a %gp_rel offset (small-section globals) or a
+// %hi/%lo pair.
+SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+  // FIXME there isn't actually debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+    // PIC: the address is loaded from the global offset table.
+    SDValue Addr = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+                                              MipsII::MO_GOT);
+    SDValue Load = DAG.getLoad(MVT::i32, dl,
+                               DAG.getEntryNode(), Addr, NULL, 0);
+    // On functions and global targets not internal linked only
+    // a load from got/GP is necessary for PIC to work.
+    if (!GV->hasLocalLinkage() || isa<Function>(GV))
+      return Load;
+    SDValue LoPart = DAG.getNode(MipsISD::Lo, dl, MVT::i32, Addr);
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, LoPart);
+  }
+
+  // Static relocation model.
+  SDVTList VTs = DAG.getVTList(MVT::i32);
+  MipsTargetObjectFile &TLOF = (MipsTargetObjectFile&)getObjFileLowering();
+
+  // %gp_rel relocation for globals that live in the small data section.
+  if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
+    SDValue Addr = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+                                              MipsII::MO_GPREL);
+    SDValue GPRel = DAG.getNode(MipsISD::GPRel, dl, VTs, &Addr, 1);
+    SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, GOT, GPRel);
+  }
+
+  // Everything else gets a %hi/%lo pair.
+  SDValue Addr = DAG.getTargetGlobalAddress(GV, MVT::i32, 0,
+                                            MipsII::MO_ABS_HILO);
+  SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, VTs, &Addr, 1);
+  SDValue LoPart = DAG.getNode(MipsISD::Lo, dl, MVT::i32, Addr);
+  return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, LoPart);
+}
+
+// Thread-local storage is not supported: any TLS global reaching this
+// lowering hook aborts compilation via llvm_unreachable.
+SDValue MipsTargetLowering::
+LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG)
+{
+  llvm_unreachable("TLS not implemented for MIPS.");
+  return SDValue(); // Not reached
+}
+
+// Lower a JumpTable address. Static code materializes it with a %hi/%lo
+// pair; PIC code loads the address from the GOT (OpFlag is MO_GOT in that
+// case), matching LowerGlobalAddress/LowerConstantPool.
+SDValue MipsTargetLowering::
+LowerJumpTable(SDValue Op, SelectionDAG &DAG) 
+{
+  SDValue ResNode;
+  SDValue HiPart; 
+  // FIXME there isn't actually debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+  unsigned char OpFlag = IsPIC ? MipsII::MO_GOT : MipsII::MO_ABS_HILO;
+
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT  = cast<JumpTableSDNode>(Op);
+
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
+
+  // The condition was inverted: the Hi node belongs to the static
+  // (non-PIC) path, and the GOT load to the PIC path. A GOT entry can
+  // only be resolved by loading it through the global pointer.
+  if (!IsPIC) {
+    SDValue Ops[] = { JTI };
+    HiPart = DAG.getNode(MipsISD::Hi, dl, DAG.getVTList(MVT::i32), Ops, 1);
+  } else // Emit Load from Global Pointer
+    HiPart = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), JTI, NULL, 0);
+
+  // Both paths add the %lo part of the jump table address.
+  SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, JTI);
+  ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo);
+
+  return ResNode;
+}
+
+// Lower a ConstantPool address: %hi/%lo pair for static code, a GOT load
+// plus %lo for PIC.
+SDValue MipsTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG) 
+{
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  Constant *C = N->getConstVal();
+  // FIXME there isn't actually debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+
+  // gp_rel relocation
+  // FIXME: we should reference the constant pool using small data sections,
+  // but the asm printer currently doesn't support this feature without
+  // hacking it. This feature should come soon so we can uncomment the
+  // stuff below.
+  //if (IsInSmallSection(C->getType())) {
+  //  SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, MVT::i32, CP);
+  //  SDValue GOT = DAG.getGLOBAL_OFFSET_TABLE(MVT::i32);
+  //  ResNode = DAG.getNode(ISD::ADD, MVT::i32, GOT, GPRelNode); 
+
+  if (IsPIC) {
+    // PIC: load the entry's address from the GOT, then add its %lo part.
+    SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+                                           N->getOffset(), MipsII::MO_GOT);
+    SDValue Load = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(),
+                               CP, NULL, 0);
+    SDValue LoPart = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CP);
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, LoPart);
+  }
+
+  // Static: materialize the address with a %hi/%lo pair.
+  SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+                                         N->getOffset(), MipsII::MO_ABS_HILO);
+  SDValue HiPart = DAG.getNode(MipsISD::Hi, dl, MVT::i32, CP);
+  SDValue LoPart = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CP);
+  return DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, LoPart);
+}
+
+// Lower VASTART: store the address of the VarArgsFrameIndex slot into the
+// va_list memory location given as operand 1.
+SDValue MipsTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Chain = Op.getOperand(0);
+  SDValue Addr  = Op.getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+
+  // The frame index of the first vararg slot is what va_start records.
+  SDValue Slot = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+  return DAG.getStore(Chain, dl, Slot, Addr, SV, 0);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "MipsGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+// TODO: Implement a generic logic using tblgen that can support this. 
+// Mips O32 ABI rules:
+// ---
+// i32 - Passed in A0, A1, A2, A3 and stack
+// f32 - Only passed in f32 registers if no int reg has been used yet to hold 
+//       an argument. Otherwise, passed in A1, A2, A3 and stack.
+// f64 - Only passed in two aliased f32 registers if no int reg has been used 
+//       yet to hold an argument. Otherwise, use A2, A3 and stack. If A1 is 
+//       not used, it must be shadowed. If only A3 is available, shadow it and
+//       go to stack.
+//===----------------------------------------------------------------------===//
+
+// Mips O32 ABI argument assignment for fixed (non-vararg) arguments.
+// Returns false on success: the CC always matches, and any value that gets
+// no register is assigned a stack slot of its natural size and alignment.
+static bool CC_MipsO32(unsigned ValNo, EVT ValVT,
+                       EVT LocVT, CCValAssign::LocInfo LocInfo,
+                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  static const unsigned IntRegsSize=4, FloatRegsSize=2; 
+
+  static const unsigned IntRegs[] = {
+      Mips::A0, Mips::A1, Mips::A2, Mips::A3
+  };
+  // FP argument registers, usable only while no integer register is taken.
+  static const unsigned F32Regs[] = {
+      Mips::F12, Mips::F14
+  };
+  static const unsigned F64Regs[] = {
+      Mips::D6, Mips::D7
+  };
+
+  unsigned Reg=0;
+  unsigned UnallocIntReg = State.getFirstUnallocated(IntRegs, IntRegsSize);
+  // True once some integer register has been consumed. When every integer
+  // register is allocated, getFirstUnallocated returns IntRegsSize, and the
+  // previous code read IntRegs[IntRegsSize] -- one element past the end of
+  // the array. Check that case first.
+  bool IntRegUsed = (UnallocIntReg == IntRegsSize) ||
+                    (IntRegs[UnallocIntReg] != (unsigned (Mips::A0)));
+
+  // Promote i8 and i16
+  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  // i32 takes the next A-register; f32 does too once any integer register
+  // has been used (it is then passed in an integer register as i32).
+  if (ValVT == MVT::i32 || (ValVT == MVT::f32 && IntRegUsed)) {
+    Reg = State.AllocateReg(IntRegs, IntRegsSize);
+    IntRegUsed = true;
+    LocVT = MVT::i32;
+  }
+
+  // FP values may use the dedicated FP argument registers only while no
+  // integer register has been used yet.
+  if (ValVT.isFloatingPoint() && !IntRegUsed) {
+    if (ValVT == MVT::f32)
+      Reg = State.AllocateReg(F32Regs, FloatRegsSize);
+    else
+      Reg = State.AllocateReg(F64Regs, FloatRegsSize);
+  }
+
+  // f64 after integer registers are in use: passed in A2/A3 or on the stack.
+  if (ValVT == MVT::f64 && IntRegUsed) {
+    if (UnallocIntReg != IntRegsSize) {
+      // If we hit register A3 as the first not allocated, we must
+      // mark it as allocated (shadow) and use the stack instead.
+      if (IntRegs[UnallocIntReg] != (unsigned (Mips::A3)))
+        Reg = Mips::A2;
+      // Shadow every remaining integer register. The previous code passed
+      // the loop index itself to AllocateReg, which does not name any of
+      // A0-A3, so the registers were never actually marked as used.
+      for (;UnallocIntReg < IntRegsSize; ++UnallocIntReg)
+        State.AllocateReg(IntRegs[UnallocIntReg]);
+    } 
+    LocVT = MVT::i32;
+  }
+
+  // No register could be assigned: place the value on the stack.
+  if (!Reg) {
+    unsigned SizeInBytes = ValVT.getSizeInBits() >> 3;
+    unsigned Offset = State.AllocateStack(SizeInBytes, SizeInBytes);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+  } else
+    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+
+  return false; // CC must always match
+}
+
+// Mips O32 ABI assignment for vararg calls: every argument travels in the
+// integer registers A0-A3 or on the stack, keeping f64 values 8-byte
+// aligned (register pairs A0/A1 or A2/A3). Returns false when assigned.
+static bool CC_MipsO32_VarArgs(unsigned ValNo, EVT ValVT,
+                       EVT LocVT, CCValAssign::LocInfo LocInfo,
+                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
+
+  static const unsigned IntRegsSize=4;
+
+  static const unsigned IntRegs[] = {
+      Mips::A0, Mips::A1, Mips::A2, Mips::A3
+  };
+
+  // Promote i8 and i16
+  if (LocVT == MVT::i8 || LocVT == MVT::i16) {
+    LocVT = MVT::i32;
+    if (ArgFlags.isSExt())
+      LocInfo = CCValAssign::SExt;
+    else if (ArgFlags.isZExt())
+      LocInfo = CCValAssign::ZExt;
+    else
+      LocInfo = CCValAssign::AExt;
+  }
+
+  // 32-bit values (f32 included) take the next free A-register, or a
+  // 4-byte stack slot once the registers run out.
+  if (ValVT == MVT::i32 || ValVT == MVT::f32) {
+    if (unsigned Reg = State.AllocateReg(IntRegs, IntRegsSize)) {
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+      return false;
+    }
+    unsigned Off = State.AllocateStack(4, 4);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, LocInfo));
+    return false;
+  }
+
+  unsigned UnallocIntReg = State.getFirstUnallocated(IntRegs, IntRegsSize);
+  if (ValVT == MVT::f64) {
+    // When all registers are taken, getFirstUnallocated returns
+    // IntRegsSize; the previous code then indexed IntRegs[] one past the
+    // end of the array. Every access below is now bounds-guarded, falling
+    // through to the stack path when no register remains.
+    if (UnallocIntReg != IntRegsSize &&
+        IntRegs[UnallocIntReg] == (unsigned (Mips::A1))) {
+      // A1 can't be used anymore, because 64 bit arguments
+      // must be aligned when copied back to the caller stack
+      State.AllocateReg(IntRegs, IntRegsSize);
+      UnallocIntReg++;
+    }
+
+    // A0 or A2 free: the f64 occupies that aligned register pair.
+    if (UnallocIntReg != IntRegsSize &&
+        (IntRegs[UnallocIntReg] == (unsigned (Mips::A0)) ||
+         IntRegs[UnallocIntReg] == (unsigned (Mips::A2)))) {
+      unsigned Reg = State.AllocateReg(IntRegs, IntRegsSize);
+      State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, MVT::i32, LocInfo));
+      // Shadow the next register so it can be used 
+      // later to get the other 32bit part.
+      State.AllocateReg(IntRegs, IntRegsSize);
+      return false;
+    }
+
+    // Register is shadowed to preserve alignment, and the
+    // argument goes to a stack location.
+    if (UnallocIntReg != IntRegsSize)
+      State.AllocateReg(IntRegs, IntRegsSize);
+
+    unsigned Off = State.AllocateStack(8, 8);
+    State.addLoc(CCValAssign::getMem(ValNo, ValVT, Off, LocVT, LocInfo));
+    return false;
+  }
+
+  return true; // CC didn't match
+}
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerCall - function arguments are copied from virtual registers to
+/// (physical regs)/(stack frame); CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: isTailCall.
+SDValue
+MipsTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                              CallingConv::ID CallConv, bool isVarArg,
+                              bool &isTailCall,
+                              const SmallVectorImpl<ISD::OutputArg> &Outs,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              DebugLoc dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) {
+  // MIPS target does not yet support tail call optimization.
+  isTailCall = false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool IsPIC = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
+                 *DAG.getContext());
+
+  // To meet O32 ABI, Mips must always allocate 16 bytes on
+  // the stack (even if less than 4 are used as arguments)
+  if (Subtarget->isABI_O32()) {
+    int VTsize = EVT(MVT::i32).getSizeInBits()/8;
+    MFI->CreateFixedObject(VTsize, (VTsize*3), true, false);
+    CCInfo.AnalyzeCallOperands(Outs, 
+                     isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32);
+  } else
+    CCInfo.AnalyzeCallOperands(Outs, CC_Mips);
+  
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  // With EABI is it possible to have 16 args on registers.
+  SmallVector<std::pair<unsigned, SDValue>, 16> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // First/LastArgStackLoc contains the first/last 
+  // "at stack" argument location.
+  int LastArgStackLoc = 0;
+  unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    SDValue Arg = Outs[i].Val;
+    CCValAssign &VA = ArgLocs[i];
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: 
+      // Under O32, fp values assigned to integer registers must be bitcast;
+      // an f64 is split into two i32 halves going to a register pair
+      // (LocReg, LocReg+1).
+      if (Subtarget->isABI_O32() && VA.isRegLoc()) {
+        if (VA.getValVT() == MVT::f32 && VA.getLocVT() == MVT::i32)
+          Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Arg);
+        if (VA.getValVT() == MVT::f64 && VA.getLocVT() == MVT::i32) {
+          Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+          SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
+                                   DAG.getConstant(0, getPointerTy()));
+          SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Arg,
+                                   DAG.getConstant(1, getPointerTy()));
+          RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
+          RegsToPass.push_back(std::make_pair(VA.getLocReg()+1, Hi));
+          continue;
+        }  
+      }
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+      break;
+    }
+    
+    // Arguments that can be passed on register must be kept at 
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+    
+    // Register can't get to this point...
+    assert(VA.isMemLoc());
+    
+    // Create the frame index object for this incoming parameter
+    // This guarantees that when allocating Local Area the firsts
+    // 16 bytes which are always reserved won't be overwritten
+    // if O32 ABI is used. For EABI the first address is zero.
+    LastArgStackLoc = (FirstStackArgLoc + VA.getLocMemOffset());
+    int FI = MFI->CreateFixedObject(VA.getValVT().getSizeInBits()/8,
+                                    LastArgStackLoc, true, false);
+
+    SDValue PtrOff = DAG.getFrameIndex(FI,getPointerTy());
+
+    // emit ISD::STORE which stores the 
+    // parameter value to a stack Location
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+  }
+
+  // Transform all store nodes into one single node because all store
+  // nodes are independent of each other.
+  if (!MemOpChains.empty())     
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token 
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol 
+  // node so that legalize doesn't hack it. 
+  unsigned char OpFlag = IsPIC ? MipsII::MO_GOT_CALL : MipsII::MO_NO_FLAG;
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) 
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), 
+                                getPointerTy(), 0, OpFlag);
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), 
+                                getPointerTy(), OpFlag);
+
+  // MipsJmpLink = #chain, #target_address, #opt_in_flags...
+  //             = Chain, Callee, Reg#1, Reg#2, ...  
+  //
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are 
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain  = DAG.getNode(MipsISD::JmpLink, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create a stack location to hold GP when PIC is used. This stack 
+  // location is used on function prologue to save GP and also after all 
+  // emitted CALL's to restore GP. 
+  if (IsPIC) {
+      // Function can have an arbitrary number of calls, so 
+      // hold the LastArgStackLoc with the biggest offset.
+      int FI;
+      MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+      if (LastArgStackLoc >= MipsFI->getGPStackOffset()) {
+        LastArgStackLoc = (!LastArgStackLoc) ? (16) : (LastArgStackLoc+4);
+        // Create the frame index only once. SPOffset here can be anything 
+        // (this will be fixed on processFunctionBeforeFrameFinalized)
+        if (MipsFI->getGPStackOffset() == -1) {
+          FI = MFI->CreateFixedObject(4, 0, true, false);
+          MipsFI->setGPFI(FI);
+        }
+        MipsFI->setGPStackOffset(LastArgStackLoc);
+      }
+
+      // Reload GP value.
+      FI = MipsFI->getGPFI();
+      SDValue FIN = DAG.getFrameIndex(FI,getPointerTy());
+      SDValue GPLoad = DAG.getLoad(MVT::i32, dl, Chain, FIN, NULL, 0);
+      Chain = GPLoad.getValue(1);
+      Chain = DAG.getCopyToReg(Chain, dl, DAG.getRegister(Mips::GP, MVT::i32), 
+                               GPLoad, SDValue(0,0));
+      InFlag = Chain.getValue(1);
+  }      
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+MipsTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) {
+
+  // Figure out which physical register each return value lives in.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_Mips);
+
+  // Copy each result out of its physreg, threading the chain and glue
+  // through the sequence of copies.
+  for (unsigned idx = 0, num = RVLocs.size(); idx != num; ++idx) {
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, RVLocs[idx].getLocReg(),
+                                     RVLocs[idx].getValVT(), InFlag);
+    Chain  = Val.getValue(1);
+    InFlag = Val.getValue(2);
+    InVals.push_back(Val.getValue(0));
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerFormalArguments - transform physical registers into virtual registers 
+/// and generate load operations for arguments places on the stack.
+SDValue
+MipsTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                        &Ins,
+                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+
+  unsigned StackReg = MF.getTarget().getRegisterInfo()->getFrameRegister(MF);
+  VarArgsFrameIndex = 0;
+
+  // Used with vargs to acumulate store chains.
+  std::vector<SDValue> OutChains;
+
+  // Keep track of the last register used for arguments
+  unsigned ArgRegEnd = 0;
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  if (Subtarget->isABI_O32())
+    CCInfo.AnalyzeFormalArguments(Ins, 
+                        isVarArg ? CC_MipsO32_VarArgs : CC_MipsO32);
+  else
+    CCInfo.AnalyzeFormalArguments(Ins, CC_Mips);
+
+  SDValue StackPtr;
+
+  unsigned FirstStackArgLoc = (Subtarget->isABI_EABI() ? 0 : 16);
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    // Arguments stored on registers
+    if (VA.isRegLoc()) {
+      EVT RegVT = VA.getLocVT();
+      ArgRegEnd = VA.getLocReg();
+      TargetRegisterClass *RC = 0;
+
+      if (RegVT == MVT::i32)
+        RC = Mips::CPURegsRegisterClass; 
+      else if (RegVT == MVT::f32) 
+        RC = Mips::FGR32RegisterClass;
+      else if (RegVT == MVT::f64) {
+        if (!Subtarget->isSingleFloat()) 
+          RC = Mips::AFGR64RegisterClass;
+      } else  
+        llvm_unreachable("RegVT not supported by FormalArguments Lowering");
+
+      // Transform the arguments stored on 
+      // physical registers into virtual ones
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+      
+      // If this is an 8 or 16-bit value, it has been passed promoted 
+      // to 32 bits.  Insert an assert[sz]ext to capture this, then 
+      // truncate to the right size.
+      if (VA.getLocInfo() != CCValAssign::Full) {
+        unsigned Opcode = 0;
+        if (VA.getLocInfo() == CCValAssign::SExt)
+          Opcode = ISD::AssertSext;
+        else if (VA.getLocInfo() == CCValAssign::ZExt)
+          Opcode = ISD::AssertZext;
+        if (Opcode)
+          ArgValue = DAG.getNode(Opcode, dl, RegVT, ArgValue, 
+                                 DAG.getValueType(VA.getValVT()));
+        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+      }
+
+      // Handle O32 ABI cases: i32->f32 and (i32,i32)->f64 
+      if (Subtarget->isABI_O32()) {
+        if (RegVT == MVT::i32 && VA.getValVT() == MVT::f32) 
+          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue);
+        if (RegVT == MVT::i32 && VA.getValVT() == MVT::f64) {
+          unsigned Reg2 = AddLiveIn(DAG.getMachineFunction(), 
+                                    VA.getLocReg()+1, RC);
+          SDValue ArgValue2 = DAG.getCopyFromReg(Chain, dl, Reg2, RegVT);
+          SDValue Hi = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue);
+          SDValue Lo = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, ArgValue2);
+          ArgValue = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::f64, Lo, Hi);
+        }
+      }
+
+      InVals.push_back(ArgValue);
+    } else { // VA.isRegLoc()
+
+      // sanity check
+      assert(VA.isMemLoc());
+
+      // The last argument is not a register anymore
+      ArgRegEnd = 0;
+      
+      // The stack pointer offset is relative to the caller stack frame. 
+      // Since the real stack size is unknown here, a negative SPOffset 
+      // is used so there's a way to adjust these offsets when the stack
+      // size get known (on EliminateFrameIndex). A dummy SPOffset is 
+      // used instead of a direct negative address (which is recorded to
+      // be used on emitPrologue) to avoid mis-calc of the first stack 
+      // offset on PEI::calculateFrameObjectOffsets.
+      // Arguments are always 32-bit.
+      unsigned ArgSize = VA.getLocVT().getSizeInBits()/8;
+      int FI = MFI->CreateFixedObject(ArgSize, 0, true, false);
+      MipsFI->recordLoadArgsFI(FI, -(ArgSize+
+        (FirstStackArgLoc + VA.getLocMemOffset())));
+
+      // Create load nodes to retrieve arguments from the stack
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
+    }
+  }
+
+  // The mips ABIs for returning structs by value requires that we copy
+  // the sret argument into $v0 for the return. Save the argument into
+  // a virtual register so that we can access it from the return points.
+  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+    unsigned Reg = MipsFI->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i32));
+      MipsFI->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+  }
+
+  // To meet ABI, when VARARGS are passed on registers, the registers
+  // must have their values written to the caller stack frame. If the last
+  // argument was placed in the stack, there's no need to save any register. 
+  if ((isVarArg) && (Subtarget->isABI_O32() && ArgRegEnd)) {
+    if (StackPtr.getNode() == 0)
+      StackPtr = DAG.getRegister(StackReg, getPointerTy());
+  
+    // The last register argument that must be saved is Mips::A3
+    TargetRegisterClass *RC = Mips::CPURegsRegisterClass;
+    unsigned StackLoc = ArgLocs.size()-1;
+
+    for (++ArgRegEnd; ArgRegEnd <= Mips::A3; ++ArgRegEnd, ++StackLoc) {
+      unsigned Reg = AddLiveIn(DAG.getMachineFunction(), ArgRegEnd, RC);
+      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, MVT::i32);
+
+      int FI = MFI->CreateFixedObject(4, 0, true, false);
+      MipsFI->recordStoreVarArgsFI(FI, -(4+(StackLoc*4)));
+      SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
+      OutChains.push_back(DAG.getStore(Chain, dl, ArgValue, PtrOff, NULL, 0));
+
+      // Record the frame index of the first variable argument
+      // which is a value necessary to VASTART.
+      if (!VarArgsFrameIndex)
+        VarArgsFrameIndex = FI;
+    }
+  }
+
+  // All stores are grouped in one node to allow the matching between 
+  // the size of Ins and InVals. This only happens when on varg functions
+  if (!OutChains.empty()) {
+    OutChains.push_back(Chain);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &OutChains[0], OutChains.size());
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// LowerReturn - Lower outgoing return values into a MipsISD::Ret node
+/// ("jr $ra"), copying each value into its convention-assigned register.
+SDValue
+MipsTargetLowering::LowerReturn(SDValue Chain,
+                                CallingConv::ID CallConv, bool isVarArg,
+                                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+
+  // Analyze return values with the Mips return-value convention.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Mips);
+
+  // If this is the first return lowered for this function, add 
+  // the regs to the liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  // Flag glues the CopyToReg nodes together so they are scheduled adjacently.
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 
+                             Outs[i].Val, Flag);
+
+    // guarantee that all emitted copies are
+    // stuck together, avoiding something bad
+    Flag = Chain.getValue(1);
+  }
+
+  // The mips ABIs for returning structs by value requires that we copy
+  // the sret argument into $v0 for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into $v0.
+  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+    MachineFunction &MF      = DAG.getMachineFunction();
+    MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+    unsigned Reg = MipsFI->getSRetReturnReg();
+
+    // The register is created by LowerFormalArguments for any sret
+    // function; a missing register here is a lowering bug, not user error.
+    if (!Reg) 
+      llvm_unreachable("sret virtual register not created in the entry block");
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+    Chain = DAG.getCopyToReg(Chain, dl, Mips::V0, Val, Flag);
+    Flag = Chain.getValue(1);
+  }
+
+  // Return on Mips is always a "jr $ra"; attach the glue only when at
+  // least one value (or the sret copy) was emitted above.
+  if (Flag.getNode())
+    return DAG.getNode(MipsISD::Ret, dl, MVT::Other, 
+                       Chain, DAG.getRegister(Mips::RA, MVT::i32), Flag);
+  else // Return Void
+    return DAG.getNode(MipsISD::Ret, dl, MVT::Other, 
+                       Chain, DAG.getRegister(Mips::RA, MVT::i32));
+}
+
+//===----------------------------------------------------------------------===//
+//                           Mips Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+MipsTargetLowering::ConstraintType MipsTargetLowering::
+getConstraintType(const std::string &Constraint) const 
+{
+  // Mips specific constraints 
+  // GCC config/mips/constraints.md
+  //
+  // 'd' : An address register. Equivalent to r 
+  //       unless generating MIPS16 code. 
+  // 'y' : Equivalent to r; retained for 
+  //       backwards compatibility. 
+  // 'f' : Floating Point registers.      
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+      default : break;
+      case 'd':     
+      case 'y': 
+      case 'f':
+        return C_RegisterClass;
+    }
+  }
+  // Anything else (including multi-letter constraints) is handled by the
+  // target-independent implementation.
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Given a constraint letter (e.g. "r"),
+/// return a (register, register class) pair that can satisfy it; a
+/// register of 0 means "any register in the class".
+std::pair<unsigned, const TargetRegisterClass*> MipsTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
+{
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      // General purpose registers.
+      return std::make_pair(0U, Mips::CPURegsRegisterClass);
+    case 'f':
+      // f32 maps onto FGR32; f64 is only available as even/odd AFGR64
+      // pairs when the FPU is in 32-bit, non single-float mode. Any other
+      // combination falls through to the generic handling below.
+      if (VT == MVT::f32)
+        return std::make_pair(0U, Mips::FGR32RegisterClass);
+      if (VT == MVT::f64)    
+        if ((!Subtarget->isSingleFloat()) && (!Subtarget->isFP64bit()))
+          return std::make_pair(0U, Mips::AFGR64RegisterClass);
+    }
+  }
+  // 'd', 'y' and anything unrecognized are handled generically.
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// getRegClassForInlineAsmConstraint - Given a register class constraint,
+/// like 'r', return the explicit list of registers an inline-asm operand
+/// may be allocated to. An empty vector defers to the generic handling.
+std::vector<unsigned> MipsTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  EVT VT) const
+{
+  // Only single-letter constraints are handled here.
+  if (Constraint.size() != 1)
+    return std::vector<unsigned>();
+
+  switch (Constraint[0]) {         
+    default : break;
+    case 'r':
+    // GCC Mips Constraint Letters
+    case 'd':     
+    case 'y': 
+      // General purpose T (temporary) and S (saved) registers; the
+      // trailing 0 terminates the make_vector list.
+      return make_vector<unsigned>(Mips::T0, Mips::T1, Mips::T2, Mips::T3, 
+             Mips::T4, Mips::T5, Mips::T6, Mips::T7, Mips::S0, Mips::S1, 
+             Mips::S2, Mips::S3, Mips::S4, Mips::S5, Mips::S6, Mips::S7, 
+             Mips::T8, 0);
+
+    case 'f':
+      // In single-float mode every 32-bit FP register can hold an f32;
+      // otherwise only the even-numbered registers are usable.
+      if (VT == MVT::f32) {
+        if (Subtarget->isSingleFloat())
+          return make_vector<unsigned>(Mips::F2, Mips::F3, Mips::F4, Mips::F5,
+                 Mips::F6, Mips::F7, Mips::F8, Mips::F9, Mips::F10, Mips::F11,
+                 Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24,
+                 Mips::F25, Mips::F26, Mips::F27, Mips::F28, Mips::F29,
+                 Mips::F30, Mips::F31, 0);
+        else
+          return make_vector<unsigned>(Mips::F2, Mips::F4, Mips::F6, Mips::F8, 
+                 Mips::F10, Mips::F20, Mips::F22, Mips::F24, Mips::F26, 
+                 Mips::F28, Mips::F30, 0);
+      }
+
+      // f64 needs AFGR64 even/odd pairs: only legal in 32-bit,
+      // non single-float FPU mode.
+      if (VT == MVT::f64)    
+        if ((!Subtarget->isSingleFloat()) && (!Subtarget->isFP64bit()))
+          return make_vector<unsigned>(Mips::D1, Mips::D2, Mips::D3, Mips::D4, 
+                 Mips::D5, Mips::D10, Mips::D11, Mips::D12, Mips::D13, 
+                 Mips::D14, Mips::D15, 0);
+  }
+  return std::vector<unsigned>();
+}
+
+/// isOffsetFoldingLegal - Return true if folding a constant offset into
+/// the given global address is legal for this target.
+bool
+MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Mips target isn't yet aware of offsets.
+  return false;
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively (without a constant pool load).
+bool MipsTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  // Only floating-point zero is materializable directly, and only for the
+  // scalar FP types (f32/f64) handled by the FPU.
+  return (VT == MVT::f32 || VT == MVT::f64) && Imm.isZero();
+}
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
new file mode 100644
index 0000000..7256617
--- /dev/null
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -0,0 +1,160 @@
+//===-- MipsISelLowering.h - Mips DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Mips uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MipsISELLOWERING_H
+#define MipsISELLOWERING_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "Mips.h"
+#include "MipsSubtarget.h"
+
+namespace llvm {
+  // MipsISD - Mips-specific SelectionDAG node types, produced during
+  // custom lowering and matched by the .td instruction patterns.
+  namespace MipsISD {
+    enum NodeType {
+      // Start the numbering from where ISD NodeType finishes.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      // Jump and link (call)
+      JmpLink,
+
+      // Get the Higher 16 bits from a 32-bit immediate
+      // No relation with Mips Hi register
+      Hi, 
+
+      // Get the Lower 16 bits from a 32-bit immediate
+      // No relation with Mips Lo register
+      Lo, 
+
+      // Handle gp_rel (small data/bss sections) relocation.
+      GPRel,
+
+      // Conditional Move
+      CMov,
+
+      // Select CC Pseudo Instruction
+      SelectCC,
+
+      // Floating Point Select CC Pseudo Instruction
+      FPSelectCC,
+
+      // Floating Point Branch Conditional
+      FPBrcond,
+
+      // Floating Point Compare
+      FPCmp,
+
+      // Floating Point Rounding
+      FPRound,
+
+      // Return 
+      Ret
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  
+  // MipsTargetLowering - Mips implementation of the TargetLowering
+  // interface: custom DAG lowering, calling-convention handling and
+  // inline-asm constraint support.
+  class MipsTargetLowering : public TargetLowering  {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+
+  public:
+
+    explicit MipsTargetLowering(MipsTargetMachine &TM);
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// getTargetNodeName - This method returns the name of a target specific 
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getSetCCResultType - get the ISD::SETCC result ValueType
+    MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+  private:
+    // Subtarget Info
+    const MipsSubtarget *Subtarget;
+
+
+    // Lower Operand helpers
+    /// LowerCallResult - Copy the values a call returns in physical
+    /// registers into InVals.
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+
+    // Lower Operand specifics - per-opcode routines dispatched from
+    // LowerOperation.
+    SDValue LowerANDOR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG);
+
+    // Calling convention hooks: lower incoming arguments, outgoing
+    // calls and function returns respectively.
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg,
+                bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    // Expands the custom-inserter pseudo instructions (e.g. the select-CC
+    // pseudos) into real machine basic blocks.
+    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                         MachineBasicBlock *MBB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+    // Inline asm support
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+
+    std::pair<unsigned, const TargetRegisterClass*> 
+              getRegForInlineAsmConstraint(const std::string &Constraint,
+              EVT VT) const;
+
+    std::vector<unsigned>
+    getRegClassForInlineAsmConstraint(const std::string &Constraint,
+              EVT VT) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+  };
+}
+
+#endif // MipsISELLOWERING_H
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
new file mode 100644
index 0000000..fa4518d
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -0,0 +1,314 @@
+//===- MipsInstrFPU.td - Mips FPU Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Mips FPU instruction set in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+// ------------------------
+// * 64bit fp:
+//    - 32 64-bit registers (default mode)
+//    - 16 even 32-bit registers (32-bit compatible mode) for
+//      single and double access.
+// * 32bit fp:
+//    - 16 even 32-bit registers - single and double (aliased)
+//    - 32 32-bit registers (within single-only mode)
+//===----------------------------------------------------------------------===//
+
+// Floating Point Compare and Branch
+// Type profiles for the MipsISD FP nodes declared in MipsISelLowering.h.
+def SDT_MipsFPBrcond : SDTypeProfile<0, 3, [SDTCisSameAs<0, 2>, SDTCisInt<0>,
+                                     SDTCisVT<1, OtherVT>]>;
+def SDT_MipsFPCmp : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>, SDTCisFP<0>, 
+                                  SDTCisInt<2>]>;
+def SDT_MipsFPSelectCC : SDTypeProfile<1, 4, [SDTCisInt<1>, SDTCisInt<4>,
+                                  SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>]>;
+
+def MipsFPRound : SDNode<"MipsISD::FPRound", SDTFPRoundOp, [SDNPOptInFlag]>;
+def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond, 
+                          [SDNPHasChain]>; 
+def MipsFPCmp : SDNode<"MipsISD::FPCmp", SDT_MipsFPCmp>;
+def MipsFPSelectCC : SDNode<"MipsISD::FPSelectCC", SDT_MipsFPSelectCC>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printFCCOperand" in
+  def condcode : Operand<i32>;
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// Subtarget feature tests used in Requires<[...]> clauses below.
+def In32BitMode      : Predicate<"!Subtarget.isFP64bit()">;
+def IsSingleFloat    : Predicate<"Subtarget.isSingleFloat()">;
+def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">;
+def IsNotMipsI       : Predicate<"!Subtarget.isMips1()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//
+// A set of multiclasses is used to address the register usage. 
+//
+// S32 - single precision in 16 32bit even fp registers
+//       single precision in 32 32bit fp registers in SingleOnly mode
+// S64 - single precision in 32 64bit fp registers (In64BitMode)
+// D32 - double precision in 16 32bit even fp registers
+// D64 - double precision in 32 64bit fp registers (In64BitMode)
+//
+// Only S32 and D32 are supported right now.
+//===----------------------------------------------------------------------===//
+
+// FFR1_1 - unary FP instruction with no selection pattern (selected
+// manually or via the Pat<>s at the end of this file).
+multiclass FFR1_1<bits<6> funct, string asmstr> 
+{
+  def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
+      !strconcat(asmstr, ".s $fd, $fs"), []>;
+
+  def _D32  : FFR<0x11, funct, 0x1, (outs FGR32:$fd), (ins AFGR64:$fs),
+      !strconcat(asmstr, ".d $fd, $fs"), []>, Requires<[In32BitMode]>;
+}
+
+// FFR1_2 - unary FP arithmetic instruction matched against SDNode FOp.
+multiclass FFR1_2<bits<6> funct, string asmstr, SDNode FOp> 
+{
+  def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
+                 !strconcat(asmstr, ".s $fd, $fs"), 
+                 [(set FGR32:$fd, (FOp FGR32:$fs))]>;
+
+  def _D32  : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
+                 !strconcat(asmstr, ".d $fd, $fs"), 
+                 [(set AFGR64:$fd, (FOp AFGR64:$fs))]>, Requires<[In32BitMode]>;
+}
+
+// FFR1_3 - conversion instruction between two register classes.
+// NOTE(review): RcSrc is used for the destination ($fd) and RcDst for the
+// source ($fs) - the parameter names look swapped; confirm against callers
+// such as CVTD_S32 before relying on them.
+class FFR1_3<bits<6> funct, bits<5> fmt, RegisterClass RcSrc, 
+              RegisterClass RcDst, string asmstr>: 
+  FFR<0x11, funct, fmt, (outs RcSrc:$fd), (ins RcDst:$fs), 
+      !strconcat(asmstr, " $fd, $fs"), []>; 
+
+
+// FFR1_4 - binary FP arithmetic instruction matched against SDNode FOp.
+multiclass FFR1_4<bits<6> funct, string asmstr, SDNode FOp> {
+  def _S32 : FFR<0x11, funct, 0x0, (outs FGR32:$fd), 
+                 (ins FGR32:$fs, FGR32:$ft), 
+                 !strconcat(asmstr, ".s $fd, $fs, $ft"),
+                 [(set FGR32:$fd, (FOp FGR32:$fs, FGR32:$ft))]>;
+
+  def _D32 : FFR<0x11, funct, 0x1, (outs AFGR64:$fd), 
+                 (ins AFGR64:$fs, AFGR64:$ft), 
+                 !strconcat(asmstr, ".d $fd, $fs, $ft"),
+                 [(set AFGR64:$fd, (FOp AFGR64:$fs, AFGR64:$ft))]>,
+                 Requires<[In32BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Instructions
+//===----------------------------------------------------------------------===//
+
+// These are single-source instructions; the unused ft field is encoded as 0.
+let ft = 0 in {
+  defm FLOOR_W : FFR1_1<0b001111, "floor.w">;
+  defm CEIL_W  : FFR1_1<0b001110, "ceil.w">;
+  defm ROUND_W : FFR1_1<0b001100, "round.w">;
+  defm TRUNC_W : FFR1_1<0b001101, "trunc.w">;
+  defm CVTW    : FFR1_1<0b100100, "cvt.w">;
+
+  defm FABS    : FFR1_2<0b000101, "abs",  fabs>; 
+  defm FNEG    : FFR1_2<0b000111, "neg",  fneg>; 
+  defm FSQRT   : FFR1_2<0b000100, "sqrt", fsqrt>;
+
+  /// Convert to Single Precision
+  def CVTS_W32 : FFR1_3<0b100000, 0x2, FGR32,  FGR32,  "cvt.s.w">;
+
+  let Predicates = [IsNotSingleFloat] in {
+    /// Ceil to long signed integer
+    def CEIL_LS   : FFR1_3<0b001010, 0x0, FGR32, FGR32, "ceil.l">;
+    def CEIL_LD   : FFR1_3<0b001010, 0x1, AFGR64, AFGR64, "ceil.l">;
+
+    /// Round to long signed integer
+    def ROUND_LS  : FFR1_3<0b001000, 0x0, FGR32, FGR32, "round.l">;
+    def ROUND_LD  : FFR1_3<0b001000, 0x1, AFGR64, AFGR64, "round.l">;
+
+    /// Floor to long signed integer
+    def FLOOR_LS  : FFR1_3<0b001011, 0x0, FGR32, FGR32, "floor.l">;
+    def FLOOR_LD  : FFR1_3<0b001011, 0x1, AFGR64, AFGR64, "floor.l">;
+
+    /// Trunc to long signed integer
+    def TRUNC_LS  : FFR1_3<0b001001, 0x0, FGR32, FGR32, "trunc.l">;
+    def TRUNC_LD  : FFR1_3<0b001001, 0x1, AFGR64, AFGR64, "trunc.l">;
+
+    /// Convert to long signed integer
+    def CVTL_S    : FFR1_3<0b100101, 0x0, FGR32, FGR32, "cvt.l">; 
+    def CVTL_D    : FFR1_3<0b100101, 0x1, AFGR64, AFGR64, "cvt.l">; 
+
+    /// Convert to Double Precision 
+    def CVTD_S32 : FFR1_3<0b100001, 0x0, AFGR64, FGR32, "cvt.d.s">; 
+    def CVTD_W32 : FFR1_3<0b100001, 0x2, AFGR64, FGR32, "cvt.d.w">; 
+    def CVTD_L32 : FFR1_3<0b100001, 0x3, AFGR64, AFGR64, "cvt.d.l">; 
+                   
+    /// Convert to Single Precision
+    def CVTS_D32 : FFR1_3<0b100000, 0x1, FGR32, AFGR64, "cvt.s.d">;
+    def CVTS_L32 : FFR1_3<0b100000, 0x3, FGR32, AFGR64, "cvt.s.l">; 
+  }
+}
+
+// The odd-numbered registers are only referenced when doing loads,
+// stores, and moves between floating-point and integer registers.
+// When defining instructions, we reference all 32-bit registers, 
+// regardless of register aliasing.
+let fd = 0 in {
+  /// Move Control Registers From/To CPU Registers
+  def CFC1  : FFR<0x11, 0x0, 0x2, (outs CPURegs:$rt), (ins CCR:$fs),
+                  "cfc1 $rt, $fs", []>;
+
+  // NOTE(review): the asm string prints the GPR input ($fs) first, so the
+  // operand-name/asm pairing differs from CFC1 - confirm intended.
+  def CTC1  : FFR<0x11, 0x0, 0x6, (outs CCR:$rt), (ins CPURegs:$fs),
+                  "ctc1 $fs, $rt", []>;
+                  
+  def MFC1  : FFR<0x11, 0x00, 0x00, (outs CPURegs:$rt), (ins FGR32:$fs),
+                  "mfc1 $rt, $fs", []>;
+
+  def MTC1  : FFR<0x11, 0x00, 0x04, (outs FGR32:$fs), (ins CPURegs:$rt),
+                  "mtc1 $rt, $fs", []>;
+}
+
+// Register-to-register FP moves (no selection pattern).
+def FMOV_S32 : FFR<0x11, 0b000110, 0x0, (outs FGR32:$fd), (ins FGR32:$fs),
+                   "mov.s $fd, $fs", []>;
+def FMOV_D32 : FFR<0x11, 0b000110, 0x1, (outs AFGR64:$fd), (ins AFGR64:$fs),
+                   "mov.d $fd, $fs", []>;
+
+/// Floating Point Memory Instructions
+// 64-bit accesses are gated on IsNotSingleFloat and IsNotMipsI -
+// presumably because they need AFGR64 register pairs; confirm.
+let Predicates = [IsNotSingleFloat, IsNotMipsI] in {
+  def LDC1 : FFI<0b110101, (outs AFGR64:$ft), (ins mem:$addr), 
+                 "ldc1 $ft, $addr", [(set AFGR64:$ft, (load addr:$addr))]>;
+
+  def SDC1 : FFI<0b111101, (outs), (ins AFGR64:$ft, mem:$addr), 
+                 "sdc1 $ft, $addr", [(store AFGR64:$ft, addr:$addr)]>;
+}
+
+// LWC1 and SWC1 can always be emitted with odd registers.
+def LWC1  : FFI<0b110001, (outs FGR32:$ft), (ins mem:$addr), "lwc1 $ft, $addr",
+               [(set FGR32:$ft, (load addr:$addr))]>; 
+def SWC1  : FFI<0b111001, (outs), (ins FGR32:$ft, mem:$addr), "swc1 $ft, $addr",
+               [(store FGR32:$ft, addr:$addr)]>; 
+
+/// Floating-point Arithmetic
+defm FADD : FFR1_4<0x10, "add", fadd>;
+defm FDIV : FFR1_4<0x03, "div", fdiv>;
+defm FMUL : FFR1_4<0x02, "mul", fmul>;
+defm FSUB : FFR1_4<0x01, "sub", fsub>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Branch Codes
+//===----------------------------------------------------------------------===//
+// Mips branch codes. These correspond to condcode in MipsInstrInfo.h. 
+// They must be kept in sync.
+def MIPS_BRANCH_F  : PatLeaf<(i32 0)>;
+def MIPS_BRANCH_T  : PatLeaf<(i32 1)>;
+def MIPS_BRANCH_FL : PatLeaf<(i32 2)>;
+def MIPS_BRANCH_TL : PatLeaf<(i32 3)>;
+
+/// Floating Point Branch of False/True (Likely)
+// All FP branches read the condition flag set in FCR31 by the compares.
+let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in {
+  class FBRANCH<PatLeaf op, string asmstr> : FFI<0x11, (outs), 
+        (ins brtarget:$dst), !strconcat(asmstr, " $dst"),
+        [(MipsFPBrcond op, bb:$dst, FCR31)]>;
+}
+def BC1F  : FBRANCH<MIPS_BRANCH_F,  "bc1f">;
+def BC1T  : FBRANCH<MIPS_BRANCH_T,  "bc1t">;
+def BC1FL : FBRANCH<MIPS_BRANCH_FL, "bc1fl">;
+def BC1TL : FBRANCH<MIPS_BRANCH_TL, "bc1tl">;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Flag Conditions
+//===----------------------------------------------------------------------===//
+// Mips condition codes. They must correspond to condcode in MipsInstrInfo.h. 
+// They must be kept in sync.
+def MIPS_FCOND_F    : PatLeaf<(i32 0)>;
+def MIPS_FCOND_UN   : PatLeaf<(i32 1)>;
+def MIPS_FCOND_EQ   : PatLeaf<(i32 2)>;
+def MIPS_FCOND_UEQ  : PatLeaf<(i32 3)>;
+def MIPS_FCOND_OLT  : PatLeaf<(i32 4)>;
+def MIPS_FCOND_ULT  : PatLeaf<(i32 5)>;
+def MIPS_FCOND_OLE  : PatLeaf<(i32 6)>;
+def MIPS_FCOND_ULE  : PatLeaf<(i32 7)>;
+def MIPS_FCOND_SF   : PatLeaf<(i32 8)>;
+def MIPS_FCOND_NGLE : PatLeaf<(i32 9)>;
+def MIPS_FCOND_SEQ  : PatLeaf<(i32 10)>;
+def MIPS_FCOND_NGL  : PatLeaf<(i32 11)>;
+def MIPS_FCOND_LT   : PatLeaf<(i32 12)>;
+def MIPS_FCOND_NGE  : PatLeaf<(i32 13)>;
+def MIPS_FCOND_LE   : PatLeaf<(i32 14)>;
+def MIPS_FCOND_NGT  : PatLeaf<(i32 15)>;
+
+/// Floating Point Compare
+// Compares write the FP condition flag (FCR31), consumed by FBRANCH
+// and the FP select pseudos.
+let hasDelaySlot = 1, Defs=[FCR31] in {
+  def FCMP_S32 : FCC<0x0, (outs), (ins FGR32:$fs, FGR32:$ft, condcode:$cc),
+      "c.$cc.s $fs, $ft", [(MipsFPCmp FGR32:$fs, FGR32:$ft, imm:$cc), 
+      (implicit FCR31)]>;
+  
+  def FCMP_D32 : FCC<0x1, (outs), (ins AFGR64:$fs, AFGR64:$ft, condcode:$cc),
+      "c.$cc.d $fs, $ft", [(MipsFPCmp AFGR64:$fs, AFGR64:$ft, imm:$cc),
+      (implicit FCR31)]>, Requires<[In32BitMode]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Floating Point Pseudo-Instructions
+//===----------------------------------------------------------------------===//
+
+// For some explanation, see Select_CC at MipsInstrInfo.td. We also embed a 
+// condition code to enable easy handling by the Custom Inserter.
+let usesCustomInserter = 1, Uses=[FCR31] in {
+  class PseudoFPSelCC<RegisterClass RC, string asmstr> : 
+    MipsPseudo<(outs RC:$dst), 
+               (ins CPURegs:$CmpRes, RC:$T, RC:$F, condcode:$cc), asmstr, 
+               [(set RC:$dst, (MipsFPSelectCC CPURegs:$CmpRes, RC:$T, RC:$F,
+                 imm:$cc))]>;
+}
+
+// The values to be selected are fp but the condition test is with integers.
+def Select_CC_S32 : PseudoSelCC<FGR32, "# MipsSelect_CC_S32_f32">;
+def Select_CC_D32 : PseudoSelCC<AFGR64, "# MipsSelect_CC_D32_f32">,
+                    Requires<[In32BitMode]>;
+
+// The values to be selected are int but the condition test is done with fp.
+def Select_FCC     : PseudoFPSelCC<CPURegs, "# MipsSelect_FCC">;
+
+// The values to be selected and the condition test is done with fp.
+def Select_FCC_S32 : PseudoFPSelCC<FGR32, "# MipsSelect_FCC_S32_f32">;
+def Select_FCC_D32 : PseudoFPSelCC<AFGR64, "# MipsSelect_FCC_D32_f32">, 
+                     Requires<[In32BitMode]>;
+
+// Pseudo copy between FP condition-code registers.
+def MOVCCRToCCR : MipsPseudo<(outs CCR:$dst), (ins CCR:$src), 
+                             "# MOVCCRToCCR", []>; 
+
+//===----------------------------------------------------------------------===//
+// Floating Point Patterns
+//===----------------------------------------------------------------------===//
+// fpimm0/fpimm0neg match the FP immediates +0.0/-0.0 so they can be
+// materialized from the integer zero register instead of a constant pool.
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimm0neg : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def : Pat<(f32 fpimm0), (MTC1 ZERO)>;
+def : Pat<(f32 fpimm0neg), (FNEG_S32 (MTC1 ZERO))>;
+
+// int -> fp conversions: move the integer into an FP register, then cvt.
+def : Pat<(f32 (sint_to_fp CPURegs:$src)), (CVTS_W32 (MTC1 CPURegs:$src))>;
+def : Pat<(f64 (sint_to_fp CPURegs:$src)), (CVTD_W32 (MTC1 CPURegs:$src))>;
+
+def : Pat<(i32 (fp_to_sint FGR32:$src)), (MFC1 (TRUNC_W_S32 FGR32:$src))>;
+
+// i32 <-> f32 bitcasts are plain moves between register files.
+def : Pat<(i32 (bitconvert FGR32:$src)),  (MFC1 FGR32:$src)>;
+def : Pat<(f32 (bitconvert CPURegs:$src)), (MTC1 CPURegs:$src)>;
+
+let Predicates = [In32BitMode] in { 
+  def : Pat<(f32 (fround AFGR64:$src)), (CVTS_D32 AFGR64:$src)>;
+  def : Pat<(f64 (fextend FGR32:$src)), (CVTD_S32 FGR32:$src)>;
+}
+
+// MipsFPRound is only emitted for MipsI targets.
+def : Pat<(f32 (MipsFPRound AFGR64:$src)), (CVTW_D32 AFGR64:$src)>;
+
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
new file mode 100644
index 0000000..0853272
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -0,0 +1,182 @@
+//===- MipsInstrFormats.td - Mips Instruction Formats -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe MIPS instructions format
+//
+//  CPU INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  rs      - src reg.
+//  rt      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  rd      - dst reg, only used on 3 regs instr.
+//  shamt   - only used on shift instructions, contains the shift amount.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+// Generic Mips Format
+// Base class for all MIPS instructions: holds the full 32-bit encoding
+// ('Inst') and the major opcode field common to every instruction format.
+class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern, 
+               InstrItinClass itin>: Instruction 
+{
+  field bits<32> Inst;
+
+  let Namespace = "Mips";
+
+  bits<6> opcode;
+
+  // Top 6 bits (31-26) are the 'opcode' field
+  let Inst{31-26} = opcode;   
+  
+  dag OutOperandList = outs;
+  dag InOperandList  = ins;
+
+  let AsmString   = asmstr;
+  let Pattern     = pattern;
+  let Itinerary   = itin;
+}
+
+// Mips Pseudo Instructions Format
+// Pseudo instructions carry no real machine encoding; they are expanded or
+// eliminated before code emission.
+class MipsPseudo<dag outs, dag ins, string asmstr, list<dag> pattern>:
+      MipsInst<outs, ins, asmstr, pattern, IIPseudo>;
+
+//===----------------------------------------------------------------------===//
+// Format R instruction class in Mips : <|opcode|rs|rt|rd|shamt|funct|>
+//===----------------------------------------------------------------------===//
+
+// R-format: three-register ALU operations; 'funct' selects the operation
+// within the major opcode.
+class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
+         list<dag> pattern, InstrItinClass itin>:
+      MipsInst<outs, ins, asmstr, pattern, itin> 
+{
+  bits<5>  rd;
+  bits<5>  rs;
+  bits<5>  rt;
+  bits<5>  shamt;
+  bits<6>  funct;
+
+  let opcode = op;
+  let funct  = _funct;
+
+  // Layout: |31-26 op|25-21 rs|20-16 rt|15-11 rd|10-6 shamt|5-0 funct|
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt; 
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = shamt;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
+//===----------------------------------------------------------------------===//
+
+// I-format: register + 16-bit immediate (loads, stores, branches, addi, ...).
+class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> 
+{
+  bits<5>  rt;
+  bits<5>  rs;
+  bits<16> imm16;
+
+  let opcode = op;
+
+  // Layout: |31-26 op|25-21 rs|20-16 rt|15-0 imm16|
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt; 
+  let Inst{15-0}  = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Format J instruction class in Mips : <|opcode|address|>
+//===----------------------------------------------------------------------===//
+
+// J-format: jumps carrying a 26-bit target address field.
+class FJ<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
+         InstrItinClass itin>: MipsInst<outs, ins, asmstr, pattern, itin> 
+{
+  bits<26> addr;
+
+  let opcode = op;
+  
+  // Layout: |31-26 op|25-0 addr|
+  let Inst{25-0} = addr;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//  FLOATING POINT INSTRUCTION FORMATS
+//
+//  opcode  - operation code.
+//  fs      - src reg.
+//  ft      - dst reg (on a 2 regs instr) or src reg (on a 3 reg instr).
+//  fd      - dst reg, only used on 3 regs instr.
+//  fmt     - double or single precision.
+//  funct   - combined with opcode field give us an operation code.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Format FR instruction class in Mips : <|opcode|fmt|ft|fs|fd|funct|>
+//===----------------------------------------------------------------------===//
+
+// FR-format FP instruction: three FP registers plus a precision ('fmt')
+// selector; 'funct' picks the operation.
+class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, 
+          string asmstr, list<dag> pattern> : 
+          MipsInst<outs, ins, asmstr, pattern, NoItinerary> 
+{
+  bits<5>  fd;
+  bits<5>  fs;
+  bits<5>  ft;
+  bits<5>  fmt;
+  bits<6>  funct;
+
+  let opcode = op;
+  let funct  = _funct;
+  let fmt    = _fmt;
+
+  // Layout: |31-26 op|25-21 fmt|20-16 ft|15-11 fs|10-6 fd|5-0 funct|
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft; 
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+//===----------------------------------------------------------------------===//
+// Format FI instruction class in Mips : <|opcode|base|ft|immediate|>
+//===----------------------------------------------------------------------===//
+
+// FI-format FP instruction: FP loads/stores with base register + offset.
+class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: 
+          MipsInst<outs, ins, asmstr, pattern, NoItinerary> 
+{
+  bits<5>  ft;
+  bits<5>  base;
+  bits<16> imm16;
+
+  let opcode = op;
+
+  // Layout: |31-26 op|25-21 base|20-16 ft|15-0 imm16|
+  let Inst{25-21} = base;
+  let Inst{20-16} = ft; 
+  let Inst{15-0}  = imm16;
+}
+
+//===----------------------------------------------------------------------===//
+// Compare instruction class in Mips : <|010001|fmt|ft|fs|0000011|condcode|>
+//===----------------------------------------------------------------------===//
+
+// FP compare (c.cond.fmt): compares fs/ft and sets condition-code bit 'cc'.
+class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : 
+          MipsInst<outs, ins, asmstr, pattern, NoItinerary> 
+{
+  bits<5>  fs;
+  bits<5>  ft;
+  bits<4>  cc;
+  bits<5>  fmt;
+
+  // 0x11 is the COP1 major opcode.
+  let opcode = 0x11;
+  let fmt    = _fmt;
+
+  // Layout: |31-26 010001|25-21 fmt|20-16 ft|15-11 fs|10-6 0|5-4 11|3-0 cc|
+  let Inst{25-21} = fmt;
+  let Inst{20-16} = ft; 
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = 0;
+  let Inst{5-4}   = 0b11;
+  let Inst{3-0}   = cc;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
new file mode 100644
index 0000000..1a9bffc
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -0,0 +1,621 @@
+//===- MipsInstrInfo.cpp - Mips Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsInstrInfo.h"
+#include "MipsTargetMachine.h"
+#include "MipsMachineFunction.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "MipsGenInstrInfo.inc"
+
+using namespace llvm;
+
+// Constructor: hand the tablegen-generated instruction descriptor table
+// (MipsInsts, from MipsGenInstrInfo.inc) to the base class and build the
+// register-info object for the current subtarget.
+MipsInstrInfo::MipsInstrInfo(MipsTargetMachine &tm)
+  : TargetInstrInfoImpl(MipsInsts, array_lengthof(MipsInsts)),
+    TM(tm), RI(*TM.getSubtargetImpl(), *this) {}
+
+// Return true if the operand is an immediate with value zero.
+static bool isZeroImm(const MachineOperand &op) {
+  return op.isImm() && op.getImm() == 0;
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+/// Recognized copies: ADDu/OR with $zero, FP moves (FMOV_S32/FMOV_D32),
+/// CP1<->CPU moves (MFC1/MTC1), CCR copies, and ADDiu with a 0 immediate.
+bool MipsInstrInfo::
+isMoveInstr(const MachineInstr &MI, unsigned &SrcReg, unsigned &DstReg,
+            unsigned &SrcSubIdx, unsigned &DstSubIdx) const 
+{
+  SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+  // addu $dst, $src, $zero || addu $dst, $zero, $src
+  // or   $dst, $src, $zero || or   $dst, $zero, $src
+  // NOTE(review): operands 1 and 2 are accessed with getReg() unchecked;
+  // presumably ADDu/OR always have register operands - confirm in the .td.
+  if ((MI.getOpcode() == Mips::ADDu) || (MI.getOpcode() == Mips::OR)) {
+    if (MI.getOperand(1).getReg() == Mips::ZERO) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(2).getReg();
+      return true;
+    } else if (MI.getOperand(2).getReg() == Mips::ZERO) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(1).getReg();
+      return true;
+    }
+  }
+
+  // mov $fpDst, $fpSrc
+  // mfc $gpDst, $fpSrc
+  // mtc $fpDst, $gpSrc
+  // All of these are unconditional 2-operand copies.
+  if (MI.getOpcode() == Mips::FMOV_S32 || 
+      MI.getOpcode() == Mips::FMOV_D32 || 
+      MI.getOpcode() == Mips::MFC1 || 
+      MI.getOpcode() == Mips::MTC1 ||
+      MI.getOpcode() == Mips::MOVCCRToCCR) {
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  }
+
+  // addiu $dst, $src, 0
+  if (MI.getOpcode() == Mips::ADDiu) {
+    if ((MI.getOperand(1).isReg()) && (isZeroImm(MI.getOperand(2)))) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(1).getReg();
+      return true;
+    }
+  }
+
+  return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned MipsInstrInfo::
+isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const 
+{
+  // Loads have the form: lw $dst, imm($base) -> operands (dst, imm, FI).
+  if ((MI->getOpcode() == Mips::LW) || (MI->getOpcode() == Mips::LWC1) ||
+      (MI->getOpcode() == Mips::LDC1)) {
+    if ((MI->getOperand(2).isFI()) && // is a stack slot
+        (MI->getOperand(1).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(1)))) { // (isZeroImm re-checks isImm)
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned MipsInstrInfo::
+isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const 
+{
+  // Stores have the form: sw $src, imm($base) -> operands (src, imm, FI).
+  if ((MI->getOpcode() == Mips::SW) || (MI->getOpcode() == Mips::SWC1) ||
+      (MI->getOpcode() == Mips::SDC1)) {
+    if ((MI->getOperand(2).isFI()) && // is a stack slot
+        (MI->getOperand(1).isImm()) &&  // the imm is zero
+        (isZeroImm(MI->getOperand(1)))) { // (isZeroImm re-checks isImm)
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+  }
+  return 0;
+}
+
+/// insertNoop - If data hazard condition is found insert the target nop
+/// instruction before the given iterator position.
+void MipsInstrInfo::
+insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const 
+{
+  // Borrow the debug location of the neighboring instruction when available.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  BuildMI(MBB, MI, DL, get(Mips::NOP));
+}
+
+// copyRegToReg - Emit a register-to-register copy before iterator I.
+// Cross-class copies (GPR<->CCR, GPR<->FGR32, GPR<->HI/LO) use the
+// corresponding move instruction; same-class copies use ADDu with $zero
+// for GPRs and the FP/CCR move pseudos otherwise.  Returns false if the
+// pair of register classes cannot be copied.
+bool MipsInstrInfo::
+copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+             unsigned DestReg, unsigned SrcReg,
+             const TargetRegisterClass *DestRC,
+             const TargetRegisterClass *SrcRC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (DestRC != SrcRC) {
+
+    // Copy to/from FCR31 condition register
+    if ((DestRC == Mips::CPURegsRegisterClass) && 
+        (SrcRC == Mips::CCRRegisterClass))
+      BuildMI(MBB, I, DL, get(Mips::CFC1), DestReg).addReg(SrcReg);
+    else if ((DestRC == Mips::CCRRegisterClass) && 
+        (SrcRC == Mips::CPURegsRegisterClass))
+      BuildMI(MBB, I, DL, get(Mips::CTC1), DestReg).addReg(SrcReg);
+
+    // Moves between coprocessors and cpu
+    else if ((DestRC == Mips::CPURegsRegisterClass) && 
+        (SrcRC == Mips::FGR32RegisterClass))
+      BuildMI(MBB, I, DL, get(Mips::MFC1), DestReg).addReg(SrcReg);
+    else if ((DestRC == Mips::FGR32RegisterClass) &&
+             (SrcRC == Mips::CPURegsRegisterClass))
+      BuildMI(MBB, I, DL, get(Mips::MTC1), DestReg).addReg(SrcReg);
+
+    // Move from/to Hi/Lo registers
+    // NOTE(review): only one explicit operand is added to the MTHI/MTLO/
+    // MFHI/MFLO instructions here (SrcReg is dropped for the MT case);
+    // presumably the other operand is implicit in the .td definitions -
+    // confirm against MipsInstrInfo.td.
+    else if ((DestRC == Mips::HILORegisterClass) &&
+             (SrcRC == Mips::CPURegsRegisterClass)) {
+      unsigned Opc = (DestReg == Mips::HI) ? Mips::MTHI : Mips::MTLO;
+      BuildMI(MBB, I, DL, get(Opc), DestReg);
+    } else if ((SrcRC == Mips::HILORegisterClass) &&
+               (DestRC == Mips::CPURegsRegisterClass)) {
+      unsigned Opc = (SrcReg == Mips::HI) ? Mips::MFHI : Mips::MFLO;
+      BuildMI(MBB, I, DL, get(Opc), DestReg);
+    } else 
+      // Can't copy this register
+      return false; 
+
+    return true;
+  }
+
+  // Same-class copies.
+  if (DestRC == Mips::CPURegsRegisterClass)
+    BuildMI(MBB, I, DL, get(Mips::ADDu), DestReg).addReg(Mips::ZERO)
+      .addReg(SrcReg);
+  else if (DestRC == Mips::FGR32RegisterClass) 
+    BuildMI(MBB, I, DL, get(Mips::FMOV_S32), DestReg).addReg(SrcReg);
+  else if (DestRC == Mips::AFGR64RegisterClass)
+    BuildMI(MBB, I, DL, get(Mips::FMOV_D32), DestReg).addReg(SrcReg);
+  else if (DestRC == Mips::CCRRegisterClass)
+    BuildMI(MBB, I, DL, get(Mips::MOVCCRToCCR), DestReg).addReg(SrcReg);
+  else
+    // Can't copy this register
+    return false;
+  
+  return true;
+}
+
+// storeRegToStackSlot - Spill SrcReg to frame index FI.  GPRs use SW,
+// single FP uses SWC1.  64-bit FP uses SDC1, except on Mips1 (which lacks
+// SDC1): there the two 32-bit sub-registers are stored with two SWC1s.
+void MipsInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI, 
+                    const TargetRegisterClass *RC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == Mips::CPURegsRegisterClass) 
+    BuildMI(MBB, I, DL, get(Mips::SW)).addReg(SrcReg, getKillRegState(isKill))
+          .addImm(0).addFrameIndex(FI);
+  else if (RC == Mips::FGR32RegisterClass)
+    BuildMI(MBB, I, DL, get(Mips::SWC1)).addReg(SrcReg, getKillRegState(isKill))
+          .addImm(0).addFrameIndex(FI);
+  else if (RC == Mips::AFGR64RegisterClass) {
+    if (!TM.getSubtarget<MipsSubtarget>().isMips1()) {
+      BuildMI(MBB, I, DL, get(Mips::SDC1))
+        .addReg(SrcReg, getKillRegState(isKill))
+        .addImm(0).addFrameIndex(FI);
+    } else {
+      // NOTE(review): assumes SubSet[0]/SubSet[1] are the low/high 32-bit
+      // halves at offsets 0 and 4 - confirm against MipsRegisterInfo.td.
+      const TargetRegisterInfo *TRI = 
+        MBB.getParent()->getTarget().getRegisterInfo();
+      const unsigned *SubSet = TRI->getSubRegisters(SrcReg);
+      BuildMI(MBB, I, DL, get(Mips::SWC1))
+        .addReg(SubSet[0], getKillRegState(isKill))
+        .addImm(0).addFrameIndex(FI);
+      BuildMI(MBB, I, DL, get(Mips::SWC1))
+        .addReg(SubSet[1], getKillRegState(isKill))
+        .addImm(4).addFrameIndex(FI);
+    }
+  } else
+    llvm_unreachable("Register class not handled!");
+}
+
+// loadRegFromStackSlot - Reload DestReg from frame index FI.  Mirrors
+// storeRegToStackSlot: LW for GPRs, LWC1 for single FP, LDC1 for 64-bit FP
+// except on Mips1 where the two 32-bit halves are loaded with two LWC1s.
+void MipsInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const 
+{
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == Mips::CPURegsRegisterClass) 
+    BuildMI(MBB, I, DL, get(Mips::LW), DestReg).addImm(0).addFrameIndex(FI);
+  else if (RC == Mips::FGR32RegisterClass)
+    BuildMI(MBB, I, DL, get(Mips::LWC1), DestReg).addImm(0).addFrameIndex(FI);
+  else if (RC == Mips::AFGR64RegisterClass) {
+    if (!TM.getSubtarget<MipsSubtarget>().isMips1()) {
+      BuildMI(MBB, I, DL, get(Mips::LDC1), DestReg).addImm(0).addFrameIndex(FI);
+    } else {
+      // NOTE(review): assumes SubSet[0]/SubSet[1] are the halves at
+      // offsets 0 and 4 - must match the layout used by the store above.
+      const TargetRegisterInfo *TRI = 
+        MBB.getParent()->getTarget().getRegisterInfo();
+      const unsigned *SubSet = TRI->getSubRegisters(DestReg);
+      BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[0])
+        .addImm(0).addFrameIndex(FI);
+      BuildMI(MBB, I, DL, get(Mips::LWC1), SubSet[1])
+        .addImm(4).addFrameIndex(FI);
+    }
+  } else
+    llvm_unreachable("Register class not handled!");
+}
+
+/// foldMemoryOperandImpl - Try to fold a register copy (ADDu with $zero or
+/// an FP move) with the stack slot FI: the copy becomes a direct store
+/// (when the folded operand is the def, Ops[0] == 0) or a direct load
+/// (when it is the use).  Returns the new, unbound instruction, or NULL if
+/// the instruction cannot be folded.
+MachineInstr *MipsInstrInfo::
+foldMemoryOperandImpl(MachineFunction &MF,
+                      MachineInstr* MI,
+                      const SmallVectorImpl<unsigned> &Ops, int FI) const 
+{
+  // Only a single spilled/reloaded operand can be folded.
+  if (Ops.size() != 1) return NULL;
+
+  MachineInstr *NewMI = NULL;
+
+  switch (MI->getOpcode()) {
+  case Mips::ADDu:
+    // addu $dst, $zero, $src is a plain register copy.
+    if ((MI->getOperand(0).isReg()) &&
+        (MI->getOperand(1).isReg()) &&
+        (MI->getOperand(1).getReg() == Mips::ZERO) &&
+        (MI->getOperand(2).isReg())) {
+      if (Ops[0] == 0) {    // COPY -> STORE
+        unsigned SrcReg = MI->getOperand(2).getReg();
+        bool isKill = MI->getOperand(2).isKill();
+        bool isUndef = MI->getOperand(2).isUndef();
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(Mips::SW))
+          .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+          .addImm(0).addFrameIndex(FI);
+      } else {              // COPY -> LOAD
+        unsigned DstReg = MI->getOperand(0).getReg();
+        bool isDead = MI->getOperand(0).isDead();
+        bool isUndef = MI->getOperand(0).isUndef();
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(Mips::LW))
+          .addReg(DstReg, RegState::Define | getDeadRegState(isDead) |
+                  getUndefRegState(isUndef))
+          .addImm(0).addFrameIndex(FI);
+      }
+    }
+    break;
+  case Mips::FMOV_S32:
+  case Mips::FMOV_D32:
+    if ((MI->getOperand(0).isReg()) &&
+        (MI->getOperand(1).isReg())) {
+      const TargetRegisterClass 
+        *RC = RI.getRegClass(MI->getOperand(0).getReg());
+      unsigned StoreOpc, LoadOpc;
+      bool IsMips1 = TM.getSubtarget<MipsSubtarget>().isMips1();
+
+      if (RC == Mips::FGR32RegisterClass) {
+        LoadOpc = Mips::LWC1; StoreOpc = Mips::SWC1;
+      } else {
+        assert(RC == Mips::AFGR64RegisterClass);
+        // Mips1 doesn't have ldc/sdc instructions.
+        if (IsMips1) break;
+        LoadOpc = Mips::LDC1; StoreOpc = Mips::SDC1;
+      }
+
+      if (Ops[0] == 0) {    // COPY -> STORE
+        unsigned SrcReg = MI->getOperand(1).getReg();
+        bool isKill = MI->getOperand(1).isKill();
+        // Bug fix: the undef flag belongs to the source operand (1).  The
+        // FP move has only two operands, so the previous getOperand(2)
+        // access was out of bounds.
+        bool isUndef = MI->getOperand(1).isUndef();
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(StoreOpc))
+          .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef))
+          .addImm(0).addFrameIndex(FI);
+      } else {              // COPY -> LOAD
+        unsigned DstReg = MI->getOperand(0).getReg();
+        bool isDead = MI->getOperand(0).isDead();
+        bool isUndef = MI->getOperand(0).isUndef();
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(LoadOpc))
+          .addReg(DstReg, RegState::Define | getDeadRegState(isDead) |
+                  getUndefRegState(isUndef))
+          .addImm(0).addFrameIndex(FI);
+      }
+    }
+    break;
+  }
+
+  return NewMI;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+/// GetCondFromBranchOpc - Return the Mips CC that matches 
+/// the correspondent Branch instruction opcode.  Returns COND_INVALID for
+/// anything that is not an analyzable integer conditional branch.
+static Mips::CondCode GetCondFromBranchOpc(unsigned BrOpc) 
+{
+  switch (BrOpc) {
+  default: return Mips::COND_INVALID;
+  case Mips::BEQ  : return Mips::COND_E;
+  case Mips::BNE  : return Mips::COND_NE;
+  case Mips::BGTZ : return Mips::COND_GZ;
+  case Mips::BGEZ : return Mips::COND_GEZ;
+  case Mips::BLTZ : return Mips::COND_LZ;
+  case Mips::BLEZ : return Mips::COND_LEZ;
+
+  // We don't do fp branch analysis yet!  
+  case Mips::BC1T : 
+  case Mips::BC1F : return Mips::COND_INVALID;
+  }
+}
+
+/// GetCondBranchFromCond - Return the Branch instruction
+/// opcode that matches the cc.
+unsigned Mips::GetCondBranchFromCond(Mips::CondCode CC) 
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case Mips::COND_E   : return Mips::BEQ;
+  case Mips::COND_NE  : return Mips::BNE;
+  case Mips::COND_GZ  : return Mips::BGTZ;
+  case Mips::COND_GEZ : return Mips::BGEZ;
+  case Mips::COND_LZ  : return Mips::BLTZ;
+  case Mips::COND_LEZ : return Mips::BLEZ;
+
+  // All "branch-true" FP conditions use BC1T...
+  case Mips::FCOND_F:
+  case Mips::FCOND_UN:
+  case Mips::FCOND_EQ:
+  case Mips::FCOND_UEQ:
+  case Mips::FCOND_OLT:
+  case Mips::FCOND_ULT:
+  case Mips::FCOND_OLE:
+  case Mips::FCOND_ULE:
+  case Mips::FCOND_SF:
+  case Mips::FCOND_NGLE:
+  case Mips::FCOND_SEQ:
+  case Mips::FCOND_NGL:
+  case Mips::FCOND_LT:
+  case Mips::FCOND_NGE:
+  case Mips::FCOND_LE:
+  case Mips::FCOND_NGT: return Mips::BC1T;
+
+  // ...and their "branch-false" counterparts use BC1F.
+  case Mips::FCOND_T:
+  case Mips::FCOND_OR:
+  case Mips::FCOND_NEQ:
+  case Mips::FCOND_OGL:
+  case Mips::FCOND_UGE:
+  case Mips::FCOND_OGE:
+  case Mips::FCOND_UGT:
+  case Mips::FCOND_OGT:
+  case Mips::FCOND_ST:
+  case Mips::FCOND_GLE:
+  case Mips::FCOND_SNE:
+  case Mips::FCOND_GL:
+  case Mips::FCOND_NLT:
+  case Mips::FCOND_GE:
+  case Mips::FCOND_NLE:
+  case Mips::FCOND_GT: return Mips::BC1F;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified 
+/// condition, e.g. turning COND_E to COND_NE.  Each FP "branch-true"
+/// condition maps to its "branch-false" twin (same test, opposite branch).
+Mips::CondCode Mips::GetOppositeBranchCondition(Mips::CondCode CC) 
+{
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case Mips::COND_E   : return Mips::COND_NE;
+  case Mips::COND_NE  : return Mips::COND_E;
+  case Mips::COND_GZ  : return Mips::COND_LEZ;
+  case Mips::COND_GEZ : return Mips::COND_LZ;
+  case Mips::COND_LZ  : return Mips::COND_GEZ;
+  case Mips::COND_LEZ : return Mips::COND_GZ;
+  case Mips::FCOND_F  : return Mips::FCOND_T;
+  case Mips::FCOND_UN : return Mips::FCOND_OR;
+  case Mips::FCOND_EQ : return Mips::FCOND_NEQ;
+  case Mips::FCOND_UEQ: return Mips::FCOND_OGL;
+  case Mips::FCOND_OLT: return Mips::FCOND_UGE;
+  case Mips::FCOND_ULT: return Mips::FCOND_OGE;
+  case Mips::FCOND_OLE: return Mips::FCOND_UGT;
+  case Mips::FCOND_ULE: return Mips::FCOND_OGT;
+  case Mips::FCOND_SF:  return Mips::FCOND_ST;
+  case Mips::FCOND_NGLE:return Mips::FCOND_GLE;
+  case Mips::FCOND_SEQ: return Mips::FCOND_SNE;
+  case Mips::FCOND_NGL: return Mips::FCOND_GL;
+  case Mips::FCOND_LT:  return Mips::FCOND_NLT;
+  case Mips::FCOND_NGE: return Mips::FCOND_GE;
+  case Mips::FCOND_LE:  return Mips::FCOND_NLE;
+  case Mips::FCOND_NGT: return Mips::FCOND_GT;
+  }
+}
+
+/// AnalyzeBranch - Analyze the terminators of MBB.  Returns false (success)
+/// with TBB/FBB/Cond filled per the TargetInstrInfo contract, or true when
+/// the terminator sequence cannot be understood.
+bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, 
+                                  MachineBasicBlock *&TBB,
+                                  MachineBasicBlock *&FBB,
+                                  SmallVectorImpl<MachineOperand> &Cond,
+                                  bool AllowModify) const 
+{
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+  
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  unsigned LastOpc = LastInst->getOpcode();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (!LastInst->getDesc().isBranch())
+      return true;
+
+    // Unconditional branch
+    if (LastOpc == Mips::J) {
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+
+    Mips::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    if (BranchCode == Mips::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+
+    // Conditional branch: block ends with a fall-through condbranch.  The
+    // branch target is the last operand; the condition registers precede
+    // it.  (Bug fix: this used to be guarded by the type-confused test
+    // 'LastOpc != Mips::COND_INVALID', comparing an instruction opcode
+    // against a CondCode; BranchCode was already validated above.)
+    int LastNumOp = LastInst->getNumOperands();
+
+    TBB = LastInst->getOperand(LastNumOp-1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+
+    for (int i=0; i<LastNumOp-1; i++) {
+      Cond.push_back(LastInst->getOperand(i));
+    }
+
+    return false;
+  }
+  
+  // Get the instruction before it if it is a terminator.
+  MachineInstr *SecondLastInst = I;
+  
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
+    return true;
+
+  // If the block ends with Mips::J and a Mips::BNE/Mips::BEQ, handle it.
+  unsigned SecondLastOpc    = SecondLastInst->getOpcode();
+  Mips::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+
+  if (BranchCode != Mips::COND_INVALID && LastOpc == Mips::J) {
+    int SecondNumOp = SecondLastInst->getNumOperands();
+
+    TBB = SecondLastInst->getOperand(SecondNumOp-1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+
+    for (int i=0; i<SecondNumOp-1; i++) {
+      Cond.push_back(SecondLastInst->getOperand(i));
+    }
+
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+  
+  // If the block ends with two unconditional branches, handle it. The last 
+  // one is not executed, so remove it.
+  if ((SecondLastOpc == Mips::J) && (LastOpc == Mips::J)) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+// InsertBranch - Insert the branch sequence described by TBB/FBB/Cond at
+// the end of MBB and return the number of instructions inserted (1 or 2).
+// Cond is either empty (unconditional), or holds the CondCode immediate
+// followed by one or two condition registers.
+unsigned MipsInstrInfo::
+InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, 
+             MachineBasicBlock *FBB,
+             const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME this should probably have a DebugLoc argument
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 3 || Cond.size() == 2 || Cond.size() == 0) &&
+         "Mips branch conditions can have two|three components!");
+
+  if (FBB == 0) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch?
+      BuildMI(&MBB, dl, get(Mips::J)).addMBB(TBB);
+    } else {
+      // Conditional branch.  The opcode's operand count tells us whether
+      // the branch compares one register or two.
+      unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm());
+      const TargetInstrDesc &TID = get(Opc);
+
+      if (TID.getNumOperands() == 3)
+        BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg())
+                          .addReg(Cond[2].getReg())
+                          .addMBB(TBB);
+      else
+        BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg())
+                          .addMBB(TBB);
+
+    }                             
+    return 1;
+  }
+  
+  // Two-way Conditional branch: condbranch to TBB plus jump to FBB.
+  unsigned Opc = GetCondBranchFromCond((Mips::CondCode)Cond[0].getImm());
+  const TargetInstrDesc &TID = get(Opc);
+
+  if (TID.getNumOperands() == 3)
+    BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg()).addReg(Cond[2].getReg())
+                      .addMBB(TBB);
+  else
+    BuildMI(&MBB, dl, TID).addReg(Cond[1].getReg()).addMBB(TBB);
+
+  BuildMI(&MBB, dl, get(Mips::J)).addMBB(FBB);
+  return 2;
+}
+
+// RemoveBranch - Delete the (at most two) branch instructions at the end of
+// MBB and return how many were removed.
+unsigned MipsInstrInfo::
+RemoveBranch(MachineBasicBlock &MBB) const 
+{
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  // Last instruction is neither an unconditional nor a conditional branch.
+  if (I->getOpcode() != Mips::J && 
+      GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID)
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+  
+  if (I == MBB.begin()) return 1;
+  --I;
+  // Only a conditional branch can precede the one just removed.
+  if (GetCondFromBranchOpc(I->getOpcode()) == Mips::COND_INVALID)
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+/// ReverseBranchCondition - Invert the condition in-place by replacing the
+/// CondCode immediate (Cond[0]) with its opposite.  Returns false, meaning
+/// the condition was successfully reversed.
+bool MipsInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const 
+{
+  assert( (Cond.size() == 3 || Cond.size() == 2) && 
+          "Invalid Mips branch condition!");
+  Cond[0].setImm(GetOppositeBranchCondition((Mips::CondCode)Cond[0].getImm()));
+  return false;
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+/// The result is cached in MipsFunctionInfo so the copy from $gp is
+/// emitted at most once per function.
+unsigned MipsInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  MipsFunctionInfo *MipsFI = MF->getInfo<MipsFunctionInfo>();
+  unsigned GlobalBaseReg = MipsFI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getTarget().getInstrInfo();
+
+  GlobalBaseReg = RegInfo.createVirtualRegister(Mips::CPURegsRegisterClass);
+  bool Ok = TII->copyRegToReg(FirstMBB, MBBI, GlobalBaseReg, Mips::GP,
+                              Mips::CPURegsRegisterClass,
+                              Mips::CPURegsRegisterClass);
+  assert(Ok && "Couldn't assign to global base register!");
+  Ok = Ok; // Silence warning when assertions are turned off.
+  RegInfo.addLiveIn(Mips::GP);
+
+  MipsFI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
new file mode 100644
index 0000000..ab8dc59
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -0,0 +1,251 @@
+//===- MipsInstrInfo.h - Mips Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSINSTRUCTIONINFO_H
+#define MIPSINSTRUCTIONINFO_H
+
+#include "Mips.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "MipsRegisterInfo.h"
+
+namespace llvm {
+
+namespace Mips {
+
+  // Mips Branch Codes
+  enum FPBranchCode {
+    BRANCH_F,
+    BRANCH_T,
+    BRANCH_FL,
+    BRANCH_TL,
+    BRANCH_INVALID
+  };
+
+  // Mips Condition Codes
+  // NOTE: the FCOND_T..FCOND_GT list below mirrors FCOND_F..FCOND_NGT
+  // entry-for-entry; GetOppositeBranchCondition and MipsFCCToString rely
+  // on that pairing, so the two lists must stay in matching order.
+  enum CondCode {
+    // To be used with float branch True
+    FCOND_F,
+    FCOND_UN,
+    FCOND_EQ,
+    FCOND_UEQ,
+    FCOND_OLT,
+    FCOND_ULT,
+    FCOND_OLE,
+    FCOND_ULE,
+    FCOND_SF,
+    FCOND_NGLE,
+    FCOND_SEQ,
+    FCOND_NGL,
+    FCOND_LT,
+    FCOND_NGE,
+    FCOND_LE,
+    FCOND_NGT,
+
+    // To be used with float branch False
+    // These conditions have the same mnemonic as the
+    // above ones, but are used with a branch False;
+    FCOND_T,
+    FCOND_OR,
+    FCOND_NEQ,
+    FCOND_OGL,
+    FCOND_UGE,
+    FCOND_OGE,
+    FCOND_UGT,
+    FCOND_OGT,
+    FCOND_ST,
+    FCOND_GLE,
+    FCOND_SNE,
+    FCOND_GL,
+    FCOND_NLT,
+    FCOND_GE,
+    FCOND_NLE,
+    FCOND_GT,
+
+    // Only integer conditions
+    COND_E,
+    COND_GZ,
+    COND_GEZ,
+    COND_LZ,
+    COND_LEZ,
+    COND_NE,
+    COND_INVALID
+  };
+  
+  // Turn condition code into conditional branch opcode.
+  unsigned GetCondBranchFromCond(CondCode CC);
+
+  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
+  /// e.g. turning COND_E to COND_NE.
+  CondCode GetOppositeBranchCondition(Mips::CondCode CC);
+
+  /// MipsFCCToString - Map each FP condition code to the assembly mnemonic
+  /// used to print it.  A true-branch condition and its false-branch twin
+  /// (same test, branched with BC1F instead of BC1T) share one mnemonic,
+  /// named after the true-branch member of the pair.
+  inline static const char *MipsFCCToString(Mips::CondCode CC) 
+  {
+    switch (CC) {
+      default: llvm_unreachable("Unknown condition code");
+      case FCOND_F:
+      case FCOND_T:   return "f";
+      case FCOND_UN:
+      case FCOND_OR:  return "un";
+      case FCOND_EQ: 
+      case FCOND_NEQ: return "eq";
+      case FCOND_UEQ:
+      case FCOND_OGL: return "ueq";
+      case FCOND_OLT:
+      case FCOND_UGE: return "olt";
+      case FCOND_ULT:
+      case FCOND_OGE: return "ult";
+      case FCOND_OLE:
+      case FCOND_UGT: return "ole";
+      case FCOND_ULE:
+      case FCOND_OGT: return "ule";
+      case FCOND_SF:
+      case FCOND_ST:  return "sf";
+      case FCOND_NGLE:
+      case FCOND_GLE: return "ngle";
+      case FCOND_SEQ:
+      case FCOND_SNE: return "seq";
+      case FCOND_NGL:
+      case FCOND_GL:  return "ngl";
+      case FCOND_LT:
+      case FCOND_NLT: return "lt";
+      // Bug fix: these three pairs previously returned the false-branch
+      // mnemonics "ge", "nle" and "gt".  The MIPS c.cond.fmt condition
+      // codes 13/14/15 are "nge", "le" and "ngt", matching the
+      // true-branch naming used by every other case above.
+      case FCOND_NGE:
+      case FCOND_GE:  return "nge";
+      case FCOND_LE:
+      case FCOND_NLE: return "le";
+      case FCOND_NGT:
+      case FCOND_GT:  return "ngt";
+    }
+  }
+}
+
+/// MipsII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace MipsII {
+  /// Target Operand Flag enum.
+  enum TOF {
+    //===------------------------------------------------------------------===//
+    // Mips Specific MachineOperand flags.
+ 
+    MO_NO_FLAG,
+
+    /// MO_GOT - Represents the offset into the global offset table at which
+    /// the address the relocation entry symbol resides during execution.
+    MO_GOT,
+
+    /// MO_GOT_CALL - Represents the offset into the global offset table at 
+    /// which the address of a call site relocation entry symbol resides 
+    /// during execution. This is different from the above since this flag
+    /// can only be present in call instructions.
+    MO_GOT_CALL,
+
+    /// MO_GPREL - Represents the offset from the current gp value to be used 
+    /// for the relocatable object file being produced.
+    MO_GPREL,
+
+    /// MO_ABS_HILO - Represents the hi or low part of an absolute symbol
+    /// address. 
+    MO_ABS_HILO
+
+  };
+}
+
+/// MipsInstrInfo - Mips implementation of the TargetInstrInfo hooks the
+/// target-independent code generator uses: move/load/store recognition,
+/// branch analysis and synthesis, register copies and spills.
+class MipsInstrInfo : public TargetInstrInfoImpl {
+  MipsTargetMachine &TM;        // Owning target machine (kept for subtarget queries).
+  const MipsRegisterInfo RI;    // Register info is owned by value, see getRegisterInfo().
+public:
+  explicit MipsInstrInfo(MipsTargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const MipsRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register to register move and return
+  /// the source and dest operands and their sub-register indices by reference.
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg,
+                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+  
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+ 
+  /// Branch Analysis - implement the TargetInstrInfo branch-analysis
+  /// contract: decompose a block terminator sequence (AnalyzeBranch),
+  /// delete it (RemoveBranch), or materialize a new one (InsertBranch).
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond) const;
+  /// copyRegToReg - Emit a register-to-register copy at I; returns false if
+  /// the copy between the given register classes is not possible.
+  virtual bool copyRegToReg(MachineBasicBlock &MBB, 
+                            MachineBasicBlock::iterator I,
+                            unsigned DestReg, unsigned SrcReg,
+                            const TargetRegisterClass *DestRC,
+                            const TargetRegisterClass *SrcRC) const;
+  /// storeRegToStackSlot / loadRegFromStackSlot - spill and reload a
+  /// register through the given frame index.
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC) const;
+
+  /// foldMemoryOperandImpl - fold a frame-index load/store into MI where
+  /// possible (stack-slot variant).
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              int FrameIndex) const;
+
+  /// Folding against an arbitrary load is not supported: always returns 0.
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              MachineInstr* LoadMI) const {
+    return 0;
+  }
+  
+  /// ReverseBranchCondition - invert the condition operands produced by
+  /// AnalyzeBranch; returns true if the condition cannot be reversed.
+  virtual
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  /// Insert nop instruction when hazard condition is found
+  virtual void insertNoop(MachineBasicBlock &MBB, 
+                          MachineBasicBlock::iterator MI) const;
+
+  /// getGlobalBaseReg - Return a virtual register initialized with the
+  /// the global base register value. Output instructions required to
+  /// initialize the register in the function entry block, if necessary.
+  ///
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
new file mode 100644
index 0000000..e67bcbf
--- /dev/null
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -0,0 +1,707 @@
+//===- MipsInstrInfo.td - Mips Register defs --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Mips profiles and nodes
+//===----------------------------------------------------------------------===//
+
+def SDT_MipsRet          : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+def SDT_MipsJmpLink      : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
+def SDT_MipsSelectCC     : SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, 
+                                         SDTCisSameAs<2, 3>, SDTCisInt<1>]>;
+def SDT_MipsCMov         : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, 
+                                         SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>,
+                                         SDTCisInt<4>]>;
+def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_MipsCallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>;
+
+// Call
+def MipsJmpLink : SDNode<"MipsISD::JmpLink",SDT_MipsJmpLink, 
+                         [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+// Hi and Lo nodes are used to handle global addresses. Used on 
+// MipsISelLowering to lower stuff like GlobalAddress, ExternalSymbol 
+// static model. (nothing to do with Mips Registers Hi and Lo)
+def MipsHi    : SDNode<"MipsISD::Hi", SDTIntUnaryOp>;
+def MipsLo    : SDNode<"MipsISD::Lo", SDTIntUnaryOp>;
+def MipsGPRel : SDNode<"MipsISD::GPRel", SDTIntUnaryOp>;
+
+// Return
+def MipsRet : SDNode<"MipsISD::Ret", SDT_MipsRet, [SDNPHasChain, 
+                     SDNPOptInFlag]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_MipsCallSeqStart,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END", SDT_MipsCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Select Condition Code
+def MipsSelectCC  : SDNode<"MipsISD::SelectCC", SDT_MipsSelectCC>;
+
+// Conditional Move
+def MipsCMov      : SDNode<"MipsISD::CMov", SDT_MipsCMov>;
+
+//===----------------------------------------------------------------------===//
+// Mips Instruction Predicate Definitions.
+//===----------------------------------------------------------------------===//
+def HasSEInReg  : Predicate<"Subtarget.hasSEInReg()">;
+def HasBitCount : Predicate<"Subtarget.hasBitCount()">;
+def HasSwap     : Predicate<"Subtarget.hasSwap()">;
+def HasCondMov  : Predicate<"Subtarget.hasCondMov()">;
+
+//===----------------------------------------------------------------------===//
+// Mips Operand, Complex Patterns and Transformations Definitions.
+//===----------------------------------------------------------------------===//
+
+// Instruction operand types
+def brtarget    : Operand<OtherVT>;
+def calltarget  : Operand<i32>;
+def simm16      : Operand<i32>;
+def shamt       : Operand<i32>;
+
+// Unsigned Operand
+def uimm16      : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+// Address operand
+def mem : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops simm16, CPURegs);
+}
+
+// Transformation Function - get the lower 16 bits.
+def LO16 : SDNodeXForm<imm, [{
+  return getI32Imm((unsigned)N->getZExtValue() & 0xFFFF);
+}]>;
+
+// Transformation Function - get the higher 16 bits.
+def HI16 : SDNodeXForm<imm, [{
+  return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
+
+// Node immediate fits as 16-bit sign extended on target immediate.
+// e.g. addi, andi
+def immSExt16  : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i32)
+    return (int32_t)N->getZExtValue() == (short)N->getZExtValue();
+  else
+    return (int64_t)N->getZExtValue() == (short)N->getZExtValue();
+}]>;
+
+// Node immediate fits as 16-bit zero extended on target immediate.
+// The LO16 param means that only the lower 16 bits of the node
+// immediate are caught.
+// e.g. addiu, sltiu
+def immZExt16  : PatLeaf<(imm), [{
+  if (N->getValueType(0) == MVT::i32)
+    return (uint32_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+  else
+    return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// shamt field must fit in 5 bits.
+def immZExt5 : PatLeaf<(imm), [{
+  return N->getZExtValue() == ((N->getZExtValue()) & 0x1f) ;
+}]>;
+
+// Mips Address Mode! SDNode frameindex could possibly be a match
+// since load and store instructions from the stack use it.
+def addr : ComplexPattern<i32, 2, "SelectAddr", [frameindex], []>;
+
+//===----------------------------------------------------------------------===//
+// Instructions specific format
+//===----------------------------------------------------------------------===//
+
+// Arithmetic 3 register operands
+let isCommutable = 1 in
+class ArithR<bits<6> op, bits<6> func, string instr_asm, SDNode OpNode,
+             InstrItinClass itin>:
+  FR< op,
+      func,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], itin>;
+
+let isCommutable = 1 in
+class ArithOverflowR<bits<6> op, bits<6> func, string instr_asm>:
+  FR< op,
+      func,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [], IIAlu>;
+
+// Arithmetic 2 register operands
+class ArithI<bits<6> op, string instr_asm, SDNode OpNode,
+             Operand Od, PatLeaf imm_type> :
+  FI< op,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, Od:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, imm_type:$c))], IIAlu>;
+
+class ArithOverflowI<bits<6> op, string instr_asm, SDNode OpNode,
+             Operand Od, PatLeaf imm_type> :
+  FI< op,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, Od:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [], IIAlu>;
+
+// Arithmetic Multiply ADD/SUB
+let rd=0 in
+class MArithR<bits<6> func, string instr_asm> :
+  FR< 0x1c,
+      func,
+      (outs CPURegs:$rs),
+      (ins CPURegs:$rt),
+      !strconcat(instr_asm, "\t$rs, $rt"),
+      [], IIImul>;
+
+//  Logical
+class LogicR<bits<6> func, string instr_asm, SDNode OpNode>:
+  FR< 0x00,
+      func,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu>;
+
+class LogicI<bits<6> op, string instr_asm, SDNode OpNode>:
+  FI< op,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, uimm16:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt16:$c))], IIAlu>;
+
+class LogicNOR<bits<6> op, bits<6> func, string instr_asm>:
+  FR< op,
+      func,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (not (or CPURegs:$b, CPURegs:$c)))], IIAlu>;
+
+// Shifts
+let rt = 0 in
+class LogicR_shift_imm<bits<6> func, string instr_asm, SDNode OpNode>:
+  FR< 0x00,
+      func,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, shamt:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, immZExt5:$c))], IIAlu>;
+
+class LogicR_shift_reg<bits<6> func, string instr_asm, SDNode OpNode>:
+  FR< 0x00,
+      func,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (OpNode CPURegs:$b, CPURegs:$c))], IIAlu>;
+
+// Load Upper Immediate
+class LoadUpper<bits<6> op, string instr_asm>:
+  FI< op,
+      (outs CPURegs:$dst),
+      (ins uimm16:$imm),
+      !strconcat(instr_asm, "\t$dst, $imm"),
+      [], IIAlu>;
+
+// Memory Load/Store
+let canFoldAsLoad = 1, hasDelaySlot = 1 in
+class LoadM<bits<6> op, string instr_asm, PatFrag OpNode>:
+  FI< op,
+      (outs CPURegs:$dst),
+      (ins mem:$addr),
+      !strconcat(instr_asm, "\t$dst, $addr"),
+      [(set CPURegs:$dst, (OpNode addr:$addr))], IILoad>;
+
+class StoreM<bits<6> op, string instr_asm, PatFrag OpNode>:
+  FI< op,
+      (outs),
+      (ins CPURegs:$dst, mem:$addr),
+      !strconcat(instr_asm, "\t$dst, $addr"),
+      [(OpNode CPURegs:$dst, addr:$addr)], IIStore>;
+
+// Conditional Branch
+let isBranch = 1, isTerminator=1, hasDelaySlot = 1 in {
+class CBranch<bits<6> op, string instr_asm, PatFrag cond_op>:
+  FI< op,
+      (outs),
+      (ins CPURegs:$a, CPURegs:$b, brtarget:$offset),
+      !strconcat(instr_asm, "\t$a, $b, $offset"),
+      [(brcond (cond_op CPURegs:$a, CPURegs:$b), bb:$offset)],
+      IIBranch>;
+
+
+class CBranchZero<bits<6> op, string instr_asm, PatFrag cond_op>:
+  FI< op,
+      (outs),
+      (ins CPURegs:$src, brtarget:$offset),
+      !strconcat(instr_asm, "\t$src, $offset"),
+      [(brcond (cond_op CPURegs:$src, 0), bb:$offset)],
+      IIBranch>;
+}
+
+// SetCC
+class SetCC_R<bits<6> op, bits<6> func, string instr_asm,
+      PatFrag cond_op>:
+  FR< op,
+      func,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, CPURegs:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (cond_op CPURegs:$b, CPURegs:$c))],
+      IIAlu>;
+
+class SetCC_I<bits<6> op, string instr_asm, PatFrag cond_op,
+      Operand Od, PatLeaf imm_type>:
+  FI< op,
+      (outs CPURegs:$dst),
+      (ins CPURegs:$b, Od:$c),
+      !strconcat(instr_asm, "\t$dst, $b, $c"),
+      [(set CPURegs:$dst, (cond_op CPURegs:$b, imm_type:$c))],
+      IIAlu>;
+
+// Unconditional branch
+let isBranch=1, isTerminator=1, isBarrier=1, hasDelaySlot = 1 in
+class JumpFJ<bits<6> op, string instr_asm>:
+  FJ< op,
+      (outs),
+      (ins brtarget:$target),
+      !strconcat(instr_asm, "\t$target"),
+      [(br bb:$target)], IIBranch>;
+
+let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1 in
+class JumpFR<bits<6> op, bits<6> func, string instr_asm>:
+  FR< op,
+      func,
+      (outs),
+      (ins CPURegs:$target),
+      !strconcat(instr_asm, "\t$target"),
+      [(brind CPURegs:$target)], IIBranch>;
+
+// Jump and Link (Call)
+let isCall=1, hasDelaySlot=1,
+  // All calls clobber the non-callee saved registers...
+  Defs = [AT, V0, V1, A0, A1, A2, A3, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, 
+          K0, K1, F0, F1, F2, F3, F4, F5, F6, F7, F8, F9, F10, F11, F12, F13,
+          F14, F15, F16, F17, F18, F19], Uses = [GP] in {
+  class JumpLink<bits<6> op, string instr_asm>:
+    FJ< op,
+        (outs),
+        (ins calltarget:$target, variable_ops),
+        !strconcat(instr_asm, "\t$target"),
+        [(MipsJmpLink imm:$target)], IIBranch>;
+
+  let rd=31 in
+  class JumpLinkReg<bits<6> op, bits<6> func, string instr_asm>:
+    FR< op,
+        func,
+        (outs),
+        (ins CPURegs:$rs, variable_ops),
+        !strconcat(instr_asm, "\t$rs"),
+        [(MipsJmpLink CPURegs:$rs)], IIBranch>;
+
+  class BranchLink<string instr_asm>:
+    FI< 0x1,
+        (outs),
+        (ins CPURegs:$rs, brtarget:$target, variable_ops),
+        !strconcat(instr_asm, "\t$rs, $target"),
+        [], IIBranch>;
+}
+
+// Mul, Div
+class MulDiv<bits<6> func, string instr_asm, InstrItinClass itin>:
+  FR< 0x00,
+      func,
+      (outs),
+      (ins CPURegs:$a, CPURegs:$b),
+      !strconcat(instr_asm, "\t$a, $b"),
+      [], itin>;
+
+// Move from Hi/Lo
+class MoveFromLOHI<bits<6> func, string instr_asm>:
+  FR< 0x00,
+      func,
+      (outs CPURegs:$dst),
+      (ins),
+      !strconcat(instr_asm, "\t$dst"),
+      [], IIHiLo>;
+
+class MoveToLOHI<bits<6> func, string instr_asm>:
+  FR< 0x00,
+      func,
+      (outs),
+      (ins CPURegs:$src),
+      !strconcat(instr_asm, "\t$src"),
+      [], IIHiLo>;
+
+class EffectiveAddress<string instr_asm> :
+  FI<0x09,
+     (outs CPURegs:$dst),
+     (ins mem:$addr),
+     instr_asm,
+     [(set CPURegs:$dst, addr:$addr)], IIAlu>;
+
+// Count Leading Ones/Zeros in Word
+class CountLeading<bits<6> func, string instr_asm, SDNode CountOp>:
+  FR< 0x1c, func, (outs CPURegs:$dst), (ins CPURegs:$src),
+      !strconcat(instr_asm, "\t$dst, $src"), 
+      [(set CPURegs:$dst, (CountOp CPURegs:$src))], IIAlu>;
+
+// Sign Extend in Register (MIPS32r2 SEB/SEH).
+// These live in the SPECIAL3 major-opcode space (0x1f); 0x3f is not a
+// valid MIPS major opcode. The per-instruction sa (shamt) field set at the
+// def site selects SEB vs. SEH.
+class SignExtInReg<bits<6> func, string instr_asm, ValueType vt>:
+  FR< 0x1f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
+      !strconcat(instr_asm, "\t$dst, $src"),
+      [(set CPURegs:$dst, (sext_inreg CPURegs:$src, vt))], NoItinerary>;
+
+// Byte Swap
+class ByteSwap<bits<6> func, string instr_asm>:
+  FR< 0x1f, func, (outs CPURegs:$dst), (ins CPURegs:$src),
+      !strconcat(instr_asm, "\t$dst, $src"),
+      [(set CPURegs:$dst, (bswap CPURegs:$src))], NoItinerary>;
+
+// Conditional Move
+class CondMov<bits<6> func, string instr_asm, PatLeaf MovCode>:
+  FR< 0x00, func, (outs CPURegs:$dst), (ins CPURegs:$F, CPURegs:$T, 
+      CPURegs:$cond), !strconcat(instr_asm, "\t$dst, $T, $cond"), 
+      [(set CPURegs:$dst, (MipsCMov CPURegs:$F, CPURegs:$T, 
+                           CPURegs:$cond, MovCode))], NoItinerary>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// As stack alignment is always done with addiu, we need a 16-bit immediate
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins uimm16:$amt),
+                                  "!ADJCALLSTACKDOWN $amt",
+                                  [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : MipsPseudo<(outs), (ins uimm16:$amt1, uimm16:$amt2),
+                                  "!ADJCALLSTACKUP $amt1",
+                                  [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+// Some assembly macros must avoid pseudo instructions and automatic
+// assembler reordering; in those regions we must reorder ourselves.
+def MACRO     : MipsPseudo<(outs), (ins), ".set\tmacro",     []>;
+def REORDER   : MipsPseudo<(outs), (ins), ".set\treorder",   []>;
+def NOMACRO   : MipsPseudo<(outs), (ins), ".set\tnomacro",   []>;
+def NOREORDER : MipsPseudo<(outs), (ins), ".set\tnoreorder", []>;
+
+// When handling PIC code the assembler needs .cpload and .cprestore
+// directives. If the real instructions corresponding to these directives
+// are used, we have the same behavior, but get also a bunch of warnings
+// from the assembler.
+def CPLOAD : MipsPseudo<(outs), (ins CPURegs:$picreg), ".cpload\t$picreg", []>;
+def CPRESTORE : MipsPseudo<(outs), (ins uimm16:$loc), ".cprestore\t$loc\n", []>;
+
+// The supported Mips ISAs don't have any instruction close to the SELECT_CC 
+// operation. The solution is to create a Mips pseudo SELECT_CC instruction
+// (MipsSelectCC), use LowerSELECT_CC to generate this instruction and finally 
+// replace it with real supported nodes in EmitInstrWithCustomInserter
+let usesCustomInserter = 1 in {
+  class PseudoSelCC<RegisterClass RC, string asmstr>: 
+    MipsPseudo<(outs RC:$dst), (ins CPURegs:$CmpRes, RC:$T, RC:$F), asmstr, 
+    [(set RC:$dst, (MipsSelectCC CPURegs:$CmpRes, RC:$T, RC:$F))]>;
+}
+
+def Select_CC : PseudoSelCC<CPURegs, "# MipsSelect_CC_i32">;
+
+//===----------------------------------------------------------------------===//
+// Instruction definition
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MipsI Instructions
+//===----------------------------------------------------------------------===//
+
+/// Arithmetic Instructions (ALU Immediate)
+def ADDiu   : ArithI<0x09, "addiu", add, simm16, immSExt16>;
+def ADDi    : ArithOverflowI<0x08, "addi",  add, simm16, immSExt16>;
+def SLTi    : SetCC_I<0x0a, "slti", setlt, simm16, immSExt16>;
+def SLTiu   : SetCC_I<0x0b, "sltiu", setult, simm16, immSExt16>;
+def ANDi    : LogicI<0x0c, "andi", and>;
+def ORi     : LogicI<0x0d, "ori",  or>;
+def XORi    : LogicI<0x0e, "xori",  xor>;
+def LUi     : LoadUpper<0x0f, "lui">;
+
+/// Arithmetic Instructions (3-Operand, R-Type)
+def ADDu    : ArithR<0x00, 0x21, "addu", add, IIAlu>;
+def SUBu    : ArithR<0x00, 0x23, "subu", sub, IIAlu>;
+def ADD     : ArithOverflowR<0x00, 0x20, "add">;
+def SUB     : ArithOverflowR<0x00, 0x22, "sub">;
+def SLT     : SetCC_R<0x00, 0x2a, "slt", setlt>;
+def SLTu    : SetCC_R<0x00, 0x2b, "sltu", setult>;
+def AND     : LogicR<0x24, "and", and>;
+def OR      : LogicR<0x25, "or",  or>;
+def XOR     : LogicR<0x26, "xor", xor>;
+def NOR     : LogicNOR<0x00, 0x27, "nor">;
+
+/// Shift Instructions
+def SLL     : LogicR_shift_imm<0x00, "sll", shl>;
+def SRL     : LogicR_shift_imm<0x02, "srl", srl>;
+def SRA     : LogicR_shift_imm<0x03, "sra", sra>;
+def SLLV    : LogicR_shift_reg<0x04, "sllv", shl>;
+def SRLV    : LogicR_shift_reg<0x06, "srlv", srl>;
+def SRAV    : LogicR_shift_reg<0x07, "srav", sra>;
+
+/// Load and Store Instructions
+def LB      : LoadM<0x20, "lb",  sextloadi8>;
+def LBu     : LoadM<0x24, "lbu", zextloadi8>;
+def LH      : LoadM<0x21, "lh",  sextloadi16>;
+def LHu     : LoadM<0x25, "lhu", zextloadi16>;
+def LW      : LoadM<0x23, "lw",  load>;
+def SB      : StoreM<0x28, "sb", truncstorei8>;
+def SH      : StoreM<0x29, "sh", truncstorei16>;
+def SW      : StoreM<0x2b, "sw", store>;
+
+/// Jump and Branch Instructions.
+/// Major opcodes follow the MIPS32 encoding table: j=0x02, jal=0x03,
+/// beq=0x04, bne=0x05, blez=0x06, bgtz=0x07; the opcode-0x01 (REGIMM)
+/// branches are distinguished by the rt field (bltz rt=0, bgez rt=1).
+def J       : JumpFJ<0x02, "j">;
+def JR      : JumpFR<0x00, 0x08, "jr">;
+def JAL     : JumpLink<0x03, "jal">;
+def JALR    : JumpLinkReg<0x00, 0x09, "jalr">;
+def BEQ     : CBranch<0x04, "beq", seteq>;
+def BNE     : CBranch<0x05, "bne", setne>;
+
+let rt=1 in
+  def BGEZ  : CBranchZero<0x01, "bgez", setge>;
+
+let rt=0 in {
+  def BGTZ  : CBranchZero<0x07, "bgtz", setgt>;
+  def BLEZ  : CBranchZero<0x06, "blez", setle>; // was 0x07, which duplicated BGTZ
+  def BLTZ  : CBranchZero<0x01, "bltz", setlt>;
+}
+
+def BGEZAL  : BranchLink<"bgezal">;
+def BLTZAL  : BranchLink<"bltzal">;
+
+let isReturn=1, isTerminator=1, hasDelaySlot=1,
+    isBarrier=1, hasCtrlDep=1, rs=0, rt=0, shamt=0 in
+  def RET : FR <0x00, 0x02, (outs), (ins CPURegs:$target),
+                "jr\t$target", [(MipsRet CPURegs:$target)], IIBranch>;
+
+/// Multiply and Divide Instructions. 
+let Defs = [HI, LO] in {
+  def MULT    : MulDiv<0x18, "mult", IIImul>;
+  def MULTu   : MulDiv<0x19, "multu", IIImul>;
+  def DIV     : MulDiv<0x1a, "div", IIIdiv>;
+  def DIVu    : MulDiv<0x1b, "divu", IIIdiv>;
+}
+
+let Defs = [HI] in
+  def MTHI  : MoveToLOHI<0x11, "mthi">;
+let Defs = [LO] in
+  def MTLO  : MoveToLOHI<0x13, "mtlo">;
+
+let Uses = [HI] in
+  def MFHI  : MoveFromLOHI<0x10, "mfhi">;
+let Uses = [LO] in
+  def MFLO  : MoveFromLOHI<0x12, "mflo">;
+
+/// Sign Ext In Register Instructions (MIPS32r2).
+/// Both SEB and SEH use the BSHFL function field 0x20; they are
+/// distinguished only by the sa (shamt) field: 0x10 = SEB, 0x18 = SEH.
+let Predicates = [HasSEInReg] in {
+  let shamt = 0x10, rs = 0 in
+    def SEB : SignExtInReg<0x20, "seb", i8>; // was func 0x21; BSHFL is 0x20
+
+  let shamt = 0x18, rs = 0 in
+    def SEH : SignExtInReg<0x20, "seh", i16>;
+}
+
+/// Count Leading
+let Predicates = [HasBitCount] in {
+  let rt = 0 in
+    def CLZ : CountLeading<0b010110, "clz", ctlz>;
+}
+
+/// Byte Swap
+let Predicates = [HasSwap] in {
+  let shamt = 0x3, rs = 0 in
+    def WSBW : ByteSwap<0x20, "wsbw">;
+}
+
+/// Conditional Move
+def MIPS_CMOV_ZERO  : PatLeaf<(i32 0)>;
+def MIPS_CMOV_NZERO : PatLeaf<(i32 1)>;
+
+let Predicates = [HasCondMov], isTwoAddress = 1 in {
+  def MOVN : CondMov<0x0a, "movn", MIPS_CMOV_NZERO>;
+  def MOVZ : CondMov<0x0b, "movz", MIPS_CMOV_ZERO>;
+}
+
+/// No operation
+let addr=0 in
+  def NOP   : FJ<0, (outs), (ins), "nop", [], IIAlu>;
+
+// FrameIndexes are legalized when they are operands of load/store
+// instructions. The same does not happen for stack address copies, so an
+// add op with mem ComplexPattern is used so that the stack address copy
+// can be matched. It's similar to Sparc LEA_ADDRi
+def LEA_ADDiu : EffectiveAddress<"addiu\t$dst, ${addr:stackloc}">;
+
+// MADD*/MSUB* are not part of MipsI either.
+//def MADD    : MArithR<0x00, "madd">;
+//def MADDU   : MArithR<0x01, "maddu">;
+//def MSUB    : MArithR<0x04, "msub">;
+//def MSUBU   : MArithR<0x05, "msubu">;
+
+// MUL is a assembly macro in the current used ISAs. In recent ISA's
+// it is a real instruction.
+//def MUL   : ArithR<0x1c, 0x02, "mul", mul, IIImul>;
+
+//===----------------------------------------------------------------------===//
+//  Arbitrary patterns that map to one or more instructions
+//===----------------------------------------------------------------------===//
+
+// Small immediates
+def : Pat<(i32 immSExt16:$in),
+          (ADDiu ZERO, imm:$in)>;
+def : Pat<(i32 immZExt16:$in),
+          (ORi ZERO, imm:$in)>;
+
+// Arbitrary immediates
+def : Pat<(i32 imm:$imm),
+          (ORi (LUi (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Carry patterns
+def : Pat<(subc CPURegs:$lhs, CPURegs:$rhs),
+          (SUBu CPURegs:$lhs, CPURegs:$rhs)>;
+def : Pat<(addc CPURegs:$lhs, CPURegs:$rhs),
+          (ADDu CPURegs:$lhs, CPURegs:$rhs)>;
+def : Pat<(addc  CPURegs:$src, imm:$imm),
+          (ADDiu CPURegs:$src, imm:$imm)>;
+
+// Call
+def : Pat<(MipsJmpLink (i32 tglobaladdr:$dst)),
+          (JAL tglobaladdr:$dst)>;
+def : Pat<(MipsJmpLink (i32 texternalsym:$dst)),
+          (JAL texternalsym:$dst)>;
+def : Pat<(MipsJmpLink CPURegs:$dst),
+          (JALR CPURegs:$dst)>;
+
+// hi/lo relocs
+def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)),
+          (ADDiu CPURegs:$hi, tglobaladdr:$lo)>;
+
+def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)),
+          (ADDiu CPURegs:$hi, tjumptable:$lo)>;
+
+def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>;
+def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)),
+          (ADDiu CPURegs:$hi, tconstpool:$lo)>;
+
+// gp_rel relocs
+def : Pat<(add CPURegs:$gp, (MipsGPRel tglobaladdr:$in)), 
+          (ADDiu CPURegs:$gp, tglobaladdr:$in)>;
+def : Pat<(add CPURegs:$gp, (MipsGPRel tconstpool:$in)), 
+          (ADDiu CPURegs:$gp, tconstpool:$in)>;
+
+// Mips does not have "not", so we expand our way
+def : Pat<(not CPURegs:$in),
+          (NOR CPURegs:$in, ZERO)>;
+
+// extended load and stores
+def : Pat<(extloadi1  addr:$src), (LBu addr:$src)>;
+def : Pat<(extloadi8  addr:$src), (LBu addr:$src)>;
+def : Pat<(extloadi16 addr:$src), (LHu addr:$src)>;
+
+// peepholes
+def : Pat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
+
+// brcond patterns
+def : Pat<(brcond (setne CPURegs:$lhs, 0), bb:$dst),
+          (BNE CPURegs:$lhs, ZERO, bb:$dst)>;
+def : Pat<(brcond (seteq CPURegs:$lhs, 0), bb:$dst),
+          (BEQ CPURegs:$lhs, ZERO, bb:$dst)>;
+
+def : Pat<(brcond (setge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLT CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setuge CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLTu CPURegs:$lhs, CPURegs:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
+          (BEQ (SLTi CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setuge CPURegs:$lhs, immSExt16:$rhs), bb:$dst),
+          (BEQ (SLTiu CPURegs:$lhs, immSExt16:$rhs), ZERO, bb:$dst)>;
+
+def : Pat<(brcond (setle CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLT CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+def : Pat<(brcond (setule CPURegs:$lhs, CPURegs:$rhs), bb:$dst),
+          (BEQ (SLTu CPURegs:$rhs, CPURegs:$lhs), ZERO, bb:$dst)>;
+
+def : Pat<(brcond CPURegs:$cond, bb:$dst),
+          (BNE CPURegs:$cond, ZERO, bb:$dst)>;
+
+// select patterns
+def : Pat<(select (setge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+          (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$lhs, CPURegs:$rhs))>;
+def : Pat<(select (setuge CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+          (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$lhs, CPURegs:$rhs))>;
+def : Pat<(select (setge CPURegs:$lhs, immSExt16:$rhs), CPURegs:$T, CPURegs:$F),
+          (MOVZ CPURegs:$F, CPURegs:$T, (SLTi CPURegs:$lhs, immSExt16:$rhs))>;
+def : Pat<(select (setuge CPURegs:$lh, immSExt16:$rh), CPURegs:$T, CPURegs:$F),
+          (MOVZ CPURegs:$F, CPURegs:$T, (SLTiu CPURegs:$lh, immSExt16:$rh))>;
+
+def : Pat<(select (setle CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+          (MOVZ CPURegs:$F, CPURegs:$T, (SLT CPURegs:$rhs, CPURegs:$lhs))>;
+def : Pat<(select (setule CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+          (MOVZ CPURegs:$F, CPURegs:$T, (SLTu CPURegs:$rhs, CPURegs:$lhs))>;
+
+def : Pat<(select (seteq CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+          (MOVZ CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
+def : Pat<(select (setne CPURegs:$lhs, CPURegs:$rhs), CPURegs:$T, CPURegs:$F),
+          (MOVN CPURegs:$F, CPURegs:$T, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
+
+def : Pat<(select CPURegs:$cond, CPURegs:$T, CPURegs:$F), 
+          (MOVN CPURegs:$F, CPURegs:$T, CPURegs:$cond)>;
+
+// setcc patterns
+def : Pat<(seteq CPURegs:$lhs, CPURegs:$rhs),
+          (SLTu (XOR CPURegs:$lhs, CPURegs:$rhs), 1)>;
+def : Pat<(setne CPURegs:$lhs, CPURegs:$rhs),
+          (SLTu ZERO, (XOR CPURegs:$lhs, CPURegs:$rhs))>;
+
+def : Pat<(setle CPURegs:$lhs, CPURegs:$rhs),
+          (XORi (SLT CPURegs:$rhs, CPURegs:$lhs), 1)>;
+def : Pat<(setule CPURegs:$lhs, CPURegs:$rhs),
+          (XORi (SLTu CPURegs:$rhs, CPURegs:$lhs), 1)>;
+
+def : Pat<(setgt CPURegs:$lhs, CPURegs:$rhs),
+          (SLT CPURegs:$rhs, CPURegs:$lhs)>;
+def : Pat<(setugt CPURegs:$lhs, CPURegs:$rhs),
+          (SLTu CPURegs:$rhs, CPURegs:$lhs)>;
+
+def : Pat<(setge CPURegs:$lhs, CPURegs:$rhs),
+          (XORi (SLT CPURegs:$lhs, CPURegs:$rhs), 1)>;
+def : Pat<(setuge CPURegs:$lhs, CPURegs:$rhs),
+          (XORi (SLTu CPURegs:$lhs, CPURegs:$rhs), 1)>;
+
+def : Pat<(setge CPURegs:$lhs, immSExt16:$rhs),
+          (XORi (SLTi CPURegs:$lhs, immSExt16:$rhs), 1)>;
+def : Pat<(setuge CPURegs:$lhs, immSExt16:$rhs),
+          (XORi (SLTiu CPURegs:$lhs, immSExt16:$rhs), 1)>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Support
+//===----------------------------------------------------------------------===//
+
+include "MipsInstrFPU.td"
+
diff --git a/lib/Target/Mips/MipsMCAsmInfo.cpp b/lib/Target/Mips/MipsMCAsmInfo.cpp
new file mode 100644
index 0000000..89e3e11
--- /dev/null
+++ b/lib/Target/Mips/MipsMCAsmInfo.cpp
@@ -0,0 +1,27 @@
+//===-- MipsMCAsmInfo.cpp - Mips asm properties ---------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MipsMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsMCAsmInfo.h"
+using namespace llvm;
+
+// Configure GAS-style assembly syntax for MIPS: data directives, '#' line
+// comments and '$'-prefixed private (assembler-local) symbols.
+MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, const StringRef &TT) {
+  AlignmentIsInBytes          = false;          // .align argument is a power of two
+  Data16bitsDirective         = "\t.half\t";
+  Data32bitsDirective         = "\t.word\t";
+  Data64bitsDirective         = 0;              // no 64-bit data directive emitted
+  PrivateGlobalPrefix         = "$";
+  CommentString               = "#";
+  ZeroDirective               = "\t.space\t";
+  GPRel32Directive            = "\t.gpword\t";  // GP-relative 32-bit data
+  HasSetDirective             = false;          // don't use .set for symbol aliases
+}
diff --git a/lib/Target/Mips/MipsMCAsmInfo.h b/lib/Target/Mips/MipsMCAsmInfo.h
new file mode 100644
index 0000000..33a4b5e
--- /dev/null
+++ b/lib/Target/Mips/MipsMCAsmInfo.h
@@ -0,0 +1,30 @@
+//=====-- MipsMCAsmInfo.h - Mips asm properties ---------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MipsMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETASMINFO_H
+#define MIPSTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+  
+  /// MipsMCAsmInfo - Overrides MCAsmInfo defaults with the MIPS assembly
+  /// syntax properties (data directives, comment string, symbol prefixes).
+  /// All customization happens in the constructor; no virtuals are overridden.
+  class MipsMCAsmInfo : public MCAsmInfo {
+  public:
+    explicit MipsMCAsmInfo(const Target &T, const StringRef &TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
new file mode 100644
index 0000000..a300f49
--- /dev/null
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -0,0 +1,140 @@
+//===-- MipsMachineFunctionInfo.h - Private data used for Mips ----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPS_MACHINE_FUNCTION_INFO_H
+#define MIPS_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+namespace llvm {
+
+/// MipsFunctionInfo - This class is derived from MachineFunctionInfo and
+/// holds private Mips target-specific information for each MachineFunction.
+class MipsFunctionInfo : public MachineFunctionInfo {
+
+private:
+  /// Holds for each function where on the stack the Frame Pointer must be 
+  /// saved. This is used on Prologue and Epilogue to emit FP save/restore
+  int FPStackOffset;
+
+  /// Holds for each function where on the stack the Return Address must be 
+  /// saved. This is used on Prologue and Epilogue to emit RA save/restore
+  int RAStackOffset;
+
+  /// At each function entry, two special bitmask directives must be emitted
+  /// to help debugging, for CPU and FPU callee saved registers. Both need
+  /// the negative offset from the final stack size and its higher registers
+  /// location on the stack.
+  int CPUTopSavedRegOff;
+  int FPUTopSavedRegOff;
+
+  /// MipsFIHolder - Holds a FrameIndex and its Stack Pointer Offset
+  struct MipsFIHolder {
+
+    int FI;
+    int SPOffset;
+
+    MipsFIHolder(int FrameIndex, int StackPointerOffset)
+      : FI(FrameIndex), SPOffset(StackPointerOffset) {}
+  };
+
+  /// When PIC is used the GP must be saved on the stack on the function 
+  /// prologue and must be reloaded from this stack location after every 
+  /// call. A reference to its stack location and frame index must be kept 
+  /// to be used on emitPrologue and processFunctionBeforeFrameFinalized.
+  MipsFIHolder GPHolder;
+
+  /// On LowerFormalArguments the stack size is unknown, so the Stack
+  /// Pointer Offset calculation of "not in register arguments" must be 
+  /// postponed to emitPrologue. 
+  SmallVector<MipsFIHolder, 16> FnLoadArgs;
+  bool HasLoadArgs;
+
+  // When VarArgs, we must write registers back to caller stack, preserving 
+  // on register arguments. Since the stack size is unknown on 
+  // LowerFormalArguments, the Stack Pointer Offset calculation must be
+  // postponed to emitPrologue. 
+  SmallVector<MipsFIHolder, 4> FnStoreVarArgs;
+  bool HasStoreVarArgs;
+
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+public:
+  // GPHolder starts at (-1,-1): needGPSaveRestore() keys off SPOffset == -1.
+  MipsFunctionInfo(MachineFunction& MF) 
+  : FPStackOffset(0), RAStackOffset(0), CPUTopSavedRegOff(0), 
+    FPUTopSavedRegOff(0), GPHolder(-1,-1), HasLoadArgs(false), 
+    HasStoreVarArgs(false), SRetReturnReg(0), GlobalBaseReg(0)
+  {}
+
+  // Accessors for the FP/RA save slots and the top-of-save-area offsets;
+  // all are written by MipsRegisterInfo::adjustMipsStackFrame.
+  int getFPStackOffset() const { return FPStackOffset; }
+  void setFPStackOffset(int Off) { FPStackOffset = Off; }
+
+  int getRAStackOffset() const { return RAStackOffset; }
+  void setRAStackOffset(int Off) { RAStackOffset = Off; }
+
+  int getCPUTopSavedRegOff() const { return CPUTopSavedRegOff; }
+  void setCPUTopSavedRegOff(int Off) { CPUTopSavedRegOff = Off; }
+
+  int getFPUTopSavedRegOff() const { return FPUTopSavedRegOff; }
+  void setFPUTopSavedRegOff(int Off) { FPUTopSavedRegOff = Off; }
+
+  // Saved-GP slot (PIC only); SPOffset stays -1 while no save is needed.
+  int getGPStackOffset() const { return GPHolder.SPOffset; }
+  int getGPFI() const { return GPHolder.FI; }
+  void setGPStackOffset(int Off) { GPHolder.SPOffset = Off; }
+  void setGPFI(int FI) { GPHolder.FI = FI; }
+  bool needGPSaveRestore() const { return GPHolder.SPOffset != -1; }
+
+  bool hasLoadArgs() const { return HasLoadArgs; }
+  bool hasStoreVarArgs() const { return HasStoreVarArgs; } 
+
+  // Record a frame index whose final SP offset must be patched later
+  // (see adjustLoadArgsFI / adjustStoreVarArgsFI below).
+  void recordLoadArgsFI(int FI, int SPOffset) {
+    if (!HasLoadArgs) HasLoadArgs=true;
+    FnLoadArgs.push_back(MipsFIHolder(FI, SPOffset));
+  }
+  void recordStoreVarArgsFI(int FI, int SPOffset) {
+    if (!HasStoreVarArgs) HasStoreVarArgs=true;
+    FnStoreVarArgs.push_back(MipsFIHolder(FI, SPOffset));
+  }
+
+  // Replace the placeholder object offsets with the SP offsets recorded
+  // during argument lowering; called once the frame layout is being fixed.
+  void adjustLoadArgsFI(MachineFrameInfo *MFI) const {
+    if (!hasLoadArgs()) return;
+    for (unsigned i = 0, e = FnLoadArgs.size(); i != e; ++i) 
+      MFI->setObjectOffset( FnLoadArgs[i].FI, FnLoadArgs[i].SPOffset );
+  }
+  void adjustStoreVarArgsFI(MachineFrameInfo *MFI) const {
+    if (!hasStoreVarArgs()) return; 
+    for (unsigned i = 0, e = FnStoreVarArgs.size(); i != e; ++i) 
+      MFI->setObjectOffset( FnStoreVarArgs[i].FI, FnStoreVarArgs[i].SPOffset );
+  }
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+};
+
+} // end of namespace llvm
+
+#endif // MIPS_MACHINE_FUNCTION_INFO_H
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
new file mode 100644
index 0000000..f923bed
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -0,0 +1,532 @@
+//===- MipsRegisterInfo.cpp - MIPS Register Information -== -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the MIPS implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "mips-reg-info"
+
+#include "Mips.h"
+#include "MipsSubtarget.h"
+#include "MipsRegisterInfo.h"
+#include "MipsMachineFunction.h"
+#include "llvm/Constants.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+
+using namespace llvm;
+
+// Keep references to the subtarget (ABI / FPU-mode queries) and to the
+// instruction info (used to build prologue/epilogue instructions).
+MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST, 
+                                   const TargetInstrInfo &tii)
+  : MipsGenRegisterInfo(Mips::ADJCALLSTACKDOWN, Mips::ADJCALLSTACKUP),
+    Subtarget(ST), TII(tii) {}
+
+/// getRegisterNumbering - Given the enum value for some register, e.g.
+/// Mips::RA, return the number that it corresponds to (e.g. 31).
+/// GPRs and single-precision FPRs share the 0-31 encoding space; each
+/// aliased double-precision register Dn maps to the number of its even
+/// single-precision half, i.e. Dn -> 2*n (D5 -> 10, D15 -> 30).
+unsigned MipsRegisterInfo::
+getRegisterNumbering(unsigned RegEnum) 
+{
+  switch (RegEnum) {
+    case Mips::ZERO : case Mips::F0 : case Mips::D0 : return 0;
+    case Mips::AT   : case Mips::F1 : return 1;
+    case Mips::V0   : case Mips::F2 : case Mips::D1 : return 2;
+    case Mips::V1   : case Mips::F3 : return 3;
+    case Mips::A0   : case Mips::F4 : case Mips::D2 : return 4;
+    case Mips::A1   : case Mips::F5 : return 5;
+    case Mips::A2   : case Mips::F6 : case Mips::D3 : return 6;
+    case Mips::A3   : case Mips::F7 : return 7;
+    case Mips::T0   : case Mips::F8 : case Mips::D4 : return 8;
+    case Mips::T1   : case Mips::F9 : return 9;
+    case Mips::T2   : case Mips::F10: case Mips::D5: return 10;
+    case Mips::T3   : case Mips::F11: return 11;
+    case Mips::T4   : case Mips::F12: case Mips::D6: return 12;
+    case Mips::T5   : case Mips::F13: return 13;
+    case Mips::T6   : case Mips::F14: case Mips::D7: return 14;
+    case Mips::T7   : case Mips::F15: return 15;
+    case Mips::T8   : case Mips::F16: case Mips::D8: return 16;
+    case Mips::T9   : case Mips::F17: return 17;
+    case Mips::S0   : case Mips::F18: case Mips::D9: return 18;
+    case Mips::S1   : case Mips::F19: return 19;
+    case Mips::S2   : case Mips::F20: case Mips::D10: return 20;
+    case Mips::S3   : case Mips::F21: return 21;
+    case Mips::S4   : case Mips::F22: case Mips::D11: return 22;
+    case Mips::S5   : case Mips::F23: return 23;
+    case Mips::S6   : case Mips::F24: case Mips::D12: return 24;
+    case Mips::S7   : case Mips::F25: return 25;
+    case Mips::K0   : case Mips::F26: case Mips::D13: return 26;
+    case Mips::K1   : case Mips::F27: return 27;
+    case Mips::GP   : case Mips::F28: case Mips::D14: return 28;
+    case Mips::SP   : case Mips::F29: return 29;
+    case Mips::FP   : case Mips::F30: case Mips::D15: return 30;
+    case Mips::RA   : case Mips::F31: return 31;
+    default: llvm_unreachable("Unknown register number!");
+  }    
+  return 0; // Not reached
+}
+
+unsigned MipsRegisterInfo::getPICCallReg() { return Mips::T9; }
+
+//===----------------------------------------------------------------------===//
+// Callee Saved Registers methods 
+//===----------------------------------------------------------------------===//
+
+/// Mips Callee Saved Registers
+const unsigned* MipsRegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const 
+{
+  // Mips callee-save register range is $16-$23, $f20-$f30.
+  // In single-float mode every FP register is an independent f32, so all of
+  // F20-F30 are callee-saved.
+  static const unsigned SingleFloatOnlyCalleeSavedRegs[] = {
+    Mips::S0, Mips::S1, Mips::S2, Mips::S3, 
+    Mips::S4, Mips::S5, Mips::S6, Mips::S7,
+    Mips::F20, Mips::F21, Mips::F22, Mips::F23, Mips::F24, Mips::F25, 
+    Mips::F26, Mips::F27, Mips::F28, Mips::F29, Mips::F30, 0
+  };
+
+  // In 32-bit FPU mode the odd FP registers are reserved (see
+  // getReservedRegs), so only the even ones are listed here. Keep this
+  // array in sync with BitMode32CalleeSavedRC below.
+  static const unsigned BitMode32CalleeSavedRegs[] = {
+    Mips::S0, Mips::S1, Mips::S2, Mips::S3, 
+    Mips::S4, Mips::S5, Mips::S6, Mips::S7,
+    Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30, 0
+  };
+
+  if (Subtarget.isSingleFloat())
+    return SingleFloatOnlyCalleeSavedRegs;
+  else
+    return BitMode32CalleeSavedRegs;
+}
+
+/// Mips Callee Saved Register Classes
+/// Each entry gives the register class of the register at the same index in
+/// the corresponding getCalleeSavedRegs array: 8 CPURegs ($16-$23) followed
+/// by the FGR32 callee-saved set. The two lists must stay in sync.
+const TargetRegisterClass* const* 
+MipsRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const 
+{
+  static const TargetRegisterClass * const SingleFloatOnlyCalleeSavedRC[] = {
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, 
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass, 
+    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass, 
+    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass, 
+    &Mips::FGR32RegClass, &Mips::FGR32RegClass, 0
+  };
+
+  static const TargetRegisterClass * const BitMode32CalleeSavedRC[] = {
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, 
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+    &Mips::CPURegsRegClass, &Mips::CPURegsRegClass,
+    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass, 
+    &Mips::FGR32RegClass, &Mips::FGR32RegClass, &Mips::FGR32RegClass, 0
+  };
+
+  if (Subtarget.isSingleFloat())
+    return SingleFloatOnlyCalleeSavedRC;
+  else
+    return BitMode32CalleeSavedRC;
+}
+
+// Mark registers the allocator must never hand out: the hard-wired zero
+// register, the assembler temporary, the kernel-reserved K0/K1, and the
+// GP/SP/FP/RA registers managed by prologue/epilogue code.
+BitVector MipsRegisterInfo::
+getReservedRegs(const MachineFunction &MF) const
+{
+  BitVector Reserved(getNumRegs());
+  Reserved.set(Mips::ZERO);
+  Reserved.set(Mips::AT);
+  Reserved.set(Mips::K0);
+  Reserved.set(Mips::K1);
+  Reserved.set(Mips::GP);
+  Reserved.set(Mips::SP);
+  Reserved.set(Mips::FP);
+  Reserved.set(Mips::RA);
+
+  // When not in single-float mode, reserve the odd single-precision
+  // registers F1..F29: they form the upper halves of the aliased
+  // double-precision pairs. (Original comment cited "SRV4" — presumably
+  // the SVR4 ABI; TODO confirm.)
+  if (!Subtarget.isSingleFloat())
+    for (unsigned FReg=(Mips::F0)+1; FReg < Mips::F30; FReg+=2)
+      Reserved.set(FReg);
+  
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Stack Frame Processing methods
+// +----------------------------+
+//
+// The stack is allocated by decrementing the stack pointer on
+// the first instruction of a function prologue. Once decremented,
+// all stack references are done through a positive offset
+// from the stack/frame pointer, so the stack can be considered
+// to grow upward! Otherwise terrible hacks would have to be made
+// to get this stack ABI compliant :)
+//
+//  The stack frame required by the ABI (after call):
+//  Offset
+//
+//  0                 ----------
+//  4                 Args to pass 
+//  .                 saved $GP  (used in PIC)
+//  .                 Alloca allocations
+//  .                 Local Area
+//  .                 CPU "Callee Saved" Registers
+//  .                 saved FP
+//  .                 saved RA
+//  .                 FPU "Callee Saved" Registers
+//  StackSize         -----------
+//
+// Offset - offset from sp after stack allocation on function prologue
+//
+// The sp is the stack pointer subtracted/added from the stack size
+// at the Prologue/Epilogue
+//
+// References to the previous stack (to obtain arguments) are done
+// with offsets that exceed the stack size: (stacksize+(4*(num_arg-1)))
+//
+// Examples:
+// - reference to the actual stack frame
+//   for any local area var there is smt like : FI >= 0, StackOffset: 4
+//     sw REGX, 4(SP)
+//
+// - reference to previous stack frame
+//   suppose there's a load of the 5th argument : FI < 0, StackOffset: 16.
+//   The emitted instruction will be something like:
+//     lw REGX, 16+StackSize(SP)
+//
+// Since the total stack size is unknown on LowerFormalArguments, all
+// stack references (ObjectOffset) created to reference the function 
+// arguments, are negative numbers. This way, on eliminateFrameIndex it's
+// possible to detect those references and the offsets are adjusted to
+// their real location.
+//
+//===----------------------------------------------------------------------===//
+
+// Re-lay-out the frame objects into the MIPS ABI order documented in the
+// block comment above: outgoing args / saved GP, locals, CPU callee-saves,
+// FP and RA slots, then FPU callee-saves, each area rounded up to the
+// default stack alignment. Also records the FP/RA offsets and the
+// CPU/FPU top-of-save-area offsets in MipsFunctionInfo for the prologue,
+// epilogue and mask-directive emission.
+void MipsRegisterInfo::adjustMipsStackFrame(MachineFunction &MF) const
+{
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  unsigned StackAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned RegSize = Subtarget.isGP32bit() ? 4 : 8;
+  bool HasGP = MipsFI->needGPSaveRestore();
+
+  // Min and Max CSI FrameIndex.
+  int MinCSFI = -1, MaxCSFI = -1; 
+
+  // See the description at MipsMachineFunction.h
+  int TopCPUSavedRegOff = -1, TopFPUSavedRegOff = -1;
+
+  // Replace the dummy '0' SPOffset by the negative offsets, as explained on 
+  // LowerFormalArguments. Leaving '0' for while is necessary to avoid
+  // the approach done by calculateFrameObjectOffsets to the stack frame.
+  MipsFI->adjustLoadArgsFI(MFI);
+  MipsFI->adjustStoreVarArgsFI(MFI); 
+
+  // It happens that the default stack frame allocation order does not directly 
+  // map to the convention used for mips. So we must fix it. We move the callee 
+  // save register slots after the local variables area, as described in the
+  // stack frame above.
+  unsigned CalleeSavedAreaSize = 0;
+  if (!CSI.empty()) {
+    MinCSFI = CSI[0].getFrameIdx();
+    MaxCSFI = CSI[CSI.size()-1].getFrameIdx();
+  }
+  // NOTE(review): this sums getObjectAlignment, not getObjectSize. It works
+  // because the callee-saved slots are 4 bytes with 4-byte alignment, but
+  // verify before supporting 8-byte saves.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i)
+    CalleeSavedAreaSize += MFI->getObjectAlignment(CSI[i].getFrameIdx());
+
+  // Locals start after the saved GP (PIC) or the O32 16-byte argument area.
+  unsigned StackOffset = HasGP ? (MipsFI->getGPStackOffset()+RegSize)
+                : (Subtarget.isABI_O32() ? 16 : 0);
+
+  // Adjust local variables. They should come on the stack right
+  // after the arguments. Track the object ending highest on the stack
+  // (LastOffsetFI) so the next area can start after it.
+  int LastOffsetFI = -1;
+  for (int i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+    if (i >= MinCSFI && i <= MaxCSFI)
+      continue;
+    if (MFI->isDeadObjectIndex(i))
+      continue;
+    unsigned Offset = 
+      StackOffset + MFI->getObjectOffset(i) - CalleeSavedAreaSize;
+    if (LastOffsetFI == -1)
+      LastOffsetFI = i;
+    if (Offset > MFI->getObjectOffset(LastOffsetFI))
+      LastOffsetFI = i;
+    MFI->setObjectOffset(i, Offset);
+  }
+
+  // Adjust CPU Callee Saved Registers Area. Registers RA and FP must
+  // be saved in this CPU Area. This whole area must be aligned to the 
+  // default Stack Alignment requirements.
+  if (LastOffsetFI >= 0)
+    StackOffset = MFI->getObjectOffset(LastOffsetFI)+ 
+                  MFI->getObjectSize(LastOffsetFI);
+  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
+
+  // CSI lists CPU registers first; stop at the first non-CPU entry.
+  for (unsigned i = 0, e = CSI.size(); i != e ; ++i) {
+    if (CSI[i].getRegClass() != Mips::CPURegsRegisterClass)
+      break;
+    MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
+    TopCPUSavedRegOff = StackOffset;
+    StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
+  }
+
+  // Stack locations for FP and RA. If only one of them is used, 
+  // the space must be allocated for both, otherwise no space at all.
+  if (hasFP(MF) || MFI->hasCalls()) {
+    // FP stack location
+    MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true), 
+                         StackOffset);
+    MipsFI->setFPStackOffset(StackOffset);
+    TopCPUSavedRegOff = StackOffset;
+    StackOffset += RegSize;
+
+    // RA stack location
+    MFI->setObjectOffset(MFI->CreateStackObject(RegSize, RegSize, true),
+                         StackOffset);
+    MipsFI->setRAStackOffset(StackOffset);
+    StackOffset += RegSize;
+
+    if (MFI->hasCalls())
+      TopCPUSavedRegOff += RegSize;
+  }
+
+  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
+  
+  // Adjust FPU Callee Saved Registers Area. This Area must be 
+  // aligned to the default Stack Alignment requirements.
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    if (CSI[i].getRegClass() == Mips::CPURegsRegisterClass)
+      continue;
+    MFI->setObjectOffset(CSI[i].getFrameIdx(), StackOffset);
+    TopFPUSavedRegOff = StackOffset;
+    StackOffset += MFI->getObjectAlignment(CSI[i].getFrameIdx());
+  }
+  StackOffset = ((StackOffset+StackAlign-1)/StackAlign*StackAlign);
+
+  // Update frame info
+  MFI->setStackSize(StackOffset);
+
+  // Recalculate the final tops offset. The final values must be '0'
+  // if there isn't a callee saved register for CPU or FPU, otherwise
+  // a negative offset is needed.
+  if (TopCPUSavedRegOff >= 0)
+    MipsFI->setCPUTopSavedRegOff(TopCPUSavedRegOff-StackOffset);
+
+  if (TopFPUSavedRegOff >= 0)
+    MipsFI->setFPUTopSavedRegOff(TopFPUSavedRegOff-StackOffset);
+}
+
+// hasFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled (-disable-fp-elim).
+bool MipsRegisterInfo::
+hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+// This function eliminates the ADJCALLSTACKDOWN and
+// ADJCALLSTACKUP pseudo instructions.
+void MipsRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions: the
+  // outgoing-argument area is pre-allocated as part of the fixed frame.
+  MBB.erase(I);
+}
+
+// FrameIndex represents objects inside an abstract stack.
+// We must replace FrameIndex with a stack/frame pointer
+// direct reference. Assumes the operand immediately before the
+// FrameIndex operand is the immediate load/store offset.
+unsigned MipsRegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                    int *Value, RegScavenger *RS) const
+{
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+
+  // Locate the FrameIndex operand.
+  unsigned i = 0;
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && 
+           "Instr doesn't have FrameIndex operand!");
+  }
+
+  DEBUG(errs() << "\nFunction : " << MF.getFunction()->getName() << "\n";
+        errs() << "<--------->\n" << MI);
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+  int stackSize  = MF.getFrameInfo()->getStackSize();
+  int spOffset   = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+
+  DEBUG(errs() << "FrameIndex : " << FrameIndex << "\n"
+               << "spOffset   : " << spOffset << "\n"
+               << "stackSize  : " << stackSize << "\n");
+
+  // as explained on LowerFormalArguments, detect negative offsets
+  // and adjust SPOffsets considering the final stack size.
+  // NOTE(review): the +4 rebias mirrors how the negative argument offsets
+  // were created during argument lowering — confirm against
+  // LowerFormalArguments when changing either side.
+  int Offset = ((spOffset < 0) ? (stackSize + (-(spOffset+4))) : (spOffset));
+  Offset    += MI.getOperand(i-1).getImm();
+
+  DEBUG(errs() << "Offset     : " << Offset << "\n" << "<--------->\n");
+
+  // Rewrite in place: immediate = resolved offset, FI operand = FP or SP.
+  MI.getOperand(i-1).ChangeToImmediate(Offset);
+  MI.getOperand(i).ChangeToRegister(getFrameRegister(MF), false);
+  return 0; // no register was scavenged
+}
+
+// Emit the function prologue: lay out the frame (adjustMipsStackFrame),
+// allocate the stack, save RA/FP when needed, and set up GP for PIC.
+void MipsRegisterInfo::
+emitPrologue(MachineFunction &MF) const 
+{
+  MachineBasicBlock &MBB   = MF.front();
+  MachineFrameInfo *MFI    = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+  bool isPIC = (MF.getTarget().getRelocationModel() == Reloc::PIC_);
+
+  // Get the right frame order for Mips.
+  adjustMipsStackFrame(MF);
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  unsigned StackSize = MFI->getStackSize();
+
+  // No need to allocate space on the stack.
+  if (StackSize == 0 && !MFI->hasCalls()) return;
+
+  int FPOffset = MipsFI->getFPStackOffset();
+  int RAOffset = MipsFI->getRAStackOffset();
+
+  // Keep the assembler from reordering/expanding the prologue sequence.
+  BuildMI(MBB, MBBI, dl, TII.get(Mips::NOREORDER));
+  
+  // TODO: check need from GP here.
+  if (isPIC && Subtarget.isABI_O32()) 
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::CPLOAD)).addReg(getPICCallReg());
+  BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO));
+
+  // Adjust stack : addi sp, sp, (-imm)
+  BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
+      .addReg(Mips::SP).addImm(-StackSize);
+
+  // Save the return address only if the function isn't a leaf one.
+  // sw  $ra, stack_loc($sp)
+  if (MFI->hasCalls()) { 
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
+        .addReg(Mips::RA).addImm(RAOffset).addReg(Mips::SP);
+  }
+
+  // if framepointer enabled, save it and set it
+  // to point to the stack pointer
+  if (hasFP(MF)) {
+    // sw  $fp,stack_loc($sp)
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::SW))
+      .addReg(Mips::FP).addImm(FPOffset).addReg(Mips::SP);
+
+    // move $fp, $sp
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP)
+      .addReg(Mips::SP).addReg(Mips::ZERO);
+  }
+
+  // Restore GP from the saved stack location
+  if (MipsFI->needGPSaveRestore())
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::CPRESTORE))
+      .addImm(MipsFI->getGPStackOffset());
+}
+
+// Emit the function epilogue (mirror of emitPrologue): restore FP and RA
+// from their slots, then deallocate the stack.
+void MipsRegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const 
+{
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  MachineFrameInfo *MFI            = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI         = MF.getInfo<MipsFunctionInfo>();
+  DebugLoc dl = MBBI->getDebugLoc();
+
+  // Get the number of bytes from FrameInfo
+  int NumBytes = (int) MFI->getStackSize();
+
+  // Get the stack offsets where RA and FP are saved.
+  int FPOffset = MipsFI->getFPStackOffset();
+  int RAOffset = MipsFI->getRAStackOffset();
+
+  // if framepointer enabled, restore it and restore the
+  // stack pointer
+  if (hasFP(MF)) {
+    // move $sp, $fp
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::SP)
+      .addReg(Mips::FP).addReg(Mips::ZERO);
+
+    // lw  $fp,stack_loc($sp)
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::FP)
+      .addImm(FPOffset).addReg(Mips::SP);
+  }
+
+  // Restore the return address only if the function isn't a leaf one.
+  // lw  $ra, stack_loc($sp)
+  if (MFI->hasCalls()) { 
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::LW), Mips::RA)
+      .addImm(RAOffset).addReg(Mips::SP);
+  }
+
+  // adjust stack  : insert addi sp, sp, (imm)
+  if (NumBytes) {
+    BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP)
+      .addReg(Mips::SP).addImm(NumBytes);
+  }
+}
+
+
+// Runs after register allocation but before final frame layout: pin the
+// saved-GP frame object (PIC only) to the offset chosen during lowering.
+void MipsRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+  // Set the stack offset where GP must be saved/loaded from.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
+  if (MipsFI->needGPSaveRestore())
+    MFI->setObjectOffset(MipsFI->getGPFI(), MipsFI->getGPStackOffset());
+}
+
+// Return-address register used for debug information.
+unsigned MipsRegisterInfo::
+getRARegister() const {
+  return Mips::RA;
+}
+
+// Base register for frame references: $fp when a frame pointer is required,
+// otherwise $sp.
+unsigned MipsRegisterInfo::
+getFrameRegister(const MachineFunction &MF) const {
+  return hasFP(MF) ? Mips::FP : Mips::SP;
+}
+
+// Exception-handling registers are not implemented yet (TODO).
+unsigned MipsRegisterInfo::
+getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+  return 0;
+}
+
+unsigned MipsRegisterInfo::
+getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+  return 0;
+}
+
+// DWARF numbering not wired up yet; the .td DwarfRegNum entries carry the
+// intended mapping (TODO).
+int MipsRegisterInfo::
+getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  llvm_unreachable("What is the dwarf register number");
+  return -1;
+}
+
+#include "MipsGenRegisterInfo.inc"
+
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
new file mode 100644
index 0000000..6a3ec00
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -0,0 +1,88 @@
+//===- MipsRegisterInfo.h - Mips Register Information Impl ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Mips implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSREGISTERINFO_H
+#define MIPSREGISTERINFO_H
+
+#include "Mips.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "MipsGenRegisterInfo.h.inc"
+
+namespace llvm {
+class MipsSubtarget;
+class TargetInstrInfo;
+class Type;
+
+namespace Mips {
+  /// SubregIndex - The index of various sized subregister classes. Note that 
+  /// these indices must be kept in sync with the class indices in the 
+  /// MipsRegisterInfo.td file.
+  enum SubregIndex {
+    SUBREG_FPEVEN = 1, SUBREG_FPODD = 2
+  };
+}
+
+/// MipsRegisterInfo - MIPS implementation of TargetRegisterInfo: register
+/// numbering, callee-saved/reserved sets, frame layout and prologue/epilogue
+/// emission.
+struct MipsRegisterInfo : public MipsGenRegisterInfo {
+  const MipsSubtarget &Subtarget;   // ABI / FPU-mode queries
+  const TargetInstrInfo &TII;       // used to build prologue/epilogue instrs
+  
+  MipsRegisterInfo(const MipsSubtarget &Subtarget, const TargetInstrInfo &tii);
+
+  /// getRegisterNumbering - Given the enum value for some register, e.g.
+  /// Mips::RA, return the number that it corresponds to (e.g. 31).
+  static unsigned getRegisterNumbering(unsigned RegEnum);
+
+  /// Get PIC indirect call register
+  static unsigned getPICCallReg();
+
+  /// Adjust the Mips stack frame.
+  void adjustMipsStackFrame(MachineFunction &MF) const;
+
+  /// Code Generation virtual methods...
+  const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+  const TargetRegisterClass* const*
+  getCalleeSavedRegClasses(const MachineFunction* MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  /// Stack Frame Processing Methods
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  
+  /// Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  /// Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
new file mode 100644
index 0000000..00e7723
--- /dev/null
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -0,0 +1,276 @@
+//===- MipsRegisterInfo.td - Mips Register defs -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the MIPS register file
+//===----------------------------------------------------------------------===//
+
+// We have banks of 32 registers each.
+class MipsReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "Mips";
+}
+
+class MipsRegWithSubRegs<string n, list<Register> subregs> 
+  : RegisterWithSubRegs<n, subregs> {
+  field bits<5> Num;
+  let Namespace = "Mips";
+}
+
+// Mips CPU Registers
+class MipsGPRReg<bits<5> num, string n> : MipsReg<n> {
+  let Num = num;
+}
+
+// Mips 32-bit FPU Registers
+class FPR<bits<5> num, string n> : MipsReg<n> {
+  let Num = num;
+}
+
+// Mips 64-bit (aliased) FPU Registers
+class AFPR<bits<5> num, string n, list<Register> subregs> 
+  : MipsRegWithSubRegs<n, subregs> {
+  let Num = num;
+}
+
+//===----------------------------------------------------------------------===//
+//  Registers
+//===----------------------------------------------------------------------===//
+
+let Namespace = "Mips" in {
+
+  // General Purpose Registers
+  def ZERO : MipsGPRReg< 0, "ZERO">, DwarfRegNum<[0]>;
+  def AT   : MipsGPRReg< 1, "AT">,   DwarfRegNum<[1]>;
+  def V0   : MipsGPRReg< 2, "2">,    DwarfRegNum<[2]>;
+  def V1   : MipsGPRReg< 3, "3">,    DwarfRegNum<[3]>;
+  def A0   : MipsGPRReg< 4, "4">,    DwarfRegNum<[4]>; // was [5]: duplicated A1's DWARF number; $a0 is DWARF reg 4
+  def A1   : MipsGPRReg< 5, "5">,    DwarfRegNum<[5]>;
+  def A2   : MipsGPRReg< 6, "6">,    DwarfRegNum<[6]>;
+  def A3   : MipsGPRReg< 7, "7">,    DwarfRegNum<[7]>;
+  def T0   : MipsGPRReg< 8, "8">,    DwarfRegNum<[8]>;
+  def T1   : MipsGPRReg< 9, "9">,    DwarfRegNum<[9]>;
+  def T2   : MipsGPRReg< 10, "10">,  DwarfRegNum<[10]>;
+  def T3   : MipsGPRReg< 11, "11">,  DwarfRegNum<[11]>;
+  def T4   : MipsGPRReg< 12, "12">,  DwarfRegNum<[12]>;
+  def T5   : MipsGPRReg< 13, "13">,  DwarfRegNum<[13]>;
+  def T6   : MipsGPRReg< 14, "14">,  DwarfRegNum<[14]>;
+  def T7   : MipsGPRReg< 15, "15">,  DwarfRegNum<[15]>;
+  def S0   : MipsGPRReg< 16, "16">,  DwarfRegNum<[16]>;
+  def S1   : MipsGPRReg< 17, "17">,  DwarfRegNum<[17]>;
+  def S2   : MipsGPRReg< 18, "18">,  DwarfRegNum<[18]>;
+  def S3   : MipsGPRReg< 19, "19">,  DwarfRegNum<[19]>;
+  def S4   : MipsGPRReg< 20, "20">,  DwarfRegNum<[20]>;
+  def S5   : MipsGPRReg< 21, "21">,  DwarfRegNum<[21]>;
+  def S6   : MipsGPRReg< 22, "22">,  DwarfRegNum<[22]>;
+  def S7   : MipsGPRReg< 23, "23">,  DwarfRegNum<[23]>;
+  def T8   : MipsGPRReg< 24, "24">,  DwarfRegNum<[24]>;
+  def T9   : MipsGPRReg< 25, "25">,  DwarfRegNum<[25]>;
+  def K0   : MipsGPRReg< 26, "26">,  DwarfRegNum<[26]>;
+  def K1   : MipsGPRReg< 27, "27">,  DwarfRegNum<[27]>;
+  def GP   : MipsGPRReg< 28, "GP">,  DwarfRegNum<[28]>;
+  def SP   : MipsGPRReg< 29, "SP">,  DwarfRegNum<[29]>;
+  def FP   : MipsGPRReg< 30, "FP">,  DwarfRegNum<[30]>;
+  def RA   : MipsGPRReg< 31, "RA">,  DwarfRegNum<[31]>;
+  
+  /// Mips single precision FPU registers
+  def F0  : FPR< 0,  "F0">, DwarfRegNum<[32]>;
+  def F1  : FPR< 1,  "F1">, DwarfRegNum<[33]>;
+  def F2  : FPR< 2,  "F2">, DwarfRegNum<[34]>;
+  def F3  : FPR< 3,  "F3">, DwarfRegNum<[35]>;
+  def F4  : FPR< 4,  "F4">, DwarfRegNum<[36]>;
+  def F5  : FPR< 5,  "F5">, DwarfRegNum<[37]>;
+  def F6  : FPR< 6,  "F6">, DwarfRegNum<[38]>;
+  def F7  : FPR< 7,  "F7">, DwarfRegNum<[39]>;
+  def F8  : FPR< 8,  "F8">, DwarfRegNum<[40]>;
+  def F9  : FPR< 9,  "F9">, DwarfRegNum<[41]>;
+  def F10 : FPR<10, "F10">, DwarfRegNum<[42]>;
+  def F11 : FPR<11, "F11">, DwarfRegNum<[43]>;
+  def F12 : FPR<12, "F12">, DwarfRegNum<[44]>;
+  def F13 : FPR<13, "F13">, DwarfRegNum<[45]>;
+  def F14 : FPR<14, "F14">, DwarfRegNum<[46]>;
+  def F15 : FPR<15, "F15">, DwarfRegNum<[47]>;
+  def F16 : FPR<16, "F16">, DwarfRegNum<[48]>;
+  def F17 : FPR<17, "F17">, DwarfRegNum<[49]>;
+  def F18 : FPR<18, "F18">, DwarfRegNum<[50]>;
+  def F19 : FPR<19, "F19">, DwarfRegNum<[51]>;
+  def F20 : FPR<20, "F20">, DwarfRegNum<[52]>;
+  def F21 : FPR<21, "F21">, DwarfRegNum<[53]>;
+  def F22 : FPR<22, "F22">, DwarfRegNum<[54]>;
+  def F23 : FPR<23, "F23">, DwarfRegNum<[55]>;
+  def F24 : FPR<24, "F24">, DwarfRegNum<[56]>;
+  def F25 : FPR<25, "F25">, DwarfRegNum<[57]>;
+  def F26 : FPR<26, "F26">, DwarfRegNum<[58]>;
+  def F27 : FPR<27, "F27">, DwarfRegNum<[59]>;
+  def F28 : FPR<28, "F28">, DwarfRegNum<[60]>;
+  def F29 : FPR<29, "F29">, DwarfRegNum<[61]>;
+  def F30 : FPR<30, "F30">, DwarfRegNum<[62]>;
+  def F31 : FPR<31, "F31">, DwarfRegNum<[63]>;
+  
+  /// Mips double precision FPU registers (aliased
+  /// with the single precision registers to hold 64-bit values)
+  def D0  : AFPR< 0,  "F0", [F0,   F1]>, DwarfRegNum<[32]>;
+  def D1  : AFPR< 2,  "F2", [F2,   F3]>, DwarfRegNum<[34]>;
+  def D2  : AFPR< 4,  "F4", [F4,   F5]>, DwarfRegNum<[36]>;
+  def D3  : AFPR< 6,  "F6", [F6,   F7]>, DwarfRegNum<[38]>;
+  def D4  : AFPR< 8,  "F8", [F8,   F9]>, DwarfRegNum<[40]>;
+  def D5  : AFPR<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
+  def D6  : AFPR<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
+  def D7  : AFPR<14, "F14", [F14, F15]>, DwarfRegNum<[46]>;
+  def D8  : AFPR<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
+  def D9  : AFPR<18, "F18", [F18, F19]>, DwarfRegNum<[50]>;
+  def D10 : AFPR<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
+  def D11 : AFPR<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
+  def D12 : AFPR<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
+  def D13 : AFPR<26, "F26", [F26, F27]>, DwarfRegNum<[58]>;
+  def D14 : AFPR<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
+  def D15 : AFPR<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+
+  // Hi/Lo registers
+  def HI  : Register<"hi">, DwarfRegNum<[64]>;
+  def LO  : Register<"lo">, DwarfRegNum<[65]>;
+
+  // Status flags register
+  def FCR31 : Register<"31">;
+}
+
+//===----------------------------------------------------------------------===//
+// Subregister Set Definitions
+//===----------------------------------------------------------------------===//
+
+def mips_subreg_fpeven : PatLeaf<(i32 1)>;
+def mips_subreg_fpodd  : PatLeaf<(i32 2)>;
+
+def : SubRegSet<1, [D0, D1, D2, D3, D4, D5, D6, D7, 
+                    D8, D9, D10, D11, D12, D13, D14, D15],
+                   [F0, F2, F4, F6, F8, F10, F12, F14,
+                    F16, F18, F20, F22, F24, F26, F28, F30]>;
+
+def : SubRegSet<2, [D0, D1, D2, D3, D4, D5, D6, D7, 
+                    D8, D9, D10, D11, D12, D13, D14, D15],
+                   [F1, F3, F5, F7, F9, F11, F13, F15,
+                    F17, F19, F21, F23, F25, F27, F29, F31]>;
+
+//===----------------------------------------------------------------------===//
+// Register Classes
+//===----------------------------------------------------------------------===//
+
+def CPURegs : RegisterClass<"Mips", [i32], 32, 
+  // Return Values and Arguments
+  [V0, V1, A0, A1, A2, A3,
+  // Not preserved across procedure calls
+  T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, 
+  // Callee save
+  S0, S1, S2, S3, S4, S5, S6, S7,
+  // Reserved
+  ZERO, AT, K0, K1, GP, SP, FP, RA]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    CPURegsClass::iterator
+    CPURegsClass::allocation_order_end(const MachineFunction &MF) const {
+      // The last 8 registers on the list above are reserved
+      return end()-8;
+    }
+  }];
+}
+
+// 64bit fp:
+// * FGR64  - 32 64-bit registers
+// * AFGR64 - 16 32-bit even registers (32-bit FP Mode) 
+//
+// 32bit fp:
+// * FGR32 - 16 32-bit even registers
+// * FGR32 - 32 32-bit registers (single float only mode)
+def FGR32 : RegisterClass<"Mips", [f32], 32, 
+  // Return Values and Arguments
+  [F0, F1, F2, F3, F12, F13, F14, F15,
+  // Not preserved across procedure calls
+  F4, F5, F6, F7, F8, F9, F10, F11, F16, F17, F18, F19, 
+  // Callee save
+  F20, F21, F22, F23, F24, F25, F26, F27, F28, F29, F30,
+  // Reserved
+  F31]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+
+    static const unsigned MIPS_FGR32[] = {
+      Mips::F0,  Mips::F1,  Mips::F2,  Mips::F3,  Mips::F12,  Mips::F13, 
+      Mips::F14, Mips::F15, Mips::F4,  Mips::F5,  Mips::F6,   Mips::F7, 
+      Mips::F8,  Mips::F9,  Mips::F10, Mips::F11, Mips::F16,  Mips::F17, 
+      Mips::F18, Mips::F19, Mips::F20, Mips::F21, Mips::F22,  Mips::F23, 
+      Mips::F24, Mips::F25, Mips::F26, Mips::F27, Mips::F28,  Mips::F29, 
+      Mips::F30
+    };
+
+    static const unsigned MIPS_SVR4_FGR32[] = {
+      Mips::F0,  Mips::F2,  Mips::F12, Mips::F14, Mips::F4, 
+      Mips::F6,  Mips::F8,  Mips::F10, Mips::F16, Mips::F18, 
+      Mips::F20, Mips::F22, Mips::F24, Mips::F26, Mips::F28, Mips::F30,
+    };
+
+    FGR32Class::iterator
+    FGR32Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+      if (Subtarget.isSingleFloat())
+        return MIPS_FGR32;
+      else
+        return MIPS_SVR4_FGR32; 
+    }
+
+    FGR32Class::iterator
+    FGR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+
+      if (Subtarget.isSingleFloat())
+        return MIPS_FGR32 + (sizeof(MIPS_FGR32) / sizeof(unsigned));
+      else
+        return MIPS_SVR4_FGR32 + (sizeof(MIPS_SVR4_FGR32) / sizeof(unsigned));
+    }
+  }];
+}
+
+def AFGR64 : RegisterClass<"Mips", [f64], 64, 
+  // Return Values and Arguments
+  [D0, D1, D6, D7,
+  // Not preserved across procedure calls
+  D2, D3, D4, D5, D8, D9, 
+  // Callee save
+  D10, D11, D12, D13, D14,
+  // Reserved
+  D15]>
+{
+  let SubRegClassList = [FGR32, FGR32];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    AFGR64Class::iterator
+    AFGR64Class::allocation_order_end(const MachineFunction &MF) const {
+      // The last register on the list above is reserved
+      return end()-1;
+    }
+  }];
+}
+
+// Condition Register for floating point operations
+def CCR  : RegisterClass<"Mips", [i32], 32, [FCR31]>;
+
+// Hi/Lo Registers
+def HILO : RegisterClass<"Mips", [i32], 32, [HI, LO]>;
+
diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td
new file mode 100644
index 0000000..0c3ca57
--- /dev/null
+++ b/lib/Target/Mips/MipsSchedule.td
@@ -0,0 +1,63 @@
+//===- MipsSchedule.td - Mips Scheduling Definitions ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across Mips chips sets. Based on GCC/Mips backend files.
+//===----------------------------------------------------------------------===//
+def ALU     : FuncUnit;
+def IMULDIV : FuncUnit;
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for Mips 
+//===----------------------------------------------------------------------===//
+def IIAlu              : InstrItinClass;
+def IILoad             : InstrItinClass;
+def IIStore            : InstrItinClass;
+def IIXfer             : InstrItinClass;
+def IIBranch           : InstrItinClass;
+def IIHiLo             : InstrItinClass;
+def IIImul             : InstrItinClass;
+def IIIdiv             : InstrItinClass;
+def IIFcvt             : InstrItinClass;
+def IIFmove            : InstrItinClass;
+def IIFcmp             : InstrItinClass;
+def IIFadd             : InstrItinClass;
+def IIFmulSingle       : InstrItinClass;
+def IIFmulDouble       : InstrItinClass;
+def IIFdivSingle       : InstrItinClass;
+def IIFdivDouble       : InstrItinClass;
+def IIFsqrtSingle      : InstrItinClass;
+def IIFsqrtDouble      : InstrItinClass;
+def IIFrecipFsqrtStep  : InstrItinClass;
+def IIPseudo           : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Mips Generic instruction itineraries.
+//===----------------------------------------------------------------------===//
+def MipsGenericItineraries : ProcessorItineraries<[
+  InstrItinData<IIAlu              , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IILoad             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<IIStore            , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIXfer             , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<IIBranch           , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIHiLo             , [InstrStage<1,  [IMULDIV]>]>,
+  InstrItinData<IIImul             , [InstrStage<17, [IMULDIV]>]>,
+  InstrItinData<IIIdiv             , [InstrStage<38, [IMULDIV]>]>,
+  InstrItinData<IIFcvt             , [InstrStage<1,  [ALU]>]>,
+  InstrItinData<IIFmove            , [InstrStage<2,  [ALU]>]>,
+  InstrItinData<IIFcmp             , [InstrStage<3,  [ALU]>]>,
+  InstrItinData<IIFadd             , [InstrStage<4,  [ALU]>]>,
+  InstrItinData<IIFmulSingle       , [InstrStage<7,  [ALU]>]>,
+  InstrItinData<IIFmulDouble       , [InstrStage<8,  [ALU]>]>,
+  InstrItinData<IIFdivSingle       , [InstrStage<23, [ALU]>]>,
+  InstrItinData<IIFdivDouble       , [InstrStage<36, [ALU]>]>,
+  InstrItinData<IIFsqrtSingle      , [InstrStage<54, [ALU]>]>,
+  InstrItinData<IIFsqrtDouble      , [InstrStage<12, [ALU]>]>,
+  InstrItinData<IIFrecipFsqrtStep  , [InstrStage<5,  [ALU]>]>
+]>;
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
new file mode 100644
index 0000000..db114da
--- /dev/null
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -0,0 +1,51 @@
+//===- MipsSubtarget.cpp - Mips Subtarget Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Mips specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsSubtarget.h"
+#include "Mips.h"
+#include "MipsGenSubtarget.inc"
+using namespace llvm;
+
+MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &FS,
+                             bool little) : 
+  MipsArchVersion(Mips1), MipsABI(O32), IsLittle(little), IsSingleFloat(false),
+  IsFP64bit(false), IsGP64bit(false), HasVFPU(false), IsLinux(true),
+  HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), HasMinMax(false),
+  HasSwap(false), HasBitCount(false)
+{
+  std::string CPU = "mips1";
+  MipsArchVersion = Mips1;
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+
+  // Is the target system Linux ?
+  if (TT.find("linux") == std::string::npos)
+    IsLinux = false;
+
+  // When only the target triple is specified and it is
+  // an Allegrex target, set the features. We also match
+  // big- and little-endian Allegrex cores (don't really
+  // know if a big-endian one exists).
+  if (TT.find("mipsallegrex") != std::string::npos ||
+      TT.find("psp") != std::string::npos) {
+    MipsABI = EABI;
+    IsSingleFloat = true;
+    MipsArchVersion = Mips2;
+    HasVFPU = true; // Enables Allegrex Vector FPU (not supported yet)
+    HasSEInReg = true;
+    HasBitCount = true;
+    HasSwap = true;
+    HasCondMov = true;
+  }
+}
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
new file mode 100644
index 0000000..2d5fd22
--- /dev/null
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -0,0 +1,123 @@
+//=====-- MipsSubtarget.h - Define Subtarget for the Mips -----*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSSUBTARGET_H
+#define MIPSSUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <string>
+
+namespace llvm {
+
+class MipsSubtarget : public TargetSubtarget {
+
+public:
+  enum MipsABIEnum {
+    O32, O64, N32, N64, EABI
+  }; 
+
+protected:
+
+  enum MipsArchEnum {
+    Mips1, Mips2, Mips3, Mips4, Mips32, Mips32r2, Mips64, Mips64r2
+  };
+
+  // Mips architecture version 
+  MipsArchEnum MipsArchVersion;
+
+  // Mips supported ABIs 
+  MipsABIEnum MipsABI;
+
+  // IsLittle - The target is Little Endian
+  bool IsLittle;
+
+  // IsSingleFloat - The target only supports single-precision
+  // floating-point operations. This enables the target to use all 32
+  // 32-bit floating-point registers instead of only the even ones.
+  bool IsSingleFloat;
+
+  // IsFP64bit - The target processor has 64-bit floating point registers.
+  bool IsFP64bit;
+
+  // IsGP64bit - General-purpose registers are 64 bits wide
+  bool IsGP64bit;
+
+  // HasVFPU - Processor has a vector floating point unit.
+  bool HasVFPU;
+
+  // IsLinux - Target system is Linux. If false, we assume a generic ELF OS for now.
+  bool IsLinux;
+
+  /// Features related to the presence of specific instructions.
+  
+  // HasSEInReg - SEB and SEH (signext in register) instructions.
+  bool HasSEInReg;
+
+  // HasCondMov - Conditional mov (MOVZ, MOVN) instructions.
+  bool HasCondMov;
+
+  // HasMulDivAdd - Multiply add and sub (MADD, MADDu, MSUB, MSUBu) 
+  // instructions.
+  bool HasMulDivAdd;
+
+  // HasMinMax - MIN and MAX instructions.
+  bool HasMinMax;
+
+  // HasSwap - Byte and half swap instructions.
+  bool HasSwap;
+
+  // HasBitCount - Count leading '1' and '0' bits.
+  bool HasBitCount;
+
+  InstrItineraryData InstrItins;
+
+public:
+
+  /// Only O32 and EABI supported right now.
+  bool isABI_EABI() const { return MipsABI == EABI; }
+  bool isABI_O32() const { return MipsABI == O32; }
+  unsigned getTargetABI() const { return MipsABI; }
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  MipsSubtarget(const std::string &TT, const std::string &FS, bool little);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+
+  bool isMips1() const { return MipsArchVersion == Mips1; }
+
+  bool isLittle() const { return IsLittle; }
+  bool isFP64bit() const { return IsFP64bit; }
+  bool isGP64bit() const { return IsGP64bit; }
+  bool isGP32bit() const { return !IsGP64bit; }
+  bool isSingleFloat() const { return IsSingleFloat; }
+  bool isNotSingleFloat() const { return !IsSingleFloat; }
+  bool hasVFPU() const { return HasVFPU; }
+  bool isLinux() const { return IsLinux; }
+
+  /// Features related to the presence of specific instructions.
+  bool hasSEInReg()   const { return HasSEInReg; }
+  bool hasCondMov()   const { return HasCondMov; }
+  bool hasMulDivAdd() const { return HasMulDivAdd; }
+  bool hasMinMax()    const { return HasMinMax; }
+  bool hasSwap()      const { return HasSwap; }
+  bool hasBitCount()  const { return HasBitCount; }
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
new file mode 100644
index 0000000..4724ff7
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -0,0 +1,77 @@
+//===-- MipsTargetMachine.cpp - Define TargetMachine for Mips -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the info about Mips target spec.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "MipsMCAsmInfo.h"
+#include "MipsTargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeMipsTarget() {
+  // Register the target.
+  RegisterTargetMachine<MipsTargetMachine> X(TheMipsTarget);
+  RegisterTargetMachine<MipselTargetMachine> Y(TheMipselTarget);
+  RegisterAsmInfo<MipsMCAsmInfo> A(TheMipsTarget);
+  RegisterAsmInfo<MipsMCAsmInfo> B(TheMipselTarget);
+}
+
+// DataLayout --> Big-endian, 32-bit pointer/ABI/alignment
+// The stack is always 8 byte aligned
+// On function prologue, the stack is created by decrementing
+// its pointer. Once decremented, all references are done with positive
+// offset from the stack/frame pointer, using StackGrowsUp enables 
+// an easier handling.
+// Using CodeModel::Large enables different CALL behavior.
+// NOTE(review): the 'isLittle=false' default argument appears only on this
+// out-of-line definition, not on the declaration in MipsTargetMachine.h.
+// Default arguments on out-of-class member definitions are non-portable
+// (rejected by MSVC) and only visible after this point in the translation
+// unit -- consider moving the default to the header declaration instead.
+MipsTargetMachine::
+MipsTargetMachine(const Target &T, const std::string &TT, const std::string &FS,
+                  bool isLittle=false):
+  LLVMTargetMachine(T, TT),
+  Subtarget(TT, FS, isLittle), 
+  DataLayout(isLittle ? std::string("e-p:32:32:32-i8:8:32-i16:16:32-n32") :
+                        std::string("E-p:32:32:32-i8:8:32-i16:16:32-n32")), 
+  InstrInfo(*this), 
+  FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0),
+  TLInfo(*this) {
+  // Abicall enables PIC by default
+  if (getRelocationModel() == Reloc::Default) {
+    if (Subtarget.isABI_O32())
+      setRelocationModel(Reloc::PIC_);
+    else
+      setRelocationModel(Reloc::Static);
+  }
+}
+
+MipselTargetMachine::
+MipselTargetMachine(const Target &T, const std::string &TT,
+                    const std::string &FS) :
+  MipsTargetMachine(T, TT, FS, true) {}
+
+// Install an instruction selector pass using 
+// the ISelDag to gen Mips code.
+bool MipsTargetMachine::
+addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel) 
+{
+  PM.add(createMipsISelDag(*this));
+  return false;
+}
+
+// Implemented by targets that want to run passes immediately before 
+// machine code is emitted. return true if -print-machineinstrs should 
+// print out the code after the passes.
+bool MipsTargetMachine::
+addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) 
+{
+  PM.add(createMipsDelaySlotFillerPass(*this));
+  return true;
+}
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
new file mode 100644
index 0000000..c3428be
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -0,0 +1,71 @@
+//===-- MipsTargetMachine.h - Define TargetMachine for Mips -00--*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Mips specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MIPSTARGETMACHINE_H
+#define MIPSTARGETMACHINE_H
+
+#include "MipsSubtarget.h"
+#include "MipsInstrInfo.h"
+#include "MipsISelLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+
+namespace llvm {
+  class formatted_raw_ostream;
+  
+  class MipsTargetMachine : public LLVMTargetMachine {
+    MipsSubtarget       Subtarget;
+    const TargetData    DataLayout; // Calculates type size & alignment
+    MipsInstrInfo       InstrInfo;
+    TargetFrameInfo     FrameInfo;
+    MipsTargetLowering  TLInfo;
+  public:
+    MipsTargetMachine(const Target &T, const std::string &TT,
+                      const std::string &FS, bool isLittle);
+    
+    virtual const MipsInstrInfo   *getInstrInfo()     const 
+    { return &InstrInfo; }
+    virtual const TargetFrameInfo *getFrameInfo()     const 
+    { return &FrameInfo; }
+    virtual const MipsSubtarget   *getSubtargetImpl() const 
+    { return &Subtarget; }
+    virtual const TargetData      *getTargetData()    const 
+    { return &DataLayout;}
+
+    virtual const MipsRegisterInfo *getRegisterInfo()  const {
+      return &InstrInfo.getRegisterInfo();
+    }
+
+    virtual MipsTargetLowering   *getTargetLowering() const { 
+      return const_cast<MipsTargetLowering*>(&TLInfo); 
+    }
+
+    // Pass Pipeline Configuration
+    virtual bool addInstSelector(PassManagerBase &PM,
+                                 CodeGenOpt::Level OptLevel);
+    virtual bool addPreEmitPass(PassManagerBase &PM,
+                                CodeGenOpt::Level OptLevel);
+  };
+
+/// MipselTargetMachine - Mipsel target machine.
+///
+class MipselTargetMachine : public MipsTargetMachine {
+public:
+  MipselTargetMachine(const Target &T, const std::string &TT,
+                      const std::string &FS);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp
new file mode 100644
index 0000000..0fb423d
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -0,0 +1,100 @@
+//===-- MipsTargetObjectFile.cpp - Mips object files ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MipsTargetObjectFile.h"
+#include "MipsSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+static cl::opt<unsigned>
+SSThreshold("mips-ssection-threshold", cl::Hidden,
+            cl::desc("Small data and bss section threshold size (default=8)"),
+            cl::init(8));
+
+void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+ 
+  SmallDataSection =
+    getELFSection(".sdata", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+                  SectionKind::getDataRel());
+  
+  SmallBSSSection =
+    getELFSection(".sbss", MCSectionELF::SHT_NOBITS,
+                  MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+                  SectionKind::getBSS());
+  
+}
+
+// An address must be loaded from a small section if its size is less than or
+// equal to the small section size threshold. Data in this section must be
+// addressed using the gp_rel operator.
+static bool IsInSmallSection(uint64_t Size) {
+  return Size > 0 && Size <= SSThreshold;
+}
+
+bool MipsTargetObjectFile::IsGlobalInSmallSection(const GlobalValue *GV,
+                                                const TargetMachine &TM) const {
+  if (GV->isDeclaration() || GV->hasAvailableExternallyLinkage())
+    return false;
+  
+  return IsGlobalInSmallSection(GV, TM, getKindForGlobal(GV, TM));
+}
+
+/// IsGlobalInSmallSection - Return true if this global address should be
+/// placed into small data/bss section.
+bool MipsTargetObjectFile::
+IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM,
+                       SectionKind Kind) const {
+
+  // Only use small section for non linux targets.
+  const MipsSubtarget &Subtarget = TM.getSubtarget<MipsSubtarget>();
+  if (Subtarget.isLinux())
+    return false;
+
+  // Only global variables, not functions.
+  const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV);
+  if (!GVA)
+    return false;
+  
+  // We can only do this for datarel or BSS objects for now.
+  if (!Kind.isBSS() && !Kind.isDataRel())
+    return false;
+  
+  // If this is a internal constant string, there is a special
+  // section for it, but not in small data/bss.
+  if (Kind.isMergeable1ByteCString())
+    return false;
+
+  const Type *Ty = GV->getType()->getElementType();
+  return IsInSmallSection(TM.getTargetData()->getTypeAllocSize(Ty));
+}
+
+
+
+const MCSection *MipsTargetObjectFile::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+  // TODO: Could also support "weak" symbols as well with ".gnu.linkonce.s.*"
+  // sections?
+  
+  // Handle Small Section classification here.
+  if (Kind.isBSS() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallBSSSection;
+  if (Kind.isDataNoRel() && IsGlobalInSmallSection(GV, TM, Kind))
+    return SmallDataSection;
+  
+  // Otherwise, we work the same as ELF.
+  return TargetLoweringObjectFileELF::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
diff --git a/lib/Target/Mips/MipsTargetObjectFile.h b/lib/Target/Mips/MipsTargetObjectFile.h
new file mode 100644
index 0000000..32e0436
--- /dev/null
+++ b/lib/Target/Mips/MipsTargetObjectFile.h
@@ -0,0 +1,41 @@
+//===-- llvm/Target/MipsTargetObjectFile.h - Mips Object Info ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+#define LLVM_TARGET_MIPS_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+  class MipsTargetObjectFile : public TargetLoweringObjectFileELF {
+    const MCSection *SmallDataSection;
+    const MCSection *SmallBSSSection;
+  public:
+    
+    void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    
+    /// IsGlobalInSmallSection - Return true if this global address should be
+    /// placed into small data/bss section.
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM, SectionKind Kind)const;
+    bool IsGlobalInSmallSection(const GlobalValue *GV,
+                                const TargetMachine &TM) const;  
+    
+    const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+                                            SectionKind Kind,
+                                            Mangler *Mang,
+                                            const TargetMachine &TM) const;
+      
+    // TODO: Classify globals as mips wishes.
+  };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Mips/TargetInfo/CMakeLists.txt b/lib/Target/Mips/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..6e5d56b
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMMipsInfo
+  MipsTargetInfo.cpp
+  )
+
+add_dependencies(LLVMMipsInfo MipsCodeGenTable_gen)
diff --git a/lib/Target/Mips/TargetInfo/Makefile b/lib/Target/Mips/TargetInfo/Makefile
new file mode 100644
index 0000000..32f4e16
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Mips/TargetInfo/Makefile -----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMMipsInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
new file mode 100644
index 0000000..cc3d61e
--- /dev/null
+++ b/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- MipsTargetInfo.cpp - Mips Target Implementation -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Mips.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheMipsTarget, llvm::TheMipselTarget;
+
+extern "C" void LLVMInitializeMipsTargetInfo() { 
+  RegisterTarget<Triple::mips> X(TheMipsTarget, "mips", "Mips");
+
+  RegisterTarget<Triple::mipsel> Y(TheMipselTarget, "mipsel", "Mipsel");
+}
diff --git a/lib/Target/PIC16/AsmPrinter/CMakeLists.txt b/lib/Target/PIC16/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..2e1b809
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories(
+  ${CMAKE_CURRENT_BINARY_DIR}/..
+  ${CMAKE_CURRENT_SOURCE_DIR}/..
+  )
+
+add_llvm_library(LLVMPIC16AsmPrinter
+  PIC16AsmPrinter.cpp
+  )
+add_dependencies(LLVMPIC16AsmPrinter PIC16CodeGenTable_gen)
diff --git a/lib/Target/PIC16/AsmPrinter/Makefile b/lib/Target/PIC16/AsmPrinter/Makefile
new file mode 100644
index 0000000..f4db57e
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PIC16/AsmPrinter/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPIC16AsmPrinter
+
+# Hack: we need to include 'main' pic16 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp
new file mode 100644
index 0000000..72f7c16
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.cpp
@@ -0,0 +1,507 @@
+//===-- PIC16AsmPrinter.cpp - PIC16 LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PIC16 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16ABINames.h"
+#include "PIC16AsmPrinter.h"
+#include "PIC16Section.h"
+#include "PIC16MCAsmInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include <cstring>
+using namespace llvm;
+
+#include "PIC16GenAsmWriter.inc"
+
+PIC16AsmPrinter::PIC16AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                                 MCContext &Ctx, MCStreamer &Streamer,
+                                 const MCAsmInfo *T)
+: AsmPrinter(O, TM, Ctx, Streamer, T), DbgInfo(O, T) {
+  PTLI = static_cast<PIC16TargetLowering*>(TM.getTargetLowering());
+  PMAI = static_cast<const PIC16MCAsmInfo*>(T);
+  PTOF = (PIC16TargetObjectFile *)&PTLI->getObjFileLowering();
+}
+
+void PIC16AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  printInstruction(MI);
+  OutStreamer.AddBlankLine();
+}
+
+static int getFunctionColor(const Function *F) {
+  if (F->hasSection()) {
+    std::string Sectn = F->getSection();
+    std::string StrToFind = "Overlay=";
+    std::string::size_type Pos = Sectn.find(StrToFind);
+
+    // Retrieve the color number if the key is found.
+    if (Pos != std::string::npos) {
+      Pos += StrToFind.length();
+      std::string Color = "";
+      char c = Sectn.at(Pos);
+      // A Color can only consist of digits.
+      while (c >= '0' && c<= '9') {
+        Color.append(1,c);
+        Pos++;
+        if (Pos >= Sectn.length())
+          break;
+        c = Sectn.at(Pos);
+      }
+      return atoi(Color.c_str());
+    }
+  }
+
+  // Color was not set for function, so return -1.
+  return -1;
+}
+
+// Color the Auto section of the given function. 
+void PIC16AsmPrinter::ColorAutoSection(const Function *F) {
+  std::string SectionName = PAN::getAutosSectionName(CurrentFnSym->getName());
+  PIC16Section* Section = PTOF->findPIC16Section(SectionName);
+  if (Section != NULL) {
+    int Color = getFunctionColor(F);
+    if (Color >= 0)
+      Section->setColor(Color);
+  }
+}
+
+
+/// runOnMachineFunction - This emits the frame section, autos section and 
+/// assembly for each instruction. Also takes care of function begin debug
+/// directive and file begin debug directive (if required) for the function.
+///
+bool PIC16AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  // This calls the base class function required to be called at beginning
+  // of runOnMachineFunction.
+  SetupMachineFunction(MF);
+
+  // Put the color information from function to its auto section.
+  const Function *F = MF.getFunction();
+  ColorAutoSection(F);
+
+  // Emit the function frame (args and temps).
+  EmitFunctionFrame(MF);
+
+  DbgInfo.BeginFunction(MF);
+
+  // Now emit the instructions of function in its code section.
+  const MCSection *fCodeSection 
+    = getObjFileLowering().SectionForCode(CurrentFnSym->getName());
+
+  // Start the Code Section.
+  O <<  "\n";
+  OutStreamer.SwitchSection(fCodeSection);
+
+  // Emit the frame address of the function at the beginning of code.
+  O << "\tretlw  low(" << PAN::getFrameLabel(CurrentFnSym->getName()) << ")\n";
+  O << "\tretlw  high(" << PAN::getFrameLabel(CurrentFnSym->getName()) << ")\n";
+
+  // Emit function start label.
+  O << *CurrentFnSym << ":\n";
+
+  DebugLoc CurDL;
+  O << "\n"; 
+  // Print out code for the function.
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+
+    // Print a label for the basic block.
+    if (I != MF.begin()) {
+      EmitBasicBlockStart(I);
+    }
+    
+    // Print a basic block.
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+
+      // Emit the line directive if source line changed.
+      const DebugLoc DL = II->getDebugLoc();
+      if (!DL.isUnknown() && DL != CurDL) {
+        DbgInfo.ChangeDebugLoc(MF, DL);
+        CurDL = DL;
+      }
+        
+      // Print the assembly for the instruction.
+      EmitInstruction(II);
+    }
+  }
+  
+  // Emit function end debug directives.
+  DbgInfo.EndFunction(MF);
+
+  return false;  // we didn't modify anything.
+}
+
+
+// printOperand - print operand of insn.
+void PIC16AsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+
+  switch (MO.getType()) {
+    case MachineOperand::MO_Register:
+      {
+        // For indirect load/store insns, the fsr name is printed as INDF.
+        std::string RegName = getRegisterName(MO.getReg());
+        if ((MI->getOpcode() == PIC16::load_indirect) ||
+            (MI->getOpcode() == PIC16::store_indirect))
+        {
+          RegName.replace (0, 3, "INDF");
+        }
+        O << RegName;
+      }
+      return;
+
+    case MachineOperand::MO_Immediate:
+      O << (int)MO.getImm();
+      return;
+
+    case MachineOperand::MO_GlobalAddress: {
+      MCSymbol *Sym = GetGlobalValueSymbol(MO.getGlobal());
+      // FIXME: currently we do not have a memcpy def coming in the module
+      // by any chance, as we do not link in those as .bc lib. So these calls
+      // are always external and it is safe to emit an extern.
+      if (PAN::isMemIntrinsic(Sym->getName()))
+        LibcallDecls.push_back(createESName(Sym->getName()));
+
+      O << *Sym;
+      break;
+    }
+    case MachineOperand::MO_ExternalSymbol: {
+       const char *Sname = MO.getSymbolName();
+
+      // If it's a libcall name, record it to decls section.
+      if (PAN::getSymbolTag(Sname) == PAN::LIBCALL)
+        LibcallDecls.push_back(Sname);
+
+      // Record a call to intrinsic to print the extern declaration for it.
+      std::string Sym = Sname;  
+      if (PAN::isMemIntrinsic(Sym)) {
+        Sym = PAN::addPrefix(Sym);
+        LibcallDecls.push_back(createESName(Sym));
+      }
+
+      O << Sym;
+      break;
+    }
+    case MachineOperand::MO_MachineBasicBlock:
+      O << *MO.getMBB()->getSymbol(OutContext);
+      return;
+
+    default:
+      llvm_unreachable(" Operand type not supported.");
+  }
+}
+
+/// printCCOperand - Print the cond code operand.
+///
+void PIC16AsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
+  int CC = (int)MI->getOperand(opNum).getImm();
+  O << PIC16CondCodeToString((PIC16CC::CondCodes)CC);
+}
+
+// This function is used to sort the decls list.
+// should return true if s1 should come before s2.
+static bool is_before(const char *s1, const char *s2) {
+  return strcmp(s1, s2) <= 0;
+}
+
+// This is used by list::unique below. 
+// unique will filter out duplicates if it knows them.
+static bool is_duplicate(const char *s1, const char *s2) {
+  return !strcmp(s1, s2);
+}
+
+/// printLibcallDecls - print the extern declarations for compiler 
+/// intrinsics.
+///
+void PIC16AsmPrinter::printLibcallDecls() {
+  // If no libcalls used, return.
+  if (LibcallDecls.empty()) return;
+
+  O << MAI->getCommentString() << "External decls for libcalls - BEGIN." <<"\n";
+  // Remove duplicate entries.
+  LibcallDecls.sort(is_before);
+  LibcallDecls.unique(is_duplicate);
+
+  for (std::list<const char*>::const_iterator I = LibcallDecls.begin(); 
+       I != LibcallDecls.end(); I++) {
+    O << MAI->getExternDirective() << *I << "\n";
+    O << MAI->getExternDirective() << PAN::getArgsLabel(*I) << "\n";
+    O << MAI->getExternDirective() << PAN::getRetvalLabel(*I) << "\n";
+  }
+  O << MAI->getCommentString() << "External decls for libcalls - END." <<"\n";
+}
+
+/// doInitialization - Perform Module level initializations here.
+/// One task that we do here is to sectionize all global variables.
+/// The MemSelOptimizer pass depends on the sectionizing.
+///
+bool PIC16AsmPrinter::doInitialization(Module &M) {
+  bool Result = AsmPrinter::doInitialization(M);
+
+  // Every assembly contains these std headers. 
+  O << "\n#include p16f1xxx.inc";
+  O << "\n#include stdmacros.inc";
+
+  // Set the section names for all globals.
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E; ++I) {
+
+    // Record External Var Decls.
+    if (I->isDeclaration()) {
+      ExternalVarDecls.push_back(I);
+      continue;
+    }
+
+    // Record External Var Defs.
+    if (I->hasExternalLinkage() || I->hasCommonLinkage()) {
+      ExternalVarDefs.push_back(I);
+    }
+
+    // Sectionify actual data.
+    if (!I->hasAvailableExternallyLinkage()) {
+      const MCSection *S = getObjFileLowering().SectionForGlobal(I, Mang, TM);
+      
+      I->setSection(((const PIC16Section *)S)->getName());
+    }
+  }
+
+  DbgInfo.BeginModule(M);
+  EmitFunctionDecls(M);
+  EmitUndefinedVars(M);
+  EmitDefinedVars(M);
+  EmitIData(M);
+  EmitUData(M);
+  EmitRomData(M);
+  EmitSharedUdata(M);
+  EmitUserSections(M);
+  return Result;
+}
+
+/// Emit extern decls for functions imported from other modules, and emit
+/// global declarations for function defined in this module and which are
+/// available to other modules.
+///
+void PIC16AsmPrinter::EmitFunctionDecls(Module &M) {
+ // Emit declarations for external functions.
+  O <<"\n"<<MAI->getCommentString() << "Function Declarations - BEGIN." <<"\n";
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; I++) {
+    if (I->isIntrinsic() || I->getName() == "@abort")
+      continue;
+    
+    if (!I->isDeclaration() && !I->hasExternalLinkage())
+      continue;
+
+    MCSymbol *Sym = GetGlobalValueSymbol(I);
+    
+    // Do not emit memcpy, memset, and memmove here.
+    // Calls to these routines can be generated in two ways,
+    // 1. User calling the standard lib function
+    // 2. Codegen generating these calls for llvm intrinsics.
+    // In the first case a prototype is already available, while in the
+    // second case the call is via an ExternalSym and the prototype is missing.
+    // So declarations for these are currently always getting printed by
+    // tracking both kinds of references in printInstruction.
+    if (I->isDeclaration() && PAN::isMemIntrinsic(Sym->getName())) continue;
+
+    const char *directive = I->isDeclaration() ? MAI->getExternDirective() :
+                                                 MAI->getGlobalDirective();
+      
+    O << directive << Sym->getName() << "\n";
+    O << directive << PAN::getRetvalLabel(Sym->getName()) << "\n";
+    O << directive << PAN::getArgsLabel(Sym->getName()) << "\n";
+  }
+
+  O << MAI->getCommentString() << "Function Declarations - END." <<"\n";
+}
+
+// Emit variables imported from other Modules.
+void PIC16AsmPrinter::EmitUndefinedVars(Module &M) {
+  std::vector<const GlobalVariable*> Items = ExternalVarDecls;
+  if (!Items.size()) return;
+
+  O << "\n" << MAI->getCommentString() << "Imported Variables - BEGIN" << "\n";
+  for (unsigned j = 0; j < Items.size(); j++)
+    O << MAI->getExternDirective() << *GetGlobalValueSymbol(Items[j]) << "\n";
+  O << MAI->getCommentString() << "Imported Variables - END" << "\n";
+}
+
+// Emit variables defined in this module and are available to other modules.
+void PIC16AsmPrinter::EmitDefinedVars(Module &M) {
+  std::vector<const GlobalVariable*> Items = ExternalVarDefs;
+  if (!Items.size()) return;
+
+  O << "\n" << MAI->getCommentString() << "Exported Variables - BEGIN" << "\n";
+  for (unsigned j = 0; j < Items.size(); j++)
+    O << MAI->getGlobalDirective() << *GetGlobalValueSymbol(Items[j]) << "\n";
+  O <<  MAI->getCommentString() << "Exported Variables - END" << "\n";
+}
+
+// Emit initialized data placed in ROM.
+void PIC16AsmPrinter::EmitRomData(Module &M) {
+  EmitSingleSection(PTOF->ROMDATASection());
+}
+
+// Emit Shared section udata.
+void PIC16AsmPrinter::EmitSharedUdata(Module &M) {
+  EmitSingleSection(PTOF->SHAREDUDATASection());
+}
+
+bool PIC16AsmPrinter::doFinalization(Module &M) {
+  EmitAllAutos(M);
+  printLibcallDecls();
+  DbgInfo.EndModule(M);
+  O << "\n\t" << "END\n";
+  return AsmPrinter::doFinalization(M);
+}
+
+void PIC16AsmPrinter::EmitFunctionFrame(MachineFunction &MF) {
+  const Function *F = MF.getFunction();
+  const TargetData *TD = TM.getTargetData();
+  // Emit the data section name.
+  O << "\n"; 
+  
+  PIC16Section *fPDataSection =
+    const_cast<PIC16Section *>(getObjFileLowering().
+                                SectionForFrame(CurrentFnSym->getName()));
+ 
+  fPDataSection->setColor(getFunctionColor(F)); 
+  OutStreamer.SwitchSection(fPDataSection);
+  
+  // Emit function frame label
+  O << PAN::getFrameLabel(CurrentFnSym->getName()) << ":\n";
+
+  const Type *RetType = F->getReturnType();
+  unsigned RetSize = 0; 
+  if (RetType->getTypeID() != Type::VoidTyID) 
+    RetSize = TD->getTypeAllocSize(RetType);
+  
+  //Emit function return value space
+  // FIXME: Do not emit RetvalLabel when retsize is zero. To do this
+  // we will need to avoid printing a global directive for Retval label
+  // in emitExternandGloblas.
+  if(RetSize > 0)
+     O << PAN::getRetvalLabel(CurrentFnSym->getName())
+       << " RES " << RetSize << "\n";
+  else
+     O << PAN::getRetvalLabel(CurrentFnSym->getName()) << ": \n";
+   
+  // Emit variable to hold the space for function arguments 
+  unsigned ArgSize = 0;
+  for (Function::const_arg_iterator argi = F->arg_begin(),
+           arge = F->arg_end(); argi != arge ; ++argi) {
+    const Type *Ty = argi->getType();
+    ArgSize += TD->getTypeAllocSize(Ty);
+   }
+
+  O << PAN::getArgsLabel(CurrentFnSym->getName()) << " RES " << ArgSize << "\n";
+
+  // Emit temporary space
+  int TempSize = PTLI->GetTmpSize();
+  if (TempSize > 0)
+    O << PAN::getTempdataLabel(CurrentFnSym->getName()) << " RES  "
+      << TempSize << '\n';
+}
+
+
+void PIC16AsmPrinter::EmitInitializedDataSection(const PIC16Section *S) {
+  /// Emit Section header.
+  OutStreamer.SwitchSection(S);
+
+    std::vector<const GlobalVariable*> Items = S->Items;
+    for (unsigned j = 0; j < Items.size(); j++) {
+      Constant *C = Items[j]->getInitializer();
+      int AddrSpace = Items[j]->getType()->getAddressSpace();
+      O << *GetGlobalValueSymbol(Items[j]);
+      EmitGlobalConstant(C, AddrSpace);
+   }
+}
+
+// Print all IDATA sections.
+void PIC16AsmPrinter::EmitIData(Module &M) {
+  EmitSectionList (M, PTOF->IDATASections());
+}
+
+void PIC16AsmPrinter::
+EmitUninitializedDataSection(const PIC16Section *S) {
+    const TargetData *TD = TM.getTargetData();
+    OutStreamer.SwitchSection(S);
+    std::vector<const GlobalVariable*> Items = S->Items;
+    for (unsigned j = 0; j < Items.size(); j++) {
+      Constant *C = Items[j]->getInitializer();
+      const Type *Ty = C->getType();
+      unsigned Size = TD->getTypeAllocSize(Ty);
+      O << *GetGlobalValueSymbol(Items[j]) << " RES " << Size << "\n";
+    }
+}
+
+// Print all UDATA sections.
+void PIC16AsmPrinter::EmitUData(Module &M) {
+  EmitSectionList (M, PTOF->UDATASections());
+}
+
+// Print all USER sections.
+void PIC16AsmPrinter::EmitUserSections(Module &M) {
+  EmitSectionList (M, PTOF->USERSections());
+}
+
+// Print all AUTO sections.
+void PIC16AsmPrinter::EmitAllAutos(Module &M) {
+  EmitSectionList (M, PTOF->AUTOSections());
+}
+
+extern "C" void LLVMInitializePIC16AsmPrinter() { 
+  RegisterAsmPrinter<PIC16AsmPrinter> X(ThePIC16Target);
+}
+
+// Emit one data section using correct section emitter based on section type.
+void PIC16AsmPrinter::EmitSingleSection(const PIC16Section *S) {
+  if (S == NULL) return;
+
+  switch (S->getType()) {
+    default: llvm_unreachable ("unknow user section type");
+    case UDATA:
+    case UDATA_SHR:
+    case UDATA_OVR:
+      EmitUninitializedDataSection(S);
+      break;
+    case IDATA:
+    case ROMDATA:
+      EmitInitializedDataSection(S);
+      break;
+  }
+}
+
+// Emit a list of sections.
+void PIC16AsmPrinter::
+EmitSectionList(Module &M, const std::vector<PIC16Section *> &SList) {
+  for (unsigned i = 0; i < SList.size(); i++) {
+    // Exclude llvm specific metadata sections.
+    if (SList[i]->getName().find("llvm.") != std::string::npos)
+      continue;
+    O << "\n";
+    EmitSingleSection(SList[i]);
+  }
+}
+
diff --git a/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h
new file mode 100644
index 0000000..77b6e63
--- /dev/null
+++ b/lib/Target/PIC16/AsmPrinter/PIC16AsmPrinter.h
@@ -0,0 +1,90 @@
+//===-- PIC16AsmPrinter.h - PIC16 LLVM assembly writer ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PIC16 assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16ASMPRINTER_H
+#define PIC16ASMPRINTER_H
+
+#include "PIC16.h"
+#include "PIC16TargetMachine.h"
+#include "PIC16DebugInfo.h"
+#include "PIC16MCAsmInfo.h"
+#include "PIC16TargetObjectFile.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include <list>
+#include <string>
+
+namespace llvm {
+  class VISIBILITY_HIDDEN PIC16AsmPrinter : public AsmPrinter {
+  public:
+    explicit PIC16AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                             MCContext &Ctx, MCStreamer &Streamer,
+                             const MCAsmInfo *T);
+  private:
+    virtual const char *getPassName() const {
+      return "PIC16 Assembly Printer";
+    }
+    
+    PIC16TargetObjectFile &getObjFileLowering() const {
+      return (PIC16TargetObjectFile &)AsmPrinter::getObjFileLowering();
+    }
+
+    bool runOnMachineFunction(MachineFunction &F);
+    void printOperand(const MachineInstr *MI, int opNum);
+    void printCCOperand(const MachineInstr *MI, int opNum);
+    void printInstruction(const MachineInstr *MI); // definition autogenerated.
+    static const char *getRegisterName(unsigned RegNo);
+
+    void EmitInstruction(const MachineInstr *MI);
+    void EmitFunctionDecls (Module &M);
+    void EmitUndefinedVars (Module &M);
+    void EmitDefinedVars (Module &M);
+    void EmitIData (Module &M);
+    void EmitUData (Module &M);
+    void EmitAllAutos (Module &M);
+    void EmitRomData (Module &M);
+    void EmitSharedUdata(Module &M);
+    void EmitUserSections (Module &M);
+    void EmitFunctionFrame(MachineFunction &MF);
+    void printLibcallDecls();
+    void EmitUninitializedDataSection(const PIC16Section *S);
+    void EmitInitializedDataSection(const PIC16Section *S);
+    void EmitSingleSection(const PIC16Section *S);
+    void EmitSectionList(Module &M, 
+                         const std::vector< PIC16Section *> &SList);
+    void ColorAutoSection(const Function *F);
+  protected:
+    bool doInitialization(Module &M);
+    bool doFinalization(Module &M);
+
+    /// EmitGlobalVariable - Emit the specified global variable and its
+    /// initializer to the output stream.
+    virtual void EmitGlobalVariable(const GlobalVariable *GV) {
+      // PIC16 doesn't use normal hooks for this.
+    }
+    
+  private:
+    PIC16TargetObjectFile *PTOF;
+    PIC16TargetLowering *PTLI;
+    PIC16DbgInfo DbgInfo;
+    const PIC16MCAsmInfo *PMAI;
+    std::list<const char *> LibcallDecls; // List of extern decls.
+    std::vector<const GlobalVariable *> ExternalVarDecls;
+    std::vector<const GlobalVariable *> ExternalVarDefs;
+  };
+} // end of namespace
+
+#endif
diff --git a/lib/Target/PIC16/CMakeLists.txt b/lib/Target/PIC16/CMakeLists.txt
new file mode 100644
index 0000000..208b067
--- /dev/null
+++ b/lib/Target/PIC16/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(LLVM_TARGET_DEFINITIONS PIC16.td)
+
+tablegen(PIC16GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(PIC16GenRegisterNames.inc -gen-register-enums)
+tablegen(PIC16GenRegisterInfo.inc -gen-register-desc)
+tablegen(PIC16GenInstrNames.inc -gen-instr-enums)
+tablegen(PIC16GenInstrInfo.inc -gen-instr-desc)
+tablegen(PIC16GenAsmWriter.inc -gen-asm-writer)
+tablegen(PIC16GenDAGISel.inc -gen-dag-isel)
+tablegen(PIC16GenCallingConv.inc -gen-callingconv)
+tablegen(PIC16GenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(PIC16
+  PIC16DebugInfo.cpp
+  PIC16InstrInfo.cpp
+  PIC16ISelDAGToDAG.cpp
+  PIC16ISelLowering.cpp
+  PIC16MemSelOpt.cpp
+  PIC16MCAsmInfo.cpp
+  PIC16RegisterInfo.cpp
+  PIC16Section.cpp
+  PIC16Subtarget.cpp
+  PIC16TargetMachine.cpp
+  PIC16TargetObjectFile.cpp
+  )
diff --git a/lib/Target/PIC16/Makefile b/lib/Target/PIC16/Makefile
new file mode 100644
index 0000000..9e784d1
--- /dev/null
+++ b/lib/Target/PIC16/Makefile
@@ -0,0 +1,24 @@
+##===- lib/Target/PIC16/Makefile ---------------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source 
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMPIC16CodeGen
+TARGET = PIC16
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = PIC16GenRegisterInfo.h.inc PIC16GenRegisterNames.inc \
+		PIC16GenRegisterInfo.inc PIC16GenInstrNames.inc \
+		PIC16GenInstrInfo.inc PIC16GenAsmWriter.inc \
+		PIC16GenDAGISel.inc PIC16GenCallingConv.inc \
+		PIC16GenSubtarget.inc
+
+DIRS = AsmPrinter TargetInfo PIC16Passes
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/PIC16/PIC16.h b/lib/Target/PIC16/PIC16.h
new file mode 100644
index 0000000..8d067de
--- /dev/null
+++ b/lib/Target/PIC16/PIC16.h
@@ -0,0 +1,119 @@
+//===-- PIC16.h - Top-level interface for PIC16 representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in 
+// the LLVM PIC16 back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PIC16_H
+#define LLVM_TARGET_PIC16_H
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <sstream>
+#include <cstring>
+#include <string>
+
+namespace llvm {
+  class PIC16TargetMachine;
+  class FunctionPass;
+  class MachineCodeEmitter;
+  class formatted_raw_ostream;
+
+namespace PIC16CC {
+  enum CondCodes {
+    EQ,
+    NE,
+    LT,
+    LE,
+    GT,
+    GE,
+    ULT,
+    UGT,
+    ULE,
+    UGE
+  };
+}
+
+  enum PIC16SectionType {
+      CODE,
+      UDATA,
+      IDATA,
+      ROMDATA,
+      UDATA_OVR,
+      UDATA_SHR
+    };
+
+
+  // External symbol names require memory to live till the program end.
+  // So we have to allocate it and keep.
+  // FIXME: Don't leak the allocated strings.
+  inline static const char *createESName (const std::string &name) {
+    char *tmpName = new char[name.size() + 1];
+    memcpy(tmpName, name.c_str(), name.size() + 1);
+    return tmpName;
+  }
+
+
+
+  inline static const char *PIC16CondCodeToString(PIC16CC::CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case PIC16CC::NE:  return "ne";
+    case PIC16CC::EQ:   return "eq";
+    case PIC16CC::LT:   return "lt";
+    case PIC16CC::ULT:   return "lt";
+    case PIC16CC::LE:  return "le";
+    case PIC16CC::ULE:  return "le";
+    case PIC16CC::GT:  return "gt";
+    case PIC16CC::UGT:  return "gt";
+    case PIC16CC::GE:   return "ge";
+    case PIC16CC::UGE:   return "ge";
+    }
+  }
+
+  inline static bool isSignedComparison(PIC16CC::CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case PIC16CC::NE:  
+    case PIC16CC::EQ: 
+    case PIC16CC::LT:
+    case PIC16CC::LE:
+    case PIC16CC::GE:
+    case PIC16CC::GT:
+      return true;
+    case PIC16CC::ULT:
+    case PIC16CC::UGT:
+    case PIC16CC::ULE:
+    case PIC16CC::UGE:
+      return false;   // condition codes for unsigned comparison. 
+    }
+  }
+
+
+
+  FunctionPass *createPIC16ISelDag(PIC16TargetMachine &TM);
+  // Banksel optimizer pass.
+  FunctionPass *createPIC16MemSelOptimizerPass();
+
+  extern Target ThePIC16Target;
+  extern Target TheCooperTarget;
+  
+} // end namespace llvm;
+
+// Defines symbolic names for PIC16 registers.  This defines a mapping from
+// register name to register number.
+#include "PIC16GenRegisterNames.inc"
+
+// Defines symbolic names for the PIC16 instructions.
+#include "PIC16GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/PIC16/PIC16.td b/lib/Target/PIC16/PIC16.td
new file mode 100644
index 0000000..b2b9b1c
--- /dev/null
+++ b/lib/Target/PIC16/PIC16.td
@@ -0,0 +1,40 @@
+//===- PIC16.td - Describe the PIC16 Target Machine -----------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the PIC16 target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+include "PIC16RegisterInfo.td"
+include "PIC16InstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features. 
+//===----------------------------------------------------------------------===//
+def FeatureCooper : SubtargetFeature<"cooper", "IsCooper", "true",
+                                     "PIC16 Cooper ISA Support">;
+
+//===----------------------------------------------------------------------===//
+// PIC16 supported processors.
+//===----------------------------------------------------------------------===//
+
+def : Processor<"generic", NoItineraries, []>;
+def : Processor<"cooper", NoItineraries, [FeatureCooper]>;
+
+
+def PIC16InstrInfo : InstrInfo {} 
+
+def PIC16 : Target {
+  let InstructionSet = PIC16InstrInfo;
+}
+
diff --git a/lib/Target/PIC16/PIC16ABINames.h b/lib/Target/PIC16/PIC16ABINames.h
new file mode 100644
index 0000000..e18ddf1
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ABINames.h
@@ -0,0 +1,331 @@
+//===-- PIC16ABINames.h - PIC16 Naming conventions for ABI---- --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the functions to manage ABI Naming conventions for PIC16. 
+// 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PIC16ABINAMES_H
+#define LLVM_TARGET_PIC16ABINAMES_H
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+#include <sstream>
+#include <cstring>
+#include <string>
+
+namespace llvm {
+  class PIC16TargetMachine;
+  class FunctionPass;
+  class MachineCodeEmitter;
+  class formatted_raw_ostream;
+
+  // A Central class to manage all ABI naming conventions.
+  // PAN - [P]ic16 [A]BI [N]ames
+  class PAN {
+    public:
+    // Map the name of the symbol to its section name.
+    // Current ABI:
+    // -----------------------------------------------------
+    // ALL Names are prefixed with the symbol '@'.
+    // ------------------------------------------------------
+    // Global variables do not have any '.' in their names.
+    // These are mainly function names and global variable names.
+    // Example - @foo,  @i
+    // Static local variables - @<func>.<var>
+    // -------------------------------------------------------
+    // Functions and auto variables.
+    // Names are mangled as <prefix><funcname>.<tag>.<varname>
+    // Where <prefix> is '@' and <tag> is any one of
+    // the following
+    // .auto. - an automatic var of a function.
+    // .temp. - temporary data of a function.
+    // .ret.  - return value label for a function.
+    // .frame. - Frame label for a function where retval, args
+    //           and temps are stored.
+    // .args. - Label used to pass arguments to a direct call.
+    // Example - Function name:   @foo
+    //           Its frame:       @foo.frame.
+    //           Its retval:      @foo.ret.
+    //           Its local vars:  @foo.auto.a
+    //           Its temp data:   @foo.temp.
+    //           Its arg passing: @foo.args.
+    //----------------------------------------------
+    // Libcall - compiler generated libcall names must start with .lib.
+    //           This id will be used to emit extern decls for libcalls.
+    // Example - libcall name:   @.lib.sra.i8
+    //           To pass args:   @.lib.sra.i8.args.
+    //           To return val:  @.lib.sra.i8.ret.
+    //----------------------------------------------
+    // SECTION Names
+    // uninitialized globals - @udata.<num>.#
+    // initialized globals - @idata.<num>.#
+    // Program memory data - @romdata.#
+    // Variables with user defined section name - <user_defined_section>
+    // Variables with user defined address - @<var>.user_section.<address>.#
+    // Function frame - @<func>.frame_section.
+    // Function autos - @<func>.autos_section.
+    // Overlay sections - @<color>.##
+    // Declarations - Enclosed in comments. No section for them.
+    //----------------------------------------------------------
+    
+    // Tags used to mangle different names. 
+    enum TAGS {
+      PREFIX_SYMBOL,
+      GLOBAL,
+      STATIC_LOCAL,
+      AUTOS_LABEL,
+      FRAME_LABEL,
+      RET_LABEL,
+      ARGS_LABEL,
+      TEMPS_LABEL,
+      
+      LIBCALL,
+      
+      FRAME_SECTION,
+      AUTOS_SECTION,
+      CODE_SECTION,
+      USER_SECTION
+    };
+
+    // Textual names of the tags. Tags without a textual form (GLOBAL,
+    // STATIC_LOCAL) fall to the default and return "".
+    inline static const char *getTagName(TAGS tag) {
+      switch (tag) {
+      default: return "";
+      case PREFIX_SYMBOL:    return "@";
+      case AUTOS_LABEL:       return ".auto.";
+      case FRAME_LABEL:       return ".frame.";
+      case TEMPS_LABEL:       return ".temp.";
+      case ARGS_LABEL:       return ".args.";
+      case RET_LABEL:       return ".ret.";
+      case LIBCALL:       return ".lib.";
+      case FRAME_SECTION:       return ".frame_section.";
+      case AUTOS_SECTION:       return ".autos_section.";
+      case CODE_SECTION:       return ".code_section.";
+      case USER_SECTION:       return ".user_section.";
+      }
+    }
+
+    // Get tag type for the Symbol. Checks each known tag substring in turn;
+    // order matters because the tags are not prefixes of each other but a
+    // symbol can only carry one of them.
+    inline static TAGS getSymbolTag(const std::string &Sym) {
+      if (Sym.find(getTagName(TEMPS_LABEL)) != std::string::npos)
+        return TEMPS_LABEL;
+
+      if (Sym.find(getTagName(FRAME_LABEL)) != std::string::npos)
+        return FRAME_LABEL;
+
+      if (Sym.find(getTagName(RET_LABEL)) != std::string::npos)
+        return RET_LABEL;
+
+      if (Sym.find(getTagName(ARGS_LABEL)) != std::string::npos)
+        return ARGS_LABEL;
+
+      if (Sym.find(getTagName(AUTOS_LABEL)) != std::string::npos)
+        return AUTOS_LABEL;
+
+      if (Sym.find(getTagName(LIBCALL)) != std::string::npos)
+        return LIBCALL;
+
+      // It does not have any Tag. So its a true global or static local.
+      if (Sym.find(".") == std::string::npos) 
+        return GLOBAL;
+      
+      // If a . is there, then it may be static local.
+      // We should mangle these as well in clang.
+      if (Sym.find(".") != std::string::npos) 
+        return STATIC_LOCAL;
+ 
+      // NOTE(review): unreachable — the two find(".") checks above are
+      // exhaustive, so this assert can never fire.
+      assert (0 && "Could not determine Symbol's tag");
+      return PREFIX_SYMBOL; // Silence warning when assertions are turned off.
+    }
+
+    // addPrefix - add prefix symbol to a name if there isn't one already.
+    inline static std::string addPrefix (const std::string &Name) {
+      std::string prefix = getTagName (PREFIX_SYMBOL);
+
+      // If this name already has a prefix, nothing to do.
+      if (Name.compare(0, prefix.size(), prefix) == 0)
+        return Name;
+
+      return prefix + Name;
+    }
+
+    // Get mangled func name from a mangled sym name.
+    // In all cases func name is the first component before a '.'.
+    static inline std::string getFuncNameForSym(const std::string &Sym1) {
+      assert (getSymbolTag(Sym1) != GLOBAL && "not belongs to a function");
+
+      std::string Sym = addPrefix(Sym1);
+
+      // Position of the . after func name. That's where func name ends.
+      size_t func_name_end = Sym.find ('.');
+
+      return Sym.substr (0, func_name_end);
+    }
+
+    // Get Frame start label for a func: @<func>.frame.
+    static std::string getFrameLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(FRAME_LABEL);
+      return Func1 + tag;
+    }
+
+    // Get return value label for a func: @<func>.ret.
+    static std::string getRetvalLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(RET_LABEL);
+      return Func1 + tag;
+    }
+
+    // Get argument passing label for a func: @<func>.args.
+    static std::string getArgsLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(ARGS_LABEL);
+      return Func1 + tag;
+    }
+
+    // Get temporary data label for a func: @<func>.temp.
+    static std::string getTempdataLabel(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(TEMPS_LABEL);
+      return Func1 + tag;
+    }
+
+    // Section names below end with "#", marking them for the linker.
+    static std::string getFrameSectionName(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(FRAME_SECTION);
+      return Func1 + tag + "#";
+    }
+
+    static std::string getAutosSectionName(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(AUTOS_SECTION);
+      return Func1 + tag + "#";
+    }
+
+    static std::string getCodeSectionName(const std::string &Func) {
+      std::string Func1 = addPrefix(Func);
+      std::string tag = getTagName(CODE_SECTION);
+      return Func1 + tag + "#";
+    }
+
+    static std::string getUserSectionName(const std::string &Name) {
+      // NOTE(review): stray second ';' below is a harmless empty statement.
+      std::string sname = addPrefix(Name);;
+      std::string tag = getTagName(USER_SECTION);
+      return sname + tag + "#";
+    }
+
+    // udata, romdata and idata section names are generated by a given number.
+    // @udata.<num>.# 
+    static std::string getUdataSectionName(unsigned num, 
+                                           std::string prefix = "") {
+       std::ostringstream o;
+       o << getTagName(PREFIX_SYMBOL) << prefix << "udata." << num 
+         << ".#"; 
+       return o.str(); 
+    }
+
+    // NOTE(review): unlike the other section names, this one carries no
+    // '@' prefix — confirm that is intentional.
+    static std::string getRomdataSectionName() {
+      return "romdata.#";
+    }
+
+    // Section for udata shared across memory banks: @udata_shr.#
+    static std::string getSharedUDataSectionName() {
+       std::ostringstream o;
+       o << getTagName(PREFIX_SYMBOL)  << "udata_shr" << ".#";
+       return o.str();
+    }
+
+    static std::string getRomdataSectionName(unsigned num,
+                                             std::string prefix = "") {
+       std::ostringstream o;
+       o << getTagName(PREFIX_SYMBOL) << prefix << "romdata." << num 
+         << ".#";
+       return o.str();
+    }
+
+    static std::string getIdataSectionName(unsigned num,
+                                           std::string prefix = "") {
+       std::ostringstream o;
+       o << getTagName(PREFIX_SYMBOL) << prefix << "idata." << num 
+         << ".#"; 
+       return o.str(); 
+    }
+
+    // A name is "local" iff it carries the .auto. tag.
+    inline static bool isLocalName (const std::string &Name) {
+      if (getSymbolTag(Name) == AUTOS_LABEL)
+        return true;
+
+      return false;
+    }
+
+    // True for the prefixed names of the three mem intrinsics.
+    inline static bool isMemIntrinsic (const std::string &Name) {
+      if (Name.compare("@memcpy") == 0 || Name.compare("@memset") == 0 ||
+          Name.compare("@memmove") == 0) {
+        return true;
+      }
+      
+      return false;
+    }
+
+    // True if Var is an auto variable belonging to function Func.
+    inline static bool isLocalToFunc (std::string &Func, std::string &Var) {
+      if (! isLocalName(Var)) return false;
+
+      std::string Func1 = addPrefix(Func);
+      // Extract func name of the variable.
+      const std::string &fname = getFuncNameForSym(Var);
+
+      if (fname.compare(Func1) == 0)
+        return true;
+
+      return false;
+    }
+
+
+    // Get the section for the given external symbol names.
+    // This tries to find the type (Tag) of the symbol from its mangled name
+    // and return appropriate section name for it.
+    // NOTE(review): if assertions are disabled and the tag is not one of the
+    // handled cases (e.g. GLOBAL or LIBCALL), control falls off the end of
+    // this function — undefined behavior. Consider an unreachable/abort.
+    static inline std::string getSectionNameForSym(const std::string &Sym1) {
+      std::string Sym = addPrefix(Sym1);
+
+      std::string SectionName;
+ 
+      std::string Fname = getFuncNameForSym (Sym);
+      TAGS id = getSymbolTag (Sym);
+
+      switch (id) {
+        default : assert (0 && "Could not determine external symbol type");
+        case FRAME_LABEL:
+        case RET_LABEL:
+        case TEMPS_LABEL:
+        case ARGS_LABEL:  {
+          return getFrameSectionName(Fname);
+        }
+        case AUTOS_LABEL: {
+          return getAutosSectionName(Fname);
+        }
+      }
+    }
+
+    /// Return Overlay Name for the section.
+    /// The ABI Convention is: @<Color>.##.<section_tag>
+    /// The section_tag is retrieved from the SectName parameter and
+    /// and Color is passed in parameter.
+    /// NOTE(review): assumes SectName contains a '.'; if it does not,
+    /// substr(npos) throws std::out_of_range — confirm callers guarantee it.
+    static inline std::string  getOverlayName(std::string SectName, int Color) {
+      // FIXME: Only autos_section and frame_section are colored.
+      // So check and assert if the passed SectName does not have AUTOS_SECTION
+      // or FRAME_SECTION tag in it.
+      std::ostringstream o;
+      o << getTagName(PREFIX_SYMBOL) << Color << ".##" 
+        << SectName.substr(SectName.find("."));
+
+      return o.str();
+    } 
+  }; // class PAN.
+} // end namespace llvm;
+
+#endif
diff --git a/lib/Target/PIC16/PIC16DebugInfo.cpp b/lib/Target/PIC16/PIC16DebugInfo.cpp
new file mode 100644
index 0000000..c517b1b
--- /dev/null
+++ b/lib/Target/PIC16/PIC16DebugInfo.cpp
@@ -0,0 +1,489 @@
+
+//===-- PIC16DebugInfo.cpp - Implementation for PIC16 Debug Information ======//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the helper functions for representing debug information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16ABINames.h"
+#include "PIC16DebugInfo.h" 
+#include "llvm/GlobalVariable.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/DebugLoc.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/SmallString.h"
+
+using namespace llvm;
+
+/// PopulateDebugInfo - Populate the TypeNo, Aux[] and TagName from Ty.
+/// Dispatches on the DIType kind (basic/composite/derived); any other kind
+/// yields the null type marker with no auxiliary entry.
+///
+void PIC16DbgInfo::PopulateDebugInfo (DIType Ty, unsigned short &TypeNo,
+                                      bool &HasAux, int Aux[], 
+                                      std::string &TagName) {
+  if (Ty.isBasicType())
+    PopulateBasicTypeInfo (Ty, TypeNo);
+  else if (Ty.isCompositeType())
+    PopulateCompositeTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
+  else if (Ty.isDerivedType())
+    PopulateDerivedTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
+  else {
+    // Unknown kind: mark as T_NULL and emit no aux entry.
+    TypeNo = PIC16Dbg::T_NULL;
+    HasAux = false;
+  }
+  return;
+}
+
+/// PopulateBasicTypeInfo- Populate TypeNo for basic type from Ty.
+/// Shifts the packed TypeNo left by S_BASIC bits and ORs in the debug
+/// number for the named base type (0 if the name is not recognized).
+///
+void PIC16DbgInfo::PopulateBasicTypeInfo (DIType Ty, unsigned short &TypeNo) {
+  std::string Name = Ty.getName();
+  unsigned short BaseTy = GetTypeDebugNumber(Name);
+  TypeNo = TypeNo << PIC16Dbg::S_BASIC;
+  TypeNo = TypeNo | (0xffff & BaseTy);
+}
+
+/// PopulateDerivedTypeInfo - Populate TypeNo, Aux[], TagName for derived type 
+/// from Ty. Derived types are mostly pointers. The derived-type code is
+/// packed into TypeNo, then the base type is encoded recursively.
+///
+void PIC16DbgInfo::PopulateDerivedTypeInfo (DIType Ty, unsigned short &TypeNo,
+                                            bool &HasAux, int Aux[],
+                                            std::string &TagName) {
+
+  switch(Ty.getTag())
+  {
+    case dwarf::DW_TAG_pointer_type:
+      TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+      TypeNo = TypeNo | PIC16Dbg::DT_PTR;
+      break;
+    default:
+      // Other derived tags: shift only, leaving DT_NONE (0) in the low bits.
+      TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+  }
+  
+  // We also need to encode the information about the base type of
+  // pointer in TypeNo.
+  DIType BaseType = DIDerivedType(Ty.getNode()).getTypeDerivedFrom();
+  PopulateDebugInfo(BaseType, TypeNo, HasAux, Aux, TagName);
+}
+
+/// PopulateArrayTypeInfo - Populate TypeNo, Aux[] for array from Ty.
+/// Each subrange contributes one DT_ARY code to TypeNo and a 2-byte
+/// dimension to the aux entry; the total element count goes in bytes 7-8.
+void PIC16DbgInfo::PopulateArrayTypeInfo (DIType Ty, unsigned short &TypeNo,
+                                          bool &HasAux, int Aux[],
+                                          std::string &TagName) {
+
+  DICompositeType CTy = DICompositeType(Ty.getNode());
+  DIArray Elements = CTy.getTypeArray();
+  unsigned short size = 1;
+  // NOTE(review): Dimension has room for 4 subranges but the loop bound N
+  // is not checked against 4 — a >4-dimensional array would overflow both
+  // Dimension[] and the Aux[] byte positions. Confirm inputs are bounded.
+  unsigned short Dimension[4]={0,0,0,0};
+  for (unsigned i = 0, N = Elements.getNumElements(); i < N; ++i) {
+    DIDescriptor Element = Elements.getElement(i);
+    if (Element.getTag() == dwarf::DW_TAG_subrange_type) {
+      TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+      TypeNo = TypeNo | PIC16Dbg::DT_ARY;
+      DISubrange SubRange = DISubrange(Element.getNode());
+      Dimension[i] = SubRange.getHi() - SubRange.getLo() + 1;
+      // Each dimension is represented by 2 bytes starting at byte 9.
+      Aux[8+i*2+0] = Dimension[i];
+      Aux[8+i*2+1] = Dimension[i] >> 8;
+      size = size * Dimension[i];
+    }
+  }
+  HasAux = true;
+  // In auxiliary entry for array, 7th and 8th byte represent array size.
+  Aux[6] = size & 0xff;
+  Aux[7] = size >> 8;
+  DIType BaseType = CTy.getTypeDerivedFrom();
+  PopulateDebugInfo(BaseType, TypeNo, HasAux, Aux, TagName);
+}
+
+/// PopulateStructOrUnionTypeInfo - Populate TypeNo, Aux[] , TagName for 
+/// structure or union. TagName becomes <name>.<number>, where <number> is
+/// taken from the llvm.dbg.composite<number> global that carries the type.
+///
+void PIC16DbgInfo::PopulateStructOrUnionTypeInfo (DIType Ty, 
+                                                  unsigned short &TypeNo,
+                                                  bool &HasAux, int Aux[],
+                                                  std::string &TagName) {
+  DICompositeType CTy = DICompositeType(Ty.getNode());
+  TypeNo = TypeNo << PIC16Dbg::S_BASIC;
+  if (Ty.getTag() == dwarf::DW_TAG_structure_type)
+    TypeNo = TypeNo | PIC16Dbg::T_STRUCT;
+  else
+    TypeNo = TypeNo | PIC16Dbg::T_UNION;
+  TagName = CTy.getName();
+  // UniqueSuffix is .number where number is obtained from
+  // llvm.dbg.composite<number>. The substr(18) skips the 18-character
+  // "llvm.dbg.composite" prefix.
+  // FIXME: This will break when composite type is not represented by
+  // llvm.dbg.composite* global variable. Since we need to revisit 
+  // PIC16DebugInfo implementation anyways after the MDNodes based 
+  // framework is done, let us continue with the way it is.
+  std::string UniqueSuffix = "." + Ty.getNode()->getNameStr().substr(18);
+  TagName += UniqueSuffix;
+  unsigned short size = CTy.getSizeInBits()/8;
+  // 7th and 8th byte represent size.
+  HasAux = true;
+  Aux[6] = size & 0xff;
+  Aux[7] = size >> 8;
+}
+
+/// PopulateEnumTypeInfo - Populate TypeNo for enum from Ty.
+/// Enums carry no auxiliary entry; only the T_ENUM code is packed in.
+void PIC16DbgInfo::PopulateEnumTypeInfo (DIType Ty, unsigned short &TypeNo) {
+  TypeNo = TypeNo << PIC16Dbg::S_BASIC;
+  TypeNo = TypeNo | PIC16Dbg::T_ENUM;
+}
+
+/// PopulateCompositeTypeInfo - Populate TypeNo, Aux[] and TagName for 
+/// composite types from Ty. Dispatches to the array, struct/union, or enum
+/// helper based on the DWARF tag.
+///
+void PIC16DbgInfo::PopulateCompositeTypeInfo (DIType Ty, unsigned short &TypeNo,
+                                              bool &HasAux, int Aux[],
+                                              std::string &TagName) {
+  switch (Ty.getTag()) {
+    case dwarf::DW_TAG_array_type: {
+      PopulateArrayTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
+      break;
+    }
+    case dwarf:: DW_TAG_union_type:
+    case dwarf::DW_TAG_structure_type: {
+      PopulateStructOrUnionTypeInfo (Ty, TypeNo, HasAux, Aux, TagName);
+      break;
+    }
+    case dwarf::DW_TAG_enumeration_type: {
+      PopulateEnumTypeInfo (Ty, TypeNo);
+      break;
+    }
+    default:
+      // Unhandled composite tags: shift only (encodes DT_NONE).
+      TypeNo = TypeNo << PIC16Dbg::S_DERIVED;
+  }
+}
+
+/// GetTypeDebugNumber - Get debug type number for given type.
+/// Maps a C base-type name to its PIC16Dbg::VarType code; unrecognized
+/// names (including "void", "float", "double") return 0 (T_NULL).
+///
+unsigned PIC16DbgInfo::GetTypeDebugNumber(std::string &type)  {
+  if (type == "char")
+    return PIC16Dbg::T_CHAR;
+  else if (type == "short")
+    return PIC16Dbg::T_SHORT;
+  else if (type == "int")
+    return PIC16Dbg::T_INT;
+  else if (type == "long")
+    return PIC16Dbg::T_LONG;
+  else if (type == "unsigned char")
+    return PIC16Dbg::T_UCHAR;
+  else if (type == "unsigned short")
+    return PIC16Dbg::T_USHORT;
+  else if (type == "unsigned int")
+    return PIC16Dbg::T_UINT;
+  else if (type == "unsigned long")
+    return PIC16Dbg::T_ULONG;
+  else
+    return 0;
+}
+ 
+/// getStorageClass - Get storage class for given debug variable.
+/// Locals and unit-local globals become C_STAT; everything else C_EXT.
+///
+short PIC16DbgInfo::getStorageClass(DIGlobalVariable DIGV) {
+  short ClassNo;
+  if (PAN::isLocalName(DIGV.getName())) {
+    // Generating C_AUTO here fails due to error in linker. Change it once
+    // linker is fixed.
+    ClassNo = PIC16Dbg::C_STAT;
+  }
+  else if (DIGV.isLocalToUnit())
+    ClassNo = PIC16Dbg::C_STAT;
+  else
+    ClassNo = PIC16Dbg::C_EXT;
+  return ClassNo;
+}
+
+/// BeginModule - Emit necessary debug info to start a Module and do other
+/// required initializations. Debug directives are only enabled if the
+/// module contains at least one compile unit.
+void PIC16DbgInfo::BeginModule(Module &M) {
+  // Emit file directive for module.
+  DebugInfoFinder DbgFinder;
+  DbgFinder.processModule(M);
+  if (DbgFinder.compile_unit_count() != 0) {
+    // FIXME : What if more than one CUs are present in a module ?
+    MDNode *CU = *DbgFinder.compile_unit_begin();
+    EmitDebugDirectives = true;
+    SwitchToCU(CU);
+  }
+  // Emit debug info for decls of composite types.
+  EmitCompositeTypeDecls(M);
+}
+
+/// Helper to find first valid debug loc for a function.
+/// Scans instructions in block order; returns the first known DebugLoc,
+/// or an unknown (default) one if the function has none.
+///
+static const DebugLoc GetDebugLocForFunction(const MachineFunction &MF) {
+  DebugLoc DL;
+  for (MachineFunction::const_iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    // Note: the inner E intentionally shadows the outer one.
+    for (MachineBasicBlock::const_iterator II = I->begin(), E = I->end();
+         II != E; ++II) {
+      DL = II->getDebugLoc();
+      if (!DL.isUnknown())
+        return DL;
+    }
+  }
+  return DL;
+}
+
+/// BeginFunction - Emit necessary debug info to start a function.
+///
+void PIC16DbgInfo::BeginFunction(const MachineFunction &MF) {
+  if (! EmitDebugDirectives) return;
+  
+  // Retrieve the first valid debug Loc and process it.
+  const DebugLoc &DL = GetDebugLocForFunction(MF);
+  // Emit debug info only if valid debug info is available.
+  if (!DL.isUnknown()) {
+    ChangeDebugLoc(MF, DL, true);
+    EmitFunctBeginDI(MF.getFunction());
+  } 
+  // Set current line to 0 so that .line directive is generated after .bf.
+  CurLine = 0;
+}
+
+/// ChangeDebugLoc - Take necessary steps when DebugLoc changes.
+/// CurFile and CurLine may change as a result of this.
+/// Requires a known DL (asserted); switches CU and line as needed.
+///
+void PIC16DbgInfo::ChangeDebugLoc(const MachineFunction &MF,  
+                                  const DebugLoc &DL, bool IsInBeginFunction) {
+  if (! EmitDebugDirectives) return;
+  assert (! DL.isUnknown()  && "can't change to invalid debug loc");
+
+  DILocation Loc = MF.getDILocation(DL);
+  MDNode *CU = Loc.getScope().getNode();
+  unsigned line = Loc.getLineNumber();
+
+  SwitchToCU(CU);
+  SwitchToLine(line, IsInBeginFunction);
+}
+
+/// SwitchToLine - Emit line directive for a new line.
+/// No-op if the line is unchanged; inside BeginFunction the directive is
+/// suppressed (only CurLine is tracked) since .bf carries the line itself.
+///
+void PIC16DbgInfo::SwitchToLine(unsigned Line, bool IsInBeginFunction) {
+  if (CurLine == Line) return;
+  if (!IsInBeginFunction)  O << "\n\t.line " << Line << "\n";
+  CurLine = Line;
+}
+
+/// EndFunction - Emit .ef for end of function.
+///
+void PIC16DbgInfo::EndFunction(const MachineFunction &MF) {
+  if (! EmitDebugDirectives) return;
+  const DebugLoc &DL = GetDebugLocForFunction(MF);
+  // Emit debug info only if valid debug info is available.
+  if (!DL.isUnknown())
+    EmitFunctEndDI(MF.getFunction(), CurLine);
+}
+
+/// EndModule - Emit .eof for end of module.
+/// Also flushes variable debug info collected for the whole module.
+///
+void PIC16DbgInfo::EndModule(Module &M) {
+  if (! EmitDebugDirectives) return;
+  EmitVarDebugInfo(M);
+  if (CurFile != "") O << "\n\t.eof";
+}
+ 
+/// EmitCompositeTypeElements - Emit debug information for members of a 
+/// composite type. Members of a struct get class C_MOS and increasing
+/// offsets (Value); union members get C_MOU and all share offset 0.
+/// 
+void PIC16DbgInfo::EmitCompositeTypeElements (DICompositeType CTy,
+                                              std::string SuffixNo) {
+  unsigned long Value = 0;
+  DIArray Elements = CTy.getTypeArray();
+  for (unsigned i = 0, N = Elements.getNumElements(); i < N; i++) {
+    DIDescriptor Element = Elements.getElement(i);
+    unsigned short TypeNo = 0;
+    bool HasAux = false;
+    int ElementAux[PIC16Dbg::AuxSize] = { 0 };
+    std::string TagName = "";
+    DIDerivedType DITy(Element.getNode());
+    unsigned short ElementSize = DITy.getSizeInBits()/8;
+    // Get mangled name for this structure/union element.
+    std::string MangMemName = DITy.getName().str() + SuffixNo;
+    PopulateDebugInfo(DITy, TypeNo, HasAux, ElementAux, TagName);
+    short Class = 0;
+    if( CTy.getTag() == dwarf::DW_TAG_union_type)
+      Class = PIC16Dbg::C_MOU;
+    else if  (CTy.getTag() == dwarf::DW_TAG_structure_type)
+      Class = PIC16Dbg::C_MOS;
+    EmitSymbol(MangMemName.c_str(), Class, TypeNo, Value);
+    // Only struct members advance the running offset.
+    if (CTy.getTag() == dwarf::DW_TAG_structure_type)
+      Value += ElementSize;
+    if (HasAux)
+      EmitAuxEntry(MangMemName.c_str(), ElementAux, PIC16Dbg::AuxSize, TagName);
+  }
+}
+
+/// EmitCompositeTypeDecls - Emit composite type declarations like structure 
+/// and union declarations. For each struct/union type found in the module,
+/// emits the tag symbol, its aux entry, all members, and the .eos marker.
+///
+void PIC16DbgInfo::EmitCompositeTypeDecls(Module &M) {
+  DebugInfoFinder DbgFinder;
+  DbgFinder.processModule(M);
+  for (DebugInfoFinder::iterator I = DbgFinder.type_begin(),
+         E = DbgFinder.type_end(); I != E; ++I) {
+    DICompositeType CTy(*I);
+    if (CTy.isNull())
+      continue;
+    if (CTy.getTag() == dwarf::DW_TAG_union_type ||
+        CTy.getTag() == dwarf::DW_TAG_structure_type ) {
+      // Get the number after llvm.dbg.composite and make UniqueSuffix from 
+      // it (substr(18) skips the "llvm.dbg.composite" prefix).
+      std::string DIVar = CTy.getNode()->getNameStr();
+      std::string UniqueSuffix = "." + DIVar.substr(18);
+      std::string MangledCTyName = CTy.getName().str() + UniqueSuffix;
+      unsigned short size = CTy.getSizeInBits()/8;
+      int Aux[PIC16Dbg::AuxSize] = {0};
+      // 7th and 8th byte represent size of structure/union.
+      Aux[6] = size & 0xff;
+      Aux[7] = size >> 8;
+      // Emit .def for structure/union tag.
+      if( CTy.getTag() == dwarf::DW_TAG_union_type)
+        EmitSymbol(MangledCTyName.c_str(), PIC16Dbg::C_UNTAG);
+      else if  (CTy.getTag() == dwarf::DW_TAG_structure_type) 
+        EmitSymbol(MangledCTyName.c_str(), PIC16Dbg::C_STRTAG);
+      
+      // Emit auxiliary debug information for structure/union tag. 
+      EmitAuxEntry(MangledCTyName.c_str(), Aux, PIC16Dbg::AuxSize);
+      
+      // Emit members.
+      EmitCompositeTypeElements (CTy, UniqueSuffix);
+      
+      // Emit mangled Symbol for end of structure/union.
+      std::string EOSSymbol = ".eos" + UniqueSuffix;
+      EmitSymbol(EOSSymbol.c_str(), PIC16Dbg::C_EOS);
+      EmitAuxEntry(EOSSymbol.c_str(), Aux, PIC16Dbg::AuxSize, 
+                   MangledCTyName.c_str());
+    }
+  }
+}
+
+
+/// EmitFunctBeginDI - Emit .bf for function.
+///
+void PIC16DbgInfo::EmitFunctBeginDI(const Function *F) {
+  std::string FunctName = F->getName();
+  if (EmitDebugDirectives) {
+    std::string FunctBeginSym = ".bf." + FunctName;
+    std::string BlockBeginSym = ".bb." + FunctName;
+
+    int BFAux[PIC16Dbg::AuxSize] = {0};
+    BFAux[4] = CurLine;
+    BFAux[5] = CurLine >> 8;
+
+    // Emit debug directives for beginning of function.
+    EmitSymbol(FunctBeginSym, PIC16Dbg::C_FCN);
+    EmitAuxEntry(FunctBeginSym, BFAux, PIC16Dbg::AuxSize);
+
+    EmitSymbol(BlockBeginSym, PIC16Dbg::C_BLOCK);
+    EmitAuxEntry(BlockBeginSym, BFAux, PIC16Dbg::AuxSize);
+  }
+}
+
+/// EmitFunctEndDI - Emit .ef for function end.
+/// Emits the .eb (end-block) and .ef (end-function) symbols with aux
+/// entries carrying the current line.
+/// NOTE(review): the Line parameter is never used — CurLine is emitted
+/// instead. Confirm whether Line should replace CurLine below.
+///
+void PIC16DbgInfo::EmitFunctEndDI(const Function *F, unsigned Line) {
+  std::string FunctName = F->getName();
+  if (EmitDebugDirectives) {
+    std::string FunctEndSym = ".ef." + FunctName;
+    std::string BlockEndSym = ".eb." + FunctName;
+
+    // Emit debug directives for end of function.
+    EmitSymbol(BlockEndSym, PIC16Dbg::C_BLOCK);
+    int EFAux[PIC16Dbg::AuxSize] = {0};
+    // 5th and 6th byte stand for line number.
+    EFAux[4] = CurLine;
+    EFAux[5] = CurLine >> 8;
+    EmitAuxEntry(BlockEndSym, EFAux, PIC16Dbg::AuxSize);
+    EmitSymbol(FunctEndSym, PIC16Dbg::C_FCN);
+    EmitAuxEntry(FunctEndSym, EFAux, PIC16Dbg::AuxSize);
+  }
+}
+
+/// EmitAuxEntry - Emit Auxiliary debug information.
+/// Emits a .dim directive listing Num aux bytes for VarName; TagName, when
+/// non-empty, is inserted for structure/union objects.
+///
+void PIC16DbgInfo::EmitAuxEntry(const std::string VarName, int Aux[], int Num,
+                                std::string TagName) {
+  O << "\n\t.dim " << VarName << ", 1" ;
+  // TagName is emitted in case of structure/union objects.
+  if (TagName != "")
+    O << ", " << TagName;
+  for (int i = 0; i<Num; i++)
+    O << "," << Aux[i];
+}
+
<parameter name="new_str">+/// EmitSymbol - Emit .def for a symbol. Value is offset for the member.
+/// A zero Value is suppressed from the directive entirely.
+///
+void PIC16DbgInfo::EmitSymbol(std::string Name, short Class, unsigned short
+                              Type, unsigned long Value) {
+  O << "\n\t" << ".def "<< Name << ", type = " << Type << ", class = " 
+    << Class;
+  if (Value > 0)
+    O  << ", value = " << Value;
+}
+
+/// EmitVarDebugInfo - Emit debug information for all variables.
+/// For each global variable with a recognized type, emits .type and .class
+/// directives (plus an aux entry when present).
+///
+void PIC16DbgInfo::EmitVarDebugInfo(Module &M) {
+  DebugInfoFinder DbgFinder;
+  DbgFinder.processModule(M);
+  
+  for (DebugInfoFinder::iterator I = DbgFinder.global_variable_begin(),
+         E = DbgFinder.global_variable_end(); I != E; ++I) {
+    DIGlobalVariable DIGV(*I);
+    DIType Ty = DIGV.getType();
+    unsigned short TypeNo = 0;
+    bool HasAux = false;
+    int Aux[PIC16Dbg::AuxSize] = { 0 };
+    std::string TagName = "";
+    std::string VarName = DIGV.getName();
+    VarName = MAI->getGlobalPrefix() + VarName;
+    PopulateDebugInfo(Ty, TypeNo, HasAux, Aux, TagName);
+    // Emit debug info only if type information is available.
+    if (TypeNo != PIC16Dbg::T_NULL) {
+      O << "\n\t.type " << VarName << ", " << TypeNo;
+      short ClassNo = getStorageClass(DIGV);
+      O << "\n\t.class " << VarName << ", " << ClassNo;
+      if (HasAux) 
+        EmitAuxEntry(VarName, Aux, PIC16Dbg::AuxSize, TagName);
+    }
+  }
+  O << "\n";
+}
+
+/// SwitchToCU - Switch to a new compilation unit.
+/// Closes the previous file with .eof (if any) and emits a .file directive
+/// for the new one; resets line tracking.
+///
+void PIC16DbgInfo::SwitchToCU(MDNode *CU) {
+  // Get the file path from CU.
+  DICompileUnit cu(CU);
+  std::string DirName = cu.getDirectory();
+  std::string FileName = cu.getFilename();
+  std::string FilePath = DirName + "/" + FileName;
+
+  // Nothing to do if source file is still same.
+  if ( FilePath == CurFile ) return;
+
+  // Else, close the current one and start a new.
+  if (CurFile != "") O << "\n\t.eof";
+  O << "\n\t.file\t\"" << FilePath << "\"\n" ;
+  CurFile = FilePath;
+  CurLine = 0;
+}
+
+/// EmitEOF - Emit .eof for end of file.
+/// NOTE(review): this emits ".EOF" (upper case) while EndModule/SwitchToCU
+/// emit ".eof" — confirm which spelling the assembler expects.
+///
+void PIC16DbgInfo::EmitEOF() {
+  if (CurFile != "")
+    O << "\n\t.EOF";
+}
+
diff --git a/lib/Target/PIC16/PIC16DebugInfo.h b/lib/Target/PIC16/PIC16DebugInfo.h
new file mode 100644
index 0000000..54e27c7
--- /dev/null
+++ b/lib/Target/PIC16/PIC16DebugInfo.h
@@ -0,0 +1,157 @@
+//===-- PIC16DebugInfo.h - Interfaces for PIC16 Debug Information ============//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the helper functions for representing debug information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16DBG_H
+#define PIC16DBG_H
+
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/Module.h"
+
+namespace llvm {
+  class MachineFunction;
+  class DebugLoc;
+  // Constants used in the COFF-style debug directives (.type/.class/.dim)
+  // emitted by PIC16DbgInfo.
+  namespace PIC16Dbg {
+    // Fundamental type codes emitted via .type directives.
+    enum VarType {
+      T_NULL,
+      T_VOID,
+      T_CHAR,
+      T_SHORT,
+      T_INT,
+      T_LONG,
+      T_FLOAT,
+      T_DOUBLE,
+      T_STRUCT,
+      T_UNION,
+      T_ENUM,
+      T_MOE,
+      T_UCHAR,
+      T_USHORT,
+      T_UINT,
+      T_ULONG
+    };
+    // Derived-type codes packed alongside the base type in TypeNo.
+    enum DerivedType {
+      DT_NONE,
+      DT_PTR,
+      DT_FCN,
+      DT_ARY
+    };
+    // Bit widths used when packing a basic/derived type code into the
+    // 16-bit TypeNo.
+    enum TypeSize {
+      S_BASIC = 5,
+      S_DERIVED = 3
+    };
+    // Storage-class codes emitted via .class directives.
+    enum DbgClass {
+      C_NULL,
+      C_AUTO,
+      C_EXT,
+      C_STAT,
+      C_REG,
+      C_EXTDEF,
+      C_LABEL,
+      C_ULABEL,
+      C_MOS,
+      C_ARG,
+      C_STRTAG,
+      C_MOU,
+      C_UNTAG,
+      C_TPDEF,
+      C_USTATIC,
+      C_ENTAG,
+      C_MOE,
+      C_REGPARM,
+      C_FIELD,
+      C_AUTOARG,
+      C_LASTENT,
+      C_BLOCK = 100,
+      C_FCN,
+      C_EOS,
+      C_FILE,
+      C_LINE,
+      C_ALIAS,
+      C_HIDDEN,
+      C_EOF,
+      C_LIST,
+      C_SECTION,
+      C_EFCN = 255
+    };
+    // Number of bytes in one auxiliary (.dim) entry.
+    enum SymbolSize {
+      AuxSize =20
+    };
+  }
+
+  class formatted_raw_ostream;
+
+  // PIC16DbgInfo - Emits COFF-style debug directives (.file/.line/.def/
+  // .dim/.type/.class/.eof) into the assembly stream for the PIC16 target.
+  class PIC16DbgInfo {
+    formatted_raw_ostream &O;   // Assembly output stream.
+    const MCAsmInfo *MAI;       // Used for the global symbol prefix.
+    std::string CurFile;        // Path of the source file currently open.
+    unsigned CurLine;           // Last line number emitted via .line.
+
+    // EmitDebugDirectives is set if debug information is available. Default
+    // value for it is false.
+    bool EmitDebugDirectives;
+
+  public:
+    PIC16DbgInfo(formatted_raw_ostream &o, const MCAsmInfo *T)
+      : O(o), MAI(T) {
+      CurFile = "";
+      CurLine = 0;
+      EmitDebugDirectives = false; 
+    }
+
+    // Lifecycle hooks called by the asm printer.
+    void BeginModule (Module &M);
+    void BeginFunction (const MachineFunction &MF);
+    void ChangeDebugLoc (const MachineFunction &MF, const DebugLoc &DL,
+                         bool IsInBeginFunction = false);
+    void EndFunction (const MachineFunction &MF);
+    void EndModule (Module &M);
+
+
+    private:
+    void SwitchToCU (MDNode *CU);
+    void SwitchToLine (unsigned Line, bool IsInBeginFunction = false);
+
+    // Type-encoding helpers: pack DIType info into TypeNo/Aux/TagName.
+    void PopulateDebugInfo (DIType Ty, unsigned short &TypeNo, bool &HasAux,
+                           int Aux[], std::string &TypeName);
+    void PopulateBasicTypeInfo (DIType Ty, unsigned short &TypeNo);
+    void PopulateDerivedTypeInfo (DIType Ty, unsigned short &TypeNo, 
+                                  bool &HasAux, int Aux[],
+                                  std::string &TypeName);
+
+    void PopulateCompositeTypeInfo (DIType Ty, unsigned short &TypeNo,
+                                    bool &HasAux, int Aux[],
+                                    std::string &TypeName);
+    void PopulateArrayTypeInfo (DIType Ty, unsigned short &TypeNo,
+                                bool &HasAux, int Aux[],
+                                std::string &TypeName);
+
+    void PopulateStructOrUnionTypeInfo (DIType Ty, unsigned short &TypeNo,
+                                        bool &HasAux, int Aux[],
+                                        std::string &TypeName);
+    void PopulateEnumTypeInfo (DIType Ty, unsigned short &TypeNo);
+
+    unsigned GetTypeDebugNumber(std::string &Type);
+    short getStorageClass(DIGlobalVariable DIGV);
+    // Directive-emission helpers.
+    void EmitFunctBeginDI(const Function *F);
+    void EmitCompositeTypeDecls(Module &M);
+    void EmitCompositeTypeElements (DICompositeType CTy, std::string Suffix);
+    void EmitFunctEndDI(const Function *F, unsigned Line);
+    void EmitAuxEntry(const std::string VarName, int Aux[], 
+                      int num = PIC16Dbg::AuxSize, std::string TagName = "");
+    inline void EmitSymbol(std::string Name, short Class, 
+                           unsigned short Type = PIC16Dbg::T_NULL, 
+                           unsigned long Value = 0);
+    void EmitVarDebugInfo(Module &M);
+    void EmitEOF();
+  };
+} // end namespace llvm;
+#endif
diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp
new file mode 100644
index 0000000..82197ae
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.cpp
@@ -0,0 +1,60 @@
+//===-- PIC16ISelDAGToDAG.cpp - A dag to dag inst selector for PIC16 ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the PIC16 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-isel"
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "PIC16ISelDAGToDAG.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+/// createPIC16ISelDag - This pass converts a legalized DAG into a
+/// PIC16-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createPIC16ISelDag(PIC16TargetMachine &TM) {
+  // Ownership of the pass object transfers to the pass manager.
+  PIC16DAGToDAGISel *Selector = new PIC16DAGToDAGISel(TM);
+  return Selector;
+}
+
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void PIC16DAGToDAGISel::InstructionSelect() {
+  // Run the pattern matcher over the whole DAG, then prune any nodes
+  // that instruction selection left dead.
+  SelectionDAG &DAG = *CurDAG;
+  SelectRoot(DAG);
+  DAG.RemoveDeadNodes();
+}
+
+/// Select - Select instructions not customized! Used for
+/// expanded, promoted and normal instructions.
+SDNode* PIC16DAGToDAGISel::Select(SDNode *N) {
+  // No PIC16-specific custom selection is required; defer entirely to
+  // the TableGen'erated matcher.
+  return SelectCode(N);
+}
+
+
+// SelectDirectAddr - Match a direct address for DAG. 
+// A direct address could be a globaladdress or externalsymbol.
+bool PIC16DAGToDAGISel::SelectDirectAddr(SDNode *Op, SDValue N, 
+                                      SDValue &Address) {
+  // Only a target global address or a target external symbol counts as
+  // a direct address; everything else fails the match.
+  switch (N.getOpcode()) {
+  case ISD::TargetGlobalAddress:
+  case ISD::TargetExternalSymbol:
+    Address = N;
+    return true;
+  default:
+    return false;
+  }
+}
diff --git a/lib/Target/PIC16/PIC16ISelDAGToDAG.h b/lib/Target/PIC16/PIC16ISelDAGToDAG.h
new file mode 100644
index 0000000..813a540
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelDAGToDAG.h
@@ -0,0 +1,63 @@
+//===-- PIC16ISelDAGToDAG.h - A dag to dag inst selector for PIC16 --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the PIC16 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-isel"
+
+#include "PIC16.h"
+#include "PIC16ISelLowering.h"
+#include "PIC16RegisterInfo.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Intrinsics.h"
+using namespace llvm;
+
+namespace {
+
+// PIC16DAGToDAGISel - DAG-to-DAG instruction selector for the PIC16 target.
+// Wraps the TableGen'erated matcher (PIC16GenDAGISel.inc) and records
+// itself on the target lowering object so lowering code can reach it.
+class VISIBILITY_HIDDEN PIC16DAGToDAGISel : public SelectionDAGISel {
+
+  /// TM - Keep a reference to PIC16TargetMachine.
+  PIC16TargetMachine &TM;
+
+  /// PIC16Lowering - This object fully describes how to lower LLVM code to an
+  /// PIC16-specific SelectionDAG.
+  PIC16TargetLowering &PIC16Lowering;
+
+public:
+  explicit PIC16DAGToDAGISel(PIC16TargetMachine &tm) : 
+        SelectionDAGISel(tm),
+        TM(tm), PIC16Lowering(*TM.getTargetLowering()) { 
+    // Keep PIC16 specific DAGISel to use during the lowering
+    PIC16Lowering.ISel = this;
+  }
+  
+  // Pass Name
+  virtual const char *getPassName() const {
+    return "PIC16 DAG->DAG Pattern Instruction Selection";
+  } 
+
+  // Entry point invoked by SelectionDAGISel once the DAG is built.
+  virtual void InstructionSelect();
+  
+private:
+  // Include the pieces autogenerated from the target description.
+#include "PIC16GenDAGISel.inc"
+
+  // Select a single node; defined in PIC16ISelDAGToDAG.cpp.
+  SDNode *Select(SDNode *N);
+
+  // Match direct address complex pattern.
+  bool SelectDirectAddr(SDNode *Op, SDValue N, SDValue &Address);
+
+};
+
+}
+
diff --git a/lib/Target/PIC16/PIC16ISelLowering.cpp b/lib/Target/PIC16/PIC16ISelLowering.cpp
new file mode 100644
index 0000000..7754a4f
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelLowering.cpp
@@ -0,0 +1,1950 @@
+//===-- PIC16ISelLowering.cpp - PIC16 DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PIC16 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-lower"
+#include "PIC16ABINames.h"
+#include "PIC16ISelLowering.h"
+#include "PIC16TargetObjectFile.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Function.h"
+#include "llvm/CallingConv.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+
+
+using namespace llvm;
+
+// getIntrinsicName - Map a libcall opcode to the fully decorated PIC16
+// external-symbol name "<prefix><libcall-tag><basename>".  The returned
+// string is heap-allocated via createESName so it outlives this call.
+// NOTE(review): the switch mixes PIC16ISD::PIC16Libcall and RTLIB::Libcall
+// enumerators in one value space — confirm the listed cases cannot collide.
+static const char *getIntrinsicName(unsigned opcode) {
+  std::string Basename;
+  switch(opcode) {
+  default: llvm_unreachable("do not know intrinsic name");
+  // Arithmetic Right shift for integer types.
+  case PIC16ISD::SRA_I8: Basename = "sra.i8"; break;
+  case RTLIB::SRA_I16: Basename = "sra.i16"; break;
+  case RTLIB::SRA_I32: Basename = "sra.i32"; break;
+
+  // Left shift for integer types.
+  case PIC16ISD::SLL_I8: Basename = "sll.i8"; break;
+  case RTLIB::SHL_I16: Basename = "sll.i16"; break;
+  case RTLIB::SHL_I32: Basename = "sll.i32"; break;
+
+  // Logical Right Shift for integer types.
+  case PIC16ISD::SRL_I8: Basename = "srl.i8"; break;
+  case RTLIB::SRL_I16: Basename = "srl.i16"; break;
+  case RTLIB::SRL_I32: Basename = "srl.i32"; break;
+
+  // Multiply for integer types.
+  case PIC16ISD::MUL_I8: Basename = "mul.i8"; break;
+  case RTLIB::MUL_I16: Basename = "mul.i16"; break;
+  case RTLIB::MUL_I32: Basename = "mul.i32"; break;
+
+  // Signed division for integers.
+  case RTLIB::SDIV_I16: Basename = "sdiv.i16"; break;
+  case RTLIB::SDIV_I32: Basename = "sdiv.i32"; break;
+
+  // Unsigned division for integers.
+  case RTLIB::UDIV_I16: Basename = "udiv.i16"; break;
+  case RTLIB::UDIV_I32: Basename = "udiv.i32"; break;
+
+  // Signed modulus for integers.
+  case RTLIB::SREM_I16: Basename = "srem.i16"; break;
+  case RTLIB::SREM_I32: Basename = "srem.i32"; break;
+
+  // Unsigned modulus for integers.
+  case RTLIB::UREM_I16: Basename = "urem.i16"; break;
+  case RTLIB::UREM_I32: Basename = "urem.i32"; break;
+
+  //////////////////////
+  // LIBCALLS FOR FLOATS
+  //////////////////////
+
+  // Float to signed integrals. All three widths share the i32 helper
+  // because narrower results are taken from the i32 conversion.
+  case RTLIB::FPTOSINT_F32_I8: Basename = "f32_to_si32"; break;
+  case RTLIB::FPTOSINT_F32_I16: Basename = "f32_to_si32"; break;
+  case RTLIB::FPTOSINT_F32_I32: Basename = "f32_to_si32"; break;
+
+  // Signed integrals to float. char and int are first sign extended to i32 
+  // before being converted to float, so an I8_F32 or I16_F32 isn't required.
+  case RTLIB::SINTTOFP_I32_F32: Basename = "si32_to_f32"; break;
+
+  // Float to Unsigned conversions.
+  // Signed conversion can be used for unsigned conversion as well.
+  // In signed and unsigned versions only the interpretation of the 
+  // MSB is different. Bit representation remains the same. 
+  case RTLIB::FPTOUINT_F32_I8: Basename = "f32_to_si32"; break;
+  case RTLIB::FPTOUINT_F32_I16: Basename = "f32_to_si32"; break;
+  case RTLIB::FPTOUINT_F32_I32: Basename = "f32_to_si32"; break;
+
+  // Unsigned to Float conversions. char and int are first zero extended 
+  // before being converted to float.
+  case RTLIB::UINTTOFP_I32_F32: Basename = "ui32_to_f32"; break;
+               
+  // Floating point add, sub, mul, div.
+  case RTLIB::ADD_F32: Basename = "add.f32"; break;
+  case RTLIB::SUB_F32: Basename = "sub.f32"; break;
+  case RTLIB::MUL_F32: Basename = "mul.f32"; break;
+  case RTLIB::DIV_F32: Basename = "div.f32"; break;
+
+  // Floating point comparison. Both ordered and unordered checks share
+  // the "unordered" helper; callers invert via the libcall compare CC.
+  case RTLIB::O_F32: Basename = "unordered.f32"; break;
+  case RTLIB::UO_F32: Basename = "unordered.f32"; break;
+  case RTLIB::OLE_F32: Basename = "le.f32"; break;
+  case RTLIB::OGE_F32: Basename = "ge.f32"; break;
+  case RTLIB::OLT_F32: Basename = "lt.f32"; break;
+  case RTLIB::OGT_F32: Basename = "gt.f32"; break;
+  case RTLIB::OEQ_F32: Basename = "eq.f32"; break;
+  case RTLIB::UNE_F32: Basename = "neq.f32"; break;
+  }
+  
+  // Decorate with the PIC16 symbol prefix and the libcall section tag.
+  std::string prefix = PAN::getTagName(PAN::PREFIX_SYMBOL);
+  std::string tagname = PAN::getTagName(PAN::LIBCALL);
+  std::string Fullname = prefix + tagname + Basename; 
+
+  // The name has to live through program life.
+  return createESName(Fullname);
+}
+
+// getStdLibCallName - Get the name for the standard library function.
+// The result is the plain C library name decorated with the PIC16 symbol
+// prefix; it is heap-allocated via createESName so it lives forever.
+static const char *getStdLibCallName(unsigned opcode) {
+  std::string BaseName;
+  switch (opcode) {
+  case RTLIB::COS_F32:  BaseName = "cos";     break;
+  case RTLIB::SIN_F32:  BaseName = "sin";     break;
+  case RTLIB::MEMCPY:   BaseName = "memcpy";  break;
+  case RTLIB::MEMSET:   BaseName = "memset";  break;
+  case RTLIB::MEMMOVE:  BaseName = "memmove"; break;
+  default: llvm_unreachable("do not know std lib call name");
+  }
+
+  // Apply the target-wide symbol prefix.
+  std::string LibCallName = PAN::getTagName(PAN::PREFIX_SYMBOL) + BaseName;
+
+  // The name has to live through program life.
+  return createESName(LibCallName);
+}
+
+// PIC16TargetLowering Constructor.
+// Registers the single i8 register class, installs the names of every
+// runtime-library helper the 8-bit target relies on, and marks which DAG
+// operations need Custom lowering or Expand treatment.
+PIC16TargetLowering::PIC16TargetLowering(PIC16TargetMachine &TM)
+  : TargetLowering(TM, new PIC16TargetObjectFile()), TmpSize(0) {
+ 
+  Subtarget = &TM.getSubtarget<PIC16Subtarget>();
+
+  addRegisterClass(MVT::i8, PIC16::GPRRegisterClass);
+
+  setShiftAmountType(MVT::i8);
+  
+  // Std lib call names
+  setLibcallName(RTLIB::COS_F32, getStdLibCallName(RTLIB::COS_F32));
+  setLibcallName(RTLIB::SIN_F32, getStdLibCallName(RTLIB::SIN_F32));
+  setLibcallName(RTLIB::MEMCPY, getStdLibCallName(RTLIB::MEMCPY));
+  setLibcallName(RTLIB::MEMSET, getStdLibCallName(RTLIB::MEMSET));
+  setLibcallName(RTLIB::MEMMOVE, getStdLibCallName(RTLIB::MEMMOVE));
+
+  // SRA library call names
+  setPIC16LibcallName(PIC16ISD::SRA_I8, getIntrinsicName(PIC16ISD::SRA_I8));
+  setLibcallName(RTLIB::SRA_I16, getIntrinsicName(RTLIB::SRA_I16));
+  setLibcallName(RTLIB::SRA_I32, getIntrinsicName(RTLIB::SRA_I32));
+
+  // SHL library call names
+  setPIC16LibcallName(PIC16ISD::SLL_I8, getIntrinsicName(PIC16ISD::SLL_I8));
+  setLibcallName(RTLIB::SHL_I16, getIntrinsicName(RTLIB::SHL_I16));
+  setLibcallName(RTLIB::SHL_I32, getIntrinsicName(RTLIB::SHL_I32));
+
+  // SRL library call names
+  setPIC16LibcallName(PIC16ISD::SRL_I8, getIntrinsicName(PIC16ISD::SRL_I8));
+  setLibcallName(RTLIB::SRL_I16, getIntrinsicName(RTLIB::SRL_I16));
+  setLibcallName(RTLIB::SRL_I32, getIntrinsicName(RTLIB::SRL_I32));
+
+  // MUL Library call names
+  setPIC16LibcallName(PIC16ISD::MUL_I8, getIntrinsicName(PIC16ISD::MUL_I8));
+  setLibcallName(RTLIB::MUL_I16, getIntrinsicName(RTLIB::MUL_I16));
+  setLibcallName(RTLIB::MUL_I32, getIntrinsicName(RTLIB::MUL_I32));
+
+  // Signed division lib call names
+  setLibcallName(RTLIB::SDIV_I16, getIntrinsicName(RTLIB::SDIV_I16));
+  setLibcallName(RTLIB::SDIV_I32, getIntrinsicName(RTLIB::SDIV_I32));
+
+  // Unsigned division lib call names
+  setLibcallName(RTLIB::UDIV_I16, getIntrinsicName(RTLIB::UDIV_I16));
+  setLibcallName(RTLIB::UDIV_I32, getIntrinsicName(RTLIB::UDIV_I32));
+
+  // Signed remainder lib call names
+  setLibcallName(RTLIB::SREM_I16, getIntrinsicName(RTLIB::SREM_I16));
+  setLibcallName(RTLIB::SREM_I32, getIntrinsicName(RTLIB::SREM_I32));
+
+  // Unsigned remainder lib call names
+  setLibcallName(RTLIB::UREM_I16, getIntrinsicName(RTLIB::UREM_I16));
+  setLibcallName(RTLIB::UREM_I32, getIntrinsicName(RTLIB::UREM_I32));
+ 
+  // Floating point to signed int conversions.
+  setLibcallName(RTLIB::FPTOSINT_F32_I8, 
+                 getIntrinsicName(RTLIB::FPTOSINT_F32_I8));
+  setLibcallName(RTLIB::FPTOSINT_F32_I16, 
+                 getIntrinsicName(RTLIB::FPTOSINT_F32_I16));
+  setLibcallName(RTLIB::FPTOSINT_F32_I32, 
+                 getIntrinsicName(RTLIB::FPTOSINT_F32_I32));
+
+  // Signed int to floats.
+  setLibcallName(RTLIB::SINTTOFP_I32_F32, 
+                 getIntrinsicName(RTLIB::SINTTOFP_I32_F32));
+
+  // Floating points to unsigned ints.
+  setLibcallName(RTLIB::FPTOUINT_F32_I8, 
+                 getIntrinsicName(RTLIB::FPTOUINT_F32_I8));
+  setLibcallName(RTLIB::FPTOUINT_F32_I16, 
+                 getIntrinsicName(RTLIB::FPTOUINT_F32_I16));
+  setLibcallName(RTLIB::FPTOUINT_F32_I32, 
+                 getIntrinsicName(RTLIB::FPTOUINT_F32_I32));
+
+  // Unsigned int to floats.
+  setLibcallName(RTLIB::UINTTOFP_I32_F32, 
+                 getIntrinsicName(RTLIB::UINTTOFP_I32_F32));
+
+  // Floating point add, sub, mul, div.
+  setLibcallName(RTLIB::ADD_F32, getIntrinsicName(RTLIB::ADD_F32));
+  setLibcallName(RTLIB::SUB_F32, getIntrinsicName(RTLIB::SUB_F32));
+  setLibcallName(RTLIB::MUL_F32, getIntrinsicName(RTLIB::MUL_F32));
+  setLibcallName(RTLIB::DIV_F32, getIntrinsicName(RTLIB::DIV_F32));
+
+  // Floating point comparison
+  setLibcallName(RTLIB::O_F32, getIntrinsicName(RTLIB::O_F32));
+  setLibcallName(RTLIB::UO_F32, getIntrinsicName(RTLIB::UO_F32));
+  setLibcallName(RTLIB::OLE_F32, getIntrinsicName(RTLIB::OLE_F32));
+  setLibcallName(RTLIB::OGE_F32, getIntrinsicName(RTLIB::OGE_F32));
+  setLibcallName(RTLIB::OLT_F32, getIntrinsicName(RTLIB::OLT_F32));
+  setLibcallName(RTLIB::OGT_F32, getIntrinsicName(RTLIB::OGT_F32));
+  setLibcallName(RTLIB::OEQ_F32, getIntrinsicName(RTLIB::OEQ_F32));
+  setLibcallName(RTLIB::UNE_F32, getIntrinsicName(RTLIB::UNE_F32));
+
+  // Return value comparisons of floating point calls. 
+  setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
+  setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETNE);
+  setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
+  setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
+  setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
+  setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
+  setCmpLibcallCC(RTLIB::UO_F32, ISD::SETNE);
+  setCmpLibcallCC(RTLIB::O_F32, ISD::SETEQ);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
+  setOperationAction(ISD::ExternalSymbol, MVT::i16, Custom);
+
+  setOperationAction(ISD::LOAD,   MVT::i8,  Legal);
+  setOperationAction(ISD::LOAD,   MVT::i16, Custom);
+  setOperationAction(ISD::LOAD,   MVT::i32, Custom);
+
+  setOperationAction(ISD::STORE,  MVT::i8,  Legal);
+  setOperationAction(ISD::STORE,  MVT::i16, Custom);
+  setOperationAction(ISD::STORE,  MVT::i32, Custom);
+  setOperationAction(ISD::STORE,  MVT::i64, Custom);
+
+  setOperationAction(ISD::ADDE,    MVT::i8,  Custom);
+  setOperationAction(ISD::ADDC,    MVT::i8,  Custom);
+  setOperationAction(ISD::SUBE,    MVT::i8,  Custom);
+  setOperationAction(ISD::SUBC,    MVT::i8,  Custom);
+  setOperationAction(ISD::SUB,    MVT::i8,  Custom);
+  setOperationAction(ISD::ADD,    MVT::i8,  Custom);
+  setOperationAction(ISD::ADD,    MVT::i16, Custom);
+
+  setOperationAction(ISD::OR,     MVT::i8,  Custom);
+  setOperationAction(ISD::AND,    MVT::i8,  Custom);
+  setOperationAction(ISD::XOR,    MVT::i8,  Custom);
+
+  setOperationAction(ISD::FrameIndex, MVT::i16, Custom);
+
+  setOperationAction(ISD::MUL,    MVT::i8,  Custom);
+
+  // Multiply variants that PIC16 has no pattern for are expanded.
+  setOperationAction(ISD::SMUL_LOHI,    MVT::i8,  Expand);
+  setOperationAction(ISD::UMUL_LOHI,    MVT::i8,  Expand);
+  setOperationAction(ISD::MULHU,        MVT::i8, Expand);
+  setOperationAction(ISD::MULHS,        MVT::i8, Expand);
+
+  setOperationAction(ISD::SRA,    MVT::i8,  Custom);
+  setOperationAction(ISD::SHL,    MVT::i8,  Custom);
+  setOperationAction(ISD::SRL,    MVT::i8,  Custom);
+
+  setOperationAction(ISD::ROTL,    MVT::i8,  Expand);
+  setOperationAction(ISD::ROTR,    MVT::i8,  Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  // PIC16 does not support shift parts
+  setOperationAction(ISD::SRA_PARTS,    MVT::i8, Expand);
+  setOperationAction(ISD::SHL_PARTS,    MVT::i8, Expand);
+  setOperationAction(ISD::SRL_PARTS,    MVT::i8, Expand);
+
+
+  // PIC16 does not have a SETCC, expand it to SELECT_CC.
+  setOperationAction(ISD::SETCC,  MVT::i8, Expand);
+  setOperationAction(ISD::SELECT,  MVT::i8, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+
+  setOperationAction(ISD::SELECT_CC,  MVT::i8, Custom);
+  setOperationAction(ISD::BR_CC,  MVT::i8, Custom);
+
+  //setOperationAction(ISD::TRUNCATE, MVT::i16, Custom);
+  setTruncStoreAction(MVT::i16,   MVT::i8,  Custom);
+
+  // Now deduce the information based on the above mentioned 
+  // actions
+  computeRegisterProperties();
+}
+
+// getOutFlag - Extract the flag result if the Op has it.
+static SDValue getOutFlag(SDValue &Op) {
+  // By convention the flag, when present, is the node's last result.
+  SDNode *N = Op.getNode();
+  SDValue Flag = Op.getValue(N->getNumValues() - 1);
+
+  assert (Flag.getValueType() == MVT::Flag 
+          && "Node does not have an out Flag");
+
+  return Flag;
+}
+// GetTmpOffsetForFI - Return the byte offset inside the temporary area
+// assigned to frame index FI, allocating 'size' fresh bytes on first use.
+// Offsets are handed out sequentially and memoized in FiTmpOffsetMap.
+unsigned PIC16TargetLowering::GetTmpOffsetForFI(unsigned FI, unsigned size) {
+  std::map<unsigned, unsigned>::iterator 
+            MapIt = FiTmpOffsetMap.find(FI);
+  if (MapIt != FiTmpOffsetMap.end())
+      return MapIt->second;
+
+  // This FI (FrameIndex) is not yet mapped, so map it and grow the
+  // temporary area.  (Single insert; avoids a redundant second lookup.)
+  unsigned Offset = TmpSize;
+  FiTmpOffsetMap[FI] = Offset;
+  TmpSize += size;
+  return Offset;
+}
+
+// getChain - Extract the chain result from a node.  Centralizing the
+// extraction here keeps the convention in one maintainable place.
+static SDValue getChain(SDValue &Op) { 
+  SDNode *N = Op.getNode();
+  unsigned LastResult = N->getNumValues() - 1;
+
+  // The chain is the last result unless a flag follows it, in which
+  // case the chain is the second-to-last result.
+  SDValue Chain = Op.getValue(LastResult);
+  if (Chain.getValueType() == MVT::Flag)
+    Chain = Op.getValue(LastResult - 1);
+  
+  // Not every node produces a chain; verify this one did.
+  assert (Chain.getValueType() == MVT::Other 
+          && "Node does not have a chain");
+
+  return Chain;
+}
+
+/// PopulateResults - Helper function to LowerOperation.
+/// If a node wants to return multiple results after lowering,
+/// it stuffs them into an array of SDValue called Results.
+
+static void PopulateResults(SDValue N, SmallVectorImpl<SDValue>&Results) {
+  // Anything other than MERGE_VALUES contributes a single result.
+  if (N.getOpcode() != ISD::MERGE_VALUES) {
+    Results.push_back(N);
+    return;
+  }
+  // A MERGE_VALUES node carries one result per operand.
+  for (unsigned i = 0, e = N.getNumOperands(); i != e; ++i)
+    Results.push_back(N.getOperand(i));
+}
+
+// getSetCCResultType - PIC16 comparisons produce an i8 value regardless
+// of the operand type.
+MVT::SimpleValueType
+PIC16TargetLowering::getSetCCResultType(EVT ValType) const {
+  return MVT::i8;
+}
+
+// getCmpLibcallReturnType - Floating-point comparison libcalls on PIC16
+// return their result in an i8.
+MVT::SimpleValueType
+PIC16TargetLowering::getCmpLibcallReturnType() const {
+  return MVT::i8; 
+}
+
+/// The type legalizer framework of generating legalizer can generate libcalls
+/// only when the operand/result types are illegal.
+/// PIC16 needs to generate libcalls even for the legal types (i8) for some ops.
+/// For example an arithmetic right shift. These functions are used to lower
+/// such operations that generate libcall for legal types.
+
+void 
+PIC16TargetLowering::setPIC16LibcallName(PIC16ISD::PIC16Libcall Call,
+                                         const char *Name) {
+  // Record the external-symbol name to emit for this PIC16-only libcall.
+  PIC16LibcallNames[Call] = Name; 
+}
+
+// getPIC16LibcallName - Look up the name previously registered via
+// setPIC16LibcallName for this PIC16-only libcall.
+const char *
+PIC16TargetLowering::getPIC16LibcallName(PIC16ISD::PIC16Libcall Call) {
+  return PIC16LibcallNames[Call];
+}
+
+// MakePIC16Libcall - Emit a call to the PIC16-only libcall 'Call' with the
+// NumOps operands in Ops, returning the call's result value.  'isSigned'
+// selects sign- vs zero-extension of the arguments and return value.
+SDValue
+PIC16TargetLowering::MakePIC16Libcall(PIC16ISD::PIC16Libcall Call,
+                                      EVT RetVT, const SDValue *Ops,
+                                      unsigned NumOps, bool isSigned,
+                                      SelectionDAG &DAG, DebugLoc dl) {
+
+  // Build the argument list for LowerCallTo from the raw operand array.
+  TargetLowering::ArgListTy Args;
+  Args.reserve(NumOps);
+
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    Entry.Node = Ops[i];
+    Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+
+  // The callee is the registered external symbol for this libcall.
+  SDValue Callee = DAG.getExternalSymbol(getPIC16LibcallName(Call), MVT::i16);
+
+   const Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+   std::pair<SDValue,SDValue> CallInfo = 
+     LowerCallTo(DAG.getEntryNode(), RetTy, isSigned, !isSigned, false,
+                 false, 0, CallingConv::C, false,
+                 /*isReturnValueUsed=*/true,
+                 Callee, Args, DAG, dl,
+                 DAG.GetOrdering(DAG.getEntryNode().getNode()));
+
+  // first = the call's result value; second (the chain) is dropped here.
+  return CallInfo.first;
+}
+
+// getTargetNodeName - Return a readable name for a PIC16 target-specific
+// DAG opcode, or NULL for opcodes this target does not define.
+const char *PIC16TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default:                         return NULL;
+  case PIC16ISD::Lo:               return "PIC16ISD::Lo";
+  case PIC16ISD::Hi:               return "PIC16ISD::Hi";
+  case PIC16ISD::MTLO:             return "PIC16ISD::MTLO";
+  case PIC16ISD::MTHI:             return "PIC16ISD::MTHI";
+  case PIC16ISD::MTPCLATH:         return "PIC16ISD::MTPCLATH";
+  case PIC16ISD::PIC16Connect:     return "PIC16ISD::PIC16Connect";
+  case PIC16ISD::Banksel:          return "PIC16ISD::Banksel";
+  case PIC16ISD::PIC16Load:        return "PIC16ISD::PIC16Load";
+  case PIC16ISD::PIC16LdArg:       return "PIC16ISD::PIC16LdArg";
+  case PIC16ISD::PIC16LdWF:        return "PIC16ISD::PIC16LdWF";
+  case PIC16ISD::PIC16Store:       return "PIC16ISD::PIC16Store";
+  case PIC16ISD::PIC16StWF:        return "PIC16ISD::PIC16StWF";
+  case PIC16ISD::BCF:              return "PIC16ISD::BCF";
+  case PIC16ISD::LSLF:             return "PIC16ISD::LSLF";
+  case PIC16ISD::LRLF:             return "PIC16ISD::LRLF";
+  case PIC16ISD::RLF:              return "PIC16ISD::RLF";
+  case PIC16ISD::RRF:              return "PIC16ISD::RRF";
+  case PIC16ISD::CALL:             return "PIC16ISD::CALL";
+  case PIC16ISD::CALLW:            return "PIC16ISD::CALLW";
+  case PIC16ISD::SUBCC:            return "PIC16ISD::SUBCC";
+  case PIC16ISD::SELECT_ICC:       return "PIC16ISD::SELECT_ICC";
+  case PIC16ISD::BRCOND:           return "PIC16ISD::BRCOND";
+  case PIC16ISD::RET:              return "PIC16ISD::RET";
+  case PIC16ISD::Dummy:            return "PIC16ISD::Dummy";
+  }
+}
+
+// ReplaceNodeResults - Replace an illegal-typed node with PIC16-legal
+// pieces, pushing each replacement value into Results.
+void PIC16TargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>&Results,
+                                             SelectionDAG &DAG) {
+
+  switch (N->getOpcode()) {
+    case ISD::GlobalAddress:
+      Results.push_back(ExpandGlobalAddress(N, DAG));
+      return;
+    case ISD::ExternalSymbol:
+      Results.push_back(ExpandExternalSymbol(N, DAG));
+      return;
+    case ISD::STORE:
+      Results.push_back(ExpandStore(N, DAG));
+      return;
+    case ISD::LOAD:
+      // A load may expand to several values; let the helper populate them.
+      PopulateResults(ExpandLoad(N, DAG), Results);
+      return;
+    case ISD::ADD:
+      // Deliberately left unexpanded for now.
+      // Results.push_back(ExpandAdd(N, DAG));
+      return;
+    case ISD::FrameIndex:
+      Results.push_back(ExpandFrameIndex(N, DAG));
+      return;
+    default:
+      // Use llvm_unreachable (consistent with the rest of this file) so the
+      // failure is reported even in release builds, unlike assert(0).
+      llvm_unreachable("not implemented");
+  }
+}
+
+// ExpandFrameIndex - Expand an i16 FrameIndex into an (ExternalSymbol,
+// constant offset) pair wrapped in PIC16 Lo/Hi nodes.
+SDValue PIC16TargetLowering::ExpandFrameIndex(SDNode *N, SelectionDAG &DAG) {
+
+  // Currently handling FrameIndex of size MVT::i16 only
+  // One example of this scenario is when return value is written on
+  // FrameIndex#0
+
+  if (N->getValueType(0) != MVT::i16)
+    return SDValue();
+
+  // The caller dispatches on ISD::FrameIndex, so N is guaranteed to be a
+  // FrameIndexSDNode; cast<> asserts that instead of risking a null deref.
+  // (The unused MachineFunction/Function/Name locals were dead code.)
+  FrameIndexSDNode *FR = cast<FrameIndexSDNode>(SDValue(N,0));
+  // FIXME there isn't really debug info here
+  DebugLoc dl = FR->getDebugLoc();
+
+  // Expand FrameIndex like GlobalAddress and ExternalSymbol
+  // Also use Offset field for lo and hi parts. The default 
+  // offset is zero.
+
+  SDValue ES;
+  int FrameOffset;
+  SDValue FI = SDValue(N,0);
+  LegalizeFrameIndex(FI, DAG, ES, FrameOffset);
+  SDValue Offset = DAG.getConstant(FrameOffset, MVT::i8);
+  SDValue Lo = DAG.getNode(PIC16ISD::Lo, dl, MVT::i8, ES, Offset);
+  SDValue Hi = DAG.getNode(PIC16ISD::Hi, dl, MVT::i8, ES, Offset);
+  return DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0), Lo, Hi);
+}
+
+
+// ExpandStore - Split an i16/i32/i64 store into byte-wide PIC16Store nodes
+// (or, for i64, two halved generic stores), token-factoring their chains.
+// An i8 store maps directly to a single PIC16Store.
+SDValue PIC16TargetLowering::ExpandStore(SDNode *N, SelectionDAG &DAG) { 
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  SDValue Chain = St->getChain();
+  SDValue Src = St->getValue();
+  SDValue Ptr = St->getBasePtr();
+  EVT ValueType = Src.getValueType();
+  unsigned StoreOffset = 0;
+  DebugLoc dl = N->getDebugLoc();
+
+  // Break the pointer into PIC16 lo/hi address bytes plus a byte offset.
+  SDValue PtrLo, PtrHi;
+  LegalizeAddress(Ptr, DAG, PtrLo, PtrHi, StoreOffset, dl);
+ 
+  if (ValueType == MVT::i8) {
+    return DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other, Chain, Src,
+                        PtrLo, PtrHi, 
+                        DAG.getConstant (0 + StoreOffset, MVT::i8));
+  }
+  else if (ValueType == MVT::i16) {
+    // Get the Lo and Hi parts from MERGE_VALUE or BUILD_PAIR.
+    SDValue SrcLo, SrcHi;
+    GetExpandedParts(Src, DAG, SrcLo, SrcHi);
+    SDValue ChainLo = Chain, ChainHi = Chain;
+    // FIXME: This makes unsafe assumptions. The Chain may be a TokenFactor
+    // created for an unrelated purpose, in which case it may not have
+    // exactly two operands. Also, even if it does have two operands, they
+    // may not be the low and high parts of an aligned load that was split.
+    if (Chain.getOpcode() == ISD::TokenFactor) {
+      ChainLo = Chain.getOperand(0);
+      ChainHi = Chain.getOperand(1);
+    }
+    // Store each byte at consecutive offsets and merge the two chains.
+    SDValue Store1 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other,
+                                 ChainLo,
+                                 SrcLo, PtrLo, PtrHi,
+                                 DAG.getConstant (0 + StoreOffset, MVT::i8));
+
+    SDValue Store2 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainHi, 
+                                 SrcHi, PtrLo, PtrHi,
+                                 DAG.getConstant (1 + StoreOffset, MVT::i8));
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, getChain(Store1),
+                       getChain(Store2));
+  }
+  else if (ValueType == MVT::i32) {
+    // Get the Lo and Hi parts from MERGE_VALUE or BUILD_PAIR.
+    SDValue SrcLo, SrcHi;
+    GetExpandedParts(Src, DAG, SrcLo, SrcHi);
+
+    // Get the expanded parts of each of SrcLo and SrcHi.
+    SDValue SrcLo1, SrcLo2, SrcHi1, SrcHi2;
+    GetExpandedParts(SrcLo, DAG, SrcLo1, SrcLo2);
+    GetExpandedParts(SrcHi, DAG, SrcHi1, SrcHi2);
+
+    SDValue ChainLo = Chain, ChainHi = Chain;
+    // FIXME: This makes unsafe assumptions; see the FIXME above.
+    if (Chain.getOpcode() == ISD::TokenFactor) {  
+      ChainLo = Chain.getOperand(0);
+      ChainHi = Chain.getOperand(1);
+    }
+    SDValue ChainLo1 = ChainLo, ChainLo2 = ChainLo, ChainHi1 = ChainHi,
+            ChainHi2 = ChainHi;
+    // FIXME: This makes unsafe assumptions; see the FIXME above.
+    if (ChainLo.getOpcode() == ISD::TokenFactor) {
+      ChainLo1 = ChainLo.getOperand(0);
+      ChainLo2 = ChainLo.getOperand(1);
+    }
+    // FIXME: This makes unsafe assumptions; see the FIXME above.
+    if (ChainHi.getOpcode() == ISD::TokenFactor) {
+      ChainHi1 = ChainHi.getOperand(0);
+      ChainHi2 = ChainHi.getOperand(1);
+    }
+    // Four byte stores at offsets 0..3, then token-factor all chains.
+    SDValue Store1 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other,
+                                 ChainLo1,
+                                 SrcLo1, PtrLo, PtrHi,
+                                 DAG.getConstant (0 + StoreOffset, MVT::i8));
+
+    SDValue Store2 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainLo2,
+                                 SrcLo2, PtrLo, PtrHi,
+                                 DAG.getConstant (1 + StoreOffset, MVT::i8));
+
+    SDValue Store3 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainHi1,
+                                 SrcHi1, PtrLo, PtrHi,
+                                 DAG.getConstant (2 + StoreOffset, MVT::i8));
+
+    SDValue Store4 = DAG.getNode(PIC16ISD::PIC16Store, dl, MVT::Other, ChainHi2,
+                                 SrcHi2, PtrLo, PtrHi,
+                                 DAG.getConstant (3 + StoreOffset, MVT::i8));
+
+    SDValue RetLo =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 
+                                 getChain(Store1), getChain(Store2));
+    SDValue RetHi =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 
+                                 getChain(Store3), getChain(Store4));
+    return  DAG.getNode(ISD::TokenFactor, dl, MVT::Other, RetLo, RetHi);
+
+  } else if (ValueType == MVT::i64) {
+    SDValue SrcLo, SrcHi;
+    GetExpandedParts(Src, DAG, SrcLo, SrcHi);
+    SDValue ChainLo = Chain, ChainHi = Chain;
+    // FIXME: This makes unsafe assumptions; see the FIXME above.
+    if (Chain.getOpcode() == ISD::TokenFactor) {
+      ChainLo = Chain.getOperand(0);
+      ChainHi = Chain.getOperand(1);
+    }
+    SDValue Store1 = DAG.getStore(ChainLo, dl, SrcLo, Ptr, NULL,
+                                  0 + StoreOffset);
+
+    // NOTE(review): the pointer is advanced by 4 bytes but the second
+    // store's SrcValue offset is 1 + StoreOffset, not 4 + StoreOffset —
+    // confirm this mismatch is intended (it only affects alias info).
+    Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(4, Ptr.getValueType()));
+    SDValue Store2 = DAG.getStore(ChainHi, dl, SrcHi, Ptr, NULL,
+                                  1 + StoreOffset);
+
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1,
+                       Store2);
+  } else {
+    assert (0 && "value type not supported");
+    return SDValue();
+  }
+}
+
+// ExpandExternalSymbol - Expand an ExternalSymbol node into an i16
+// BUILD_PAIR of PIC16 Lo/Hi address bytes.
+SDValue PIC16TargetLowering::ExpandExternalSymbol(SDNode *N, SelectionDAG &DAG)
+{
+  // The caller dispatches on ISD::ExternalSymbol and the result is
+  // dereferenced unconditionally below, so use cast<> (which asserts)
+  // rather than dyn_cast<> (whose null result would be dereferenced).
+  ExternalSymbolSDNode *ES = cast<ExternalSymbolSDNode>(SDValue(N, 0));
+  // FIXME there isn't really debug info here
+  DebugLoc dl = ES->getDebugLoc();
+
+  SDValue TES = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i8);
+  SDValue Offset = DAG.getConstant(0, MVT::i8);
+  SDValue Lo = DAG.getNode(PIC16ISD::Lo, dl, MVT::i8, TES, Offset);
+  SDValue Hi = DAG.getNode(PIC16ISD::Hi, dl, MVT::i8, TES, Offset);
+
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16, Lo, Hi);
+}
+
+// ExpandGlobalAddress - Expand a GlobalAddress node into an i16
+// BUILD_PAIR of PIC16 Lo/Hi address bytes, preserving its offset.
+SDValue PIC16TargetLowering::ExpandGlobalAddress(SDNode *N, SelectionDAG &DAG) {
+  // The caller dispatches on ISD::GlobalAddress and the result is
+  // dereferenced unconditionally below, so use cast<> (which asserts)
+  // rather than dyn_cast<> (whose null result would be dereferenced).
+  GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(SDValue(N, 0));
+  // FIXME there isn't really debug info here
+  DebugLoc dl = G->getDebugLoc();
+  
+  SDValue TGA = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i8,
+                                           G->getOffset());
+
+  SDValue Offset = DAG.getConstant(0, MVT::i8);
+  SDValue Lo = DAG.getNode(PIC16ISD::Lo, dl, MVT::i8, TGA, Offset);
+  SDValue Hi = DAG.getNode(PIC16ISD::Hi, dl, MVT::i8, TGA, Offset);
+
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16, Lo, Hi);
+}
+
+// isDirectAddress - A direct address is a BUILD_PAIR whose low half is a
+// PIC16 Lo node (i.e. an expanded global address or external symbol).
+bool PIC16TargetLowering::isDirectAddress(const SDValue &Op) {
+  assert (Op.getNode() != NULL && "Can't operate on NULL SDNode!!");
+
+  return Op.getOpcode() == ISD::BUILD_PAIR &&
+         Op.getOperand(0).getOpcode() == PIC16ISD::Lo;
+}
+
+// Return true if DirectAddress is in ROM_SPACE
+bool PIC16TargetLowering::isRomAddress(const SDValue &Op) {
+
+  // RomAddress is a GlobalAddress in ROM_SPACE_
+  // If the Op is not a GlobalAddress return NULL without checking
+  // anything further.
+  if (!isDirectAddress(Op))
+    return false; 
+
+  // Op is BUILD_PAIR((PIC16Lo TGA), (PIC16Hi TGA)); dig out the address
+  // operand of the Lo node.  That operand may be an ExternalSymbol rather
+  // than a GlobalAddress, so the dyn_cast result must be null-checked
+  // before it is dereferenced.
+  SDValue TGA = Op.getOperand(0).getOperand(0);
+  GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(TGA);
+  if (!GSDN)
+    return false;
+
+  if (GSDN->getAddressSpace() == PIC16ISD::ROM_SPACE)
+    return true;
+
+  // Any other address space return it false
+  return false;
+}
+
+
+// GetExpandedParts - Split Op into its lo and hi components, along the
+// same lines as GetExpandedInteger in the type legalizer.
+void PIC16TargetLowering::GetExpandedParts(SDValue Op, SelectionDAG &DAG,
+                                           SDValue &Lo, SDValue &Hi) {  
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+
+  // The halves have the type this value legalizes to.
+  EVT HalfVT = getTypeToTransformTo(*DAG.getContext(), Node->getValueType(0));
+
+  // EXTRACT_ELEMENT index 0 selects the low part, index 1 the high part.
+  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfVT, Op,
+                   DAG.getConstant(0, MVT::i8));
+  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfVT, Op,
+                   DAG.getConstant(1, MVT::i8));
+}
+
+// Legalize FrameIndex into ExternalSymbol and offset.
+void 
+PIC16TargetLowering::LegalizeFrameIndex(SDValue Op, SelectionDAG &DAG,
+                                        SDValue &ES, int &Offset) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *Func = MF.getFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const std::string Name = Func->getName();
+
+  // Op is known to be a TargetFrameIndex here; cast<> asserts that
+  // instead of dereferencing an unchecked dyn_cast<> result.
+  FrameIndexSDNode *FR = cast<FrameIndexSDNode>(Op);
+
+  // FrameIndices are not stack offsets. But they represent the request
+  // for space on stack. That space requested may be more than one byte. 
+  // Therefore, to calculate the stack offset that a FrameIndex aligns
+  // with, we need to traverse all the FrameIndices available earlier in 
+  // the list and add their requested size.
+  unsigned FIndex = FR->getIndex();
+  const char *tmpName;
+  if (FIndex < ReservedFrameCount) {
+    // Reserved indices live on the function's own frame: the offset is
+    // the sum of the sizes of all earlier frame objects.
+    tmpName = createESName(PAN::getFrameLabel(Name));
+    ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+    Offset = 0;
+    for (unsigned i=0; i<FIndex ; ++i) {
+      Offset += MFI->getObjectSize(i);
+    }
+  } else {
+    // FrameIndex has been made for some temporary storage; place it in
+    // the function's tempdata section instead.
+    tmpName = createESName(PAN::getTempdataLabel(Name));
+    ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+    Offset = GetTmpOffsetForFI(FIndex, MFI->getObjectSize(FIndex));
+  }
+}
+
+// This function legalizes the PIC16 Addresses. If the Pointer is  
+//  -- Direct address variable residing 
+//     --> then a Banksel for that variable will be created.
+//  -- Rom variable            
+//     --> then it will be treated as an indirect address.
+//  -- Indirect address 
+//     --> then the address will be loaded into FSR
+//  -- ADD with constant operand
+//     --> then constant operand of ADD will be returned as Offset
+//         and non-constant operand of ADD will be treated as pointer.
+// Returns the high and lo part of the address, and the offset(in case of ADD).
+
+void PIC16TargetLowering::LegalizeAddress(SDValue Ptr, SelectionDAG &DAG, 
+                                          SDValue &Lo, SDValue &Hi,
+                                          unsigned &Offset, DebugLoc dl) {
+
+  // Offset, by default, should be 0
+  Offset = 0;
+
+  // If the pointer is ADD with constant,
+  // return the constant value as the offset.
+  // Only small constants (< 32) are folded into Offset; larger ones are
+  // left inside the pointer computation. Note: Ptr is a by-value copy,
+  // so rewriting it here does not affect the caller's node.
+  if (Ptr.getOpcode() == ISD::ADD) {
+    SDValue OperLeft = Ptr.getOperand(0);
+    SDValue OperRight = Ptr.getOperand(1);
+    if ((OperLeft.getOpcode() == ISD::Constant) &&
+        (dyn_cast<ConstantSDNode>(OperLeft)->getZExtValue() < 32 )) {
+      Offset = dyn_cast<ConstantSDNode>(OperLeft)->getZExtValue();
+      Ptr = OperRight;
+    } else if ((OperRight.getOpcode() == ISD::Constant)  &&
+               (dyn_cast<ConstantSDNode>(OperRight)->getZExtValue() < 32 )){
+      Offset = dyn_cast<ConstantSDNode>(OperRight)->getZExtValue();
+      Ptr = OperLeft;
+    }
+  }
+
+  // If the pointer is Type i8 and an external symbol
+  // then treat it as direct address.
+  // One example for such case is storing and loading
+  // from function frame during a call
+  if (Ptr.getValueType() == MVT::i8) {
+    switch (Ptr.getOpcode()) {
+    case ISD::TargetExternalSymbol:
+      // Lo is the symbol itself; Hi == 1 requests a banksel for it.
+      Lo = Ptr;
+      Hi = DAG.getConstant(1, MVT::i8);
+      return;
+    }
+  }
+
+  // Expansion of FrameIndex has Lo/Hi parts
+  if (isDirectAddress(Ptr)) { 
+      SDValue TFI = Ptr.getOperand(0).getOperand(0); 
+      int FrameOffset;
+      if (TFI.getOpcode() == ISD::TargetFrameIndex) {
+        // Convert the frame index to an external symbol plus an offset,
+        // and fold that offset into the running Offset.
+        LegalizeFrameIndex(TFI, DAG, Lo, FrameOffset);
+        Hi = DAG.getConstant(1, MVT::i8);
+        Offset += FrameOffset; 
+        return;
+      } else if (TFI.getOpcode() == ISD::TargetExternalSymbol) {
+        // FrameIndex has already been expanded.
+        // Now just make use of its expansion
+        Lo = TFI;
+        Hi = DAG.getConstant(1, MVT::i8);
+        // The second operand of PIC16ISD::Lo carries the byte offset.
+        SDValue FOffset = Ptr.getOperand(0).getOperand(1);
+        assert (FOffset.getOpcode() == ISD::Constant && 
+                          "Invalid operand of PIC16ISD::Lo");
+        Offset += dyn_cast<ConstantSDNode>(FOffset)->getZExtValue();
+        return;
+      }
+  }
+
+  if (isDirectAddress(Ptr) && !isRomAddress(Ptr)) {
+    // Direct addressing case for RAM variables. The Hi part is constant
+    // and the Lo part is the TGA itself.
+    Lo = Ptr.getOperand(0).getOperand(0);
+
+    // For direct addresses Hi is a constant. Value 1 for the constant
+    // signifies that banksel needs to generated for it. Value 0 for
+    // the constant signifies that banksel does not need to be generated 
+    // for it. Mark it as 1 now and optimize later. 
+    Hi = DAG.getConstant(1, MVT::i8);
+    return; 
+  }
+
+  // Indirect addresses (including ROM addresses, which fall through the
+  // check above). Get the hi and lo parts of ptr. 
+  GetExpandedParts(Ptr, DAG, Lo, Hi);
+
+  // Put the hi and lo parts into FSR.
+  Lo = DAG.getNode(PIC16ISD::MTLO, dl, MVT::i8, Lo);
+  Hi = DAG.getNode(PIC16ISD::MTHI, dl, MVT::i8, Hi);
+
+  return;
+}
+
+// ExpandLoad - Expand an illegal-typed LOAD into a series of byte-wide
+// PIC16Load nodes and recombine the bytes with BUILD_PAIRs, merging the
+// byte-load chains with TokenFactor nodes.
+SDValue PIC16TargetLowering::ExpandLoad(SDNode *N, SelectionDAG &DAG) {
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(SDValue(N, 0));
+  SDValue Chain = LD->getChain();
+  SDValue Ptr = LD->getBasePtr();
+  DebugLoc dl = LD->getDebugLoc();
+
+  SDValue Load, Offset;
+  SDVTList Tys; 
+  EVT VT, NewVT;
+  SDValue PtrLo, PtrHi;
+  unsigned LoadOffset;
+
+  // Legalize direct/indirect addresses. This will give the lo and hi parts
+  // of the address and the offset.
+  LegalizeAddress(Ptr, DAG, PtrLo, PtrHi, LoadOffset, dl);
+
+  // Load from the pointer (direct address or FSR), one byte per load.
+  VT = N->getValueType(0);
+  unsigned NumLoads = VT.getSizeInBits() / 8; 
+  std::vector<SDValue> PICLoads;
+  unsigned iter;
+  // NOTE(review): this MemVT is shadowed by an identical declaration in
+  // the extended-load branch below; both are LD->getMemoryVT().
+  EVT MemVT = LD->getMemoryVT();
+  if(ISD::isNON_EXTLoad(N)) {
+    // Plain load: one PIC16Load per byte of the result type.
+    for (iter=0; iter<NumLoads ; ++iter) {
+      // Add the pointer offset if any
+      Offset = DAG.getConstant(iter + LoadOffset, MVT::i8);
+      Tys = DAG.getVTList(MVT::i8, MVT::Other); 
+      Load = DAG.getNode(PIC16ISD::PIC16Load, dl, Tys, Chain, PtrLo, PtrHi,
+                         Offset); 
+      PICLoads.push_back(Load);
+    }
+  } else {
+    // If it is extended load then use PIC16Load for Memory Bytes
+    // and for all extended bytes perform action based on type of
+    // extention - i.e. SignExtendedLoad or ZeroExtendedLoad
+
+    
+    // For extended loads this is the memory value type
+    // i.e. without any extension
+    EVT MemVT = LD->getMemoryVT();
+    unsigned MemBytes = MemVT.getSizeInBits() / 8;
+    // if MVT::i1 is extended to MVT::i8 then MemBytes will be zero
+    // So set it to one
+    if (MemBytes == 0) MemBytes = 1;
+    
+    unsigned ExtdBytes = VT.getSizeInBits() / 8;
+    Offset = DAG.getConstant(LoadOffset, MVT::i8);
+
+    Tys = DAG.getVTList(MVT::i8, MVT::Other); 
+    // For MemBytes generate PIC16Load with proper offset
+    for (iter=0; iter < MemBytes; ++iter) {
+      // Add the pointer offset if any
+      Offset = DAG.getConstant(iter + LoadOffset, MVT::i8);
+      Load = DAG.getNode(PIC16ISD::PIC16Load, dl, Tys, Chain, PtrLo, PtrHi,
+                         Offset); 
+      PICLoads.push_back(Load);
+    }
+
+    // For SignExtendedLoad
+    if (ISD::isSEXTLoad(N)) {
+      // For all ExtdBytes use the Right Shifted(Arithmetic) Value of the 
+      // highest MemByte (SRA by 7 replicates the sign bit across the byte).
+      SDValue SRA = DAG.getNode(ISD::SRA, dl, MVT::i8, Load, 
+                                DAG.getConstant(7, MVT::i8));
+      for (iter=MemBytes; iter<ExtdBytes; ++iter) { 
+        PICLoads.push_back(SRA);
+      }
+    } else if (ISD::isZEXTLoad(N) || ISD::isEXTLoad(N)) {
+    //} else if (ISD::isZEXTLoad(N)) {
+      // ZeroExtendedLoad -- For all ExtdBytes use constant 0
+      SDValue ConstZero = DAG.getConstant(0, MVT::i8);
+      for (iter=MemBytes; iter<ExtdBytes; ++iter) { 
+        PICLoads.push_back(ConstZero);
+      }
+    }
+  }
+  SDValue BP;
+
+  // Recombine the byte loads into a value of the original type, and
+  // merge the chains of the loads that actually touched memory.
+  if (VT == MVT::i8) {
+    // Operand of Load is illegal -- Load itself is legal
+    return PICLoads[0];
+  }
+  else if (VT == MVT::i16) {
+    BP = DAG.getNode(ISD::BUILD_PAIR, dl, VT, PICLoads[0], PICLoads[1]);
+    // Only one real memory access happened for i8/i1 memory types.
+    if ((MemVT == MVT::i8) || (MemVT == MVT::i1))
+      Chain = getChain(PICLoads[0]);
+    else
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 
+                          getChain(PICLoads[0]), getChain(PICLoads[1]));
+  } else if (VT == MVT::i32) {
+    SDValue BPs[2];
+    BPs[0] = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16, 
+                         PICLoads[0], PICLoads[1]);
+    BPs[1] = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i16,
+                         PICLoads[2], PICLoads[3]);
+    BP = DAG.getNode(ISD::BUILD_PAIR, dl, VT, BPs[0], BPs[1]);
+    if ((MemVT == MVT::i8) || (MemVT == MVT::i1))
+      Chain = getChain(PICLoads[0]);
+    else if (MemVT == MVT::i16)
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 
+                          getChain(PICLoads[0]), getChain(PICLoads[1]));
+    else {
+      // Full i32 memory access: merge all four load chains pairwise.
+      SDValue Chains[2];
+      Chains[0] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                              getChain(PICLoads[0]), getChain(PICLoads[1]));
+      Chains[1] = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                              getChain(PICLoads[2]), getChain(PICLoads[3]));
+      Chain =  DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                           Chains[0], Chains[1]);
+    }
+  }
+  // Return the value together with the merged chain.
+  Tys = DAG.getVTList(VT, MVT::Other); 
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, BP, Chain);
+}
+
+// LowerShift - Lower an i8 shift to the corresponding PIC16 libcall.
+SDValue PIC16TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
+  // We should have handled larger operands in type legalizer itself.
+  assert (Op.getValueType() == MVT::i8 && "illegal shift to lower");
+ 
+  SDNode *Node = Op.getNode();
+
+  // Map the shift opcode onto its i8 libcall.
+  PIC16ISD::PIC16Libcall CallCode;
+  switch (Node->getOpcode()) {
+  case ISD::SRA: CallCode = PIC16ISD::SRA_I8; break;
+  case ISD::SHL: CallCode = PIC16ISD::SLL_I8; break;
+  case ISD::SRL: CallCode = PIC16ISD::SRL_I8; break;
+  default:
+    assert ( 0 && "This shift is not implemented yet.");
+    return SDValue();
+  }
+
+  // The libcall takes (value, shift amount).
+  SmallVector<SDValue, 2> Args(2);
+  Args[0] = Node->getOperand(0);
+  Args[1] = Node->getOperand(1);
+  return MakePIC16Libcall(CallCode, Node->getValueType(0), &Args[0], 2,
+                          true, DAG, Node->getDebugLoc());
+}
+
+// LowerMUL - Lower an i8 multiply to the MUL_I8 libcall; PIC16 has no
+// multiply instruction.
+SDValue PIC16TargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  // We should have handled larger operands in type legalizer itself.
+  assert (Op.getValueType() == MVT::i8 && "illegal multiply to lower");
+
+  SDNode *Node = Op.getNode();
+
+  // The libcall takes the two i8 factors.
+  SmallVector<SDValue, 2> Args(2);
+  Args[0] = Node->getOperand(0);
+  Args[1] = Node->getOperand(1);
+  return MakePIC16Libcall(PIC16ISD::MUL_I8, Node->getValueType(0),
+                          &Args[0], 2, true, DAG, Node->getDebugLoc());
+}
+
+// LowerOperationWrapper - Entry point used by the legalizer to replace
+// the results of node N. LOAD expands to multiple result values; all
+// other opcodes are forwarded to LowerOperation.
+void
+PIC16TargetLowering::LowerOperationWrapper(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) {
+  SDValue Op = SDValue(N, 0);
+  SDValue Res;
+
+  if (Op.getOpcode() == ISD::LOAD) {
+    Res = ExpandLoad(Op.getNode(), DAG);
+  } else {
+    // All other operations are handled in LowerOperation.
+    Res = LowerOperation(Op, DAG);
+    if (Res.getNode())
+      Results.push_back(Res);
+    return;
+  }
+
+  // Report every result value of the expanded load node.
+  SDNode *ResNode = Res.getNode();
+  for (unsigned i = 0, e = ResNode->getNumValues(); i != e; ++i)
+    Results.push_back(SDValue(ResNode, i));
+}
+
+// LowerOperation - Dispatch each custom-lowered opcode to its handler;
+// anything not listed is returned untouched as a null SDValue.
+SDValue PIC16TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  case ISD::ADD:
+  case ISD::ADDC:
+  case ISD::ADDE:      return LowerADD(Op, DAG);
+  case ISD::SUB:
+  case ISD::SUBC:
+  case ISD::SUBE:      return LowerSUB(Op, DAG);
+  case ISD::LOAD:      return ExpandLoad(Op.getNode(), DAG);
+  case ISD::STORE:     return ExpandStore(Op.getNode(), DAG);
+  case ISD::MUL:       return LowerMUL(Op, DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:       return LowerShift(Op, DAG);
+  case ISD::OR:
+  case ISD::AND:
+  case ISD::XOR:       return LowerBinOp(Op, DAG);
+  case ISD::BR_CC:     return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  default:             return SDValue();
+  }
+}
+
+// ConvertToMemOperand - Spill an i8 value to a one-byte stack slot and
+// reload it, so an instruction that needs a memory operand can use the
+// reloaded value.
+SDValue PIC16TargetLowering::ConvertToMemOperand(SDValue Op,
+                                                 SelectionDAG &DAG,
+                                                 DebugLoc dl) {
+  assert (Op.getValueType() == MVT::i8 
+          && "illegal value type to store on stack.");
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const std::string FuncName = MF.getFunction()->getName();
+
+  // Grab a fresh one-byte stack slot, addressed through the function's
+  // tempdata label.
+  int FI = MF.getFrameInfo()->CreateStackObject(1, 1, false);
+  const char *TmpLabel = createESName(PAN::getTempdataLabel(FuncName));
+  SDValue ES = DAG.getTargetExternalSymbol(TmpLabel, MVT::i8);
+
+  // Spill the value into the slot. The banksel operand is 1 (emit a
+  // banksel; it may be optimized away later).
+  SDValue Store = DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other,
+                               DAG.getEntryNode(), Op, ES,
+                               DAG.getConstant (1, MVT::i8),
+                               DAG.getConstant (GetTmpOffsetForFI(FI, 1),
+                                                MVT::i8));
+
+  // Reload the value, chained after the store.
+  SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other);
+  SDValue Load = DAG.getNode(PIC16ISD::PIC16Load, dl, Tys, Store, ES,
+                             DAG.getConstant (1, MVT::i8),
+                             DAG.getConstant (GetTmpOffsetForFI(FI, 1),
+                                              MVT::i8));
+
+  return Load.getValue(0);
+}
+
+// LowerIndirectCallArguments - Store each outgoing argument byte onto
+// the callee's frame through its data address, chaining the stores via
+// the flag.
+SDValue PIC16TargetLowering::
+LowerIndirectCallArguments(SDValue Chain, SDValue InFlag,
+                           SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG) {
+  unsigned NumOps = Outs.size();
+
+  // Nothing to store if the call takes no arguments.
+  if (NumOps == 0)
+    return Chain;
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+
+  // In the PIC16 ABI the arguments are laid out on the callee's frame
+  // right after the slots for the return value.
+  unsigned RetVals = Ins.size();
+  for (unsigned i = 0; i < NumOps; i++) {
+    SDValue Arg = Outs[i].Val;
+
+    SDValue StoreOps[] = { Chain, Arg, DataAddr_Lo, DataAddr_Hi,
+                           DAG.getConstant(RetVals + i, MVT::i8), InFlag };
+    SDValue StoreRet = DAG.getNode (PIC16ISD::PIC16StWF, dl, Tys,
+                                    StoreOps, 6);
+
+    Chain = getChain(StoreRet);
+    InFlag = getOutFlag(StoreRet);
+  }
+  return Chain;
+}
+
+// LowerDirectCallArguments - Store each outgoing argument onto the
+// callee's argument frame (located by ArgLabel), chaining the stores
+// via the flag. Removed the unused locals (Name, StoreAt) the original
+// declared.
+SDValue PIC16TargetLowering::
+LowerDirectCallArguments(SDValue ArgLabel, SDValue Chain, SDValue InFlag,
+                         const SmallVectorImpl<ISD::OutputArg> &Outs,
+                         DebugLoc dl, SelectionDAG &DAG) {
+  unsigned NumOps = Outs.size();
+
+  // If call has no arguments then do nothing and return.
+  if (NumOps == 0)
+    return Chain; 
+
+  // FIXME: This portion of code currently assumes only
+  // primitive types being passed as arguments.
+
+  // Legalize the address before use
+  SDValue PtrLo, PtrHi;
+  unsigned AddressOffset;
+  LegalizeAddress(ArgLabel, DAG, PtrLo, PtrHi, AddressOffset, dl);
+
+  std::vector<SDValue> Ops;
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  for (unsigned i = 0, Offset = 0; i < NumOps; i++) {
+    // Get the argument
+    SDValue Arg = Outs[i].Val;
+
+    // Store the argument on the frame at Offset bytes past the
+    // legalized argument label.
+    Ops.clear();
+    Ops.push_back(Chain);
+    Ops.push_back(Arg);
+    Ops.push_back(PtrLo);
+    Ops.push_back(PtrHi);
+    Ops.push_back(DAG.getConstant(Offset + AddressOffset, MVT::i8));
+    Ops.push_back(InFlag);
+
+    SDValue StoreRet = DAG.getNode (PIC16ISD::PIC16StWF, dl, Tys,
+                                    &Ops[0], Ops.size());
+
+    Chain = getChain(StoreRet);
+    InFlag = getOutFlag(StoreRet);
+
+    // Advance the frame offset by the argument's size in bytes.
+    Offset += Arg.getValueType().getSizeInBits() / 8;
+  }
+  return Chain;
+}
+
+// LowerIndirectCallReturn - Read the callee's return value, byte by
+// byte, through its data address.
+SDValue PIC16TargetLowering::
+LowerIndirectCallReturn(SDValue Chain, SDValue InFlag,
+                        SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+                        const SmallVectorImpl<ISD::InputArg> &Ins,
+                        DebugLoc dl, SelectionDAG &DAG,
+                        SmallVectorImpl<SDValue> &InVals) {
+  unsigned RetVals = Ins.size();
+
+  // A void call reads nothing back.
+  if (RetVals == 0)
+    return Chain;
+
+  // Load each returned byte, chaining the loads through the flag.
+  SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other, MVT::Flag);
+  for (unsigned i = 0; i < RetVals; ++i) {
+    SDValue LoadRet = DAG.getNode(PIC16ISD::PIC16LdWF, dl, Tys, Chain,
+                                  DataAddr_Lo, DataAddr_Hi,
+                                  DAG.getConstant(i, MVT::i8), InFlag);
+    InFlag = getOutFlag(LoadRet);
+    Chain = getChain(LoadRet);
+    InVals.push_back(LoadRet);
+  }
+  return Chain;
+}
+
+// LowerDirectCallReturn - Read the callee's return value, byte by byte,
+// from the retval label on its frame.
+SDValue PIC16TargetLowering::
+LowerDirectCallReturn(SDValue RetLabel, SDValue Chain, SDValue InFlag,
+                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                      DebugLoc dl, SelectionDAG &DAG,
+                      SmallVectorImpl<SDValue> &InVals) {
+
+  // Only primitive types are handled currently; they come back in
+  // i8 parts.
+  unsigned RetVals = Ins.size();
+
+  // Nothing to read for a void return.
+  if (RetVals == 0)
+    return Chain;
+
+  // Legalize the return-value label into lo/hi parts plus an offset.
+  SDValue LdLo, LdHi;
+  unsigned LdOffset;
+  LegalizeAddress(RetLabel, DAG, LdLo, LdHi, LdOffset, dl);
+
+  // Load each byte in sequence, chaining through the flag.
+  SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other, MVT::Flag);
+  for (unsigned i = 0; i < RetVals; ++i) {
+    SDValue LoadRet = DAG.getNode(PIC16ISD::PIC16LdWF, dl, Tys, Chain,
+                                  LdLo, LdHi,
+                                  DAG.getConstant(LdOffset + i, MVT::i8),
+                                  InFlag);
+    InFlag = getOutFlag(LoadRet);
+    Chain = getChain(LoadRet);
+    InVals.push_back(LoadRet);
+  }
+
+  return Chain;
+}
+
+// LowerReturn - PIC16 returns values on the function's own frame,
+// starting at offset 0: store each i8 part there, then emit RET.
+SDValue
+PIC16TargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 DebugLoc dl, SelectionDAG &DAG) {
+
+  unsigned NumRet = Outs.size();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  std::string FuncName = MF.getFunction()->getName();
+
+  // Address the frame through the function's frame label.
+  const char *FrameLabel = createESName(PAN::getFrameLabel(FuncName));
+  SDValue ES = DAG.getTargetExternalSymbol(FrameLabel, MVT::i8);
+
+  // Banksel operand 1 => emit a banksel (may be optimized away later).
+  SDValue BS = DAG.getConstant(1, MVT::i8);
+  for (unsigned i = 0; i < NumRet; ++i) {
+    // Store return part i at frame offset i.
+    Chain = DAG.getNode (PIC16ISD::PIC16Store, dl, MVT::Other, Chain,
+                         Outs[i].Val, ES, BS, DAG.getConstant (i, MVT::i8));
+  }
+  return DAG.getNode(PIC16ISD::RET, dl, MVT::Other, Chain);
+}
+
+// GetDataAddress - For an indirect call, compute the lo/hi bytes of the
+// callee's data (frame) address by making two CALLW call sequences: one
+// at (code address - 2) for the lo byte and one at (code address - 1)
+// for the hi byte. Chain and the returned DataAddr_Lo/Hi are updated in
+// place. The flag chaining below is order-critical.
+void PIC16TargetLowering::
+GetDataAddress(DebugLoc dl, SDValue Callee, SDValue &Chain, 
+               SDValue &DataAddr_Lo, SDValue &DataAddr_Hi,
+               SelectionDAG &DAG) {
+   assert (Callee.getOpcode() == PIC16ISD::PIC16Connect
+           && "Don't know what to do of such callee!!");
+   SDValue ZeroOperand = DAG.getConstant(0, MVT::i8);
+   SDValue SeqStart  = DAG.getCALLSEQ_START(Chain, ZeroOperand);
+   Chain = getChain(SeqStart);
+   SDValue OperFlag = getOutFlag(SeqStart); // To manage the data dependency
+
+   // Get the Lo and Hi part of code address
+   SDValue Lo = Callee.getOperand(0);
+   SDValue Hi = Callee.getOperand(1);
+
+   SDValue Data_Lo, Data_Hi;
+   SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Other, MVT::Flag);
+   // Subtract 2 from Address to get the Lower part of DataAddress.
+   // SUBC/SUBE propagate the borrow from the low byte into the high byte.
+   SDVTList VTList = DAG.getVTList(MVT::i8, MVT::Flag);
+   Data_Lo = DAG.getNode(ISD::SUBC, dl, VTList, Lo, 
+                         DAG.getConstant(2, MVT::i8));
+   SDValue Ops[3] = { Hi, DAG.getConstant(0, MVT::i8), Data_Lo.getValue(1)};
+   Data_Hi = DAG.getNode(ISD::SUBE, dl, VTList, Ops, 3);
+   // Place the hi byte in PCLATH and make the first CALLW; its result
+   // (in W) is the lo byte of the data address.
+   SDValue PCLATH = DAG.getNode(PIC16ISD::MTPCLATH, dl, MVT::i8, Data_Hi);
+   Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Data_Lo, PCLATH);
+   SDValue Call = DAG.getNode(PIC16ISD::CALLW, dl, Tys, Chain, Callee,
+                              OperFlag);
+   Chain = getChain(Call);
+   OperFlag = getOutFlag(Call);
+   SDValue SeqEnd = DAG.getCALLSEQ_END(Chain, ZeroOperand, ZeroOperand,
+                                       OperFlag);
+   Chain = getChain(SeqEnd);
+   OperFlag = getOutFlag(SeqEnd);
+
+   // Low part of Data Address 
+   DataAddr_Lo = DAG.getNode(PIC16ISD::MTLO, dl, MVT::i8, Call, OperFlag);
+
+   // Make the second call.
+   SeqStart  = DAG.getCALLSEQ_START(Chain, ZeroOperand);
+   Chain = getChain(SeqStart);
+   OperFlag = getOutFlag(SeqStart); // To manage the data dependency
+
+   // Subtract 1 from Address to get high part of data address.
+   Data_Lo = DAG.getNode(ISD::SUBC, dl, VTList, Lo, 
+                         DAG.getConstant(1, MVT::i8));
+   SDValue HiOps[3] = { Hi, DAG.getConstant(0, MVT::i8), Data_Lo.getValue(1)};
+   Data_Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps, 3);
+   PCLATH = DAG.getNode(PIC16ISD::MTPCLATH, dl, MVT::i8, Data_Hi);
+
+   // Use new Lo to make another CALLW
+   Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Data_Lo, PCLATH);
+   Call = DAG.getNode(PIC16ISD::CALLW, dl, Tys, Chain, Callee, OperFlag);
+   Chain = getChain(Call);
+   OperFlag = getOutFlag(Call);
+   SeqEnd = DAG.getCALLSEQ_END(Chain, ZeroOperand, ZeroOperand,
+                                        OperFlag);
+   Chain = getChain(SeqEnd);
+   OperFlag = getOutFlag(SeqEnd);
+   // Hi part of Data Address
+   DataAddr_Hi = DAG.getNode(PIC16ISD::MTHI, dl, MVT::i8, Call, OperFlag);
+}
+
+// LowerCall - Lower an outgoing call. Direct calls (to a GlobalAddress
+// or ExternalSymbol) pass arguments and read results through the
+// callee's args/retval labels. Indirect calls first compute the
+// callee's data address via GetDataAddress, then go through it.
+SDValue
+PIC16TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               bool &isTailCall,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) {
+    // PIC16 target does not yet support tail call optimization.
+    isTailCall = false;
+
+    assert(Callee.getValueType() == MVT::i16 &&
+           "Don't know how to legalize this call node!!!");
+
+    // The flag to track if this is a direct or indirect call.
+    bool IsDirectCall = true;    
+    unsigned RetVals = Ins.size();
+    unsigned NumArgs = Outs.size();
+
+    SDValue DataAddr_Lo, DataAddr_Hi; 
+    if (!isa<GlobalAddressSDNode>(Callee) &&
+        !isa<ExternalSymbolSDNode>(Callee)) {
+       IsDirectCall = false;    // This is indirect call
+
+       // If this is an indirect call then to pass the arguments
+       // and read the return value back, we need the data address
+       // of the function being called.
+       // To get the data address two more calls need to be made.
+
+       // Come here for indirect calls
+       SDValue Lo, Hi;
+       // Indirect addresses. Get the hi and lo parts of ptr.
+       GetExpandedParts(Callee, DAG, Lo, Hi);
+       // Connect Lo and Hi parts of the callee with the PIC16Connect
+       Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, Lo, Hi);
+
+       // Read DataAddress only if we have to pass arguments or 
+       // read return value. 
+       if ((RetVals > 0) || (NumArgs > 0)) 
+         GetDataAddress(dl, Callee, Chain, DataAddr_Lo, DataAddr_Hi, DAG);
+    }
+
+    SDValue ZeroOperand = DAG.getConstant(0, MVT::i8);
+
+    // Start the call sequence.
+    // Carring the Constant 0 along the CALLSEQSTART
+    // because there is nothing else to carry.
+    SDValue SeqStart  = DAG.getCALLSEQ_START(Chain, ZeroOperand);
+    Chain = getChain(SeqStart);
+    SDValue OperFlag = getOutFlag(SeqStart); // To manage the data dependency
+    std::string Name;
+
+    // For any direct call - callee will be GlobalAddressNode or
+    // ExternalSymbol
+    SDValue ArgLabel, RetLabel;
+    if (IsDirectCall) { 
+       // Considering the GlobalAddressNode case here.
+       if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+          GlobalValue *GV = G->getGlobal();
+          Callee = DAG.getTargetGlobalAddress(GV, MVT::i8);
+          Name = G->getGlobal()->getName();
+       } else {// Considering the ExternalSymbol case here
+          ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Callee);
+          Callee = DAG.getTargetExternalSymbol(ES->getSymbol(), MVT::i8); 
+          Name = ES->getSymbol();
+       }
+
+       // Label for argument passing
+       const char *argFrame = createESName(PAN::getArgsLabel(Name));
+       ArgLabel = DAG.getTargetExternalSymbol(argFrame, MVT::i8);
+
+       // Label for reading return value
+       const char *retName = createESName(PAN::getRetvalLabel(Name));
+       RetLabel = DAG.getTargetExternalSymbol(retName, MVT::i8);
+    } else {
+       // if indirect call
+       SDValue CodeAddr_Lo = Callee.getOperand(0);
+       SDValue CodeAddr_Hi = Callee.getOperand(1);
+
+       /*CodeAddr_Lo = DAG.getNode(ISD::ADD, dl, MVT::i8, CodeAddr_Lo,
+                                 DAG.getConstant(2, MVT::i8));*/
+
+       // move Hi part in PCLATH
+       CodeAddr_Hi = DAG.getNode(PIC16ISD::MTPCLATH, dl, MVT::i8, CodeAddr_Hi);
+       Callee = DAG.getNode(PIC16ISD::PIC16Connect, dl, MVT::i8, CodeAddr_Lo,
+                            CodeAddr_Hi);
+    } 
+
+    // Pass the argument to function before making the call.
+    SDValue CallArgs;
+    if (IsDirectCall) {
+      CallArgs = LowerDirectCallArguments(ArgLabel, Chain, OperFlag,
+                                          Outs, dl, DAG);
+      Chain = getChain(CallArgs);
+      OperFlag = getOutFlag(CallArgs);
+    } else {
+      CallArgs = LowerIndirectCallArguments(Chain, OperFlag, DataAddr_Lo,
+                                            DataAddr_Hi, Outs, Ins, dl, DAG);
+      Chain = getChain(CallArgs);
+      OperFlag = getOutFlag(CallArgs);
+    }
+
+    // Emit the actual call, glued after the argument stores.
+    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+    SDValue PICCall = DAG.getNode(PIC16ISD::CALL, dl, Tys, Chain, Callee,
+                                  OperFlag);
+    Chain = getChain(PICCall);
+    OperFlag = getOutFlag(PICCall);
+
+
+    // Carrying the Constant 0 along the CALLSEQSTART
+    // because there is nothing else to carry.
+    SDValue SeqEnd = DAG.getCALLSEQ_END(Chain, ZeroOperand, ZeroOperand,
+                                        OperFlag);
+    Chain = getChain(SeqEnd);
+    OperFlag = getOutFlag(SeqEnd);
+
+    // Lower the return value reading after the call.
+    if (IsDirectCall)
+      return LowerDirectCallReturn(RetLabel, Chain, OperFlag,
+                                   Ins, dl, DAG, InVals);
+    else
+      return LowerIndirectCallReturn(Chain, OperFlag, DataAddr_Lo,
+                                     DataAddr_Hi, Ins, dl, DAG, InVals);
+}
+
+// isDirectLoad - Return true if Op is a PIC16Load whose address operand
+// is a target global address or a target external symbol.
+bool PIC16TargetLowering::isDirectLoad(const SDValue Op) {
+  if (Op.getOpcode() != PIC16ISD::PIC16Load)
+    return false;
+  unsigned AddrOpc = Op.getOperand(1).getOpcode();
+  return AddrOpc == ISD::TargetGlobalAddress ||
+         AddrOpc == ISD::TargetExternalSymbol;
+}
+
+// NeedToConvertToMemOp - Returns true if one of the operands of the
+// operation 'Op' needs to be put into memory. Also returns the
+// operand no. of the operand to be converted in 'MemOp'. Remember, PIC16 has 
+// no instruction that can operate on two registers. Most insns take
+// one register and one memory operand (addwf) / Constant (addlw).
+bool PIC16TargetLowering::NeedToConvertToMemOp(SDValue Op, unsigned &MemOp, 
+                      SelectionDAG &DAG) {
+  // If one of the operand is a constant, return false.
+  if (Op.getOperand(0).getOpcode() == ISD::Constant ||
+      Op.getOperand(1).getOpcode() == ISD::Constant)
+    return false;    
+
+  // Return false if one of the operands is already a direct
+  // load and that operand has only one use.
+  if (isDirectLoad(Op.getOperand(0))) {
+    if (Op.getOperand(0).hasOneUse()) {  
+      // Legal and profitable folding check uses the NodeId of DAG nodes.
+      // This NodeId is assigned by topological order. Therefore first 
+      // assign topological order then perform legal and profitable check.
+      // Note:- Though this ordering is done before begining with legalization,
+      // newly added node during legalization process have NodeId=-1 (NewNode)
+      // therefore before performing any check proper ordering of the node is
+      // required.
+      DAG.AssignTopologicalOrder();
+
+      // Direct load operands are folded in binary operations. But before folding
+      // verify if this folding is legal. Fold only if it is legal otherwise
+      // convert this direct load to a separate memory operation.
+      if(ISel->IsLegalAndProfitableToFold(Op.getOperand(0).getNode(), 
+                                         Op.getNode(), Op.getNode()))
+        return false;
+      else 
+        // Folding is not legal: remember operand 0 as the one to spill
+        // and fall through (a commutative op may still pick operand 1).
+        MemOp = 0;
+    }
+  }
+
+  // For operations that are non-cummutative there is no need to check 
+  // for right operand because folding right operand may result in 
+  // incorrect operation. 
+  if (! SelectionDAG::isCommutativeBinOp(Op.getOpcode()))
+    return true;
+
+  if (isDirectLoad(Op.getOperand(1))) {
+    if (Op.getOperand(1).hasOneUse())
+      return false;
+    else 
+      // Operand 1 has multiple uses; spill it instead.
+      MemOp = 1; 
+  }
+  return true;
+}  
+
+// LowerBinOp - Lower a commutative binary operation that does not
+// affect status flag carry.
+SDValue PIC16TargetLowering::LowerBinOp(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // We should have handled larger operands in type legalizer itself.
+  assert (Op.getValueType() == MVT::i8 && "illegal Op to lower");
+
+  unsigned MemOp = 1;
+  if (!NeedToConvertToMemOp(Op, MemOp, DAG))
+    return Op;
+
+  // Spill one operand to memory so the instruction gets its required
+  // register + memory operand pair.
+  SDValue MemVal = ConvertToMemOperand (Op.getOperand(MemOp), DAG, dl);
+  return DAG.getNode(Op.getOpcode(), dl, MVT::i8,
+                     Op.getOperand(MemOp ^ 1), MemVal);
+}
+
+// LowerADD - Lower all types of ADD operations (ADD, ADDC, ADDE) including
+// the ones that affect carry. As in LowerBinOp, one operand may be spilled
+// to memory; the carry-producing/consuming variants keep their extra
+// MVT::Flag result and carry operand.
+SDValue PIC16TargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) {
+  // We should have handled larger operands in type legalizer itself.
+  assert (Op.getValueType() == MVT::i8 && "illegal add to lower");
+  DebugLoc dl = Op.getDebugLoc();
+  // Index of the operand to spill; set by NeedToConvertToMemOp.
+  unsigned MemOp = 1;
+  if (NeedToConvertToMemOp(Op, MemOp, DAG)) {
+    // Put one value on stack.
+    SDValue NewVal = ConvertToMemOperand (Op.getOperand(MemOp), DAG, dl);
+    
+    // ADDC and ADDE produce two results.
+    SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Flag);
+
+    // ADDE has three operands, the last one is the carry bit.
+    if (Op.getOpcode() == ISD::ADDE)
+      return DAG.getNode(Op.getOpcode(), dl, Tys, Op.getOperand(MemOp ^ 1),
+                         NewVal, Op.getOperand(2));
+    // ADDC has two operands.
+    else if (Op.getOpcode() == ISD::ADDC)
+      return DAG.getNode(Op.getOpcode(), dl, Tys, Op.getOperand(MemOp ^ 1),
+                         NewVal);
+    // ADD it is. It produces only one result.
+    else
+      return DAG.getNode(Op.getOpcode(), dl, MVT::i8, Op.getOperand(MemOp ^ 1),
+                         NewVal);
+  }
+  else
+    return Op;
+}
+
+// LowerSUB - Lower SUB, SUBC and SUBE. A subtract by constant is rewritten
+// as an add of the negated constant; otherwise the first operand may be
+// spilled to memory before rebuilding the node.
+SDValue PIC16TargetLowering::LowerSUB(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // We should have handled larger operands in type legalizer itself.
+  assert (Op.getValueType() == MVT::i8 && "illegal sub to lower");
+  unsigned MemOp = 1;
+  SDVTList Tys = DAG.getVTList(MVT::i8, MVT::Flag);
+
+  // Since we don't have an instruction for X - c , 
+  // we can change it to X + (-c). The negation 0 - c wraps modulo 2^N and
+  // is truncated back to i8 by getConstant.
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (C && (Op.getOpcode() == ISD::SUB))
+    {
+      return DAG.getNode(ISD::ADD, 
+                         dl, MVT::i8, Op.getOperand(0), 
+                         DAG.getConstant(0-(C->getZExtValue()), MVT::i8));
+    }
+
+  // Spill operand 0 if NeedToConvertToMemOp says so, or if only the RHS is
+  // a direct load while the LHS is neither a direct load nor a constant.
+  if (NeedToConvertToMemOp(Op, MemOp, DAG) ||
+      (isDirectLoad(Op.getOperand(1)) && 
+       (!isDirectLoad(Op.getOperand(0))) &&
+       (Op.getOperand(0).getOpcode() != ISD::Constant)))
+    {
+      // Put first operand on stack.
+      SDValue NewVal = ConvertToMemOperand (Op.getOperand(0), DAG, dl);
+      
+      // NOTE(review): with NDEBUG the assert below is a no-op and the
+      // 'default' case falls through into ISD::SUBE; the break statements
+      // after each return are unreachable.
+      switch (Op.getOpcode()) {
+      default:
+        assert (0 && "Opcode unknown."); 
+      case ISD::SUBE:
+        // SUBE consumes the incoming borrow/carry as a third operand.
+        return DAG.getNode(Op.getOpcode(), 
+                           dl, Tys, NewVal, Op.getOperand(1),
+                           Op.getOperand(2));
+        break;
+      case ISD::SUBC:
+        return DAG.getNode(Op.getOpcode(), 
+                           dl, Tys, NewVal, Op.getOperand(1));
+        break;
+      case ISD::SUB:
+        return DAG.getNode(Op.getOpcode(), 
+                           dl, MVT::i8, NewVal, Op.getOperand(1));
+        break;
+      }
+    }
+  else 
+    return Op;
+}
+
+// InitReservedFrameCount - Record how many frame slots are reserved for
+// this function: one per argument, plus one extra when the function
+// returns a value (presumably the slot for the return value — the frame
+// layout code elsewhere would confirm this).
+void PIC16TargetLowering::InitReservedFrameCount(const Function *F) {
+  unsigned NumArgs = F->arg_size();
+
+  bool isVoidFunc = (F->getReturnType()->getTypeID() == Type::VoidTyID);
+
+  if (isVoidFunc)
+    ReservedFrameCount = NumArgs;
+  else
+    ReservedFrameCount = NumArgs + 1;
+}
+
+// LowerFormalArguments - Argument values are loaded from the
+// <fname>.args + offset. All arguments are already broken into legalized
+// types, so the offset just runs from 0 to NumArgVals - 1.
+
+SDValue
+PIC16TargetLowering::LowerFormalArguments(SDValue Chain,
+                                          CallingConv::ID CallConv,
+                                          bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                          DebugLoc dl,
+                                          SelectionDAG &DAG,
+                                          SmallVectorImpl<SDValue> &InVals) {
+  unsigned NumArgVals = Ins.size();
+
+  // Get the callee's name to create the <fname>.args label to pass args.
+  MachineFunction &MF = DAG.getMachineFunction();
+  const Function *F = MF.getFunction();
+  std::string FuncName = F->getName();
+
+  // Reset the map of FI and TmpOffset 
+  ResetTmpOffsetMap();
+  // Initialize the ReserveFrameCount
+  InitReservedFrameCount(F);
+
+  // Create the <fname>.args external symbol.
+  const char *tmpName = createESName(PAN::getArgsLabel(FuncName));
+  SDValue ES = DAG.getTargetExternalSymbol(tmpName, MVT::i8);
+
+  // Load arg values from the label + offset. Each PIC16LdArg produces the
+  // i8 argument value plus a chain result; the loads are chained serially.
+  SDVTList VTs  = DAG.getVTList (MVT::i8, MVT::Other);
+  // BS is a constant-1 operand of PIC16LdArg — presumably a bank-select
+  // flag for the .args symbol; confirm against the PIC16LdArg pattern.
+  SDValue BS = DAG.getConstant(1, MVT::i8);
+  for (unsigned i = 0; i < NumArgVals ; ++i) {
+    SDValue Offset = DAG.getConstant(i, MVT::i8);
+    SDValue PICLoad = DAG.getNode(PIC16ISD::PIC16LdArg, dl, VTs, Chain, ES, BS,
+                                  Offset);
+    Chain = getChain(PICLoad);
+    InVals.push_back(PICLoad);
+  }
+
+  return Chain;
+}
+
+// Perform DAGCombine of PIC16Load.
+// If the loaded value (result 0) has no uses, the load is only kept alive
+// through its chain (result 1). Rerouting all chain uses to the incoming
+// chain detaches the node so it becomes dead and can be deleted.
+SDValue PIC16TargetLowering::
+PerformPIC16LoadCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Chain = N->getOperand(0); 
+  if (N->hasNUsesOfValue(0, 0)) {
+    DAG.ReplaceAllUsesOfValueWith(SDValue(N,1), Chain);
+  }
+  // No replacement value for the node itself.
+  return SDValue();
+}
+
+// For all functions with arguments, some STORE nodes are generated
+// that store the argument on the frameindex. However, in PIC16 the arguments
+// are passed on the stack only. Therefore these STORE nodes are redundant.
+// These redundant STORE nodes will be removed in PerformStoreCombine.
+//
+// Currently this function does nothing and will be updated to remove
+// unwanted store operations.
+SDValue PIC16TargetLowering::
+PerformStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const {
+  // Dead-store elimination is disabled for now: returning SDValue(N, 0)
+  // keeps the store node unchanged.
+  return SDValue(N, 0);
+  /*
+  // Storing an undef value is of no use, so remove it
+  if (isStoringUndef(N, Chain, DAG)) {
+    return Chain; // remove the store and return the chain
+  }
+  //else everything is ok.
+  return SDValue(N, 0);
+  */
+}
+
+// PerformDAGCombine - Dispatch target-specific DAG combines. Only STORE
+// and PIC16Load nodes are handled; everything else returns the empty
+// SDValue, meaning "no combine performed".
+SDValue PIC16TargetLowering::PerformDAGCombine(SDNode *N, 
+                                               DAGCombinerInfo &DCI) const {
+  switch (N->getOpcode()) {
+  case ISD::STORE:   
+   return PerformStoreCombine(N, DCI); 
+  case PIC16ISD::PIC16Load:   
+    return PerformPIC16LoadCombine(N, DCI);
+  }
+  return SDValue();
+}
+
+// IntCCToPIC16CC - Map an integer ISD condition code to the PIC16CC
+// equivalent. Any condition code outside the ten handled here (e.g. the
+// floating-point/unordered codes) hits llvm_unreachable.
+static PIC16CC::CondCodes IntCCToPIC16CC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown condition code!");
+  case ISD::SETNE:  return PIC16CC::NE;
+  case ISD::SETEQ:  return PIC16CC::EQ;
+  case ISD::SETGT:  return PIC16CC::GT;
+  case ISD::SETGE:  return PIC16CC::GE;
+  case ISD::SETLT:  return PIC16CC::LT;
+  case ISD::SETLE:  return PIC16CC::LE;
+  case ISD::SETULT: return PIC16CC::ULT;
+  case ISD::SETULE: return PIC16CC::ULE;
+  case ISD::SETUGE: return PIC16CC::UGE;
+  case ISD::SETUGT: return PIC16CC::UGT;
+  }
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction.  If so
+// set LHS/RHS and SPCC to the LHS/RHS of the setcc and SPCC to the condition.
+// The pattern matched is: (select_icc 1, 0, cc, (subcc a, b)) != 0, i.e. a
+// setcc that was already lowered into a SELECT_ICC over a SUBCC compare.
+// On a match, LHS/RHS become the original compared values and SPCC the
+// original PIC16 condition; otherwise all outputs are left untouched.
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+                             ISD::CondCode CC, unsigned &SPCC) {
+  if (isa<ConstantSDNode>(RHS) &&
+      cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+      CC == ISD::SETNE &&
+      (LHS.getOpcode() == PIC16ISD::SELECT_ICC &&
+        LHS.getOperand(3).getOpcode() == PIC16ISD::SUBCC) &&
+      isa<ConstantSDNode>(LHS.getOperand(0)) &&
+      isa<ConstantSDNode>(LHS.getOperand(1)) &&
+      cast<ConstantSDNode>(LHS.getOperand(0))->getZExtValue() == 1 &&
+      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 0) {
+    SDValue CMPCC = LHS.getOperand(3);
+    // Operand 2 of SELECT_ICC holds the PIC16 condition code as a constant.
+    SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+    LHS = CMPCC.getOperand(0);
+    RHS = CMPCC.getOperand(1);
+  }
+}
+
+// Returns appropriate CMP insn and corresponding condition code in PIC16CC.
+// Builds a PIC16ISD::SUBCC node whose flag result carries the comparison
+// outcome; PIC16CC receives the (possibly swapped) condition as an i8
+// constant. LHS/RHS are taken by value and may be rewritten locally.
+SDValue PIC16TargetLowering::getPIC16Cmp(SDValue LHS, SDValue RHS, 
+                                         unsigned CC, SDValue &PIC16CC, 
+                                         SelectionDAG &DAG, DebugLoc dl) {
+  PIC16CC::CondCodes CondCode = (PIC16CC::CondCodes) CC;
+
+  // PIC16 sub is literal - W. So Swap the operands and condition if needed.
+  // i.e. a < 12 can be rewritten as 12 > a.
+  if (RHS.getOpcode() == ISD::Constant) {
+
+    SDValue Tmp = LHS;
+    LHS = RHS;
+    RHS = Tmp;
+
+    // Mirror the (in)equality to match the swapped operands; EQ/NE are
+    // symmetric and need no change (the default case).
+    switch (CondCode) {
+    default: break;
+    case PIC16CC::LT:
+      CondCode = PIC16CC::GT; 
+      break;
+    case PIC16CC::GT:
+      CondCode = PIC16CC::LT; 
+      break;
+    case PIC16CC::ULT:
+      CondCode = PIC16CC::UGT; 
+      break;
+    case PIC16CC::UGT:
+      CondCode = PIC16CC::ULT; 
+      break;
+    case PIC16CC::GE:
+      CondCode = PIC16CC::LE; 
+      break;
+    case PIC16CC::LE:
+      CondCode = PIC16CC::GE;
+      break;
+    case PIC16CC::ULE:
+      CondCode = PIC16CC::UGE;
+      break;
+    case PIC16CC::UGE:
+      CondCode = PIC16CC::ULE;
+      break;
+    }
+  }
+
+  PIC16CC = DAG.getConstant(CondCode, MVT::i8);
+
+  // These are signed comparisons: XOR-ing both operands with 0x80 biases
+  // them so that an unsigned compare yields the signed ordering.
+  SDValue Mask = DAG.getConstant(128, MVT::i8);
+  if (isSignedComparison(CondCode)) {
+    LHS = DAG.getNode (ISD::XOR, dl, MVT::i8, LHS, Mask);
+    RHS = DAG.getNode (ISD::XOR, dl, MVT::i8, RHS, Mask); 
+  }
+
+  SDVTList VTs = DAG.getVTList (MVT::i8, MVT::Flag);
+  // We can use a subtract operation to set the condition codes. But
+  // we need to put one operand in memory if required.
+  // Nothing to do if the first operand is already a valid type (direct load 
+  // for subwf and literal for sublw) and it is used by this operation only. 
+  if ((LHS.getOpcode() == ISD::Constant || isDirectLoad(LHS)) 
+      && LHS.hasOneUse())
+    return DAG.getNode(PIC16ISD::SUBCC, dl, VTs, LHS, RHS);
+
+  // else convert the first operand to mem.
+  LHS = ConvertToMemOperand (LHS, DAG, dl);
+  return DAG.getNode(PIC16ISD::SUBCC, dl, VTs, LHS, RHS);
+}
+
+
+// LowerSELECT_CC - Lower ISD::SELECT_CC into a PIC16ISD::SELECT_ICC node
+// that consumes the flag produced by a PIC16 compare (getPIC16Cmp).
+SDValue PIC16TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  // ~0 marks "not found"; LookThroughSetCC overwrites it on a match.
+  unsigned ORIGCC = ~0;
+  DebugLoc dl = Op.getDebugLoc();
+
+  // If this is a select_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  // i.e.
+  // A setcc: lhs, rhs, cc is expanded by llvm to 
+  // select_cc: result of setcc, 0, 1, 0, setne
+  // We can think of it as:
+  // select_cc: lhs, rhs, 1, 0, cc
+  LookThroughSetCC(LHS, RHS, CC, ORIGCC);
+  if (ORIGCC == ~0U) ORIGCC = IntCCToPIC16CC (CC);
+
+  SDValue PIC16CC;
+  SDValue Cmp = getPIC16Cmp(LHS, RHS, ORIGCC, PIC16CC, DAG, dl);
+
+  // The select consumes the compare's flag result (value 1 of Cmp).
+  return DAG.getNode (PIC16ISD::SELECT_ICC, dl, TrueVal.getValueType(), TrueVal,
+                      FalseVal, PIC16CC, Cmp.getValue(1)); 
+}
+
+// EmitInstrWithCustomInserter - Expand the SELECT_ICC pseudo into an
+// explicit diamond of basic blocks with a conditional branch and a PHI.
+// EM records old-successor -> sinkMBB mappings so SDISel can fix up any
+// machine-CFG edges it has already recorded.
+MachineBasicBlock *
+PIC16TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  // Operand 3 of the pseudo holds the PIC16 condition code immediate.
+  unsigned CC = (PIC16CC::CondCodes)MI->getOperand(3).getImm();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   [f]bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  // Branch to sinkMBB when the condition holds; otherwise fall through
+  // into copy0MBB.
+  BuildMI(BB, dl, TII.get(PIC16::pic16brcond)).addMBB(sinkMBB).addImm(CC);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+
+  // Update machine-CFG edges by first adding all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  // Also inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 
+         E = BB->succ_end(); I != E; ++I) {
+    EM->insert(std::make_pair(*I, sinkMBB));
+    sinkMBB->addSuccessor(*I);
+  }
+  // Next, remove all successors of the current block, and add the true
+  // and fallthrough blocks as its successors.
+  while (!BB->succ_empty())
+    BB->removeSuccessor(BB->succ_begin());
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(BB, dl, TII.get(PIC16::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  return BB;
+}
+
+
+// LowerBR_CC - Lower ISD::BR_CC into a PIC16ISD::BRCOND that branches on
+// the flag produced by a PIC16 compare (getPIC16Cmp).
+SDValue PIC16TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);   // LHS of the condition.
+  SDValue RHS = Op.getOperand(3);   // RHS of the condition.
+  SDValue Dest = Op.getOperand(4);  // BB to jump to
+  // ~0 marks "not found"; LookThroughSetCC overwrites it on a match.
+  unsigned ORIGCC = ~0;
+  DebugLoc dl = Op.getDebugLoc();
+
+  // If this is a br_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, ORIGCC);
+  if (ORIGCC == ~0U) ORIGCC = IntCCToPIC16CC (CC);
+
+  // Get the Compare insn and condition code.
+  SDValue PIC16CC;
+  SDValue Cmp = getPIC16Cmp(LHS, RHS, ORIGCC, PIC16CC, DAG, dl);
+
+  // The branch consumes the compare's flag result (value 1 of Cmp).
+  return DAG.getNode(PIC16ISD::BRCOND, dl, MVT::Other, Chain, Dest, PIC16CC, 
+                     Cmp.getValue(1));
+}
+
diff --git a/lib/Target/PIC16/PIC16ISelLowering.h b/lib/Target/PIC16/PIC16ISelLowering.h
new file mode 100644
index 0000000..de14520
--- /dev/null
+++ b/lib/Target/PIC16/PIC16ISelLowering.h
@@ -0,0 +1,264 @@
+//===-- PIC16ISelLowering.h - PIC16 DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PIC16 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16ISELLOWERING_H
+#define PIC16ISELLOWERING_H
+
+#include "PIC16.h"
+#include "PIC16Subtarget.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include <map>
+
+namespace llvm {
+  // PIC16ISD - Target-specific SelectionDAG node types and related enums
+  // for the PIC16 backend.
+  namespace PIC16ISD {
+    enum NodeType {
+      // Start the numbering from where ISD NodeType finishes.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      Lo,            // Low 8-bits of GlobalAddress.
+      Hi,            // High 8-bits of GlobalAddress.
+      PIC16Load,
+      PIC16LdArg,   // A replica of PIC16Load used only to load function 
+                    // arguments; kept distinct to facilitate some 
+                    // store removal optimizations. 
+
+      PIC16LdWF,
+      PIC16Store,
+      PIC16StWF,
+      Banksel,
+      MTLO,          // Move to low part of FSR
+      MTHI,          // Move to high part of FSR
+      MTPCLATH,      // Move to PCLATH
+      PIC16Connect,  // General connector for PIC16 nodes
+      BCF,
+      LSLF,          // PIC16 Logical shift left
+      LRLF,          // PIC16 Logical shift right
+      RLF,           // Rotate left through carry
+      RRF,           // Rotate right through carry
+      CALL,          // PIC16 Call instruction 
+      CALLW,         // PIC16 CALLW instruction 
+      SUBCC,         // Compare for equality or inequality.
+      SELECT_ICC,    // Pseudo to be caught in scheduler and expanded to brcond.
+      BRCOND,        // Conditional branch.
+      RET,           // Return.
+      Dummy
+    };
+
+    // Keep track of different address spaces. 
+    enum AddressSpace {
+      RAM_SPACE = 0,   // RAM address space
+      ROM_SPACE = 1    // ROM address space number is 1
+    };
+    // PIC16-specific library calls, numbered after LLVM's RTLIB entries.
+    enum PIC16Libcall {
+      MUL_I8 = RTLIB::UNKNOWN_LIBCALL + 1,
+      SRA_I8,
+      SLL_I8,
+      SRL_I8,
+      PIC16UnknownCall
+    };
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  // PIC16TargetLowering - PIC16's implementation of the TargetLowering
+  // interface: custom lowering of DAG operations, call/return lowering,
+  // and bookkeeping for the target's frame/temporary layout.
+  class PIC16TargetLowering : public TargetLowering {
+  public:
+    explicit PIC16TargetLowering(PIC16TargetMachine &TM);
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    virtual MVT::SimpleValueType getSetCCResultType(EVT ValType) const;
+    virtual MVT::SimpleValueType getCmpLibcallReturnType() const;
+    // Custom lowering entry points for individual operations.
+    SDValue LowerShift(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerADD(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSUB(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBinOp(SDValue Op, SelectionDAG &DAG);
+    // Call returns
+    SDValue 
+    LowerDirectCallReturn(SDValue RetLabel, SDValue Chain, SDValue InFlag,
+                          const SmallVectorImpl<ISD::InputArg> &Ins,
+                          DebugLoc dl, SelectionDAG &DAG,
+                          SmallVectorImpl<SDValue> &InVals);
+    SDValue 
+    LowerIndirectCallReturn(SDValue Chain, SDValue InFlag,
+                             SDValue DataAddr_Lo, SDValue DataAddr_Hi,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+
+    // Call arguments
+    SDValue 
+    LowerDirectCallArguments(SDValue ArgLabel, SDValue Chain, SDValue InFlag,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             DebugLoc dl, SelectionDAG &DAG);
+
+    SDValue 
+    LowerIndirectCallArguments(SDValue Chain, SDValue InFlag,
+                               SDValue DataAddr_Lo, SDValue DataAddr_Hi, 
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc dl, SelectionDAG &DAG);
+
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+    // Build the PIC16 compare node for OrigCC; CC receives the final
+    // PIC16 condition code as an i8 constant.
+    SDValue getPIC16Cmp(SDValue LHS, SDValue RHS, unsigned OrigCC, SDValue &CC,
+                        SelectionDAG &DAG, DebugLoc dl);
+    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                         MachineBasicBlock *MBB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+    virtual void ReplaceNodeResults(SDNode *N,
+                                    SmallVectorImpl<SDValue> &Results,
+                                    SelectionDAG &DAG);
+    virtual void LowerOperationWrapper(SDNode *N,
+                                       SmallVectorImpl<SDValue> &Results,
+                                       SelectionDAG &DAG);
+
+    virtual SDValue
+    LowerFormalArguments(SDValue Chain,
+                         CallingConv::ID CallConv,
+                         bool isVarArg,
+                         const SmallVectorImpl<ISD::InputArg> &Ins,
+                         DebugLoc dl, SelectionDAG &DAG,
+                         SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    // Expansion helpers for node types the type legalizer hands back.
+    SDValue ExpandStore(SDNode *N, SelectionDAG &DAG);
+    SDValue ExpandLoad(SDNode *N, SelectionDAG &DAG);
+    SDValue ExpandGlobalAddress(SDNode *N, SelectionDAG &DAG);
+    SDValue ExpandExternalSymbol(SDNode *N, SelectionDAG &DAG);
+    SDValue ExpandFrameIndex(SDNode *N, SelectionDAG &DAG);
+
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; 
+    SDValue PerformPIC16LoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; 
+    SDValue PerformStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; 
+
+    // This function returns the Tmp Offset for FrameIndex. If any TmpOffset 
+    // already exists for the FI then it returns the same else it creates the 
+    // new offset and returns.
+    unsigned GetTmpOffsetForFI(unsigned FI, unsigned slot_size); 
+    void ResetTmpOffsetMap() { FiTmpOffsetMap.clear(); SetTmpSize(0); }
+    void InitReservedFrameCount(const Function *F); 
+
+    // Return the size of Tmp variable 
+    unsigned GetTmpSize() { return TmpSize; }
+    void SetTmpSize(unsigned Size) { TmpSize = Size; }
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *) const {
+      // FIXME: The function never seems to be aligned.
+      return 1;
+    }
+  private:
+    // If the Node is a BUILD_PAIR representing a direct Address,
+    // then this function will return true.
+    bool isDirectAddress(const SDValue &Op);
+
+    // If the Node is a DirectAddress in ROM_SPACE then this 
+    // function will return true
+    bool isRomAddress(const SDValue &Op);
+
+    // Extract the Lo and Hi component of Op. 
+    void GetExpandedParts(SDValue Op, SelectionDAG &DAG, SDValue &Lo, 
+                          SDValue &Hi); 
+
+
+    // Load pointer can be a direct or indirect address. In PIC16 direct
+    // addresses need Banksel and Indirect addresses need to be loaded to
+    // FSR first. Handle address specific cases here.
+    void LegalizeAddress(SDValue Ptr, SelectionDAG &DAG, SDValue &Chain, 
+                         SDValue &NewPtr, unsigned &Offset, DebugLoc dl);
+
+    // FrameIndex should be broken down into ExternalSymbol and FrameOffset. 
+    void LegalizeFrameIndex(SDValue Op, SelectionDAG &DAG, SDValue &ES, 
+                            int &Offset);
+
+    // For indirect calls the data address of the callee frame needs to be
+    // extracted. This function fills the arguments DataAddr_Lo and 
+    // DataAddr_Hi with the address of the callee frame.
+    void GetDataAddress(DebugLoc dl, SDValue Callee, SDValue &Chain,
+                        SDValue &DataAddr_Lo, SDValue &DataAddr_Hi,
+                        SelectionDAG &DAG); 
+
+    // We can not have both operands of a binary operation in W.
+    // This function is used to put one operand on stack and generate a load.
+    SDValue ConvertToMemOperand(SDValue Op, SelectionDAG &DAG, DebugLoc dl); 
+
+    // This function checks if we need to put an operand of an operation on
+    // stack and generate a load or not.
+    // DAG parameter is required to access DAG information during
+    // analysis.
+    bool NeedToConvertToMemOp(SDValue Op, unsigned &MemOp, SelectionDAG &DAG); 
+
+    /// Subtarget - Keep a pointer to the PIC16Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const PIC16Subtarget *Subtarget;
+
+
+    // Extending the LIB Call framework of LLVM
+    // to hold the names of PIC16Libcalls.
+    const char *PIC16LibcallNames[PIC16ISD::PIC16UnknownCall]; 
+
+    // To set and retrieve the lib call names.
+    void setPIC16LibcallName(PIC16ISD::PIC16Libcall Call, const char *Name);
+    const char *getPIC16LibcallName(PIC16ISD::PIC16Libcall Call);
+
+    // Make PIC16 Libcall.
+    SDValue MakePIC16Libcall(PIC16ISD::PIC16Libcall Call, EVT RetVT, 
+                             const SDValue *Ops, unsigned NumOps, bool isSigned,
+                             SelectionDAG &DAG, DebugLoc dl);
+
+    // Check if operation has a direct load operand.
+    inline bool isDirectLoad(const SDValue Op);
+
+  public:
+    // Keep a pointer to SelectionDAGISel to access its public 
+    // interface (It is required during legalization)
+    SelectionDAGISel   *ISel;
+
+  private:
+    // The frameindexes generated for spill/reload are stack based.
+    // This map maintains zero based indexes for these FIs.
+    std::map<unsigned, unsigned> FiTmpOffsetMap;
+    unsigned TmpSize;
+
+    // These are the frames for return value and argument passing 
+    // These FrameIndices will be expanded to foo.frame external symbol
+    // and all others will be expanded to foo.tmp external symbol.
+    unsigned ReservedFrameCount; 
+  };
+} // namespace llvm
+
+#endif // PIC16ISELLOWERING_H
diff --git a/lib/Target/PIC16/PIC16InstrFormats.td b/lib/Target/PIC16/PIC16InstrFormats.td
new file mode 100644
index 0000000..e213ea8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrFormats.td
@@ -0,0 +1,117 @@
+//===- PIC16InstrFormats.td - PIC16 Instruction Formats-------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Describe PIC16 instructions format
+//
+//  All the possible PIC16 fields are:
+//
+//  opcode  - operation code.
+//  f       - 7-bit register file address.
+//  d       - 1-bit direction specifier
+//  k       - 8/11 bit literals
+//  b       - 3 bits bit num specifier
+//
+//===----------------------------------------------------------------------===//
+
+// Generic PIC16 Format
+// PIC16 Instructions are 14-bit wide.
+
+// FIXME: Add Cooper Specific Formats if any.
+
+// Base class for all PIC16 instructions. Every instruction is 14 bits
+// wide (the 'Inst' field); format subclasses carve it into opcode and
+// operand fields.
+class PIC16Inst<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : Instruction {
+  field bits<14> Inst;
+
+  let Namespace = "PIC16";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString = asmstr;
+  let Pattern = pattern;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Byte Oriented instruction class in PIC16 : <|opcode|d|f|>
+// opcode = 6 bits.
+// d = direction = 1 bit.
+// f = file register address = 7 bits.
+//===----------------------------------------------------------------------===//
+
+class ByteFormat<bits<6> opcode, dag outs, dag ins, string asmstr,
+                 list<dag> pattern>
+  :PIC16Inst<outs, ins, asmstr, pattern> {
+  bits<1>  d;   // direction bit (see format description above)
+  bits<7>  f;   // file register address
+
+  // Layout: opcode in bits 13-8, d in bit 7, f in bits 6-0.
+  let Inst{13-8} = opcode;
+
+  let Inst{7} = d;
+  let Inst{6-0} = f; 
+}
+
+//===----------------------------------------------------------------------===//
+// Bit Oriented instruction class in PIC16 : <|opcode|b|f|>
+// opcode = 4 bits.
+// b = bit specifier = 3 bits.
+// f = file register address = 7 bits.
+//===----------------------------------------------------------------------===//
+
+class BitFormat<bits<4> opcode, dag outs, dag ins, string asmstr, 
+                list<dag> pattern>
+  : PIC16Inst<outs, ins, asmstr, pattern> {
+  bits<3>  b;   // bit-number specifier
+  bits<7>  f;   // file register address
+
+  // Layout: opcode in bits 13-10, b in bits 9-7, f in bits 6-0.
+  let Inst{13-10} = opcode;
+
+  let Inst{9-7} = b;
+  let Inst{6-0} = f; 
+}
+
+//===----------------------------------------------------------------------===//
+// Literal Format instruction class in PIC16 : <|opcode|k|>
+// opcode = 6 bits
+// k = literal = 8 bits
+//===----------------------------------------------------------------------===//
+
+class LiteralFormat<bits<6> opcode, dag outs, dag ins, string asmstr, 
+                    list<dag> pattern>
+  : PIC16Inst<outs, ins, asmstr, pattern> {
+  bits<8> k;    // 8-bit literal operand
+  
+  // Layout: opcode in bits 13-8, k in bits 7-0.
+  let Inst{13-8} = opcode;
+
+  let Inst{7-0} = k; 
+}
+
+//===----------------------------------------------------------------------===//
+// Control Format instruction class in PIC16 : <|opcode|k|>
+// opcode = 3 bits.
+// k = jump address = 11 bits.
+//===----------------------------------------------------------------------===//
+
+class ControlFormat<bits<3> opcode, dag outs, dag ins, string asmstr, 
+                    list<dag> pattern>
+  : PIC16Inst<outs, ins, asmstr, pattern> {
+  bits<11> k;   // 11-bit jump address
+
+  // Layout: opcode in bits 13-11, k in bits 10-0.
+  let Inst{13-11} = opcode;
+
+  let Inst{10-0} = k; 
+}
+
+//===----------------------------------------------------------------------===//
+// Pseudo instruction class in PIC16
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions carry no real encoding; the opcode bits are zeroed.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+  : PIC16Inst<outs, ins, asmstr, pattern> {
+   let Inst{13-6} = 0;
+}
diff --git a/lib/Target/PIC16/PIC16InstrInfo.cpp b/lib/Target/PIC16/PIC16InstrInfo.cpp
new file mode 100644
index 0000000..2fb405e
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrInfo.cpp
@@ -0,0 +1,238 @@
+//===- PIC16InstrInfo.cpp - PIC16 Instruction Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16ABINames.h"
+#include "PIC16InstrInfo.h"
+#include "PIC16TargetMachine.h"
+#include "PIC16GenInstrInfo.inc"
+#include "llvm/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdio>
+
+
+using namespace llvm;
+
+// FIXME: Add the subtarget support on this constructor.
+// Initialize the instruction table from the tablegen'erated PIC16Insts
+// array and construct the register info against this instr info and the
+// target machine's subtarget.
+PIC16InstrInfo::PIC16InstrInfo(PIC16TargetMachine &tm)
+  : TargetInstrInfoImpl(PIC16Insts, array_lengthof(PIC16Insts)),
+    TM(tm), 
+    RegInfo(*this, *TM.getSubtargetImpl()) {}
+
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  
+/// If not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned PIC16InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                            int &FrameIndex) const {
+  // A "movwf <reg>, <symbol>" writes W to a named memory location, so it is
+  // the store-to-slot candidate on PIC16.
+  if (MI->getOpcode() == PIC16::movwf 
+      && MI->getOperand(0).isReg()
+      && MI->getOperand(1).isSymbol()) {
+    // NOTE(review): operand 1 was just checked with isSymbol(), yet
+    // getIndex() is only meaningful for frame-index/constant-pool/jump-table
+    // operands -- confirm this path is ever exercised; as written it looks
+    // like it would assert at runtime.
+    FrameIndex = MI->getOperand(1).getIndex();
+    return MI->getOperand(0).getReg();
+  }
+  return 0;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the dest reg along with the FrameIndex of the stack slot.  
+/// If not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned PIC16InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                            int &FrameIndex) const {
+  // A "movf <symbol>, W" reads a named memory location into W, so it is
+  // the load-from-slot candidate on PIC16.
+  if (MI->getOpcode() == PIC16::movf 
+      && MI->getOperand(0).isReg()
+      && MI->getOperand(1).isSymbol()) {
+    // NOTE(review): same mismatch as isStoreToStackSlot -- isSymbol() is
+    // checked but getIndex() expects an index operand; verify this path.
+    FrameIndex = MI->getOperand(1).getIndex();
+    return MI->getOperand(0).getReg();
+  }
+  return 0;
+}
+
+
+void PIC16InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, 
+                                         MachineBasicBlock::iterator I,
+                                         unsigned SrcReg, bool isKill, int FI,
+                                         const TargetRegisterClass *RC) const {
+  PIC16TargetLowering *PTLI = TM.getTargetLowering();
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  // Spills live in the function's temporary data section; derive that
+  // section's external-symbol name from the function name.
+  const Function *Func = MBB.getParent()->getFunction();
+  const std::string FuncName = Func->getName();
+
+  const char *tmpName = createESName(PAN::getTempdataLabel(FuncName));
+
+  // On the order of operands here: think "movwf SrcReg, tmp_slot, offset".
+  if (RC == PIC16::GPRRegisterClass) {
+    //MachineFunction &MF = *MBB.getParent();
+    //MachineRegisterInfo &RI = MF.getRegInfo();
+    // 8-bit GPR spill: one byte in the temp area.
+    BuildMI(MBB, I, DL, get(PIC16::movwf))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addImm(PTLI->GetTmpOffsetForFI(FI, 1))
+      .addExternalSymbol(tmpName)
+      .addImm(1); // Emit banksel for it.
+  }
+  else if (RC == PIC16::FSR16RegisterClass) {
+    // This is a 16-bit register and the frameindex given by llvm is of
+    // size two here. Break this index N into two zero based indexes and 
+    // put one into the map. The second one is always obtained by adding 1
+    // to the first zero based index. In fact it is going to use 3 slots
+    // as saving FSRs corrupts W also and hence we need to save/restore W also.
+
+    // Pick the save pseudo matching the FSR being spilled.
+    unsigned opcode = (SrcReg == PIC16::FSR0) ? PIC16::save_fsr0 
+                                                 : PIC16::save_fsr1;
+    BuildMI(MBB, I, DL, get(opcode))
+      .addReg(SrcReg, getKillRegState(isKill))
+      .addImm(PTLI->GetTmpOffsetForFI(FI, 3))
+      .addExternalSymbol(tmpName)
+      .addImm(1); // Emit banksel for it.
+  }
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+void PIC16InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, 
+                                          MachineBasicBlock::iterator I,
+                                          unsigned DestReg, int FI,
+                                          const TargetRegisterClass *RC) const {
+  PIC16TargetLowering *PTLI = TM.getTargetLowering();
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  // Reloads come from the same per-function temporary data section used by
+  // storeRegToStackSlot.
+  const Function *Func = MBB.getParent()->getFunction();
+  const std::string FuncName = Func->getName();
+
+  const char *tmpName = createESName(PAN::getTempdataLabel(FuncName));
+
+  // On the order of operands here: think "movf FrameIndex, W".
+  if (RC == PIC16::GPRRegisterClass) {
+    //MachineFunction &MF = *MBB.getParent();
+    //MachineRegisterInfo &RI = MF.getRegInfo();
+    // 8-bit GPR reload: one byte from the temp area.
+    BuildMI(MBB, I, DL, get(PIC16::movf), DestReg)
+      .addImm(PTLI->GetTmpOffsetForFI(FI, 1))
+      .addExternalSymbol(tmpName)
+      .addImm(1); // Emit banksel for it.
+  }
+  else if (RC == PIC16::FSR16RegisterClass) {
+    // This is a 16-bit register and the frameindex given by llvm is of
+    // size two here. Break this index N into two zero based indexes and 
+    // put one into the map. The second one is always obtained by adding 1
+    // to the first zero based index. In fact it is going to use 3 slots
+    // as saving FSRs corrupts W also and hence we need to save/restore W also.
+
+    // Pick the restore pseudo matching the FSR being reloaded.
+    unsigned opcode = (DestReg == PIC16::FSR0) ? PIC16::restore_fsr0 
+                                                 : PIC16::restore_fsr1;
+    BuildMI(MBB, I, DL, get(opcode), DestReg)
+      .addImm(PTLI->GetTmpOffsetForFI(FI, 3))
+      .addExternalSymbol(tmpName)
+      .addImm(1); // Emit banksel for it.
+  }
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
+
+// Emit a register-to-register copy.  Copies are emitted as the target
+// pseudos copy_fsr / copy_w, which are expected to be coalesced away
+// before final code emission.  Returns false for unsupported classes.
+bool PIC16InstrInfo::copyRegToReg (MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   const TargetRegisterClass *DestRC,
+                                   const TargetRegisterClass *SrcRC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (DestRC == PIC16::FSR16RegisterClass) {
+    BuildMI(MBB, I, DL, get(PIC16::copy_fsr), DestReg).addReg(SrcReg);
+    return true;
+  }
+
+  if (DestRC == PIC16::GPRRegisterClass) {
+    BuildMI(MBB, I, DL, get(PIC16::copy_w), DestReg).addReg(SrcReg);
+    return true;
+  }
+
+  // Not yet supported.
+  return false;
+}
+
+// Recognize the two copy pseudos emitted by copyRegToReg so the register
+// coalescer can eliminate them.  On a match, fills in the dest/src regs
+// (operands 0 and 1) and returns true.
+bool PIC16InstrInfo::isMoveInstr(const MachineInstr &MI,
+                                 unsigned &SrcReg, unsigned &DestReg,
+                                 unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+  SrcSubIdx = DstSubIdx = 0; // No sub-registers.
+
+  if (MI.getOpcode() == PIC16::copy_fsr
+      || MI.getOpcode() == PIC16::copy_w) {
+    DestReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  }
+
+  return false;
+}
+
+/// InsertBranch - Insert a branch into the end of the specified
+/// MachineBasicBlock.  This operands to this method are the same as those
+/// returned by AnalyzeBranch.  This is invoked in cases where AnalyzeBranch
+/// returns success and when an unconditional branch (TBB is non-null, FBB is
+/// null, Cond is empty) needs to be inserted. It returns the number of
+/// instructions inserted.
+unsigned PIC16InstrInfo::
+InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, 
+             MachineBasicBlock *FBB,
+             const SmallVectorImpl<MachineOperand> &Cond) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  if (FBB == 0) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch?
+      DebugLoc dl = DebugLoc::getUnknownLoc();
+      BuildMI(&MBB, dl, get(PIC16::br_uncond)).addMBB(TBB);
+    }
+    // NOTE(review): when Cond is non-empty nothing is inserted above, yet
+    // 1 is still returned here -- confirm callers tolerate that.
+    return 1;
+  }
+
+  // FIXME: If the there are some conditions specified then conditional branch
+  // should be generated.   
+  // For the time being no instruction is being generated therefore
+  // returning NULL.
+  return 0;
+}
+
+// AnalyzeBranch - Examine the terminators of MBB.  This implementation only
+// performs one simplification: if AllowModify is set and the block ends in
+// an unconditional branch to its layout successor, the branch is deleted.
+// It always returns true, i.e. reports the branch as not analyzable, so no
+// Cond/TBB/FBB contract is established for callers.
+bool PIC16InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin())
+    return true;
+
+  // Get the terminator instruction.
+  --I;
+  // Handle unconditional branches. If the unconditional branch's target is
+  // successor basic block then remove the unconditional branch. 
+  if (I->getOpcode() == PIC16::br_uncond  && AllowModify) {
+    if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+      TBB = 0;
+      I->eraseFromParent();
+    }
+  }
+  return true;
+}
diff --git a/lib/Target/PIC16/PIC16InstrInfo.h b/lib/Target/PIC16/PIC16InstrInfo.h
new file mode 100644
index 0000000..56f51f0
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrInfo.h
@@ -0,0 +1,78 @@
+//===- PIC16InstrInfo.h - PIC16 Instruction Information----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16INSTRUCTIONINFO_H
+#define PIC16INSTRUCTIONINFO_H
+
+#include "PIC16.h"
+#include "PIC16RegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+
+// PIC16InstrInfo - PIC16-specific TargetInstrInfo.  Owns the target's
+// register info and implements spill/reload, copy, and branch hooks.
+class PIC16InstrInfo : public TargetInstrInfoImpl 
+{
+  PIC16TargetMachine &TM;
+  const PIC16RegisterInfo RegInfo;
+public:
+  explicit PIC16InstrInfo(PIC16TargetMachine &TM);
+
+  virtual const PIC16RegisterInfo &getRegisterInfo() const { return RegInfo; }
+
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI, 
+                                       int &FrameIndex) const;
+
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI, 
+                                      int &FrameIndex) const;
+
+  // Spill SrcReg to the temp-data slot for FrameIndex.
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+
+  // Reload DestReg from the temp-data slot for FrameIndex.
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC) const;
+  // Emit a reg-to-reg copy via the copy_fsr / copy_w pseudos.
+  virtual bool copyRegToReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, unsigned SrcReg,
+                            const TargetRegisterClass *DestRC,
+                            const TargetRegisterClass *SrcRC) const;
+  // Recognize the copy pseudos for the coalescer.
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg,
+                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+
+  // Branch analysis/insertion hooks (see .cpp for current limitations).
+  virtual 
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond) const; 
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  };
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16InstrInfo.td b/lib/Target/PIC16/PIC16InstrInfo.td
new file mode 100644
index 0000000..24df251
--- /dev/null
+++ b/lib/Target/PIC16/PIC16InstrInfo.td
@@ -0,0 +1,540 @@
+//===- PIC16InstrInfo.td - PIC16 Instruction defs -------------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PIC16 instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// PIC16 Specific Type Constraints.
+//===----------------------------------------------------------------------===//
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Specific Type Profiles.
+//===----------------------------------------------------------------------===//
+
+// Generic type profiles for i8/i16 unary/binary operations.
+// Taking one i8 or i16 and producing void.
+def SDTI8VoidOp : SDTypeProfile<0, 1, [SDTCisI8<0>]>;
+def SDTI16VoidOp : SDTypeProfile<0, 1, [SDTCisI16<0>]>;
+
+// Taking one value and producing an output of same type.
+def SDTI8UnaryOp : SDTypeProfile<1, 1, [SDTCisI8<0>, SDTCisI8<1>]>;
+def SDTI16UnaryOp : SDTypeProfile<1, 1, [SDTCisI16<0>, SDTCisI16<1>]>;
+
+// Taking two values and producing an output of same type.
+def SDTI8BinOp : SDTypeProfile<1, 2, [SDTCisI8<0>, SDTCisI8<1>, SDTCisI8<2>]>;
+def SDTI16BinOp : SDTypeProfile<1, 2, [SDTCisI16<0>, SDTCisI16<1>, 
+                                       SDTCisI16<2>]>;
+
+// Node specific type profiles.
+def SDT_PIC16Load : SDTypeProfile<1, 3, [SDTCisI8<0>, SDTCisI8<1>, 
+                                          SDTCisI8<2>, SDTCisI8<3>]>;
+
+def SDT_PIC16Store : SDTypeProfile<0, 4, [SDTCisI8<0>, SDTCisI8<1>, 
+                                          SDTCisI8<2>, SDTCisI8<3>]>;
+
+def SDT_PIC16Connect : SDTypeProfile<1, 2, [SDTCisI8<0>, SDTCisI8<1>,
+                                            SDTCisI8<2>]>;
+
+// PIC16ISD::CALL type profile
+def SDT_PIC16call : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def SDT_PIC16callw : SDTypeProfile<1, -1, [SDTCisInt<0>]>;
+
+// PIC16ISD::BRCOND
+def SDT_PIC16Brcond: SDTypeProfile<0, 2, 
+                                   [SDTCisVT<0, OtherVT>, SDTCisI8<1>]>;
+
+// PIC16ISD::BRCOND
+def SDT_PIC16Selecticc: SDTypeProfile<1, 3, 
+                                   [SDTCisI8<0>, SDTCisI8<1>, SDTCisI8<2>,
+                                    SDTCisI8<3>]>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 addressing modes matching via DAG.
+//===----------------------------------------------------------------------===//
+def diraddr : ComplexPattern<i8, 1, "SelectDirectAddr", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+def PIC16callseq_start : SDNode<"ISD::CALLSEQ_START", SDTI8VoidOp,
+                                [SDNPHasChain, SDNPOutFlag]>;
+def PIC16callseq_end   : SDNode<"ISD::CALLSEQ_END", SDTI8VoidOp, 
+                                [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Low 8-bits of GlobalAddress.
+def PIC16Lo : SDNode<"PIC16ISD::Lo", SDTI8BinOp>;  
+
+// High 8-bits of GlobalAddress.
+def PIC16Hi : SDNode<"PIC16ISD::Hi", SDTI8BinOp>;
+
+// The MTHI and MTLO nodes are used only to match them in the incoming 
+// DAG for replacement by corresponding set_fsrhi, set_fsrlo instructions.
+// These nodes are not used for defining any instructions.
+def MTLO     : SDNode<"PIC16ISD::MTLO", SDTI8UnaryOp>;
+def MTHI     : SDNode<"PIC16ISD::MTHI", SDTI8UnaryOp>;
+def MTPCLATH : SDNode<"PIC16ISD::MTPCLATH", SDTI8UnaryOp>;
+
+// Node to generate Bank Select for a GlobalAddress.
+def Banksel : SDNode<"PIC16ISD::Banksel", SDTI8UnaryOp>;
+
+// Node to match a direct store operation.
+def PIC16Store : SDNode<"PIC16ISD::PIC16Store", SDT_PIC16Store, [SDNPHasChain]>;
+def PIC16StWF : SDNode<"PIC16ISD::PIC16StWF", SDT_PIC16Store, 
+                       [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+
+// Node to match a direct load operation.
+def PIC16Load  : SDNode<"PIC16ISD::PIC16Load", SDT_PIC16Load, [SDNPHasChain]>;
+def PIC16LdArg  : SDNode<"PIC16ISD::PIC16LdArg", SDT_PIC16Load, [SDNPHasChain]>;
+def PIC16LdWF  : SDNode<"PIC16ISD::PIC16LdWF", SDT_PIC16Load, 
+                       [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def PIC16Connect: SDNode<"PIC16ISD::PIC16Connect", SDT_PIC16Connect, []>;
+
+// Node to match PIC16 call
+def PIC16call : SDNode<"PIC16ISD::CALL", SDT_PIC16call,
+                              [SDNPHasChain , SDNPOptInFlag, SDNPOutFlag]>;
+def PIC16callw : SDNode<"PIC16ISD::CALLW", SDT_PIC16callw,
+                              [SDNPHasChain , SDNPOptInFlag, SDNPOutFlag]>;
+
+// Node to match a comparison instruction.
+def PIC16Subcc : SDNode<"PIC16ISD::SUBCC", SDTI8BinOp, [SDNPOutFlag]>;
+
+// Node to match a conditional branch.
+def PIC16Brcond : SDNode<"PIC16ISD::BRCOND", SDT_PIC16Brcond, 
+                         [SDNPHasChain, SDNPInFlag]>;
+
+def PIC16Selecticc : SDNode<"PIC16ISD::SELECT_ICC", SDT_PIC16Selecticc, 
+                         [SDNPInFlag]>;
+
+def PIC16ret       : SDNode<"PIC16ISD::RET", SDTNone, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Operand Definitions.
+//===----------------------------------------------------------------------===//
+def i8mem : Operand<i8>;
+def brtarget: Operand<OtherVT>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+  def CCOp : Operand<i8>;
+
+include "PIC16InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PIC16 Common Classes.
+//===----------------------------------------------------------------------===//
+
+// W = W Op F : Load the value from F and do Op to W.
+// $ptrlo names the memory operand, $ptrhi its banksel byte, and $offset the
+// byte offset from $ptrlo.
+let isTwoAddress = 1, mayLoad = 1 in
+class BinOpFW<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+  ByteFormat<OpCode, (outs GPR:$dst),
+             (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+              !strconcat(OpcStr, " $ptrlo + $offset, W"),
+             [(set GPR:$dst, (OpNode GPR:$src, (PIC16Load diraddr:$ptrlo,
+                                             (i8 imm:$ptrhi),
+                                             (i8 imm:$offset))))]>;
+
+// F = F Op W : Load the value from F, do op with W and store in F.
+// This insn class is not marked as TwoAddress because the reg is
+// being used as a source operand only. (Remember a TwoAddress insn
+// needs a copyRegToReg.)
+// The pattern expresses the read-modify-write as a PIC16Load feeding the op,
+// whose result is stored back to the same direct address.
+let mayStore = 1 in
+class BinOpWF<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+  ByteFormat<OpCode, (outs),
+             (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+              !strconcat(OpcStr, " $ptrlo + $offset, F"),
+             [(PIC16Store (OpNode GPR:$src, (PIC16Load diraddr:$ptrlo,
+                                             (i8 imm:$ptrhi),
+                                             (i8 imm:$offset))),
+                                             diraddr:$ptrlo,
+                                             (i8 imm:$ptrhi), (i8 imm:$offset)
+                                             )]>;
+
+// W = W Op L : Do Op of L with W and place result in W.
+// $literal is an 8-bit immediate; $src/$dst are tied (two-address).
+let isTwoAddress = 1 in
+class BinOpWL<bits<6> opcode, string OpcStr, SDNode OpNode> :
+  LiteralFormat<opcode, (outs GPR:$dst),
+                (ins GPR:$src, i8imm:$literal),
+                !strconcat(OpcStr, " $literal"),
+                [(set GPR:$dst, (OpNode GPR:$src, (i8 imm:$literal)))]>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Instructions.
+//===----------------------------------------------------------------------===//
+
+// Pseudo-instructions.
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i8imm:$amt),
+                       "!ADJCALLSTACKDOWN $amt",
+                       [(PIC16callseq_start imm:$amt)]>;
+
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i8imm:$amt),
+                       "!ADJCALLSTACKUP $amt", 
+                       [(PIC16callseq_end imm:$amt)]>;
+
+//-----------------------------------
+// Various movlw insn patterns.
+//-----------------------------------
+let isReMaterializable = 1 in {
+// Move 8-bit literal to W.
+// NOTE(review): these defs use BitFormat with opcode 12 although movlw is a
+// literal-style operation; the encoding bits look like placeholders --
+// confirm before relying on the Inst fields.
+def movlw : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src),
+                      "movlw $src",
+                      [(set GPR:$dst, (i8 imm:$src))]>;
+
+// Move a Lo(TGA) to W.
+def movlw_lo_1 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+                      "movlw LOW(${src} + ${src2})",
+                      [(set GPR:$dst, (PIC16Lo tglobaladdr:$src, imm:$src2 ))]>;
+
+// Move a Lo(TES) to W.
+def movlw_lo_2 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+                      "movlw LOW(${src} + ${src2})",
+                      [(set GPR:$dst, (PIC16Lo texternalsym:$src, imm:$src2 ))]>;
+
+// Move a Hi(TGA) to W.
+def movlw_hi_1 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+                      "movlw HIGH(${src} + ${src2})",
+                      [(set GPR:$dst, (PIC16Hi tglobaladdr:$src, imm:$src2))]>;
+
+// Move a Hi(TES) to W.
+def movlw_hi_2 : BitFormat<12, (outs GPR:$dst), (ins i8imm:$src, i8imm:$src2),
+                      "movlw HIGH(${src} + ${src2})",
+                      [(set GPR:$dst, (PIC16Hi texternalsym:$src, imm:$src2))]>;
+}
+
+//-------------------
+// FSR setting insns. 
+//-------------------
+// These insns are matched via a DAG replacement pattern.
+def set_fsrlo:
+  ByteFormat<0, (outs FSR16:$fsr), 
+             (ins GPR:$val),
+             "movwf ${fsr}L",
+             []>;
+
+let isTwoAddress = 1 in
+def set_fsrhi:
+  ByteFormat<0, (outs FSR16:$dst), 
+             (ins FSR16:$src, GPR:$val),
+             "movwf ${dst}H",
+             []>;
+
+def set_pclath:
+  ByteFormat<0, (outs PCLATHR:$dst), 
+             (ins GPR:$val),
+             "movwf ${dst}",
+             [(set PCLATHR:$dst , (MTPCLATH GPR:$val))]>;
+
+//----------------------------
+// copyRegToReg 
+// copyRegToReg insns. These are dummy. They should always be deleted
+// by the optimizer and never be present in the final generated code.
+// if they are, then we have to write correct macros for these insns.
+//----------------------------
+def copy_fsr:
+  Pseudo<(outs FSR16:$dst), (ins FSR16:$src), "copy_fsr $dst, $src", []>;
+
+def copy_w:
+  Pseudo<(outs GPR:$dst), (ins GPR:$src), "copy_w $dst, $src", []>;
+
+class SAVE_FSR<string OpcStr>:
+  Pseudo<(outs), 
+         (ins FSR16:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi), 
+         !strconcat(OpcStr, " $ptrlo, $offset"),
+         []>; 
+ 
+def save_fsr0: SAVE_FSR<"save_fsr0">;
+def save_fsr1: SAVE_FSR<"save_fsr1">;
+
+class RESTORE_FSR<string OpcStr>:
+  Pseudo<(outs FSR16:$dst), 
+         (ins i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi), 
+         !strconcat(OpcStr, " $ptrlo, $offset"),
+         []>; 
+
+def restore_fsr0: RESTORE_FSR<"restore_fsr0">;
+def restore_fsr1: RESTORE_FSR<"restore_fsr1">;
+
+//--------------------------
+// Store to memory
+//-------------------------
+
+// Direct store.
+// Input operands are: val = W, ptrlo = GA, offset = offset, ptrhi = banksel.
+// OpNodeDest selects the address kind (tglobaladdr / texternalsym) and Op
+// the store node (PIC16Store / PIC16StWF) so one class covers all variants.
+let mayStore = 1 in
+class MOVWF_INSN<bits<6> OpCode, SDNode OpNodeDest, SDNode Op>:
+  ByteFormat<0, (outs), 
+             (ins GPR:$val, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+             "movwf ${ptrlo} + ${offset}",
+             [(Op GPR:$val, OpNodeDest:$ptrlo, (i8 imm:$ptrhi), 
+               (i8 imm:$offset))]>;
+
+// Store W to a Global Address.
+def movwf : MOVWF_INSN<0, tglobaladdr, PIC16Store>;
+
+// Store W to an External Symbol.
+def movwf_1 : MOVWF_INSN<0, texternalsym, PIC16Store>;
+
+// Store with InFlag and OutFlag
+// This is same as movwf_1 but has a flag. A flag is required to 
+// order the stores while passing the params to function.
+def movwf_2 : MOVWF_INSN<0, texternalsym, PIC16StWF>;
+
+// Indirect store. Matched via a DAG replacement pattern.
+def store_indirect : 
+  ByteFormat<0, (outs), 
+             (ins GPR:$val, FSR16:$fsr, i8imm:$offset),
+             "movwi $offset[$fsr]",
+             []>;
+
+//----------------------------
+// Load from memory
+//----------------------------
+// Direct load.
+// Input Operands are: ptrlo = GA, offset = offset, ptrhi = banksel.
+// Output: dst = W
+// OpNodeSrc selects the address kind and Op the load node, mirroring
+// MOVWF_INSN above.  Loads clobber STATUS.
+let Defs = [STATUS], mayLoad = 1 in
+class MOVF_INSN<bits<6> OpCode, SDNode OpNodeSrc, SDNode Op>:
+  ByteFormat<0, (outs GPR:$dst), 
+             (ins i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+             "movf ${ptrlo} + ${offset}, W",
+             [(set GPR:$dst, 
+               (Op OpNodeSrc:$ptrlo, (i8 imm:$ptrhi),
+               (i8 imm:$offset)))]>;
+
+// Load from a GA.
+def movf : MOVF_INSN<0, tglobaladdr, PIC16Load>;
+
+// Load from an ES.
+def movf_1 : MOVF_INSN<0, texternalsym, PIC16Load>;
+def movf_1_1 : MOVF_INSN<0, texternalsym, PIC16LdArg>;
+
+// Load with InFlag and OutFlag
+// This is same as movf_1 but has a flag. A flag is required to 
+// order the loads while copying the return value of a function.
+def movf_2 : MOVF_INSN<0, texternalsym, PIC16LdWF>;
+
+// Indirect load. Matched via a DAG replacement pattern.
+def load_indirect : 
+  ByteFormat<0, (outs GPR:$dst), 
+             (ins FSR16:$fsr, i8imm:$offset),
+             "moviw $offset[$fsr]",
+             []>;
+
+//-------------------------
+// Bitwise operations patterns
+//--------------------------
+// W = W op [F]
+let Defs = [STATUS] in {
+def OrFW :  BinOpFW<0, "iorwf", or>;
+def XOrFW : BinOpFW<0, "xorwf", xor>;
+def AndFW : BinOpFW<0, "andwf", and>;
+
+// F = W op [F]
+def OrWF :  BinOpWF<0, "iorwf", or>;
+def XOrWF : BinOpWF<0, "xorwf", xor>;
+def AndWF : BinOpWF<0, "andwf", and>;
+
+//-------------------------
+// Various add/sub patterns.
+//-------------------------
+
+// W = W + [F]
+def addfw_1: BinOpFW<0, "addwf", add>;
+def addfw_2: BinOpFW<0, "addwf", addc>;
+
+let Uses = [STATUS] in
+def addfwc: BinOpFW<0, "addwfc", adde>;  // With Carry.
+
+// F = W + [F]
+def addwf_1: BinOpWF<0, "addwf", add>;
+def addwf_2: BinOpWF<0, "addwf", addc>;
+let Uses = [STATUS] in
+def addwfc: BinOpWF<0, "addwfc", adde>;  // With Carry.
+}
+
+// W -= [F] ; load from F and sub the value from W.
+// Note the operand order in the pattern: the loaded value is the minuend
+// and W ($src) the subtrahend, matching subwf semantics.
+let isTwoAddress = 1, mayLoad = 1 in
+class SUBFW<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+  ByteFormat<OpCode, (outs GPR:$dst),
+             (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+              !strconcat(OpcStr, " $ptrlo + $offset, W"),
+             [(set GPR:$dst, (OpNode (PIC16Load diraddr:$ptrlo,
+                                      (i8 imm:$ptrhi), (i8 imm:$offset)),
+                                      GPR:$src))]>;
+let Defs = [STATUS] in {
+def subfw_1: SUBFW<0, "subwf", sub>;
+def subfw_2: SUBFW<0, "subwf", subc>;
+
+let Uses = [STATUS] in
+def subfwb: SUBFW<0, "subwfb", sube>;  // With Borrow.
+
+}
+let Defs = [STATUS], isTerminator = 1 in
+def subfw_cc: SUBFW<0, "subwf", PIC16Subcc>;
+
+// [F] -= W ; 
+// Read-modify-write form: load F, subtract W, store the result back to F.
+let mayStore = 1 in
+class SUBWF<bits<6> OpCode, string OpcStr, SDNode OpNode>:
+  ByteFormat<OpCode, (outs),
+             (ins GPR:$src, i8imm:$offset, i8mem:$ptrlo, i8imm:$ptrhi),
+              !strconcat(OpcStr, " $ptrlo + $offset"),
+             [(PIC16Store (OpNode (PIC16Load diraddr:$ptrlo,
+                                      (i8 imm:$ptrhi), (i8 imm:$offset)),
+                                      GPR:$src), diraddr:$ptrlo,
+                                      (i8 imm:$ptrhi), (i8 imm:$offset))]>;
+
+let Defs = [STATUS] in {
+def subwf_1: SUBWF<0, "subwf", sub>;
+def subwf_2: SUBWF<0, "subwf", subc>;
+
+let Uses = [STATUS] in
+  def subwfb: SUBWF<0, "subwfb", sube>;  // With Borrow.
+
+def subwf_cc: SUBWF<0, "subwf", PIC16Subcc>;
+}
+
+// addlw 
+let Defs = [STATUS] in {
+def addlw_1 : BinOpWL<0, "addlw", add>;
+def addlw_2 : BinOpWL<0, "addlw", addc>;
+
+let Uses = [STATUS] in
+def addlwc : BinOpWL<0, "addlwc", adde>; // With Carry. (Assembler macro).
+
+// bitwise operations involving a literal and w.
+def andlw : BinOpWL<0, "andlw", and>;
+def xorlw : BinOpWL<0, "xorlw", xor>;
+def orlw  : BinOpWL<0, "iorlw", or>;
+}
+
+// sublw 
+// W = C - W ; sub W from literal. (Without borrow).
+// Operand order in the pattern: literal first, W second.
+let isTwoAddress = 1 in
+class SUBLW<bits<6> opcode, string OpcStr, SDNode OpNode> :
+  LiteralFormat<opcode, (outs GPR:$dst),
+                (ins GPR:$src, i8imm:$literal),
+                !strconcat(OpcStr, " $literal"),
+                [(set GPR:$dst, (OpNode (i8 imm:$literal), GPR:$src))]>;
+// subwl 
+// W = W - C ; sub literal from W  (Without borrow).
+// Same shape as SUBLW with the operands swapped: W first, literal second.
+let isTwoAddress = 1 in
+class SUBWL<bits<6> opcode, string OpcStr, SDNode OpNode> :
+  LiteralFormat<opcode, (outs GPR:$dst),
+                (ins GPR:$src, i8imm:$literal),
+                !strconcat(OpcStr, " $literal"),
+                [(set GPR:$dst, (OpNode GPR:$src, (i8 imm:$literal)))]>;
+
+let Defs = [STATUS] in {
+def sublw_1 : SUBLW<0, "sublw", sub>;
+def sublw_2 : SUBLW<0, "sublw", subc>;
+def sublw_3 : SUBLW<0, "sublwb", sube>; // With borrow (Assembler macro).
+
+def sublw_4 : SUBWL<0, "subwl", sub>;   // Assembler macro replace with addlw
+def sublw_5 : SUBWL<0, "subwl", subc>;  // Assembler macro replace with addlw
+def sublw_6 : SUBWL<0, "subwlb", sube>; // With borrow (Assembler macro).
+}
+let Defs = [STATUS], isTerminator = 1 in 
+def sublw_cc : SUBLW<0, "sublw", PIC16Subcc>;
+
+// Call instruction.
+let isCall = 1,
+    Defs = [W, FSR0, FSR1] in {
+    def CALL: LiteralFormat<0x1, (outs), (ins i8imm:$func),
+            //"call ${func} + 2",
+            "call ${func}",
+            [(PIC16call diraddr:$func)]>;
+}
+
+let isCall = 1,
+    Defs = [W, FSR0, FSR1] in {
+    def CALL_1: LiteralFormat<0x1, (outs), (ins GPR:$func, PCLATHR:$pc),
+            "callw",
+            [(PIC16call (PIC16Connect GPR:$func, PCLATHR:$pc))]>;
+}
+
+let isCall = 1,
+    Defs = [FSR0, FSR1] in {
+    def CALLW: LiteralFormat<0x1, (outs GPR:$dest), 
+                                  (ins GPR:$func, PCLATHR:$pc),
+            "callw",
+            [(set GPR:$dest, (PIC16callw (PIC16Connect GPR:$func, PCLATHR:$pc)))]>;
+}
+
+let Uses = [STATUS], isBranch = 1, isTerminator = 1, hasDelaySlot = 0 in
+def pic16brcond: ControlFormat<0x0, (outs), (ins brtarget:$dst, CCOp:$cc),
+                          "b$cc $dst",
+                          [(PIC16Brcond bb:$dst, imm:$cc)]>;
+
+// Unconditional branch.
+let isBranch = 1, isTerminator = 1, hasDelaySlot = 0 in
+def br_uncond: ControlFormat<0x0, (outs), (ins brtarget:$dst),
+                          "goto $dst",
+                          [(br bb:$dst)]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1 in {   // Expanded after instruction selection.
+  def SELECT_CC_Int_ICC
+   : Pseudo<(outs GPR:$dst), (ins GPR:$T, GPR:$F, i8imm:$Cond),
+            "; SELECT_CC_Int_ICC PSEUDO!",
+            [(set GPR:$dst, (PIC16Selecticc GPR:$T, GPR:$F,
+                                             imm:$Cond))]>;
+}
+
+
+// Banksel.
+def banksel : 
+  Pseudo<(outs),
+         (ins i8mem:$ptr),
+         "banksel $ptr",
+         []>;
+
+def pagesel : 
+  Pseudo<(outs),
+         (ins i8mem:$ptr),
+         "movlp $ptr",
+         []>;
+
+
+// Return insn.
+let isTerminator = 1, isBarrier = 1, isReturn = 1 in
+def Return : 
+  ControlFormat<0, (outs), (ins), "return", [(PIC16ret)]>;
+
+//===----------------------------------------------------------------------===//
+// PIC16 Replacement Patterns.
+//===----------------------------------------------------------------------===//
+
+// Identify an indirect store and select insns for it.
+def : Pat<(PIC16Store GPR:$val, (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr), 
+           imm:$offset),
+          (store_indirect GPR:$val, 
+           (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+           imm:$offset)>;
+
+def : Pat<(PIC16StWF GPR:$val, (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr), 
+           imm:$offset),
+          (store_indirect GPR:$val, 
+           (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+           imm:$offset)>;
+
+// Identify an indirect load and select insns for it.
+def : Pat<(PIC16Load (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr), 
+           imm:$offset),
+          (load_indirect  (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+           imm:$offset)>;
+
+def : Pat<(PIC16LdWF (MTLO GPR:$loaddr), (MTHI GPR:$hiaddr), 
+           imm:$offset),
+          (load_indirect  (set_fsrhi (set_fsrlo GPR:$loaddr), GPR:$hiaddr),
+           imm:$offset)>;
+
diff --git a/lib/Target/PIC16/PIC16MCAsmInfo.cpp b/lib/Target/PIC16/PIC16MCAsmInfo.cpp
new file mode 100644
index 0000000..b080542
--- /dev/null
+++ b/lib/Target/PIC16/PIC16MCAsmInfo.cpp
@@ -0,0 +1,59 @@
+//===-- PIC16MCAsmInfo.cpp - PIC16 asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the PIC16MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16MCAsmInfo.h"
+
+// FIXME: Layering violation to get enums and static function, should be moved
+// to separate headers.
+#include "PIC16.h"
+#include "PIC16ABINames.h"
+#include "PIC16ISelLowering.h"
+using namespace llvm;
+
+// PIC16MCAsmInfo - Configure the asm properties the MPASM-style PIC16
+// assembler expects: ';' comments, db/dw/dl data directives, and the
+// PAN-derived prefix for global symbols. TT (the target triple) is unused.
+PIC16MCAsmInfo::PIC16MCAsmInfo(const Target &T, const StringRef &TT) {
+  CommentString = ";";
+  GlobalPrefix = PAN::getTagName(PAN::PREFIX_SYMBOL);
+  GlobalDirective = "\tglobal\t";
+  ExternDirective = "\textern\t";
+
+  Data8bitsDirective = " db ";
+  Data16bitsDirective = " dw ";
+  Data32bitsDirective = " dl ";
+  Data64bitsDirective = NULL;  // PIC16 has no 64-bit data directive.
+  ZeroDirective = NULL;
+  AsciiDirective = " dt ";
+  AscizDirective = NULL;
+    
+  // ROM-space variants, returned by getDataASDirective().
+  RomData8bitsDirective = " dw ";
+  RomData16bitsDirective = " rom_di ";
+  RomData32bitsDirective = " rom_dl ";
+  HasSetDirective = false;  
+    
+  // Set it to false because we need to generate c file name and not bc file
+  // name.
+  HasSingleParameterDotFile = false;
+}
+
+/// getDataASDirective - Return the data directive for a value of the given
+/// bit Size in address space AS. Only the ROM address space
+/// (PIC16ISD::ROM_SPACE) has special directives; any other address space
+/// returns 0 so the generic Data*bitsDirective is used instead.
+const char *PIC16MCAsmInfo::getDataASDirective(unsigned Size,
+                                               unsigned AS) const {
+  if (AS != PIC16ISD::ROM_SPACE)
+    return 0;
+  
+  switch (Size) {
+  case  8: return RomData8bitsDirective;
+  case 16: return RomData16bitsDirective;
+  case 32: return RomData32bitsDirective;
+  default: return NULL;  // No ROM directive for other sizes.
+  }
+}
+
diff --git a/lib/Target/PIC16/PIC16MCAsmInfo.h b/lib/Target/PIC16/PIC16MCAsmInfo.h
new file mode 100644
index 0000000..e84db85
--- /dev/null
+++ b/lib/Target/PIC16/PIC16MCAsmInfo.h
@@ -0,0 +1,35 @@
+//=====-- PIC16MCAsmInfo.h - PIC16 asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the PIC16MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16TARGETASMINFO_H
+#define PIC16TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+
+  /// PIC16MCAsmInfo - MCAsmInfo subclass holding the PIC16/MPASM assembler
+  /// syntax properties, plus the directives used for data placed in the
+  /// ROM address space.
+  class PIC16MCAsmInfo : public MCAsmInfo {
+    // Directives for 8/16/32-bit data emitted into ROM (program memory).
+    const char *RomData8bitsDirective;
+    const char *RomData16bitsDirective;
+    const char *RomData32bitsDirective;
+  public:    
+    PIC16MCAsmInfo(const Target &T, const StringRef &TT);
+    
+    /// Return the ROM-space directive for the given bit size, or 0 when
+    /// AS is not the ROM address space.
+    virtual const char *getDataASDirective(unsigned size, unsigned AS) const;
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16MemSelOpt.cpp b/lib/Target/PIC16/PIC16MemSelOpt.cpp
new file mode 100644
index 0000000..cc71b04
--- /dev/null
+++ b/lib/Target/PIC16/PIC16MemSelOpt.cpp
@@ -0,0 +1,175 @@
+//===-- PIC16MemSelOpt.cpp - PIC16 banksel optimizer  --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which optimizes the emitting of banksel
+// instructions before accessing data memory. It currently works within
+// a basic block only and keeps track of the last accessed memory bank.
+// If memory accesses continue to be in the same bank, it changes the banksel
+// immediate, which is a part of the insn accessing the data memory, from 1
+// to zero. The asm printer emits a banksel only if that immediate is 1.
+//
+// FIXME: this is not implemented yet.  The banksel pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-codegen"
+#include "PIC16.h"
+#include "PIC16ABINames.h"
+#include "PIC16InstrInfo.h"
+#include "PIC16MCAsmInfo.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/DerivedTypes.h"
+
+using namespace llvm;
+
+namespace {
+  /// MemSelOpt - Machine function pass that inserts banksel/pagesel pseudo
+  /// instructions before memory-accessing instructions, tracking the
+  /// currently selected bank within each basic block to avoid redundant
+  /// bankselects.
+  struct MemSelOpt : public MachineFunctionPass {
+    static char ID;
+    MemSelOpt() : MachineFunctionPass(&ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // Inserting bank/page selects does not alter the CFG, so loop and
+      // dominator info survive this pass.
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const { 
+      return "PIC16 Memsel Optimizer"; 
+    }
+
+   bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+   bool processInstruction(MachineInstr *MI);
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+    MachineBasicBlock *MBB;     // Current basic block
+    std::string CurBank;        // Section name of the currently selected bank.
+
+  };
+  char MemSelOpt::ID = 0;
+}
+
+/// createPIC16MemSelOptimizerPass - Public entry point to create the
+/// banksel/pagesel insertion pass.
+FunctionPass *llvm::createPIC16MemSelOptimizerPass() { 
+  return new MemSelOpt(); 
+}
+
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// banksel/pagesel instructions where required. Returns true if any
+/// instruction was inserted.
+///
+bool MemSelOpt::runOnMachineFunction(MachineFunction &MF) {
+  TII = MF.getTarget().getInstrInfo();
+  bool Changed = false;
+  for (MachineFunction::iterator I = MF.begin(), E = MF.end();
+       I != E; ++I) {
+    Changed |= processBasicBlock(MF, *I);
+  }
+
+  return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic
+/// block, inserting banksel/pagesel instructions as needed. The bank
+/// tracking (CurBank) is reset at block entry.
+///
+bool MemSelOpt::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  // Let us assume that when entering a basic block no bank is selected.
+  // Ideally we should look at the predecessors for this information.
+  CurBank=""; 
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    Changed |= processInstruction(I);
+  }
+  return Changed;
+}
+
+/// processInstruction - If MI accesses memory, insert a pagesel (for calls
+/// and unconditional branches) or a banksel (for data accesses in a bank
+/// other than the tracked CurBank) immediately before it. Returns true if
+/// an instruction was inserted.
+bool MemSelOpt::processInstruction(MachineInstr *MI) {
+  bool Changed = false;
+
+  unsigned NumOperands = MI->getNumOperands();
+  if (NumOperands == 0) return false;
+
+
+  // If this insn is not going to access any memory, return.
+  const TargetInstrDesc &TID = TII->get(MI->getOpcode());
+  if (!(TID.isBranch() || TID.isCall() || TID.mayLoad() || TID.mayStore()))
+    return false;
+
+  // Scan for the memory address operand.
+  // FIXME: Should we use standard interfaces like memoperands_iterator,
+  // hasMemOperand() etc ?
+  int MemOpPos = -1;
+  for (unsigned i = 0; i < NumOperands; i++) {
+    // NOTE(review): this copies the operand; a const reference would avoid
+    // the copy since only getType() is read here.
+    MachineOperand Op = MI->getOperand(i);
+    if (Op.getType() ==  MachineOperand::MO_GlobalAddress ||
+        Op.getType() ==  MachineOperand::MO_ExternalSymbol || 
+        Op.getType() ==  MachineOperand::MO_MachineBasicBlock) {
+      // We found one mem operand. Next one may be BS.
+      MemOpPos = i;
+      break;
+    }
+  }
+
+  // If we did not find any operand accessing memory, continue.
+  if (MemOpPos == -1) return Changed;
+ 
+  // Get the MemOp.
+  MachineOperand &Op = MI->getOperand(MemOpPos);
+
+  // If this is a pagesel material, handle it first.
+  if (MI->getOpcode() == PIC16::CALL ||
+      MI->getOpcode() == PIC16::br_uncond) {
+    DebugLoc dl = MI->getDebugLoc();
+    BuildMI(*MBB, MI, dl, TII->get(PIC16::pagesel)).
+      addOperand(Op);
+    return true;
+  }
+
+  // Get the section name(NewBank) for MemOp.
+  // This assumes that the section names for globals are already set by
+  // AsmPrinter->doInitialization.
+  std::string NewBank = CurBank;
+  if (Op.getType() ==  MachineOperand::MO_GlobalAddress &&
+      Op.getGlobal()->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE) {
+    NewBank = Op.getGlobal()->getSection();
+  } else if (Op.getType() ==  MachineOperand::MO_ExternalSymbol) {
+    // External Symbol is generated for temp data and arguments. They are
+    // in fpdata.<functionname>.# section.
+    std::string Sym = Op.getSymbolName();
+    NewBank = PAN::getSectionNameForSym(Sym);
+  }
+
+  // If the section is shared section, do not emit banksel.
+  // (Shared RAM is mapped into every bank, so no selection is needed.)
+  if (NewBank == PAN::getSharedUDataSectionName())
+    return Changed;
+
+  // If the previous and new section names are same, we don't need to
+  // emit banksel. 
+  if (NewBank.compare(CurBank) != 0 ) {
+    DebugLoc dl = MI->getDebugLoc();
+    BuildMI(*MBB, MI, dl, TII->get(PIC16::banksel)).
+      addOperand(Op);
+    Changed = true;
+    CurBank = NewBank;
+  }
+
+  return Changed;
+}
+
diff --git a/lib/Target/PIC16/PIC16Passes/Makefile b/lib/Target/PIC16/PIC16Passes/Makefile
new file mode 100644
index 0000000..9684b8d
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Passes/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PIC16/PIC16Passes/Makefile -----------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source 
+# License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+TARGET = PIC16
+# The IR-level PIC16 passes are built as their own archive so they can be
+# linked (e.g. into opt/llvm-ld) without the rest of the code generator.
+LIBRARYNAME = LLVMpic16passes
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
+
diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp
new file mode 100644
index 0000000..197c398
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.cpp
@@ -0,0 +1,181 @@
+//===-- PIC16Overlay.cpp - Implementation for PIC16 Frame Overlay ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 Frame Overlay implementation.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+#include "llvm/Instructions.h"
+#include "llvm/Value.h"
+#include "PIC16Overlay.h"
+#include "llvm/Function.h"
+#include <cstdlib>
+#include <sstream>
+using namespace llvm;
+
+namespace llvm {
+  char PIC16FrameOverlay::ID = 0;
+  // Public entry point to create the frame-overlay module pass.
+  ModulePass *createPIC16OverlayPass() { return new PIC16FrameOverlay(); }
+}
+
+// This pass only rewrites function section attribute strings, so every
+// analysis is preserved; the CallGraph is needed to compute call depths.
+void PIC16FrameOverlay::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequired<CallGraph>();
+}
+
+/// DFSTraverse - Depth-first walk of the call graph starting at CGN,
+/// coloring each function with the maximum depth at which it is called.
+/// Functions with the same color can have their frames overlaid; functions
+/// reachable through indirect calls get colors at or above
+/// StartIndirectCallColor so they are never overlaid with anything.
+void PIC16FrameOverlay::DFSTraverse(CallGraphNode *CGN, unsigned Depth) {
+  // Do not set any color for external calling node.
+  if (Depth != 0 && CGN->getFunction()) {
+    unsigned Color = getColor(CGN->getFunction());
+
+    // Handle indirectly called functions
+    if (Color >= PIC16Overlay::StartIndirectCallColor || 
+        Depth >= PIC16Overlay::StartIndirectCallColor) {
+      // All functions called from an indirectly called function are given
+      // an unique color.
+      if (Color < PIC16Overlay::StartIndirectCallColor &&
+          Depth >= PIC16Overlay::StartIndirectCallColor)
+        setColor(CGN->getFunction(), Depth);
+
+      // ++IndirectCallColor per child: every callee in this subtree gets a
+      // fresh, distinct color.
+      for (unsigned int i = 0; i < CGN->size(); i++)
+        DFSTraverse((*CGN)[i], ++IndirectCallColor);
+      return;
+    }
+    // Just return if the node already has a color greater than the current 
+    // depth. A node must be colored with the maximum depth that it has.
+    if (Color >= Depth)
+      return;
+    
+    Depth = ModifyDepthForInterrupt(CGN, Depth);  
+    setColor(CGN->getFunction(), Depth);
+  }
+  
+  // Color all children of this node with color depth+1.
+  for (unsigned int i = 0; i < CGN->size(); i++)
+    DFSTraverse((*CGN)[i], Depth+1);
+}
+
+/// ModifyDepthForInterrupt - If CGN's function is marked as an interrupt
+/// handler (its section string contains "interrupt"), bias its depth by
+/// InterruptDepth so the interrupt call tree's colors never collide with
+/// the main line's colors. Otherwise return Depth unchanged.
+unsigned PIC16FrameOverlay::ModifyDepthForInterrupt(CallGraphNode *CGN,
+                                                    unsigned Depth) {
+  Function *Fn = CGN->getFunction();
+
+  // Return original Depth if function or section for function do not exist.
+  if (!Fn || !Fn->hasSection())
+    return Depth;
+
+  // Return original Depth if this function is not marked as interrupt.
+  if (Fn->getSection().find("interrupt") == string::npos)
+    return Depth;
+
+  Depth = Depth + InterruptDepth;
+  return Depth;
+}
+
+/// setColor - Record Color for Fn by writing "Overlay=<Color>" into the
+/// function's section string, replacing any previously recorded color.
+void PIC16FrameOverlay::setColor(Function *Fn, unsigned Color) {
+  std::string Section = "";
+  if (Fn->hasSection())
+    Section = Fn->getSection();
+
+  size_t Pos = Section.find(OverlayStr);
+
+  // Convert Color to string.
+  std::stringstream ss;
+  ss << Color;
+  std::string ColorString = ss.str();
+
+  // If color is already set then reset it with the new value. Else append 
+  // the Color string to section.
+  if (Pos != std::string::npos) {
+    Pos += OverlayStr.length();
+    char c = Section.at(Pos);
+    unsigned OldColorLength = 0;  
+    // Count the digits of the old color; Pos is advanced past them unless
+    // the color extends to the end of the string.
+    while (c >= '0' && c<= '9') {
+      OldColorLength++;    
+      if (Pos < Section.length() - 1)
+        Pos++;
+      else
+        break;
+      c = Section.at(Pos);
+    }
+    // Replace old color with new one.
+    // NOTE(review): the start index Pos-OldColorLength+1 is only correct
+    // when the loop ended at the last character of the string; if the old
+    // color is followed by more text, Pos is one past the digits and this
+    // looks off by one — verify with a mid-string "Overlay=N" case.
+    Section.replace(Pos-OldColorLength +1, OldColorLength, ColorString); 
+  }
+  else {
+    // Append Color information to section string.
+    if (Fn->hasSection())
+      Section.append(" ");
+    Section.append(OverlayStr + ColorString);
+  }
+  Fn->setSection(Section);
+}
+
+/// getColor - Parse the color previously recorded by setColor out of Fn's
+/// section string ("Overlay=<digits>"). Returns 0 when no color is set.
+unsigned PIC16FrameOverlay::getColor(Function *Fn) {
+  int Color = 0;
+  if (!Fn->hasSection())
+    return 0;
+
+  std::string Section = Fn->getSection();
+  size_t Pos = Section.find(OverlayStr);
+  
+  // Return 0 if Color is not set.
+  if (Pos == std::string::npos)
+    return 0;
+
+  // Set Pos to after "Overlay=".
+  Pos += OverlayStr.length();
+  char c = Section.at(Pos);
+  std::string ColorString = "";
+
+  // Find the string representing Color. A Color can only consist of digits.
+  while (c >= '0' && c<= '9') { 
+    ColorString.append(1,c);
+    if (Pos < Section.length() - 1)
+      Pos++;
+    else
+      break;
+    c = Section.at(Pos);
+  }
+  Color = atoi(ColorString.c_str());
+  
+  return Color;    
+}
+
+/// runOnModule - Color every function's frame for overlay. Always returns
+/// false in the pass-manager sense even though section strings are
+/// rewritten (the IR structure itself is unchanged).
+bool PIC16FrameOverlay::runOnModule(Module &M) {
+  CallGraph &CG = getAnalysis<CallGraph>();
+  CallGraphNode *ECN = CG.getExternalCallingNode();
+
+  MarkIndirectlyCalledFunctions(M); 
+  // Since External Calling Node is the base function, do a depth first 
+  // traversal of CallGraph with ECN as root. Each node with be marked with 
+  // a color that is max(color(callers)) + 1.
+  if(ECN) {
+    DFSTraverse(ECN, 0);
+  }
+  return false;
+}
+
+/// MarkIndirectlyCalledFunctions - Give a unique color (above
+/// StartIndirectCallColor) to any function whose address escapes, i.e.
+/// that has a use which is not the callee operand of a call/invoke.
+// NOTE(review): CallSite is used here but llvm/Support/CallSite.h is not
+// included directly — presumably pulled in transitively; verify.
+void PIC16FrameOverlay::MarkIndirectlyCalledFunctions(Module &M) {
+  // If the use of a function is not a call instruction then this
+  // function might be called indirectly. In that case give it
+  // an unique color.
+  for (Module::iterator MI = M.begin(), E = M.end(); MI != E; ++MI) {
+    for (Value::use_iterator I = MI->use_begin(), E = MI->use_end(); I != E;
+         ++I) {
+      if ((!isa<CallInst>(I) && !isa<InvokeInst>(I))
+          || !CallSite(cast<Instruction>(I)).isCallee(I)) {
+        setColor(MI, ++IndirectCallColor);
+        break;
+      }
+    }
+  }
+}
diff --git a/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h
new file mode 100644
index 0000000..d70c4e7
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Passes/PIC16Overlay.h
@@ -0,0 +1,55 @@
+//===-- PIC16FrameOverlay.h - Interface for PIC16 Frame Overlay -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 Overlay infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16FRAMEOVERLAY_H
+#define PIC16FRAMEOVERLAY_H
+ 
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Pass.h"
+#include "llvm/CallGraphSCCPass.h"
+
+// NOTE(review): using-directives in a header leak into every includer;
+// consider moving these into the .cpp file.
+using std::string;
+using namespace llvm;
+
+namespace  llvm {
+  namespace PIC16Overlay {
+    // Base colors: interrupt-tree functions are biased by
+    // StartInterruptColor, and indirectly-called functions get unique
+    // colors starting at StartIndirectCallColor (see PIC16Overlay.cpp).
+    enum OverlayConsts {
+      StartInterruptColor = 200,
+      StartIndirectCallColor = 300
+    }; 
+  }
+  /// PIC16FrameOverlay - Module pass that colors each function's frame by
+  /// its call-graph depth (recorded as "Overlay=<color>" in the function's
+  /// section string) so frames with equal colors can be overlaid in RAM.
+  class PIC16FrameOverlay : public ModulePass {
+    std::string OverlayStr;       // Marker prefix: "Overlay=".
+    unsigned InterruptDepth;      // Depth bias for interrupt handlers.
+    unsigned IndirectCallColor;   // Next unique color for indirect callees.
+  public:
+    static char ID; // Class identification 
+    PIC16FrameOverlay() : ModulePass(&ID) {
+      OverlayStr = "Overlay=";
+      InterruptDepth = PIC16Overlay::StartInterruptColor;
+      IndirectCallColor = PIC16Overlay::StartIndirectCallColor;
+    }
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const; 
+    virtual bool runOnModule(Module &M);
+
+  private: 
+    unsigned getColor(Function *Fn);
+    void setColor(Function *Fn, unsigned Color);
+    unsigned ModifyDepthForInterrupt(CallGraphNode *CGN, unsigned Depth);
+    void MarkIndirectlyCalledFunctions(Module &M);
+    void DFSTraverse(CallGraphNode *CGN, unsigned Depth);
+  };
+}  // End of  namespace
+
+#endif
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.cpp b/lib/Target/PIC16/PIC16RegisterInfo.cpp
new file mode 100644
index 0000000..8ba9a1d
--- /dev/null
+++ b/lib/Target/PIC16/PIC16RegisterInfo.cpp
@@ -0,0 +1,94 @@
+//===- PIC16RegisterInfo.cpp - PIC16 Register Information -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pic16-reg-info"
+
+#include "PIC16.h"
+#include "PIC16RegisterInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+// PIC16RegisterInfo - Hand the call-frame setup/destroy pseudo opcodes to
+// the tablegen'd base class and keep references to the instruction info
+// and subtarget for later use.
+PIC16RegisterInfo::PIC16RegisterInfo(const TargetInstrInfo &tii,
+                                     const PIC16Subtarget &st)
+  : PIC16GenRegisterInfo(PIC16::ADJCALLSTACKDOWN, PIC16::ADJCALLSTACKUP),
+    TII(tii),
+    ST(st) {}
+
+#include "PIC16GenRegisterInfo.inc"
+
+/// PIC16 Callee Saved Registers - PIC16 has none; the returned list
+/// contains only the 0 terminator.
+const unsigned* PIC16RegisterInfo::
+getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const unsigned CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+// PIC16 Callee Saved Reg Classes - likewise empty (null-terminated).
+const TargetRegisterClass* const*
+PIC16RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+  return CalleeSavedRegClasses;
+}
+
+// No registers are reserved from the register allocator.
+BitVector PIC16RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  return Reserved;
+}
+
+// PIC16 never maintains a frame pointer.
+bool PIC16RegisterInfo::hasFP(const MachineFunction &MF) const {
+  return false;
+}
+
+// Frame indices are lowered elsewhere in this backend; this hook is a stub.
+unsigned PIC16RegisterInfo::
+eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                    int *Value, RegScavenger *RS) const
+{
+  /* NOT YET IMPLEMENTED */
+  return 0;
+}
+
+// Prologue/epilogue emission is not needed: stubs.
+void PIC16RegisterInfo::emitPrologue(MachineFunction &MF) const
+{    /* NOT YET IMPLEMENTED */  }
+
+void PIC16RegisterInfo::
+emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const
+{    /* NOT YET IMPLEMENTED */  }
+
+// No DWARF debug info support yet.
+int PIC16RegisterInfo::
+getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  llvm_unreachable("Not keeping track of debug information yet!!");
+  return -1;
+}
+
+unsigned PIC16RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  llvm_unreachable("PIC16 Does not have any frame register");
+  return 0;
+}
+
+unsigned PIC16RegisterInfo::getRARegister() const {
+  llvm_unreachable("PIC16 Does not have any return address register");
+  return 0;
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void PIC16RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  // Simply discard ADJCALLSTACKDOWN,
+  // ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.h b/lib/Target/PIC16/PIC16RegisterInfo.h
new file mode 100644
index 0000000..1d5dbbf
--- /dev/null
+++ b/lib/Target/PIC16/PIC16RegisterInfo.h
@@ -0,0 +1,69 @@
+//===- PIC16RegisterInfo.h - PIC16 Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PIC16 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16REGISTERINFO_H
+#define PIC16REGISTERINFO_H
+
+#include "PIC16GenRegisterInfo.h.inc"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+namespace llvm {
+
+// Forward Declarations.
+  class PIC16Subtarget;
+  class TargetInstrInfo;
+
+/// PIC16RegisterInfo - PIC16 implementation of TargetRegisterInfo. Most
+/// frame-related hooks are stubs (see PIC16RegisterInfo.cpp): PIC16 has no
+/// callee-saved registers, no frame pointer and no return-address register.
+class PIC16RegisterInfo : public PIC16GenRegisterInfo {
+  private:
+    const TargetInstrInfo &TII;
+    const PIC16Subtarget &ST;
+  
+  public:
+    PIC16RegisterInfo(const TargetInstrInfo &tii, 
+                      const PIC16Subtarget &st);
+
+
+  //------------------------------------------------------
+  // Pure virtual functions from TargetRegisterInfo
+  //------------------------------------------------------
+
+  // PIC16 callee saved registers (empty, null-terminated list).
+  virtual const unsigned* 
+  getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  // PIC16 callee saved register classes (empty, null-terminated list).
+  virtual const TargetRegisterClass* const *
+  getCalleeSavedRegClasses(const MachineFunction *MF) const;
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+  virtual bool hasFP(const MachineFunction &MF) const;
+
+  virtual unsigned eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                                       int SPAdj, int *Value = NULL,
+                                       RegScavenger *RS=NULL) const;
+
+  // Discards ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudos.
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  virtual void emitPrologue(MachineFunction &MF) const;
+  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+  virtual unsigned getFrameRegister(const MachineFunction &MF) const;
+  virtual unsigned getRARegister() const;
+
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16RegisterInfo.td b/lib/Target/PIC16/PIC16RegisterInfo.td
new file mode 100644
index 0000000..2959d91
--- /dev/null
+++ b/lib/Target/PIC16/PIC16RegisterInfo.td
@@ -0,0 +1,33 @@
+//===- PIC16RegisterInfo.td - PIC16 Register defs ------------*- tblgen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the PIC16 register file
+//===----------------------------------------------------------------------===//
+
+// Base class for PIC16 registers; places them in the PIC16 namespace.
+class PIC16Reg<string n> : Register<n> {
+  let Namespace = "PIC16";
+}
+
+// PIC16 Registers.
+def W   : PIC16Reg<"W">;            // Working (accumulator) register.
+def FSR0   : PIC16Reg<"FSR0">;      // File select registers for
+def FSR1   : PIC16Reg<"FSR1">;      // indirect addressing.
+def BS     : PIC16Reg<"BS">;        // Bank select.
+def PCLATH : PIC16Reg<"PCLATH">;    // Program page select.
+
+def STATUS : PIC16Reg<"STATUS">;
+
+// PIC16 Register classes.
+def GPR     : RegisterClass<"PIC16", [i8],  8, [W]>;
+def FSR16   : RegisterClass<"PIC16", [i16], 8, [FSR0, FSR1]>;
+def BSR     : RegisterClass<"PIC16", [i8],  8, [BS]>;
+def PCLATHR : RegisterClass<"PIC16", [i8],  8, [PCLATH]>;
+def STATUSR : RegisterClass<"PIC16", [i8],  8, [STATUS]>;
+
diff --git a/lib/Target/PIC16/PIC16Section.cpp b/lib/Target/PIC16/PIC16Section.cpp
new file mode 100644
index 0000000..a96ebb8
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Section.cpp
@@ -0,0 +1,96 @@
+//===-- PIC16Section.cpp - PIC16 Section ----------- --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16ABINames.h"
+#include "PIC16Section.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+
+// This is the only way to create a PIC16Section. Sections created here
+// do not need to be explicitly deleted as they are allocated within, and
+// owned by, the MCContext.
+PIC16Section *PIC16Section::Create(const StringRef &Name,
+                                   PIC16SectionType Ty,
+                                   const std::string &Address, 
+                                   int Color, MCContext &Ctx) {
+
+  /// Determine the internal SectionKind info.
+  /// Users of PIC16Section class should not need to know the internal
+  /// SectionKind. They should work only with PIC16SectionType.
+  ///
+  /// PIC16 Terminology for section kinds is as below.
+  /// UDATA - BSS
+  /// IDATA - initialized data (equiv to Metadata) 
+  /// ROMDATA - ReadOnly.
+  /// UDATA_OVR - Sections that can be overlaid. Section of such type is
+  ///             used to contain function autos an frame. We can think of
+  ///             it as equiv to llvm ThreadBSS)
+  /// UDATA_SHR - Shared RAM. Memory area that is mapped to all banks.
+
+  SectionKind K;
+  switch (Ty) {
+    default: llvm_unreachable ("can not create unknown section type");
+    case UDATA_OVR: {
+      K = SectionKind::getThreadBSS();
+      break;
+    }
+    case UDATA_SHR:
+    case UDATA: {
+      K = SectionKind::getBSS();
+      break;
+    }
+    case ROMDATA:
+    case IDATA: {
+      K = SectionKind::getMetadata();
+      break;
+    }
+    case CODE: {
+      K = SectionKind::getText();
+      break;
+    }
+      
+  }
+
+  // Create the Section. The PIC16-specific type T is set after
+  // construction; Size is filled in later via setSize().
+  PIC16Section *S = new (Ctx) PIC16Section(Name, K, Address, Color);
+  S->T = Ty;
+  return S;
+}
+
+// A generic way to print all types of sections. Emits one line of the
+// form "<name>\t<TYPE>\t<address>" where TYPE is the PIC16SectionType
+// mnemonic (UDATA, IDATA, ROMDATA, UDATA_SHR, UDATA_OVR or CODE).
+void PIC16Section::PrintSwitchToSection(const MCAsmInfo &MAI,
+                                          raw_ostream &OS) const {
+ 
+  // If the section is overlaid(i.e. it has a color), print overlay name for 
+  // it. Otherwise print its normal name.
+  if (Color != -1)
+    OS << PAN::getOverlayName(getName(), Color) << '\t';
+  else
+    OS << getName() << '\t';
+
+  // Print type.
+  switch (getType()) {
+  default : llvm_unreachable ("unknown section type"); 
+  case UDATA: OS << "UDATA"; break;
+  case IDATA: OS << "IDATA"; break;
+  case ROMDATA: OS << "ROMDATA"; break;
+  case UDATA_SHR: OS << "UDATA_SHR"; break;
+  case UDATA_OVR: OS << "UDATA_OVR"; break;
+  case CODE: OS << "CODE"; break;
+  }
+
+  OS << '\t';
+
+  // Print Address.
+  OS << Address;
+
+  OS << '\n';
+}
diff --git a/lib/Target/PIC16/PIC16Section.h b/lib/Target/PIC16/PIC16Section.h
new file mode 100644
index 0000000..3a8bbfb
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Section.h
@@ -0,0 +1,92 @@
+//===- PIC16Section.h - PIC16-specific section representation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PIC16Section class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_PIC16SECTION_H
+#define LLVM_PIC16SECTION_H
+
+#include "llvm/MC/MCSection.h"
+#include "llvm/GlobalVariable.h"
+#include <vector>
+
+namespace llvm {
+  /// PIC16Section - Represents a physical section in PIC16 COFF.
+  /// Contains data objects.
+  ///
+  class PIC16Section : public MCSection {
+    /// PIC16 Sections does not really use the SectionKind class to
+    /// to distinguish between various types of sections. PIC16 maintain
+    /// its own Section Type info. See the PIC16SectionType enum in PIC16.h 
+    /// for various section types.
+    PIC16SectionType T;
+
+    /// Name of the section to uniquely identify it.
+    std::string Name;
+
+    /// User can specify an address at which a section should be placed. 
+    /// Negative value here means user hasn't specified any. 
+    std::string Address; 
+
+    /// Overlay information - Sections with same color can be overlaid on
+    /// one another.
+    int Color; 
+
+    /// Total size of all data objects contained here.
+    unsigned Size;
+    
+    // Size starts at 0 (it grows via setSize() as objects are added);
+    // leaving it uninitialized made getSize() return garbage for a fresh
+    // section.
+    PIC16Section(const StringRef &name, SectionKind K, const std::string &addr, 
+                 int color)
+      : MCSection(K), Name(name), Address(addr), Color(color), Size(0) {
+    }
+    
+  public:
+    /// Return the name of the section.
+    const std::string &getName() const { return Name; }
+
+    /// Return the Address of the section.
+    const std::string &getAddress() const { return Address; }
+
+    /// Return the Color of the section.
+    int getColor() const { return Color; }
+    void setColor(int color) { Color = color; }
+
+    /// Return the size of the section.
+    unsigned getSize() const { return Size; }
+    void setSize(unsigned size) { Size = size; }
+
+    /// Contained data objects.
+    std::vector<const GlobalVariable *>Items;
+
+    /// Check section type. 
+    bool isUDATA_Type() const { return T == UDATA; }
+    bool isIDATA_Type() const { return T == IDATA; }
+    bool isROMDATA_Type() const { return T == ROMDATA; }
+    bool isUDATA_OVR_Type() const { return T == UDATA_OVR; }
+    bool isUDATA_SHR_Type() const { return T == UDATA_SHR; }
+    bool isCODE_Type() const { return T == CODE; }
+
+    PIC16SectionType getType() const { return T; }
+
+    /// This would be the only way to create a section. 
+    static PIC16Section *Create(const StringRef &Name, PIC16SectionType Ty, 
+                                const std::string &Address, int Color, 
+                                MCContext &Ctx);
+    
+    /// Override this as PIC16 has its own way of printing switching
+    /// to a section.
+    virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
+                                      raw_ostream &OS) const;
+  };
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16Subtarget.cpp b/lib/Target/PIC16/PIC16Subtarget.cpp
new file mode 100644
index 0000000..33fc3fb
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Subtarget.cpp
@@ -0,0 +1,27 @@
+//===- PIC16Subtarget.cpp - PIC16 Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PIC16 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16Subtarget.h"
+#include "PIC16GenSubtarget.inc"
+
+using namespace llvm;
+
+PIC16Subtarget::PIC16Subtarget(const std::string &TT, const std::string &FS, 
+                               bool Cooper)
+  :IsCooper(Cooper)
+{
+  std::string CPU = "generic";
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/PIC16/PIC16Subtarget.h b/lib/Target/PIC16/PIC16Subtarget.h
new file mode 100644
index 0000000..81e3783
--- /dev/null
+++ b/lib/Target/PIC16/PIC16Subtarget.h
@@ -0,0 +1,44 @@
+//=====-- PIC16Subtarget.h - Define Subtarget for the PIC16 ---*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PIC16 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PIC16SUBTARGET_H
+#define PIC16SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+
+class PIC16Subtarget : public TargetSubtarget {
+
+  // IsCooper - Target ISA is Cooper.
+  bool IsCooper;
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  PIC16Subtarget(const std::string &TT, const std::string &FS, bool Cooper);
+  
+  /// isCooper - Returns true if the target ISA is Cooper.
+  bool isCooper() const { return IsCooper; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+};
+} // End llvm namespace
+
+#endif  // PIC16SUBTARGET_H
diff --git a/lib/Target/PIC16/PIC16TargetMachine.cpp b/lib/Target/PIC16/PIC16TargetMachine.cpp
new file mode 100644
index 0000000..e2acb85
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetMachine.cpp
@@ -0,0 +1,55 @@
+//===-- PIC16TargetMachine.cpp - Define TargetMachine for PIC16 -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PIC16 target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "PIC16MCAsmInfo.h"
+#include "PIC16TargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetRegistry.h"
+
+using namespace llvm;
+
+extern "C" void LLVMInitializePIC16Target() {
+  // Register the target. Currently the codegen works for
+  // enhanced pic16 mid-range.
+  RegisterTargetMachine<PIC16TargetMachine> X(ThePIC16Target);
+  RegisterAsmInfo<PIC16MCAsmInfo> A(ThePIC16Target);
+}
+
+
+// PIC16TargetMachine - Enhanced PIC16 mid-range Machine. May also represent
+// a Traditional Machine if 'Trad' is true.
+PIC16TargetMachine::PIC16TargetMachine(const Target &T, const std::string &TT,
+                                       const std::string &FS, bool Trad)
+: LLVMTargetMachine(T, TT),
+  Subtarget(TT, FS, Trad),
+  DataLayout("e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8-n8"), 
+  InstrInfo(*this), TLInfo(*this),
+  FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0) { }
+
+
+bool PIC16TargetMachine::addInstSelector(PassManagerBase &PM,
+                                         CodeGenOpt::Level OptLevel) {
+  // Install an instruction selector.
+  PM.add(createPIC16ISelDag(*this));
+  return false;
+}
+
+bool PIC16TargetMachine::addPreEmitPass(PassManagerBase &PM, 
+                                         CodeGenOpt::Level OptLevel) {
+  PM.add(createPIC16MemSelOptimizerPass());
+  return true;  // -print-machineinstr should print after this.
+}
+
+
diff --git a/lib/Target/PIC16/PIC16TargetMachine.h b/lib/Target/PIC16/PIC16TargetMachine.h
new file mode 100644
index 0000000..b11fdd5
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetMachine.h
@@ -0,0 +1,64 @@
+//===-- PIC16TargetMachine.h - Define TargetMachine for PIC16 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PIC16 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef PIC16_TARGETMACHINE_H
+#define PIC16_TARGETMACHINE_H
+
+#include "PIC16InstrInfo.h"
+#include "PIC16ISelLowering.h"
+#include "PIC16RegisterInfo.h"
+#include "PIC16Subtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// PIC16TargetMachine
+///
+class PIC16TargetMachine : public LLVMTargetMachine {
+  PIC16Subtarget        Subtarget;
+  const TargetData      DataLayout;       // Calculates type size & alignment
+  PIC16InstrInfo        InstrInfo;
+  PIC16TargetLowering   TLInfo;
+
+  // PIC16 does not have any call stack frame, therefore not having 
+  // any PIC16 specific FrameInfo class.
+  TargetFrameInfo       FrameInfo;
+
+public:
+  PIC16TargetMachine(const Target &T, const std::string &TT,
+                     const std::string &FS, bool Cooper = false);
+
+  virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+  virtual const PIC16InstrInfo *getInstrInfo() const  { return &InstrInfo; }
+  virtual const TargetData *getTargetData() const     { return &DataLayout;}
+  virtual const PIC16Subtarget *getSubtargetImpl() const { return &Subtarget; }
+ 
+  virtual const PIC16RegisterInfo *getRegisterInfo() const { 
+    return &(InstrInfo.getRegisterInfo()); 
+  }
+
+  virtual PIC16TargetLowering *getTargetLowering() const { 
+    return const_cast<PIC16TargetLowering*>(&TLInfo); 
+  }
+
+  virtual bool addInstSelector(PassManagerBase &PM,
+                               CodeGenOpt::Level OptLevel);
+  virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+}; // PIC16TargetMachine.
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/PIC16TargetObjectFile.cpp b/lib/Target/PIC16/PIC16TargetObjectFile.cpp
new file mode 100644
index 0000000..d7cfe02
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetObjectFile.cpp
@@ -0,0 +1,381 @@
+//===-- PIC16TargetObjectFile.cpp - PIC16 object files --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16TargetObjectFile.h"
+#include "PIC16ISelLowering.h"
+#include "PIC16TargetMachine.h"
+#include "PIC16Section.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+
+PIC16TargetObjectFile::PIC16TargetObjectFile() {
+}
+
+PIC16TargetObjectFile::~PIC16TargetObjectFile() {
+}
+
+/// Find a pic16 section. Return null if not found. Do not create one.
+PIC16Section *PIC16TargetObjectFile::
+findPIC16Section(const std::string &Name) {
+  /// Use find() rather than operator[]: StringMap::operator[] inserts a
+  /// default (null) entry for every name that is merely queried.
+  StringMap<PIC16Section*>::iterator I = SectionsByName.find(Name);
+  if (I != SectionsByName.end())
+    return I->second;
+  return NULL;
+}
+
+
+/// Find a pic16 section. If not found, create one.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16Section(const std::string &Name, PIC16SectionType Ty, 
+                const std::string &Address, int Color) const {
+
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
+
+
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
+  return Entry;
+}
+
+/// Find a standard pic16 data section. If not found, create one and keep
+/// track of it by adding it to appropriate std section list.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16DataSection(const std::string &Name, PIC16SectionType Ty, 
+                    const std::string &Address, int Color) const {
+
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
+
+
+  /// Else create a new one and add it to appropriate section list.
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
+
+  switch (Ty) {
+  default: llvm_unreachable ("unknown standard section type.");
+  case UDATA: UDATASections_.push_back(Entry); break;
+  case IDATA: IDATASections_.push_back(Entry); break;
+  case ROMDATA: ROMDATASection_ = Entry; break;
+  case UDATA_SHR: SHAREDUDATASection_ = Entry; break;
+  }
+
+  return Entry;
+}
+    
+
+/// Find a standard pic16 autos section. If not found, create one and keep
+/// track of it by adding it to appropriate std section list.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16AutoSection(const std::string &Name, PIC16SectionType Ty, 
+                    const std::string &Address, int Color) const {
+
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
+
+
+  /// Else create a new one and add it to appropriate section list.
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
+
+  assert (Ty == UDATA_OVR && "incorrect section type for autos");
+  AUTOSections_.push_back(Entry);
+
+  return Entry;
+}
+    
+/// Find a pic16 user section. If not found, create one and keep
+/// track of it by adding it to appropriate std section list.
+PIC16Section *PIC16TargetObjectFile::
+getPIC16UserSection(const std::string &Name, PIC16SectionType Ty, 
+                    const std::string &Address, int Color) const {
+
+  /// Return if we have an already existing one.
+  PIC16Section *&Entry = SectionsByName[Name];
+  if (Entry)
+    return Entry;
+
+
+  /// Else create a new one and add it to appropriate section list.
+  Entry = PIC16Section::Create(Name, Ty, Address, Color, getContext());
+
+  USERSections_.push_back(Entry);
+
+  return Entry;
+}
+
+/// Do some standard initialization.
+void PIC16TargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &tm){
+  TargetLoweringObjectFile::Initialize(Ctx, tm);
+  TM = &tm;
+  
+  ROMDATASection_ = NULL;
+  SHAREDUDATASection_ = NULL;
+}
+
+/// allocateUDATA - Allocate an un-initialized global to an existing or new
+/// UDATA section and return that section.
+const MCSection *
+PIC16TargetObjectFile::allocateUDATA(const GlobalVariable *GV) const {
+  assert(GV->hasInitializer() && "This global doesn't need space");
+  Constant *C = GV->getInitializer();
+  assert(C->isNullValue() && "Uninitialized globals have non-zero initializer");
+
+  // Find how much space this global needs.
+  const TargetData *TD = TM->getTargetData();
+  const Type *Ty = C->getType(); 
+  unsigned ValSize = TD->getTypeAllocSize(Ty);
+ 
+  // Go through all UDATA Sections and assign this variable
+  // to the first available section having enough space.
+  PIC16Section *Found = NULL;
+  for (unsigned i = 0; i < UDATASections_.size(); i++) {
+    if (DataBankSize - UDATASections_[i]->getSize() >= ValSize) {
+      Found = UDATASections_[i];
+      break;
+    }
+  }
+
+  // No UDATA section spacious enough was found. Create a new one.
+  if (!Found) {
+    std::string name = PAN::getUdataSectionName(UDATASections_.size());
+    Found = getPIC16DataSection(name.c_str(), UDATA);
+  }
+  
+  // Insert the GV into this UDATA section.
+  Found->Items.push_back(GV);
+  Found->setSize(Found->getSize() + ValSize);
+  return Found;
+} 
+
+/// allocateIDATA - allocate an initialized global into an existing
+/// or new section and return that section.
+const MCSection *
+PIC16TargetObjectFile::allocateIDATA(const GlobalVariable *GV) const{
+  assert(GV->hasInitializer() && "This global doesn't need space");
+  Constant *C = GV->getInitializer();
+  assert(!C->isNullValue() && "initialized globals have zero initializer");
+  assert(GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE &&
+         "can allocate initialized RAM data only");
+
+  // Find how much space this global needs.
+  const TargetData *TD = TM->getTargetData();
+  const Type *Ty = C->getType(); 
+  unsigned ValSize = TD->getTypeAllocSize(Ty);
+ 
+  // Go through all IDATA Sections and assign this variable
+  // to the first available section having enough space.
+  PIC16Section *Found = NULL;
+  for (unsigned i = 0; i < IDATASections_.size(); i++) {
+    if (DataBankSize - IDATASections_[i]->getSize() >= ValSize) {
+      Found = IDATASections_[i]; 
+      break;
+    }
+  }
+
+  // No IDATA section spacious enough was found. Create a new one.
+  if (!Found) {
+    std::string name = PAN::getIdataSectionName(IDATASections_.size());
+    Found = getPIC16DataSection(name.c_str(), IDATA);
+  }
+  
+  // Insert the GV into this IDATA.
+  Found->Items.push_back(GV);
+  Found->setSize(Found->getSize() + ValSize);
+  return Found;
+} 
+
+// Allocate a program memory variable into ROMDATA section.
+const MCSection *
+PIC16TargetObjectFile::allocateROMDATA(const GlobalVariable *GV) const {
+
+  std::string name = PAN::getRomdataSectionName();
+  PIC16Section *S = getPIC16DataSection(name.c_str(), ROMDATA);
+
+  S->Items.push_back(GV);
+  return S;
+}
+
+// Get the section for an automatic variable of a function.
+// For PIC16 they are globals only with mangled names.
+const MCSection *
+PIC16TargetObjectFile::allocateAUTO(const GlobalVariable *GV) const {
+
+  const std::string name = PAN::getSectionNameForSym(GV->getName());
+  PIC16Section *S = getPIC16AutoSection(name.c_str());
+
+  S->Items.push_back(GV);
+  return S;
+}
+
+
+// Override default implementation to put the true globals into
+// multiple data sections if required.
+const MCSection *
+PIC16TargetObjectFile::SelectSectionForGlobal(const GlobalValue *GV1,
+                                              SectionKind Kind,
+                                              Mangler *Mang,
+                                              const TargetMachine &TM) const {
+  // We select the section based on the initializer here, so it really
+  // has to be a GlobalVariable.
+  const GlobalVariable *GV = dyn_cast<GlobalVariable>(GV1); 
+  if (!GV)
+    return TargetLoweringObjectFile::SelectSectionForGlobal(GV1, Kind, Mang,TM);
+
+  assert(GV->hasInitializer() && "A def without initializer?");
+
+  // First, if this is an automatic variable for a function, get the section
+  // name for it and return.
+  std::string name = GV->getName();
+  if (PAN::isLocalName(name))
+    return allocateAUTO(GV);
+
+  // See if this is an uninitialized global.
+  const Constant *C = GV->getInitializer();
+  if (C->isNullValue()) 
+    return allocateUDATA(GV);
+
+  // If this is initialized data in RAM. Put it in the correct IDATA section.
+  if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE) 
+    return allocateIDATA(GV);
+
+  // This is initialized data in rom, put it in the readonly section.
+  if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) 
+    return allocateROMDATA(GV);
+
+  // Else let the default implementation take care of it.
+  return TargetLoweringObjectFile::SelectSectionForGlobal(GV, Kind, Mang,TM);
+}
+
+
+
+
+/// getExplicitSectionGlobal - Allow the target to completely override
+/// section assignment of a global.
+const MCSection *PIC16TargetObjectFile::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, 
+                         Mangler *Mang, const TargetMachine &TM) const {
+  assert(GV->hasSection());
+  // GV may be a Function; dyn_cast (not cast, which asserts) lets non-variables
+  if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) {
+    std::string SectName = GVar->getSection();
+    // If address for a variable is specified, get the address and create
+    // section.
+    // FIXME: move this attribute checking in PAN.
+    std::string AddrStr = "Address=";
+    if (SectName.compare(0, AddrStr.length(), AddrStr) == 0) {
+      std::string SectAddr = SectName.substr(AddrStr.length());
+      if (SectAddr.compare("NEAR") == 0)
+        return allocateSHARED(GVar, Mang);
+      else
+        return allocateAtGivenAddress(GVar, SectAddr);
+    }
+     
+    // Create the section specified with section attribute. 
+    return allocateInGivenSection(GVar);
+  }
+
+  // Fall through here for non-GlobalVariable globals (e.g. Functions).
+  return getPIC16DataSection(GV->getSection().c_str(), UDATA);
+}
+
+const MCSection *
+PIC16TargetObjectFile::allocateSHARED(const GlobalVariable *GV,
+                                      Mangler *Mang) const {
+  // Make sure that this is an uninitialized global.
+  assert(GV->hasInitializer() && "This global doesn't need space");
+  if (!GV->getInitializer()->isNullValue()) {
+    // FIXME: Generate a warning in this case that near qualifier will be 
+    // ignored.
+    return SelectSectionForGlobal(GV, SectionKind::getDataRel(), Mang, *TM); 
+  } 
+  std::string Name = PAN::getSharedUDataSectionName(); 
+
+  PIC16Section *SharedUDataSect = getPIC16DataSection(Name.c_str(), UDATA_SHR); 
+  // Insert the GV into shared section.
+  SharedUDataSect->Items.push_back(GV);
+  return SharedUDataSect;
+}
+
+
+// Interface used by AsmPrinter to get a code section for a function.
+const PIC16Section *
+PIC16TargetObjectFile::SectionForCode(const std::string &FnName) const {
+  const std::string &sec_name = PAN::getCodeSectionName(FnName);
+  return getPIC16Section(sec_name, CODE);
+}
+
+// Interface used by AsmPrinter to get a frame section for a function.
+const PIC16Section *
+PIC16TargetObjectFile::SectionForFrame(const std::string &FnName) const {
+  const std::string &sec_name = PAN::getFrameSectionName(FnName);
+  return getPIC16Section(sec_name, UDATA_OVR);
+}
+
+// Allocate a global var in existing or new section of given name.
+const MCSection *
+PIC16TargetObjectFile::allocateInGivenSection(const GlobalVariable *GV) const {
+  // Determine the type of section that we need to create.
+  PIC16SectionType SecTy;
+
+  // See if this is an uninitialized global.
+  const Constant *C = GV->getInitializer();
+  if (C->isNullValue())
+    SecTy = UDATA;
+  // If this is initialized data in RAM. Put it in the correct IDATA section.
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
+    SecTy = IDATA;
+  // This is initialized data in rom, put it in the readonly section.
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) 
+    SecTy = ROMDATA;
+  else
+    llvm_unreachable ("Could not determine section type for global");
+
+  PIC16Section *S = getPIC16UserSection(GV->getSection().c_str(), SecTy);
+  S->Items.push_back(GV);
+  return S;
+}
+
+// Allocate a global var in a new absolute section at a given address.
+const MCSection *
+PIC16TargetObjectFile::allocateAtGivenAddress(const GlobalVariable *GV,
+                                               const std::string &Addr) const {
+  // Determine the type of section that we need to create.
+  PIC16SectionType SecTy;
+
+  // See if this is an uninitialized global.
+  const Constant *C = GV->getInitializer();
+  if (C->isNullValue())
+    SecTy = UDATA;
+  // If this is initialized data in RAM. Put it in the correct IDATA section.
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::RAM_SPACE)
+    SecTy = IDATA;
+  // This is initialized data in rom, put it in the readonly section.
+  else if (GV->getType()->getAddressSpace() == PIC16ISD::ROM_SPACE) 
+    SecTy = ROMDATA;
+  else
+    llvm_unreachable ("Could not determine section type for global");
+
+  std::string Prefix = GV->getNameStr() + "." + Addr + ".";
+  std::string SName = PAN::getUserSectionName(Prefix);
+  PIC16Section *S = getPIC16UserSection(SName.c_str(), SecTy, Addr.c_str());
+  S->Items.push_back(GV);
+  return S;
+}
+
+
diff --git a/lib/Target/PIC16/PIC16TargetObjectFile.h b/lib/Target/PIC16/PIC16TargetObjectFile.h
new file mode 100644
index 0000000..0b0ad43
--- /dev/null
+++ b/lib/Target/PIC16/PIC16TargetObjectFile.h
@@ -0,0 +1,167 @@
+//===-- PIC16TargetObjectFile.h - PIC16 Object Info -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_PIC16_TARGETOBJECTFILE_H
+#define LLVM_TARGET_PIC16_TARGETOBJECTFILE_H
+
+#include "PIC16.h"
+#include "PIC16ABINames.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/ADT/StringMap.h"
+#include <vector>
+#include <string>
+
+namespace llvm {
+  class GlobalVariable;
+  class Module;
+  class PIC16TargetMachine;
+  class PIC16Section;
+  
+  enum { DataBankSize = 80 };
+
+  /// PIC16 Splits the global data into multiple udata and idata sections.
+  /// Each udata and idata section needs to contain a list of globals that
+  /// they contain, in order to avoid scanning over all the global values 
+  /// again and printing only those that match the current section. 
+  /// Keeping values inside the sections make printing a section much easier.
+  ///
+  /// FIXME: MOVE ALL THIS STUFF TO PIC16Section.
+  ///
+
+  /// PIC16TargetObjectFile - PIC16 Object file. Contains data and code
+  /// sections. 
+  // PIC16 Object File has two types of sections.
+  // 1. Standard Sections
+  //    1.1 un-initialized global data 
+  //    1.2 initialized global data
+  //    1.3 program memory data
+  //    1.4 local variables of functions.
+  // 2. User defined sections
+  //    2.1 Objects placed in a specific section. (By _Section() macro)
+  //    2.2 Objects placed at a specific address. (By _Address() macro)
+  class PIC16TargetObjectFile : public TargetLoweringObjectFile {
+    /// SectionsByName - Bindings of names to allocated sections.
+    mutable StringMap<PIC16Section*> SectionsByName;
+
+    const TargetMachine *TM;
+    
+    /// Lists of sections.
+    /// Standard Data Sections.
+    mutable std::vector<PIC16Section *> UDATASections_;
+    mutable std::vector<PIC16Section *> IDATASections_;
+    mutable PIC16Section * ROMDATASection_;
+    mutable PIC16Section * SHAREDUDATASection_;
+
+    /// Standard Auto Sections.
+    mutable std::vector<PIC16Section *> AUTOSections_;
+ 
+    /// User specified sections.
+    mutable std::vector<PIC16Section *> USERSections_;
+
+    
+    /// Find or Create a PIC16 Section, without adding it to any
+    /// section list.
+    PIC16Section *getPIC16Section(const std::string &Name,
+                                   PIC16SectionType Ty, 
+                                   const std::string &Address = "", 
+                                   int Color = -1) const;
+
+    /// Convenience functions. These wrappers also take care of adding 
+    /// the newly created section to the appropriate sections list.
+
+    /// Find or Create PIC16 Standard Data Section.
+    PIC16Section *getPIC16DataSection(const std::string &Name,
+                                       PIC16SectionType Ty, 
+                                       const std::string &Address = "", 
+                                       int Color = -1) const;
+
+    /// Find or Create PIC16 Standard Auto Section.
+    PIC16Section *getPIC16AutoSection(const std::string &Name,
+                                       PIC16SectionType Ty = UDATA_OVR,
+                                       const std::string &Address = "", 
+                                       int Color = -1) const;
+
+    /// Find or Create PIC16 Standard Auto Section.
+    PIC16Section *getPIC16UserSection(const std::string &Name,
+                                       PIC16SectionType Ty, 
+                                       const std::string &Address = "", 
+                                       int Color = -1) const;
+
+    /// Allocate Un-initialized data to a standard UDATA section. 
+    const MCSection *allocateUDATA(const GlobalVariable *GV) const;
+
+    /// Allocate Initialized data to a standard IDATA section. 
+    const MCSection *allocateIDATA(const GlobalVariable *GV) const;
+
+    /// Allocate ROM data to the standard ROMDATA section. 
+    const MCSection *allocateROMDATA(const GlobalVariable *GV) const;
+
+    /// Allocate an AUTO variable to an AUTO section.
+    const MCSection *allocateAUTO(const GlobalVariable *GV) const;
+    
+    /// Allocate DATA in user specified section.
+    const MCSection *allocateInGivenSection(const GlobalVariable *GV) const;
+
+    /// Allocate DATA at user specified address.
+    const MCSection *allocateAtGivenAddress(const GlobalVariable *GV,
+                                            const std::string &Addr) const;
+
+    /// Allocate a shared variable to SHARED section.
+    const MCSection *allocateSHARED(const GlobalVariable *GV,
+                                    Mangler *Mang) const;
+   
+    public:
+    PIC16TargetObjectFile();
+    ~PIC16TargetObjectFile();
+    void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    /// Return the section with the given Name. Null if not found.
+    PIC16Section *findPIC16Section(const std::string &Name);
+
+    /// Override section allocations for user specified sections.
+    virtual const MCSection *
+    getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, 
+                             Mangler *Mang, const TargetMachine &TM) const;
+    
+    /// Select sections for Data and Auto variables(globals).
+    virtual const MCSection *SelectSectionForGlobal(const GlobalValue *GV,
+                                                    SectionKind Kind,
+                                                    Mangler *Mang,
+                                                    const TargetMachine&) const;
+
+
+    /// Return a code section for a function.
+    const PIC16Section *SectionForCode (const std::string &FnName) const;
+
+    /// Return a frame section for a function.
+    const PIC16Section *SectionForFrame (const std::string &FnName) const;
+
+    /// Accessors for various section lists.
+    const std::vector<PIC16Section *> &UDATASections() const {
+      return UDATASections_;
+    }
+    const std::vector<PIC16Section *> &IDATASections() const {
+      return IDATASections_;
+    }
+    const PIC16Section *ROMDATASection() const {
+      return ROMDATASection_;
+    }
+    const PIC16Section *SHAREDUDATASection() const {
+      return SHAREDUDATASection_;
+    }
+    const std::vector<PIC16Section *> &AUTOSections() const {
+      return AUTOSections_;
+    }
+    const std::vector<PIC16Section *> &USERSections() const {
+      return USERSections_;
+    }
+  };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PIC16/TargetInfo/CMakeLists.txt b/lib/Target/PIC16/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..bfc6ff4
--- /dev/null
+++ b/lib/Target/PIC16/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPIC16Info
+  PIC16TargetInfo.cpp
+  )
+
+add_dependencies(LLVMPIC16Info PIC16Table_gen)
diff --git a/lib/Target/PIC16/TargetInfo/Makefile b/lib/Target/PIC16/TargetInfo/Makefile
new file mode 100644
index 0000000..76609f6
--- /dev/null
+++ b/lib/Target/PIC16/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PIC16/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPIC16Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PIC16/TargetInfo/PIC16TargetInfo.cpp b/lib/Target/PIC16/TargetInfo/PIC16TargetInfo.cpp
new file mode 100644
index 0000000..46cc819
--- /dev/null
+++ b/lib/Target/PIC16/TargetInfo/PIC16TargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- PIC16TargetInfo.cpp - PIC16 Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PIC16.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::ThePIC16Target, llvm::TheCooperTarget;
+
+extern "C" void LLVMInitializePIC16TargetInfo() { 
+  RegisterTarget<> X(ThePIC16Target, "pic16", "PIC16 14-bit [experimental]");
+
+  RegisterTarget<> Y(TheCooperTarget, "cooper", "PIC16 Cooper [experimental]");
+}
diff --git a/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt b/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..236b264
--- /dev/null
+++ b/lib/Target/PowerPC/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPowerPCAsmPrinter
+  PPCAsmPrinter.cpp
+  )
+add_dependencies(LLVMPowerPCAsmPrinter PowerPCCodeGenTable_gen)
\ No newline at end of file
diff --git a/lib/Target/PowerPC/AsmPrinter/Makefile b/lib/Target/PowerPC/AsmPrinter/Makefile
new file mode 100644
index 0000000..269ef92
--- /dev/null
+++ b/lib/Target/PowerPC/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PowerPC/AsmPrinter/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPowerPCAsmPrinter
+
+# Hack: we need to include 'main' PowerPC target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
new file mode 100644
index 0000000..afc90b1
--- /dev/null
+++ b/lib/Target/PowerPC/AsmPrinter/PPCAsmPrinter.cpp
@@ -0,0 +1,848 @@
+//===-- PPCAsmPrinter.cpp - Print machine instrs to PowerPC assembly --------=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to PowerPC assembly language. This printer is
+// the output mechanism used by `llc'.
+//
+// Documentation at http://developer.apple.com/documentation/DeveloperTools/
+// Reference/Assembler/ASMIntroduction/chapter_1_section_1.html
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asmprinter"
+#include "PPC.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCSubtarget.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+namespace {
  /// PPCAsmPrinter - Assembly-printing logic shared by the Darwin and
  /// Linux/SVR4 PowerPC printers.  Subclasses customize the module-level
  /// begin/end hooks and the directive flavors they emit.
  class PPCAsmPrinter : public AsmPrinter {
  protected:
    // Map from a global's symbol to the local label of its TOC entry.
    // Populated by printTOCEntryLabel (64-bit SVR4 only); the entries
    // themselves are written out in PPCLinuxAsmPrinter::doFinalization.
    DenseMap<const MCSymbol*, const MCSymbol*> TOC;
    const PPCSubtarget &Subtarget;
    // Monotonic counter used to give each TOC entry a unique label name.
    uint64_t LabelID;
  public:
    explicit PPCAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
                           MCContext &Ctx, MCStreamer &Streamer,
                           const MCAsmInfo *T)
      : AsmPrinter(O, TM, Ctx, Streamer, T),
        Subtarget(TM.getSubtarget<PPCSubtarget>()), LabelID(0) {}

    virtual const char *getPassName() const {
      return "PowerPC Assembly Printer";
    }

    PPCTargetMachine &getTM() {
      return static_cast<PPCTargetMachine&>(TM);
    }

    /// enumRegToMachineReg - Map a condition-register enum value (CR0-CR7)
    /// to its 0-7 hardware encoding.  Any other register is a fatal error.
    unsigned enumRegToMachineReg(unsigned enumReg) {
      switch (enumReg) {
      default: llvm_unreachable("Unhandled register!");
      case PPC::CR0:  return  0;
      case PPC::CR1:  return  1;
      case PPC::CR2:  return  2;
      case PPC::CR3:  return  3;
      case PPC::CR4:  return  4;
      case PPC::CR5:  return  5;
      case PPC::CR6:  return  6;
      case PPC::CR7:  return  7;
      }
      llvm_unreachable(0);
    }

    /// printInstruction - This method is automatically generated by tablegen
    /// from the instruction set description.  This method returns true if the
    /// machine instruction was sufficiently described to print it, otherwise it
    /// returns false.
    void printInstruction(const MachineInstr *MI);
    static const char *getRegisterName(unsigned RegNo);


    virtual void EmitInstruction(const MachineInstr *MI);
    void printOp(const MachineOperand &MO);

    /// stripRegisterPrefix - This method strips the character prefix from a
    /// register name so that only the number is left.  Used by the Linux
    /// assembly printer, whose assembler takes bare register numbers.
    const char *stripRegisterPrefix(const char *RegName) {
      switch (RegName[0]) {
      case 'r':
      case 'f':
      case 'v': return RegName + 1;
      case 'c': if (RegName[1] == 'r') return RegName + 2;
      }

      // Not a recognized prefixed register name; return it unchanged.
      return RegName;
    }

    /// printRegister - Print register according to target requirements.
    ///
    void printRegister(const MachineOperand &MO, bool R0AsZero) {
      unsigned RegNo = MO.getReg();
      assert(TargetRegisterInfo::isPhysicalRegister(RegNo) && "Not physreg??");

      // If we should use 0 for R0.
      if (R0AsZero && RegNo == PPC::R0) {
        O << "0";
        return;
      }

      const char *RegName = getRegisterName(RegNo);
      // Linux assembler (Others?) does not take register mnemonics.
      // FIXME - What about special registers used in mfspr/mtspr?
      if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
      O << RegName;
    }

    /// printOperand - Print a generic operand: registers and immediates are
    /// handled here; everything else is deferred to printOp.
    void printOperand(const MachineInstr *MI, unsigned OpNo) {
      const MachineOperand &MO = MI->getOperand(OpNo);
      if (MO.isReg()) {
        printRegister(MO, false);
      } else if (MO.isImm()) {
        O << MO.getImm();
      } else {
        printOp(MO);
      }
    }

    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                         unsigned AsmVariant, const char *ExtraCode);
    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                               unsigned AsmVariant, const char *ExtraCode);


    /// printS5ImmOperand - Print a 5-bit signed immediate, sign-extending
    /// from bit 4 (left-shift then arithmetic right-shift).
    void printS5ImmOperand(const MachineInstr *MI, unsigned OpNo) {
      char value = MI->getOperand(OpNo).getImm();
      value = (value << (32-5)) >> (32-5);
      O << (int)value;
    }
    void printU5ImmOperand(const MachineInstr *MI, unsigned OpNo) {
      unsigned char value = MI->getOperand(OpNo).getImm();
      assert(value <= 31 && "Invalid u5imm argument!");
      O << (unsigned int)value;
    }
    void printU6ImmOperand(const MachineInstr *MI, unsigned OpNo) {
      unsigned char value = MI->getOperand(OpNo).getImm();
      assert(value <= 63 && "Invalid u6imm argument!");
      O << (unsigned int)value;
    }
    void printS16ImmOperand(const MachineInstr *MI, unsigned OpNo) {
      O << (short)MI->getOperand(OpNo).getImm();
    }
    void printU16ImmOperand(const MachineInstr *MI, unsigned OpNo) {
      O << (unsigned short)MI->getOperand(OpNo).getImm();
    }
    /// printS16X4ImmOperand - Print a signed 16-bit displacement that is
    /// stored pre-scaled by 4; symbolic operands become a lo16() expression.
    void printS16X4ImmOperand(const MachineInstr *MI, unsigned OpNo) {
      if (MI->getOperand(OpNo).isImm()) {
        O << (short)(MI->getOperand(OpNo).getImm()*4);
      } else {
        O << "lo16(";
        printOp(MI->getOperand(OpNo));
        if (TM.getRelocationModel() == Reloc::PIC_)
          O << "-\"L" << getFunctionNumber() << "$pb\")";
        else
          O << ')';
      }
    }
    void printBranchOperand(const MachineInstr *MI, unsigned OpNo) {
      // Branches can take an immediate operand.  This is used by the branch
      // selection pass to print $+8, an eight byte displacement from the PC.
      if (MI->getOperand(OpNo).isImm()) {
        O << "$+" << MI->getOperand(OpNo).getImm()*4;
      } else {
        printOp(MI->getOperand(OpNo));
      }
    }
    /// printCallOperand - Print a call target.  Under non-static relocation
    /// models, calls to undefined/weak globals or external symbols are
    /// redirected through a "$stub" symbol; the (stub, target) pair is
    /// recorded so EmitFunctionStubs can emit the stub body at module end.
    void printCallOperand(const MachineInstr *MI, unsigned OpNo) {
      const MachineOperand &MO = MI->getOperand(OpNo);
      if (TM.getRelocationModel() != Reloc::Static) {
        if (MO.getType() == MachineOperand::MO_GlobalAddress) {
          GlobalValue *GV = MO.getGlobal();
          if (GV->isDeclaration() || GV->isWeakForLinker()) {
            // Dynamically-resolved functions need a stub for the function.
            MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
            MCSymbol *&StubSym =
              MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
            if (StubSym == 0)
              StubSym = GetGlobalValueSymbol(GV);
            O << *Sym;
            return;
          }
        }
        if (MO.getType() == MachineOperand::MO_ExternalSymbol) {
          SmallString<128> TempNameStr;
          TempNameStr += StringRef(MO.getSymbolName());
          TempNameStr += StringRef("$stub");
          
          MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
          MCSymbol *&StubSym =
            MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
          if (StubSym == 0)
            StubSym = GetExternalSymbolSymbol(MO.getSymbolName());
          O << *Sym;
          return;
        }
      }

      // Static model or direct call: print the target like any operand.
      printOp(MI->getOperand(OpNo));
    }
    void printAbsAddrOperand(const MachineInstr *MI, unsigned OpNo) {
     O << (int)MI->getOperand(OpNo).getImm()*4;
    }
    /// printPICLabel - Emit the per-function PIC base label, first as the
    /// branch-target operand text and then as the label definition itself.
    void printPICLabel(const MachineInstr *MI, unsigned OpNo) {
      O << "\"L" << getFunctionNumber() << "$pb\"\n";
      O << "\"L" << getFunctionNumber() << "$pb\":";
    }
    /// printSymbolHi - Print the high 16 bits of a symbol address:
    /// ha16(sym) on Darwin, sym@ha elsewhere.
    void printSymbolHi(const MachineInstr *MI, unsigned OpNo) {
      if (MI->getOperand(OpNo).isImm()) {
        printS16ImmOperand(MI, OpNo);
      } else {
        if (Subtarget.isDarwin()) O << "ha16(";
        printOp(MI->getOperand(OpNo));
        if (TM.getRelocationModel() == Reloc::PIC_)
          O << "-\"L" << getFunctionNumber() << "$pb\"";
        if (Subtarget.isDarwin())
          O << ')';
        else
          O << "@ha";
      }
    }
    /// printSymbolLo - Print the low 16 bits of a symbol address:
    /// lo16(sym) on Darwin, sym@l elsewhere.
    void printSymbolLo(const MachineInstr *MI, unsigned OpNo) {
      if (MI->getOperand(OpNo).isImm()) {
        printS16ImmOperand(MI, OpNo);
      } else {
        if (Subtarget.isDarwin()) O << "lo16(";
        printOp(MI->getOperand(OpNo));
        if (TM.getRelocationModel() == Reloc::PIC_)
          O << "-\"L" << getFunctionNumber() << "$pb\"";
        if (Subtarget.isDarwin())
          O << ')';
        else
          O << "@l";
      }
    }
    /// printcrbitm - Print the field mask 0x80 >> N for condition register
    /// crN -- presumably the CRM field of mtcrf; confirm against uses.
    void printcrbitm(const MachineInstr *MI, unsigned OpNo) {
      unsigned CCReg = MI->getOperand(OpNo).getReg();
      unsigned RegNo = enumRegToMachineReg(CCReg);
      O << (0x80 >> RegNo);
    }
    // The new addressing mode printers.
    void printMemRegImm(const MachineInstr *MI, unsigned OpNo) {
      printSymbolLo(MI, OpNo);
      O << '(';
      // r0 as a base register reads as constant zero, so print it as "0".
      if (MI->getOperand(OpNo+1).isReg() &&
          MI->getOperand(OpNo+1).getReg() == PPC::R0)
        O << "0";
      else
        printOperand(MI, OpNo+1);
      O << ')';
    }
    void printMemRegImmShifted(const MachineInstr *MI, unsigned OpNo) {
      if (MI->getOperand(OpNo).isImm())
        printS16X4ImmOperand(MI, OpNo);
      else
        printSymbolLo(MI, OpNo);
      O << '(';
      // r0 as a base register reads as constant zero, so print it as "0".
      if (MI->getOperand(OpNo+1).isReg() &&
          MI->getOperand(OpNo+1).getReg() == PPC::R0)
        O << "0";
      else
        printOperand(MI, OpNo+1);
      O << ')';
    }

    void printMemRegReg(const MachineInstr *MI, unsigned OpNo) {
      // When used as the base register, r0 reads constant zero rather than
      // the value contained in the register.  For this reason, the darwin
      // assembler requires that we print r0 as 0 (no r) when used as the base.
      const MachineOperand &MO = MI->getOperand(OpNo);
      printRegister(MO, true);
      O << ", ";
      printOperand(MI, OpNo+1);
    }

    /// printTOCEntryLabel - Print the label of the TOC entry for a global,
    /// creating a new uniquely-named entry on first use (64-bit SVR4 only).
    void printTOCEntryLabel(const MachineInstr *MI, unsigned OpNo) {
      const MachineOperand &MO = MI->getOperand(OpNo);

      assert(MO.getType() == MachineOperand::MO_GlobalAddress);

      const MCSymbol *Sym = GetGlobalValueSymbol(MO.getGlobal());

      // Map symbol -> label of TOC entry.
      const MCSymbol *&TOCEntry = TOC[Sym];
      if (TOCEntry == 0)
        TOCEntry = OutContext.
          GetOrCreateSymbol(StringRef(MAI->getPrivateGlobalPrefix()) + "C" +
                            Twine(LabelID++));

      O << *TOCEntry << "@toc";
    }

    void printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
                               const char *Modifier);
  };
+
  /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
  class PPCLinuxAsmPrinter : public PPCAsmPrinter {
  public:
    explicit PPCLinuxAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
                                MCContext &Ctx, MCStreamer &Streamer,
                                const MCAsmInfo *T)
      : PPCAsmPrinter(O, TM, Ctx, Streamer, T) {}

    virtual const char *getPassName() const {
      return "Linux PPC Assembly Printer";
    }

    // Emits the accumulated 64-bit TOC entries, then delegates to the base.
    bool doFinalization(Module &M);

    // On 64-bit SVR4, emits a procedure descriptor in .opd instead of a
    // plain function entry label.
    virtual void EmitFunctionEntryLabel();

    void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.setPreservesAll();
      AU.addRequired<MachineModuleInfo>();
      AU.addRequired<DwarfWriter>();
      PPCAsmPrinter::getAnalysisUsage(AU);
    }
  };
+
  /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
  /// OS X
  class PPCDarwinAsmPrinter : public PPCAsmPrinter {
    // NOTE(review): OS aliases the same stream the AsmPrinter base already
    // exposes as 'O' and is never read in this file -- candidate for removal.
    formatted_raw_ostream &OS;
  public:
    explicit PPCDarwinAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
                                 MCContext &Ctx, MCStreamer &Streamer,
                                 const MCAsmInfo *T)
      : PPCAsmPrinter(O, TM, Ctx, Streamer, T), OS(O) {}

    virtual const char *getPassName() const {
      return "Darwin PPC Assembly Printer";
    }

    // Emits the function stubs and non-lazy pointers accumulated during the
    // run, then delegates to the base finalization.
    bool doFinalization(Module &M);
    // Emits the Darwin .machine directive and primes the text sections.
    void EmitStartOfAsmFile(Module &M);

    // Prints a lazy-binding stub for every dynamically-resolved function
    // recorded by printCallOperand.
    void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs);
    
    void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.setPreservesAll();
      AU.addRequired<MachineModuleInfo>();
      AU.addRequired<DwarfWriter>();
      PPCAsmPrinter::getAnalysisUsage(AU);
    }
  };
+} // end of anonymous namespace
+
+// Include the auto-generated portion of the assembly writer
+#include "PPCGenAsmWriter.inc"
+
/// printOp - Print a machine operand that is neither a register nor a raw
/// immediate: basic blocks, jump-table/constant-pool indices, block
/// addresses, external symbols and global addresses.  Under non-static
/// relocation models, external/global references may be redirected through
/// "$non_lazy_ptr" stubs, recorded here for emission in doFinalization.
void PPCAsmPrinter::printOp(const MachineOperand &MO) {
  switch (MO.getType()) {
  case MachineOperand::MO_Immediate:
    llvm_unreachable("printOp() does not handle immediate values");

  case MachineOperand::MO_MachineBasicBlock:
    O << *MO.getMBB()->getSymbol(OutContext);
    return;
  case MachineOperand::MO_JumpTableIndex:
    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
      << '_' << MO.getIndex();
    // FIXME: PIC relocation model
    return;
  case MachineOperand::MO_ConstantPoolIndex:
    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
      << '_' << MO.getIndex();
    return;
  case MachineOperand::MO_BlockAddress:
    O << *GetBlockAddressSymbol(MO.getBlockAddress());
    return;
  case MachineOperand::MO_ExternalSymbol: {
    // Computing the address of an external symbol, not calling it.
    if (TM.getRelocationModel() == Reloc::Static) {
      O << *GetExternalSymbolSymbol(MO.getSymbolName());
      return;
    }

    // Non-static: go through a $non_lazy_ptr, registering the underlying
    // symbol so the pointer itself is emitted at module end.
    MCSymbol *NLPSym = 
      OutContext.GetOrCreateSymbol(StringRef(MAI->getGlobalPrefix())+
                                   MO.getSymbolName()+"$non_lazy_ptr");
    MCSymbol *&StubSym = 
      MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(NLPSym);
    if (StubSym == 0)
      StubSym = GetExternalSymbolSymbol(MO.getSymbolName());
    
    O << *NLPSym;
    return;
  }
  case MachineOperand::MO_GlobalAddress: {
    // Computing the address of a global symbol, not calling it.
    GlobalValue *GV = MO.getGlobal();
    MCSymbol *SymToPrint;

    // External or weakly linked global variables need non-lazily-resolved stubs
    if (TM.getRelocationModel() != Reloc::Static &&
        (GV->isDeclaration() || GV->isWeakForLinker())) {
      if (!GV->hasHiddenVisibility()) {
        SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
        MCSymbol *&StubSym = 
       MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(SymToPrint);
        if (StubSym == 0)
          StubSym = GetGlobalValueSymbol(GV);
      } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
                 GV->hasAvailableExternallyLinkage()) {
        // Hidden-visibility declarations use a separate stub list.
        SymToPrint = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
        
        MCSymbol *&StubSym = 
          MMI->getObjFileInfo<MachineModuleInfoMachO>().
                    getHiddenGVStubEntry(SymToPrint);
        if (StubSym == 0)
          StubSym = GetGlobalValueSymbol(GV);
      } else {
        SymToPrint = GetGlobalValueSymbol(GV);
      }
    } else {
      SymToPrint = GetGlobalValueSymbol(GV);
    }
    
    O << *SymToPrint;

    printOffset(MO.getOffset());
    return;
  }

  default:
    O << "<unknown operand type: " << MO.getType() << ">";
    return;
  }
}
+
/// PrintAsmOperand - Print out an operand for an inline asm expression.
/// Returns true on failure (unknown modifier or malformed operand), false
/// once the operand has been printed.
bool PPCAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                    unsigned AsmVariant,
                                    const char *ExtraCode) {
  // Does this asm operand have a single letter operand modifier?
  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0) return true; // Unknown modifier.

    switch (ExtraCode[0]) {
    default: return true;  // Unknown modifier.
    case 'c': // Don't print "$" before a global var name or constant.
      // PPC never has a prefix.
      printOperand(MI, OpNo);
      return false;
    case 'L': // Write second word of DImode reference.
      // Verify that this operand has two consecutive registers.
      if (!MI->getOperand(OpNo).isReg() ||
          OpNo+1 == MI->getNumOperands() ||
          !MI->getOperand(OpNo+1).isReg())
        return true;
      ++OpNo;   // Return the high-part.
      break;    // Falls through to the common printOperand call below.
    case 'I':
      // Write 'i' if an integer constant, otherwise nothing.  Used to print
      // addi vs add, etc.
      if (MI->getOperand(OpNo).isImm())
        O << "i";
      return false;
    }
  }

  // No modifier (or 'L' after bumping OpNo): print normally.
  printOperand(MI, OpNo);
  return false;
}
+
+// At the moment, all inline asm memory operands are a single register.
+// In any case, the output of this routine should always be just one
+// assembler operand.
+
+bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                                          unsigned AsmVariant,
+                                          const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+  assert (MI->getOperand(OpNo).isReg());
+  O << "0(";
+  printOperand(MI, OpNo);
+  O << ")";
+  return false;
+}
+
/// printPredicateOperand - Print a PPC predicate operand.  With Modifier
/// "cc" the condition mnemonic for the code at OpNo is printed (nothing for
/// 'always'); with "reg" the condition register operand at OpNo+1 is
/// printed instead (again, nothing for 'always').
void PPCAsmPrinter::printPredicateOperand(const MachineInstr *MI, unsigned OpNo,
                                          const char *Modifier) {
  assert(Modifier && "Must specify 'cc' or 'reg' as predicate op modifier!");
  unsigned Code = MI->getOperand(OpNo).getImm();
  if (!strcmp(Modifier, "cc")) {
    switch ((PPC::Predicate)Code) {
    case PPC::PRED_ALWAYS: return; // Don't print anything for always.
    case PPC::PRED_LT: O << "lt"; return;
    case PPC::PRED_LE: O << "le"; return;
    case PPC::PRED_EQ: O << "eq"; return;
    case PPC::PRED_GE: O << "ge"; return;
    case PPC::PRED_GT: O << "gt"; return;
    case PPC::PRED_NE: O << "ne"; return;
    case PPC::PRED_UN: O << "un"; return;
    case PPC::PRED_NU: O << "nu"; return;
    }

  } else {
    assert(!strcmp(Modifier, "reg") &&
           "Need to specify 'cc' or 'reg' as predicate op modifier!");
    // Don't print the register for 'always'.
    if (Code == PPC::PRED_ALWAYS) return;
    printOperand(MI, OpNo+1);
  }
}
+

/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
/// the current output stream.  Recognizes a few simplified mnemonics
/// (slwi/srwi/mr/sldi) before falling back to the tablegen'd printer.
///
void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
  // Check for slwi/srwi mnemonics.
  if (MI->getOpcode() == PPC::RLWINM) {
    unsigned char SH = MI->getOperand(2).getImm();
    unsigned char MB = MI->getOperand(3).getImm();
    unsigned char ME = MI->getOperand(4).getImm();
    bool useSubstituteMnemonic = false;
    // rlwinm RA, RS, SH, 0, 31-SH  ==  slwi RA, RS, SH
    if (SH <= 31 && MB == 0 && ME == (31-SH)) {
      O << "\tslwi "; useSubstituteMnemonic = true;
    }
    // rlwinm RA, RS, SH, 32-SH, 31  ==  srwi RA, RS, 32-SH
    if (SH <= 31 && MB == (32-SH) && ME == 31) {
      O << "\tsrwi "; useSubstituteMnemonic = true;
      SH = 32-SH;
    }
    if (useSubstituteMnemonic) {
      printOperand(MI, 0);
      O << ", ";
      printOperand(MI, 1);
      O << ", " << (unsigned int)SH;
      OutStreamer.AddBlankLine();
      return;
    }
  }
  
  // or RA, RS, RS  ==  mr RA, RS
  if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) &&
      MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
    O << "\tmr ";
    printOperand(MI, 0);
    O << ", ";
    printOperand(MI, 1);
    OutStreamer.AddBlankLine();
    return;
  }
  
  if (MI->getOpcode() == PPC::RLDICR) {
    unsigned char SH = MI->getOperand(2).getImm();
    unsigned char ME = MI->getOperand(3).getImm();
    // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH
    if (63-SH == ME) {
      O << "\tsldi ";
      printOperand(MI, 0);
      O << ", ";
      printOperand(MI, 1);
      O << ", " << (unsigned int)SH;
      OutStreamer.AddBlankLine();
      return;
    }
  }

  // No simplified form applies; use the tablegen-generated printer.
  printInstruction(MI);
  OutStreamer.AddBlankLine();
}
+
void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
  if (!Subtarget.isPPC64())  // linux/ppc32 - Normal entry label.
    return AsmPrinter::EmitFunctionEntryLabel();
    
  // Emit an official procedure descriptor.
  // FIXME 64-bit SVR4: Use MCSection here!
  O << "\t.section\t\".opd\",\"aw\"\n";
  O << "\t.align 3\n";
  OutStreamer.EmitLabel(CurrentFnSym);
  // The function symbol labels the descriptor; the descriptor points at the
  // real code entry (.L.<fn>) and the TOC base.
  O << "\t.quad .L." << *CurrentFnSym << ",.TOC.@tocbase\n";
  O << "\t.previous\n";
  O << ".L." << *CurrentFnSym << ":\n";
}
+
+
bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
  const TargetData *TD = TM.getTargetData();

  bool isPPC64 = TD->getPointerSizeInBits() == 64;

  // Emit the TOC entries accumulated by printTOCEntryLabel during the run.
  if (isPPC64 && !TOC.empty()) {
    // FIXME 64-bit SVR4: Use MCSection here?
    O << "\t.section\t\".toc\",\"aw\"\n";

    // FIXME: This is nondeterminstic!
    for (DenseMap<const MCSymbol*, const MCSymbol*>::iterator I = TOC.begin(),
         E = TOC.end(); I != E; ++I) {
      // Each entry: its unique label, then a .tc directive naming the global.
      O << *I->second << ":\n";
      O << "\t.tc " << *I->first << "[TC]," << *I->first << '\n';
    }
  }

  return AsmPrinter::doFinalization(M);
}
+
/// EmitStartOfAsmFile - Emit the Darwin .machine directive and prime the
/// stub/text sections so they come out adjacent in the object file.
void PPCDarwinAsmPrinter::EmitStartOfAsmFile(Module &M) {
  // Indexed by the PPC::DIR_* enum values; the two must stay in sync.
  static const char *const CPUDirectives[] = {
    "",
    "ppc",
    "ppc601",
    "ppc602",
    "ppc603",
    "ppc7400",
    "ppc750",
    "ppc970",
    "ppc64"
  };

  unsigned Directive = Subtarget.getDarwinDirective();
  // Promote the directive to match the subtarget's actual capabilities.
  if (Subtarget.isGigaProcessor() && Directive < PPC::DIR_970)
    Directive = PPC::DIR_970;
  if (Subtarget.hasAltivec() && Directive < PPC::DIR_7400)
    Directive = PPC::DIR_7400;
  // NOTE(review): compares against DIR_970 (not DIR_64) before forcing
  // ppc64 -- presumably deliberate so a 970 directive is preserved; confirm.
  if (Subtarget.isPPC64() && Directive < PPC::DIR_970)
    Directive = PPC::DIR_64;
  assert(Directive <= PPC::DIR_64 && "Directive out of range.");
  O << "\t.machine " << CPUDirectives[Directive] << '\n';

  // Prime text sections so they are adjacent.  This reduces the likelihood a
  // large data or debug section causes a branch to exceed 16M limit.
  TargetLoweringObjectFileMachO &TLOFMacho = 
    static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
  OutStreamer.SwitchSection(TLOFMacho.getTextCoalSection());
  if (TM.getRelocationModel() == Reloc::PIC_) {
    OutStreamer.SwitchSection(
            TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub1",
                                      MCSectionMachO::S_SYMBOL_STUBS |
                                      MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
                                      32, SectionKind::getText()));
  } else if (TM.getRelocationModel() == Reloc::DynamicNoPIC) {
    OutStreamer.SwitchSection(
            TLOFMacho.getMachOSection("__TEXT","__symbol_stub1",
                                      MCSectionMachO::S_SYMBOL_STUBS |
                                      MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
                                      16, SectionKind::getText()));
  }
  // End on the regular text section so function emission starts there.
  OutStreamer.SwitchSection(getObjFileLowering().getTextSection());
}
+
static const MCSymbol *GetLazyPtr(const MCSymbol *Sym, MCContext &Ctx) {
  // Remove $stub suffix, add $lazy_ptr.
  // NOTE(review): assumes Sym's name always ends in the 5-character "$stub"
  // suffix (true for the symbols created by printCallOperand) -- there is
  // no check here, so any other caller would silently truncate the name.
  SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end()-5);
  TmpStr += "$lazy_ptr";
  return Ctx.GetOrCreateSymbol(TmpStr.str());
}
+
+static const MCSymbol *GetAnonSym(const MCSymbol *Sym, MCContext &Ctx) {
+  // Add $tmp suffix to $stub, yielding $stub$tmp.
+  SmallString<128> TmpStr(Sym->getName().begin(), Sym->getName().end());
+  TmpStr += "$tmp";
+  return Ctx.GetOrCreateSymbol(TmpStr.str());
+}
+
/// EmitFunctionStubs - Emit a Mach-O symbol stub plus a lazy symbol pointer
/// for every (stub, target) pair collected by printCallOperand.  PIC stubs
/// compute their own address with a bcl/mflr pair; DynamicNoPIC stubs use
/// absolute ha16/lo16 addressing.
void PPCDarwinAsmPrinter::
EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs) {
  bool isPPC64 = TM.getTargetData()->getPointerSizeInBits() == 64;
  
  TargetLoweringObjectFileMachO &TLOFMacho = 
    static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());

  // .lazy_symbol_pointer
  const MCSection *LSPSection = TLOFMacho.getLazySymbolPointerSection();
  
  // Output stubs for dynamically-linked functions
  if (TM.getRelocationModel() == Reloc::PIC_) {
    const MCSection *StubSection = 
    TLOFMacho.getMachOSection("__TEXT", "__picsymbolstub1",
                              MCSectionMachO::S_SYMBOL_STUBS |
                              MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
                              32, SectionKind::getText());
    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
      OutStreamer.SwitchSection(StubSection);
      EmitAlignment(4);
      
      const MCSymbol *Stub = Stubs[i].first;
      const MCSymbol *RawSym = Stubs[i].second;
      const MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);
      const MCSymbol *AnonSymbol = GetAnonSym(Stub, OutContext);
                                           
      // bcl 20,31 is a "branch always" that records the return address,
      // letting the stub compute the lazy pointer's address PC-relatively.
      O << *Stub << ":\n";
      O << "\t.indirect_symbol " << *RawSym << '\n';
      O << "\tmflr r0\n";
      O << "\tbcl 20,31," << *AnonSymbol << '\n';
      O << *AnonSymbol << ":\n";
      O << "\tmflr r11\n";
      O << "\taddis r11,r11,ha16(" << *LazyPtr << '-' << *AnonSymbol
      << ")\n";
      O << "\tmtlr r0\n";
      O << (isPPC64 ? "\tldu" : "\tlwzu") << " r12,lo16(" << *LazyPtr
      << '-' << *AnonSymbol << ")(r11)\n";
      O << "\tmtctr r12\n";
      O << "\tbctr\n";
      
      // The lazy pointer starts out pointing at the dyld binding helper.
      OutStreamer.SwitchSection(LSPSection);
      O << *LazyPtr << ":\n";
      O << "\t.indirect_symbol " << *RawSym << '\n';
      O << (isPPC64 ? "\t.quad" : "\t.long") << " dyld_stub_binding_helper\n";
    }
    O << '\n';
    return;
  }
  
  // Non-PIC (DynamicNoPIC) stubs: load the lazy pointer by absolute address.
  const MCSection *StubSection =
    TLOFMacho.getMachOSection("__TEXT","__symbol_stub1",
                              MCSectionMachO::S_SYMBOL_STUBS |
                              MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
                              16, SectionKind::getText());
  for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
    const MCSymbol *Stub = Stubs[i].first;
    const MCSymbol *RawSym = Stubs[i].second;
    const MCSymbol *LazyPtr = GetLazyPtr(Stub, OutContext);

    OutStreamer.SwitchSection(StubSection);
    EmitAlignment(4);
    O << *Stub << ":\n";
    O << "\t.indirect_symbol " << *RawSym << '\n';
    O << "\tlis r11,ha16(" << *LazyPtr << ")\n";
    O << (isPPC64 ? "\tldu" :  "\tlwzu") << " r12,lo16(" << *LazyPtr
    << ")(r11)\n";
    O << "\tmtctr r12\n";
    O << "\tbctr\n";
    OutStreamer.SwitchSection(LSPSection);
    O << *LazyPtr << ":\n";
    O << "\t.indirect_symbol " << *RawSym << '\n';
    O << (isPPC64 ? "\t.quad" : "\t.long") << " dyld_stub_binding_helper\n";
  }
  
  O << '\n';
}
+
+
/// doFinalization - Emit all the Mach-O indirection machinery accumulated
/// during code emission (function stubs, non-lazy pointers, hidden-GV
/// pointers), then delegate to the base class.
bool PPCDarwinAsmPrinter::doFinalization(Module &M) {
  bool isPPC64 = TM.getTargetData()->getPointerSizeInBits() == 64;

  // Darwin/PPC always uses mach-o.
  TargetLoweringObjectFileMachO &TLOFMacho = 
    static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
  MachineModuleInfoMachO &MMIMacho =
    MMI->getObjFileInfo<MachineModuleInfoMachO>();
  
  MachineModuleInfoMachO::SymbolListTy Stubs = MMIMacho.GetFnStubList();
  if (!Stubs.empty())
    EmitFunctionStubs(Stubs);

  if (MAI->doesSupportExceptionHandling() && MMI) {
    // Add the (possibly multiple) personalities to the set of global values.
    // Only referenced functions get into the Personalities list.
    const std::vector<Function *> &Personalities = MMI->getPersonalities();
    for (std::vector<Function *>::const_iterator I = Personalities.begin(),
         E = Personalities.end(); I != E; ++I) {
      if (*I) {
        MCSymbol *NLPSym = GetSymbolWithGlobalValueBase(*I, "$non_lazy_ptr");
        MCSymbol *&StubSym = MMIMacho.getGVStubEntry(NLPSym);
        StubSym = GetGlobalValueSymbol(*I);
      }
    }
  }

  // Output stubs for dynamically-linked functions.
  Stubs = MMIMacho.GetGVStubList();
  
  // Output macho stubs for external and common global variables.
  if (!Stubs.empty()) {
    // Switch with ".non_lazy_symbol_pointer" directive.
    OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
    EmitAlignment(isPPC64 ? 3 : 2);
    
    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
      // Pointer starts as 0; dyld fills it in via the indirect symbol.
      O << *Stubs[i].first << ":\n";
      O << "\t.indirect_symbol " << *Stubs[i].second << '\n';
      O << (isPPC64 ? "\t.quad\t0\n" : "\t.long\t0\n");
    }
  }

  // Hidden-visibility globals get plain data-section pointers instead.
  Stubs = MMIMacho.GetHiddenGVStubList();
  if (!Stubs.empty()) {
    OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
    EmitAlignment(isPPC64 ? 3 : 2);
    
    for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
      O << *Stubs[i].first << ":\n";
      O << (isPPC64 ? "\t.quad\t" : "\t.long\t") << *Stubs[i].second << '\n';
    }
  }

  // Funny Darwin hack: This flag tells the linker that no global symbols
  // contain code that falls through to other global symbols (e.g. the obvious
  // implementation of multiple entry points).  If this doesn't occur, the
  // linker can safely perform dead code stripping.  Since LLVM never generates
  // code that does this, it is always safe to set.
  OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);

  return AsmPrinter::doFinalization(M);
}
+
+
+
+/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
+/// for a MachineFunction to the given output stream, in a format that the
+/// Darwin assembler can deal with.
+///
+static AsmPrinter *createPPCAsmPrinterPass(formatted_raw_ostream &o,
+                                           TargetMachine &tm,
+                                           MCContext &Ctx, MCStreamer &Streamer,
+                                           const MCAsmInfo *tai) {
+  const PPCSubtarget *Subtarget = &tm.getSubtarget<PPCSubtarget>();
+
+  if (Subtarget->isDarwin())
+    return new PPCDarwinAsmPrinter(o, tm, Ctx, Streamer, tai);
+  return new PPCLinuxAsmPrinter(o, tm, Ctx, Streamer, tai);
+}
+
// Force static initialization.
extern "C" void LLVMInitializePowerPCAsmPrinter() { 
  // Both ppc32 and ppc64 share one factory; the Darwin/Linux split happens
  // inside createPPCAsmPrinterPass at construction time.
  TargetRegistry::RegisterAsmPrinter(ThePPC32Target, createPPCAsmPrinterPass);
  TargetRegistry::RegisterAsmPrinter(ThePPC64Target, createPPCAsmPrinterPass);
}
diff --git a/lib/Target/PowerPC/CMakeLists.txt b/lib/Target/PowerPC/CMakeLists.txt
new file mode 100644
index 0000000..c997c5c
--- /dev/null
+++ b/lib/Target/PowerPC/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Build rules for the PowerPC code generator library.
+set(LLVM_TARGET_DEFINITIONS PPC.td)
+
+# Run TableGen over PPC.td to produce the generated .inc files that the C++
+# sources below #include.
+tablegen(PPCGenInstrNames.inc -gen-instr-enums)
+tablegen(PPCGenRegisterNames.inc -gen-register-enums)
+tablegen(PPCGenAsmWriter.inc -gen-asm-writer)
+tablegen(PPCGenCodeEmitter.inc -gen-emitter)
+tablegen(PPCGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(PPCGenRegisterInfo.inc -gen-register-desc)
+tablegen(PPCGenInstrInfo.inc -gen-instr-desc)
+tablegen(PPCGenDAGISel.inc -gen-dag-isel)
+tablegen(PPCGenCallingConv.inc -gen-callingconv)
+tablegen(PPCGenSubtarget.inc -gen-subtarget)
+
+# The sources making up libLLVMPowerPCCodeGen.
+add_llvm_target(PowerPCCodeGen
+  PPCBranchSelector.cpp
+  PPCCodeEmitter.cpp
+  PPCHazardRecognizers.cpp
+  PPCInstrInfo.cpp
+  PPCISelDAGToDAG.cpp
+  PPCISelLowering.cpp
+  PPCJITInfo.cpp
+  PPCMCAsmInfo.cpp
+  PPCPredicates.cpp
+  PPCRegisterInfo.cpp
+  PPCSubtarget.cpp
+  PPCTargetMachine.cpp
+  )
+
+# Instruction selection depends on the SelectionDAG library.
+target_link_libraries (LLVMPowerPCCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/PowerPC/Makefile b/lib/Target/PowerPC/Makefile
new file mode 100644
index 0000000..1265f1d
--- /dev/null
+++ b/lib/Target/PowerPC/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/PowerPC/Makefile -------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMPowerPCCodeGen
+TARGET = PPC
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = PPCGenInstrNames.inc PPCGenRegisterNames.inc \
+                PPCGenAsmWriter.inc  PPCGenCodeEmitter.inc \
+                PPCGenRegisterInfo.h.inc PPCGenRegisterInfo.inc \
+                PPCGenInstrInfo.inc PPCGenDAGISel.inc \
+                PPCGenSubtarget.inc PPCGenCallingConv.inc
+
+# Also descend into the AsmPrinter and TargetInfo sublibraries.
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h
new file mode 100644
index 0000000..67e3a4a
--- /dev/null
+++ b/lib/Target/PowerPC/PPC.h
@@ -0,0 +1,47 @@
+//===-- PPC.h - Top-level interface for PowerPC Target ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// PowerPC back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_H
+#define LLVM_TARGET_POWERPC_H
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class PPCTargetMachine;
+  class FunctionPass;
+  // JITCodeEmitter is referenced by createPPCJITCodeEmitterPass below; declare
+  // it here (as ARM.h does) rather than relying on TargetMachine.h to do so.
+  class JITCodeEmitter;
+  class formatted_raw_ostream;
+  
+/// createPPCBranchSelectionPass - Returns a pass that rewrites conditional
+/// branches whose targets are out of 16-bit displacement range.
+FunctionPass *createPPCBranchSelectionPass();
+
+/// createPPCISelDag - Returns the PowerPC DAG-to-DAG instruction selector.
+FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
+
+/// createPPCJITCodeEmitterPass - Returns a pass that emits machine code for
+/// the JIT into the given JITCodeEmitter.
+FunctionPass *createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+                                          JITCodeEmitter &MCE);
+
+extern Target ThePPC32Target;
+extern Target ThePPC64Target;
+
+} // end namespace llvm
+
+// Defines symbolic names for PowerPC registers.  This defines a mapping from
+// register name to register number.
+//
+#include "PPCGenRegisterNames.inc"
+
+// Defines symbolic names for the PowerPC instructions.
+//
+#include "PPCGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/PowerPC/PPC.td b/lib/Target/PowerPC/PPC.td
new file mode 100644
index 0000000..08f5bb4
--- /dev/null
+++ b/lib/Target/PowerPC/PPC.td
@@ -0,0 +1,114 @@
+//===- PPC.td - Describe the PowerPC Target Machine --------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This is the top level entry point for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing.
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC Subtarget features.
+//
+ 
+//===----------------------------------------------------------------------===//
+// CPU Directives                                                             //
+//===----------------------------------------------------------------------===//
+
+// Each CPU directive records (in the PPCSubtarget DarwinDirective field) the
+// ".machine"-style directive the Darwin asm printer should emit.  The
+// SubtargetFeature name string is empty, so these are not user-selectable
+// through -mattr.
+def Directive601 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_601", "">;
+def Directive602 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_602", "">;
+def Directive603 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+// NOTE(review): Directive604 and Directive620 both map to PPC::DIR_603.
+// Presumably intentional (no distinct DIR_604/DIR_620 values defined), but
+// confirm this is not a copy/paste slip.
+def Directive604 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive620 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_603", "">;
+def Directive7400: SubtargetFeature<"", "DarwinDirective", "PPC::DIR_7400", "">;
+def Directive750 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_750", "">;
+def Directive970 : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_970", "">;
+def Directive32  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_32", "">;
+def Directive64  : SubtargetFeature<"", "DarwinDirective", "PPC::DIR_64", "">;
+
+// User-selectable subtarget features (-mattr / set by the -mcpu entries
+// below); each toggles the named PPCSubtarget boolean.
+def Feature64Bit     : SubtargetFeature<"64bit","Has64BitSupport", "true",
+                                        "Enable 64-bit instructions">;
+def Feature64BitRegs : SubtargetFeature<"64bitregs","Use64BitRegs", "true",
+                              "Enable 64-bit registers usage for ppc32 [beta]">;
+def FeatureAltivec   : SubtargetFeature<"altivec","HasAltivec", "true",
+                                        "Enable Altivec instructions">;
+def FeatureGPUL      : SubtargetFeature<"gpul","IsGigaProcessor", "true",
+                                        "Enable GPUL instructions">;
+def FeatureFSqrt     : SubtargetFeature<"fsqrt","HasFSQRT", "true",
+                                        "Enable the fsqrt instruction">; 
+def FeatureSTFIWX    : SubtargetFeature<"stfiwx","HasSTFIWX", "true",
+                                        "Enable the stfiwx instruction">; 
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "PPCRegisterInfo.td"
+include "PPCSchedule.td"
+include "PPCInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC processors supported.
+//
+
+// One Processor entry per -mcpu value, pairing a scheduling itinerary with
+// the feature/directive list applied for that CPU.
+def : Processor<"generic", G3Itineraries, [Directive32]>;
+def : Processor<"601", G3Itineraries, [Directive601]>;
+def : Processor<"602", G3Itineraries, [Directive602]>;
+def : Processor<"603", G3Itineraries, [Directive603]>;
+def : Processor<"603e", G3Itineraries, [Directive603]>;
+def : Processor<"603ev", G3Itineraries, [Directive603]>;
+def : Processor<"604", G3Itineraries, [Directive604]>;
+def : Processor<"604e", G3Itineraries, [Directive604]>;
+def : Processor<"620", G3Itineraries, [Directive620]>;
+// NOTE(review): "g3" selects Directive7400 and "g4+" selects Directive750;
+// these look swapped relative to the hardware (g3 ~ 750, g4+ ~ 7450) --
+// confirm against the intended Darwin machine directives.
+def : Processor<"g3", G3Itineraries, [Directive7400]>;
+def : Processor<"7400", G4Itineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"g4", G4Itineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"7450", G4PlusItineraries, [Directive7400, FeatureAltivec]>;
+def : Processor<"g4+", G4PlusItineraries, [Directive750, FeatureAltivec]>;
+def : Processor<"750", G4Itineraries, [Directive750, FeatureAltivec]>;
+// 970/G5/ppc64 enable the 64-bit instruction set; 64-bit *register* usage for
+// ppc32 (Feature64BitRegs) is still commented out as beta.
+def : Processor<"970", G5Itineraries,
+                  [Directive970, FeatureAltivec,
+                   FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"g5", G5Itineraries,
+                  [Directive970, FeatureAltivec,
+                   FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+def : Processor<"ppc", G3Itineraries, [Directive32]>;
+def : Processor<"ppc64", G5Itineraries,
+                  [Directive64, FeatureAltivec,
+                   FeatureGPUL, FeatureFSqrt, FeatureSTFIWX,
+                   Feature64Bit /*, Feature64BitRegs */]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "PPCCallingConv.td"
+
+def PPCInstrInfo : InstrInfo {
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the PPCInstrInfo.h file.
+  // Each flag below occupies the bit at the corresponding shift.
+  let TSFlagsFields = ["PPC970_First",
+                       "PPC970_Single",
+                       "PPC970_Cracked",
+                       "PPC970_Unit"];
+  let TSFlagsShifts = [0, 1, 2, 3];
+
+  // Instruction bit fields in the .td encodings are numbered with bit 0 as
+  // the least-significant bit for the generated code emitter.
+  let isLittleEndianEncoding = 1;
+}
+
+
+// Top-level target definition tying the instruction set to the PPC target.
+def PPC : Target {
+  // Information about the instructions.
+  let InstructionSet = PPCInstrInfo;
+}
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
new file mode 100644
index 0000000..a752421
--- /dev/null
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -0,0 +1,173 @@
+//===-- PPCBranchSelector.cpp - Emit long conditional branches-----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that scans a machine function to determine which
+// conditional branches need more than 16 bits of displacement to reach their
+// target basic block.  It does this in two passes; a calculation of basic block
+// positions pass, and a branch pseudo op to machine branch opcode pass.  This
+// pass should be run last, just before the assembly printer.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-branch-select"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCInstrInfo.h"
+#include "PPCPredicates.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/MathExtras.h"
+using namespace llvm;
+
+STATISTIC(NumExpanded, "Number of branches expanded to long format");
+
+namespace {
+  /// PPCBSel - Machine-function pass that expands conditional branches whose
+  /// 16-bit displacement cannot reach their target into an inverted short
+  /// branch over an unconditional long branch.
+  struct PPCBSel : public MachineFunctionPass {
+    static char ID;
+    PPCBSel() : MachineFunctionPass(&ID) {}
+
+    /// BlockSizes - The sizes of the basic blocks in the function.
+    std::vector<unsigned> BlockSizes;
+
+    virtual bool runOnMachineFunction(MachineFunction &Fn);
+
+    virtual const char *getPassName() const {
+      return "PowerPC Branch Selector";
+    }
+  };
+  char PPCBSel::ID = 0;
+}
+
+/// createPPCBranchSelectionPass - returns an instance of the Branch Selection
+/// Pass
+///
+/// Ownership of the returned pass transfers to the caller (the pass manager).
+FunctionPass *llvm::createPPCBranchSelectionPass() {
+  return new PPCBSel();
+}
+
+bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
+  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
+  // Give the blocks of the function a dense, in-order, numbering.
+  Fn.RenumberBlocks();
+  BlockSizes.resize(Fn.getNumBlockIDs());
+
+  // Measure each MBB and compute a size for the entire function.
+  unsigned FuncSize = 0;
+  for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+       ++MFI) {
+    MachineBasicBlock *MBB = MFI;
+
+    unsigned BlockSize = 0;
+    for (MachineBasicBlock::iterator MBBI = MBB->begin(), EE = MBB->end();
+         MBBI != EE; ++MBBI)
+      BlockSize += TII->GetInstSizeInBytes(MBBI);
+    
+    BlockSizes[MBB->getNumber()] = BlockSize;
+    FuncSize += BlockSize;
+  }
+  
+  // If the entire function is smaller than the displacement of a branch field,
+  // we know we don't need to shrink any branches in this function.  This is a
+  // common case.
+  if (FuncSize < (1 << 15)) {
+    BlockSizes.clear();
+    return false;
+  }
+  
+  // For each conditional branch, if the offset to its destination is larger
+  // than the offset field allows, transform it into a long branch sequence
+  // like this:
+  //   short branch:
+  //     bCC MBB
+  //   long branch:
+  //     b!CC $PC+8
+  //     b MBB
+  //
+  bool MadeChange = true;
+  bool EverMadeChange = false;
+  while (MadeChange) {
+    // Iteratively expand branches until we reach a fixed point.
+    // (Expanding a branch grows its block, which may push other branches
+    // out of range.)
+    MadeChange = false;
+  
+    for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
+         ++MFI) {
+      MachineBasicBlock &MBB = *MFI;
+      unsigned MBBStartOffset = 0;
+      for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+           I != E; ++I) {
+        // Only conditional branches (BCC) whose target is still an MBB need
+        // expansion; anything else just contributes its size to the offset.
+        if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) {
+          MBBStartOffset += TII->GetInstSizeInBytes(I);
+          continue;
+        }
+        
+        // Determine the offset from the current branch to the destination
+        // block.
+        MachineBasicBlock *Dest = I->getOperand(2).getMBB();
+        
+        int BranchSize;
+        if (Dest->getNumber() <= MBB.getNumber()) {
+          // If this is a backwards branch, the delta is the offset from the
+          // start of this block to this branch, plus the sizes of all blocks
+          // from this block to the dest.
+          BranchSize = MBBStartOffset;
+          
+          for (unsigned i = Dest->getNumber(), e = MBB.getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        } else {
+          // Otherwise, add the size of the blocks between this block and the
+          // dest to the number of bytes left in this block.
+          BranchSize = -MBBStartOffset;
+
+          for (unsigned i = MBB.getNumber(), e = Dest->getNumber(); i != e; ++i)
+            BranchSize += BlockSizes[i];
+        }
+
+        // If this branch is in range, ignore it.
+        if (isInt16(BranchSize)) {
+          MBBStartOffset += 4;
+          continue;
+        }
+        
+        // Otherwise, we have to expand it to a long branch.
+        // The BCC operands are:
+        // 0. PPC branch predicate
+        // 1. CR register
+        // 2. Target MBB
+        PPC::Predicate Pred = (PPC::Predicate)I->getOperand(0).getImm();
+        unsigned CRReg = I->getOperand(1).getReg();
+        
+        MachineInstr *OldBranch = I;
+        DebugLoc dl = OldBranch->getDebugLoc();
+        
+        // Jump over the uncond branch inst (i.e. $PC+8) on opposite condition.
+        BuildMI(MBB, I, dl, TII->get(PPC::BCC))
+          .addImm(PPC::InvertPredicate(Pred)).addReg(CRReg).addImm(2);
+        
+        // Uncond branch to the real destination.
+        I = BuildMI(MBB, I, dl, TII->get(PPC::B)).addMBB(Dest);
+
+        // Remove the old branch from the function.
+        OldBranch->eraseFromParent();
+        
+        // Remember that this instruction is 8-bytes, increase the size of the
+        // block by 4, remember to iterate.
+        BlockSizes[MBB.getNumber()] += 4;
+        MBBStartOffset += 8;
+        ++NumExpanded;
+        MadeChange = true;
+      }
+    }
+    EverMadeChange |= MadeChange;
+  }
+  
+  // NOTE(review): EverMadeChange is computed but never used -- once the quick
+  // size check above fails, this reports a modification even when no branch
+  // was actually expanded.  Consider returning EverMadeChange instead.
+  BlockSizes.clear();
+  return true;
+}
+
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
new file mode 100644
index 0000000..c7ce171
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -0,0 +1,147 @@
+//===- PPCCallingConv.td - Calling Conventions for PowerPC ------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the PowerPC 32- and 64-bit
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+/// F is the name of a PPCSubtarget predicate (e.g. a method call); the action
+/// A is applied only when that predicate evaluates true at CC-analysis time.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+
+// Return-value convention for PowerPC
+def RetCC_PPC : CallingConv<[
+  // Integer returns: i32 in R3 (additional words R4-R10), i64 in X3
+  // (additional words X4-X6).
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
+  
+  // FP returns in F1; a second f64 word may use F2 -- presumably for
+  // multi-part FP returns, confirm against the lowering code.
+  CCIfType<[f32], CCAssignToReg<[F1]>>,
+  CCIfType<[f64], CCAssignToReg<[F1, F2]>>,
+  
+  // Vector types are always returned in V2.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToReg<[V2]>>
+]>;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+/*
+def CC_PPC : CallingConv<[
+  // The first 8 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
+  CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6, X7, X8, X9, X10]>>,
+  
+  // Common sub-targets passes FP values in F1 - F13
+  CCIfType<[f32, f64], 
+           CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8,F9,F10,F11,F12,F13]>>,
+           
+  // The first 12 Vector arguments are passed in altivec registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+              CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10,V11,V12,V13]>>
+
+/*
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+  
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+              CCAssignToStack<16, 16>>*/
+]>;
+
+*/
+
+//===----------------------------------------------------------------------===//
+// PowerPC System V Release 4 ABI
+//===----------------------------------------------------------------------===//
+
+// _Complex arguments are never split, thus their two scalars are either
+// passed both in argument registers or both on the stack. Also _Complex
+// arguments are always passed in general purpose registers, never in
+// Floating-point registers or vector registers. Arguments which should go
+// on the stack are marked with the inreg parameter attribute.
+// Giving inreg this target-dependent (and counter-intuitive) meaning
+// simplifies things, because functions calls are not always coming from the
+// frontend but are also created implicitly e.g. for libcalls. If inreg would
+// actually mean that the argument is passed in a register, then all places
+// which create function calls/function definitions implicitly would need to
+// be aware of this fact and would need to mark arguments accordingly. With
+// inreg meaning that the argument is passed on the stack, this is not an
+// issue, except for calls which involve _Complex types.
+
+// Rules shared by the fixed-argument and vararg SVR4 conventions.  Note that
+// vectors fall through to the stack here; CC_PPC_SVR4 below intercepts them
+// with registers first.
+def CC_PPC_SVR4_Common : CallingConv<[
+  // The ABI requires i64 to be passed in two adjacent registers with the first
+  // register having an odd register number.
+  CCIfType<[i32], CCIfSplit<CCCustom<"CC_PPC_SVR4_Custom_AlignArgRegs">>>,
+
+  // The first 8 integer arguments are passed in integer registers.
+  // (Per the 'inreg' note above, inreg-marked i32s bypass the register list
+  // and go straight to the stack.)
+  CCIfType<[i32], CCIf<"!ArgFlags.isInReg()",
+    CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>>,
+
+  // Make sure the i64 words from a long double are either both passed in
+  // registers or both passed on the stack.
+  CCIfType<[f64], CCIfSplit<CCCustom<"CC_PPC_SVR4_Custom_AlignFPArgRegs">>>,
+  
+  // FP values are passed in F1 - F8.
+  CCIfType<[f32, f64], CCAssignToReg<[F1, F2, F3, F4, F5, F6, F7, F8]>>,
+
+  // Split arguments have an alignment of 8 bytes on the stack.
+  CCIfType<[i32], CCIfSplit<CCAssignToStack<4, 8>>>,
+  
+  // Remaining i32 arguments get 4-byte, 4-aligned stack slots.
+  CCIfType<[i32], CCAssignToStack<4, 4>>,
+  
+  // Floats are stored in double precision format, thus they have the same
+  // alignment and size as doubles.
+  CCIfType<[f32,f64], CCAssignToStack<8, 8>>,  
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32], CCAssignToStack<16, 16>>
+]>;
+
+// This calling convention puts vector arguments always on the stack. It is used
+// to assign vector arguments which belong to the variable portion of the
+// parameter list of a variable argument function.
+// It delegates directly to CC_PPC_SVR4_Common, whose last rule assigns
+// vectors to 16-byte stack slots.
+def CC_PPC_SVR4_VarArg : CallingConv<[
+  CCDelegateTo<CC_PPC_SVR4_Common>
+]>;
+
+// In contrast to CC_PPC_SVR4_VarArg, this calling convention first tries to put
+// vector arguments in vector registers before putting them on the stack.
+def CC_PPC_SVR4 : CallingConv<[
+  // The first 12 Vector arguments are passed in AltiVec registers.
+  CCIfType<[v16i8, v8i16, v4i32, v4f32],
+           CCAssignToReg<[V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13]>>,
+           
+  // Everything else uses the shared SVR4 rules.
+  CCDelegateTo<CC_PPC_SVR4_Common>
+]>;  
+
+// Helper "calling convention" to handle aggregate by value arguments.
+// Aggregate by value arguments are always placed in the local variable space
+// of the caller. This calling convention is only used to assign those stack
+// offsets in the callers stack frame.
+//
+// Still, the address of the aggregate copy in the callers stack frame is passed
+// in a GPR (or in the parameter list area if all GPRs are allocated) from the
+// caller to the callee. The location for the address argument is assigned by
+// the CC_PPC_SVR4 calling convention.
+//
+// The only purpose of CC_PPC_SVR4_Custom_Dummy is to skip arguments which are
+// not passed by value.
+ 
+def CC_PPC_SVR4_ByVal : CallingConv<[
+  // By-value aggregates get 4-byte, 4-aligned space in the caller's frame.
+  CCIfByVal<CCPassByVal<4, 4>>,
+  
+  // Non-byval arguments are skipped (handled by CC_PPC_SVR4 itself).
+  CCCustom<"CC_PPC_SVR4_Custom_Dummy">
+]>;
+
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
new file mode 100644
index 0000000..327470d
--- /dev/null
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -0,0 +1,250 @@
+//===-- PPCCodeEmitter.cpp - JIT Code Emitter for PowerPC32 -------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit CodeEmitter and associated machinery to
+// JIT-compile bitcode to native PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCTargetMachine.h"
+#include "PPCRelocations.h"
+#include "PPC.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+  /// PPCCodeEmitter - Machine-function pass that walks each basic block and
+  /// emits big-endian instruction words (via the TableGen'erated encoder)
+  /// into a JITCodeEmitter, recording relocations as it goes.
+  class PPCCodeEmitter : public MachineFunctionPass {
+    TargetMachine &TM;
+    JITCodeEmitter &MCE;
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      // Machine module info is forwarded to the JITCodeEmitter.
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+    
+    static char ID;
+    
+    /// MovePCtoLROffset - When/if we see a MovePCtoLR instruction, we record
+    /// its address in the function into this pointer.
+    void *MovePCtoLROffset;
+  public:
+    
+    PPCCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
+      : MachineFunctionPass(&ID), TM(tm), MCE(mce) {}
+
+    /// getBinaryCodeForInstr - This function, generated by the
+    /// CodeEmitterGenerator using TableGen, produces the binary encoding for
+    /// machine instructions.
+
+    unsigned getBinaryCodeForInstr(const MachineInstr &MI);
+
+    /// getMachineOpValue - evaluates the MachineOperand of a given MachineInstr
+
+    unsigned getMachineOpValue(const MachineInstr &MI,
+                               const MachineOperand &MO);
+
+    const char *getPassName() const { return "PowerPC Machine Code Emitter"; }
+
+    /// runOnMachineFunction - emits the given MachineFunction to memory
+    ///
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    /// emitBasicBlock - emits the given MachineBasicBlock to memory
+    ///
+    void emitBasicBlock(MachineBasicBlock &MBB);
+
+    /// getValueBit - return the particular bit of Val
+    ///
+    unsigned getValueBit(int64_t Val, unsigned bit) { return (Val >> bit) & 1; }
+  };
+}
+
+char PPCCodeEmitter::ID = 0;
+
+/// createPPCJITCodeEmitterPass - Return a pass that emits the collected PPC
+/// code to the specified JCE object.
+FunctionPass *llvm::createPPCJITCodeEmitterPass(PPCTargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new PPCCodeEmitter(TM, JCE);
+}
+
+bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
+  // NOTE(review): this assert is a tautology -- (X != Default || X != Static)
+  // is always true, so it can never fire.  The naive fix (== / ==) would fire
+  // for the PIC_ model, which the MovePCtoLR handling below clearly supports,
+  // so the intended condition needs to be clarified rather than "fixed".
+  assert((MF.getTarget().getRelocationModel() != Reloc::Default ||
+          MF.getTarget().getRelocationModel() != Reloc::Static) &&
+         "JIT relocation model must be set to static or default!");
+
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
+  // finishFunction returns true when the emitted code did not fit in the
+  // buffer; retry the whole function until it succeeds.
+  do {
+    MovePCtoLROffset = 0;
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+      emitBasicBlock(*BB);
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+void PPCCodeEmitter::emitBasicBlock(MachineBasicBlock &MBB) {
+  MCE.StartMachineBasicBlock(&MBB);
+
+  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I){
+    const MachineInstr &MI = *I;
+    MCE.processDebugLoc(MI.getDebugLoc(), true);
+    switch (MI.getOpcode()) {
+    default:
+      // Normal instruction: emit the TableGen'erated encoding big-endian.
+      MCE.emitWordBE(getBinaryCodeForInstr(MI));
+      break;
+    case TargetOpcode::DBG_LABEL:
+    case TargetOpcode::EH_LABEL:
+      // Labels produce no bytes; just record the current PC.
+      MCE.emitLabel(MI.getOperand(0).getImm());
+      break;
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+      break; // pseudo opcode, no side effects
+    case PPC::MovePCtoLR:
+    case PPC::MovePCtoLR8:
+      // PIC base setup: remember where it lives so getMachineOpValue can bias
+      // PIC relocations, then emit "bl 1" to capture the PC in LR.
+      assert(TM.getRelocationModel() == Reloc::PIC_);
+      MovePCtoLROffset = (void*)MCE.getCurrentPCValue();
+      MCE.emitWordBE(0x48000005);   // bl 1
+      break;
+    }
+    MCE.processDebugLoc(MI.getDebugLoc(), false);
+  }
+}
+
+/// getMachineOpValue - Compute the raw encoding bits for one operand of MI.
+/// Register and immediate operands encode directly; symbolic operands
+/// (globals, external symbols, constant-pool/jump-table indices, MBBs) emit a
+/// MachineRelocation and return 0 for the JIT to patch later.
+unsigned PPCCodeEmitter::getMachineOpValue(const MachineInstr &MI,
+                                           const MachineOperand &MO) {
+
+  unsigned rv = 0; // Return value; defaults to 0 for unhandled cases
+                   // or things that get fixed up later by the JIT.
+  if (MO.isReg()) {
+    rv = PPCRegisterInfo::getRegisterNumbering(MO.getReg());
+
+    // Special encoding for MTCRF and MFOCRF, which uses a bit mask for the
+    // register, not the register number directly.
+    if ((MI.getOpcode() == PPC::MTCRF || MI.getOpcode() == PPC::MFOCRF) &&
+        (MO.getReg() >= PPC::CR0 && MO.getReg() <= PPC::CR7)) {
+      rv = 0x80 >> rv;
+    }
+  } else if (MO.isImm()) {
+    rv = MO.getImm();
+  } else if (MO.isGlobal() || MO.isSymbol() ||
+             MO.isCPI() || MO.isJTI()) {
+    // Pick the relocation kind from the instruction that consumes the
+    // operand: calls/tail-calls get pc-relative branch relocs, everything
+    // else gets an absolute high/low (or low-indexed) reloc.
+    unsigned Reloc = 0;
+    if (MI.getOpcode() == PPC::BL_Darwin || MI.getOpcode() == PPC::BL8_Darwin ||
+        MI.getOpcode() == PPC::BL_SVR4 || MI.getOpcode() == PPC::BL8_ELF ||
+        MI.getOpcode() == PPC::TAILB || MI.getOpcode() == PPC::TAILB8)
+      Reloc = PPC::reloc_pcrel_bx;
+    else {
+      if (TM.getRelocationModel() == Reloc::PIC_) {
+        assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+      }
+      switch (MI.getOpcode()) {
+      default: MI.dump(); llvm_unreachable("Unknown instruction for relocation!");
+      case PPC::LIS:
+      case PPC::LIS8:
+      case PPC::ADDIS:
+      case PPC::ADDIS8:
+        Reloc = PPC::reloc_absolute_high;       // Pointer to symbol
+        break;
+      case PPC::LI:
+      case PPC::LI8:
+      case PPC::LA:
+      // Loads.
+      case PPC::LBZ:
+      case PPC::LBZ8:
+      case PPC::LHA:
+      case PPC::LHA8:
+      case PPC::LHZ:
+      case PPC::LHZ8:
+      case PPC::LWZ:
+      case PPC::LWZ8:
+      case PPC::LFS:
+      case PPC::LFD:
+
+      // Stores.
+      case PPC::STB:
+      case PPC::STB8:
+      case PPC::STH:
+      case PPC::STH8:
+      case PPC::STW:
+      case PPC::STW8:
+      case PPC::STFS:
+      case PPC::STFD:
+        Reloc = PPC::reloc_absolute_low;
+        break;
+
+      // DS-form memory ops keep their two low opcode bits, so the low 16 bits
+      // are relocated with the indexed variant.
+      case PPC::LWA:
+      case PPC::LD:
+      case PPC::STD:
+      case PPC::STD_32:
+        Reloc = PPC::reloc_absolute_low_ix;
+        break;
+      }
+    }
+
+    MachineRelocation R;
+    if (MO.isGlobal()) {
+      R = MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                                   MO.getGlobal(), 0,
+                                   isa<Function>(MO.getGlobal()));
+    } else if (MO.isSymbol()) {
+      R = MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                       Reloc, MO.getSymbolName(), 0);
+    } else if (MO.isCPI()) {
+      R = MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getIndex(), 0);
+    } else {
+      assert(MO.isJTI());
+      R = MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                          Reloc, MO.getIndex(), 0);
+    }
+
+    // If in PIC mode, we need to encode the negated address of the
+    // 'movepctolr' into the unrelocated field.  After relocation, we'll have
+    // &gv-&movepctolr-4 in the imm field.  Once &movepctolr is added to the imm
+    // field, we get &gv.  This doesn't happen for branch relocations, which are
+    // always implicitly pc relative.
+    if (TM.getRelocationModel() == Reloc::PIC_ && Reloc != PPC::reloc_pcrel_bx){
+      assert(MovePCtoLROffset && "MovePCtoLR not seen yet?");
+      R.setConstantVal(-(intptr_t)MovePCtoLROffset - 4);
+    }
+    MCE.addRelocation(R);
+
+  } else if (MO.isMBB()) {
+    // Branch targets: unconditional branches use the 24-bit bx reloc,
+    // conditional branches the 14-bit bcx reloc.
+    unsigned Reloc = 0;
+    unsigned Opcode = MI.getOpcode();
+    if (Opcode == PPC::B || Opcode == PPC::BL_Darwin ||
+        Opcode == PPC::BLA_Darwin|| Opcode == PPC::BL_SVR4 ||
+        Opcode == PPC::BLA_SVR4)
+      Reloc = PPC::reloc_pcrel_bx;
+    else // BCC instruction
+      Reloc = PPC::reloc_pcrel_bcx;
+
+    MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                               Reloc, MO.getMBB()));
+  } else {
+#ifndef NDEBUG
+    errs() << "ERROR: Unknown type of MachineOperand: " << MO << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+
+  return rv;
+}
+
+#include "PPCGenCodeEmitter.inc"
diff --git a/lib/Target/PowerPC/PPCFrameInfo.h b/lib/Target/PowerPC/PPCFrameInfo.h
new file mode 100644
index 0000000..7587b03
--- /dev/null
+++ b/lib/Target/PowerPC/PPCFrameInfo.h
@@ -0,0 +1,300 @@
+//===-- PPCFrameInfo.h - Define TargetFrameInfo for PowerPC -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_FRAMEINFO_H
+#define POWERPC_FRAMEINFO_H
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+
+namespace llvm {
+
+class PPCFrameInfo: public TargetFrameInfo {
+  const TargetMachine &TM;
+
+public:
+  PPCFrameInfo(const TargetMachine &tm, bool LP64)
+    : TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 16, 0), TM(tm) {
+  }
+
+  /// getReturnSaveOffset - Return the previous frame offset to save the
+  /// return address.
+  static unsigned getReturnSaveOffset(bool isPPC64, bool isDarwinABI) {
+    if (isDarwinABI)
+      return isPPC64 ? 16 : 8;
+    // SVR4 ABI:
+    return isPPC64 ? 16 : 4;
+  }
+
+  /// getFramePointerSaveOffset - Return the previous frame offset to save the
+  /// frame pointer.
+  static unsigned getFramePointerSaveOffset(bool isPPC64, bool isDarwinABI) {
+    // For the Darwin ABI:
+    // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
+    // for saving the frame pointer (if needed.)  While the published ABI has
+    // not used this slot since at least MacOSX 10.2, there is older code
+    // around that does use it, and that needs to continue to work.
+    if (isDarwinABI)
+      return isPPC64 ? -8U : -4U;
+    
+    // SVR4 ABI: First slot in the general register save area.
+    return isPPC64 ? -8U : -4U;
+  }
+  
+  /// getLinkageSize - Return the size of the PowerPC ABI linkage area.
+  ///
+  static unsigned getLinkageSize(bool isPPC64, bool isDarwinABI) {
+    if (isDarwinABI || isPPC64)
+      return 6 * (isPPC64 ? 8 : 4);
+    
+    // SVR4 ABI:
+    return 8;
+  }
+
+  /// getMinCallArgumentsSize - Return the size of the minimum PowerPC ABI
+  /// argument area.
+  static unsigned getMinCallArgumentsSize(bool isPPC64, bool isDarwinABI) {
+    // For the Darwin ABI / 64-bit SVR4 ABI:
+    // The prolog code of the callee may store up to 8 GPR argument registers to
+    // the stack, allowing va_start to index over them in memory if it is
+    // Because we cannot tell if this is needed on the caller side, we have to
+    // conservatively assume that it is needed.  As such, make sure we have at
+    // least enough stack space for the caller to store the 8 GPRs.
+    if (isDarwinABI || isPPC64)
+      return 8 * (isPPC64 ? 8 : 4);
+    
+    // 32-bit SVR4 ABI:
+    // There is no default stack allocated for the 8 first GPR arguments.
+    return 0;
+  }
+
+  /// getMinCallFrameSize - Return the minimum size a call frame can be using
+  /// the PowerPC ABI.
+  static unsigned getMinCallFrameSize(bool isPPC64, bool isDarwinABI) {
+    // The call frame needs to be at least big enough for linkage and 8 args.
+    return getLinkageSize(isPPC64, isDarwinABI) +
+           getMinCallArgumentsSize(isPPC64, isDarwinABI);
+  }
+
+  // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
+  const SpillSlot *
+  getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+    if (TM.getSubtarget<PPCSubtarget>().isDarwinABI()) {
+      NumEntries = 1;
+      if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+        static const SpillSlot darwin64Offsets = {PPC::X31, -8};
+        return &darwin64Offsets;
+      } else {
+        static const SpillSlot darwinOffsets = {PPC::R31, -4};
+        return &darwinOffsets;
+      }
+    }
+
+    // Early exit if not using the SVR4 ABI.
+    if (!TM.getSubtarget<PPCSubtarget>().isSVR4ABI()) {
+      NumEntries = 0;
+      return 0;
+    }
+
+    static const SpillSlot Offsets[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      {PPC::R31, -4},
+      {PPC::R30, -8},
+      {PPC::R29, -12},
+      {PPC::R28, -16},
+      {PPC::R27, -20},
+      {PPC::R26, -24},
+      {PPC::R25, -28},
+      {PPC::R24, -32},
+      {PPC::R23, -36},
+      {PPC::R22, -40},
+      {PPC::R21, -44},
+      {PPC::R20, -48},
+      {PPC::R19, -52},
+      {PPC::R18, -56},
+      {PPC::R17, -60},
+      {PPC::R16, -64},
+      {PPC::R15, -68},
+      {PPC::R14, -72},
+
+      // CR save area offset.
+      // FIXME SVR4: Disable CR save area for now.
+//      {PPC::CR2, -4},
+//      {PPC::CR3, -4},
+//      {PPC::CR4, -4},
+//      {PPC::CR2LT, -4},
+//      {PPC::CR2GT, -4},
+//      {PPC::CR2EQ, -4},
+//      {PPC::CR2UN, -4},
+//      {PPC::CR3LT, -4},
+//      {PPC::CR3GT, -4},
+//      {PPC::CR3EQ, -4},
+//      {PPC::CR3UN, -4},
+//      {PPC::CR4LT, -4},
+//      {PPC::CR4GT, -4},
+//      {PPC::CR4EQ, -4},
+//      {PPC::CR4UN, -4},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}
+    };
+
+    static const SpillSlot Offsets64[] = {
+      // Floating-point register save area offsets.
+      {PPC::F31, -8},
+      {PPC::F30, -16},
+      {PPC::F29, -24},
+      {PPC::F28, -32},
+      {PPC::F27, -40},
+      {PPC::F26, -48},
+      {PPC::F25, -56},
+      {PPC::F24, -64},
+      {PPC::F23, -72},
+      {PPC::F22, -80},
+      {PPC::F21, -88},
+      {PPC::F20, -96},
+      {PPC::F19, -104},
+      {PPC::F18, -112},
+      {PPC::F17, -120},
+      {PPC::F16, -128},
+      {PPC::F15, -136},
+      {PPC::F14, -144},
+
+      // General register save area offsets.
+      // FIXME 64-bit SVR4: Are 32-bit registers actually allocated in 64-bit
+      //                    mode?
+      {PPC::R31, -4},
+      {PPC::R30, -12},
+      {PPC::R29, -20},
+      {PPC::R28, -28},
+      {PPC::R27, -36},
+      {PPC::R26, -44},
+      {PPC::R25, -52},
+      {PPC::R24, -60},
+      {PPC::R23, -68},
+      {PPC::R22, -76},
+      {PPC::R21, -84},
+      {PPC::R20, -92},
+      {PPC::R19, -100},
+      {PPC::R18, -108},
+      {PPC::R17, -116},
+      {PPC::R16, -124},
+      {PPC::R15, -132},
+      {PPC::R14, -140},
+
+      {PPC::X31, -8},
+      {PPC::X30, -16},
+      {PPC::X29, -24},
+      {PPC::X28, -32},
+      {PPC::X27, -40},
+      {PPC::X26, -48},
+      {PPC::X25, -56},
+      {PPC::X24, -64},
+      {PPC::X23, -72},
+      {PPC::X22, -80},
+      {PPC::X21, -88},
+      {PPC::X20, -96},
+      {PPC::X19, -104},
+      {PPC::X18, -112},
+      {PPC::X17, -120},
+      {PPC::X16, -128},
+      {PPC::X15, -136},
+      {PPC::X14, -144},
+
+      // CR save area offset.
+      // FIXME SVR4: Disable CR save area for now.
+//      {PPC::CR2, -4},
+//      {PPC::CR3, -4},
+//      {PPC::CR4, -4},
+//      {PPC::CR2LT, -4},
+//      {PPC::CR2GT, -4},
+//      {PPC::CR2EQ, -4},
+//      {PPC::CR2UN, -4},
+//      {PPC::CR3LT, -4},
+//      {PPC::CR3GT, -4},
+//      {PPC::CR3EQ, -4},
+//      {PPC::CR3UN, -4},
+//      {PPC::CR4LT, -4},
+//      {PPC::CR4GT, -4},
+//      {PPC::CR4EQ, -4},
+//      {PPC::CR4UN, -4},
+
+      // VRSAVE save area offset.
+      {PPC::VRSAVE, -4},
+
+      // Vector register save area
+      {PPC::V31, -16},
+      {PPC::V30, -32},
+      {PPC::V29, -48},
+      {PPC::V28, -64},
+      {PPC::V27, -80},
+      {PPC::V26, -96},
+      {PPC::V25, -112},
+      {PPC::V24, -128},
+      {PPC::V23, -144},
+      {PPC::V22, -160},
+      {PPC::V21, -176},
+      {PPC::V20, -192}
+    };
+
+    if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+      NumEntries = array_lengthof(Offsets64);
+
+      return Offsets64;
+    } else {
+      NumEntries = array_lengthof(Offsets);
+
+      return Offsets;
+    }
+  }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
new file mode 100644
index 0000000..3a15f7e
--- /dev/null
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -0,0 +1,306 @@
+//===-- PPCHazardRecognizers.cpp - PowerPC Hazard Recognizer Impls --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "pre-RA-sched"
+#include "PPCHazardRecognizers.h"
+#include "PPC.h"
+#include "PPCInstrInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// PowerPC 970 Hazard Recognizer
+//
+// This models the dispatch group formation of the PPC970 processor.  Dispatch
+// groups are bundles of up to five instructions that can contain various mixes
+// of instructions.  The PPC970 can dispatch a peak of 4 non-branch and one 
+// branch instruction per-cycle.
+//
+// There are a number of restrictions to dispatch group formation: some
+// instructions can only be issued in the first slot of a dispatch group, and
+// instructions fill an entire dispatch group.  Additionally, only branches can
+// issue in the 5th (last) slot.
+//
+// Finally, there are a number of "structural" hazards on the PPC970.  These
+// conditions cause large performance penalties due to misprediction, recovery,
+// and replay logic that has to happen.  These cases include setting a CTR and
+// branching through it in the same dispatch group, and storing to an address,
+// then loading from the same address within a dispatch group.  To avoid these
+// conditions, we insert no-op instructions when appropriate.
+//
+// FIXME: This is missing some significant cases:
+//   1. Modeling of microcoded instructions.
+//   2. Handling of serialized operations.
+//   3. Handling of the esoteric cases in "Resource-based Instruction Grouping".
+//
+
+PPCHazardRecognizer970::PPCHazardRecognizer970(const TargetInstrInfo &tii)
+  : TII(tii) {
+  EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::EndDispatchGroup() {
+  DEBUG(errs() << "=== Start of dispatch group\n");
+  NumIssued = 0;
+  
+  // Structural hazard info.
+  HasCTRSet = false;
+  NumStores = 0;
+}
+
+
+PPCII::PPC970_Unit 
+PPCHazardRecognizer970::GetInstrType(unsigned Opcode,
+                                     bool &isFirst, bool &isSingle,
+                                     bool &isCracked,
+                                     bool &isLoad, bool &isStore) {
+  if ((int)Opcode >= 0) {
+    isFirst = isSingle = isCracked = isLoad = isStore = false;
+    return PPCII::PPC970_Pseudo;
+  }
+  Opcode = ~Opcode;
+  
+  const TargetInstrDesc &TID = TII.get(Opcode);
+  
+  isLoad  = TID.mayLoad();
+  isStore = TID.mayStore();
+  
+  unsigned TSFlags = TID.TSFlags;
+  
+  isFirst   = TSFlags & PPCII::PPC970_First;
+  isSingle  = TSFlags & PPCII::PPC970_Single;
+  isCracked = TSFlags & PPCII::PPC970_Cracked;
+  return (PPCII::PPC970_Unit)(TSFlags & PPCII::PPC970_Mask);
+}
+
+/// isLoadOfStoredAddress - If we have a load from the previously stored pointer
+/// as indicated by StorePtr1/StorePtr2/StoreSize, return true.
+bool PPCHazardRecognizer970::
+isLoadOfStoredAddress(unsigned LoadSize, SDValue Ptr1, SDValue Ptr2) const {
+  for (unsigned i = 0, e = NumStores; i != e; ++i) {
+    // Handle exact and commuted addresses.
+    if (Ptr1 == StorePtr1[i] && Ptr2 == StorePtr2[i])
+      return true;
+    if (Ptr2 == StorePtr1[i] && Ptr1 == StorePtr2[i])
+      return true;
+    
+    // Okay, we don't have an exact match, if this is an indexed offset, see if
+    // we have overlap (which happens during fp->int conversion for example).
+    if (StorePtr2[i] == Ptr2) {
+      if (ConstantSDNode *StoreOffset = dyn_cast<ConstantSDNode>(StorePtr1[i]))
+        if (ConstantSDNode *LoadOffset = dyn_cast<ConstantSDNode>(Ptr1)) {
+          // Okay the base pointers match, so we have [c1+r] vs [c2+r].  Check
+          // to see if the load and store actually overlap.
+          int StoreOffs = StoreOffset->getZExtValue();
+          int LoadOffs  = LoadOffset->getZExtValue();
+          if (StoreOffs < LoadOffs) {
+            if (int(StoreOffs+StoreSize[i]) > LoadOffs) return true;
+          } else {
+            if (int(LoadOffs+LoadSize) > StoreOffs) return true;
+          }
+        }
+    }
+  }
+  return false;
+}
+
+/// getHazardType - We return Hazard for any non-branch instruction that would
+/// terminate the dispatch group.  We return NoopHazard for any
+/// instructions that wouldn't terminate the dispatch group but that would
+/// cause a pipeline flush.
+ScheduleHazardRecognizer::HazardType PPCHazardRecognizer970::
+getHazardType(SUnit *SU) {
+  const SDNode *Node = SU->getNode()->getFlaggedMachineNode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType = 
+    GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return NoHazard;  
+  unsigned Opcode = Node->getMachineOpcode();
+
+  // We can only issue a PPC970_First/PPC970_Single instruction (such as
+  // crand/mtspr/etc) if this is the first cycle of the dispatch group.
+  if (NumIssued != 0 && (isFirst || isSingle))
+    return Hazard;
+  
+  // If this instruction is cracked into two ops by the decoder, we know that
+  // it is not a branch and that it cannot issue if 3 other instructions are
+  // already in the dispatch group.
+  if (isCracked && NumIssued > 2)
+    return Hazard;
+      
+  switch (InstrType) {
+  default: llvm_unreachable("Unknown instruction type!");
+  case PPCII::PPC970_FXU:
+  case PPCII::PPC970_LSU:
+  case PPCII::PPC970_FPU:
+  case PPCII::PPC970_VALU:
+  case PPCII::PPC970_VPERM:
+    // We can only issue a branch as the last instruction in a group.
+    if (NumIssued == 4) return Hazard;
+    break;
+  case PPCII::PPC970_CRU:
+    // We can only issue a CR instruction in the first two slots.
+    if (NumIssued >= 2) return Hazard;
+    break;
+  case PPCII::PPC970_BRU:
+    break;
+  }
+  
+  // Do not allow MTCTR and BCTRL to be in the same dispatch group.
+  if (HasCTRSet && (Opcode == PPC::BCTRL_Darwin || Opcode == PPC::BCTRL_SVR4))
+    return NoopHazard;
+  
+  // If this is a load following a store, make sure it's not to the same or
+  // overlapping address.
+  if (isLoad && NumStores) {
+    unsigned LoadSize;
+    switch (Opcode) {
+    default: llvm_unreachable("Unknown load!");
+    case PPC::LBZ:   case PPC::LBZU:
+    case PPC::LBZX:
+    case PPC::LBZ8:  case PPC::LBZU8:
+    case PPC::LBZX8:
+    case PPC::LVEBX:
+      LoadSize = 1;
+      break;
+    case PPC::LHA:   case PPC::LHAU:
+    case PPC::LHAX:
+    case PPC::LHZ:   case PPC::LHZU:
+    case PPC::LHZX:
+    case PPC::LVEHX:
+    case PPC::LHBRX:
+    case PPC::LHA8:   case PPC::LHAU8:
+    case PPC::LHAX8:
+    case PPC::LHZ8:   case PPC::LHZU8:
+    case PPC::LHZX8:
+      LoadSize = 2;
+      break;
+    case PPC::LFS:    case PPC::LFSU:
+    case PPC::LFSX:
+    case PPC::LWZ:    case PPC::LWZU:
+    case PPC::LWZX:
+    case PPC::LWA:
+    case PPC::LWAX:
+    case PPC::LVEWX:
+    case PPC::LWBRX:
+    case PPC::LWZ8:
+    case PPC::LWZX8:
+      LoadSize = 4;
+      break;
+    case PPC::LFD:    case PPC::LFDU:
+    case PPC::LFDX:
+    case PPC::LD:     case PPC::LDU:
+    case PPC::LDX:
+      LoadSize = 8;
+      break;
+    case PPC::LVX:
+    case PPC::LVXL:
+      LoadSize = 16;
+      break;
+    }
+    
+    if (isLoadOfStoredAddress(LoadSize, 
+                              Node->getOperand(0), Node->getOperand(1)))
+      return NoopHazard;
+  }
+  
+  return NoHazard;
+}
+
+void PPCHazardRecognizer970::EmitInstruction(SUnit *SU) {
+  const SDNode *Node = SU->getNode()->getFlaggedMachineNode();
+  bool isFirst, isSingle, isCracked, isLoad, isStore;
+  PPCII::PPC970_Unit InstrType = 
+    GetInstrType(Node->getOpcode(), isFirst, isSingle, isCracked,
+                 isLoad, isStore);
+  if (InstrType == PPCII::PPC970_Pseudo) return;  
+  unsigned Opcode = Node->getMachineOpcode();
+
+  // Update structural hazard information.
+  if (Opcode == PPC::MTCTR) HasCTRSet = true;
+  
+  // Track the address stored to.
+  if (isStore) {
+    unsigned ThisStoreSize;
+    switch (Opcode) {
+    default: llvm_unreachable("Unknown store instruction!");
+    case PPC::STB:    case PPC::STB8:
+    case PPC::STBU:   case PPC::STBU8:
+    case PPC::STBX:   case PPC::STBX8:
+    case PPC::STVEBX:
+      ThisStoreSize = 1;
+      break;
+    case PPC::STH:    case PPC::STH8:
+    case PPC::STHU:   case PPC::STHU8:
+    case PPC::STHX:   case PPC::STHX8:
+    case PPC::STVEHX:
+    case PPC::STHBRX:
+      ThisStoreSize = 2;
+      break;
+    case PPC::STFS:
+    case PPC::STFSU:
+    case PPC::STFSX:
+    case PPC::STWX:   case PPC::STWX8:
+    case PPC::STWUX:
+    case PPC::STW:    case PPC::STW8:
+    case PPC::STWU:   case PPC::STWU8:
+    case PPC::STVEWX:
+    case PPC::STFIWX:
+    case PPC::STWBRX:
+      ThisStoreSize = 4;
+      break;
+    case PPC::STD_32:
+    case PPC::STDX_32:
+    case PPC::STD:
+    case PPC::STDU:
+    case PPC::STFD:
+    case PPC::STFDX:
+    case PPC::STDX:
+    case PPC::STDUX:
+      ThisStoreSize = 8;
+      break;
+    case PPC::STVX:
+    case PPC::STVXL:
+      ThisStoreSize = 16;
+      break;
+    }
+    
+    StoreSize[NumStores] = ThisStoreSize;
+    StorePtr1[NumStores] = Node->getOperand(1);
+    StorePtr2[NumStores] = Node->getOperand(2);
+    ++NumStores;
+  }
+  
+  if (InstrType == PPCII::PPC970_BRU || isSingle)
+    NumIssued = 4;  // Terminate a d-group.
+  ++NumIssued;
+  
+  // If this instruction is cracked into two ops by the decoder, remember that
+  // we issued two pieces.
+  if (isCracked)
+    ++NumIssued;
+  
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
+
+void PPCHazardRecognizer970::AdvanceCycle() {
+  assert(NumIssued < 5 && "Illegal dispatch group!");
+  ++NumIssued;
+  if (NumIssued == 5)
+    EndDispatchGroup();
+}
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
new file mode 100644
index 0000000..74bf8e5
--- /dev/null
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -0,0 +1,73 @@
+//===-- PPCHazardRecognizers.h - PowerPC Hazard Recognizers -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines hazard recognizers for scheduling on PowerPC processors.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCHAZRECS_H
+#define PPCHAZRECS_H
+
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "PPCInstrInfo.h"
+
+namespace llvm {
+  
+/// PPCHazardRecognizer970 - This class defines a finite state automata that
+/// models the dispatch logic on the PowerPC 970 (aka G5) processor.  This
+/// promotes good dispatch group formation and implements noop insertion to
+/// avoid structural hazards that cause significant performance penalties (e.g.
+/// setting the CTR register then branching through it within a dispatch group),
+/// or storing then loading from the same address within a dispatch group.
+class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
+  const TargetInstrInfo &TII;
+  
+  unsigned NumIssued;  // Number of insts issued, including advanced cycles.
+  
+  // Various things that can cause a structural hazard.
+  
+  // HasCTRSet - If the CTR register is set in this group, disallow BCTRL.
+  bool HasCTRSet;
+  
+  // StorePtr1/StorePtr2 - Keep track of the address of any store.  If we see a
+  // load from the same address (or one that aliases it), disallow the load.  We
+  // can have up to four stores in one dispatch group, hence we track up to 4.
+  //
+  // This is null if we haven't seen a store yet.  We keep track of both
+  // operands of the store here, since we support [r+r] and [r+i] addressing.
+  SDValue StorePtr1[4], StorePtr2[4];
+  unsigned  StoreSize[4];
+  unsigned NumStores;
+  
+public:
+  PPCHazardRecognizer970(const TargetInstrInfo &TII);
+  virtual HazardType getHazardType(SUnit *SU);
+  virtual void EmitInstruction(SUnit *SU);
+  virtual void AdvanceCycle();
+  
+private:
+  /// EndDispatchGroup - Called when we are finishing a new dispatch group.
+  ///
+  void EndDispatchGroup();
+  
+  /// GetInstrType - Classify the specified powerpc opcode according to its
+  /// pipeline.
+  PPCII::PPC970_Unit GetInstrType(unsigned Opcode,
+                                  bool &isFirst, bool &isSingle,bool &isCracked,
+                                  bool &isLoad, bool &isStore);
+  
+  bool isLoadOfStoredAddress(unsigned LoadSize,
+                             SDValue Ptr1, SDValue Ptr2) const;
+};
+
+} // end namespace llvm
+
+#endif
+
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
new file mode 100644
index 0000000..004997f
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -0,0 +1,1104 @@
+//===-- PPCISelDAGToDAG.cpp - PPC --pattern matching inst selector --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pattern matching instruction selector for PowerPC,
+// converting from a legalized dag to a PPC dag.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "ppc-codegen"
+#include "PPC.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCISelLowering.h"
+#include "PPCHazardRecognizers.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// PPCDAGToDAGISel - PPC specific code to select PPC machine
+  /// instructions for SelectionDAG operations.
+  ///
+  class PPCDAGToDAGISel : public SelectionDAGISel {
+    PPCTargetMachine &TM;
+    PPCTargetLowering &PPCLowering;
+    const PPCSubtarget &PPCSubTarget;
+    unsigned GlobalBaseReg;
+  public:
+    explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
+      : SelectionDAGISel(tm), TM(tm),
+        PPCLowering(*TM.getTargetLowering()),
+        PPCSubTarget(*TM.getSubtargetImpl()) {}
+    
+    virtual bool runOnMachineFunction(MachineFunction &MF) {
+      // Make sure we re-emit a set of the global base reg if necessary
+      GlobalBaseReg = 0;
+      SelectionDAGISel::runOnMachineFunction(MF);
+      
+      InsertVRSaveCode(MF);
+      return true;
+    }
+   
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getI64Imm - Return a target constant with the specified value, of type
+    /// i64.
+    inline SDValue getI64Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i64);
+    }
+    
+    /// getSmallIPtrImm - Return a target constant of pointer type.
+    inline SDValue getSmallIPtrImm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy());
+    }
+    
+    /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s 
+    /// with any number of 0s on either side.  The 1s are allowed to wrap from
+    /// LSB to MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.
+    /// 0x0F0F0000 is not, since all 1s are not contiguous.
+    static bool isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME);
+
+
+    /// isRotateAndMask - Returns true if Mask and Shift can be folded into a
+    /// rotate and mask opcode and mask operation.
+    static bool isRotateAndMask(SDNode *N, unsigned Mask, bool isShiftMask,
+                                unsigned &SH, unsigned &MB, unsigned &ME);
+    
+    /// getGlobalBaseReg - insert code into the entry mbb to materialize the PIC
+    /// base register.  Return the virtual register that holds this value.
+    SDNode *getGlobalBaseReg();
+    
+    // Select - Convert the specified operand from a target-independent to a
+    // target-specific node if it hasn't already been changed.
+    SDNode *Select(SDNode *N);
+    
+    SDNode *SelectBitfieldInsert(SDNode *N);
+
+    /// SelectCC - Select a comparison of the specified values with the
+    /// specified condition code, returning the CR# of the expression.
+    SDValue SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, DebugLoc dl);
+
+    /// SelectAddrImm - Returns true if the address N can be represented by
+    /// a base register plus a signed 16-bit displacement [r+imm].
+    bool SelectAddrImm(SDNode *Op, SDValue N, SDValue &Disp,
+                       SDValue &Base) {
+      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG);
+    }
+    
+    /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
+    /// immediate field.  Because preinc imms have already been validated, just
+    /// accept it.
+    bool SelectAddrImmOffs(SDNode *Op, SDValue N, SDValue &Out) const {
+      Out = N;
+      return true;
+    }
+      
+    /// SelectAddrIdx - Given the specified address, check to see if it can be
+    /// represented as an indexed [r+r] operation.  Returns false if it can
+    /// be represented by [r+imm], which are preferred.
+    bool SelectAddrIdx(SDNode *Op, SDValue N, SDValue &Base,
+                       SDValue &Index) {
+      return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG);
+    }
+    
+    /// SelectAddrIdxOnly - Given the specified address, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddrIdxOnly(SDNode *Op, SDValue N, SDValue &Base,
+                           SDValue &Index) {
+      return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+    }
+
+    /// SelectAddrImmShift - Returns true if the address N can be represented by
+    /// a base register plus a signed 14-bit displacement [r+imm*4].  Suitable
+    /// for use by STD and friends.
+    bool SelectAddrImmShift(SDNode *Op, SDValue N, SDValue &Disp,
+                            SDValue &Base) {
+      return PPCLowering.SelectAddressRegImmShift(N, Disp, Base, *CurDAG);
+    }
+      
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.  It is always correct to compute the value into
+    /// a register.  The case of adding a (possibly relocatable) constant to a
+    /// register can be improved, but it is wrong to substitute Reg+Reg for
+    /// Reg in an asm, because the load or store opcode would have to change.
+   virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDValue> &OutOps) {
+      OutOps.push_back(Op);
+      return false;
+    }
+    
+    SDValue BuildSDIVSequence(SDNode *N);
+    SDValue BuildUDIVSequence(SDNode *N);
+    
+    /// InstructionSelect - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelect();
+    
+    void InsertVRSaveCode(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "PowerPC DAG->DAG Pattern Instruction Selection";
+    } 
+    
+    /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
+    /// this target when scheduling the DAG.
+    virtual ScheduleHazardRecognizer *CreateTargetHazardRecognizer() {
+      // Should use subtarget info to pick the right hazard recognizer.  For
+      // now, always return a PPC970 recognizer.
+      const TargetInstrInfo *II = TM.getInstrInfo();
+      assert(II && "No InstrInfo?");
+      return new PPCHazardRecognizer970(*II); 
+    }
+
+// Include the pieces autogenerated from the target description.
+#include "PPCGenDAGISel.inc"
+    
+private:
+    SDNode *SelectSETCC(SDNode *N);
+  };
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void PPCDAGToDAGISel::InstructionSelect() {
+  // Select target instructions for the DAG.
+  SelectRoot(*CurDAG);
+  CurDAG->RemoveDeadNodes();
+}
+
+/// InsertVRSaveCode - Once the entire function has been instruction selected,
+/// all virtual registers are created and all machine instructions are built,
+/// check to see if we need to save/restore VRSAVE.  If so, do it.
+void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
+  // Check to see if this function uses vector registers, which means we have to
+  // save and restore the VRSAVE register and update it with the regs we use.  
+  //
+  // In this case, there will be virtual registers of vector type created
+  // by the scheduler.  Detect them now.
+  bool HasVectorVReg = false;
+  for (unsigned i = TargetRegisterInfo::FirstVirtualRegister, 
+       e = RegInfo->getLastVirtReg()+1; i != e; ++i)
+    if (RegInfo->getRegClass(i) == &PPC::VRRCRegClass) {
+      HasVectorVReg = true;
+      break;
+    }
+  if (!HasVectorVReg) return;  // nothing to do.
+      
+  // If we have a vector register, we want to emit code into the entry and exit
+  // blocks to save and restore the VRSAVE register.  We do this here (instead
+  // of marking all vector instructions as clobbering VRSAVE) for two reasons:
+  //
+  // 1. This (trivially) reduces the load on the register allocator, by not
+  //    having to represent the live range of the VRSAVE register.
+  // 2. This (more significantly) allows us to create a temporary virtual
+  //    register to hold the saved VRSAVE value, allowing this temporary to be
+  //    register allocated, instead of forcing it to be spilled to the stack.
+
+  // Create two vregs - one to hold the VRSAVE register that is live-in to the
+  // function and one for the value after having bits or'd into it.
+  unsigned InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+  unsigned UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
+  
+  const TargetInstrInfo &TII = *TM.getInstrInfo();
+  MachineBasicBlock &EntryBB = *Fn.begin();
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+  // Emit the following code into the entry block:
+  // InVRSAVE = MFVRSAVE
+  // UpdatedVRSAVE = UPDATE_VRSAVE InVRSAVE
+  // MTVRSAVE UpdatedVRSAVE
+  // NOTE(review): UPDATE_VRSAVE looks like a pseudo expanded elsewhere to OR
+  // in the mask of vector registers actually used -- confirm in the PPC
+  // register-info code; its definition is not visible in this file.
+  MachineBasicBlock::iterator IP = EntryBB.begin();  // Insert Point
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MFVRSAVE), InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::UPDATE_VRSAVE),
+          UpdatedVRSAVE).addReg(InVRSAVE);
+  BuildMI(EntryBB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(UpdatedVRSAVE);
+  
+  // Find all return blocks, outputting a restore in each epilog.
+  for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
+    if (!BB->empty() && BB->back().getDesc().isReturn()) {
+      IP = BB->end(); --IP;
+      
+      // Skip over all terminator instructions, which are part of the return
+      // sequence.  Walk backwards from the last instruction so the restore is
+      // inserted before the entire terminator run.
+      MachineBasicBlock::iterator I2 = IP;
+      while (I2 != BB->begin() && (--I2)->getDesc().isTerminator())
+        IP = I2;
+      
+      // Emit: MTVRSAVE InVRSave  (restore the value saved on entry)
+      BuildMI(*BB, IP, dl, TII.get(PPC::MTVRSAVE)).addReg(InVRSAVE);
+    }        
+  }
+}
+
+
+/// getGlobalBaseReg - Output the instructions required to put the
+/// base address to use for accessing globals into a register.
+///
+SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
+  if (!GlobalBaseReg) {
+    const TargetInstrInfo &TII = *TM.getInstrInfo();
+    // Lazily materialize the base register at the very top of the entry block
+    // so every later use of it is dominated by the definition.
+    MachineBasicBlock &EntryMBB = MF->front();
+    MachineBasicBlock::iterator InsertPos = EntryMBB.begin();
+    DebugLoc dl = DebugLoc::getUnknownLoc();
+
+    // The sequence is MovePCtoLR{,8} followed by MFLR{,8}; pick the 32-bit or
+    // 64-bit flavor based on the target pointer size.
+    if (PPCLowering.getPointerTy() != MVT::i32) {
+      GlobalBaseReg = RegInfo->createVirtualRegister(PPC::G8RCRegisterClass);
+      BuildMI(EntryMBB, InsertPos, dl, TII.get(PPC::MovePCtoLR8), PPC::LR8);
+      BuildMI(EntryMBB, InsertPos, dl, TII.get(PPC::MFLR8), GlobalBaseReg);
+    } else {
+      GlobalBaseReg = RegInfo->createVirtualRegister(PPC::GPRCRegisterClass);
+      BuildMI(EntryMBB, InsertPos, dl, TII.get(PPC::MovePCtoLR), PPC::LR);
+      BuildMI(EntryMBB, InsertPos, dl, TII.get(PPC::MFLR), GlobalBaseReg);
+    }
+  }
+  return CurDAG->getRegister(GlobalBaseReg,
+                             PPCLowering.getPointerTy()).getNode();
+}
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+
+  // Truncate to 16 bits, then check that sign-extending the truncation
+  // reproduces the original 32- or 64-bit constant.
+  uint64_t Val = cast<ConstantSDNode>(N)->getZExtValue();
+  Imm = (short)Val;
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)Val;
+  return Imm == (int64_t)Val;
+}
+
+/// isIntS16Immediate - SDValue convenience form; unwraps the node and defers
+/// to the SDNode overload above.
+static bool isIntS16Immediate(SDValue Op, short &Imm) {
+  return isIntS16Immediate(Op.getNode(), Imm);
+}
+
+
+/// isInt32Immediate - This method tests to see if the node is a 32-bit constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isInt32Immediate(SDNode *N, unsigned &Imm) {
+  // Guard-clause form: reject anything that isn't an i32 ISD::Constant.
+  if (N->getOpcode() != ISD::Constant || N->getValueType(0) != MVT::i32)
+    return false;
+  Imm = cast<ConstantSDNode>(N)->getZExtValue();
+  return true;
+}
+
+/// isInt64Immediate - This method tests to see if the node is a 64-bit constant
+/// operand.  If so Imm will receive the 64-bit value.
+static bool isInt64Immediate(SDNode *N, uint64_t &Imm) {
+  // Guard-clause form: reject anything that isn't an i64 ISD::Constant.
+  if (N->getOpcode() != ISD::Constant || N->getValueType(0) != MVT::i64)
+    return false;
+  Imm = cast<ConstantSDNode>(N)->getZExtValue();
+  return true;
+}
+
+// isInt32Immediate - SDValue form: unwrap the node and defer to the SDNode
+// overload.  If so Imm will receive the 32 bit value.
+static bool isInt32Immediate(SDValue N, unsigned &Imm) {
+  return isInt32Immediate(N.getNode(), Imm);
+}
+
+
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
+  if (N->getOpcode() != Opc)
+    return false;
+  return isInt32Immediate(N->getOperand(1).getNode(), Imm);
+}
+
+/// isRunOfOnes - Test whether Val is a (possibly wrapping) run of contiguous
+/// one bits; on success MB/ME receive the mask-begin/mask-end bit positions.
+bool PPCDAGToDAGISel::isRunOfOnes(unsigned Val, unsigned &MB, unsigned &ME) {
+  // Non-wrapping run: the value itself is a shifted mask.
+  if (isShiftedMask_32(Val)) {
+    MB = CountLeadingZeros_32(Val);             // first one bit
+    ME = CountLeadingZeros_32((Val - 1) ^ Val); // last one bit of the run
+    return true;
+  }
+
+  // Wrapping run: the complement must then be a non-wrapping shifted mask.
+  unsigned Inv = ~Val;
+  if (isShiftedMask_32(Inv)) {
+    ME = CountLeadingZeros_32(Inv) - 1;             // bit before the zero run
+    MB = CountLeadingZeros_32((Inv - 1) ^ Inv) + 1; // bit after the zero run
+    return true;
+  }
+
+  // No run present.
+  return false;
+}
+
+/// isRotateAndMask - Test whether N (a SHL/SRL/ROTL of a 32-bit value by a
+/// constant) combined with Mask can be encoded as a single rotate-and-mask.
+/// On success SH receives the left-rotate amount and MB/ME the mask bounds.
+bool PPCDAGToDAGISel::isRotateAndMask(SDNode *N, unsigned Mask, 
+                                      bool isShiftMask, unsigned &SH, 
+                                      unsigned &MB, unsigned &ME) {
+  // Don't even go down this path for i64, since different logic will be
+  // necessary for rldicl/rldicr/rldimi.
+  if (N->getValueType(0) != MVT::i32)
+    return false;
+
+  unsigned Shift = 32;
+  if (N->getNumOperands() != 2 ||
+      !isInt32Immediate(N->getOperand(1).getNode(), Shift) || (Shift > 31))
+    return false;
+
+  // Bits whose post-shift contents are unknown.
+  unsigned Indeterminant;
+  switch (N->getOpcode()) {
+  case ISD::SHL:
+    // If the mask comes before the shift, apply the shift to the mask too.
+    if (isShiftMask) Mask <<= Shift;
+    Indeterminant = ~(0xFFFFFFFFu << Shift);
+    break;
+  case ISD::SRL:
+    if (isShiftMask) Mask >>= Shift;
+    Indeterminant = ~(0xFFFFFFFFu >> Shift);
+    // Express the right shift as an equivalent left-rotate amount.
+    Shift = 32 - Shift;
+    break;
+  case ISD::ROTL:
+    // A rotate loses no bits.
+    Indeterminant = 0;
+    break;
+  default:
+    return false;
+  }
+
+  // Succeed only if the mask avoids every indeterminant bit.
+  if (Mask && !(Mask & Indeterminant)) {
+    SH = Shift & 31;
+    // make sure the mask is still a mask (wrap arounds may not be)
+    return isRunOfOnes(Mask, MB, ME);
+  }
+  return false;
+}
+
+/// SelectBitfieldInsert - turn an or of two masked values into
+/// the rotate left word immediate then mask insert (rlwimi) instruction.
+/// Returns the new machine node, or 0 if the OR is not an insert.
+SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  DebugLoc dl = N->getDebugLoc();
+  
+  // LKZ/LKO (RKZ/RKO) are the bits known zero/one in Op0 (Op1).
+  APInt LKZ, LKO, RKZ, RKO;
+  CurDAG->ComputeMaskedBits(Op0, APInt::getAllOnesValue(32), LKZ, LKO);
+  CurDAG->ComputeMaskedBits(Op1, APInt::getAllOnesValue(32), RKZ, RKO);
+  
+  unsigned TargetMask = LKZ.getZExtValue();
+  unsigned InsertMask = RKZ.getZExtValue();
+  
+  // The OR is an insert only if every bit is known zero on at least one side,
+  // i.e. the two known-zero masks jointly cover all 32 bits.
+  if ((TargetMask | InsertMask) == 0xFFFFFFFF) {
+    unsigned Op0Opc = Op0.getOpcode();
+    unsigned Op1Opc = Op1.getOpcode();
+    unsigned Value, SH = 0;
+    // Invert: now each mask holds the bits its operand may contribute.
+    TargetMask = ~TargetMask;
+    InsertMask = ~InsertMask;
+
+    // If the LHS has a foldable shift and the RHS does not, then swap it to the
+    // RHS so that we can fold the shift into the insert.
+    if (Op0Opc == ISD::AND && Op1Opc == ISD::AND) {
+      if (Op0.getOperand(0).getOpcode() == ISD::SHL ||
+          Op0.getOperand(0).getOpcode() == ISD::SRL) {
+        if (Op1.getOperand(0).getOpcode() != ISD::SHL &&
+            Op1.getOperand(0).getOpcode() != ISD::SRL) {
+          std::swap(Op0, Op1);
+          std::swap(Op0Opc, Op1Opc);
+          std::swap(TargetMask, InsertMask);
+        }
+      }
+    } else if (Op0Opc == ISD::SHL || Op0Opc == ISD::SRL) {
+      if (Op1Opc == ISD::AND && Op1.getOperand(0).getOpcode() != ISD::SHL &&
+          Op1.getOperand(0).getOpcode() != ISD::SRL) {
+        std::swap(Op0, Op1);
+        std::swap(Op0Opc, Op1Opc);
+        std::swap(TargetMask, InsertMask);
+      }
+    }
+    
+    unsigned MB, ME;
+    if (InsertMask && isRunOfOnes(InsertMask, MB, ME)) {
+      SDValue Tmp1, Tmp2;
+
+      // Peel a constant shift off the inserted operand (possibly beneath an
+      // AND) and fold it into the rlwimi rotate amount; a right shift by k is
+      // a left rotate by 32-k.
+      if ((Op1Opc == ISD::SHL || Op1Opc == ISD::SRL) &&
+          isInt32Immediate(Op1.getOperand(1), Value)) {
+        Op1 = Op1.getOperand(0);
+        SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
+      }
+      if (Op1Opc == ISD::AND) {
+        unsigned SHOpc = Op1.getOperand(0).getOpcode();
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) &&
+            isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
+          Op1 = Op1.getOperand(0).getOperand(0);
+          SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
+        } else {
+          // The AND itself is subsumed by the rlwimi mask.
+          Op1 = Op1.getOperand(0);
+        }
+      }
+
+      SH &= 31;
+      SDValue Ops[] = { Op0, Op1, getI32Imm(SH), getI32Imm(MB),
+                          getI32Imm(ME) };
+      return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+    }
+  }
+  return 0;
+}
+
+/// SelectCC - Select a comparison of the specified values with the specified
+/// condition code, returning the CR# of the expression.
+/// Immediates that fit a 16-bit compare field are folded into cmpwi/cmplwi
+/// (or the 64-bit forms); everything else falls back to a reg-reg compare.
+SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
+                                    ISD::CondCode CC, DebugLoc dl) {
+  // Always select the LHS.
+  unsigned Opc;
+  
+  if (LHS.getValueType() == MVT::i32) {
+    unsigned Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt32Immediate(RHS, Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt16(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt16((int)Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136 
+        //   cmpw cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmplwi cr0,r0,0x5678
+        //   beq cr0,L6
+        SDValue Xor(CurDAG->getMachineNode(PPC::XORIS, dl, MVT::i32, LHS,
+                                           getI32Imm(Imm >> 16)), 0);
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, Xor,
+                                              getI32Imm(Imm & 0xFFFF)), 0);
+      }
+      Opc = PPC::CMPLW;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      // Unsigned compare: fold an unsigned 16-bit immediate into cmplwi.
+      if (isInt32Immediate(RHS, Imm) && isUInt16(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLWI, dl, MVT::i32, LHS,
+                                              getI32Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLW;
+    } else {
+      // Signed compare: fold a signed 16-bit immediate into cmpwi.
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS,
+                                              getI32Imm((int)SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPW;
+    }
+  } else if (LHS.getValueType() == MVT::i64) {
+    uint64_t Imm;
+    if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+      if (isInt64Immediate(RHS.getNode(), Imm)) {
+        // SETEQ/SETNE comparison with 16-bit immediate, fold it.
+        if (isUInt16(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        // If this is a 16-bit signed immediate, fold it.
+        if (isInt16(Imm))
+          return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                                getI32Imm(Imm & 0xFFFF)), 0);
+        
+        // For non-equality comparisons, the default code would materialize the
+        // constant, then compare against it, like this:
+        //   lis r2, 4660
+        //   ori r2, r2, 22136 
+        //   cmpd cr0, r3, r2
+        // Since we are just comparing for equality, we can emit this instead:
+        //   xoris r0,r3,0x1234
+        //   cmpldi cr0,r0,0x5678
+        //   beq cr0,L6
+        // The xoris trick only works when the immediate fits in 32 bits.
+        if (isUInt32(Imm)) {
+          SDValue Xor(CurDAG->getMachineNode(PPC::XORIS8, dl, MVT::i64, LHS,
+                                             getI64Imm(Imm >> 16)), 0);
+          return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, Xor,
+                                                getI64Imm(Imm & 0xFFFF)), 0);
+        }
+      }
+      Opc = PPC::CMPLD;
+    } else if (ISD::isUnsignedIntSetCC(CC)) {
+      if (isInt64Immediate(RHS.getNode(), Imm) && isUInt16(Imm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPLDI, dl, MVT::i64, LHS,
+                                              getI64Imm(Imm & 0xFFFF)), 0);
+      Opc = PPC::CMPLD;
+    } else {
+      short SImm;
+      if (isIntS16Immediate(RHS, SImm))
+        return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS,
+                                              getI64Imm(SImm & 0xFFFF)),
+                         0);
+      Opc = PPC::CMPD;
+    }
+  } else if (LHS.getValueType() == MVT::f32) {
+    Opc = PPC::FCMPUS;
+  } else {
+    assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
+    Opc = PPC::FCMPUD;
+  }
+  // Register-register compare fallback.
+  return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
+}
+
+/// getPredicateForSetCC - Map an ISD condition code onto the PPC branch
+/// predicate that tests the corresponding CR bit.
+static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC) {
+  switch (CC) {
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return PPC::PRED_EQ;
+  case ISD::SETUNE:
+  case ISD::SETNE:  return PPC::PRED_NE;
+  case ISD::SETOLT:
+  case ISD::SETLT:  return PPC::PRED_LT;
+  case ISD::SETULE:
+  case ISD::SETLE:  return PPC::PRED_LE;
+  case ISD::SETOGT:
+  case ISD::SETGT:  return PPC::PRED_GT;
+  case ISD::SETUGE:
+  case ISD::SETGE:  return PPC::PRED_GE;
+  case ISD::SETO:   return PPC::PRED_NU;
+  case ISD::SETUO:  return PPC::PRED_UN;
+  // SETULT/SETUGT are invalid for floating point.  Assume we have int.
+  case ISD::SETULT: return PPC::PRED_LT;
+  case ISD::SETUGT: return PPC::PRED_GT;
+  case ISD::SETUEQ:
+  case ISD::SETONE:
+  case ISD::SETOLE:
+  case ISD::SETOGE:
+    llvm_unreachable("Should be lowered by legalize!");
+  default: llvm_unreachable("Unknown condition!");
+  }
+}
+
+/// getCRIdxForSetCC - Return the index of the condition register field
+/// associated with the SetCC condition, and whether or not the field is
+/// treated as inverted.  That is, lt = 0; ge = 0 inverted.
+///
+/// If this returns with Other != -1, then the returned comparison is an or of
+/// two simpler comparisons.  In this case, Invert is guaranteed to be false.
+static unsigned getCRIdxForSetCC(ISD::CondCode CC, bool &Invert, int &Other) {
+  Invert = false;
+  Other = -1;
+  switch (CC) {
+  // Direct tests of one CR bit: lt/gt/eq/un are bits 0-3 of the field.
+  case ISD::SETOLT:
+  case ISD::SETLT:  return 0;                  // Bit #0 = SETOLT
+  case ISD::SETOGT:
+  case ISD::SETGT:  return 1;                  // Bit #1 = SETOGT
+  case ISD::SETOEQ:
+  case ISD::SETEQ:  return 2;                  // Bit #2 = SETOEQ
+  case ISD::SETUO:  return 3;                  // Bit #3 = SETUO
+  // Inverted tests: the condition holds when the bit is clear.
+  case ISD::SETUGE:
+  case ISD::SETGE:  Invert = true; return 0;   // !Bit #0 = SETUGE
+  case ISD::SETULE:
+  case ISD::SETLE:  Invert = true; return 1;   // !Bit #1 = SETULE
+  case ISD::SETUNE:
+  case ISD::SETNE:  Invert = true; return 2;   // !Bit #2 = SETUNE
+  case ISD::SETO:   Invert = true; return 3;   // !Bit #3 = SETO
+  // SETULT/SETUGT are invalid for floating point.  Assume integer.
+  case ISD::SETULT: return 0;
+  case ISD::SETUGT: return 1;
+  case ISD::SETUEQ: 
+  case ISD::SETOGE: 
+  case ISD::SETOLE: 
+  case ISD::SETONE:
+    llvm_unreachable("Invalid branch code: should be expanded by legalize");
+  default: llvm_unreachable("Unknown condition!");
+  }
+  return 0;
+}
+
+/// SelectSETCC - Select a SETCC node, trying branch-free idioms for compares
+/// against 0 and -1 before falling back to reading a CR field with MFCR.
+SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  unsigned Imm;
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  if (isInt32Immediate(N->getOperand(1), Imm)) {
+    // We can codegen setcc op, imm very efficiently compared to a brcond.
+    // Check for those cases here.
+    // setcc op, 0
+    if (Imm == 0) {
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ: {
+        // cntlzw yields 32 iff Op == 0; rlwinm extracts bit 5, giving 0/1.
+        Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
+        SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETNE: {
+        // Carry trick: addic Op, -1 carries exactly when Op != 0; subfe
+        // folds the carry into a 0/1 result.
+        SDValue AD =
+          SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+                                         Op, getI32Imm(~0U)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, AD, Op, 
+                                    AD.getValue(1));
+      }
+      case ISD::SETLT: {
+        // rlwinm Op, 1, 31, 31 isolates the sign bit (Op < 0).
+        SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETGT: {
+        // (-Op & ~Op) has its sign bit set iff Op > 0; extract that bit.
+        SDValue T =
+          SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
+        T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
+        SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      }
+    } else if (Imm == ~0U) {        // setcc op, -1
+      SDValue Op = N->getOperand(0);
+      switch (CC) {
+      default: break;
+      case ISD::SETEQ:
+        // addic Op, 1 carries exactly when Op == -1; addze 0 yields the carry.
+        Op = SDValue(CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+                                            Op, getI32Imm(1)), 0);
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, 
+                              SDValue(CurDAG->getMachineNode(PPC::LI, dl, 
+                                                             MVT::i32,
+                                                             getI32Imm(0)), 0),
+                                      Op.getValue(1));
+      case ISD::SETNE: {
+        // Invert, then apply the same carry trick as setne-0 above.
+        Op = SDValue(CurDAG->getMachineNode(PPC::NOR, dl, MVT::i32, Op, Op), 0);
+        SDNode *AD = CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+                                            Op, getI32Imm(~0U));
+        return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32, SDValue(AD, 0),
+                                    Op, SDValue(AD, 1));
+      }
+      case ISD::SETLT: {
+        // ((Op+1) & Op) has the sign bit set iff Op < -1; extract it.
+        SDValue AD = SDValue(CurDAG->getMachineNode(PPC::ADDI, dl, MVT::i32, Op,
+                                                    getI32Imm(1)), 0);
+        SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
+                                                    Op), 0);
+        SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      }
+      case ISD::SETGT: {
+        // Op > -1 is simply "sign bit clear": extract it and XOR with 1.
+        SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
+        Op = SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 
+                     0);
+        return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Op, 
+                                    getI32Imm(1));
+      }
+      }
+    }
+  }
+  
+  // General case: do the compare into a CR field and extract the bit.
+  bool Inv;
+  int OtherCondIdx;
+  unsigned Idx = getCRIdxForSetCC(CC, Inv, OtherCondIdx);
+  SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+  SDValue IntCR;
+  
+  // Force the ccreg into CR7.
+  SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
+  
+  SDValue InFlag(0, 0);  // Null incoming flag value.
+  CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg, 
+                               InFlag).getValue(1);
+  
+  if (PPCSubTarget.isGigaProcessor() && OtherCondIdx == -1)
+    IntCR = SDValue(CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32, CR7Reg,
+                                           CCReg), 0);
+  else
+    IntCR = SDValue(CurDAG->getMachineNode(PPC::MFCR, dl, MVT::i32, CCReg), 0);
+  
+  // Rotate the wanted CR7 bit (bits 28-31 of the CR image) into the LSB.
+  SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
+                      getI32Imm(31), getI32Imm(31) };
+  if (OtherCondIdx == -1 && !Inv)
+    return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+
+  // Get the specified bit.
+  SDValue Tmp =
+    SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+  if (Inv) {
+    assert(OtherCondIdx == -1 && "Can't have split plus negation");
+    return CurDAG->SelectNodeTo(N, PPC::XORI, MVT::i32, Tmp, getI32Imm(1));
+  }
+
+  // Otherwise, we have to turn an operation like SETONE -> SETOLT | SETOGT.
+  // We already got the bit for the first part of the comparison (e.g. SETULE).
+
+  // Get the other bit of the comparison.
+  Ops[1] = getI32Imm((32-(3-OtherCondIdx)) & 31);
+  SDValue OtherCond = 
+    SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32, Ops, 4), 0);
+
+  return CurDAG->SelectNodeTo(N, PPC::OR, MVT::i32, Tmp, OtherCond);
+}
+
+
+// Select - Convert the specified operand from a target-independent to a
+// target-specific node if it hasn't already been changed.
+// Returns NULL when the node is already selected or was replaced in place;
+// unhandled opcodes fall through to the tablegen'd matcher (SelectCode).
+SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  
+  case ISD::Constant: {
+    // Materialize 64-bit constants with LIS8/LI8/ORI8, plus RLDICR/ORIS8
+    // when the value doesn't fit in the low 32 bits.
+    if (N->getValueType(0) == MVT::i64) {
+      // Get 64 bit value.
+      int64_t Imm = cast<ConstantSDNode>(N)->getZExtValue();
+      // Assume no remaining bits.
+      unsigned Remainder = 0;
+      // Assume no shift required.
+      unsigned Shift = 0;
+      
+      // If it can't be represented as a 32 bit value.
+      if (!isInt32(Imm)) {
+        Shift = CountTrailingZeros_64(Imm);
+        int64_t ImmSh = static_cast<uint64_t>(Imm) >> Shift;
+        
+        // If the shifted value fits 32 bits.
+        if (isInt32(ImmSh)) {
+          // Go with the shifted value.
+          Imm = ImmSh;
+        } else {
+          // Still stuck with a 64 bit value.
+          Remainder = Imm;
+          Shift = 32;
+          Imm >>= 32;
+        }
+      }
+      
+      // Intermediate operand.
+      SDNode *Result;
+
+      // Handle first 32 bits.
+      unsigned Lo = Imm & 0xFFFF;
+      unsigned Hi = (Imm >> 16) & 0xFFFF;
+      
+      // Simple value.
+      if (isInt16(Imm)) {
+       // Just the Lo bits.
+        Result = CurDAG->getMachineNode(PPC::LI8, dl, MVT::i64, getI32Imm(Lo));
+      } else if (Lo) {
+        // Handle the Hi bits.
+        unsigned OpC = Hi ? PPC::LIS8 : PPC::LI8;
+        Result = CurDAG->getMachineNode(OpC, dl, MVT::i64, getI32Imm(Hi));
+        // And Lo bits.
+        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Lo));
+      } else {
+       // Just the Hi bits.
+        Result = CurDAG->getMachineNode(PPC::LIS8, dl, MVT::i64, getI32Imm(Hi));
+      }
+      
+      // If no shift, we're done.
+      if (!Shift) return Result;
+
+      // Shift for next step if the upper 32-bits were not zero.
+      if (Imm) {
+        Result = CurDAG->getMachineNode(PPC::RLDICR, dl, MVT::i64,
+                                        SDValue(Result, 0),
+                                        getI32Imm(Shift),
+                                        getI32Imm(63 - Shift));
+      }
+
+      // Add in the last bits as required.
+      if ((Hi = (Remainder >> 16) & 0xFFFF)) {
+        Result = CurDAG->getMachineNode(PPC::ORIS8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Hi));
+      } 
+      if ((Lo = Remainder & 0xFFFF)) {
+        Result = CurDAG->getMachineNode(PPC::ORI8, dl, MVT::i64,
+                                        SDValue(Result, 0), getI32Imm(Lo));
+      }
+      
+      return Result;
+    }
+    break;
+  }
+  
+  case ISD::SETCC:
+    return SelectSETCC(N);
+  case PPCISD::GlobalBaseReg:
+    return getGlobalBaseReg();
+    
+  case ISD::FrameIndex: {
+    // Lower a frame index to addi(TFI, 0); morph the node in place when it
+    // has a single use, otherwise create a fresh machine node.
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, N->getValueType(0));
+    unsigned Opc = N->getValueType(0) == MVT::i32 ? PPC::ADDI : PPC::ADDI8;
+    if (N->hasOneUse())
+      return CurDAG->SelectNodeTo(N, Opc, N->getValueType(0), TFI,
+                                  getSmallIPtrImm(0));
+    return CurDAG->getMachineNode(Opc, dl, N->getValueType(0), TFI,
+                                  getSmallIPtrImm(0));
+  }
+
+  case PPCISD::MFCR: {
+    SDValue InFlag = N->getOperand(1);
+    // Use MFOCRF if supported.
+    if (PPCSubTarget.isGigaProcessor())
+      return CurDAG->getMachineNode(PPC::MFOCRF, dl, MVT::i32,
+                                    N->getOperand(0), InFlag);
+    else
+      return CurDAG->getMachineNode(PPC::MFCR, dl, MVT::i32, InFlag);
+  }
+    
+  case ISD::SDIV: {
+    // FIXME: since this depends on the setting of the carry flag from the srawi
+    //        we should really be making notes about that for the scheduler.
+    // FIXME: It sure would be nice if we could cheaply recognize the 
+    //        srl/add/sra pattern the dag combiner will generate for this as
+    //        sra/addze rather than having to handle sdiv ourselves.  oh well.
+    // Signed divide by a power of two: srawi + addze (negated for negative
+    // divisors).
+    unsigned Imm;
+    if (isInt32Immediate(N->getOperand(1), Imm)) {
+      SDValue N0 = N->getOperand(0);
+      if ((signed)Imm > 0 && isPowerOf2_32(Imm)) {
+        SDNode *Op =
+          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
+                                 N0, getI32Imm(Log2_32(Imm)));
+        return CurDAG->SelectNodeTo(N, PPC::ADDZE, MVT::i32, 
+                                    SDValue(Op, 0), SDValue(Op, 1));
+      } else if ((signed)Imm < 0 && isPowerOf2_32(-Imm)) {
+        SDNode *Op =
+          CurDAG->getMachineNode(PPC::SRAWI, dl, MVT::i32, MVT::Flag,
+                                 N0, getI32Imm(Log2_32(-Imm)));
+        SDValue PT =
+          SDValue(CurDAG->getMachineNode(PPC::ADDZE, dl, MVT::i32,
+                                         SDValue(Op, 0), SDValue(Op, 1)),
+                    0);
+        return CurDAG->SelectNodeTo(N, PPC::NEG, MVT::i32, PT);
+      }
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+    
+  case ISD::LOAD: {
+    // Handle preincrement loads.
+    LoadSDNode *LD = cast<LoadSDNode>(N);
+    EVT LoadedVT = LD->getMemoryVT();
+    
+    // Normal loads are handled by code generated from the .td file.
+    if (LD->getAddressingMode() != ISD::PRE_INC)
+      break;
+    
+    SDValue Offset = LD->getOffset();
+    if (isa<ConstantSDNode>(Offset) ||
+        Offset.getOpcode() == ISD::TargetGlobalAddress) {
+      
+      // Pick the update-form load opcode from the loaded type and target
+      // register width.
+      unsigned Opcode;
+      bool isSExt = LD->getExtensionType() == ISD::SEXTLOAD;
+      if (LD->getValueType(0) != MVT::i64) {
+        // Handle PPC32 integer and normal FP loads.
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::f64: Opcode = PPC::LFDU; break;
+          case MVT::f32: Opcode = PPC::LFSU; break;
+          case MVT::i32: Opcode = PPC::LWZU; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU : PPC::LHZU; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU; break;
+        }
+      } else {
+        assert(LD->getValueType(0) == MVT::i64 && "Unknown load result type!");
+        assert((!isSExt || LoadedVT == MVT::i16) && "Invalid sext update load");
+        switch (LoadedVT.getSimpleVT().SimpleTy) {
+          default: llvm_unreachable("Invalid PPC load type!");
+          case MVT::i64: Opcode = PPC::LDU; break;
+          case MVT::i32: Opcode = PPC::LWZU8; break;
+          case MVT::i16: Opcode = isSExt ? PPC::LHAU8 : PPC::LHZU8; break;
+          case MVT::i1:
+          case MVT::i8:  Opcode = PPC::LBZU8; break;
+        }
+      }
+      
+      SDValue Chain = LD->getChain();
+      SDValue Base = LD->getBasePtr();
+      SDValue Ops[] = { Offset, Base, Chain };
+      // FIXME: PPC64
+      return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
+                                    PPCLowering.getPointerTy(),
+                                    MVT::Other, Ops, 3);
+    } else {
+      llvm_unreachable("R+R preindex loads not supported yet!");
+    }
+  }
+    
+  case ISD::AND: {
+    unsigned Imm, Imm2, SH, MB, ME;
+
+    // If this is an and of a value rotated between 0 and 31 bits and then and'd
+    // with a mask, emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
+      SDValue Val = N->getOperand(0).getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    // If this is just a masked value where the input is not handled above, and
+    // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
+    if (isInt32Immediate(N->getOperand(1), Imm) &&
+        isRunOfOnes(Imm, MB, ME) && 
+        N->getOperand(0).getOpcode() != ISD::ROTL) {
+      SDValue Val = N->getOperand(0);
+      SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    // AND X, 0 -> 0, not "rlwinm 32".
+    if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
+      ReplaceUses(SDValue(N, 0), N->getOperand(1));
+      return NULL;
+    }
+    // ISD::OR doesn't get all the bitfield insertion fun.
+    // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert
+    if (isInt32Immediate(N->getOperand(1), Imm) && 
+        N->getOperand(0).getOpcode() == ISD::OR &&
+        isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
+      unsigned MB, ME;
+      Imm = ~(Imm^Imm2);
+      if (isRunOfOnes(Imm, MB, ME)) {
+        SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                            N->getOperand(0).getOperand(1),
+                            getI32Imm(0), getI32Imm(MB),getI32Imm(ME) };
+        return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops, 5);
+      }
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::OR:
+    if (N->getValueType(0) == MVT::i32)
+      if (SDNode *I = SelectBitfieldInsert(N))
+        return I;
+      
+    // Other cases are autogenerated.
+    break;
+  case ISD::SHL: {
+    // (shl (and X, mask), c) that is a rotate-and-mask -> rlwinm.
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) {
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SRL: {
+    // (srl (and X, mask), c) that is a rotate-and-mask -> rlwinm.
+    unsigned Imm, SH, MB, ME;
+    if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, Imm) &&
+        isRotateAndMask(N, Imm, true, SH, MB, ME)) { 
+      SDValue Ops[] = { N->getOperand(0).getOperand(0),
+                          getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    }
+    
+    // Other cases are autogenerated.
+    break;
+  }
+  case ISD::SELECT_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
+    
+    // Handle the setcc cases here.  select_cc lhs, 0, 1, 0, cc
+    if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+      if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+        if (ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+          if (N1C->isNullValue() && N3C->isNullValue() &&
+              N2C->getZExtValue() == 1ULL && CC == ISD::SETNE &&
+              // FIXME: Implement this optzn for PPC64.
+              N->getValueType(0) == MVT::i32) {
+            // Branch-free (x != 0) via the addic/subfe carry trick.
+            SDNode *Tmp =
+              CurDAG->getMachineNode(PPC::ADDIC, dl, MVT::i32, MVT::Flag,
+                                     N->getOperand(0), getI32Imm(~0U));
+            return CurDAG->SelectNodeTo(N, PPC::SUBFE, MVT::i32,
+                                        SDValue(Tmp, 0), N->getOperand(0),
+                                        SDValue(Tmp, 1));
+          }
+
+    SDValue CCReg = SelectCC(N->getOperand(0), N->getOperand(1), CC, dl);
+    unsigned BROpc = getPredicateForSetCC(CC);
+
+    // Pick the SELECT_CC pseudo matching the result type.
+    unsigned SelectCCOp;
+    if (N->getValueType(0) == MVT::i32)
+      SelectCCOp = PPC::SELECT_CC_I4;
+    else if (N->getValueType(0) == MVT::i64)
+      SelectCCOp = PPC::SELECT_CC_I8;
+    else if (N->getValueType(0) == MVT::f32)
+      SelectCCOp = PPC::SELECT_CC_F4;
+    else if (N->getValueType(0) == MVT::f64)
+      SelectCCOp = PPC::SELECT_CC_F8;
+    else
+      SelectCCOp = PPC::SELECT_CC_VRRC;
+
+    SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
+                        getI32Imm(BROpc) };
+    return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
+  }
+  case PPCISD::COND_BRANCH: {
+    // Op #0 is the Chain.
+    // Op #1 is the PPC::PRED_* number.
+    // Op #2 is the CR#
+    // Op #3 is the Dest MBB
+    // Op #4 is the Flag.
+    // Prevent PPC::PRED_* from being selected into LI.
+    SDValue Pred =
+      getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
+    SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
+      N->getOperand(0), N->getOperand(4) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5);
+  }
+  case ISD::BR_CC: {
+    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
+    SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
+    SDValue Ops[] = { getI32Imm(getPredicateForSetCC(CC)), CondCode, 
+                        N->getOperand(4), N->getOperand(0) };
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4);
+  }
+  case ISD::BRIND: {
+    // FIXME: Should custom lower this.
+    // Indirect branch: move the target into CTR, then bctr.
+    SDValue Chain = N->getOperand(0);
+    SDValue Target = N->getOperand(1);
+    unsigned Opc = Target.getValueType() == MVT::i32 ? PPC::MTCTR : PPC::MTCTR8;
+    Chain = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Target,
+                                           Chain), 0);
+    return CurDAG->SelectNodeTo(N, PPC::BCTR, MVT::Other, Chain);
+  }
+  }
+  
+  // Fall back to the tablegen-generated matcher for everything else.
+  return SelectCode(N);
+}
+
+
+
+/// createPPCISelDag - This pass converts a legalized DAG into a 
+/// PowerPC-specific DAG, ready for instruction scheduling.
+///
+/// Factory hook used when constructing the PPC codegen pass pipeline.
+FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
+  return new PPCDAGToDAGISel(TM);
+}
+
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
new file mode 100644
index 0000000..a11d624
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -0,0 +1,5502 @@
+//===-- PPCISelLowering.cpp - PPC DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPCISelLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCISelLowering.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPredicates.h"
+#include "PPCTargetMachine.h"
+#include "PPCPerfectShuffle.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/DerivedTypes.h"
+using namespace llvm;
+
+// Forward declarations of the custom calling-convention handlers for the
+// 32-bit SVR4 ABI (presumably referenced by the tablegen-generated CC tables
+// and defined later in this file — the definitions are not in this chunk).
+static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
+                                     CCValAssign::LocInfo &LocInfo,
+                                     ISD::ArgFlagsTy &ArgFlags,
+                                     CCState &State);
+static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, EVT &ValVT,
+                                            EVT &LocVT,
+                                            CCValAssign::LocInfo &LocInfo,
+                                            ISD::ArgFlagsTy &ArgFlags,
+                                            CCState &State);
+static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, EVT &ValVT,
+                                              EVT &LocVT,
+                                              CCValAssign::LocInfo &LocInfo,
+                                              ISD::ArgFlagsTy &ArgFlags,
+                                              CCState &State);
+
+// Command-line switch: pre-increment load/store selection stays off by
+// default while the feature is experimental.
+static cl::opt<bool> EnablePPCPreinc("enable-ppc-preinc",
+cl::desc("enable preincrement load/store generation on PPC (experimental)"),
+                                     cl::Hidden);
+
+/// CreateTLOF - Pick the object-file lowering for this target: Darwin
+/// subtargets emit Mach-O, all other PowerPC subtargets emit ELF.
+static TargetLoweringObjectFile *CreateTLOF(const PPCTargetMachine &TM) {
+  if (!TM.getSubtargetImpl()->isDarwin())
+    return new TargetLoweringObjectFileELF();
+  return new TargetLoweringObjectFileMachO();
+}
+
+
+// PPCTargetLowering constructor: registers the legal register classes,
+// records which generic SelectionDAG operations must be expanded or custom
+// lowered on PowerPC, and configures target-wide defaults (stack pointer,
+// boolean contents, Darwin libcall names, DAG-combine hooks, ...).
+PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
+  : TargetLowering(TM, CreateTLOF(TM)), PPCSubTarget(*TM.getSubtargetImpl()) {
+
+  setPow2DivIsCheap();
+
+  // Use _setjmp/_longjmp instead of setjmp/longjmp.
+  setUseUnderscoreSetJmp(true);
+  setUseUnderscoreLongJmp(true);
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, PPC::GPRCRegisterClass);
+  addRegisterClass(MVT::f32, PPC::F4RCRegisterClass);
+  addRegisterClass(MVT::f64, PPC::F8RCRegisterClass);
+
+  // PowerPC has an i16 but no i8 (or i1) SEXTLOAD
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // PowerPC has pre-inc loads and stores.
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedLoadAction(ISD::PRE_INC, MVT::i64, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i1, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i8, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i16, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i32, Legal);
+  setIndexedStoreAction(ISD::PRE_INC, MVT::i64, Legal);
+
+  // This is used in the ppcf128->int sequence.  Note it has different semantics
+  // from FP_ROUND:  that rounds to nearest, this rounds to zero.
+  setOperationAction(ISD::FP_ROUND_INREG, MVT::ppcf128, Custom);
+
+  // PowerPC has no SREM/UREM instructions
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
+  // Don't use SMUL_LOHI/UMUL_LOHI or SDIVREM/UDIVREM to lower SREM/UREM.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+
+  // We don't support sin/cos/sqrt/fmod/pow
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
+
+  // Expand SQRT unless the subtarget provides a hardware square root (FSQRT).
+  if (!TM.getSubtarget<PPCSubtarget>().hasFSQRT()) {
+    setOperationAction(ISD::FSQRT, MVT::f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::f32, Expand);
+  }
+
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+  // PowerPC does not have BSWAP, CTPOP or CTTZ
+  setOperationAction(ISD::BSWAP, MVT::i32  , Expand);
+  setOperationAction(ISD::CTPOP, MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32  , Expand);
+  setOperationAction(ISD::BSWAP, MVT::i64  , Expand);
+  setOperationAction(ISD::CTPOP, MVT::i64  , Expand);
+  setOperationAction(ISD::CTTZ , MVT::i64  , Expand);
+
+  // PowerPC does not have ROTR
+  setOperationAction(ISD::ROTR, MVT::i32   , Expand);
+  setOperationAction(ISD::ROTR, MVT::i64   , Expand);
+
+  // PowerPC does not have Select
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::i64, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+
+  // PowerPC wants to turn select_cc of FP into fsel when possible.
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // PowerPC wants to optimize integer setcc a bit
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+
+  // PowerPC does not have BRCOND which requires SetCC
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+
+  setOperationAction(ISD::BR_JT,  MVT::Other, Expand);
+
+  // PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+
+  // PowerPC does not have [U|S]INT_TO_FP
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::i64, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::f64, Expand);
+
+  // We cannot sextinreg(i1).  Expand to shifts.
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+
+
+  // We want to legalize GlobalAddress and ConstantPool nodes into the
+  // appropriate instructions to materialize the address.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i32, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i32, Custom);
+  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::BlockAddress,  MVT::i64, Custom);
+  setOperationAction(ISD::ConstantPool,  MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable,     MVT::i64, Custom);
+
+  // TRAP is legal.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // TRAMPOLINE is custom lowered.
+  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+
+  // VAARG is custom lowered with the 32-bit SVR4 ABI.
+  if (    TM.getSubtarget<PPCSubtarget>().isSVR4ABI()
+      && !TM.getSubtarget<PPCSubtarget>().isPPC64())
+    setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  else
+    setOperationAction(ISD::VAARG, MVT::Other, Expand);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64  , Custom);
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // Comparisons that require checking two conditions.
+  setCondCodeAction(ISD::SETULT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETULT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUGT, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOGE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOLE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETONE, MVT::f64, Expand);
+
+  if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
+    // They also have instructions for converting between i64 and fp.
+    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+    // This is just the low 32 bits of a (signed) fp->i64 conversion.
+    // We cannot do this with Promote because i64 is not a legal type.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+
+    // FIXME: disable this lowered code.  This generates 64-bit register values,
+    // and we don't model the fact that the top part is clobbered by calls.  We
+    // need to flag these together so that the value isn't live across a call.
+    //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+  } else {
+    // PowerPC does not have FP_TO_UINT on 32-bit implementations.
+    setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  }
+
+  if (TM.getSubtarget<PPCSubtarget>().use64BitRegs()) {
+    // 64-bit PowerPC implementations can support i64 types directly
+    addRegisterClass(MVT::i64, PPC::G8RCRegisterClass);
+    // BUILD_PAIR can't be handled natively, and should be expanded to shl/or
+    setOperationAction(ISD::BUILD_PAIR, MVT::i64, Expand);
+    // 64-bit PowerPC wants to expand i128 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+  } else {
+    // 32-bit PowerPC wants to expand i64 shifts itself.
+    setOperationAction(ISD::SHL_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRA_PARTS, MVT::i32, Custom);
+    setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
+  }
+
+  if (TM.getSubtarget<PPCSubtarget>().hasAltivec()) {
+    // First set operation action for all vector types to expand. Then we
+    // will selectively turn on ones that can be effectively codegen'd.
+    for (unsigned i = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+      MVT::SimpleValueType VT = (MVT::SimpleValueType)i;
+
+      // add/sub are legal for all supported vector VT's.
+      setOperationAction(ISD::ADD , VT, Legal);
+      setOperationAction(ISD::SUB , VT, Legal);
+
+      // We promote all shuffles to v16i8.
+      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Promote);
+      AddPromotedToType (ISD::VECTOR_SHUFFLE, VT, MVT::v16i8);
+
+      // We promote all non-typed operations to v4i32.
+      setOperationAction(ISD::AND   , VT, Promote);
+      AddPromotedToType (ISD::AND   , VT, MVT::v4i32);
+      setOperationAction(ISD::OR    , VT, Promote);
+      AddPromotedToType (ISD::OR    , VT, MVT::v4i32);
+      setOperationAction(ISD::XOR   , VT, Promote);
+      AddPromotedToType (ISD::XOR   , VT, MVT::v4i32);
+      setOperationAction(ISD::LOAD  , VT, Promote);
+      AddPromotedToType (ISD::LOAD  , VT, MVT::v4i32);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i32);
+      setOperationAction(ISD::STORE, VT, Promote);
+      AddPromotedToType (ISD::STORE, VT, MVT::v4i32);
+
+      // No other operations are legal.
+      setOperationAction(ISD::MUL , VT, Expand);
+      setOperationAction(ISD::SDIV, VT, Expand);
+      setOperationAction(ISD::SREM, VT, Expand);
+      setOperationAction(ISD::UDIV, VT, Expand);
+      setOperationAction(ISD::UREM, VT, Expand);
+      setOperationAction(ISD::FDIV, VT, Expand);
+      setOperationAction(ISD::FNEG, VT, Expand);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
+      setOperationAction(ISD::BUILD_VECTOR, VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+      setOperationAction(ISD::UDIVREM, VT, Expand);
+      setOperationAction(ISD::SDIVREM, VT, Expand);
+      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
+      setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::CTPOP, VT, Expand);
+      setOperationAction(ISD::CTLZ, VT, Expand);
+      setOperationAction(ISD::CTTZ, VT, Expand);
+    }
+
+    // We can custom expand all VECTOR_SHUFFLEs to VPERM, others we can handle
+    // with merges, splats, etc.
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::AND   , MVT::v4i32, Legal);
+    setOperationAction(ISD::OR    , MVT::v4i32, Legal);
+    setOperationAction(ISD::XOR   , MVT::v4i32, Legal);
+    setOperationAction(ISD::LOAD  , MVT::v4i32, Legal);
+    setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
+    setOperationAction(ISD::STORE , MVT::v4i32, Legal);
+
+    addRegisterClass(MVT::v4f32, PPC::VRRCRegisterClass);
+    addRegisterClass(MVT::v4i32, PPC::VRRCRegisterClass);
+    addRegisterClass(MVT::v8i16, PPC::VRRCRegisterClass);
+    addRegisterClass(MVT::v16i8, PPC::VRRCRegisterClass);
+
+    setOperationAction(ISD::MUL, MVT::v4f32, Legal);
+    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
+    setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i32, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
+  }
+
+  setShiftAmountType(MVT::i32);
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  // Pick the stack pointer and EH registers for the 64- vs 32-bit ABI.
+  if (TM.getSubtarget<PPCSubtarget>().isPPC64()) {
+    setStackPointerRegisterToSaveRestore(PPC::X1);
+    setExceptionPointerRegister(PPC::X3);
+    setExceptionSelectorRegister(PPC::X4);
+  } else {
+    setStackPointerRegisterToSaveRestore(PPC::R1);
+    setExceptionPointerRegister(PPC::R3);
+    setExceptionSelectorRegister(PPC::R4);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::BR_CC);
+  setTargetDAGCombine(ISD::BSWAP);
+
+  // Darwin long double math library functions have $LDBL128 appended.
+  if (TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
+    setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
+    setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
+    setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
+    setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
+    setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
+    setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
+    setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
+    setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
+    setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
+  }
+
+  computeRegisterProperties();
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area.
+unsigned PPCTargetLowering::getByValTypeAlignment(const Type *Ty) const {
+  // Darwin passes everything on a 4 byte boundary.
+  if (getTargetMachine().getSubtarget<PPCSubtarget>().isDarwin())
+    return 4;
+  // FIXME: the SVR4 rules are still TBD; use 4 bytes there as well for now.
+  return 4;
+}
+
+/// getTargetNodeName - Map a PPCISD opcode to its printable name, for DAG
+/// dumps and debugging.  Returns null for opcodes that are not PPC-specific.
+const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case PPCISD::FSEL:            return "PPCISD::FSEL";
+  case PPCISD::FCFID:           return "PPCISD::FCFID";
+  case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
+  case PPCISD::FCTIWZ:          return "PPCISD::FCTIWZ";
+  case PPCISD::STFIWX:          return "PPCISD::STFIWX";
+  case PPCISD::VMADDFP:         return "PPCISD::VMADDFP";
+  case PPCISD::VNMSUBFP:        return "PPCISD::VNMSUBFP";
+  case PPCISD::VPERM:           return "PPCISD::VPERM";
+  case PPCISD::Hi:              return "PPCISD::Hi";
+  case PPCISD::Lo:              return "PPCISD::Lo";
+  case PPCISD::TOC_ENTRY:       return "PPCISD::TOC_ENTRY";
+  case PPCISD::TOC_RESTORE:     return "PPCISD::TOC_RESTORE";
+  case PPCISD::LOAD:            return "PPCISD::LOAD";
+  case PPCISD::LOAD_TOC:        return "PPCISD::LOAD_TOC";
+  case PPCISD::DYNALLOC:        return "PPCISD::DYNALLOC";
+  case PPCISD::GlobalBaseReg:   return "PPCISD::GlobalBaseReg";
+  case PPCISD::SRL:             return "PPCISD::SRL";
+  case PPCISD::SRA:             return "PPCISD::SRA";
+  case PPCISD::SHL:             return "PPCISD::SHL";
+  case PPCISD::EXTSW_32:        return "PPCISD::EXTSW_32";
+  case PPCISD::STD_32:          return "PPCISD::STD_32";
+  case PPCISD::CALL_SVR4:       return "PPCISD::CALL_SVR4";
+  case PPCISD::CALL_Darwin:     return "PPCISD::CALL_Darwin";
+  case PPCISD::NOP:             return "PPCISD::NOP";
+  case PPCISD::MTCTR:           return "PPCISD::MTCTR";
+  case PPCISD::BCTRL_Darwin:    return "PPCISD::BCTRL_Darwin";
+  case PPCISD::BCTRL_SVR4:      return "PPCISD::BCTRL_SVR4";
+  case PPCISD::RET_FLAG:        return "PPCISD::RET_FLAG";
+  case PPCISD::MFCR:            return "PPCISD::MFCR";
+  case PPCISD::VCMP:            return "PPCISD::VCMP";
+  case PPCISD::VCMPo:           return "PPCISD::VCMPo";
+  case PPCISD::LBRX:            return "PPCISD::LBRX";
+  case PPCISD::STBRX:           return "PPCISD::STBRX";
+  case PPCISD::LARX:            return "PPCISD::LARX";
+  case PPCISD::STCX:            return "PPCISD::STCX";
+  case PPCISD::COND_BRANCH:     return "PPCISD::COND_BRANCH";
+  case PPCISD::MFFS:            return "PPCISD::MFFS";
+  case PPCISD::MTFSB0:          return "PPCISD::MTFSB0";
+  case PPCISD::MTFSB1:          return "PPCISD::MTFSB1";
+  case PPCISD::FADDRTZ:         return "PPCISD::FADDRTZ";
+  case PPCISD::MTFSF:           return "PPCISD::MTFSF";
+  case PPCISD::TC_RETURN:       return "PPCISD::TC_RETURN";
+  }
+}
+
+/// getSetCCResultType - SETCC results are always i32 on PPC, independent of
+/// the operand type.
+MVT::SimpleValueType PPCTargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i32;
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned PPCTargetLowering::getFunctionAlignment(const Function *F) const {
+  // Non-Darwin targets always use 4-byte (2^2) function alignment.
+  if (!getTargetMachine().getSubtarget<PPCSubtarget>().isDarwin())
+    return 2;
+  // Darwin: 16-byte alignment, unless we are optimizing for size.
+  return F->hasFnAttr(Attribute::OptimizeForSize) ? 2 : 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Node matching predicates, for use by the tblgen matching code.
+//===----------------------------------------------------------------------===//
+
+/// isFloatingPointZero - Return true if this is 0.0 or -0.0.
+static bool isFloatingPointZero(SDValue Op) {
+  // A direct FP constant is the easy case.
+  if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op))
+    return CFP->getValueAPF().isZero();
+
+  // The zero may already have been legalized into a constant-pool load;
+  // peer through the load (operand 1 is the pointer) to the pooled constant.
+  if (!ISD::isEXTLoad(Op.getNode()) && !ISD::isNON_EXTLoad(Op.getNode()))
+    return false;
+  ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op.getOperand(1));
+  if (!CP)
+    return false;
+  ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal());
+  return CFP && CFP->getValueAPF().isZero();
+}
+
+/// isConstantOrUndef - Op is either an undef node or a ConstantSDNode.  Return
+/// true if Op is undef or if it matches the specified value.
+static bool isConstantOrUndef(int Op, int Val) {
+  // Negative mask entries denote undef, which matches anything.
+  if (Op < 0)
+    return true;
+  return Op == Val;
+}
+
+/// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUHUM instruction.
+bool PPC::isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+  // VPKUHUM keeps the odd-numbered byte of every halfword.  In the binary
+  // form the 16 result bytes index the 32-byte concatenation of both inputs;
+  // in the unary form both inputs are the same vector, so the second half of
+  // the mask repeats the byte indices used by the first half.
+  for (unsigned i = 0; i != 16; ++i) {
+    unsigned Unit = isUnary ? (i & 7) : i;
+    if (!isConstantOrUndef(N->getMaskElt(i), Unit*2+1))
+      return false;
+  }
+  return true;
+}
+
+/// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+/// VPKUWUM instruction.
+bool PPC::isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary) {
+  if (!isUnary) {
+    // Binary form: each result halfword is the low halfword (bytes 2 and 3)
+    // of a word drawn from the 32-byte concatenation of both inputs.
+    for (unsigned i = 0; i != 16; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3))
+        return false;
+  } else {
+    // Unary form: both inputs are the same vector, so bytes 8..15 of the
+    // mask must repeat the byte indices used by bytes 0..7.
+    for (unsigned i = 0; i != 8; i += 2)
+      if (!isConstantOrUndef(N->getMaskElt(i  ),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+1),  i*2+3) ||
+          !isConstantOrUndef(N->getMaskElt(i+8),  i*2+2) ||
+          !isConstantOrUndef(N->getMaskElt(i+9),  i*2+3))
+        return false;
+  }
+  return true;
+}
+
+/// isVMerge - Common function, used to match vmrg* shuffles.
+///
+/// LHSStart/RHSStart are the byte offsets (into the 32-byte concatenation of
+/// the two shuffle inputs) where the left and right units of each merged pair
+/// begin; the mask must interleave UnitSize-byte units from those two runs.
+static bool isVMerge(ShuffleVectorSDNode *N, unsigned UnitSize,
+                     unsigned LHSStart, unsigned RHSStart) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         "PPC only supports shuffles by bytes!");
+  assert((UnitSize == 1 || UnitSize == 2 || UnitSize == 4) &&
+         "Unsupported merge size!");
+
+  // Each of the 8/UnitSize result pairs is one LHS unit followed by one RHS
+  // unit; undef mask entries are allowed to match anything.
+  for (unsigned i = 0; i != 8/UnitSize; ++i)     // Step over units
+    for (unsigned j = 0; j != UnitSize; ++j) {   // Step over bytes within unit
+      if (!isConstantOrUndef(N->getMaskElt(i*UnitSize*2+j),
+                             LHSStart+j+i*UnitSize) ||
+          !isConstantOrUndef(N->getMaskElt(i*UnitSize*2+UnitSize+j),
+                             RHSStart+j+i*UnitSize))
+        return false;
+    }
+  return true;
+}
+
+/// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VRGL* instruction with the specified unit size (1,2 or 4 bytes).
+bool PPC::isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             bool isUnary) {
+  // Merge-low reads the low halves (bytes 8..15) of its inputs.  The unary
+  // form uses the first vector for both sides; the binary form's right-hand
+  // units come from the second vector, which starts at byte 16 (16 + 8 = 24).
+  unsigned RHSStart = isUnary ? 8 : 24;
+  return isVMerge(N, UnitSize, 8, RHSStart);
+}
+
+/// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+/// a VRGH* instruction with the specified unit size (1,2 or 4 bytes).
+bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                             bool isUnary) {
+  // Merge-high reads the high halves (bytes 0..7) of its inputs.  The unary
+  // form uses the first vector for both sides; the binary form's right-hand
+  // units come from the second vector, which starts at byte 16.
+  unsigned RHSStart = isUnary ? 0 : 16;
+  return isVMerge(N, UnitSize, 0, RHSStart);
+}
+
+
+/// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+/// amount, otherwise return -1.
+int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         "PPC only supports shuffles by bytes!");
+
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  
+  // Find the first non-undef value in the shuffle mask.
+  unsigned i;
+  for (i = 0; i != 16 && SVOp->getMaskElt(i) < 0; ++i)
+    /*search*/;
+
+  if (i == 16) return -1;  // all undef.
+
+  // Otherwise, check to see if the rest of the elements are consecutively
+  // numbered from this value.
+  unsigned ShiftAmt = SVOp->getMaskElt(i);
+  // The mask element must be at least i, or the implied shift would be
+  // negative; normalize so ShiftAmt is relative to element 0.
+  if (ShiftAmt < i) return -1;
+  ShiftAmt -= i;
+
+  if (!isUnary) {
+    // Check the rest of the elements to see if they are consecutive.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), ShiftAmt+i))
+        return -1;
+  } else {
+    // Check the rest of the elements to see if they are consecutive.
+    // For the unary form both inputs are the same vector, so the byte
+    // indices wrap around modulo 16.
+    for (++i; i != 16; ++i)
+      if (!isConstantOrUndef(SVOp->getMaskElt(i), (ShiftAmt+i) & 15))
+        return -1;
+  }
+  return ShiftAmt;
+}
+
+/// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a splat of a single element that is suitable for input to
+/// VSPLTB/VSPLTH/VSPLTW.
+bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) {
+  assert(N->getValueType(0) == MVT::v16i8 &&
+         (EltSize == 1 || EltSize == 2 || EltSize == 4));
+
+  // This is a splat operation if each element of the permute is the same, and
+  // if the value doesn't reference the second vector.
+  // Note: a -1 (undef) first element wraps to a huge unsigned value and is
+  // rejected by the >= 16 check below.
+  unsigned ElementBase = N->getMaskElt(0);
+  
+  // FIXME: Handle UNDEF elements too!
+  if (ElementBase >= 16)
+    return false;
+
+  // Check that the indices are consecutive, in the case of a multi-byte element
+  // splatted with a v16i8 mask.
+  for (unsigned i = 1; i != EltSize; ++i)
+    if (N->getMaskElt(i) < 0 || N->getMaskElt(i) != (int)(i+ElementBase))
+      return false;
+
+  // Every subsequent element must repeat the first element's byte pattern
+  // (undef bytes are accepted as matching anything).
+  for (unsigned i = EltSize, e = 16; i != e; i += EltSize) {
+    if (N->getMaskElt(i) < 0) continue;
+    for (unsigned j = 0; j != EltSize; ++j)
+      if (N->getMaskElt(i+j) != N->getMaskElt(j))
+        return false;
+  }
+  return true;
+}
+
+/// isAllNegativeZeroVector - Returns true if all elements of build_vector
+/// are -0.0.
+bool PPC::isAllNegativeZeroVector(SDNode *N) {
+  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
+
+  APInt APVal, APUndef;
+  unsigned BitSize;
+  bool HasAnyUndefs;
+  
+  // isConstantSplat establishes all elements agree, so inspecting operand 0
+  // suffices to decide whether the splatted value is -0.0.
+  if (BV->isConstantSplat(APVal, APUndef, BitSize, HasAnyUndefs, 32, true))
+    if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+      return CFP->getValueAPF().isNegZero();
+
+  return false;
+}
+
+/// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+/// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  assert(isSplatShuffleMask(SVOp, EltSize));
+  // All elements reference the same source unit, so element 0's byte index
+  // divided by the unit size is the lane number to splat.
+  unsigned FirstByte = SVOp->getMaskElt(0);
+  return FirstByte / EltSize;
+}
+
+/// get_VSPLTI_elt - If this is a build_vector of constants which can be formed
+/// by using a vspltis[bhw] instruction of the specified element size, return
+/// the constant being splatted.  The ByteSize field indicates the number of
+/// bytes of each element [124] -> [bhw].
+SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
+  SDValue OpVal(0, 0);
+
+  // If ByteSize of the splat is bigger than the element size of the
+  // build_vector, then we have a case where we are checking for a splat where
+  // multiple elements of the buildvector are folded together into a single
+  // logical element of the splat (e.g. "vspltish 1" to splat {0,1}*8).
+  unsigned EltSize = 16/N->getNumOperands();
+  if (EltSize < ByteSize) {
+    unsigned Multiple = ByteSize/EltSize;   // Number of BV entries per spltval.
+    SDValue UniquedVals[4];
+    assert(Multiple > 1 && Multiple <= 4 && "How can this happen?");
+
+    // See if all of the elements in the buildvector agree across.
+    // UniquedVals[k] collects the value seen at position k of every
+    // Multiple-sized chunk; any disagreement means this is not a splat.
+    for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+      if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+      // If the element isn't a constant, bail fully out.
+      if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
+
+
+      if (UniquedVals[i&(Multiple-1)].getNode() == 0)
+        UniquedVals[i&(Multiple-1)] = N->getOperand(i);
+      else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
+        return SDValue();  // no match.
+    }
+
+    // Okay, if we reached this point, UniquedVals[0..Multiple-1] contains
+    // either constant or undef values that are identical for each chunk.  See
+    // if these chunks can form into a larger vspltis*.
+
+    // Check to see if all of the leading entries are either 0 or -1.  If
+    // neither, then this won't fit into the immediate field.
+    bool LeadingZero = true;
+    bool LeadingOnes = true;
+    for (unsigned i = 0; i != Multiple-1; ++i) {
+      if (UniquedVals[i].getNode() == 0) continue;  // Must have been undefs.
+
+      LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
+      LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
+    }
+    // Finally, check the least significant entry.
+    if (LeadingZero) {
+      if (UniquedVals[Multiple-1].getNode() == 0)
+        return DAG.getTargetConstant(0, MVT::i32);  // 0,0,0,undef
+      int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
+      if (Val < 16)
+        return DAG.getTargetConstant(Val, MVT::i32);  // 0,0,0,4 -> vspltisw(4)
+    }
+    if (LeadingOnes) {
+      if (UniquedVals[Multiple-1].getNode() == 0)
+        return DAG.getTargetConstant(~0U, MVT::i32);  // -1,-1,-1,undef
+      int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
+      if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
+        return DAG.getTargetConstant(Val, MVT::i32);
+    }
+
+    return SDValue();
+  }
+
+  // Check to see if this buildvec has a single non-undef value in its elements.
+  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (OpVal.getNode() == 0)
+      OpVal = N->getOperand(i);
+    else if (OpVal != N->getOperand(i))
+      return SDValue();
+  }
+
+  if (OpVal.getNode() == 0) return SDValue();  // All UNDEF: use implicit def.
+
+  unsigned ValSizeInBytes = EltSize;
+  uint64_t Value = 0;
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal)) {
+    Value = CN->getZExtValue();
+  } else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal)) {
+    assert(CN->getValueType(0) == MVT::f32 && "Only one legal FP vector type!");
+    Value = FloatToBits(CN->getValueAPF().convertToFloat());
+  }
+
+  // If the splat value is larger than the element value, then we can never do
+  // this splat.  The only case that we could fit the replicated bits into our
+  // immediate field for would be zero, and we prefer to use vxor for it.
+  if (ValSizeInBytes < ByteSize) return SDValue();
+
+  // If the element value is larger than the splat value, cut it in half and
+  // check to see if the two halves are equal.  Continue doing this until we
+  // get to ByteSize.  This allows us to handle 0x01010101 as 0x01.
+  // (ValSizeInBytes is halved before use, so the shifts below stay <= 16
+  // bits for the legal vector element sizes.)
+  while (ValSizeInBytes > ByteSize) {
+    ValSizeInBytes >>= 1;
+
+    // If the top half equals the bottom half, we're still ok.
+    if (((Value >> (ValSizeInBytes*8)) & ((1 << (8*ValSizeInBytes))-1)) !=
+         (Value                        & ((1 << (8*ValSizeInBytes))-1)))
+      return SDValue();
+  }
+
+  // Properly sign extend the value.
+  int ShAmt = (4-ByteSize)*8;
+  int MaskVal = ((int)Value << ShAmt) >> ShAmt;
+
+  // If this is zero, don't match, zero matches ISD::isBuildVectorAllZeros.
+  if (MaskVal == 0) return SDValue();
+
+  // Finally, if this value fits in a 5 bit sext field, return it
+  if (((MaskVal << (32-5)) >> (32-5)) == MaskVal)
+    return DAG.getTargetConstant(MaskVal, MVT::i32);
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing Mode Selection
+//===----------------------------------------------------------------------===//
+
+/// isIntS16Immediate - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 16-bit value.  If so, this returns true and the
+/// immediate.
+static bool isIntS16Immediate(SDNode *N, short &Imm) {
+  if (N->getOpcode() != ISD::Constant)
+    return false;
+
+  Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
+  if (N->getValueType(0) == MVT::i32)
+    return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue();
+  else
+    return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+}
/// isIntS16Immediate - SDValue convenience wrapper around the SDNode
/// overload above.
static bool isIntS16Immediate(SDValue Op, short &Imm) {
  return isIntS16Immediate(Op.getNode(), Imm);
}
+
+
/// SelectAddressRegReg - Given the specified addressed, check to see if it
/// can be represented as an indexed [r+r] operation.  Returns false if it
/// can be more efficiently represented with [r+imm].
bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
                                            SDValue &Index,
                                            SelectionDAG &DAG) const {
  short imm = 0;
  if (N.getOpcode() == ISD::ADD) {
    // An add of a signed 16-bit immediate is better folded as [r+imm].
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i
    // An add of Lo(symbol) is also matched by the [r+imm] selector.
    if (N.getOperand(1).getOpcode() == PPCISD::Lo)
      return false;    // r+i

    Base = N.getOperand(0);
    Index = N.getOperand(1);
    return true;
  } else if (N.getOpcode() == ISD::OR) {
    if (isIntS16Immediate(N.getOperand(1), imm))
      return false;    // r+i can fold it if we can.

    // If this is an or of disjoint bitfields, we can codegen this as an add
    // (for better address arithmetic) if the LHS and RHS of the OR are provably
    // disjoint.
    APInt LHSKnownZero, LHSKnownOne;
    APInt RHSKnownZero, RHSKnownOne;
    DAG.ComputeMaskedBits(N.getOperand(0),
                          APInt::getAllOnesValue(N.getOperand(0)
                            .getValueSizeInBits()),
                          LHSKnownZero, LHSKnownOne);

    // Only bother computing the RHS known bits if the LHS has any known
    // zeros at all.
    if (LHSKnownZero.getBoolValue()) {
      DAG.ComputeMaskedBits(N.getOperand(1),
                            APInt::getAllOnesValue(N.getOperand(1)
                              .getValueSizeInBits()),
                            RHSKnownZero, RHSKnownOne);
      // If all of the bits are known zero on the LHS or RHS, the add won't
      // carry.
      if (~(LHSKnownZero | RHSKnownZero) == 0) {
        Base = N.getOperand(0);
        Index = N.getOperand(1);
        return true;
      }
    }
  }

  return false;
}
+
/// Returns true if the address N can be represented by a base register plus
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg.
bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
                                            SDValue &Base,
                                            SelectionDAG &DAG) const {
  // FIXME dl should come from parent load or store, not from address
  DebugLoc dl = N.getDebugLoc();
  // If this can be more profitably realized as r+r, fail.  Disp/Base serve
  // only as scratch outputs here; they are overwritten below on success.
  if (SelectAddressRegReg(N, Disp, Base, DAG))
    return false;

  if (N.getOpcode() == ISD::ADD) {
    short imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm)) {
      // Keep only the low 16 bits of the sign-extended immediate.
      Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
      // Use a target frame index directly so the displacement can later be
      // combined with the frame-relative offset.
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      } else {
        Base = N.getOperand(0);
      }
      return true; // [r+i]
    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
      // Match LOAD (ADD (X, Lo(G))).
     assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
             && "Cannot handle constant offsets yet!");
      Disp = N.getOperand(1).getOperand(0);  // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      return true;  // [&g+r]
    }
  } else if (N.getOpcode() == ISD::OR) {
    short imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm)) {
      // If this is an or of disjoint bitfields, we can codegen this as an add
      // (for better address arithmetic) if the LHS and RHS of the OR are
      // provably disjoint.
      APInt LHSKnownZero, LHSKnownOne;
      DAG.ComputeMaskedBits(N.getOperand(0),
                            APInt::getAllOnesValue(N.getOperand(0)
                                                   .getValueSizeInBits()),
                            LHSKnownZero, LHSKnownOne);

      // Every bit set in the immediate must be known zero in the LHS for the
      // OR to behave like an ADD.
      if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
        Base = N.getOperand(0);
        Disp = DAG.getTargetConstant((int)imm & 0xFFFF, MVT::i32);
        return true;
      }
    }
  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    // Loading from a constant address.

    // If this address fits entirely in a 16-bit sext immediate field, codegen
    // this as "d, 0"
    short Imm;
    if (isIntS16Immediate(CN, Imm)) {
      Disp = DAG.getTargetConstant(Imm, CN->getValueType(0));
      Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
      return true;
    }

    // Handle 32-bit sext immediates with LIS + addr mode.
    if (CN->getValueType(0) == MVT::i32 ||
        (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
      int Addr = (int)CN->getZExtValue();

      // Otherwise, break this down into an LIS + disp.
      Disp = DAG.getTargetConstant((short)Addr, MVT::i32);

      // The high half must compensate for the sign extension of the low
      // 16-bit displacement, hence the subtraction before the shift.
      Base = DAG.getTargetConstant((Addr - (signed short)Addr) >> 16, MVT::i32);
      unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
      Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base), 0);
      return true;
    }
  }

  // Otherwise the address is a plain register (or frame index) with a zero
  // displacement.
  Disp = DAG.getTargetConstant(0, getPointerTy());
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
  else
    Base = N;
  return true;      // [r+0]
}
+
+/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
+/// represented as an indexed [r+r] operation.
+bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
+                                                SDValue &Index,
+                                                SelectionDAG &DAG) const {
+  // Check to see if we can easily represent this as an [r+r] address.  This
+  // will fail if it thinks that the address is more profitably represented as
+  // reg+imm, e.g. where imm = 0.
+  if (SelectAddressRegReg(N, Base, Index, DAG))
+    return true;
+
+  // If the operand is an addition, always emit this as [r+r], since this is
+  // better (for code size, and execution, as the memop does the add for free)
+  // than emitting an explicit add.
+  if (N.getOpcode() == ISD::ADD) {
+    Base = N.getOperand(0);
+    Index = N.getOperand(1);
+    return true;
+  }
+
+  // Otherwise, do it the hard way, using R0 as the base register.
+  Base = DAG.getRegister(PPC::R0, N.getValueType());
+  Index = N;
+  return true;
+}
+
/// SelectAddressRegImmShift - Returns true if the address N can be
/// represented by a base register plus a signed 14-bit displacement
/// [r+imm*4].  Suitable for use by STD and friends.
bool PPCTargetLowering::SelectAddressRegImmShift(SDValue N, SDValue &Disp,
                                                 SDValue &Base,
                                                 SelectionDAG &DAG) const {
  // FIXME dl should come from the parent load or store, not the address
  DebugLoc dl = N.getDebugLoc();
  // If this can be more profitably realized as r+r, fail.  Disp/Base serve
  // only as scratch outputs here; they are overwritten below on success.
  if (SelectAddressRegReg(N, Disp, Base, DAG))
    return false;

  if (N.getOpcode() == ISD::ADD) {
    short imm = 0;
    // The instruction encodes imm/4, so the low two bits must be clear.
    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
      Disp =  DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
      if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
        Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
      } else {
        Base = N.getOperand(0);
      }
      return true; // [r+i]
    } else if (N.getOperand(1).getOpcode() == PPCISD::Lo) {
      // Match LOAD (ADD (X, Lo(G))).
     assert(!cast<ConstantSDNode>(N.getOperand(1).getOperand(1))->getZExtValue()
             && "Cannot handle constant offsets yet!");
      Disp = N.getOperand(1).getOperand(0);  // The global address.
      assert(Disp.getOpcode() == ISD::TargetGlobalAddress ||
             Disp.getOpcode() == ISD::TargetConstantPool ||
             Disp.getOpcode() == ISD::TargetJumpTable);
      Base = N.getOperand(0);
      return true;  // [&g+r]
    }
  } else if (N.getOpcode() == ISD::OR) {
    short imm = 0;
    if (isIntS16Immediate(N.getOperand(1), imm) && (imm & 3) == 0) {
      // If this is an or of disjoint bitfields, we can codegen this as an add
      // (for better address arithmetic) if the LHS and RHS of the OR are
      // provably disjoint.
      APInt LHSKnownZero, LHSKnownOne;
      DAG.ComputeMaskedBits(N.getOperand(0),
                            APInt::getAllOnesValue(N.getOperand(0)
                                                   .getValueSizeInBits()),
                            LHSKnownZero, LHSKnownOne);
      // Every bit set in the immediate must be known zero in the LHS for the
      // OR to behave like an ADD.
      if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
        // If all of the bits are known zero on the LHS or RHS, the add won't
        // carry.
        Base = N.getOperand(0);
        Disp = DAG.getTargetConstant(((int)imm & 0xFFFF) >> 2, MVT::i32);
        return true;
      }
    }
  } else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
    // Loading from a constant address.  Verify low two bits are clear.
    if ((CN->getZExtValue() & 3) == 0) {
      // If this address fits entirely in a 14-bit sext immediate field, codegen
      // this as "d, 0"
      short Imm;
      if (isIntS16Immediate(CN, Imm)) {
        Disp = DAG.getTargetConstant((unsigned short)Imm >> 2, getPointerTy());
        Base = DAG.getRegister(PPC::R0, CN->getValueType(0));
        return true;
      }

      // Fold the low-part of 32-bit absolute addresses into addr mode.
      if (CN->getValueType(0) == MVT::i32 ||
          (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) {
        int Addr = (int)CN->getZExtValue();

        // Otherwise, break this down into an LIS + disp.  The high half must
        // compensate for the sign extension of the low 16 bits.
        Disp = DAG.getTargetConstant((short)Addr >> 2, MVT::i32);
        Base = DAG.getTargetConstant((Addr-(signed short)Addr) >> 16, MVT::i32);
        unsigned Opc = CN->getValueType(0) == MVT::i32 ? PPC::LIS : PPC::LIS8;
        Base = SDValue(DAG.getMachineNode(Opc, dl, CN->getValueType(0), Base),0);
        return true;
      }
    }
  }

  // Otherwise fall back to a plain register (or frame index) with a zero
  // displacement.
  Disp = DAG.getTargetConstant(0, getPointerTy());
  if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N))
    Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
  else
    Base = N;
  return true;      // [r+0]
}
+
+
+/// getPreIndexedAddressParts - returns true by value, base pointer and
+/// offset pointer and addressing mode by reference if the node's address
+/// can be legally represented as pre-indexed load / store address.
+bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                  SDValue &Offset,
+                                                  ISD::MemIndexedMode &AM,
+                                                  SelectionDAG &DAG) const {
+  // Disabled by default for now.
+  if (!EnablePPCPreinc) return false;
+
+  SDValue Ptr;
+  EVT VT;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    Ptr = LD->getBasePtr();
+    VT = LD->getMemoryVT();
+
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    ST = ST;
+    Ptr = ST->getBasePtr();
+    VT  = ST->getMemoryVT();
+  } else
+    return false;
+
+  // PowerPC doesn't have preinc load/store instructions for vectors.
+  if (VT.isVector())
+    return false;
+
+  // TODO: Check reg+reg first.
+
+  // LDU/STU use reg+imm*4, others use reg+imm.
+  if (VT != MVT::i64) {
+    // reg + imm
+    if (!SelectAddressRegImm(Ptr, Offset, Base, DAG))
+      return false;
+  } else {
+    // reg + imm * 4.
+    if (!SelectAddressRegImmShift(Ptr, Offset, Base, DAG))
+      return false;
+  }
+
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    // PPC64 doesn't have lwau, but it does have lwaux.  Reject preinc load of
+    // sext i32 to i64 when addr mode is r+i.
+    if (LD->getValueType(0) == MVT::i64 && LD->getMemoryVT() == MVT::i32 &&
+        LD->getExtensionType() == ISD::SEXTLOAD &&
+        isa<ConstantSDNode>(Offset))
+      return false;
+  }
+
+  AM = ISD::PRE_INC;
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//  LowerOperation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
+                                             SelectionDAG &DAG) {
+  EVT PtrVT = Op.getValueType();
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  Constant *C = CP->getConstVal();
+  SDValue CPI = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment());
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  // FIXME there isn't really any debug info here
+  DebugLoc dl = Op.getDebugLoc();
+
+  const TargetMachine &TM = DAG.getTarget();
+
+  SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, CPI, Zero);
+  SDValue Lo = DAG.getNode(PPCISD::Lo, dl, PtrVT, CPI, Zero);
+
+  // If this is a non-darwin platform, we don't support non-static relo models
+  // yet.
+  if (TM.getRelocationModel() == Reloc::Static ||
+      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    // Generate non-pic code that has direct accesses to the constant pool.
+    // The address of the global is just (hi(&g)+lo(&g)).
+    return DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+  }
+
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // With PIC, the first instruction is actually "GR+hi(&G)".
+    Hi = DAG.getNode(ISD::ADD, dl, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg,
+                                 DebugLoc::getUnknownLoc(), PtrVT), Hi);
+  }
+
+  Lo = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+  return Lo;
+}
+
+SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+  EVT PtrVT = Op.getValueType();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PtrVT);
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  // FIXME there isn't really any debug loc here
+  DebugLoc dl = Op.getDebugLoc();
+
+  const TargetMachine &TM = DAG.getTarget();
+
+  SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, JTI, Zero);
+  SDValue Lo = DAG.getNode(PPCISD::Lo, dl, PtrVT, JTI, Zero);
+
+  // If this is a non-darwin platform, we don't support non-static relo models
+  // yet.
+  if (TM.getRelocationModel() == Reloc::Static ||
+      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    // Generate non-pic code that has direct accesses to the constant pool.
+    // The address of the global is just (hi(&g)+lo(&g)).
+    return DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+  }
+
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // With PIC, the first instruction is actually "GR+hi(&G)".
+    Hi = DAG.getNode(ISD::ADD, dl, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg,
+                                 DebugLoc::getUnknownLoc(), PtrVT), Hi);
+  }
+
+  Lo = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
+  return Lo;
+}
+
/// LowerGlobalTLSAddress - Thread-local storage is not implemented for PPC;
/// reaching this lowering aborts compilation.
SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                   SelectionDAG &DAG) {
  llvm_unreachable("TLS not implemented for PPC.");
  return SDValue(); // Not reached
}
+
+SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
+  EVT PtrVT = Op.getValueType();
+  DebugLoc DL = Op.getDebugLoc();
+
+  BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  SDValue TgtBA = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true);
+  SDValue Zero = DAG.getConstant(0, PtrVT);
+  SDValue Hi = DAG.getNode(PPCISD::Hi, DL, PtrVT, TgtBA, Zero);
+  SDValue Lo = DAG.getNode(PPCISD::Lo, DL, PtrVT, TgtBA, Zero);
+
+  // If this is a non-darwin platform, we don't support non-static relo models
+  // yet.
+  const TargetMachine &TM = DAG.getTarget();
+  if (TM.getRelocationModel() == Reloc::Static ||
+      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
+    // Generate non-pic code that has direct accesses to globals.
+    // The address of the global is just (hi(&g)+lo(&g)).
+    return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+  }
+
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // With PIC, the first instruction is actually "GR+hi(&G)".
+    Hi = DAG.getNode(ISD::ADD, DL, PtrVT,
+                     DAG.getNode(PPCISD::GlobalBaseReg,
+                                 DebugLoc::getUnknownLoc(), PtrVT), Hi);
+  }
+
+  return DAG.getNode(ISD::ADD, DL, PtrVT, Hi, Lo);
+}
+
/// LowerGlobalAddress - Lower a global address either through the TOC
/// (64-bit SVR4), as a direct hi/lo pair, or via a lazy-resolver stub load.
SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
                                              SelectionDAG &DAG) {
  EVT PtrVT = Op.getValueType();
  GlobalAddressSDNode *GSDN = cast<GlobalAddressSDNode>(Op);
  GlobalValue *GV = GSDN->getGlobal();
  SDValue GA = DAG.getTargetGlobalAddress(GV, PtrVT, GSDN->getOffset());
  SDValue Zero = DAG.getConstant(0, PtrVT);
  // FIXME there isn't really any debug info here
  DebugLoc dl = GSDN->getDebugLoc();

  const TargetMachine &TM = DAG.getTarget();

  // 64-bit SVR4 ABI code is always position-independent.
  // The actual address of the GlobalValue is stored in the TOC.
  if (PPCSubTarget.isSVR4ABI() && PPCSubTarget.isPPC64()) {
    return DAG.getNode(PPCISD::TOC_ENTRY, dl, MVT::i64, GA,
                       DAG.getRegister(PPC::X2, MVT::i64));
  }

  // Split the address into its high and low halves.
  SDValue Hi = DAG.getNode(PPCISD::Hi, dl, PtrVT, GA, Zero);
  SDValue Lo = DAG.getNode(PPCISD::Lo, dl, PtrVT, GA, Zero);

  // If this is a non-darwin platform, we don't support non-static relo models
  // yet.
  if (TM.getRelocationModel() == Reloc::Static ||
      !TM.getSubtarget<PPCSubtarget>().isDarwin()) {
    // Generate non-pic code that has direct accesses to globals.
    // The address of the global is just (hi(&g)+lo(&g)).
    return DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);
  }

  if (TM.getRelocationModel() == Reloc::PIC_) {
    // With PIC, the first instruction is actually "GR+hi(&G)".
    Hi = DAG.getNode(ISD::ADD, dl, PtrVT,
                     DAG.getNode(PPCISD::GlobalBaseReg,
                                 DebugLoc::getUnknownLoc(), PtrVT), Hi);
  }

  Lo = DAG.getNode(ISD::ADD, dl, PtrVT, Hi, Lo);

  // No lazy-resolver stub needed: the computed address is the final address.
  if (!TM.getSubtarget<PPCSubtarget>().hasLazyResolverStub(GV, TM))
    return Lo;

  // If the global is weak or external, we have to go through the lazy
  // resolution stub.
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Lo, NULL, 0);
}
+
/// LowerSETCC - Custom-lower SETCC, exposing ctlz/srl and xor forms that the
/// DAG combiner can optimize further.
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
  DebugLoc dl = Op.getDebugLoc();

  // If we're comparing for equality to zero, expose the fact that this is
  // implemented as a ctlz/srl pair on ppc, so that the dag combiner can
  // fold the new nodes.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
    if (C->isNullValue() && CC == ISD::SETEQ) {
      EVT VT = Op.getOperand(0).getValueType();
      SDValue Zext = Op.getOperand(0);
      // Widen sub-i32 operands with zero extension first.
      if (VT.bitsLT(MVT::i32)) {
        VT = MVT::i32;
        Zext = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Op.getOperand(0));
      }
      // (x == 0) computed as ctlz(x) >> log2(bitwidth): the shifted result
      // is 1 only when every bit of x is a leading zero.
      unsigned Log2b = Log2_32(VT.getSizeInBits());
      SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Zext);
      SDValue Scc = DAG.getNode(ISD::SRL, dl, VT, Clz,
                                DAG.getConstant(Log2b, MVT::i32));
      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Scc);
    }
    // Leave comparisons against 0 and -1 alone for now, since they're usually
    // optimized.  FIXME: revisit this when we can custom lower all setcc
    // optimizations.
    if (C->isAllOnesValue() || C->isNullValue())
      return SDValue();
  }

  // If we have an integer seteq/setne, turn it into a compare against zero
  // by xor'ing the rhs with the lhs, which is faster than setting a
  // condition register, reading it back out, and masking the correct bit.  The
  // normal approach here uses sub to do this instead of xor.  Using xor exposes
  // the result to other bit-twiddling opportunities.
  EVT LHSVT = Op.getOperand(0).getValueType();
  if (LHSVT.isInteger() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    EVT VT = Op.getValueType();
    SDValue Sub = DAG.getNode(ISD::XOR, dl, LHSVT, Op.getOperand(0),
                                Op.getOperand(1));
    return DAG.getSetCC(dl, VT, Sub, DAG.getConstant(0, LHSVT), CC);
  }
  return SDValue();
}
+
/// LowerVAARG - Not yet implemented for the SVR4 ABI; reaching this lowering
/// aborts compilation.
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG,
                              int VarArgsFrameIndex,
                              int VarArgsStackOffset,
                              unsigned VarArgsNumGPR,
                              unsigned VarArgsNumFPR,
                              const PPCSubtarget &Subtarget) {

  llvm_unreachable("VAARG not yet implemented for the SVR4 ABI!");
  return SDValue(); // Not reached
}
+
+SDValue PPCTargetLowering::LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
+  SDValue Chain = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDValue FPtr = Op.getOperand(2); // nested function
+  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = (PtrVT == MVT::i64);
+  const Type *IntPtrTy =
+    DAG.getTargetLoweringInfo().getTargetData()->getIntPtrType(
+                                                             *DAG.getContext());
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+
+  Entry.Ty = IntPtrTy;
+  Entry.Node = Trmp; Args.push_back(Entry);
+
+  // TrampSize == (isPPC64 ? 48 : 40);
+  Entry.Node = DAG.getConstant(isPPC64 ? 48 : 40,
+                               isPPC64 ? MVT::i64 : MVT::i32);
+  Args.push_back(Entry);
+
+  Entry.Node = FPtr; Args.push_back(Entry);
+  Entry.Node = Nest; Args.push_back(Entry);
+
+  // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
+  std::pair<SDValue, SDValue> CallResult =
+    LowerCallTo(Chain, Op.getValueType().getTypeForEVT(*DAG.getContext()),
+                false, false, false, false, 0, CallingConv::C, false,
+                /*isReturnValueUsed=*/true,
+                DAG.getExternalSymbol("__trampoline_setup", PtrVT),
+                Args, DAG, dl, DAG.GetOrdering(Chain.getNode()));
+
+  SDValue Ops[] =
+    { CallResult.first, CallResult.second };
+
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
/// LowerVASTART - Initialize the va_list: a single frame-index store for the
/// Darwin and 64-bit ABIs, or the four-field SVR4 va_list struct otherwise.
SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG,
                                        int VarArgsFrameIndex,
                                        int VarArgsStackOffset,
                                        unsigned VarArgsNumGPR,
                                        unsigned VarArgsNumFPR,
                                        const PPCSubtarget &Subtarget) {
  DebugLoc dl = Op.getDebugLoc();

  if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);
    const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
  }

  // For the 32-bit SVR4 ABI we follow the layout of the va_list struct.
  // We suppose the given va_list is already allocated.
  //
  // typedef struct {
  //  char gpr;     /* index into the array of 8 GPRs
  //                 * stored in the register save area
  //                 * gpr=0 corresponds to r3,
  //                 * gpr=1 to r4, etc.
  //                 */
  //  char fpr;     /* index into the array of 8 FPRs
  //                 * stored in the register save area
  //                 * fpr=0 corresponds to f1,
  //                 * fpr=1 to f2, etc.
  //                 */
  //  char *overflow_arg_area;
  //                /* location on stack that holds
  //                 * the next overflow argument
  //                 */
  //  char *reg_save_area;
  //               /* where r3:r10 and f1:f8 (if saved)
  //                * are stored
  //                */
  // } va_list[1];


  // The current GPR/FPR argument counts go into the first two bytes.
  SDValue ArgGPR = DAG.getConstant(VarArgsNumGPR, MVT::i32);
  SDValue ArgFPR = DAG.getConstant(VarArgsNumFPR, MVT::i32);


  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  SDValue StackOffsetFI = DAG.getFrameIndex(VarArgsStackOffset, PtrVT);
  SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);

  // Distance from overflow_arg_area to reg_save_area: one pointer.
  uint64_t FrameOffset = PtrVT.getSizeInBits()/8;
  SDValue ConstFrameOffset = DAG.getConstant(FrameOffset, PtrVT);

  // Distance from the fpr byte (offset 1) to overflow_arg_area: a pointer
  // size minus the one byte already advanced, covering the padding.
  uint64_t StackOffset = PtrVT.getSizeInBits()/8 - 1;
  SDValue ConstStackOffset = DAG.getConstant(StackOffset, PtrVT);

  // Distance from the gpr byte to the fpr byte.
  uint64_t FPROffset = 1;
  SDValue ConstFPROffset = DAG.getConstant(FPROffset, PtrVT);

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();

  // Store first byte : number of int regs
  SDValue firstStore = DAG.getTruncStore(Op.getOperand(0), dl, ArgGPR,
                                         Op.getOperand(1), SV, 0, MVT::i8);
  uint64_t nextOffset = FPROffset;
  SDValue nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, Op.getOperand(1),
                                  ConstFPROffset);

  // Store second byte : number of float regs
  SDValue secondStore =
    DAG.getTruncStore(firstStore, dl, ArgFPR, nextPtr, SV, nextOffset, MVT::i8);
  nextOffset += StackOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstStackOffset);

  // Store second word : arguments given on stack
  SDValue thirdStore =
    DAG.getStore(secondStore, dl, StackOffsetFI, nextPtr, SV, nextOffset);
  nextOffset += FrameOffset;
  nextPtr = DAG.getNode(ISD::ADD, dl, PtrVT, nextPtr, ConstFrameOffset);

  // Store third word : arguments given in registers
  return DAG.getStore(thirdStore, dl, FR, nextPtr, SV, nextOffset);

}
+
+#include "PPCGenCallingConv.inc"
+
/// CC_PPC_SVR4_Custom_Dummy - Custom calling-convention handler that performs
/// no allocation at all; it simply reports the value as handled.
static bool CC_PPC_SVR4_Custom_Dummy(unsigned &ValNo, EVT &ValVT, EVT &LocVT,
                                     CCValAssign::LocInfo &LocInfo,
                                     ISD::ArgFlagsTy &ArgFlags,
                                     CCState &State) {
  return true;
}
+
+static bool CC_PPC_SVR4_Custom_AlignArgRegs(unsigned &ValNo, EVT &ValVT,
+                                            EVT &LocVT,
+                                            CCValAssign::LocInfo &LocInfo,
+                                            ISD::ArgFlagsTy &ArgFlags,
+                                            CCState &State) {
+  static const unsigned ArgRegs[] = {
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+  
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // Skip one register if the first unallocated register has an even register
+  // number and there are still argument registers available which have not been
+  // allocated yet. RegNum is actually an index into ArgRegs, which means we
+  // need to skip a register if RegNum is odd.
+  if (RegNum != NumArgRegs && RegNum % 2 == 1) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+  
+  // Always return false here, as this function only makes sure that the first
+  // unallocated register has an odd register number and does not actually
+  // allocate a register for the current argument.
+  return false;
+}
+
+static bool CC_PPC_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, EVT &ValVT,
+                                              EVT &LocVT,
+                                              CCValAssign::LocInfo &LocInfo,
+                                              ISD::ArgFlagsTy &ArgFlags,
+                                              CCState &State) {
+  static const unsigned ArgRegs[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8
+  };
+
+  const unsigned NumArgRegs = array_lengthof(ArgRegs);
+  
+  unsigned RegNum = State.getFirstUnallocated(ArgRegs, NumArgRegs);
+
+  // If there is only one Floating-point register left we need to put both f64
+  // values of a split ppc_fp128 value on the stack.
+  if (RegNum != NumArgRegs && ArgRegs[RegNum] == PPC::F8) {
+    State.AllocateReg(ArgRegs[RegNum]);
+  }
+  
+  // Always return false here, as this function only makes sure that the two f64
+  // values a ppc_fp128 value is split into are both passed in registers or both
+  // passed on the stack and does not actually allocate a register for the
+  // current argument.
+  return false;
+}
+
+/// GetFPR - Get the set of FP registers that should be allocated for arguments,
+/// on Darwin.
+static const unsigned *GetFPR() {
+  static const unsigned FPR[] = {
+    PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
+    PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
+  };
+
+  return FPR;
+}
+
+/// CalculateStackSlotSize - Calculates the size reserved for this argument on
+/// the stack.
+static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
+                                       unsigned PtrByteSize) {
+  unsigned ArgSize = ArgVT.getSizeInBits()/8;
+  if (Flags.isByVal())
+    ArgSize = Flags.getByValSize();
+  ArgSize = ((ArgSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
+
+  return ArgSize;
+}
+
+SDValue
+PPCTargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv, bool isVarArg,
+                                        const SmallVectorImpl<ISD::InputArg>
+                                          &Ins,
+                                        DebugLoc dl, SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals) {
+  if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) {
+    return LowerFormalArguments_SVR4(Chain, CallConv, isVarArg, Ins,
+                                     dl, DAG, InVals);
+  } else {
+    return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
+                                       dl, DAG, InVals);
+  }
+}
+
/// LowerFormalArguments_SVR4 - Lower incoming (formal) arguments for the
/// 32-bit SVR4 ABI.  Arguments that CC_PPC_SVR4 assigns to registers become
/// CopyFromReg nodes from fresh virtual registers; arguments assigned to
/// memory become loads from fixed stack objects.  By-value aggregates are
/// located by a second CCState pass (CC_PPC_SVR4_ByVal).  For varargs
/// functions, the still-unallocated GPR/FPR argument registers are spilled
/// to a stack area so llvm.va_start can index over them.
///
/// Returns the (possibly updated) chain; one SDValue per incoming argument
/// is appended to InVals.
SDValue
PPCTargetLowering::LowerFormalArguments_SVR4(
                                      SDValue Chain,
                                      CallingConv::ID CallConv, bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg>
                                        &Ins,
                                      DebugLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) {

  // 32-bit SVR4 ABI Stack Frame Layout:
  //              +-----------------------------------+
  //        +-->  |            Back chain             |
  //        |     +-----------------------------------+
  //        |     | Floating-point register save area |
  //        |     +-----------------------------------+
  //        |     |    General register save area     |
  //        |     +-----------------------------------+
  //        |     |          CR save word             |
  //        |     +-----------------------------------+
  //        |     |         VRSAVE save word          |
  //        |     +-----------------------------------+
  //        |     |         Alignment padding         |
  //        |     +-----------------------------------+
  //        |     |     Vector register save area     |
  //        |     +-----------------------------------+
  //        |     |       Local variable space        |
  //        |     +-----------------------------------+
  //        |     |        Parameter list area        |
  //        |     +-----------------------------------+
  //        |     |           LR save word            |
  //        |     +-----------------------------------+
  // SP-->  +---  |            Back chain             |
  //              +-----------------------------------+
  //
  // Specifications:
  //   System V Application Binary Interface PowerPC Processor Supplement
  //   AltiVec Technology Programming Interface Manual

  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(GuaranteedTailCallOpt && (CallConv==CallingConv::Fast));
  unsigned PtrByteSize = 4; // 32-bit SVR4: pointers are 4 bytes.

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), ArgLocs,
                 *DAG.getContext());

  // Reserve space for the linkage area on the stack.
  CCInfo.AllocateStack(PPCFrameInfo::getLinkageSize(false, false), PtrByteSize);

  CCInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4);

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];

    // Arguments stored in registers.
    if (VA.isRegLoc()) {
      TargetRegisterClass *RC;
      EVT ValVT = VA.getValVT();

      // Pick the register class matching the argument's value type.
      switch (ValVT.getSimpleVT().SimpleTy) {
        default:
          llvm_unreachable("ValVT not supported by formal arguments Lowering");
        case MVT::i32:
          RC = PPC::GPRCRegisterClass;
          break;
        case MVT::f32:
          RC = PPC::F4RCRegisterClass;
          break;
        case MVT::f64:
          RC = PPC::F8RCRegisterClass;
          break;
        case MVT::v16i8:
        case MVT::v8i16:
        case MVT::v4i32:
        case MVT::v4f32:
          RC = PPC::VRRCRegisterClass;
          break;
      }

      // Transform the arguments stored in physical registers into virtual ones.
      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, ValVT);

      InVals.push_back(ArgValue);
    } else {
      // Argument stored in memory.
      assert(VA.isMemLoc());

      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
      int FI = MFI->CreateFixedObject(ArgSize, VA.getLocMemOffset(),
                                      isImmutable, false);

      // Create load nodes to retrieve arguments from the stack.
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN, NULL, 0));
    }
  }

  // Assign locations to all of the incoming aggregate by value arguments.
  // Aggregates passed by value are stored in the local variable space of the
  // caller's stack frame, right above the parameter list area.
  SmallVector<CCValAssign, 16> ByValArgLocs;
  CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(),
                      ByValArgLocs, *DAG.getContext());

  // Reserve stack space for the allocations in CCInfo.
  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);

  CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC_SVR4_ByVal);

  // Area that is at least reserved in the caller of this function.
  unsigned MinReservedArea = CCByValInfo.getNextStackOffset();

  // Set the size that is at least reserved in caller of this function.  Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();

  MinReservedArea =
    std::max(MinReservedArea,
             PPCFrameInfo::getMinCallFrameSize(false, false));

  // Round the reserved area up to the target stack alignment.
  unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameInfo()->
    getStackAlignment();
  unsigned AlignMask = TargetAlign-1;
  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;

  FI->setMinReservedArea(MinReservedArea);

  SmallVector<SDValue, 8> MemOps;

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    static const unsigned GPArgRegs[] = {
      PPC::R3, PPC::R4, PPC::R5, PPC::R6,
      PPC::R7, PPC::R8, PPC::R9, PPC::R10,
    };
    const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);

    static const unsigned FPArgRegs[] = {
      PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
      PPC::F8
    };
    const unsigned NumFPArgRegs = array_lengthof(FPArgRegs);

    // Record how many GPR/FPR argument registers were consumed by the fixed
    // (named) arguments; the rest must be spilled for va_arg.
    VarArgsNumGPR = CCInfo.getFirstUnallocated(GPArgRegs, NumGPArgRegs);
    VarArgsNumFPR = CCInfo.getFirstUnallocated(FPArgRegs, NumFPArgRegs);

    // Make room for NumGPArgRegs and NumFPArgRegs.
    int Depth = NumGPArgRegs * PtrVT.getSizeInBits()/8 +
                NumFPArgRegs * EVT(MVT::f64).getSizeInBits()/8;

    VarArgsStackOffset = MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
                                                CCInfo.getNextStackOffset(),
                                                true, false);

    VarArgsFrameIndex = MFI->CreateStackObject(Depth, 8, false);
    SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);

    // The fixed integer arguments of a variadic function are
    // stored to the VarArgsFrameIndex on the stack.
    unsigned GPRIndex = 0;
    for (; GPRIndex != VarArgsNumGPR; ++GPRIndex) {
      SDValue Val = DAG.getRegister(GPArgRegs[GPRIndex], PtrVT);
      SDValue Store = DAG.getStore(Chain, dl, Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by deferencing the
    // result of va_next.
    for (; GPRIndex != NumGPArgRegs; ++GPRIndex) {
      unsigned VReg = MF.addLiveIn(GPArgRegs[GPRIndex], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }

    // FIXME 32-bit SVR4: We only need to save FP argument registers if CR bit 6
    // is set.

    // The double arguments are stored to the VarArgsFrameIndex
    // on the stack.
    unsigned FPRIndex = 0;
    for (FPRIndex = 0; FPRIndex != VarArgsNumFPR; ++FPRIndex) {
      SDValue Val = DAG.getRegister(FPArgRegs[FPRIndex], MVT::f64);
      SDValue Store = DAG.getStore(Chain, dl, Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store
      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
                                         PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }

    // Spill the remaining (unnamed) FP argument registers as well.
    for (; FPRIndex != NumFPArgRegs; ++FPRIndex) {
      unsigned VReg = MF.addLiveIn(FPArgRegs[FPRIndex], &PPC::F8RCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::f64);
      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Increment the address by eight for the next argument to store
      SDValue PtrOff = DAG.getConstant(EVT(MVT::f64).getSizeInBits()/8,
                                         PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  // Merge all the register-spill stores into the chain.
  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl,
                        MVT::Other, &MemOps[0], MemOps.size());

  return Chain;
}
+
/// LowerFormalArguments_Darwin - Lower incoming (formal) arguments for the
/// Darwin ABI.  Walks the argument list in order, tracking GPR/FPR/VR
/// register indices and the running stack offset; arguments that fit in
/// registers become CopyFromReg nodes, the rest become loads from fixed
/// stack objects.  Also computes the minimum reserved parameter area and,
/// for varargs functions, spills the remaining GPR argument registers for
/// llvm.va_start.  Returns the updated chain; one SDValue per incoming
/// argument is appended to InVals.
SDValue
PPCTargetLowering::LowerFormalArguments_Darwin(
                                      SDValue Chain,
                                      CallingConv::ID CallConv, bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg>
                                        &Ins,
                                      DebugLoc dl, SelectionDAG &DAG,
                                      SmallVectorImpl<SDValue> &InVals) {
  // TODO: add description of PPC stack frame format, or at least some docs.
  //
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo *MFI = MF.getFrameInfo();

  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
  bool isPPC64 = PtrVT == MVT::i64;
  // Potential tail calls could cause overwriting of argument stack slots.
  bool isImmutable = !(GuaranteedTailCallOpt && (CallConv==CallingConv::Fast));
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  // Arguments begin immediately after the linkage area.
  unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, true);
  // Area that is at least reserved in caller of this function.
  unsigned MinReservedArea = ArgOffset;

  static const unsigned GPR_32[] = {           // 32-bit registers.
    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
  };
  static const unsigned GPR_64[] = {           // 64-bit registers.
    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
  };

  static const unsigned *FPR = GetFPR();

  static const unsigned VR[] = {
    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
  };

  const unsigned Num_GPR_Regs = array_lengthof(GPR_32);
  const unsigned Num_FPR_Regs = 13;
  const unsigned Num_VR_Regs  = array_lengthof( VR);

  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;

  const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;

  // In 32-bit non-varargs functions, the stack space for vectors is after the
  // stack space for non-vectors.  We do not use this space unless we have
  // too many vectors to fit in registers, something that only occurs in
  // constructed examples:), but we have to walk the arglist to figure
  // that out...for the pathological case, compute VecArgOffset as the
  // start of the vector parameter area.  Computing VecArgOffset is the
  // entire point of the following loop.
  unsigned VecArgOffset = ArgOffset;
  if (!isVarArg && !isPPC64) {
    for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e;
         ++ArgNo) {
      EVT ObjectVT = Ins[ArgNo].VT;
      unsigned ObjSize = ObjectVT.getSizeInBits()/8;
      ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;

      if (Flags.isByVal()) {
        // ObjSize is the true size, ArgSize rounded up to multiple of regs.
        ObjSize = Flags.getByValSize();
        unsigned ArgSize =
                ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
        VecArgOffset += ArgSize;
        continue;
      }

      switch(ObjectVT.getSimpleVT().SimpleTy) {
      default: llvm_unreachable("Unhandled argument type!");
      case MVT::i32:
      case MVT::f32:
        VecArgOffset += isPPC64 ? 8 : 4;
        break;
      case MVT::i64:  // PPC64
      case MVT::f64:
        VecArgOffset += 8;
        break;
      case MVT::v4f32:
      case MVT::v4i32:
      case MVT::v8i16:
      case MVT::v16i8:
        // Nothing to do, we're only looking at Nonvector args here.
        break;
      }
    }
  }
  // We've found where the vector parameter area in memory is.  Skip the
  // first 12 parameters; these don't use that memory.
  VecArgOffset = ((VecArgOffset+15)/16)*16;
  VecArgOffset += 12*16;

  // Add DAG nodes to load the arguments or copy them out of registers.  On
  // entry to a function on PPC, the arguments start after the linkage area,
  // although the first ones are often in registers.

  SmallVector<SDValue, 8> MemOps;
  unsigned nAltivecParamsAtEnd = 0;
  for (unsigned ArgNo = 0, e = Ins.size(); ArgNo != e; ++ArgNo) {
    SDValue ArgVal;
    bool needsLoad = false;
    EVT ObjectVT = Ins[ArgNo].VT;
    unsigned ObjSize = ObjectVT.getSizeInBits()/8;
    unsigned ArgSize = ObjSize;
    ISD::ArgFlagsTy Flags = Ins[ArgNo].Flags;

    unsigned CurArgOffset = ArgOffset;

    // Varargs or 64 bit Altivec parameters are padded to a 16 byte boundary.
    if (ObjectVT==MVT::v4f32 || ObjectVT==MVT::v4i32 ||
        ObjectVT==MVT::v8i16 || ObjectVT==MVT::v16i8) {
      if (isVarArg || isPPC64) {
        MinReservedArea = ((MinReservedArea+15)/16)*16;
        MinReservedArea += CalculateStackSlotSize(ObjectVT,
                                                  Flags,
                                                  PtrByteSize);
      } else  nAltivecParamsAtEnd++;
    } else
      // Calculate min reserved area.
      MinReservedArea += CalculateStackSlotSize(Ins[ArgNo].VT,
                                                Flags,
                                                PtrByteSize);

    // FIXME the codegen can be much improved in some cases.
    // We do not have to keep everything in memory.
    if (Flags.isByVal()) {
      // ObjSize is the true size, ArgSize rounded up to multiple of registers.
      ObjSize = Flags.getByValSize();
      ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
      // Objects of size 1 and 2 are right justified, everything else is
      // left justified.  This means the memory address is adjusted forwards.
      if (ObjSize==1 || ObjSize==2) {
        CurArgOffset = CurArgOffset + (4 - ObjSize);
      }
      // The value of the object is its address.
      int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true, false);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      InVals.push_back(FIN);
      if (ObjSize==1 || ObjSize==2) {
        // Small by-value objects arriving in a GPR are stored out with a
        // truncating store of the low 1 or 2 bytes.
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
                               NULL, 0, ObjSize==1 ? MVT::i8 : MVT::i16 );
          MemOps.push_back(Store);
          ++GPR_idx;
        }

        ArgOffset += PtrByteSize;

        continue;
      }
      for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
        // Store whatever pieces of the object are in registers
        // to memory.  ArgVal will be address of the beginning of
        // the object.
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true, false);
          SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
          SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
          MemOps.push_back(Store);
          ++GPR_idx;
          ArgOffset += PtrByteSize;
        } else {
          // Out of GPRs: the rest of the object is already in memory.
          ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
          break;
        }
      }
      continue;
    }

    switch (ObjectVT.getSimpleVT().SimpleTy) {
    default: llvm_unreachable("Unhandled argument type!");
    case MVT::i32:
      if (!isPPC64) {
        if (GPR_idx != Num_GPR_Regs) {
          unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
          ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
          ++GPR_idx;
        } else {
          needsLoad = true;
          ArgSize = PtrByteSize;
        }
        // All int arguments reserve stack space in the Darwin ABI.
        ArgOffset += PtrByteSize;
        break;
      }
      // FALLTHROUGH
    case MVT::i64:  // PPC64
      if (GPR_idx != Num_GPR_Regs) {
        unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);

        if (ObjectVT == MVT::i32) {
          // PPC64 passes i8, i16, and i32 values in i64 registers. Promote
          // value to MVT::i64 and then truncate to the correct register size.
          if (Flags.isSExt())
            ArgVal = DAG.getNode(ISD::AssertSext, dl, MVT::i64, ArgVal,
                                 DAG.getValueType(ObjectVT));
          else if (Flags.isZExt())
            ArgVal = DAG.getNode(ISD::AssertZext, dl, MVT::i64, ArgVal,
                                 DAG.getValueType(ObjectVT));

          ArgVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, ArgVal);
        }

        ++GPR_idx;
      } else {
        needsLoad = true;
        ArgSize = PtrByteSize;
      }
      // All int arguments reserve stack space in the Darwin ABI.
      ArgOffset += 8;
      break;

    case MVT::f32:
    case MVT::f64:
      // Every 4 bytes of argument space consumes one of the GPRs available for
      // argument passing.
      if (GPR_idx != Num_GPR_Regs) {
        ++GPR_idx;
        if (ObjSize == 8 && GPR_idx != Num_GPR_Regs && !isPPC64)
          ++GPR_idx;
      }
      if (FPR_idx != Num_FPR_Regs) {
        unsigned VReg;

        if (ObjectVT == MVT::f32)
          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F4RCRegClass);
        else
          VReg = MF.addLiveIn(FPR[FPR_idx], &PPC::F8RCRegClass);

        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        ++FPR_idx;
      } else {
        needsLoad = true;
      }

      // All FP arguments reserve stack space in the Darwin ABI.
      ArgOffset += isPPC64 ? 8 : ObjSize;
      break;
    case MVT::v4f32:
    case MVT::v4i32:
    case MVT::v8i16:
    case MVT::v16i8:
      // Note that vector arguments in registers don't reserve stack space,
      // except in varargs functions.
      if (VR_idx != Num_VR_Regs) {
        unsigned VReg = MF.addLiveIn(VR[VR_idx], &PPC::VRRCRegClass);
        ArgVal = DAG.getCopyFromReg(Chain, dl, VReg, ObjectVT);
        if (isVarArg) {
          // Align the offset to 16 bytes, consuming GPRs as we pass them.
          while ((ArgOffset % 16) != 0) {
            ArgOffset += PtrByteSize;
            if (GPR_idx != Num_GPR_Regs)
              GPR_idx++;
          }
          ArgOffset += 16;
          GPR_idx = std::min(GPR_idx+4, Num_GPR_Regs); // FIXME correct for ppc64?
        }
        ++VR_idx;
      } else {
        if (!isVarArg && !isPPC64) {
          // Vectors go after all the nonvectors.
          CurArgOffset = VecArgOffset;
          VecArgOffset += 16;
        } else {
          // Vectors are aligned.
          ArgOffset = ((ArgOffset+15)/16)*16;
          CurArgOffset = ArgOffset;
          ArgOffset += 16;
        }
        needsLoad = true;
      }
      break;
    }

    // We need to load the argument to a virtual register if we determined above
    // that we ran out of physical registers of the appropriate type.
    if (needsLoad) {
      int FI = MFI->CreateFixedObject(ObjSize,
                                      CurArgOffset + (ArgSize - ObjSize),
                                      isImmutable, false);
      SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
      ArgVal = DAG.getLoad(ObjectVT, dl, Chain, FIN, NULL, 0);
    }

    InVals.push_back(ArgVal);
  }

  // Set the size that is at least reserved in caller of this function.  Tail
  // call optimized function's reserved stack space needs to be aligned so that
  // taking the difference between two stack areas will result in an aligned
  // stack.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  // Add the Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    MinReservedArea = ((MinReservedArea+15)/16)*16;
    MinReservedArea += 16*nAltivecParamsAtEnd;
  }
  MinReservedArea =
    std::max(MinReservedArea,
             PPCFrameInfo::getMinCallFrameSize(isPPC64, true));
  unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameInfo()->
    getStackAlignment();
  unsigned AlignMask = TargetAlign-1;
  MinReservedArea = (MinReservedArea + AlignMask) & ~AlignMask;
  FI->setMinReservedArea(MinReservedArea);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    int Depth = ArgOffset;

    VarArgsFrameIndex = MFI->CreateFixedObject(PtrVT.getSizeInBits()/8,
                                               Depth, true, false);
    SDValue FIN = DAG.getFrameIndex(VarArgsFrameIndex, PtrVT);

    // If this function is vararg, store any remaining integer argument regs
    // to their spots on the stack so that they may be loaded by deferencing the
    // result of va_next.
    for (; GPR_idx != Num_GPR_Regs; ++GPR_idx) {
      unsigned VReg;

      if (isPPC64)
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass);
      else
        VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);

      SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
      SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
      MemOps.push_back(Store);
      // Increment the address by four for the next argument to store
      SDValue PtrOff = DAG.getConstant(PtrVT.getSizeInBits()/8, PtrVT);
      FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
    }
  }

  // Merge all the argument/spill stores into the chain.
  if (!MemOps.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl,
                        MVT::Other, &MemOps[0], MemOps.size());

  return Chain;
}
+
/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
/// linkage area for the Darwin ABI: the linkage area, plus a pointer-aligned
/// slot per outgoing argument (Altivec parameters 16-byte aligned), with a
/// floor of the minimum call frame size, aligned up for fastcc tail calls.
/// On return, nAltivecParamsAtEnd holds the number of Altivec parameters
/// that go at the end in 32-bit non-varargs calls.
static unsigned
CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
                                     bool isPPC64,
                                     bool isVarArg,
                                     unsigned CC,
                                     const SmallVectorImpl<ISD::OutputArg>
                                       &Outs,
                                     unsigned &nAltivecParamsAtEnd) {
  // Count how many bytes are to be pushed on the stack, including the linkage
  // area, and parameter passing area.  We start with 24/48 bytes, which is
  // prereserved space for [SP][CR][LR][3 x unused].
  unsigned NumBytes = PPCFrameInfo::getLinkageSize(isPPC64, true);
  unsigned NumOps = Outs.size();
  unsigned PtrByteSize = isPPC64 ? 8 : 4;

  // Add up all the space actually used.
  // In 32-bit non-varargs calls, Altivec parameters all go at the end; usually
  // they all go in registers, but we must reserve stack space for them for
  // possible use by the caller.  In varargs or 64-bit calls, parameters are
  // assigned stack space in order, with padding so Altivec parameters are
  // 16-byte aligned.
  nAltivecParamsAtEnd = 0;
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue Arg = Outs[i].Val;
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    EVT ArgVT = Arg.getValueType();
    // Varargs Altivec parameters are padded to a 16 byte boundary.
    if (ArgVT==MVT::v4f32 || ArgVT==MVT::v4i32 ||
        ArgVT==MVT::v8i16 || ArgVT==MVT::v16i8) {
      if (!isVarArg && !isPPC64) {
        // Non-varargs Altivec parameters go after all the non-Altivec
        // parameters; handle those later so we know how much padding we need.
        nAltivecParamsAtEnd++;
        continue;
      }
      // Varargs and 64-bit Altivec parameters are padded to 16 byte boundary.
      NumBytes = ((NumBytes+15)/16)*16;
    }
    NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
  }

   // Allow for Altivec parameters at the end, if needed.
  if (nAltivecParamsAtEnd) {
    NumBytes = ((NumBytes+15)/16)*16;
    NumBytes += 16*nAltivecParamsAtEnd;
  }

  // The prolog code of the callee may store up to 8 GPR argument registers to
  // the stack, allowing va_start to index over them in memory if its varargs.
  // Because we cannot tell if this is needed on the caller side, we have to
  // conservatively assume that it is needed.  As such, make sure we have at
  // least enough stack space for the caller to store the 8 GPRs.
  NumBytes = std::max(NumBytes,
                      PPCFrameInfo::getMinCallFrameSize(isPPC64, true));

  // Tail call needs the stack to be aligned.
  if (CC==CallingConv::Fast && GuaranteedTailCallOpt) {
    unsigned TargetAlign = DAG.getMachineFunction().getTarget().getFrameInfo()->
      getStackAlignment();
    unsigned AlignMask = TargetAlign-1;
    NumBytes = (NumBytes + AlignMask) & ~AlignMask;
  }

  return NumBytes;
}
+
+/// CalculateTailCallSPDiff - Get the amount the stack pointer has to be
+/// adjusted to accomodate the arguments for the tailcall.
+static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
+                                   unsigned ParamSize) {
+
+  if (!isTailCall) return 0;
+
+  PPCFunctionInfo *FI = DAG.getMachineFunction().getInfo<PPCFunctionInfo>();
+  unsigned CallerMinReservedArea = FI->getMinReservedArea();
+  int SPDiff = (int)CallerMinReservedArea - (int)ParamSize;
+  // Remember only if the new adjustement is bigger.
+  if (SPDiff < FI->getTailCallSPDelta())
+    FI->setTailCallSPDelta(SPDiff);
+
+  return SPDiff;
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+bool
+PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  if (!GuaranteedTailCallOpt)
+    return false;
+
+  // Variable argument functions are not supported.
+  if (isVarArg)
+    return false;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  CallingConv::ID CallerCC = MF.getFunction()->getCallingConv();
+  if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+    // Functions containing by val parameters are not supported.
+    for (unsigned i = 0; i != Ins.size(); i++) {
+       ISD::ArgFlagsTy Flags = Ins[i].Flags;
+       if (Flags.isByVal()) return false;
+    }
+
+    // Non PIC/GOT  tail calls are supported.
+    if (getTargetMachine().getRelocationModel() != Reloc::PIC_)
+      return true;
+
+    // At the moment we can only do local tail calls (in same module, hidden
+    // or protected) if we are generating PIC.
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+      return G->getGlobal()->hasHiddenVisibility()
+          || G->getGlobal()->hasProtectedVisibility();
+  }
+
+  return false;
+}
+
+/// isCallCompatibleAddress - Return the immediate to use if the specified
+/// 32-bit value is representable in the immediate field of a BxA instruction.
+static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+  if (!C) return 0;
+
+  int Addr = C->getZExtValue();
+  if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
+      (Addr << 6 >> 6) != Addr)
+    return 0;  // Top 6 bits have to be sext of immediate.
+
+  return DAG.getConstant((int)C->getZExtValue() >> 2,
+                         DAG.getTargetLoweringInfo().getPointerTy()).getNode();
+}
+
namespace {

/// TailCallArgumentInfo - Bundles an outgoing tail-call argument with the
/// frame index of the stack slot it will eventually be stored to.
struct TailCallArgumentInfo {
  SDValue Arg;        // The argument value itself.
  SDValue FrameIdxOp; // Frame-index node addressing the destination slot.
  int       FrameIdx; // Raw frame-index number of the destination slot.

  TailCallArgumentInfo() : FrameIdx(0) {}
};

}
+
+/// StoreTailCallArgumentsToStackSlot - Stores arguments to their stack slot.
+static void
+StoreTailCallArgumentsToStackSlot(SelectionDAG &DAG,
+                                           SDValue Chain,
+                   const SmallVector<TailCallArgumentInfo, 8> &TailCallArgs,
+                   SmallVector<SDValue, 8> &MemOpChains,
+                   DebugLoc dl) {
+  for (unsigned i = 0, e = TailCallArgs.size(); i != e; ++i) {
+    SDValue Arg = TailCallArgs[i].Arg;
+    SDValue FIN = TailCallArgs[i].FrameIdxOp;
+    int FI = TailCallArgs[i].FrameIdx;
+    // Store relative to framepointer.
+    MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, FIN,
+                                       PseudoSourceValue::getFixedStack(FI),
+                                       0));
+  }
+}
+
/// EmitTailCallStoreFPAndRetAddr - Move the frame pointer and return address to
/// the appropriate stack slot for the tail call optimized function call.
/// When SPDiff is zero no slots move and the chain is returned unchanged.
static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG,
                                               MachineFunction &MF,
                                               SDValue Chain,
                                               SDValue OldRetAddr,
                                               SDValue OldFP,
                                               int SPDiff,
                                               bool isPPC64,
                                               bool isDarwinABI,
                                               DebugLoc dl) {
  if (SPDiff) {
    // Calculate the new stack slot for the return address.
    int SlotSize = isPPC64 ? 8 : 4;
    int NewRetAddrLoc = SPDiff + PPCFrameInfo::getReturnSaveOffset(isPPC64,
                                                                   isDarwinABI);
    int NewRetAddr = MF.getFrameInfo()->CreateFixedObject(SlotSize,
                                                          NewRetAddrLoc,
                                                          true, false);
    EVT VT = isPPC64 ? MVT::i64 : MVT::i32;
    // Store the old return address into its new, SPDiff-shifted slot.
    SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
    Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
                         PseudoSourceValue::getFixedStack(NewRetAddr), 0);

    // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
    // slot as the FP is never overwritten.
    if (isDarwinABI) {
      int NewFPLoc =
        SPDiff + PPCFrameInfo::getFramePointerSaveOffset(isPPC64, isDarwinABI);
      int NewFPIdx = MF.getFrameInfo()->CreateFixedObject(SlotSize, NewFPLoc,
                                                          true, false);
      // Store the old frame pointer into its shifted save slot.
      SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
      Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
                           PseudoSourceValue::getFixedStack(NewFPIdx), 0);
    }
  }
  return Chain;
}
+
+/// CalculateTailCallArgDest - Remember an outgoing argument for later
+/// processing and compute the stack slot it will occupy after the stack has
+/// been adjusted by SPDiff bytes.
+static void
+CalculateTailCallArgDest(SelectionDAG &DAG, MachineFunction &MF, bool isPPC64,
+                         SDValue Arg, int SPDiff, unsigned ArgOffset,
+                      SmallVector<TailCallArgumentInfo, 8>& TailCallArguments) {
+  // Size of the argument rounded up to whole bytes.
+  uint32_t ByteSize = (Arg.getValueType().getSizeInBits() + 7) / 8;
+  // The slot lives at the argument's original offset adjusted by the stack
+  // delta of the tail call.
+  int FI = MF.getFrameInfo()->CreateFixedObject(ByteSize, ArgOffset + SPDiff,
+                                                true, false);
+  EVT PtrVT = isPPC64 ? MVT::i64 : MVT::i32;
+
+  TailCallArgumentInfo Info;
+  Info.Arg = Arg;
+  Info.FrameIdxOp = DAG.getFrameIndex(FI, PtrVT);
+  Info.FrameIdx = FI;
+  TailCallArguments.push_back(Info);
+}
+
+/// EmitTailCallLoadFPAndRetAddr - Emit loads of the return address and (for
+/// the Darwin ABI) the frame pointer from their current stack slots.  The
+/// updated chain is returned; the loaded values are handed back through
+/// LROpOut/FPOpOut.  Used when tail calling.
+SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+                                                        int SPDiff,
+                                                        SDValue Chain,
+                                                        SDValue &LROpOut,
+                                                        SDValue &FPOpOut,
+                                                        bool isDarwinABI,
+                                                        DebugLoc dl) {
+  // Without a stack adjustment there is nothing to preserve.
+  if (!SPDiff)
+    return Chain;
+
+  EVT VT = PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
+
+  // Load the LR stack slot for later adjusting.
+  LROpOut = DAG.getLoad(VT, dl, Chain, getReturnAddrFrameIndex(DAG), NULL, 0);
+  Chain = SDValue(LROpOut.getNode(), 1);
+
+  // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
+  // slot as the FP is never overwritten.
+  if (isDarwinABI) {
+    FPOpOut = DAG.getLoad(VT, dl, Chain, getFramePointerFrameIndex(DAG),
+                          NULL, 0);
+    Chain = SDValue(FPOpOut.getNode(), 1);
+  }
+  return Chain;
+}
+
+/// CreateCopyOfByValArgument - Emit a memcpy that copies a byval aggregate
+/// from address "Src" to address "Dst".  The byte count and alignment come
+/// from the parameter's argument flags; the copy will be passed as a byval
+/// function parameter.  Sometimes what is copied is the tail of a larger
+/// object, the part that does not fit in registers.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          DebugLoc dl) {
+  SDValue ByteCount = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+  return DAG.getMemcpy(Chain, dl, Dst, Src, ByteCount, Flags.getByValAlign(),
+                       false, NULL, 0, NULL, 0);
+}
+
+/// LowerMemOpCallTo - Store the argument to the stack, or, for a tail call,
+/// merely record where it will eventually have to go.
+static void
+LowerMemOpCallTo(SelectionDAG &DAG, MachineFunction &MF, SDValue Chain,
+                 SDValue Arg, SDValue PtrOff, int SPDiff,
+                 unsigned ArgOffset, bool isPPC64, bool isTailCall,
+                 bool isVector, SmallVector<SDValue, 8> &MemOpChains,
+                 SmallVector<TailCallArgumentInfo, 8>& TailCallArguments,
+                 DebugLoc dl) {
+  if (isTailCall) {
+    // Calculate and remember the argument's destination.
+    CalculateTailCallArgDest(DAG, MF, isPPC64, Arg, SPDiff, ArgOffset,
+                             TailCallArguments);
+    return;
+  }
+
+  // Vector arguments are stored relative to the stack pointer at their
+  // assigned offset rather than through the incoming PtrOff.
+  if (isVector) {
+    EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+    SDValue StackPtr = isPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
+                               : DAG.getRegister(PPC::R1, MVT::i32);
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                         DAG.getConstant(ArgOffset, PtrVT));
+  }
+  MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0));
+}
+
+/// PrepareTailCall - Emit the stores that move the remembered arguments into
+/// their final stack slots, relocate the return address (and, on Darwin, the
+/// frame pointer), and close the call sequence just before the tail call
+/// node is emitted.
+static
+void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
+                     DebugLoc dl, bool isPPC64, int SPDiff, unsigned NumBytes,
+                     SDValue LROp, SDValue FPOp, bool isDarwinABI,
+                     SmallVector<TailCallArgumentInfo, 8> &TailCallArguments) {
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Do not flag preceding copytoreg stuff together with the following stuff.
+  InFlag = SDValue();
+
+  // Emit a sequence of stores for arguments that might overwrite each other
+  // in case of tail call optimization.
+  SmallVector<SDValue, 8> ArgStores;
+  StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments, ArgStores,
+                                    dl);
+  if (!ArgStores.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &ArgStores[0], ArgStores.size());
+
+  // Store the return address to the appropriate stack slot.
+  Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff,
+                                        isPPC64, isDarwinABI, dl);
+
+  // Emit callseq_end just before the tailcall node.
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+}
+
+/// PrepareCall - Resolve the callee to a real call target, pick the PPCISD
+/// call opcode, and push the leading operands of the eventual call node onto
+/// Ops (filling NodeTys to match).  For a direct call the callee becomes a
+/// TargetGlobalAddress/TargetExternalSymbol (or an absolute address); for an
+/// indirect call an MTCTR sequence is emitted — including, for 64-bit SVR4,
+/// the function-descriptor loads — and Callee is cleared.  Returns the call
+/// opcode to use.
+static
+unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
+                     SDValue &Chain, DebugLoc dl, int SPDiff, bool isTailCall,
+                     SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
+                     SmallVector<SDValue, 8> &Ops, std::vector<EVT> &NodeTys,
+                     bool isPPC64, bool isSVR4ABI) {
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+
+  // Assume a direct call; an indirect call overrides this with BCTRL below.
+  unsigned CallOpc = isSVR4ABI ? PPCISD::CALL_SVR4 : PPCISD::CALL_Darwin;
+
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), Callee.getValueType());
+  else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), Callee.getValueType());
+  else if (SDNode *Dest = isBLACompatibleAddress(Callee, DAG))
+    // If this is an absolute destination address, use the munged value.
+    Callee = SDValue(Dest, 0);
+  else {
+    // Otherwise, this is an indirect call.  We have to use a MTCTR/BCTRL pair
+    // to do the call, we can't use PPCISD::CALL.
+    SDValue MTCTROps[] = {Chain, Callee, InFlag};
+
+    if (isSVR4ABI && isPPC64) {
+      // Function pointers in the 64-bit SVR4 ABI do not point to the function
+      // entry point, but to the function descriptor (the function entry point
+      // address is part of the function descriptor though).
+      // The function descriptor is a three doubleword structure with the
+      // following fields: function entry point, TOC base address and
+      // environment pointer.
+      // Thus for a call through a function pointer, the following actions need
+      // to be performed:
+      //   1. Save the TOC of the caller in the TOC save area of its stack
+      //      frame (this is done in LowerCall_Darwin()).
+      //   2. Load the address of the function entry point from the function
+      //      descriptor.
+      //   3. Load the TOC of the callee from the function descriptor into r2.
+      //   4. Load the environment pointer from the function descriptor into
+      //      r11.
+      //   5. Branch to the function entry point address.
+      //   6. On return of the callee, the TOC of the caller needs to be
+      //      restored (this is done in FinishCall()).
+      //
+      // All those operations are flagged together to ensure that no other
+      // operations can be scheduled in between. E.g. without flagging the
+      // operations together, a TOC access in the caller could be scheduled
+      // between the load of the callee TOC and the branch to the callee, which
+      // results in the TOC access going through the TOC of the callee instead
+      // of going through the TOC of the caller, which leads to incorrect code.
+
+      // Load the address of the function entry point from the function
+      // descriptor.
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Flag);
+      // Only pass InFlag as an operand when it actually carries a value.
+      SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, MTCTROps,
+                                        InFlag.getNode() ? 3 : 2);
+      Chain = LoadFuncPtr.getValue(1);
+      InFlag = LoadFuncPtr.getValue(2);
+
+      // Load environment pointer into r11.
+      // Offset of the environment pointer within the function descriptor.
+      SDValue PtrOff = DAG.getIntPtrConstant(16);
+
+      SDValue AddPtr = DAG.getNode(ISD::ADD, dl, MVT::i64, Callee, PtrOff);
+      SDValue LoadEnvPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, Chain, AddPtr,
+                                       InFlag);
+      Chain = LoadEnvPtr.getValue(1);
+      InFlag = LoadEnvPtr.getValue(2);
+
+      SDValue EnvVal = DAG.getCopyToReg(Chain, dl, PPC::X11, LoadEnvPtr,
+                                        InFlag);
+      Chain = EnvVal.getValue(0);
+      InFlag = EnvVal.getValue(1);
+
+      // Load TOC of the callee into r2. We are using a target-specific load
+      // with r2 hard coded, because the result of a target-independent load
+      // would never go directly into r2, since r2 is a reserved register (which
+      // prevents the register allocator from allocating it), resulting in an
+      // additional register being allocated and an unnecessary move instruction
+      // being generated.
+      VTs = DAG.getVTList(MVT::Other, MVT::Flag);
+      SDValue LoadTOCPtr = DAG.getNode(PPCISD::LOAD_TOC, dl, VTs, Chain,
+                                       Callee, InFlag);
+      Chain = LoadTOCPtr.getValue(0);
+      InFlag = LoadTOCPtr.getValue(1);
+
+      // The MTCTR must consume the entry point loaded from the descriptor,
+      // not the descriptor address itself.
+      MTCTROps[0] = Chain;
+      MTCTROps[1] = LoadFuncPtr;
+      MTCTROps[2] = InFlag;
+    }
+
+    // Move the target address into the count register.
+    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, MTCTROps,
+                        2 + (InFlag.getNode() != 0));
+    InFlag = Chain.getValue(1);
+
+    NodeTys.clear();
+    NodeTys.push_back(MVT::Other);
+    NodeTys.push_back(MVT::Flag);
+    Ops.push_back(Chain);
+    CallOpc = isSVR4ABI ? PPCISD::BCTRL_SVR4 : PPCISD::BCTRL_Darwin;
+    // The callee was consumed by MTCTR; clear it so the direct-call operands
+    // are not added below.
+    Callee.setNode(0);
+    // Add CTR register as callee so a bctr can be emitted later.
+    if (isTailCall)
+      Ops.push_back(DAG.getRegister(PPC::CTR, PtrVT));
+  }
+
+  // If this is a direct call, pass the chain and the callee.
+  if (Callee.getNode()) {
+    Ops.push_back(Chain);
+    Ops.push_back(Callee);
+  }
+  // If this is a tail call add stack pointer delta.
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(SPDiff, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  return CallOpc;
+}
+
+/// LowerCallResult - Lower the results of a call into the appropriate copies
+/// out of the physical result registers assigned by RetCC_PPC, appending the
+/// values to InVals.
+SDValue
+PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) {
+
+  // Determine which registers hold the return value(s).
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCRetInfo(CallConv, isVarArg, getTargetMachine(),
+                    RVLocs, *DAG.getContext());
+  CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+
+  // Copy each result register out of its physreg, threading the chain and
+  // glue through every copy so the copies stay ordered after the call.
+  for (unsigned Idx = 0, NumLocs = RVLocs.size(); Idx != NumLocs; ++Idx) {
+    CCValAssign &Loc = RVLocs[Idx];
+    assert(Loc.isRegLoc() && "Can only return in registers!");
+    SDValue Copy = DAG.getCopyFromReg(Chain, dl, Loc.getLocReg(),
+                                      Loc.getValVT(), InFlag);
+    InVals.push_back(Copy.getValue(0));
+    Chain = Copy.getValue(1);
+    InFlag = Copy.getValue(2);
+  }
+
+  return Chain;
+}
+
+/// FinishCall - Emit the call node itself (or a TC_RETURN node for tail
+/// calls), insert the TOC-restore/NOP required after calls under the 64-bit
+/// SVR4 ABI, close the call sequence, and lower the returned values into
+/// InVals via LowerCallResult().
+SDValue
+PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
+                              bool isTailCall, bool isVarArg,
+                              SelectionDAG &DAG,
+                              SmallVector<std::pair<unsigned, SDValue>, 8>
+                                &RegsToPass,
+                              SDValue InFlag, SDValue Chain,
+                              SDValue &Callee,
+                              int SPDiff, unsigned NumBytes,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SmallVectorImpl<SDValue> &InVals) {
+  std::vector<EVT> NodeTys;
+  SmallVector<SDValue, 8> Ops;
+  // Resolve the callee and build the leading call operands.
+  unsigned CallOpc = PrepareCall(DAG, Callee, InFlag, Chain, dl, SPDiff,
+                                 isTailCall, RegsToPass, Ops, NodeTys,
+                                 PPCSubTarget.isPPC64(),
+                                 PPCSubTarget.isSVR4ABI());
+
+  // When performing tail call optimization the callee pops its arguments off
+  // the stack. Account for this here so these bytes can be pushed back on in
+  // PPCRegisterInfo::eliminateCallFramePseudoInstr.
+  int BytesCalleePops =
+    (CallConv==CallingConv::Fast && GuaranteedTailCallOpt) ? NumBytes : 0;
+
+  // Glue the final copy-to-reg (if any) to the call node.
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  // Emit tail call.
+  if (isTailCall) {
+    // If this is the first return lowered for this function, add the regs
+    // to the liveout set for the function.
+    if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+      SmallVector<CCValAssign, 16> RVLocs;
+      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+                     *DAG.getContext());
+      CCInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+      for (unsigned i = 0; i != RVLocs.size(); ++i)
+        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+    }
+
+    // PrepareCall() must have left the callee in one of these forms (CTR for
+    // indirect calls, otherwise a direct/absolute target).
+    assert(((Callee.getOpcode() == ISD::Register &&
+             cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
+            Callee.getOpcode() == ISD::TargetExternalSymbol ||
+            Callee.getOpcode() == ISD::TargetGlobalAddress ||
+            isa<ConstantSDNode>(Callee)) &&
+    "Expecting an global address, external symbol, absolute value or register");
+
+    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Ops[0], Ops.size());
+  }
+
+  // Emit the call itself.
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Add a NOP immediately after the branch instruction when using the 64-bit
+  // SVR4 ABI. At link time, if caller and callee are in a different module and
+  // thus have a different TOC, the call will be replaced with a call to a stub
+  // function which saves the current TOC, loads the TOC of the callee and
+  // branches to the callee. The NOP will be replaced with a load instruction
+  // which restores the TOC of the caller from the TOC save slot of the current
+  // stack frame. If caller and callee belong to the same module (and have the
+  // same TOC), the NOP will remain unchanged.
+  if (!isTailCall && PPCSubTarget.isSVR4ABI()&& PPCSubTarget.isPPC64()) {
+    SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Flag);
+    if (CallOpc == PPCISD::BCTRL_SVR4) {
+      // This is a call through a function pointer.
+      // Restore the caller TOC from the save area into R2.
+      // See PrepareCall() for more information about calls through function
+      // pointers in the 64-bit SVR4 ABI.
+      // We are using a target-specific load with r2 hard coded, because the
+      // result of a target-independent load would never go directly into r2,
+      // since r2 is a reserved register (which prevents the register allocator
+      // from allocating it), resulting in an additional register being
+      // allocated and an unnecessary move instruction being generated.
+      Chain = DAG.getNode(PPCISD::TOC_RESTORE, dl, VTs, Chain, InFlag);
+      InFlag = Chain.getValue(1);
+    } else {
+      // Otherwise insert NOP.
+      InFlag = DAG.getNode(PPCISD::NOP, dl, MVT::Flag, InFlag);
+    }
+  }
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  // Only advance the glue to the callseq_end result when the call actually
+  // produces values; that glue feeds the result-register copies below.
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+/// LowerCall - Lower an outgoing call, dispatching to the ABI-specific
+/// implementation.  isTailCall is updated to reflect whether the call is
+/// actually eligible for tail call optimization.
+SDValue
+PPCTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool &isTailCall,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals) {
+  if (isTailCall)
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+                                                   Ins, DAG);
+
+  // The 32-bit SVR4 ABI has its own lowering; everything else (Darwin and
+  // 64-bit SVR4) takes the Darwin-style path.
+  if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
+    return LowerCall_SVR4(Chain, Callee, CallConv, isVarArg, isTailCall,
+                          Outs, Ins, dl, DAG, InVals);
+
+  return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg, isTailCall,
+                          Outs, Ins, dl, DAG, InVals);
+}
+
+/// LowerCall_SVR4 - Lower an outgoing call under the 32-bit SVR4 ABI:
+/// analyze the argument locations (handling by-value aggregates separately),
+/// adjust the stack, place arguments in registers or parameter-area slots,
+/// set CR6 for varargs calls, and finish via FinishCall().
+SDValue
+PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  bool isTailCall,
+                                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals) {
+  // See PPCTargetLowering::LowerFormalArguments_SVR4() for a description
+  // of the 32-bit SVR4 ABI stack frame layout.
+
+  assert((CallConv == CallingConv::C ||
+          CallConv == CallingConv::Fast) && "Unknown calling convention!");
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  unsigned PtrByteSize = 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (GuaranteedTailCallOpt && CallConv==CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, parameter list area and the part of the local variable space which
+  // contains copies of aggregates which are passed by value.
+
+  // Assign locations to all of the outgoing arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  // Reserve space for the linkage area on the stack.
+  CCInfo.AllocateStack(PPCFrameInfo::getLinkageSize(false, false), PtrByteSize);
+
+  if (isVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Fixed vector arguments go into registers as long as registers are
+    // available. Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      EVT ArgVT = Outs[i].Val.getValueType();
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      bool Result;
+
+      if (Outs[i].IsFixed) {
+        Result = CC_PPC_SVR4(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags,
+                             CCInfo);
+      } else {
+        Result = CC_PPC_SVR4_VarArg(i, ArgVT, ArgVT, CCValAssign::Full,
+                                    ArgFlags, CCInfo);
+      }
+
+      // A true result means the calling convention could not handle the type.
+      if (Result) {
+#ifndef NDEBUG
+        errs() << "Call operand #" << i << " has unhandled type "
+             << ArgVT.getEVTString() << "\n";
+#endif
+        llvm_unreachable(0);
+      }
+    }
+  } else {
+    // All arguments are treated the same.
+    CCInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4);
+  }
+
+  // Assign locations to all of the outgoing aggregate by value arguments.
+  SmallVector<CCValAssign, 16> ByValArgLocs;
+  CCState CCByValInfo(CallConv, isVarArg, getTargetMachine(), ByValArgLocs,
+                      *DAG.getContext());
+
+  // Reserve stack space for the allocations in CCInfo.
+  CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+
+  CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC_SVR4_ByVal);
+
+  // Size of the linkage area, parameter list area and the part of the local
+  // space variable where copies of aggregates which are passed by value are
+  // stored.
+  unsigned NumBytes = CCByValInfo.getNextStackOffset();
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be moved somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, false,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+  SmallVector<SDValue, 8> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  // i indexes ArgLocs/Outs; j indexes ByValArgLocs (advanced only for
+  // by-value aggregates).
+  for (unsigned i = 0, j = 0, e = ArgLocs.size();
+       i != e;
+       ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = Outs[i].Val;
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    if (Flags.isByVal()) {
+      // Argument is an aggregate which is passed by value, thus we need to
+      // create a copy of it in the local variable space of the current stack
+      // frame (which is the stack frame of the caller) and pass the address of
+      // this copy to the callee.
+      assert((j < ByValArgLocs.size()) && "Index out of bounds!");
+      CCValAssign &ByValVA = ByValArgLocs[j++];
+      assert((VA.getValNo() == ByValVA.getValNo()) && "ValNo mismatch!");
+
+      // Memory reserved in the local variable space of the callers stack frame.
+      unsigned LocMemOffset = ByValVA.getLocMemOffset();
+
+      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+      PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+      // Create a copy of the argument in the local area of the current
+      // stack frame.
+      SDValue MemcpyCall =
+        CreateCopyOfByValArgument(Arg, PtrOff,
+                                  CallSeqStart.getNode()->getOperand(0),
+                                  Flags, DAG, dl);
+
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                             NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+
+      // Pass the address of the aggregate copy on the stack either in a
+      // physical register or in the parameter list area of the current stack
+      // frame to the callee.
+      Arg = PtrOff;
+    }
+
+    if (VA.isRegLoc()) {
+      // Put argument in a physical register.
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      // Put argument in the parameter list area of the current stack frame.
+      assert(VA.isMemLoc());
+      unsigned LocMemOffset = VA.getLocMemOffset();
+
+      if (!isTailCall) {
+        SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+        PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+
+        MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                              PseudoSourceValue::getStack(), LocMemOffset));
+      } else {
+        // Calculate and remember argument location.
+        CalculateTailCallArgDest(DAG, MF, false, Arg, SPDiff, LocMemOffset,
+                                 TailCallArguments);
+      }
+    }
+  }
+
+  // Merge all the argument stores into a single token.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Set CR6 to true if this is a vararg call.
+  if (isVarArg) {
+    SDValue SetCR(DAG.getMachineNode(PPC::CRSET, dl, MVT::i32), 0);
+    Chain = DAG.getCopyToReg(Chain, dl, PPC::CR1EQ, SetCR, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall) {
+    PrepareTailCall(DAG, InFlag, Chain, dl, false, SPDiff, NumBytes, LROp, FPOp,
+                    false, TailCallArguments);
+  }
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) {
+
+  unsigned NumOps  = Outs.size();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+  unsigned PtrByteSize = isPPC64 ? 8 : 4;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Mark this function as potentially containing a function that contains a
+  // tail call. As a consequence the frame pointer will be used for dynamicalloc
+  // and restoring the callers stack pointer in this functions epilog. This is
+  // done because by tail calling the called function might overwrite the value
+  // in this function's (MF) stack pointer stack slot 0(SP).
+  if (GuaranteedTailCallOpt && CallConv==CallingConv::Fast)
+    MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
+
+  unsigned nAltivecParamsAtEnd = 0;
+
+  // Count how many bytes are to be pushed on the stack, including the linkage
+  // area, and parameter passing area.  We start with 24/48 bytes, which is
+  // prereserved space for [SP][CR][LR][3 x unused].
+  unsigned NumBytes =
+    CalculateParameterAndLinkageAreaSize(DAG, isPPC64, isVarArg, CallConv,
+                                         Outs,
+                                         nAltivecParamsAtEnd);
+
+  // Calculate by how many bytes the stack has to be adjusted in case of tail
+  // call optimization.
+  int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+
+  // To protect arguments on the stack from being clobbered in a tail call,
+  // force all the loads to happen before doing any other lowering.
+  if (isTailCall)
+    Chain = DAG.getStackArgumentTokenFactor(Chain);
+
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+  SDValue CallSeqStart = Chain;
+
+  // Load the return address and frame pointer so it can be move somewhere else
+  // later.
+  SDValue LROp, FPOp;
+  Chain = EmitTailCallLoadFPAndRetAddr(DAG, SPDiff, Chain, LROp, FPOp, true,
+                                       dl);
+
+  // Set up a copy of the stack pointer for use loading and storing any
+  // arguments that may not fit in the registers available for argument
+  // passing.
+  SDValue StackPtr;
+  if (isPPC64)
+    StackPtr = DAG.getRegister(PPC::X1, MVT::i64);
+  else
+    StackPtr = DAG.getRegister(PPC::R1, MVT::i32);
+
+  // Figure out which arguments are going to go in registers, and which in
+  // memory.  Also, if this is a vararg function, floating point operations
+  // must be stored to our stack, and loaded into integer regs as well, if
+  // any integer regs are available for argument passing.
+  unsigned ArgOffset = PPCFrameInfo::getLinkageSize(isPPC64, true);
+  unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
+
+  static const unsigned GPR_32[] = {           // 32-bit registers.
+    PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+    PPC::R7, PPC::R8, PPC::R9, PPC::R10,
+  };
+  static const unsigned GPR_64[] = {           // 64-bit registers.
+    PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+    PPC::X7, PPC::X8, PPC::X9, PPC::X10,
+  };
+  static const unsigned *FPR = GetFPR();
+
+  static const unsigned VR[] = {
+    PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
+    PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
+  };
+  const unsigned NumGPRs = array_lengthof(GPR_32);
+  const unsigned NumFPRs = 13;
+  const unsigned NumVRs  = array_lengthof(VR);
+
+  const unsigned *GPR = isPPC64 ? GPR_64 : GPR_32;
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
+
+  SmallVector<SDValue, 8> MemOpChains;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    SDValue Arg = Outs[i].Val;
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+
+    // PtrOff will be used to store the current argument to the stack if a
+    // register cannot be found for it.
+    SDValue PtrOff;
+
+    PtrOff = DAG.getConstant(ArgOffset, StackPtr.getValueType());
+
+    PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+
+    // On PPC64, promote integers to 64-bit values.
+    if (isPPC64 && Arg.getValueType() == MVT::i32) {
+      // FIXME: Should this use ANY_EXTEND if neither sext nor zext?
+      unsigned ExtOp = Flags.isSExt() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      Arg = DAG.getNode(ExtOp, dl, MVT::i64, Arg);
+    }
+
+    // FIXME memcpy is used way more than necessary.  Correctness first.
+    if (Flags.isByVal()) {
+      unsigned Size = Flags.getByValSize();
+      if (Size==1 || Size==2) {
+        // Very small objects are passed right-justified.
+        // Everything else is passed left-justified.
+        EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
+                                          NULL, 0, VT);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+          ArgOffset += PtrByteSize;
+        } else {
+          SDValue Const = DAG.getConstant(4 - Size, PtrOff.getValueType());
+          SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+          SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
+                                CallSeqStart.getNode()->getOperand(0),
+                                Flags, DAG, dl);
+          // This must go outside the CALLSEQ_START..END.
+          SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                               CallSeqStart.getNode()->getOperand(1));
+          DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+                                 NewCallSeqStart.getNode());
+          Chain = CallSeqStart = NewCallSeqStart;
+          ArgOffset += PtrByteSize;
+        }
+        continue;
+      }
+      // Copy entire object into memory.  There are cases where gcc-generated
+      // code assumes it is there, even if it could be put entirely into
+      // registers.  (This is not what the doc says.)
+      SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+                            CallSeqStart.getNode()->getOperand(0),
+                            Flags, DAG, dl);
+      // This must go outside the CALLSEQ_START..END.
+      SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+                           CallSeqStart.getNode()->getOperand(1));
+      DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode());
+      Chain = CallSeqStart = NewCallSeqStart;
+      // And copy the pieces of it that fit into registers.
+      for (unsigned j=0; j<Size; j+=PtrByteSize) {
+        SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
+        SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
+        if (GPR_idx != NumGPRs) {
+          SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg, NULL, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          ArgOffset += PtrByteSize;
+        } else {
+          ArgOffset += ((Size - j + PtrByteSize-1)/PtrByteSize)*PtrByteSize;
+          break;
+        }
+      }
+      continue;
+    }
+
+    switch (Arg.getValueType().getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unexpected ValueType for argument!");
+    case MVT::i32:
+    case MVT::i64:
+      if (GPR_idx != NumGPRs) {
+        RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      ArgOffset += PtrByteSize;
+      break;
+    case MVT::f32:
+    case MVT::f64:
+      if (FPR_idx != NumFPRs) {
+        RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
+
+        if (isVarArg) {
+          SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0);
+          MemOpChains.push_back(Store);
+
+          // Float varargs are always shadowed in available integer registers
+          if (GPR_idx != NumGPRs) {
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, NULL, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 && !isPPC64){
+            SDValue ConstFour = DAG.getConstant(4, PtrOff.getValueType());
+            PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
+            SDValue Load = DAG.getLoad(PtrVT, dl, Store, PtrOff, NULL, 0);
+            MemOpChains.push_back(Load.getValue(1));
+            RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+          }
+        } else {
+          // If we have any FPRs remaining, we may also have GPRs remaining.
+          // Args passed in FPRs consume either 1 (f32) or 2 (f64) available
+          // GPRs.
+          if (GPR_idx != NumGPRs)
+            ++GPR_idx;
+          if (GPR_idx != NumGPRs && Arg.getValueType() == MVT::f64 &&
+              !isPPC64)  // PPC64 has 64-bit GPR's obviously :)
+            ++GPR_idx;
+        }
+      } else {
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, false, MemOpChains,
+                         TailCallArguments, dl);
+      }
+      if (isPPC64)
+        ArgOffset += 8;
+      else
+        ArgOffset += Arg.getValueType() == MVT::f32 ? 4 : 8;
+      break;
+    case MVT::v4f32:
+    case MVT::v4i32:
+    case MVT::v8i16:
+    case MVT::v16i8:
+      if (isVarArg) {
+        // These go aligned on the stack, or in the corresponding R registers
+        // when within range.  The Darwin PPC ABI doc claims they also go in
+        // V registers; in fact gcc does this only for arguments that are
+        // prototyped, not for those that match the ...  We do it for all
+        // arguments, seems to work.
+        while (ArgOffset % 16 !=0) {
+          ArgOffset += PtrByteSize;
+          if (GPR_idx != NumGPRs)
+            GPR_idx++;
+        }
+        // We could elide this store in the case where the object fits
+        // entirely in R registers.  Maybe later.
+        PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr,
+                            DAG.getConstant(ArgOffset, PtrVT));
+        SDValue Store = DAG.getStore(Chain, dl, Arg, PtrOff, NULL, 0);
+        MemOpChains.push_back(Store);
+        if (VR_idx != NumVRs) {
+          SDValue Load = DAG.getLoad(MVT::v4f32, dl, Store, PtrOff, NULL, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(VR[VR_idx++], Load));
+        }
+        ArgOffset += 16;
+        for (unsigned i=0; i<16; i+=PtrByteSize) {
+          if (GPR_idx == NumGPRs)
+            break;
+          SDValue Ix = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff,
+                                  DAG.getConstant(i, PtrVT));
+          SDValue Load = DAG.getLoad(PtrVT, dl, Store, Ix, NULL, 0);
+          MemOpChains.push_back(Load.getValue(1));
+          RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+        }
+        break;
+      }
+
+      // Non-varargs Altivec params generally go in registers, but have
+      // stack space allocated at the end.
+      if (VR_idx != NumVRs) {
+        // Doesn't have GPR space allocated.
+        RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
+      } else if (nAltivecParamsAtEnd==0) {
+        // We are emitting Altivec params in order.
+        LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                         isPPC64, isTailCall, true, MemOpChains,
+                         TailCallArguments, dl);
+        ArgOffset += 16;
+      }
+      break;
+    }
+  }
+  // If all Altivec parameters fit in registers, as they usually do,
+  // they get stack space following the non-Altivec parameters.  We
+  // don't track this here because nobody below needs it.
+  // If there are more Altivec parameters than fit in registers emit
+  // the stores here.
+  if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+    unsigned j = 0;
+    // Offset is aligned; skip 1st 12 params which go in V registers.
+    ArgOffset = ((ArgOffset+15)/16)*16;
+    ArgOffset += 12*16;
+    for (unsigned i = 0; i != NumOps; ++i) {
+      SDValue Arg = Outs[i].Val;
+      EVT ArgType = Arg.getValueType();
+      if (ArgType==MVT::v4f32 || ArgType==MVT::v4i32 ||
+          ArgType==MVT::v8i16 || ArgType==MVT::v16i8) {
+        if (++j > NumVRs) {
+          SDValue PtrOff;
+          // We are emitting Altivec params in order.
+          LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
+                           isPPC64, isTailCall, true, MemOpChains,
+                           TailCallArguments, dl);
+          ArgOffset += 16;
+        }
+      }
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Check if this is an indirect call (MTCTR/BCTRL).
+  // See PrepareCall() for more information about calls through function
+  // pointers in the 64-bit SVR4 ABI.
+  if (!isTailCall && isPPC64 && PPCSubTarget.isSVR4ABI() &&
+      !dyn_cast<GlobalAddressSDNode>(Callee) &&
+      !dyn_cast<ExternalSymbolSDNode>(Callee) &&
+      !isBLACompatibleAddress(Callee, DAG)) {
+    // Load r2 into a virtual register and store it to the TOC save area.
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+    // TOC save area offset.
+    SDValue PtrOff = DAG.getIntPtrConstant(40);
+    SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+    Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr, NULL, 0);
+  }
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  if (isTailCall) {
+    PrepareTailCall(DAG, InFlag, Chain, dl, isPPC64, SPDiff, NumBytes, LROp,
+                    FPOp, true, TailCallArguments);
+  }
+
+  return FinishCall(CallConv, dl, isTailCall, isVarArg, DAG,
+                    RegsToPass, InFlag, Chain, Callee, SPDiff, NumBytes,
+                    Ins, InVals);
+}
+
+SDValue
+PPCTargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               DebugLoc dl, SelectionDAG &DAG) {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                             Outs[i].Val, Flag);
+    Flag = Chain.getValue(1);
+  }
+
+  if (Flag.getNode())
+    return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+  else
+    return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+                                   const PPCSubtarget &Subtarget) {
+  // When we pop the dynamic allocation we need to restore the SP link.
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Get the corect type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Construct the stack pointer operand.
+  bool isPPC64 = Subtarget.isPPC64();
+  unsigned SP = isPPC64 ? PPC::X1 : PPC::R1;
+  SDValue StackPtr = DAG.getRegister(SP, PtrVT);
+
+  // Get the operands for the STACKRESTORE.
+  SDValue Chain = Op.getOperand(0);
+  SDValue SaveSP = Op.getOperand(1);
+
+  // Load the old link SP.
+  SDValue LoadLinkSP = DAG.getLoad(PtrVT, dl, Chain, StackPtr, NULL, 0);
+
+  // Restore the stack pointer.
+  Chain = DAG.getCopyToReg(LoadLinkSP.getValue(1), dl, SP, SaveSP);
+
+  // Store the old link SP.
+  return DAG.getStore(Chain, dl, LoadLinkSP, StackPtr, NULL, 0);
+}
+
+
+
SDValue
PPCTargetLowering::getReturnAddrFrameIndex(SelectionDAG & DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool isPPC64 = PPCSubTarget.isPPC64();
  bool isDarwinABI = PPCSubTarget.isDarwinABI();
  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();

  // Get the current return address save index; it is created lazily on
  // first use below.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  int RASI = FI->getReturnAddrSaveIndex();

  // If the return address save index hasn't been defined yet.
  if (!RASI) {
    // Find out the fixed offset of the return address (LR) save area.
    int LROffset = PPCFrameInfo::getReturnSaveOffset(isPPC64, isDarwinABI);
    // Allocate the frame index for the return address save area.
    RASI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, LROffset,
                                                true, false);
    // Save the result so later queries reuse the same slot.
    FI->setReturnAddrSaveIndex(RASI);
  }
  return DAG.getFrameIndex(RASI, PtrVT);
}
+
+SDValue
+PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool isPPC64 = PPCSubTarget.isPPC64();
+  bool isDarwinABI = PPCSubTarget.isDarwinABI();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+
+  // Get current frame pointer save index.  The users of this index will be
+  // primarily DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(isPPC64,
+                                                           isDarwinABI);
+
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset,
+                                                true, false);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);
+  }
+  return DAG.getFrameIndex(FPSI, PtrVT);
+}
+
+SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                         SelectionDAG &DAG,
+                                         const PPCSubtarget &Subtarget) {
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Get the corect type for pointers.
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  // Negate the size.
+  SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT,
+                                  DAG.getConstant(0, PtrVT), Size);
+  // Construct a node for the frame pointer save index.
+  SDValue FPSIdx = getFramePointerFrameIndex(DAG);
+  // Build a DYNALLOC node.
+  SDValue Ops[3] = { Chain, NegSize, FPSIdx };
+  SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
+  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops, 3);
+}
+
+/// LowerSELECT_CC - Lower floating point select_cc's into fsel instruction when
+/// possible.
+SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  // Not FP? Not a fsel.
+  if (!Op.getOperand(0).getValueType().isFloatingPoint() ||
+      !Op.getOperand(2).getValueType().isFloatingPoint())
+    return Op;
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+
+  // Cannot handle SETEQ/SETNE.
+  if (CC == ISD::SETEQ || CC == ISD::SETNE) return Op;
+
+  EVT ResVT = Op.getValueType();
+  EVT CmpVT = Op.getOperand(0).getValueType();
+  SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+  SDValue TV  = Op.getOperand(2), FV  = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // If the RHS of the comparison is a 0.0, we don't need to do the
+  // subtraction at all.
+  if (isFloatingPointZero(RHS))
+    switch (CC) {
+    default: break;       // SETUO etc aren't handled by fsel.
+    case ISD::SETULT:
+    case ISD::SETLT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOGE:
+    case ISD::SETGE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, LHS, TV, FV);
+    case ISD::SETUGT:
+    case ISD::SETGT:
+      std::swap(TV, FV);  // fsel is natively setge, swap operands for setlt
+    case ISD::SETOLE:
+    case ISD::SETLE:
+      if (LHS.getValueType() == MVT::f32)   // Comparison is always 64-bits
+        LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT,
+                         DAG.getNode(ISD::FNEG, dl, MVT::f64, LHS), TV, FV);
+    }
+
+  SDValue Cmp;
+  switch (CC) {
+  default: break;       // SETUO etc aren't handled by fsel.
+  case ISD::SETULT:
+  case ISD::SETLT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOGE:
+  case ISD::SETGE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  case ISD::SETUGT:
+  case ISD::SETGT:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, FV, TV);
+  case ISD::SETOLE:
+  case ISD::SETLE:
+    Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, RHS, LHS);
+    if (Cmp.getValueType() == MVT::f32)   // Comparison is always 64-bits
+      Cmp = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Cmp);
+      return DAG.getNode(PPCISD::FSEL, dl, ResVT, Cmp, TV, FV);
+  }
+  return Op;
+}
+
// FIXME: Split this code up when LegalizeDAGTypes lands.
SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
                                           DebugLoc dl) {
  assert(Op.getOperand(0).getValueType().isFloatingPoint());
  SDValue Src = Op.getOperand(0);
  // The conversion instructions below operate on f64, so widen f32 first.
  if (Src.getValueType() == MVT::f32)
    Src = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Src);

  SDValue Tmp;
  switch (Op.getValueType().getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Unhandled FP_TO_INT type in custom expander!");
  case MVT::i32:
    // FCTIWZ for signed i32; otherwise (FP_TO_UINT) go through the 64-bit
    // FCTIDZ -- presumably so unsigned values above INT32_MAX are not
    // clamped by the 32-bit signed conversion (TODO confirm).
    Tmp = DAG.getNode(Op.getOpcode()==ISD::FP_TO_SINT ? PPCISD::FCTIWZ :
                                                         PPCISD::FCTIDZ, 
                      dl, MVT::f64, Src);
    break;
  case MVT::i64:
    Tmp = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Src);
    break;
  }

  // Convert the FP value to an int value through memory: store the f64
  // conversion result to a stack slot, then reload it as an integer.
  SDValue FIPtr = DAG.CreateStackTemporary(MVT::f64);

  // Emit a store to the stack slot.
  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, NULL, 0);

  // Result is a load from the stack slot.  If loading 4 bytes, make sure to
  // add in a bias: on big-endian PPC the low 32 bits of the 8-byte slot
  // live at offset 4.
  if (Op.getValueType() == MVT::i32)
    FIPtr = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr,
                        DAG.getConstant(4, FIPtr.getValueType()));
  return DAG.getLoad(Op.getValueType(), dl, Chain, FIPtr, NULL, 0);
}
+
+SDValue PPCTargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Don't handle ppc_fp128 here; let it be lowered to a libcall.
+  if (Op.getValueType() != MVT::f32 && Op.getValueType() != MVT::f64)
+    return SDValue();
+
+  if (Op.getOperand(0).getValueType() == MVT::i64) {
+    SDValue Bits = DAG.getNode(ISD::BIT_CONVERT, dl,
+                               MVT::f64, Op.getOperand(0));
+    SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Bits);
+    if (Op.getValueType() == MVT::f32)
+      FP = DAG.getNode(ISD::FP_ROUND, dl,
+                       MVT::f32, FP, DAG.getIntPtrConstant(0));
+    return FP;
+  }
+
+  assert(Op.getOperand(0).getValueType() == MVT::i32 &&
+         "Unhandled SINT_TO_FP type in custom expander!");
+  // Since we only generate this in 64-bit mode, we can take advantage of
+  // 64-bit registers.  In particular, sign extend the input value into the
+  // 64-bit register with extsw, store the WHOLE 64-bit value into the stack
+  // then lfd it and fcfid it.
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *FrameInfo = MF.getFrameInfo();
+  int FrameIdx = FrameInfo->CreateStackObject(8, 8, false);
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
+
+  SDValue Ext64 = DAG.getNode(PPCISD::EXTSW_32, dl, MVT::i32,
+                                Op.getOperand(0));
+
+  // STD the extended value into the stack slot.
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FrameIdx),
+                            MachineMemOperand::MOStore, 0, 8, 8);
+  SDValue Ops[] = { DAG.getEntryNode(), Ext64, FIdx };
+  SDValue Store =
+    DAG.getMemIntrinsicNode(PPCISD::STD_32, dl, DAG.getVTList(MVT::Other),
+                            Ops, 4, MVT::i64, MMO);
+  // Load the value as a double.
+  SDValue Ld = DAG.getLoad(MVT::f64, dl, Store, FIdx, NULL, 0);
+
+  // FCFID it and return it.
+  SDValue FP = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Ld);
+  if (Op.getValueType() == MVT::f32)
+    FP = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, FP, DAG.getIntPtrConstant(0));
+  return FP;
+}
+
+SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  /*
+   The rounding mode is in bits 30:31 of FPSR, and has the following
+   settings:
+     00 Round to nearest
+     01 Round to 0
+     10 Round to +inf
+     11 Round to -inf
+
+  FLT_ROUNDS, on the other hand, expects the following:
+    -1 Undefined
+     0 Round to 0
+     1 Round to nearest
+     2 Round to +inf
+     3 Round to -inf
+
+  To perform the conversion, we do:
+    ((FPSCR & 0x3) ^ ((~FPSCR & 0x3) >> 1))
+  */
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  EVT VT = Op.getValueType();
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  std::vector<EVT> NodeTys;
+  SDValue MFFSreg, InFlag;
+
+  // Save FP Control Word to register
+  NodeTys.push_back(MVT::f64);    // return register
+  NodeTys.push_back(MVT::Flag);   // unused in this context
+  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0);
+
+  // Save FP register to stack slot
+  int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
+  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain,
+                                 StackSlot, NULL, 0);
+
+  // Load FP Control Word from low 32 bits of stack slot.
+  SDValue Four = DAG.getConstant(4, PtrVT);
+  SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
+  SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, NULL, 0);
+
+  // Transform as necessary
+  SDValue CWD1 =
+    DAG.getNode(ISD::AND, dl, MVT::i32,
+                CWD, DAG.getConstant(3, MVT::i32));
+  SDValue CWD2 =
+    DAG.getNode(ISD::SRL, dl, MVT::i32,
+                DAG.getNode(ISD::AND, dl, MVT::i32,
+                            DAG.getNode(ISD::XOR, dl, MVT::i32,
+                                        CWD, DAG.getConstant(3, MVT::i32)),
+                            DAG.getConstant(3, MVT::i32)),
+                DAG.getConstant(1, MVT::i32));
+
+  SDValue RetVal =
+    DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
+
+  return DAG.getNode((VT.getSizeInBits() < 16 ?
+                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+}
+
+SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SHL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR , dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Tmp5);
+  SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRL!");
+
+  // Expand into a bunch of logical ops.  Note that these ops
+  // depend on the PPC behavior for oversized shift amounts.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Tmp5);
+  SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
+  SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  assert(Op.getNumOperands() == 3 &&
+         VT == Op.getOperand(1).getValueType() &&
+         "Unexpected SRA!");
+
+  // Expand into a bunch of logical ops, followed by a select_cc.
+  SDValue Lo = Op.getOperand(0);
+  SDValue Hi = Op.getOperand(1);
+  SDValue Amt = Op.getOperand(2);
+  EVT AmtVT = Amt.getValueType();
+
+  SDValue Tmp1 = DAG.getNode(ISD::SUB, dl, AmtVT,
+                             DAG.getConstant(BitWidth, AmtVT), Amt);
+  SDValue Tmp2 = DAG.getNode(PPCISD::SRL, dl, VT, Lo, Amt);
+  SDValue Tmp3 = DAG.getNode(PPCISD::SHL, dl, VT, Hi, Tmp1);
+  SDValue Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp3);
+  SDValue Tmp5 = DAG.getNode(ISD::ADD, dl, AmtVT, Amt,
+                             DAG.getConstant(-BitWidth, AmtVT));
+  SDValue Tmp6 = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Tmp5);
+  SDValue OutHi = DAG.getNode(PPCISD::SRA, dl, VT, Hi, Amt);
+  SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT),
+                                  Tmp4, Tmp6, ISD::SETLE);
+  SDValue OutOps[] = { OutLo, OutHi };
+  return DAG.getMergeValues(OutOps, 2, dl);
+}
+
+//===----------------------------------------------------------------------===//
+// Vector related lowering.
+//
+
+/// BuildSplatI - Build a canonical splati of Val with an element size of
+/// SplatSize.  Cast the result to VT.
+static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
+                             SelectionDAG &DAG, DebugLoc dl) {
+  assert(Val >= -16 && Val <= 15 && "vsplti is out of range!");
+
+  static const EVT VTys[] = { // canonical VT to use for each size.
+    MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
+  };
+
+  EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
+
+  // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
+  if (Val == -1)
+    SplatSize = 1;
+
+  EVT CanonicalVT = VTys[SplatSize-1];
+
+  // Build a canonical splat for this value.
+  SDValue Elt = DAG.getConstant(Val, MVT::i32);
+  SmallVector<SDValue, 8> Ops;
+  Ops.assign(CanonicalVT.getVectorNumElements(), Elt);
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT,
+                              &Ops[0], Ops.size());
+  return DAG.getNode(ISD::BIT_CONVERT, dl, ReqVT, Res);
+}
+
+/// BuildIntrinsicOp - Return a binary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue LHS, SDValue RHS,
+                                SelectionDAG &DAG, DebugLoc dl,
+                                EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = LHS.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), LHS, RHS);
+}
+
+/// BuildIntrinsicOp - Return a ternary operator intrinsic node with the
+/// specified intrinsic ID.
+static SDValue BuildIntrinsicOp(unsigned IID, SDValue Op0, SDValue Op1,
+                                SDValue Op2, SelectionDAG &DAG,
+                                DebugLoc dl, EVT DestVT = MVT::Other) {
+  if (DestVT == MVT::Other) DestVT = Op0.getValueType();
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, DestVT,
+                     DAG.getConstant(IID, MVT::i32), Op0, Op1, Op2);
+}
+
+
+/// BuildVSLDOI - Return a VECTOR_SHUFFLE that is a vsldoi of the specified
+/// amount.  The result has the specified value type.
+static SDValue BuildVSLDOI(SDValue LHS, SDValue RHS, unsigned Amt,
+                             EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  // Force LHS/RHS to be the right type.
+  LHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, LHS);
+  RHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, RHS);
+
+  int Ops[16];
+  for (unsigned i = 0; i != 16; ++i)
+    Ops[i] = i + Amt;
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, LHS, RHS, Ops);
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, T);
+}
+
+// If this is a case we can't handle, return null and let the default
+// expansion code take care of it.  If we CAN select this case, and if it
+// selects to a single instruction, return Op.  Otherwise, if we can codegen
+// this case more efficiently than a constant pool load, lower it to the
+// sequence of ops that should be used.
+SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+
+  // Check if this is a splat of a constant value.
+  // Only constant splats of at most 32 bits are handled below; anything else
+  // falls back to the default expansion (typically a constant pool load).
+  APInt APSplatBits, APSplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+                             HasAnyUndefs, 0, true) || SplatBitSize > 32)
+    return SDValue();
+
+  unsigned SplatBits = APSplatBits.getZExtValue();
+  unsigned SplatUndef = APSplatUndef.getZExtValue();
+  unsigned SplatSize = SplatBitSize / 8;  // Splat element size in bytes.
+
+  // First, handle single instruction cases.
+
+  // All zeros?
+  if (SplatBits == 0) {
+    // Canonicalize all zero vectors to be v4i32.
+    if (Op.getValueType() != MVT::v4i32 || HasAnyUndefs) {
+      SDValue Z = DAG.getConstant(0, MVT::i32);
+      Z = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Z, Z, Z, Z);
+      Op = DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Z);
+    }
+    return Op;
+  }
+
+  // If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
+  // Shift the splat value up to bit 31 and arithmetic-shift back down to
+  // sign-extend it from SplatBitSize bits to 32.
+  int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
+                    (32-SplatBitSize));
+  if (SextVal >= -16 && SextVal <= 15)
+    return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+
+
+  // Two instruction sequences.
+
+  // If this value is in the range [-32,30] and is even, use:
+  //    tmp = VSPLTI[bhw], result = add tmp, tmp
+  if (SextVal >= -32 && SextVal <= 30 && (SextVal & 1) == 0) {
+    SDValue Res = BuildSplatI(SextVal >> 1, SplatSize, MVT::Other, DAG, dl);
+    Res = DAG.getNode(ISD::ADD, dl, Res.getValueType(), Res, Res);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+  }
+
+  // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
+  // 0x7FFF_FFFF x 4, turn it into not(0x8000_0000).  This is important
+  // for fneg/fabs.
+  if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
+    // Make -1 and vspltisw -1:
+    SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+
+    // Make the VSLW intrinsic, computing 0x8000_0000.
+    // (shifting -1 left by 31, since the shift amount is taken modulo 32
+    // from the all-ones vector).
+    SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
+                                   OnesV, DAG, dl);
+
+    // xor by OnesV to invert it.
+    Res = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Res, OnesV);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+  }
+
+  // Check to see if this is a wide variety of vsplti*, binop self cases.
+  static const signed char SplatCsts[] = {
+    -1, 1, -2, 2, -3, 3, -4, 4, -5, 5, -6, 6, -7, 7,
+    -8, 8, -9, 9, -10, 10, -11, 11, -12, 12, -13, 13, 14, -14, 15, -15, -16
+  };
+
+  for (unsigned idx = 0; idx < array_lengthof(SplatCsts); ++idx) {
+    // Indirect through the SplatCsts array so that we favor 'vsplti -1' for
+    // cases which are ambiguous (e.g. formation of 0x8000_0000).  'vsplti -1'
+    int i = SplatCsts[idx];
+
+    // Figure out what shift amount will be used by altivec if shifted by i in
+    // this splat size.
+    unsigned TypeShiftAmt = i & (SplatBitSize-1);
+
+    // vsplti + shl self.
+    if (SextVal == (i << (int)TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
+        Intrinsic::ppc_altivec_vslw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + srl self.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
+        Intrinsic::ppc_altivec_vsrw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + sra self.
+    // NOTE(review): this condition is identical to the 'srl self' test
+    // above, so this branch can never be taken as written; an sra-specific
+    // (arithmetic shift) condition was presumably intended -- confirm.
+    if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
+        Intrinsic::ppc_altivec_vsraw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+    }
+
+    // vsplti + rol self.
+    if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
+                         ((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
+      SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+      static const unsigned IIDs[] = { // Intrinsic to use for each size.
+        Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
+        Intrinsic::ppc_altivec_vrlw
+      };
+      Res = BuildIntrinsicOp(IIDs[SplatSize-1], Res, Res, DAG, dl);
+      return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Res);
+    }
+
+    // NOTE(review): for SplatBitSize < 32 the shift amounts below
+    // (TypeShiftAmt-8/-16/-24) can go negative, which is undefined behavior
+    // for C++ shifts -- confirm these comparisons are harmless/unreachable
+    // for small splat sizes.
+    // t = vsplti c, result = vsldoi t, t, 1
+    if (SextVal == ((i << 8) | (i >> (TypeShiftAmt-8)))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 1, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 2
+    if (SextVal == ((i << 16) | (i >> (TypeShiftAmt-16)))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 2, Op.getValueType(), DAG, dl);
+    }
+    // t = vsplti c, result = vsldoi t, t, 3
+    if (SextVal == ((i << 24) | (i >> (TypeShiftAmt-24)))) {
+      SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+      return BuildVSLDOI(T, T, 3, Op.getValueType(), DAG, dl);
+    }
+  }
+
+  // Three instruction sequences.
+
+  // Odd, in range [17,31]:  (vsplti C)-(vsplti -16).
+  // The guard below admits [0,31], but everything in [-16,15] and all even
+  // values in [-32,30] already returned above, so only odd 17..31 reach here.
+  if (SextVal >= 0 && SextVal <= 31) {
+    SDValue LHS = BuildSplatI(SextVal-16, SplatSize, MVT::Other, DAG, dl);
+    SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl);
+    LHS = DAG.getNode(ISD::SUB, dl, LHS.getValueType(), LHS, RHS);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), LHS);
+  }
+  // Odd, in range [-31,-17]:  (vsplti C)+(vsplti -16).
+  if (SextVal >= -31 && SextVal <= 0) {
+    SDValue LHS = BuildSplatI(SextVal+16, SplatSize, MVT::Other, DAG, dl);
+    SDValue RHS = BuildSplatI(-16, SplatSize, MVT::Other, DAG, dl);
+    LHS = DAG.getNode(ISD::ADD, dl, LHS.getValueType(), LHS, RHS);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), LHS);
+  }
+
+  // Nothing matched: let the default expansion handle it.
+  return SDValue();
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      DebugLoc dl) {
+  // Decode the table entry: bits 26-29 hold the operation; the two 13-bit
+  // fields below it are table indices for the operands, which are
+  // materialized recursively.
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
+  unsigned RHSID = (PFEntry >>  0) & ((1 << 13)-1);
+
+  enum {
+    OP_COPY = 0,  // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VMRGHW,
+    OP_VMRGLW,
+    OP_VSPLTISW0,
+    OP_VSPLTISW1,
+    OP_VSPLTISW2,
+    OP_VSPLTISW3,
+    OP_VSLDOI4,
+    OP_VSLDOI8,
+    OP_VSLDOI12
+  };
+
+  if (OpNum == OP_COPY) {
+    // Operand IDs are encoded base 9, one digit per element: <0,1,2,3>
+    // selects the LHS input unchanged, <4,5,6,7> selects the RHS.
+    if (LHSID == (1*9+2)*9+3) return LHS;
+    assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  // Recursively build both operands of this shuffle step.
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+
+  // Build the byte-level (v16i8) shuffle mask for the selected operation;
+  // the vsldoi cases are emitted directly instead.
+  int ShufIdxs[16];
+  switch (OpNum) {
+  default: llvm_unreachable("Unknown i32 permute!");
+  case OP_VMRGHW:  // Interleave words 0-1 of OpLHS with words 0-1 of OpRHS.
+    ShufIdxs[ 0] =  0; ShufIdxs[ 1] =  1; ShufIdxs[ 2] =  2; ShufIdxs[ 3] =  3;
+    ShufIdxs[ 4] = 16; ShufIdxs[ 5] = 17; ShufIdxs[ 6] = 18; ShufIdxs[ 7] = 19;
+    ShufIdxs[ 8] =  4; ShufIdxs[ 9] =  5; ShufIdxs[10] =  6; ShufIdxs[11] =  7;
+    ShufIdxs[12] = 20; ShufIdxs[13] = 21; ShufIdxs[14] = 22; ShufIdxs[15] = 23;
+    break;
+  case OP_VMRGLW:  // Interleave words 2-3 of OpLHS with words 2-3 of OpRHS.
+    ShufIdxs[ 0] =  8; ShufIdxs[ 1] =  9; ShufIdxs[ 2] = 10; ShufIdxs[ 3] = 11;
+    ShufIdxs[ 4] = 24; ShufIdxs[ 5] = 25; ShufIdxs[ 6] = 26; ShufIdxs[ 7] = 27;
+    ShufIdxs[ 8] = 12; ShufIdxs[ 9] = 13; ShufIdxs[10] = 14; ShufIdxs[11] = 15;
+    ShufIdxs[12] = 28; ShufIdxs[13] = 29; ShufIdxs[14] = 30; ShufIdxs[15] = 31;
+    break;
+  case OP_VSPLTISW0:  // Splat word 0 of OpLHS across the result.
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+0;
+    break;
+  case OP_VSPLTISW1:  // Splat word 1 of OpLHS across the result.
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+4;
+    break;
+  case OP_VSPLTISW2:  // Splat word 2 of OpLHS across the result.
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+8;
+    break;
+  case OP_VSPLTISW3:  // Splat word 3 of OpLHS across the result.
+    for (unsigned i = 0; i != 16; ++i)
+      ShufIdxs[i] = (i&3)+12;
+    break;
+  case OP_VSLDOI4:   // Byte-shift the pair left by 4/8/12 bytes.
+    return BuildVSLDOI(OpLHS, OpRHS, 4, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI8:
+    return BuildVSLDOI(OpLHS, OpRHS, 8, OpLHS.getValueType(), DAG, dl);
+  case OP_VSLDOI12:
+    return BuildVSLDOI(OpLHS, OpRHS, 12, OpLHS.getValueType(), DAG, dl);
+  }
+  // Perform the mask-based cases as a v16i8 shuffle and cast back to the
+  // original operand type.
+  EVT VT = OpLHS.getValueType();
+  OpLHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OpLHS);
+  OpRHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OpRHS);
+  SDValue T = DAG.getVectorShuffle(MVT::v16i8, dl, OpLHS, OpRHS, ShufIdxs);
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, T);
+}
+
+/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE.  If this
+/// is a shuffle we can handle in a single instruction, return it.  Otherwise,
+/// return the code it can be lowered into.  Worst case, it can always be
+/// lowered into a vperm.
+SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                               SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  EVT VT = Op.getValueType();
+
+  // Cases that are handled by instructions that take permute immediates
+  // (such as vsplt*) should be left as VECTOR_SHUFFLE nodes so they can be
+  // selected by the instruction selector.
+  // Unary (V2 undef) forms first: splats and the unary variants of the
+  // pack/shift/merge masks.
+  if (V2.getOpcode() == ISD::UNDEF) {
+    if (PPC::isSplatShuffleMask(SVOp, 1) ||
+        PPC::isSplatShuffleMask(SVOp, 2) ||
+        PPC::isSplatShuffleMask(SVOp, 4) ||
+        PPC::isVPKUWUMShuffleMask(SVOp, true) ||
+        PPC::isVPKUHUMShuffleMask(SVOp, true) ||
+        PPC::isVSLDOIShuffleMask(SVOp, true) != -1 ||
+        PPC::isVMRGLShuffleMask(SVOp, 1, true) ||
+        PPC::isVMRGLShuffleMask(SVOp, 2, true) ||
+        PPC::isVMRGLShuffleMask(SVOp, 4, true) ||
+        PPC::isVMRGHShuffleMask(SVOp, 1, true) ||
+        PPC::isVMRGHShuffleMask(SVOp, 2, true) ||
+        PPC::isVMRGHShuffleMask(SVOp, 4, true)) {
+      return Op;
+    }
+  }
+
+  // Altivec has a variety of "shuffle immediates" that take two vector inputs
+  // and produce a fixed permutation.  If any of these match, do not lower to
+  // VPERM.
+  if (PPC::isVPKUWUMShuffleMask(SVOp, false) ||
+      PPC::isVPKUHUMShuffleMask(SVOp, false) ||
+      PPC::isVSLDOIShuffleMask(SVOp, false) != -1 ||
+      PPC::isVMRGLShuffleMask(SVOp, 1, false) ||
+      PPC::isVMRGLShuffleMask(SVOp, 2, false) ||
+      PPC::isVMRGLShuffleMask(SVOp, 4, false) ||
+      PPC::isVMRGHShuffleMask(SVOp, 1, false) ||
+      PPC::isVMRGHShuffleMask(SVOp, 2, false) ||
+      PPC::isVMRGHShuffleMask(SVOp, 4, false))
+    return Op;
+
+  // Check to see if this is a shuffle of 4-byte values.  If so, we can use our
+  // perfect shuffle table to emit an optimal matching sequence.
+  SmallVector<int, 16> PermMask;
+  SVOp->getMask(PermMask);
+  
+  // For each 4-byte element of the result, determine which 4-byte input
+  // element it copies (0-7), or 8 if it is entirely undef.  If any byte
+  // straddles element boundaries, the mask is not a 4-byte shuffle.
+  unsigned PFIndexes[4];
+  bool isFourElementShuffle = true;
+  for (unsigned i = 0; i != 4 && isFourElementShuffle; ++i) { // Element number
+    unsigned EltNo = 8;   // Start out undef.
+    for (unsigned j = 0; j != 4; ++j) {  // Intra-element byte.
+      if (PermMask[i*4+j] < 0)
+        continue;   // Undef, ignore it.
+
+      unsigned ByteSource = PermMask[i*4+j];
+      // Each byte must keep its position within its source element.
+      if ((ByteSource & 3) != j) {
+        isFourElementShuffle = false;
+        break;
+      }
+
+      // All defined bytes of this element must come from the same source
+      // element.
+      if (EltNo == 8) {
+        EltNo = ByteSource/4;
+      } else if (EltNo != ByteSource/4) {
+        isFourElementShuffle = false;
+        break;
+      }
+    }
+    PFIndexes[i] = EltNo;
+  }
+
+  // If this shuffle can be expressed as a shuffle of 4-byte elements, use the
+  // perfect shuffle vector to determine if it is cost effective to do this as
+  // discrete instructions, or whether we should use a vperm.
+  if (isFourElementShuffle) {
+    // Compute the index in the perfect shuffle table.
+    // The four element selectors (each 0-8) are combined as a base-9 number.
+    unsigned PFTableIndex =
+      PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
+
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost  = (PFEntry >> 30);  // Cost lives in the top two bits.
+
+    // Determining when to avoid vperm is tricky.  Many things affect the cost
+    // of vperm, particularly how many times the perm mask needs to be computed.
+    // For example, if the perm mask can be hoisted out of a loop or is already
+    // used (perhaps because there are multiple permutes with the same shuffle
+    // mask?) the vperm has a cost of 1.  OTOH, hoisting the permute mask out of
+    // the loop requires an extra register.
+    //
+    // As a compromise, we only emit discrete instructions if the shuffle can be
+    // generated in 3 or fewer operations.  When we have loop information
+    // available, if this block is within a loop, we should avoid using vperm
+    // for 3-operation perms and use a constant pool load instead.
+    if (Cost < 3)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
+  }
+
+  // Lower this to a VPERM(V1, V2, V3) expression, where V3 is a constant
+  // vector that will get spilled to the constant pool.
+  if (V2.getOpcode() == ISD::UNDEF) V2 = V1;
+
+  // The SHUFFLE_VECTOR mask is almost exactly what we want for vperm, except
+  // that it is in input element units, not in bytes.  Convert now.
+  EVT EltVT = V1.getValueType().getVectorElementType();
+  unsigned BytesPerElement = EltVT.getSizeInBits()/8;
+
+  SmallVector<SDValue, 16> ResultMask;
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+    // Undef mask entries are mapped to element 0; any value works for undef.
+    unsigned SrcElt = PermMask[i] < 0 ? 0 : PermMask[i];
+
+    // Expand each element index to the indices of its constituent bytes.
+    for (unsigned j = 0; j != BytesPerElement; ++j)
+      ResultMask.push_back(DAG.getConstant(SrcElt*BytesPerElement+j,
+                                           MVT::i32));
+  }
+
+  SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
+                                    &ResultMask[0], ResultMask.size());
+  return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask);
+}
+
+/// getAltivecCompareInfo - Given an intrinsic, return false if it is not an
+/// altivec comparison.  If it is, return true and fill in Opc/isDot with
+/// information about the intrinsic.
+///
+/// \param Intrin      The INTRINSIC_WO_CHAIN node; operand 0 holds the ID.
+/// \param CompareOpc  [out] The instruction opcode field for the compare,
+///                    or -1 if this is not an altivec comparison.
+/// \param isDot       [out] True for the predicate ("record", CR6-setting)
+///                    forms, false for the normal vector-result forms.
+static bool getAltivecCompareInfo(SDValue Intrin, int &CompareOpc,
+                                  bool &isDot) {
+  unsigned IntrinsicID =
+    cast<ConstantSDNode>(Intrin.getOperand(0))->getZExtValue();
+  // Assume failure until a known comparison intrinsic is matched.
+  CompareOpc = -1;
+  isDot = false;
+  switch (IntrinsicID) {
+  default: return false;
+    // Comparison predicates (set CR6; isDot forms).
+  case Intrinsic::ppc_altivec_vcmpbfp_p:  CompareOpc = 966; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp_p: CompareOpc = 198; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpequb_p: CompareOpc =   6; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpequh_p: CompareOpc =  70; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpequw_p: CompareOpc = 134; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgefp_p: CompareOpc = 454; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp_p: CompareOpc = 710; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb_p: CompareOpc = 774; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh_p: CompareOpc = 838; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw_p: CompareOpc = 902; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgtub_p: CompareOpc = 518; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh_p: CompareOpc = 582; isDot = true; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw_p: CompareOpc = 646; isDot = true; break;
+
+    // Normal Comparisons (vector result only; same opcode fields).
+  case Intrinsic::ppc_altivec_vcmpbfp:   CompareOpc = 966; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpeqfp:  CompareOpc = 198; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpequb:  CompareOpc =   6; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpequh:  CompareOpc =  70; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpequw:  CompareOpc = 134; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgefp:  CompareOpc = 454; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgtfp:  CompareOpc = 710; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgtsb:  CompareOpc = 774; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgtsh:  CompareOpc = 838; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgtsw:  CompareOpc = 902; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgtub:  CompareOpc = 518; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgtuh:  CompareOpc = 582; isDot = false; break;
+  case Intrinsic::ppc_altivec_vcmpgtuw:  CompareOpc = 646; isDot = false; break;
+  }
+  return true;
+}
+
+/// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
+/// lower, do it, otherwise return null.
+SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+                                                     SelectionDAG &DAG) {
+  // If this is a lowered altivec predicate compare, CompareOpc is set to the
+  // opcode number of the comparison.
+  DebugLoc dl = Op.getDebugLoc();
+  int CompareOpc;
+  bool isDot;
+  if (!getAltivecCompareInfo(Op, CompareOpc, isDot))
+    return SDValue();    // Don't custom lower most intrinsics.
+
+  // If this is a non-dot comparison, make the VCMP node and we are done.
+  // Operands here: 0 = intrinsic ID, 1/2 = the two vector operands.
+  if (!isDot) {
+    SDValue Tmp = DAG.getNode(PPCISD::VCMP, dl, Op.getOperand(2).getValueType(),
+                                Op.getOperand(1), Op.getOperand(2),
+                                DAG.getConstant(CompareOpc, MVT::i32));
+    return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Tmp);
+  }
+
+  // Create the PPCISD altivec 'dot' comparison node.
+  // For the predicate forms, operand 1 selects which CR6 bit to return and
+  // operands 2/3 are the vectors being compared.
+  SDValue Ops[] = {
+    Op.getOperand(2),  // LHS
+    Op.getOperand(3),  // RHS
+    DAG.getConstant(CompareOpc, MVT::i32)
+  };
+  // VCMPo produces the vector result plus a flag that carries the CR6 update.
+  std::vector<EVT> VTs;
+  VTs.push_back(Op.getOperand(2).getValueType());
+  VTs.push_back(MVT::Flag);
+  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+
+  // Now that we have the comparison, emit a copy from the CR to a GPR.
+  // This is flagged to the above dot comparison.
+  SDValue Flags = DAG.getNode(PPCISD::MFCR, dl, MVT::i32,
+                                DAG.getRegister(PPC::CR6, MVT::i32),
+                                CompNode.getValue(1));
+
+  // Unpack the result based on how the target uses it.
+  unsigned BitNo;   // Bit # of CR6.
+  bool InvertBit;   // Invert result?
+  switch (cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()) {
+  default:  // Can't happen, don't crash on invalid number though.
+  case 0:   // Return the value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = false;
+    break;
+  case 1:   // Return the inverted value of the EQ bit of CR6.
+    BitNo = 0; InvertBit = true;
+    break;
+  case 2:   // Return the value of the LT bit of CR6.
+    BitNo = 2; InvertBit = false;
+    break;
+  case 3:   // Return the inverted value of the LT bit of CR6.
+    BitNo = 2; InvertBit = true;
+    break;
+  }
+
+  // Shift the bit into the low position.
+  // The constant 8-(3-BitNo) presumably reflects where the CR6 field sits
+  // in the MFCR result -- confirm against the MFCR lowering if changing.
+  Flags = DAG.getNode(ISD::SRL, dl, MVT::i32, Flags,
+                      DAG.getConstant(8-(3-BitNo), MVT::i32));
+  // Isolate the bit.
+  Flags = DAG.getNode(ISD::AND, dl, MVT::i32, Flags,
+                      DAG.getConstant(1, MVT::i32));
+
+  // If we are supposed to, toggle the bit.
+  if (InvertBit)
+    Flags = DAG.getNode(ISD::XOR, dl, MVT::i32, Flags,
+                        DAG.getConstant(1, MVT::i32));
+  return Flags;
+}
+
+// Lower SCALAR_TO_VECTOR by bouncing the value through memory: store the
+// scalar into element 0 of an aligned stack slot, then reload the whole
+// slot as a vector.
+SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Create a stack slot that is 16-byte aligned.
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  int FI = MFI->CreateStackObject(16, 16, false);
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  SDValue Slot = DAG.getFrameIndex(FI, PtrVT);
+
+  // Store the scalar input into the slot, then load it back as a vector;
+  // the load is chained on the store so ordering is preserved.
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl,
+                               Op.getOperand(0), Slot, NULL, 0);
+  return DAG.getLoad(Op.getValueType(), dl, Chain, Slot, NULL, 0);
+}
+
+/// LowerMUL - Custom-lower integer vector multiplies into sequences of
+/// AltiVec multiply-even/odd/multiply-sum intrinsics, since the lowering
+/// here handles only v4i32, v8i16 and v16i8.
+SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  if (Op.getValueType() == MVT::v4i32) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero  = BuildSplatI(  0, 1, MVT::v4i32, DAG, dl);
+    SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
+
+    SDValue RHSSwap =   // = vrlw RHS, 16
+      BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
+
+    // Shrinkify inputs to v8i16.
+    LHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, LHS);
+    RHS = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, RHS);
+    RHSSwap = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, RHSSwap);
+
+    // Low parts multiplied together, generating 32-bit results (we ignore the
+    // top parts).
+    SDValue LoProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmulouh,
+                                        LHS, RHS, DAG, dl, MVT::v4i32);
+
+    // Cross products of the halves, summed per word via vmsumuhm with a
+    // zero accumulator.
+    SDValue HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmsumuhm,
+                                      LHS, RHSSwap, Zero, DAG, dl, MVT::v4i32);
+    // Shift the high parts up 16 bits.
+    HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
+                              Neg16, DAG, dl);
+    return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
+  } else if (Op.getValueType() == MVT::v8i16) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
+
+    // Multiply-low-add with a zero addend: vmladduhm(LHS, RHS, 0) == LHS*RHS.
+    return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
+                            LHS, RHS, Zero, DAG, dl);
+  } else if (Op.getValueType() == MVT::v16i8) {
+    SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
+
+    // Multiply the even 8-bit parts, producing 16-bit sums.
+    SDValue EvenParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuleub,
+                                           LHS, RHS, DAG, dl, MVT::v8i16);
+    EvenParts = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, EvenParts);
+
+    // Multiply the odd 8-bit parts, producing 16-bit sums.
+    SDValue OddParts = BuildIntrinsicOp(Intrinsic::ppc_altivec_vmuloub,
+                                          LHS, RHS, DAG, dl, MVT::v8i16);
+    OddParts = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, OddParts);
+
+    // Merge the results together.
+    // Take byte 2*i+1 of each 16-bit product (i.e. the odd bytes),
+    // interleaving even-product and odd-product lanes.
+    int Ops[16];
+    for (unsigned i = 0; i != 8; ++i) {
+      Ops[i*2  ] = 2*i+1;
+      Ops[i*2+1] = 2*i+1+16;
+    }
+    return DAG.getVectorShuffle(MVT::v16i8, dl, EvenParts, OddParts, Ops);
+  } else {
+    llvm_unreachable("Unknown mul to lower!");
+  }
+}
+
+/// LowerOperation - Provide custom lowering hooks for some operations.
+/// Pure dispatch: every opcode the target marked as Custom is routed to its
+/// dedicated Lower* helper; anything else is a legalization bug.
+SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Wasn't expecting to be able to lower this!");
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
+  case ISD::SETCC:              return LowerSETCC(Op, DAG);
+  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
+  // The varargs lowerings additionally need the frame/offset state and the
+  // subtarget, which live on this TargetLowering instance.
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
+                        VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);
+
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG, VarArgsFrameIndex, VarArgsStackOffset,
+                      VarArgsNumGPR, VarArgsNumFPR, PPCSubTarget);
+
+  case ISD::STACKRESTORE:       return LowerSTACKRESTORE(Op, DAG, PPCSubTarget);
+  case ISD::DYNAMIC_STACKALLOC:
+    return LowerDYNAMIC_STACKALLOC(Op, DAG, PPCSubTarget);
+
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:         return LowerFP_TO_INT(Op, DAG,
+                                                       Op.getDebugLoc());
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
+
+  // Lower 64-bit shifts.
+  case ISD::SHL_PARTS:          return LowerSHL_PARTS(Op, DAG);
+  case ISD::SRL_PARTS:          return LowerSRL_PARTS(Op, DAG);
+  case ISD::SRA_PARTS:          return LowerSRA_PARTS(Op, DAG);
+
+  // Vector-related lowering.
+  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
+  case ISD::MUL:                return LowerMUL(Op, DAG);
+
+  // Frame & Return address.
+  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
+  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
+  }
+  // Not reached: every case returns above and the default is unreachable.
+  // Kept to silence "control reaches end of non-void function" warnings.
+  return SDValue();
+}
+
+/// ReplaceNodeResults - Custom type legalization: replace the results of
+/// nodes whose result types are illegal.  Handles ppcf128 FP_ROUND_INREG
+/// (via an FPSCR round-to-zero sequence) and FP_TO_SINT.
+void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
+                                           SmallVectorImpl<SDValue>&Results,
+                                           SelectionDAG &DAG) {
+  DebugLoc dl = N->getDebugLoc();
+  switch (N->getOpcode()) {
+  default:
+    assert(false && "Do not know how to custom type legalize this operation!");
+    return;
+  case ISD::FP_ROUND_INREG: {
+    assert(N->getValueType(0) == MVT::ppcf128);
+    assert(N->getOperand(0).getValueType() == MVT::ppcf128);
+    // Split the ppcf128 input into its two f64 halves.
+    SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(0));
+    SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl,
+                             MVT::f64, N->getOperand(0),
+                             DAG.getIntPtrConstant(1));
+
+    // This sequence changes FPSCR to do round-to-zero, adds the two halves
+    // of the long double, and puts FPSCR back the way it was.  We do not
+    // actually model FPSCR.
+    // Every step below is chained through the flag (InFlag) so the scheduler
+    // cannot reorder the FPSCR manipulation around the add.
+    std::vector<EVT> NodeTys;
+    SDValue Ops[4], Result, MFFSreg, InFlag, FPreg;
+
+    // Save the current FPSCR contents.
+    NodeTys.push_back(MVT::f64);   // Return register
+    NodeTys.push_back(MVT::Flag);    // Returns a flag for later insns
+    Result = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0);
+    MFFSreg = Result.getValue(0);
+    InFlag = Result.getValue(1);
+
+    // Set FPSCR bit 31 and clear bit 30, selecting round-to-zero
+    // (see the comment above; confirm bit numbering against the ISA).
+    NodeTys.clear();
+    NodeTys.push_back(MVT::Flag);   // Returns a flag
+    Ops[0] = DAG.getConstant(31, MVT::i32);
+    Ops[1] = InFlag;
+    Result = DAG.getNode(PPCISD::MTFSB1, dl, NodeTys, Ops, 2);
+    InFlag = Result.getValue(0);
+
+    NodeTys.clear();
+    NodeTys.push_back(MVT::Flag);   // Returns a flag
+    Ops[0] = DAG.getConstant(30, MVT::i32);
+    Ops[1] = InFlag;
+    Result = DAG.getNode(PPCISD::MTFSB0, dl, NodeTys, Ops, 2);
+    InFlag = Result.getValue(0);
+
+    // Add the two halves with the modified (truncating) rounding mode.
+    NodeTys.clear();
+    NodeTys.push_back(MVT::f64);    // result of add
+    NodeTys.push_back(MVT::Flag);   // Returns a flag
+    Ops[0] = Lo;
+    Ops[1] = Hi;
+    Ops[2] = InFlag;
+    Result = DAG.getNode(PPCISD::FADDRTZ, dl, NodeTys, Ops, 3);
+    FPreg = Result.getValue(0);
+    InFlag = Result.getValue(1);
+
+    // Restore the original FPSCR contents saved in MFFSreg.
+    NodeTys.clear();
+    NodeTys.push_back(MVT::f64);
+    Ops[0] = DAG.getConstant(1, MVT::i32);
+    Ops[1] = MFFSreg;
+    Ops[2] = FPreg;
+    Ops[3] = InFlag;
+    Result = DAG.getNode(PPCISD::MTFSF, dl, NodeTys, Ops, 4);
+    FPreg = Result.getValue(0);
+
+    // We know the low half is about to be thrown away, so just use something
+    // convenient.
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::ppcf128,
+                                FPreg, FPreg));
+    return;
+  }
+  case ISD::FP_TO_SINT:
+    Results.push_back(LowerFP_TO_INT(SDValue(N, 0), DAG, dl));
+    return;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// EmitAtomicBinary - Emit the machine-level load-reserve/store-conditional
+/// retry loop for a word or doubleword atomic read-modify-write.
+/// Returns the block where subsequent instructions should be inserted.
+MachineBasicBlock *
+PPCTargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
+                                    bool is64bit, unsigned BinOpcode) const {
+  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction *F = BB->getParent();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  // MI operands: dest = result register, ptrA/ptrB = address operands for
+  // the indexed load/store forms, incr = the RHS of the binary op.
+  unsigned dest = MI->getOperand(0).getReg();
+  unsigned ptrA = MI->getOperand(1).getReg();
+  unsigned ptrB = MI->getOperand(2).getReg();
+  unsigned incr = MI->getOperand(3).getReg();
+  DebugLoc dl = MI->getDebugLoc();
+
+  // Split the current block: loopMBB holds the retry loop, exitMBB takes
+  // over the original block's successors.
+  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(It, loopMBB);
+  F->insert(It, exitMBB);
+  exitMBB->transferSuccessors(BB);
+
+  // For a plain swap (BinOpcode==0) the incoming value is stored directly;
+  // otherwise a scratch register holds the computed value.
+  MachineRegisterInfo &RegInfo = F->getRegInfo();
+  unsigned TmpReg = (!BinOpcode) ? incr :
+    RegInfo.createVirtualRegister(
+       is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+                 (const TargetRegisterClass *) &PPC::GPRCRegClass);
+
+  //  thisMBB:
+  //   ...
+  //   fallthrough --> loopMBB
+  BB->addSuccessor(loopMBB);
+
+  //  loopMBB:
+  //   l[wd]arx dest, ptr
+  //   add r0, dest, incr
+  //   st[wd]cx. r0, ptr
+  //   bne- loopMBB
+  //   fallthrough --> exitMBB
+  BB = loopMBB;
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+    .addReg(ptrA).addReg(ptrB);
+  if (BinOpcode)
+    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg).addReg(incr).addReg(dest);
+  BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+    .addReg(TmpReg).addReg(ptrA).addReg(ptrB);
+  // Retry while the store-conditional fails (CR0 NE after st[wd]cx.).
+  BuildMI(BB, dl, TII->get(PPC::BCC))
+    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
+  BB->addSuccessor(loopMBB);
+  BB->addSuccessor(exitMBB);
+
+  //  exitMBB:
+  //   ...
+  BB = exitMBB;
+  return BB;
+}
+
/// EmitPartwordAtomicBinary - Expand an 8-bit or 16-bit atomic
/// read-modify-write pseudo (or ATOMIC_SWAP when BinOpcode==0) into an
/// lwarx/stwcx. loop.  lwarx/stwcx. only operate on aligned words, so the
/// byte/halfword is shifted into position within its containing aligned
/// word, the binary operation is applied there, and the neighboring bytes
/// of the word are preserved with masks.  Returns the new exit block, in
/// which subsequent instructions should be emitted.
MachineBasicBlock *
PPCTargetLowering::EmitPartwordAtomicBinary(MachineInstr *MI,
                                            MachineBasicBlock *BB,
                                            bool is8bit,    // operation
                                            unsigned BinOpcode) const {
  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  // In 64 bit mode we have to use 64 bits for addresses, even though the
  // lwarx/stwcx are 32 bits.  With the 32-bit atomics we can use address
  // registers without caring whether they're 32 or 64, but here we're
  // doing actual arithmetic on the addresses.
  bool is64bit = PPCSubTarget.isPPC64();

  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction *F = BB->getParent();
  MachineFunction::iterator It = BB;
  ++It;

  // Pseudo operands: dest = atomic-op *(ptrA + ptrB), incr.
  unsigned dest = MI->getOperand(0).getReg();
  unsigned ptrA = MI->getOperand(1).getReg();
  unsigned ptrB = MI->getOperand(2).getReg();
  unsigned incr = MI->getOperand(3).getReg();
  DebugLoc dl = MI->getDebugLoc();

  // Split the CFG: BB falls through into loopMBB, which loops on itself or
  // falls through to exitMBB.  exitMBB inherits BB's former successors.
  MachineBasicBlock *loopMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(It, loopMBB);
  F->insert(It, exitMBB);
  exitMBB->transferSuccessors(BB);

  MachineRegisterInfo &RegInfo = F->getRegInfo();
  const TargetRegisterClass *RC =
    is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
              (const TargetRegisterClass *) &PPC::GPRCRegClass;
  unsigned PtrReg = RegInfo.createVirtualRegister(RC);
  unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
  unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
  unsigned Incr2Reg = RegInfo.createVirtualRegister(RC);
  unsigned MaskReg = RegInfo.createVirtualRegister(RC);
  unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
  unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
  unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
  unsigned Tmp3Reg = RegInfo.createVirtualRegister(RC);
  unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
  unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
  unsigned Ptr1Reg;
  // For swap (BinOpcode==0) the shifted incr is stored directly, so no
  // extra temp is needed.
  unsigned TmpReg = (!BinOpcode) ? Incr2Reg : RegInfo.createVirtualRegister(RC);

  //  thisMBB:
  //   ...
  //   fallthrough --> loopMBB
  BB->addSuccessor(loopMBB);

  // The 4-byte load must be aligned, while a char or short may be
  // anywhere in the word.  Hence all this nasty bookkeeping code.
  //   add ptr1, ptrA, ptrB [copy if ptrA==0]
  //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
  //   xori shift, shift1, 24 [16]
  //   rlwinm ptr, ptr1, 0, 0, 29
  //   slw incr2, incr, shift
  //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
  //   slw mask, mask2, shift
  //  loopMBB:
  //   lwarx tmpDest, ptr
  //   add tmp, tmpDest, incr2
  //   andc tmp2, tmpDest, mask
  //   and tmp3, tmp, mask
  //   or tmp4, tmp3, tmp2
  //   stwcx. tmp4, ptr
  //   bne- loopMBB
  //   fallthrough --> exitMBB
  //   srw dest, tmpDest, shift

  // ptrA == R0 encodes "no base register" for the indexed form; in that
  // case ptrB alone is the address.
  if (ptrA!=PPC::R0) {
    Ptr1Reg = RegInfo.createVirtualRegister(RC);
    BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
      .addReg(ptrA).addReg(ptrB);
  } else {
    Ptr1Reg = ptrB;
  }
  // shift1 = byte offset within the word, in bits (big-endian numbering);
  // for halfwords the low bit of the offset is dropped.
  BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
      .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
  // Convert to a left-shift amount that places the operand at its position
  // in the big-endian word.
  BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
      .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
  // ptr = ptr1 with the low 2 bits cleared (word-aligned address).
  if (is64bit)
    BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
      .addReg(Ptr1Reg).addImm(0).addImm(61);
  else
    BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
      .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
  BuildMI(BB, dl, TII->get(PPC::SLW), Incr2Reg)
      .addReg(incr).addReg(ShiftReg);
  // Build the in-place mask: 0xFF for bytes, 0xFFFF for halfwords.
  if (is8bit)
    BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
  else {
    BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
    BuildMI(BB, dl, TII->get(PPC::ORI),Mask2Reg).addReg(Mask3Reg).addImm(65535);
  }
  BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
      .addReg(Mask2Reg).addReg(ShiftReg);

  BB = loopMBB;
  // Load-reserve the containing word, apply the operation, splice the
  // result bits back into the word, and store-conditional; retry on loss
  // of reservation.
  BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
    .addReg(PPC::R0).addReg(PtrReg);
  if (BinOpcode)
    BuildMI(BB, dl, TII->get(BinOpcode), TmpReg)
      .addReg(Incr2Reg).addReg(TmpDestReg);
  BuildMI(BB, dl, TII->get(is64bit ? PPC::ANDC8 : PPC::ANDC), Tmp2Reg)
    .addReg(TmpDestReg).addReg(MaskReg);
  BuildMI(BB, dl, TII->get(is64bit ? PPC::AND8 : PPC::AND), Tmp3Reg)
    .addReg(TmpReg).addReg(MaskReg);
  BuildMI(BB, dl, TII->get(is64bit ? PPC::OR8 : PPC::OR), Tmp4Reg)
    .addReg(Tmp3Reg).addReg(Tmp2Reg);
  BuildMI(BB, dl, TII->get(PPC::STWCX))
    .addReg(Tmp4Reg).addReg(PPC::R0).addReg(PtrReg);
  BuildMI(BB, dl, TII->get(PPC::BCC))
    .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loopMBB);
  BB->addSuccessor(loopMBB);
  BB->addSuccessor(exitMBB);

  //  exitMBB:
  //   ...
  // Shift the original (pre-op) word back down so dest holds the old
  // byte/halfword value in its low bits.
  BB = exitMBB;
  BuildMI(BB, dl, TII->get(PPC::SRW), dest).addReg(TmpDestReg).addReg(ShiftReg);
  return BB;
}
+
+MachineBasicBlock *
+PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  // To "insert" these instructions we actually have to insert their
+  // control-flow patterns.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  MachineFunction *F = BB->getParent();
+
+  if (MI->getOpcode() == PPC::SELECT_CC_I4 ||
+      MI->getOpcode() == PPC::SELECT_CC_I8 ||
+      MI->getOpcode() == PPC::SELECT_CC_F4 ||
+      MI->getOpcode() == PPC::SELECT_CC_F8 ||
+      MI->getOpcode() == PPC::SELECT_CC_VRRC) {
+
+    // The incoming instruction knows the destination vreg to set, the
+    // condition code register to branch on, the true/false values to
+    // select between, and a branch opcode to use.
+
+    //  thisMBB:
+    //  ...
+    //   TrueVal = ...
+    //   cmpTY ccX, r1, r2
+    //   bCC copy1MBB
+    //   fallthrough --> copy0MBB
+    MachineBasicBlock *thisMBB = BB;
+    MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    unsigned SelectPred = MI->getOperand(4).getImm();
+    DebugLoc dl = MI->getDebugLoc();
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(SelectPred).addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+    F->insert(It, copy0MBB);
+    F->insert(It, sinkMBB);
+    // Update machine-CFG edges by first adding all successors of the current
+    // block to the new block which will contain the Phi node for the select.
+    // Also inform sdisel of the edge changes.
+    for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 
+           E = BB->succ_end(); I != E; ++I) {
+      EM->insert(std::make_pair(*I, sinkMBB));
+      sinkMBB->addSuccessor(*I);
+    }
+    // Next, remove all successors of the current block, and add the true
+    // and fallthrough blocks as its successors.
+    while (!BB->succ_empty())
+      BB->removeSuccessor(BB->succ_begin());
+    // Next, add the true and fallthrough blocks as its successors.
+    BB->addSuccessor(copy0MBB);
+    BB->addSuccessor(sinkMBB);
+
+    //  copy0MBB:
+    //   %FalseValue = ...
+    //   # fallthrough to sinkMBB
+    BB = copy0MBB;
+
+    // Update machine-CFG edges
+    BB->addSuccessor(sinkMBB);
+
+    //  sinkMBB:
+    //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+    //  ...
+    BB = sinkMBB;
+    BuildMI(BB, dl, TII->get(PPC::PHI), MI->getOperand(0).getReg())
+      .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+      .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);
+  }
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::ADD4);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_ADD_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::ADD8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::AND);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_AND_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::AND8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::OR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_OR_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::OR8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::XOR);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_XOR_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::XOR8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::ANDC);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::ANDC);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::ANDC);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_NAND_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::ANDC8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I32)
+    BB = EmitAtomicBinary(MI, BB, false, PPC::SUBF);
+  else if (MI->getOpcode() == PPC::ATOMIC_LOAD_SUB_I64)
+    BB = EmitAtomicBinary(MI, BB, true, PPC::SUBF8);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I8)
+    BB = EmitPartwordAtomicBinary(MI, BB, true, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I16)
+    BB = EmitPartwordAtomicBinary(MI, BB, false, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I32)
+    BB = EmitAtomicBinary(MI, BB, false, 0);
+  else if (MI->getOpcode() == PPC::ATOMIC_SWAP_I64)
+    BB = EmitAtomicBinary(MI, BB, true, 0);
+
+  else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I32 ||
+           MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64) {
+    bool is64bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I64;
+
+    unsigned dest   = MI->getOperand(0).getReg();
+    unsigned ptrA   = MI->getOperand(1).getReg();
+    unsigned ptrB   = MI->getOperand(2).getReg();
+    unsigned oldval = MI->getOperand(3).getReg();
+    unsigned newval = MI->getOperand(4).getReg();
+    DebugLoc dl     = MI->getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->transferSuccessors(BB);
+
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // loop1MBB:
+    //   l[wd]arx dest, ptr
+    //   cmp[wd] dest, oldval
+    //   bne- midMBB
+    // loop2MBB:
+    //   st[wd]cx. newval, ptr
+    //   bne- loopMBB
+    //   b exitBB
+    // midMBB:
+    //   st[wd]cx. dest, ptr
+    // exitBB:
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::LDARX : PPC::LWARX), dest)
+      .addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::CMPD : PPC::CMPW), PPC::CR0)
+      .addReg(oldval).addReg(dest);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+      .addReg(newval).addReg(ptrA).addReg(ptrB);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::STDCX : PPC::STWCX))
+      .addReg(dest).addReg(ptrA).addReg(ptrB);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+  } else if (MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8 ||
+             MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I16) {
+    // We must use 64-bit registers for addresses when targeting 64-bit,
+    // since we're actually doing arithmetic on them.  Other registers
+    // can be 32-bit.
+    bool is64bit = PPCSubTarget.isPPC64();
+    bool is8bit = MI->getOpcode() == PPC::ATOMIC_CMP_SWAP_I8;
+
+    unsigned dest   = MI->getOperand(0).getReg();
+    unsigned ptrA   = MI->getOperand(1).getReg();
+    unsigned ptrB   = MI->getOperand(2).getReg();
+    unsigned oldval = MI->getOperand(3).getReg();
+    unsigned newval = MI->getOperand(4).getReg();
+    DebugLoc dl     = MI->getDebugLoc();
+
+    MachineBasicBlock *loop1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *loop2MBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *midMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    MachineBasicBlock *exitMBB = F->CreateMachineBasicBlock(LLVM_BB);
+    F->insert(It, loop1MBB);
+    F->insert(It, loop2MBB);
+    F->insert(It, midMBB);
+    F->insert(It, exitMBB);
+    exitMBB->transferSuccessors(BB);
+
+    MachineRegisterInfo &RegInfo = F->getRegInfo();
+    const TargetRegisterClass *RC =
+      is64bit ? (const TargetRegisterClass *) &PPC::G8RCRegClass :
+                (const TargetRegisterClass *) &PPC::GPRCRegClass;
+    unsigned PtrReg = RegInfo.createVirtualRegister(RC);
+    unsigned Shift1Reg = RegInfo.createVirtualRegister(RC);
+    unsigned ShiftReg = RegInfo.createVirtualRegister(RC);
+    unsigned NewVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned NewVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned OldVal3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned MaskReg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Mask3Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp2Reg = RegInfo.createVirtualRegister(RC);
+    unsigned Tmp4Reg = RegInfo.createVirtualRegister(RC);
+    unsigned TmpDestReg = RegInfo.createVirtualRegister(RC);
+    unsigned Ptr1Reg;
+    unsigned TmpReg = RegInfo.createVirtualRegister(RC);
+    //  thisMBB:
+    //   ...
+    //   fallthrough --> loopMBB
+    BB->addSuccessor(loop1MBB);
+
+    // The 4-byte load must be aligned, while a char or short may be
+    // anywhere in the word.  Hence all this nasty bookkeeping code.
+    //   add ptr1, ptrA, ptrB [copy if ptrA==0]
+    //   rlwinm shift1, ptr1, 3, 27, 28 [3, 27, 27]
+    //   xori shift, shift1, 24 [16]
+    //   rlwinm ptr, ptr1, 0, 0, 29
+    //   slw newval2, newval, shift
+    //   slw oldval2, oldval,shift
+    //   li mask2, 255 [li mask3, 0; ori mask2, mask3, 65535]
+    //   slw mask, mask2, shift
+    //   and newval3, newval2, mask
+    //   and oldval3, oldval2, mask
+    // loop1MBB:
+    //   lwarx tmpDest, ptr
+    //   and tmp, tmpDest, mask
+    //   cmpw tmp, oldval3
+    //   bne- midMBB
+    // loop2MBB:
+    //   andc tmp2, tmpDest, mask
+    //   or tmp4, tmp2, newval3
+    //   stwcx. tmp4, ptr
+    //   bne- loop1MBB
+    //   b exitBB
+    // midMBB:
+    //   stwcx. tmpDest, ptr
+    // exitBB:
+    //   srw dest, tmpDest, shift
+    if (ptrA!=PPC::R0) {
+      Ptr1Reg = RegInfo.createVirtualRegister(RC);
+      BuildMI(BB, dl, TII->get(is64bit ? PPC::ADD8 : PPC::ADD4), Ptr1Reg)
+        .addReg(ptrA).addReg(ptrB);
+    } else {
+      Ptr1Reg = ptrB;
+    }
+    BuildMI(BB, dl, TII->get(PPC::RLWINM), Shift1Reg).addReg(Ptr1Reg)
+        .addImm(3).addImm(27).addImm(is8bit ? 28 : 27);
+    BuildMI(BB, dl, TII->get(is64bit ? PPC::XORI8 : PPC::XORI), ShiftReg)
+        .addReg(Shift1Reg).addImm(is8bit ? 24 : 16);
+    if (is64bit)
+      BuildMI(BB, dl, TII->get(PPC::RLDICR), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(61);
+    else
+      BuildMI(BB, dl, TII->get(PPC::RLWINM), PtrReg)
+        .addReg(Ptr1Reg).addImm(0).addImm(0).addImm(29);
+    BuildMI(BB, dl, TII->get(PPC::SLW), NewVal2Reg)
+        .addReg(newval).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::SLW), OldVal2Reg)
+        .addReg(oldval).addReg(ShiftReg);
+    if (is8bit)
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask2Reg).addImm(255);
+    else {
+      BuildMI(BB, dl, TII->get(PPC::LI), Mask3Reg).addImm(0);
+      BuildMI(BB, dl, TII->get(PPC::ORI), Mask2Reg)
+        .addReg(Mask3Reg).addImm(65535);
+    }
+    BuildMI(BB, dl, TII->get(PPC::SLW), MaskReg)
+        .addReg(Mask2Reg).addReg(ShiftReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), NewVal3Reg)
+        .addReg(NewVal2Reg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::AND), OldVal3Reg)
+        .addReg(OldVal2Reg).addReg(MaskReg);
+
+    BB = loop1MBB;
+    BuildMI(BB, dl, TII->get(PPC::LWARX), TmpDestReg)
+        .addReg(PPC::R0).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::AND),TmpReg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::CMPW), PPC::CR0)
+        .addReg(TmpReg).addReg(OldVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+        .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(midMBB);
+    BB->addSuccessor(loop2MBB);
+    BB->addSuccessor(midMBB);
+
+    BB = loop2MBB;
+    BuildMI(BB, dl, TII->get(PPC::ANDC),Tmp2Reg)
+        .addReg(TmpDestReg).addReg(MaskReg);
+    BuildMI(BB, dl, TII->get(PPC::OR),Tmp4Reg)
+        .addReg(Tmp2Reg).addReg(NewVal3Reg);
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(Tmp4Reg)
+        .addReg(PPC::R0).addReg(PtrReg);
+    BuildMI(BB, dl, TII->get(PPC::BCC))
+      .addImm(PPC::PRED_NE).addReg(PPC::CR0).addMBB(loop1MBB);
+    BuildMI(BB, dl, TII->get(PPC::B)).addMBB(exitMBB);
+    BB->addSuccessor(loop1MBB);
+    BB->addSuccessor(exitMBB);
+
+    BB = midMBB;
+    BuildMI(BB, dl, TII->get(PPC::STWCX)).addReg(TmpDestReg)
+      .addReg(PPC::R0).addReg(PtrReg);
+    BB->addSuccessor(exitMBB);
+
+    //  exitMBB:
+    //   ...
+    BB = exitMBB;
+    BuildMI(BB, dl, TII->get(PPC::SRW),dest).addReg(TmpReg).addReg(ShiftReg);
+  } else {
+    llvm_unreachable("Unexpected instr type to insert");
+  }
+
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
/// PerformDAGCombine - Target-specific DAG combines for PowerPC: constant
/// folding of PPC shift nodes, FP<->int conversion fusion (fctidz/fcfid,
/// stfiwx), byte-swap load/store fusion (lbrx/stbrx), CSE of vector
/// compares onto their record-form (VCMPo) twins, and lowering of branches
/// on AltiVec predicate compares directly onto CR6.  Returns the
/// replacement value, or an empty SDValue to signal no combine.
SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  TargetMachine &TM = getTargetMachine();
  SelectionDAG &DAG = DCI.DAG;
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default: break;
  // For the PPC-specific shift nodes, a zero shiftee folds to zero
  // (and for SRA, all-ones stays all-ones).
  case PPCISD::SHL:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->getZExtValue() == 0)   // 0 << V -> 0.
        return N->getOperand(0);
    }
    break;
  case PPCISD::SRL:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->getZExtValue() == 0)   // 0 >>u V -> 0.
        return N->getOperand(0);
    }
    break;
  case PPCISD::SRA:
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      if (C->getZExtValue() == 0 ||   //  0 >>s V -> 0.
          C->isAllOnesValue())    // -1 >>s V -> -1.
        return N->getOperand(0);
    }
    break;

  case ISD::SINT_TO_FP:
    if (TM.getSubtarget<PPCSubtarget>().has64BitSupport()) {
      if (N->getOperand(0).getOpcode() == ISD::FP_TO_SINT) {
        // Turn (sint_to_fp (fp_to_sint X)) -> fctidz/fcfid without load/stores.
        // We allow the src/dst to be either f32/f64, but the intermediate
        // type must be i64.
        if (N->getOperand(0).getValueType() == MVT::i64 &&
            N->getOperand(0).getOperand(0).getValueType() != MVT::ppcf128) {
          SDValue Val = N->getOperand(0).getOperand(0);
          // fctidz/fcfid operate on f64; extend an f32 source first.
          if (Val.getValueType() == MVT::f32) {
            Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
            DCI.AddToWorklist(Val.getNode());
          }

          Val = DAG.getNode(PPCISD::FCTIDZ, dl, MVT::f64, Val);
          DCI.AddToWorklist(Val.getNode());
          Val = DAG.getNode(PPCISD::FCFID, dl, MVT::f64, Val);
          DCI.AddToWorklist(Val.getNode());
          // ...and round back down if the destination is f32.
          if (N->getValueType(0) == MVT::f32) {
            Val = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Val,
                              DAG.getIntPtrConstant(0));
            DCI.AddToWorklist(Val.getNode());
          }
          return Val;
        } else if (N->getOperand(0).getValueType() == MVT::i32) {
          // If the intermediate type is i32, we can avoid the load/store here
          // too.
        }
      }
    }
    break;
  case ISD::STORE:
    // Turn STORE (FP_TO_SINT F) -> STFIWX(FCTIWZ(F)).
    if (TM.getSubtarget<PPCSubtarget>().hasSTFIWX() &&
        !cast<StoreSDNode>(N)->isTruncatingStore() &&
        N->getOperand(1).getOpcode() == ISD::FP_TO_SINT &&
        N->getOperand(1).getValueType() == MVT::i32 &&
        N->getOperand(1).getOperand(0).getValueType() != MVT::ppcf128) {
      SDValue Val = N->getOperand(1).getOperand(0);
      // fctiwz operates on f64; extend an f32 source first.
      if (Val.getValueType() == MVT::f32) {
        Val = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, Val);
        DCI.AddToWorklist(Val.getNode());
      }
      Val = DAG.getNode(PPCISD::FCTIWZ, dl, MVT::f64, Val);
      DCI.AddToWorklist(Val.getNode());

      // Store the converted value with stfiwx (chain, value, ptr, offset).
      Val = DAG.getNode(PPCISD::STFIWX, dl, MVT::Other, N->getOperand(0), Val,
                        N->getOperand(2), N->getOperand(3));
      DCI.AddToWorklist(Val.getNode());
      return Val;
    }

    // Turn STORE (BSWAP) -> sthbrx/stwbrx.
    if (cast<StoreSDNode>(N)->isUnindexed() &&
        N->getOperand(1).getOpcode() == ISD::BSWAP &&
        N->getOperand(1).getNode()->hasOneUse() &&
        (N->getOperand(1).getValueType() == MVT::i32 ||
         N->getOperand(1).getValueType() == MVT::i16)) {
      SDValue BSwapOp = N->getOperand(1).getOperand(0);
      // Do an any-extend to 32-bits if this is a half-word input.
      if (BSwapOp.getValueType() == MVT::i16)
        BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);

      // STBRX operands: chain, value, ptr, original value type (so the
      // right byte-reversed store width is selected).
      SDValue Ops[] = {
        N->getOperand(0), BSwapOp, N->getOperand(2),
        DAG.getValueType(N->getOperand(1).getValueType())
      };
      return
        DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
                                Ops, array_lengthof(Ops),
                                cast<StoreSDNode>(N)->getMemoryVT(),
                                cast<StoreSDNode>(N)->getMemOperand());
    }
    break;
  case ISD::BSWAP:
    // Turn BSWAP (LOAD) -> lhbrx/lwbrx.
    if (ISD::isNON_EXTLoad(N->getOperand(0).getNode()) &&
        N->getOperand(0).hasOneUse() &&
        (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i16)) {
      SDValue Load = N->getOperand(0);
      LoadSDNode *LD = cast<LoadSDNode>(Load);
      // Create the byte-swapping load.
      SDValue Ops[] = {
        LD->getChain(),    // Chain
        LD->getBasePtr(),  // Ptr
        DAG.getValueType(N->getValueType(0)) // VT
      };
      SDValue BSLoad =
        DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
                                DAG.getVTList(MVT::i32, MVT::Other), Ops, 3,
                                LD->getMemoryVT(), LD->getMemOperand());

      // If this is an i16 load, insert the truncate.
      SDValue ResVal = BSLoad;
      if (N->getValueType(0) == MVT::i16)
        ResVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, BSLoad);

      // First, combine the bswap away.  This makes the value produced by the
      // load dead.
      DCI.CombineTo(N, ResVal);

      // Next, combine the load away, we give it a bogus result value but a real
      // chain result.  The result value is dead because the bswap is dead.
      DCI.CombineTo(Load.getNode(), ResVal, BSLoad.getValue(1));

      // Return N so it doesn't get rechecked!
      return SDValue(N, 0);
    }

    break;
  case PPCISD::VCMP: {
    // If a VCMPo node already exists with exactly the same operands as this
    // node, use its result instead of this node (VCMPo computes both a CR6 and
    // a normal output).
    //
    if (!N->getOperand(0).hasOneUse() &&
        !N->getOperand(1).hasOneUse() &&
        !N->getOperand(2).hasOneUse()) {

      // Scan all of the users of the LHS, looking for VCMPo's that match.
      SDNode *VCMPoNode = 0;

      SDNode *LHSN = N->getOperand(0).getNode();
      for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
           UI != E; ++UI)
        if (UI->getOpcode() == PPCISD::VCMPo &&
            UI->getOperand(1) == N->getOperand(1) &&
            UI->getOperand(2) == N->getOperand(2) &&
            UI->getOperand(0) == N->getOperand(0)) {
          VCMPoNode = *UI;
          break;
        }

      // If there is no VCMPo node, or if the flag value has a single use, don't
      // transform this.
      // NOTE(review): hasNUsesOfValue(0, 1) checks that the flag result
      // (value #1) has *zero* uses, which disagrees with the "single use"
      // wording above -- confirm the intended condition.
      if (!VCMPoNode || VCMPoNode->hasNUsesOfValue(0, 1))
        break;

      // Look at the (necessarily single) use of the flag value.  If it has a
      // chain, this transformation is more complex.  Note that multiple things
      // could use the value result, which we should ignore.
      SDNode *FlagUser = 0;
      for (SDNode::use_iterator UI = VCMPoNode->use_begin();
           FlagUser == 0; ++UI) {
        assert(UI != VCMPoNode->use_end() && "Didn't find user!");
        SDNode *User = *UI;
        for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
          if (User->getOperand(i) == SDValue(VCMPoNode, 1)) {
            FlagUser = User;
            break;
          }
        }
      }

      // If the user is a MFCR instruction, we know this is safe.  Otherwise we
      // give up for right now.
      if (FlagUser->getOpcode() == PPCISD::MFCR)
        return SDValue(VCMPoNode, 0);
    }
    break;
  }
  case ISD::BR_CC: {
    // If this is a branch on an altivec predicate comparison, lower this so
    // that we don't have to do a MFCR: instead, branch directly on CR6.  This
    // lowering is done pre-legalize, because the legalizer lowers the predicate
    // compare down to code that is difficult to reassemble.
    ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
    SDValue LHS = N->getOperand(2), RHS = N->getOperand(3);
    int CompareOpc;
    bool isDot;

    if (LHS.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
        isa<ConstantSDNode>(RHS) && (CC == ISD::SETEQ || CC == ISD::SETNE) &&
        getAltivecCompareInfo(LHS, CompareOpc, isDot)) {
      assert(isDot && "Can't compare against a vector result!");

      // If this is a comparison against something other than 0/1, then we know
      // that the condition is never/always true.
      unsigned Val = cast<ConstantSDNode>(RHS)->getZExtValue();
      if (Val != 0 && Val != 1) {
        if (CC == ISD::SETEQ)      // Cond never true, remove branch.
          return N->getOperand(0);
        // Always !=, turn it into an unconditional branch.
        return DAG.getNode(ISD::BR, dl, MVT::Other,
                           N->getOperand(0), N->getOperand(4));
      }

      bool BranchOnWhenPredTrue = (CC == ISD::SETEQ) ^ (Val == 0);

      // Create the PPCISD altivec 'dot' comparison node.
      std::vector<EVT> VTs;
      SDValue Ops[] = {
        LHS.getOperand(2),  // LHS of compare
        LHS.getOperand(3),  // RHS of compare
        DAG.getConstant(CompareOpc, MVT::i32)
      };
      VTs.push_back(LHS.getOperand(2).getValueType());
      VTs.push_back(MVT::Flag);
      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);

      // Unpack the result based on how the target uses it.
      // The intrinsic's predicate-kind operand (operand 1) selects which
      // CR6 bit, possibly inverted, the branch should test.
      PPC::Predicate CompOpc;
      switch (cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue()) {
      default:  // Can't happen, don't crash on invalid number though.
      case 0:   // Branch on the value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_EQ : PPC::PRED_NE;
        break;
      case 1:   // Branch on the inverted value of the EQ bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_NE : PPC::PRED_EQ;
        break;
      case 2:   // Branch on the value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_LT : PPC::PRED_GE;
        break;
      case 3:   // Branch on the inverted value of the LT bit of CR6.
        CompOpc = BranchOnWhenPredTrue ? PPC::PRED_GE : PPC::PRED_LT;
        break;
      }

      return DAG.getNode(PPCISD::COND_BRANCH, dl, MVT::Other, N->getOperand(0),
                         DAG.getConstant(CompOpc, MVT::i32),
                         DAG.getRegister(PPC::CR6, MVT::i32),
                         N->getOperand(4), CompNode.getValue(1));
    }
    break;
  }
  }

  return SDValue();
}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                       const APInt &Mask,
+                                                       APInt &KnownZero,
+                                                       APInt &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);
+  switch (Op.getOpcode()) {
+  default: break;
+  case PPCISD::LBRX: {
+    // lhbrx is known to have the top bits cleared out.
+    if (cast<VTSDNode>(Op.getOperand(2))->getVT() == MVT::i16)
+      KnownZero = 0xFFFF0000;
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
+    default: break;
+    case Intrinsic::ppc_altivec_vcmpbfp_p:
+    case Intrinsic::ppc_altivec_vcmpeqfp_p:
+    case Intrinsic::ppc_altivec_vcmpequb_p:
+    case Intrinsic::ppc_altivec_vcmpequh_p:
+    case Intrinsic::ppc_altivec_vcmpequw_p:
+    case Intrinsic::ppc_altivec_vcmpgefp_p:
+    case Intrinsic::ppc_altivec_vcmpgtfp_p:
+    case Intrinsic::ppc_altivec_vcmpgtsb_p:
+    case Intrinsic::ppc_altivec_vcmpgtsh_p:
+    case Intrinsic::ppc_altivec_vcmpgtsw_p:
+    case Intrinsic::ppc_altivec_vcmpgtub_p:
+    case Intrinsic::ppc_altivec_vcmpgtuh_p:
+    case Intrinsic::ppc_altivec_vcmpgtuw_p:
+      KnownZero = ~1U;  // All bits but the low one are known to be zero.
+      break;
+    }
+  }
+  }
+}
+
+
+/// getConstraintType - Classify an inline-asm constraint letter for this
+/// target.  The single-letter GCC RS6000 register constraints ('b', 'r',
+/// 'f', 'v', 'y') are register-class constraints; everything else is
+/// delegated to the generic TargetLowering implementation.
+PPCTargetLowering::ConstraintType
+PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'b':
+    case 'r':
+    case 'f':
+    case 'v':
+    case 'y':
+      return C_RegisterClass;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Map a single-letter register constraint to
+/// the PPC register class to allocate from, taking the operand type into
+/// account (32- vs 64-bit GPRs, single- vs double-precision FPRs).
+std::pair<unsigned, const TargetRegisterClass*>
+PPCTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                EVT VT) const {
+  if (Constraint.size() == 1) {
+    // GCC RS6000 Constraint Letters
+    switch (Constraint[0]) {
+    case 'b':   // R1-R31
+    case 'r':   // R0-R31
+      // NOTE(review): 'b' should exclude R0, but both letters currently map
+      // to the full GPR class -- confirm whether a base-register class exists.
+      if (VT == MVT::i64 && PPCSubTarget.isPPC64())
+        return std::make_pair(0U, PPC::G8RCRegisterClass);
+      return std::make_pair(0U, PPC::GPRCRegisterClass);
+    case 'f':
+      // Floating-point registers: F4RC for f32, F8RC for f64.
+      if (VT == MVT::f32)
+        return std::make_pair(0U, PPC::F4RCRegisterClass);
+      else if (VT == MVT::f64)
+        return std::make_pair(0U, PPC::F8RCRegisterClass);
+      break;
+    case 'v':   // Altivec vector register.
+      return std::make_pair(0U, PPC::VRRCRegisterClass);
+    case 'y':   // crrc
+      return std::make_pair(0U, PPC::CRRCRegisterClass);
+    }
+  }
+
+  // Unrecognized letters (or unsupported types) fall back to the generic
+  // handling.
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops. If hasMemory is true
+/// it means one of the asm constraint of the inline asm instruction being
+/// processed is 'm'.  The letters handled here are the GCC RS6000 immediate
+/// constraints; anything else is delegated to the generic implementation.
+void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op, char Letter,
+                                                     bool hasMemory,
+                                                     std::vector<SDValue>&Ops,
+                                                     SelectionDAG &DAG) const {
+  // Result stays null unless the operand satisfies the constraint.
+  SDValue Result(0,0);
+  switch (Letter) {
+  default: break;
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+  case 'O':
+  case 'P': {
+    ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op);
+    if (!CST) return; // Must be an immediate to match.
+    unsigned Value = CST->getZExtValue();
+    switch (Letter) {
+    default: llvm_unreachable("Unknown constraint letter!");
+    case 'I':  // "I" is a signed 16-bit constant.
+      // True iff Value round-trips through a 16-bit sign extension.
+      if ((short)Value == (int)Value)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'J':  // "J" is a constant with only the high-order 16 bits nonzero.
+    case 'L':  // "L" is a signed 16-bit constant shifted left 16 bits.
+      // Both require the low 16 bits to be zero.
+      if ((short)Value == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'K':  // "K" is a constant with only the low-order 16 bits nonzero.
+      if ((Value >> 16) == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'M':  // "M" is a constant that is greater than 31.
+      if (Value > 31)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'N':  // "N" is a positive constant that is an exact power of two.
+      if ((int)Value > 0 && isPowerOf2_32(Value))
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'O':  // "O" is the constant zero.
+      if (Value == 0)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    case 'P':  // "P" is a constant whose negation is a signed 16-bit constant.
+      // NOTE(review): Value is unsigned, so -Value relies on modular
+      // wraparound before the casts -- confirm this matches GCC's 'P'.
+      if ((short)-Value == (int)-Value)
+        Result = DAG.getTargetConstant(Value, Op.getValueType());
+      break;
+    }
+    break;
+  }
+  }
+
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  // Handle standard constraint letters.
+  TargetLowering::LowerAsmOperandForConstraint(Op, Letter, hasMemory, Ops, DAG);
+}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool PPCTargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              const Type *Ty) const {
+  // FIXME: PPC does not allow r+i addressing modes for vectors!
+
+  // PPC D-form accesses take a sign-extended 16-bit displacement, i.e.
+  // [-32768, 32767].  The previous bound of 1LL << 16 accepted offsets in
+  // [-65535, 65534] that cannot be encoded in the instruction.
+  if (AM.BaseOffs < -(1LL << 15) || AM.BaseOffs > (1LL << 15) - 1)
+    return false;
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // PPC only supports r+r indexed forms, so scaled modes are legal only when
+  // they can be rewritten as r+r.
+  switch (AM.Scale) {
+  case 0:  // "r+i" or just "i", depending on HasBaseReg.
+    break;
+  case 1:
+    if (AM.HasBaseReg && AM.BaseOffs)  // "r+r+i" is not allowed.
+      return false;
+    // Otherwise we have r+r or r+i.
+    break;
+  case 2:
+    if (AM.HasBaseReg || AM.BaseOffs)  // 2*r+r  or  2*r+i is not allowed.
+      return false;
+    // Allow 2*r as r+r.
+    break;
+  default:
+    // No other scales are supported.
+    return false;
+  }
+
+  return true;
+}
+
+/// isLegalAddressImmediate - Return true if the integer value can be used
+/// as the offset of the target addressing mode for load / store of the
+/// given type.
+bool PPCTargetLowering::isLegalAddressImmediate(int64_t V,const Type *Ty) const{
+  // PPC D-form accesses use a sign-extended 16-bit displacement, i.e.
+  // [-32768, 32767].  The previous check against 1 << 16 wrongly accepted
+  // values up to 65534 that cannot be encoded.
+  return V >= -(1LL << 15) && V <= (1LL << 15) - 1;
+}
+
+/// isLegalAddressImmediate - Return true if the GlobalValue can be used as
+/// the offset of the target addressing mode.  PPC addressing modes never
+/// take a global directly, so this is always false.
+bool PPCTargetLowering::isLegalAddressImmediate(llvm::GlobalValue* GV) const {
+  return false;
+}
+
+/// LowerRETURNADDR - Lower ISD::RETURNADDR by loading the saved link
+/// register from its stack slot.  Only depth 0 is supported; deeper frames
+/// return a null SDValue.
+SDValue PPCTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+    return SDValue();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+
+  // Just load the return address off the stack.
+  SDValue RetAddrFI = getReturnAddrFrameIndex(DAG);
+
+  // Make sure the function really does not optimize away the store of the RA
+  // to the stack.
+  FuncInfo->setLRStoreRequired();
+  return DAG.getLoad(getPointerTy(), dl,
+                     DAG.getEntryNode(), RetAddrFI, NULL, 0);
+}
+
+/// LowerFRAMEADDR - Lower ISD::FRAMEADDR by copying from the frame pointer
+/// register (R31/X31) when one is in use, otherwise from the stack pointer
+/// (R1/X1).  Only depth 0 is supported; deeper frames return a null SDValue.
+SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Depths > 0 not supported yet!
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() > 0)
+    return SDValue();
+
+  EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
+  bool isPPC64 = PtrVT == MVT::i64;
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  // R31/X31 holds the frame pointer when FP elimination is off or the frame
+  // has variable-sized objects, and the frame is non-empty.
+  // NOTE(review): this must stay in sync with the frame-lowering code's
+  // notion of "has a frame pointer" -- confirm against PPCRegisterInfo.
+  bool is31 = (NoFramePointerElim || MFI->hasVarSizedObjects())
+                  && MFI->getStackSize();
+
+  if (isPPC64)
+    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, is31 ? PPC::X31 : PPC::X1,
+      MVT::i64);
+  else
+    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, is31 ? PPC::R31 : PPC::R1,
+      MVT::i32);
+}
+
+/// isOffsetFoldingLegal - Return true if folding a constant offset into a
+/// global address is legal.
+bool
+PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The PowerPC target isn't yet aware of offsets.
+  return false;
+}
+
+/// getOptimalMemOpType - Return the preferred scalar type for expanding
+/// memcpy/memset-style operations: the native GPR width of the subtarget.
+EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
+                                           bool isSrcConst, bool isSrcStr,
+                                           SelectionDAG &DAG) const {
+  return PPCSubTarget.isPPC64() ? MVT::i64 : MVT::i32;
+}
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
new file mode 100644
index 0000000..9c390ac
--- /dev/null
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -0,0 +1,476 @@
+//===-- PPCISelLowering.h - PPC32 DAG Lowering Interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that PPC uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+#define LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "PPC.h"
+#include "PPCSubtarget.h"
+
+namespace llvm {
+  // PPCISD - PPC-specific DAG node opcodes.  Enumerators from STD_32 onward
+  // are memory opcodes (they start at ISD::FIRST_TARGET_MEMORY_OPCODE).
+  namespace PPCISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// FSEL - Traditional three-operand fsel node.
+      ///
+      FSEL,
+
+      /// FCFID - The FCFID instruction, taking an f64 operand and producing
+      /// an f64 value containing the FP representation of the integer that
+      /// was temporarily in the f64 operand.
+      FCFID,
+
+      /// FCTI[D,W]Z - The FCTIDZ and FCTIWZ instructions, taking an f32 or f64
+      /// operand, producing an f64 value containing the integer representation
+      /// of that FP value.
+      FCTIDZ, FCTIWZ,
+
+      /// STFIWX - The STFIWX instruction.  The first operand is an input token
+      /// chain, then an f64 value to store, then an address to store it to.
+      STFIWX,
+
+      // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
+      // three v4f32 operands and producing a v4f32 result.
+      VMADDFP, VNMSUBFP,
+
+      /// VPERM - The PPC VPERM Instruction.
+      ///
+      VPERM,
+
+      /// Hi/Lo - These represent the high and low 16-bit parts of a global
+      /// address respectively.  These nodes have two operands, the first of
+      /// which must be a TargetGlobalAddress, and the second of which must be a
+      /// Constant.  Selected naively, these turn into 'lis G+C' and 'li G+C',
+      /// though these are usually folded into other nodes.
+      Hi, Lo,
+
+      /// TOC_ENTRY - A symbol's entry in the TOC (64-bit SVR4 ABI).
+      // NOTE(review): undocumented in the original; description inferred from
+      // the name and the surrounding 64-bit SVR4 nodes -- confirm against the
+      // instruction selector.
+      TOC_ENTRY,
+
+      /// The following three target-specific nodes are used for calls through
+      /// function pointers in the 64-bit SVR4 ABI.
+
+      /// Restore the TOC from the TOC save area of the current stack frame.
+      /// This is basically a hard coded load instruction which additionally
+      /// takes/produces a flag.
+      TOC_RESTORE,
+
+      /// Like a regular LOAD but additionally taking/producing a flag.
+      LOAD,
+
+      /// LOAD into r2 (also taking/producing a flag). Like TOC_RESTORE, this is
+      /// a hard coded load instruction.
+      LOAD_TOC,
+
+      /// OPRC, CHAIN = DYNALLOC(CHAIN, NEGSIZE, FRAME_INDEX)
+      /// This instruction is lowered in PPCRegisterInfo::eliminateFrameIndex to
+      /// compute an allocation on the stack.
+      DYNALLOC,
+
+      /// GlobalBaseReg - On Darwin, this node represents the result of the mflr
+      /// at function entry, used for PIC code.
+      GlobalBaseReg,
+
+      /// These nodes represent the 32-bit PPC shifts that operate on 6-bit
+      /// shift amounts.  These nodes are generated by the multi-precision shift
+      /// code.
+      SRL, SRA, SHL,
+
+      /// EXTSW_32 - This is the EXTSW instruction for use with "32-bit"
+      /// registers.
+      EXTSW_32,
+
+      /// CALL - A direct function call.
+      CALL_Darwin, CALL_SVR4,
+
+      /// NOP - Special NOP which follows 64-bit SVR4 calls.
+      NOP,
+
+      /// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
+      /// MTCTR instruction.
+      MTCTR,
+
+      /// CHAIN,FLAG = BCTRL(CHAIN, INFLAG) - Directly corresponds to a
+      /// BCTRL instruction.
+      BCTRL_Darwin, BCTRL_SVR4,
+
+      /// Return with a flag operand, matched by 'blr'
+      RET_FLAG,
+
+      /// R32 = MFCR(CRREG, INFLAG) - Represents the MFCR/MFOCRF instructions.
+      /// This copies the bits corresponding to the specified CRREG into the
+      /// resultant GPR.  Bits corresponding to other CR regs are undefined.
+      MFCR,
+
+      /// RESVEC = VCMP(LHS, RHS, OPC) - Represents one of the altivec VCMP*
+      /// instructions.  For lack of better number, we use the opcode number
+      /// encoding for the OPC field to identify the compare.  For example, 838
+      /// is VCMPGTSH.
+      VCMP,
+
+      /// RESVEC, OUTFLAG = VCMPo(LHS, RHS, OPC) - Represents one of the
+      /// altivec VCMP*o instructions.  For lack of better number, we use the
+      /// opcode number encoding for the OPC field to identify the compare.  For
+      /// example, 838 is VCMPGTSH.
+      VCMPo,
+
+      /// CHAIN = COND_BRANCH CHAIN, CRRC, OPC, DESTBB [, INFLAG] - This
+      /// corresponds to the COND_BRANCH pseudo instruction.  CRRC is the
+      /// condition register to branch on, OPC is the branch opcode to use (e.g.
+      /// PPC::BLE), DESTBB is the destination block to branch to, and INFLAG is
+      /// an optional input flag argument.
+      COND_BRANCH,
+
+      // The following 5 instructions are used only as part of the
+      // long double-to-int conversion sequence.
+
+      /// OUTFLAG = MFFS F8RC - This moves the FPSCR (not modelled) into the
+      /// register.
+      MFFS,
+
+      /// OUTFLAG = MTFSB0 INFLAG - This clears a bit in the FPSCR.
+      MTFSB0,
+
+      /// OUTFLAG = MTFSB1 INFLAG - This sets a bit in the FPSCR.
+      MTFSB1,
+
+      /// F8RC, OUTFLAG = FADDRTZ F8RC, F8RC, INFLAG - This is an FADD done with
+      /// rounding towards zero.  It has flags added so it won't move past the
+      /// FPSCR-setting instructions.
+      FADDRTZ,
+
+      /// MTFSF = F8RC, INFLAG - This moves the register into the FPSCR.
+      MTFSF,
+
+      /// LARX = This corresponds to PPC l{w|d}arx instruction: load and
+      /// reserve indexed. This is used to implement atomic operations.
+      LARX,
+
+      /// STCX = This corresponds to PPC stcx. instruction: store conditional
+      /// indexed. This is used to implement atomic operations.
+      STCX,
+
+      /// TC_RETURN - A tail call return.
+      ///   operand #0 chain
+      ///   operand #1 callee (register or absolute)
+      ///   operand #2 stack adjustment
+      ///   operand #3 optional in flag
+      TC_RETURN,
+
+      /// STD_32 - This is the STD instruction for use with "32-bit" registers.
+      STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE,
+
+      /// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
+      /// byte-swapping store instruction.  It byte-swaps the low "Type" bits of
+      /// the GPRC input, then stores it through Ptr.  Type can be either i16 or
+      /// i32.
+      STBRX,
+
+      /// GPRC, CHAIN = LBRX CHAIN, Ptr, Type - This is a
+      /// byte-swapping load instruction.  It loads "Type" bits, byte swaps it,
+      /// then puts it in the bottom bits of the GPRC.  TYPE can be either i16
+      /// or i32.
+      LBRX
+    };
+  }
+
+  /// Define some predicates that are used for node matching.  These inspect
+  /// shuffle masks and build_vector constants during instruction selection.
+  namespace PPC {
+    /// isVPKUHUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUHUM instruction.
+    bool isVPKUHUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+
+    /// isVPKUWUMShuffleMask - Return true if this is the shuffle mask for a
+    /// VPKUWUM instruction.
+    bool isVPKUWUMShuffleMask(ShuffleVectorSDNode *N, bool isUnary);
+
+    /// isVMRGLShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VMRGL* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGLShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            bool isUnary);
+
+    /// isVMRGHShuffleMask - Return true if this is a shuffle mask suitable for
+    /// a VMRGH* instruction with the specified unit size (1,2 or 4 bytes).
+    bool isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
+                            bool isUnary);
+
+    /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the shift
+    /// amount, otherwise return -1.
+    int isVSLDOIShuffleMask(SDNode *N, bool isUnary);
+
+    /// isSplatShuffleMask - Return true if the specified VECTOR_SHUFFLE operand
+    /// specifies a splat of a single element that is suitable for input to
+    /// VSPLTB/VSPLTH/VSPLTW.
+    bool isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize);
+
+    /// isAllNegativeZeroVector - Returns true if all elements of build_vector
+    /// are -0.0.
+    bool isAllNegativeZeroVector(SDNode *N);
+
+    /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the
+    /// specified isSplatShuffleMask VECTOR_SHUFFLE mask.
+    unsigned getVSPLTImmediate(SDNode *N, unsigned EltSize);
+
+    /// get_VSPLTI_elt - If this is a build_vector of constants which can be
+    /// formed by using a vspltis[bhw] instruction of the specified element
+    /// size, return the constant being splatted.  The ByteSize field indicates
+    /// the number of bytes of each element [124] -> [bhw].
+    SDValue get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG);
+  }
+  
+  /// PPCTargetLowering - PPC implementation of the TargetLowering interface:
+  /// custom DAG lowering, inline-asm constraint handling, addressing-mode
+  /// legality, and call/return lowering for the Darwin and SVR4 ABIs.
+  class PPCTargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    int VarArgsStackOffset;           // StackOffset for start of stack
+                                      // arguments.
+    unsigned VarArgsNumGPR;           // Index of the first unused integer
+                                      // register for parameter passing.
+    unsigned VarArgsNumFPR;           // Index of the first unused double
+                                      // register for parameter passing.
+    const PPCSubtarget &PPCSubTarget; // Subtarget queried for PPC64, etc.
+  public:
+    explicit PPCTargetLowering(PPCTargetMachine &TM);
+
+    /// getTargetNodeName() - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
+    /// getPreIndexedAddressParts - returns true by value, base pointer and
+    /// offset pointer and addressing mode by reference if the node's address
+    /// can be legally represented as pre-indexed load / store address.
+    virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                           SDValue &Offset,
+                                           ISD::MemIndexedMode &AM,
+                                           SelectionDAG &DAG) const;
+
+    /// SelectAddressRegReg - Given the specified address, check to see if it
+    /// can be represented as an indexed [r+r] operation.  Returns false if it
+    /// can be more efficiently represented with [r+imm].
+    bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
+                             SelectionDAG &DAG) const;
+
+    /// SelectAddressRegImm - Returns true if the address N can be represented
+    /// by a base register plus a signed 16-bit displacement [r+imm], and if it
+    /// is not better represented as reg+reg.
+    bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
+                             SelectionDAG &DAG) const;
+
+    /// SelectAddressRegRegOnly - Given the specified address, force it to be
+    /// represented as an indexed [r+r] operation.
+    bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
+                                 SelectionDAG &DAG) const;
+
+    /// SelectAddressRegImmShift - Returns true if the address N can be
+    /// represented by a base register plus a signed 14-bit displacement
+    /// [r+imm*4].  Suitable for use by STD and friends.
+    bool SelectAddressRegImmShift(SDValue N, SDValue &Disp, SDValue &Base,
+                                  SelectionDAG &DAG) const;
+
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG);
+
+    /// PerformDAGCombine - Run target-specific DAG combines on PPC nodes.
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    /// computeMaskedBitsForTargetNode - Compute known-zero/known-one bits for
+    /// the target-specific nodes this backend understands.
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                const APInt &Mask,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    /// EmitInstrWithCustomInserter - Expand pseudo instructions that require
+    /// custom MachineBasicBlock insertion, using the atomic helpers below.
+    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                         MachineBasicBlock *MBB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+    // Expand a word/doubleword atomic read-modify-write pseudo.
+    MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
+                                        MachineBasicBlock *MBB, bool is64Bit,
+                                        unsigned BinOpcode) const;
+    // Expand a byte/halfword atomic read-modify-write pseudo.
+    MachineBasicBlock *EmitPartwordAtomicBinary(MachineInstr *MI,
+                                                MachineBasicBlock *MBB,
+                                            bool is8bit, unsigned Opcode) const;
+
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*>
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   EVT VT) const;
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area.  This is the actual
+    /// alignment, not its logarithm.
+    unsigned getByValTypeAlignment(const Type *Ty) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraint of the inline asm instruction
+    /// being processed is 'm'.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              char ConstraintLetter,
+                                              bool hasMemory,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+    /// isLegalAddressImmediate - Return true if the integer value can be used
+    /// as the offset of the target addressing mode for load / store of the
+    /// given type.
+    virtual bool isLegalAddressImmediate(int64_t V, const Type *Ty) const;
+
+    /// isLegalAddressImmediate - Return true if the GlobalValue can be used as
+    /// the offset of the target addressing mode.
+    virtual bool isLegalAddressImmediate(GlobalValue *GV) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// getOptimalMemOpType - Return the preferred type for lowering memory
+    /// block operations (memcpy/memset expansion).
+    virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align,
+                                    bool isSrcConst, bool isSrcStr,
+                                    SelectionDAG &DAG) const;
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+
+  private:
+    // Frame-index helpers used by RETURNADDR/FRAMEADDR and tail-call lowering.
+    SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
+    SDValue getReturnAddrFrameIndex(SelectionDAG & DAG) const;
+
+    bool
+    IsEligibleForTailCallOptimization(SDValue Callee,
+                                      CallingConv::ID CalleeCC,
+                                      bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      SelectionDAG& DAG) const;
+
+    SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG & DAG,
+                                         int SPDiff,
+                                         SDValue Chain,
+                                         SDValue &LROpOut,
+                                         SDValue &FPOpOut,
+                                         bool isDarwinABI,
+                                         DebugLoc dl);
+
+    // Per-node custom-lowering helpers dispatched from LowerOperation.
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                           int VarArgsFrameIndex, int VarArgsStackOffset,
+                           unsigned VarArgsNumGPR, unsigned VarArgsNumFPR,
+                           const PPCSubtarget &Subtarget);
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG, int VarArgsFrameIndex,
+                         int VarArgsStackOffset, unsigned VarArgsNumGPR,
+                         unsigned VarArgsNumFPR, const PPCSubtarget &Subtarget);
+    SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
+                                const PPCSubtarget &Subtarget);
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
+                                      const PPCSubtarget &Subtarget);
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, DebugLoc dl);
+    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerMUL(SDValue Op, SelectionDAG &DAG);
+
+    // Call lowering helpers shared by the ABI-specific paths below.
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+    SDValue FinishCall(CallingConv::ID CallConv, DebugLoc dl, bool isTailCall,
+                       bool isVarArg,
+                       SelectionDAG &DAG,
+                       SmallVector<std::pair<unsigned, SDValue>, 8>
+                         &RegsToPass,
+                       SDValue InFlag, SDValue Chain,
+                       SDValue &Callee,
+                       int SPDiff, unsigned NumBytes,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    // ABI-specific implementations dispatched from the virtual hooks above.
+    SDValue
+      LowerFormalArguments_Darwin(SDValue Chain,
+                                  CallingConv::ID CallConv, bool isVarArg,
+                                  const SmallVectorImpl<ISD::InputArg> &Ins,
+                                  DebugLoc dl, SelectionDAG &DAG,
+                                  SmallVectorImpl<SDValue> &InVals);
+    SDValue
+      LowerFormalArguments_SVR4(SDValue Chain,
+                                CallingConv::ID CallConv, bool isVarArg,
+                                const SmallVectorImpl<ISD::InputArg> &Ins,
+                                DebugLoc dl, SelectionDAG &DAG,
+                                SmallVectorImpl<SDValue> &InVals);
+
+    SDValue
+      LowerCall_Darwin(SDValue Chain, SDValue Callee,
+                       CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+                       const SmallVectorImpl<ISD::OutputArg> &Outs,
+                       const SmallVectorImpl<ISD::InputArg> &Ins,
+                       DebugLoc dl, SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals);
+    SDValue
+      LowerCall_SVR4(SDValue Chain, SDValue Callee,
+                     CallingConv::ID CallConv, bool isVarArg, bool isTailCall,
+                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                     DebugLoc dl, SelectionDAG &DAG,
+                     SmallVectorImpl<SDValue> &InVals);
+  };
+}
+
+#endif   // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
new file mode 100644
index 0000000..219efb9
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -0,0 +1,758 @@
+//===- PPCInstr64Bit.td - The PowerPC 64-bit Support -------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC 64-bit instructions.  These patterns are used
+// both when in ppc64 mode and when in "use 64-bit extensions in 32-bit" mode.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// 64-bit operands.
+//
+// 16-bit signed immediate used in an i64 context; shares the AsmPrinter
+// method with the 32-bit variant.
+def s16imm64 : Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+}
+// 16-bit unsigned immediate used in an i64 context.
+def u16imm64 : Operand<i64> {
+  let PrintMethod = "printU16ImmOperand";
+}
+// High/low 16-bit halves of a symbol address, printed via the same
+// methods as the 32-bit symbolHi/symbolLo operands.
+def symbolHi64 : Operand<i64> {
+  let PrintMethod = "printSymbolHi";
+}
+def symbolLo64 : Operand<i64> {
+  let PrintMethod = "printSymbolLo";
+}
+
+//===----------------------------------------------------------------------===//
+// 64-bit transformation functions.
+//
+
+// SHL64 - Convert a 64-bit shift-left amount SH into the RLDICR mask-end
+// value 63-SH (a left shift by SH keeps bits [0, 63-SH] of the rotate).
+def SHL64 : SDNodeXForm<imm, [{
+  // Transformation function: 63 - imm
+  return getI32Imm(63 - N->getZExtValue());
+}]>;
+
+// SRL64 - Convert a 64-bit shift-right amount SH into the equivalent
+// left-rotate amount 64-SH for RLDICL; SH==0 maps to 0 so we never
+// produce a rotate count of 64.
+def SRL64 : SDNodeXForm<imm, [{
+  // Transformation function: 64 - imm
+  return N->getZExtValue() ? getI32Imm(64 - N->getZExtValue()) : getI32Imm(0);
+}]>;
+
+// HI32_48 - Extract bits 32-47 of the immediate (third 16-bit chunk).
+def HI32_48 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 32));
+}]>;
+
+// HI48_64 - Extract bits 48-63 of the immediate (top 16-bit chunk).
+def HI48_64 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned short)(N->getZExtValue() >> 48));
+}]>;
+
+
+//===----------------------------------------------------------------------===//
+// Calls.
+//
+
+// MovePCtoLR8 - Pseudo emitting a "bl" whose only effect is to capture the
+// program counter into LR8 (presumably for PIC base setup -- TODO confirm
+// against the 32-bit MovePCtoLR in PPCInstrInfo.td).
+let Defs = [LR8] in
+  def MovePCtoLR8 : Pseudo<(outs), (ins piclabel:$label), "bl $label", []>,
+                    PPC970_Unit_BRU;
+
+// Darwin ABI Calls.
+let isCall = 1, PPC970_Unit = 7, 
+  // All calls clobber the PPC64 non-callee saved registers.
+  Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR8,CTR8,
+          CR0,CR1,CR5,CR6,CR7,CARRY] in {
+  // Convenient aliases for call instructions
+  // NOTE(review): RM (rounding mode) is listed as used, presumably to keep
+  // calls ordered with respect to FP-mode changes -- confirm against the
+  // 32-bit call definitions.
+  let Uses = [RM] in {
+    // Direct call; symbol targets are selected via the Pat rules below.
+    def BL8_Darwin  : IForm<18, 0, 1,
+                            (outs), (ins calltarget:$func, variable_ops), 
+                            "bl $func", BrB, []>;  // See Pat patterns below.
+    // Direct call to an absolute address (AA=1, LK=1).
+    def BLA8_Darwin : IForm<18, 1, 1,
+                          (outs), (ins aaddr:$func, variable_ops),
+                          "bla $func", BrB, [(PPCcall_Darwin (i64 imm:$func))]>;
+  }
+  // Indirect call through CTR8; 64-bit mode only.
+  let Uses = [CTR8, RM] in {
+    def BCTRL8_Darwin : XLForm_2_ext<19, 528, 20, 0, 1, 
+                                  (outs), (ins variable_ops),
+                                  "bctrl", BrB,
+                                  [(PPCbctrl_Darwin)]>, Requires<[In64BitMode]>;
+  }
+}
+
+// ELF 64 ABI Calls = Darwin ABI Calls
+// Used to define BL8_ELF and BLA8_ELF
+// Identical encodings/clobbers to the Darwin variants above; only the
+// selected call SDNode (PPCcall_SVR4 / PPCbctrl_SVR4) differs.
+let isCall = 1, PPC970_Unit = 7, 
+  // All calls clobber the PPC64 non-callee saved registers.
+  Defs = [X0,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR8,CTR8,
+          CR0,CR1,CR5,CR6,CR7,CARRY] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL8_ELF  : IForm<18, 0, 1,
+                         (outs), (ins calltarget:$func, variable_ops), 
+                         "bl $func", BrB, []>;  // See Pat patterns below.
+    def BLA8_ELF : IForm<18, 1, 1,
+                         (outs), (ins aaddr:$func, variable_ops),
+                         "bla $func", BrB, [(PPCcall_SVR4 (i64 imm:$func))]>;
+  }
+  // Indirect call through CTR8; 64-bit mode only.
+  let Uses = [CTR8, RM] in {
+    def BCTRL8_ELF : XLForm_2_ext<19, 528, 20, 0, 1,
+                               (outs), (ins variable_ops),
+                               "bctrl", BrB,
+                               [(PPCbctrl_SVR4)]>, Requires<[In64BitMode]>;
+  }
+}
+
+
+// Calls
+// Select direct calls to global/external symbols onto the relocatable
+// "bl" forms (absolute-immediate calls are handled by BLA8_* above).
+def : Pat<(PPCcall_Darwin (i64 tglobaladdr:$dst)),
+          (BL8_Darwin tglobaladdr:$dst)>;
+def : Pat<(PPCcall_Darwin (i64 texternalsym:$dst)),
+          (BL8_Darwin texternalsym:$dst)>;
+
+def : Pat<(PPCcall_SVR4 (i64 tglobaladdr:$dst)),
+          (BL8_ELF tglobaladdr:$dst)>;
+def : Pat<(PPCcall_SVR4 (i64 texternalsym:$dst)),
+          (BL8_ELF texternalsym:$dst)>;
+// PPCnop lowers to the plain NOP instruction (defined elsewhere).
+def : Pat<(PPCnop),
+          (NOP)>;
+
+// Atomic operations
+// These pseudos are expanded by EmitInstrWithCustomInserter (see
+// usesCustomInserter) -- presumably into ldarx/stdcx. retry loops; confirm
+// in PPCISelLowering.cpp.
+// NOTE(review): CR0 is listed in Uses, but the stdcx. in the expansion
+// *defines* CR0 -- this looks like it should be Defs; confirm.
+let usesCustomInserter = 1 in {
+  let Uses = [CR0] in {
+    def ATOMIC_LOAD_ADD_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+      "${:comment} ATOMIC_LOAD_ADD_I64 PSEUDO!",
+      [(set G8RC:$dst, (atomic_load_add_64 xoaddr:$ptr, G8RC:$incr))]>;
+    def ATOMIC_LOAD_SUB_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+      "${:comment} ATOMIC_LOAD_SUB_I64 PSEUDO!",
+      [(set G8RC:$dst, (atomic_load_sub_64 xoaddr:$ptr, G8RC:$incr))]>;
+    def ATOMIC_LOAD_OR_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+      "${:comment} ATOMIC_LOAD_OR_I64 PSEUDO!",
+      [(set G8RC:$dst, (atomic_load_or_64 xoaddr:$ptr, G8RC:$incr))]>;
+    def ATOMIC_LOAD_XOR_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+      "${:comment} ATOMIC_LOAD_XOR_I64 PSEUDO!",
+      [(set G8RC:$dst, (atomic_load_xor_64 xoaddr:$ptr, G8RC:$incr))]>;
+    def ATOMIC_LOAD_AND_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+      "${:comment} ATOMIC_LOAD_AND_I64 PSEUDO!",
+      [(set G8RC:$dst, (atomic_load_and_64 xoaddr:$ptr, G8RC:$incr))]>;
+    def ATOMIC_LOAD_NAND_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$incr),
+      "${:comment} ATOMIC_LOAD_NAND_I64 PSEUDO!",
+      [(set G8RC:$dst, (atomic_load_nand_64 xoaddr:$ptr, G8RC:$incr))]>;
+
+    // Compare-and-swap: $dst gets the value loaded from $ptr.
+    def ATOMIC_CMP_SWAP_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$old, G8RC:$new),
+      "${:comment} ATOMIC_CMP_SWAP_I64 PSEUDO!",
+      [(set G8RC:$dst, 
+                    (atomic_cmp_swap_64 xoaddr:$ptr, G8RC:$old, G8RC:$new))]>;
+
+    def ATOMIC_SWAP_I64 : Pseudo<
+      (outs G8RC:$dst), (ins memrr:$ptr, G8RC:$new),
+      "${:comment} ATOMIC_SWAP_I64 PSEUDO!",
+      [(set G8RC:$dst, (atomic_swap_64 xoaddr:$ptr, G8RC:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+// ldarx: load doubleword and reserve, indexed (selected via PPClarx).
+def LDARX : XForm_1<31,  84, (outs G8RC:$rD), (ins memrr:$ptr),
+                   "ldarx $rD, $ptr", LdStLDARX,
+                   [(set G8RC:$rD, (PPClarx xoaddr:$ptr))]>;
+
+// stdcx.: store doubleword conditional, indexed; records success/failure
+// in CR0 (hence Defs = [CR0] and the record-form isDOT marker).
+let Defs = [CR0] in
+def STDCX : XForm_1<31, 214, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stdcx. $rS, $dst", LdStSTDCX,
+                   [(PPCstcx G8RC:$rS, xoaddr:$dst)]>,
+                   isDOT;
+
+// Tail-call pseudos.  The TC_RETURN* forms carry the callee plus a stack
+// adjustment offset and are lowered to the TAILB*/TAILBCTR8 branches below.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi8 :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset, variable_ops),
+                 "#TC_RETURNd8 $dst $offset",
+                 []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai8 :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+                 "#TC_RETURNa8 $func $offset",
+                 [(PPCtc_return (i64 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri8 : Pseudo<(outs), (ins CTRRC8:$dst, i32imm:$offset, variable_ops),
+                 "#TC_RETURNr8 $dst $offset",
+                 []>;
+
+
+// Indirect tail call through CTR (bctr with LK=0); 64-bit mode only.
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM] in
+def TAILBCTR8 : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+    Requires<[In64BitMode]>;
+
+
+
+// Direct tail call: plain "b" (AA=0, LK=0).
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB8   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", BrB,
+                  []>;
+
+
+// Absolute-address tail call.
+// NOTE(review): the mnemonic is "ba" but the IForm AA argument here is 0,
+// the same as TAILB8 -- the absolute form normally has AA=1; confirm the
+// intended encoding.
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA8   : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+                  "ba $dst", BrB,
+                  []>;
+
+// Route symbol/register tail-call targets onto the pseudos above.
+def : Pat<(PPCtc_return (i64 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi8 tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i64 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi8 texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC8:$dst, imm:$imm),
+          (TCRETURNri8 CTRRC8:$dst, imm:$imm)>;
+
+
+//===----------------------------------------------------------------------===//
+// 64-bit SPR manipulation instrs.
+
+// 64-bit moves to/from the CTR special-purpose register (SPR 9).
+let Uses = [CTR8] in {
+def MFCTR8 : XFXForm_1_ext<31, 339, 9, (outs G8RC:$rT), (ins),
+                           "mfctr $rT", SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Pattern = [(PPCmtctr G8RC:$rS)], Defs = [CTR8] in {
+def MTCTR8 : XFXForm_7_ext<31, 467, 9, (outs), (ins G8RC:$rS),
+                           "mtctr $rS", SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// Dynamic stack allocation pseudo: adjusts the stack pointer (X1) by
+// $negsize and yields the new allocation in $result.
+let Defs = [X1], Uses = [X1] in
+def DYNALLOC8 : Pseudo<(outs G8RC:$result), (ins G8RC:$negsize, memri:$fpsi),
+                       "${:comment} DYNALLOC8 $result, $negsize, $fpsi",
+                       [(set G8RC:$result,
+                             (PPCdynalloc G8RC:$negsize, iaddr:$fpsi))]>;
+
+// 64-bit moves to/from the link register (SPR 8).
+let Defs = [LR8] in {
+def MTLR8  : XFXForm_7_ext<31, 467, 8, (outs), (ins G8RC:$rS),
+                           "mtlr $rS", SprMTSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR8] in {
+def MFLR8  : XFXForm_1_ext<31, 339, 8, (outs G8RC:$rT), (ins),
+                           "mflr $rT", SprMFSPR>,
+             PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+//===----------------------------------------------------------------------===//
+// Fixed point instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+
+// Copies, extends, truncates.
+// These are "or rA, rS, rS"-style register copies used to move values
+// between the 32-bit (GPRC) and 64-bit (G8RC) register classes; the
+// encoding is identical to OR (see the anyext/trunc patterns below).
+def OR4To8  : XForm_6<31, 444, (outs G8RC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   []>;
+def OR8To4  : XForm_6<31, 444, (outs GPRC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   []>;
+
+// li/lis: materialize a sign-extended 16-bit immediate, or a 16-bit
+// immediate shifted left 16 and sign-extended.
+def LI8  : DForm_2_r0<14, (outs G8RC:$rD), (ins symbolLo64:$imm),
+                      "li $rD, $imm", IntGeneral,
+                      [(set G8RC:$rD, immSExt16:$imm)]>;
+def LIS8 : DForm_2_r0<15, (outs G8RC:$rD), (ins symbolHi64:$imm),
+                      "lis $rD, $imm", IntGeneral,
+                      [(set G8RC:$rD, imm16ShiftedSExt:$imm)]>;
+
+// Logical ops.
+def NAND8: XForm_6<31, 476, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "nand $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (not (and G8RC:$rS, G8RC:$rB)))]>;
+def AND8 : XForm_6<31,  28, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "and $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (and G8RC:$rS, G8RC:$rB))]>;
+def ANDC8: XForm_6<31,  60, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "andc $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (and G8RC:$rS, (not G8RC:$rB)))]>;
+def OR8  : XForm_6<31, 444, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (or G8RC:$rS, G8RC:$rB))]>;
+def NOR8 : XForm_6<31, 124, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "nor $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (not (or G8RC:$rS, G8RC:$rB)))]>;
+def ORC8 : XForm_6<31, 412, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "orc $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (or G8RC:$rS, (not G8RC:$rB)))]>;
+def EQV8 : XForm_6<31, 284, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "eqv $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (not (xor G8RC:$rS, G8RC:$rB)))]>;
+def XOR8 : XForm_6<31, 316, (outs G8RC:$rA), (ins G8RC:$rS, G8RC:$rB),
+                   "xor $rA, $rS, $rB", IntGeneral,
+                   [(set G8RC:$rA, (xor G8RC:$rS, G8RC:$rB))]>;
+
+// Logical ops with immediate.
+def ANDIo8  : DForm_4<28, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "andi. $dst, $src1, $src2", IntGeneral,
+                      [(set G8RC:$dst, (and G8RC:$src1, immZExt16:$src2))]>,
+                      isDOT;
+def ANDISo8 : DForm_4<29, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                     "andis. $dst, $src1, $src2", IntGeneral,
+                    [(set G8RC:$dst, (and G8RC:$src1,imm16ShiftedZExt:$src2))]>,
+                     isDOT;
+def ORI8    : DForm_4<24, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "ori $dst, $src1, $src2", IntGeneral,
+                      [(set G8RC:$dst, (or G8RC:$src1, immZExt16:$src2))]>;
+def ORIS8   : DForm_4<25, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "oris $dst, $src1, $src2", IntGeneral,
+                    [(set G8RC:$dst, (or G8RC:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI8   : DForm_4<26, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "xori $dst, $src1, $src2", IntGeneral,
+                      [(set G8RC:$dst, (xor G8RC:$src1, immZExt16:$src2))]>;
+def XORIS8  : DForm_4<27, (outs G8RC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                      "xoris $dst, $src1, $src2", IntGeneral,
+                   [(set G8RC:$dst, (xor G8RC:$src1, imm16ShiftedZExt:$src2))]>;
+
+// Arithmetic.  Instructions that set or consume the carry bit are wrapped
+// in lets that model CARRY explicitly.
+def ADD8  : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "add $rT, $rA, $rB", IntGeneral,
+                     [(set G8RC:$rT, (add G8RC:$rA, G8RC:$rB))]>;
+
+let Defs = [CARRY] in {
+def ADDC8 : XOForm_1<31, 10, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "addc $rT, $rA, $rB", IntGeneral,
+                     [(set G8RC:$rT, (addc G8RC:$rA, G8RC:$rB))]>,
+                     PPC970_DGroup_Cracked;
+def ADDIC8 : DForm_2<12, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+                     "addic $rD, $rA, $imm", IntGeneral,
+                     [(set G8RC:$rD, (addc G8RC:$rA, immSExt16:$imm))]>;
+}
+def ADDI8  : DForm_2<14, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+                     "addi $rD, $rA, $imm", IntGeneral,
+                     [(set G8RC:$rD, (add G8RC:$rA, immSExt16:$imm))]>;
+def ADDIS8 : DForm_2<15, (outs G8RC:$rD), (ins G8RC:$rA, symbolHi64:$imm),
+                     "addis $rD, $rA, $imm", IntGeneral,
+                     [(set G8RC:$rD, (add G8RC:$rA, imm16ShiftedSExt:$imm))]>;
+
+let Defs = [CARRY] in {
+def SUBFIC8: DForm_2< 8, (outs G8RC:$rD), (ins G8RC:$rA, s16imm64:$imm),
+                     "subfic $rD, $rA, $imm", IntGeneral,
+                     [(set G8RC:$rD, (subc immSExt16:$imm, G8RC:$rA))]>;
+def SUBFC8 : XOForm_1<31, 8, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                      "subfc $rT, $rA, $rB", IntGeneral,
+                      [(set G8RC:$rT, (subc G8RC:$rB, G8RC:$rA))]>,
+                      PPC970_DGroup_Cracked;
+}
+// Note the operand order: subf computes rB - rA ("subtract from").
+def SUBF8 : XOForm_1<31, 40, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "subf $rT, $rA, $rB", IntGeneral,
+                     [(set G8RC:$rT, (sub G8RC:$rB, G8RC:$rA))]>;
+def NEG8    : XOForm_3<31, 104, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "neg $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (ineg G8RC:$rA))]>;
+let Uses = [CARRY], Defs = [CARRY] in {
+def ADDE8   : XOForm_1<31, 138, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                       "adde $rT, $rA, $rB", IntGeneral,
+                       [(set G8RC:$rT, (adde G8RC:$rA, G8RC:$rB))]>;
+def ADDME8  : XOForm_3<31, 234, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "addme $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (adde G8RC:$rA, immAllOnes))]>;
+def ADDZE8  : XOForm_3<31, 202, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "addze $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (adde G8RC:$rA, 0))]>;
+def SUBFE8  : XOForm_1<31, 136, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                       "subfe $rT, $rA, $rB", IntGeneral,
+                       [(set G8RC:$rT, (sube G8RC:$rB, G8RC:$rA))]>;
+def SUBFME8 : XOForm_3<31, 232, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "subfme $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (sube immAllOnes, G8RC:$rA))]>;
+def SUBFZE8 : XOForm_3<31, 200, 0, (outs G8RC:$rT), (ins G8RC:$rA),
+                       "subfze $rT, $rA", IntGeneral,
+                       [(set G8RC:$rT, (sube 0, G8RC:$rA))]>;
+}
+
+
+// High 64 bits of a signed/unsigned 64x64 multiply.
+def MULHD : XOForm_1<31, 73, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "mulhd $rT, $rA, $rB", IntMulHW,
+                     [(set G8RC:$rT, (mulhs G8RC:$rA, G8RC:$rB))]>;
+def MULHDU : XOForm_1<31, 9, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "mulhdu $rT, $rA, $rB", IntMulHWU,
+                     [(set G8RC:$rT, (mulhu G8RC:$rA, G8RC:$rB))]>;
+
+// 64-bit compares (no selection patterns; selected manually elsewhere).
+def CMPD   : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins G8RC:$rA, G8RC:$rB),
+                          "cmpd $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPLD  : XForm_16_ext<31, 32, (outs CRRC:$crD), (ins G8RC:$rA, G8RC:$rB),
+                          "cmpld $crD, $rA, $rB", IntCompare>, isPPC64;
+def CMPDI  : DForm_5_ext<11, (outs CRRC:$crD), (ins G8RC:$rA, s16imm:$imm),
+                         "cmpdi $crD, $rA, $imm", IntCompare>, isPPC64;
+def CMPLDI : DForm_6_ext<10, (outs CRRC:$dst), (ins G8RC:$src1, u16imm:$src2),
+                         "cmpldi $dst, $src1, $src2", IntCompare>, isPPC64;
+
+// Shifts with PPC semantics (PPCshl/PPCsrl/PPCsra take a 7-bit-capable
+// amount); the plain ISD shifts are mapped to these by patterns below.
+def SLD  : XForm_6<31,  27, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+                   "sld $rA, $rS, $rB", IntRotateD,
+                   [(set G8RC:$rA, (PPCshl G8RC:$rS, GPRC:$rB))]>, isPPC64;
+def SRD  : XForm_6<31, 539, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+                   "srd $rA, $rS, $rB", IntRotateD,
+                   [(set G8RC:$rA, (PPCsrl G8RC:$rS, GPRC:$rB))]>, isPPC64;
+// srad sets CA, hence the CARRY def.
+let Defs = [CARRY] in {
+def SRAD : XForm_6<31, 794, (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB),
+                   "srad $rA, $rS, $rB", IntRotateD,
+                   [(set G8RC:$rA, (PPCsra G8RC:$rS, GPRC:$rB))]>, isPPC64;
+}
+
+def EXTSB8 : XForm_11<31, 954, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "extsb $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext_inreg G8RC:$rS, i8))]>;
+def EXTSH8 : XForm_11<31, 922, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "extsh $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext_inreg G8RC:$rS, i16))]>;
+
+def EXTSW  : XForm_11<31, 986, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "extsw $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext_inreg G8RC:$rS, i32))]>, isPPC64;
+/// EXTSW_32 - Just like EXTSW, but works on '32-bit' registers.
+def EXTSW_32 : XForm_11<31, 986, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "extsw $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (PPCextsw_32 GPRC:$rS))]>, isPPC64;
+// EXTSW_32_64 - extsw from a 32-bit source into a 64-bit destination.
+def EXTSW_32_64 : XForm_11<31, 986, (outs G8RC:$rA), (ins GPRC:$rS),
+                      "extsw $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (sext GPRC:$rS))]>, isPPC64;
+
+// sradi also sets CA.
+let Defs = [CARRY] in {
+def SRADI  : XSForm_1<31, 413, (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH),
+                      "sradi $rA, $rS, $SH", IntRotateD,
+                      [(set G8RC:$rA, (sra G8RC:$rS, (i32 imm:$SH)))]>, isPPC64;
+}
+def CNTLZD : XForm_11<31, 58, (outs G8RC:$rA), (ins G8RC:$rS),
+                      "cntlzd $rA, $rS", IntGeneral,
+                      [(set G8RC:$rA, (ctlz G8RC:$rS))]>;
+
+def DIVD  : XOForm_1<31, 489, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "divd $rT, $rA, $rB", IntDivD,
+                     [(set G8RC:$rT, (sdiv G8RC:$rA, G8RC:$rB))]>, isPPC64,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVDU : XOForm_1<31, 457, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "divdu $rT, $rA, $rB", IntDivD,
+                     [(set G8RC:$rT, (udiv G8RC:$rA, G8RC:$rB))]>, isPPC64,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULLD : XOForm_1<31, 233, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB),
+                     "mulld $rT, $rA, $rB", IntMulHD,
+                     [(set G8RC:$rT, (mul G8RC:$rA, G8RC:$rB))]>, isPPC64;
+
+
+// NOTE(review): rldimi's rS and SH/MB operands are not symmetric, so plain
+// operand swapping is not a valid commute; confirm that
+// commuteInstruction in PPCInstrInfo handles RLDIMI (as it does rlwimi)
+// before relying on isCommutable here.
+let isCommutable = 1 in {
+def RLDIMI : MDForm_1<30, 3,
+                      (outs G8RC:$rA), (ins G8RC:$rSi, G8RC:$rS, u6imm:$SH, u6imm:$MB),
+                      "rldimi $rA, $rS, $SH, $MB", IntRotateD,
+                      []>, isPPC64, RegConstraint<"$rSi = $rA">,
+                      NoEncode<"$rSi">;
+}
+
+// Rotate instructions.
+// No selection patterns; these are emitted by the rotate/shift Pats below.
+def RLDCL  : MDForm_1<30, 0,
+                      (outs G8RC:$rA), (ins G8RC:$rS, GPRC:$rB, u6imm:$MB),
+                      "rldcl $rA, $rS, $rB, $MB", IntRotateD,
+                      []>, isPPC64;
+def RLDICL : MDForm_1<30, 0,
+                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$MB),
+                      "rldicl $rA, $rS, $SH, $MB", IntRotateD,
+                      []>, isPPC64;
+def RLDICR : MDForm_1<30, 1,
+                      (outs G8RC:$rA), (ins G8RC:$rS, u6imm:$SH, u6imm:$ME),
+                      "rldicr $rA, $rS, $SH, $ME", IntRotateD,
+                      []>, isPPC64;
+}  // End FXU Operations.
+
+
+//===----------------------------------------------------------------------===//
+// Load/Store instructions.
+//
+
+
+// Sign extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LHA8: DForm_1<42, (outs G8RC:$rD), (ins memri:$src),
+                  "lha $rD, $src", LdStLHA,
+                  [(set G8RC:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LWA  : DSForm_1<58, 2, (outs G8RC:$rD), (ins memrix:$src),
+                    "lwa $rD, $src", LdStLWA,
+                    [(set G8RC:$rD, (sextloadi32 ixaddr:$src))]>, isPPC64,
+                    PPC970_DGroup_Cracked;
+def LHAX8: XForm_1<31, 343, (outs G8RC:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", LdStLHA,
+                   [(set G8RC:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LWAX : XForm_1<31, 341, (outs G8RC:$rD), (ins memrr:$src),
+                   "lwax $rD, $src", LdStLHA,
+                   [(set G8RC:$rD, (sextloadi32 xaddr:$src))]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+
+// Update forms.
+let mayLoad = 1 in
+def LHAU8 : DForm_1<43, (outs G8RC:$rD, ptr_rc:$ea_result), (ins symbolLo:$disp,
+                            ptr_rc:$rA),
+                    "lhau $rD, $disp($rA)", LdStGeneral,
+                    []>, RegConstraint<"$rA = $ea_result">,
+                    NoEncode<"$ea_result">;
+// NO LWAU!
+
+}
+
+// Zero extending loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ8 : DForm_1<34, (outs G8RC:$rD), (ins memri:$src),
+                  "lbz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHZ8 : DForm_1<40, (outs G8RC:$rD), (ins memri:$src),
+                  "lhz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ8 : DForm_1<32, (outs G8RC:$rD), (ins memri:$src),
+                  "lwz $rD, $src", LdStGeneral,
+                  [(set G8RC:$rD, (zextloadi32 iaddr:$src))]>, isPPC64;
+
+def LBZX8 : XForm_1<31,  87, (outs G8RC:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi8 xaddr:$src))]>;
+def LHZX8 : XForm_1<31, 279, (outs G8RC:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX8 : XForm_1<31,  23, (outs G8RC:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", LdStGeneral,
+                   [(set G8RC:$rD, (zextloadi32 xaddr:$src))]>;
+                   
+                   
+// Update forms.
+let mayLoad = 1 in {
+def LBZU8 : DForm_1<35, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                    "lbzu $rD, $addr", LdStGeneral,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LHZU8 : DForm_1<41, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                    "lhzu $rD, $addr", LdStGeneral,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+def LWZU8 : DForm_1<33, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                    "lwzu $rD, $addr", LdStGeneral,
+                    []>, RegConstraint<"$addr.reg = $ea_result">,
+                    NoEncode<"$ea_result">;
+}
+}
+
+
+// Full 8-byte loads.
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LD   : DSForm_1<58, 0, (outs G8RC:$rD), (ins memrix:$src),
+                    "ld $rD, $src", LdStLD,
+                    [(set G8RC:$rD, (load ixaddr:$src))]>, isPPC64;
+def LDtoc: DSForm_1<58, 0, (outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg),
+                    "ld $rD, $disp($reg)", LdStLD,
+                    [(set G8RC:$rD,
+                     (PPCtoc_entry tglobaladdr:$disp, G8RC:$reg))]>, isPPC64;
+let RST = 2, DS = 8 in
+def LDinto_toc: DSForm_1<58, 0, (outs), (ins G8RC:$reg),
+                    "ld 2, 8($reg)", LdStLD,
+                    [(PPCload_toc G8RC:$reg)]>, isPPC64;
+let RST = 2, DS = 40, RA = 1 in
+def LDtoc_restore : DSForm_1<58, 0, (outs), (ins),
+                    "ld 2, 40(1)", LdStLD,
+                    []>, isPPC64;
+def LDX  : XForm_1<31,  21, (outs G8RC:$rD), (ins memrr:$src),
+                   "ldx $rD, $src", LdStLD,
+                   [(set G8RC:$rD, (load xaddr:$src))]>, isPPC64;
+                   
+let mayLoad = 1 in
+def LDU  : DSForm_1<58, 1, (outs G8RC:$rD, ptr_rc:$ea_result), (ins memrix:$addr),
+                    "ldu $rD, $addr", LdStLD,
+                    []>, RegConstraint<"$addr.reg = $ea_result">, isPPC64,
+                    NoEncode<"$ea_result">;
+
+}
+
+// Select the TOC-restore node and the PPC-specific load nodes onto the
+// instructions above.
+def : Pat<(PPCtoc_restore),
+          (LDtoc_restore)>;
+def : Pat<(PPCload ixaddr:$src),
+          (LD ixaddr:$src)>;
+def : Pat<(PPCload xaddr:$src),
+          (LDX xaddr:$src)>;
+
+let PPC970_Unit = 2 in {
+// Truncating stores: store the low 8/16/32 bits of a 64-bit register.
+def STB8 : DForm_1<38, (outs), (ins G8RC:$rS, memri:$src),
+                   "stb $rS, $src", LdStGeneral,
+                   [(truncstorei8 G8RC:$rS, iaddr:$src)]>;
+def STH8 : DForm_1<44, (outs), (ins G8RC:$rS, memri:$src),
+                   "sth $rS, $src", LdStGeneral,
+                   [(truncstorei16 G8RC:$rS, iaddr:$src)]>;
+def STW8 : DForm_1<36, (outs), (ins G8RC:$rS, memri:$src),
+                   "stw $rS, $src", LdStGeneral,
+                   [(truncstorei32 G8RC:$rS, iaddr:$src)]>;
+// Indexed (reg+reg) truncating stores.
+def STBX8 : XForm_8<31, 215, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStGeneral,
+                   [(truncstorei8 G8RC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STHX8 : XForm_8<31, 407, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStGeneral,
+                   [(truncstorei16 G8RC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STWX8 : XForm_8<31, 151, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStGeneral,
+                   [(truncstorei32 G8RC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+// Normal 8-byte stores.
+def STD  : DSForm_1<62, 0, (outs), (ins G8RC:$rS, memrix:$dst),
+                    "std $rS, $dst", LdStSTD,
+                    [(store G8RC:$rS, ixaddr:$dst)]>, isPPC64;
+def STDX  : XForm_8<31, 149, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stdx $rS, $dst", LdStSTD,
+                   [(store G8RC:$rS, xaddr:$dst)]>, isPPC64,
+                   PPC970_DGroup_Cracked;
+}
+
+let PPC970_Unit = 2 in {
+
+// Pre-increment (update-form) stores: $ea_res yields the updated base
+// register, tied to $ptrreg.
+def STBU8 : DForm_1<38, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti8 G8RC:$rS, ptr_rc:$ptrreg, 
+                                         iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU8 : DForm_1<45, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                        (pre_truncsti16 G8RC:$rS, ptr_rc:$ptrreg, 
+                                        iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU8 : DForm_1<37, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, 
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+
+
+def STDU : DSForm_1<62, 1, (outs ptr_rc:$ea_res), (ins G8RC:$rS,
+                                s16immX4:$ptroff, ptr_rc:$ptrreg),
+                    "stdu $rS, $ptroff($ptrreg)", LdStSTD,
+                    [(set ptr_rc:$ea_res, (pre_store G8RC:$rS, ptr_rc:$ptrreg, 
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">,
+                    isPPC64;
+
+// Indexed update-form store; no selection pattern (emitted manually).
+let mayStore = 1 in
+def STDUX : XForm_8<31, 181, (outs), (ins G8RC:$rS, memrr:$dst),
+                   "stdux $rS, $dst", LdStSTD,
+                   []>, isPPC64;
+
+// STD_32/STDX_32 - Just like STD/STDX, but uses a '32-bit' input register.
+def STD_32  : DSForm_1<62, 0, (outs), (ins GPRC:$rT, memrix:$dst),
+                       "std $rT, $dst", LdStSTD,
+                       [(PPCstd_32  GPRC:$rT, ixaddr:$dst)]>, isPPC64;
+def STDX_32  : XForm_8<31, 149, (outs), (ins GPRC:$rT, memrr:$dst),
+                       "stdx $rT, $dst", LdStSTD,
+                       [(PPCstd_32  GPRC:$rT, xaddr:$dst)]>, isPPC64,
+                       PPC970_DGroup_Cracked;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// Floating point instructions.
+//
+
+
+// fcfid: convert doubleword integer (in an FPR) to floating point;
+// fctidz: convert floating point to doubleword integer, round toward zero.
+// Both depend on RM and are 64-bit-only.
+let PPC970_Unit = 3, Uses = [RM] in {  // FPU Operations.
+def FCFID  : XForm_26<63, 846, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fcfid $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (PPCfcfid F8RC:$frB))]>, isPPC64;
+def FCTIDZ : XForm_26<63, 815, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fctidz $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (PPCfctidz F8RC:$frB))]>, isPPC64;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Patterns
+//
+
+// Extensions and truncates to/from 32-bit regs.
+// zext: copy into a 64-bit reg, then rldicl 0,32 clears the upper 32 bits.
+// anyext/trunc are plain cross-class "or x,x" copies.
+def : Pat<(i64 (zext GPRC:$in)),
+          (RLDICL (OR4To8 GPRC:$in, GPRC:$in), 0, 32)>;
+def : Pat<(i64 (anyext GPRC:$in)),
+          (OR4To8 GPRC:$in, GPRC:$in)>;
+def : Pat<(i32 (trunc G8RC:$in)),
+          (OR8To4 G8RC:$in, G8RC:$in)>;
+
+// Extending loads with i64 targets.
+// i1 and anyext loads reuse the zero-extending load instructions.
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ8 iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX8 xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ8 iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX8 xaddr:$src)>;
+def : Pat<(extloadi32 iaddr:$src),
+          (LWZ8 iaddr:$src)>;
+def : Pat<(extloadi32 xaddr:$src),
+          (LWZX8 xaddr:$src)>;
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 6-bit and 7-bit shift
+// amounts.
+def : Pat<(sra G8RC:$rS, GPRC:$rB),
+          (SRAD G8RC:$rS, GPRC:$rB)>;
+def : Pat<(srl G8RC:$rS, GPRC:$rB),
+          (SRD G8RC:$rS, GPRC:$rB)>;
+def : Pat<(shl G8RC:$rS, GPRC:$rB),
+          (SLD G8RC:$rS, GPRC:$rB)>;
+
+// SHL/SRL
+// Constant shifts become rotate+mask: shl SH -> rldicr SH, 63-SH;
+// srl SH -> rldicl 64-SH, SH (see the SHL64/SRL64 transforms above).
+def : Pat<(shl G8RC:$in, (i32 imm:$imm)),
+          (RLDICR G8RC:$in, imm:$imm, (SHL64 imm:$imm))>;
+def : Pat<(srl G8RC:$in, (i32 imm:$imm)),
+          (RLDICL G8RC:$in, (SRL64 imm:$imm), imm:$imm)>;
+
+// ROTL
+// A rotate is a rotate-and-clear with an all-ones mask (MB = 0).
+def : Pat<(rotl G8RC:$in, GPRC:$sh),
+          (RLDCL G8RC:$in, GPRC:$sh, 0)>;
+def : Pat<(rotl G8RC:$in, (i32 imm:$imm)),
+          (RLDICL G8RC:$in, imm:$imm, 0)>;
+
+// Hi and Lo for Darwin Global Addresses.
+// Materialize the ha16/lo16 parts of symbol addresses with lis/li, and
+// fold the high part into addis when added to a register.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS8 tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI8  tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in , 0), (LIS8 tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in , 0), (LI8  tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in , 0), (LIS8 tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in , 0), (LI8  tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS8 tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI8  tblockaddress:$in)>;
+def : Pat<(add G8RC:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS8 G8RC:$in, tglobaladdr:$g)>;
+def : Pat<(add G8RC:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS8 G8RC:$in, tconstpool:$g)>;
+def : Pat<(add G8RC:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS8 G8RC:$in, tjumptable:$g)>;
+def : Pat<(add G8RC:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS8 G8RC:$in, tblockaddress:$g)>;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
new file mode 100644
index 0000000..3f4d329
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -0,0 +1,690 @@
+//===- PPCInstrAltivec.td - The PowerPC Altivec Extension --*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Altivec extension to the PowerPC instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Altivec transformation functions and pattern fragments.
+//
+
+
+def vpkuhum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+}]>;
+def vpkuwum_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                              (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), false);
+}]>;
+def vpkuhum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUHUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+}]>;
+def vpkuwum_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVPKUWUMShuffleMask(cast<ShuffleVectorSDNode>(N), true);
+}]>;
+
+
+def vmrglb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+}]>;
+def vmrglh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+}]>;
+def vmrglw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+}]>;
+def vmrghb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, false);
+}]>;
+def vmrghh_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, false);
+}]>;
+def vmrghw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, false);
+}]>;
+
+
+def vmrglb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+}]>;
+def vmrglh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+}]>;
+def vmrglw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGLShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+}]>;
+def vmrghb_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 1, true);
+}]>;
+def vmrghh_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 2, true);
+}]>;
+def vmrghw_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVMRGHShuffleMask(cast<ShuffleVectorSDNode>(N), 4, true);
+}]>;
+
+
+def VSLDOI_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, false));
+}]>;
+def vsldoi_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, false) != -1;
+}], VSLDOI_get_imm>;
+
+
+/// VSLDOI_unary* - These are used to match vsldoi(X,X), which is turned into
+/// vector_shuffle(X,undef,mask) by the dag combiner.
+def VSLDOI_unary_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::isVSLDOIShuffleMask(N, true));
+}]>;
+def vsldoi_unary_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isVSLDOIShuffleMask(N, true) != -1;
+}], VSLDOI_unary_get_imm>;
+
+
+// VSPLT*_get_imm xform function: convert vector_shuffle mask to VSPLT* imm.
+def VSPLTB_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 1));
+}]>;
+def vspltb_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 1);
+}], VSPLTB_get_imm>;
+def VSPLTH_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 2));
+}]>;
+def vsplth_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 2);
+}], VSPLTH_get_imm>;
+def VSPLTW_get_imm : SDNodeXForm<vector_shuffle, [{
+  return getI32Imm(PPC::getVSPLTImmediate(N, 4));
+}]>;
+def vspltw_shuffle : PatFrag<(ops node:$lhs, node:$rhs),
+                             (vector_shuffle node:$lhs, node:$rhs), [{
+  return PPC::isSplatShuffleMask(cast<ShuffleVectorSDNode>(N), 4);
+}], VSPLTW_get_imm>;
+
+
+// VSPLTISB_get_imm xform function: convert build_vector to VSPLTISB imm.
+def VSPLTISB_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG);
+}]>;
+def vecspltisb : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 1, *CurDAG).getNode() != 0;
+}], VSPLTISB_get_imm>;
+
+// VSPLTISH_get_imm xform function: convert build_vector to VSPLTISH imm.
+def VSPLTISH_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG);
+}]>;
+def vecspltish : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 2, *CurDAG).getNode() != 0;
+}], VSPLTISH_get_imm>;
+
+// VSPLTISW_get_imm xform function: convert build_vector to VSPLTISW imm.
+def VSPLTISW_get_imm : SDNodeXForm<build_vector, [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG);
+}]>;
+def vecspltisw : PatLeaf<(build_vector), [{
+  return PPC::get_VSPLTI_elt(N, 4, *CurDAG).getNode() != 0;
+}], VSPLTISW_get_imm>;
+
+def V_immneg0 : PatLeaf<(build_vector), [{
+  return PPC::isAllNegativeZeroVector(N);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Helpers for defining instructions that directly correspond to intrinsics.
+
+// VA1a_Int - A VAForm_1a intrinsic definition.
+class VA1a_Int<bits<6> xo, string opc, Intrinsic IntID>
+  : VAForm_1a<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, VRRC:$vC),
+              !strconcat(opc, " $vD, $vA, $vB, $vC"), VecFP,
+                       [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB, VRRC:$vC))]>;
+
+// VX1_Int - A VXForm_1 intrinsic definition.
+class VX1_Int<bits<11> xo, string opc, Intrinsic IntID>
+  : VXForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+             !strconcat(opc, " $vD, $vA, $vB"), VecFP,
+             [(set VRRC:$vD, (IntID VRRC:$vA, VRRC:$vB))]>;
+
+// VX2_Int - A VXForm_2 intrinsic definition.
+class VX2_Int<bits<11> xo, string opc, Intrinsic IntID>
+  : VXForm_2<xo, (outs VRRC:$vD), (ins VRRC:$vB),
+             !strconcat(opc, " $vD, $vB"), VecFP,
+             [(set VRRC:$vD, (IntID VRRC:$vB))]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Definitions.
+
+def DSS      : DSS_Form<822, (outs),
+                        (ins u5imm:$ZERO0, u5imm:$STRM,u5imm:$ZERO1,u5imm:$ZERO2),
+                        "dss $STRM", LdStGeneral /*FIXME*/, []>;
+def DSSALL   : DSS_Form<822, (outs),
+                        (ins u5imm:$ONE, u5imm:$ZERO0,u5imm:$ZERO1,u5imm:$ZERO2),
+                        "dssall", LdStGeneral /*FIXME*/, []>;
+def DST      : DSS_Form<342, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTT     : DSS_Form<342, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dstt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTST    : DSS_Form<374, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dstst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTSTT   : DSS_Form<374, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, GPRC:$rA, GPRC:$rB),
+                        "dststt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+
+def DST64    : DSS_Form<342, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTT64   : DSS_Form<342, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dstt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTST64  : DSS_Form<374, (outs),
+                        (ins u5imm:$ZERO, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dstst $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+def DSTSTT64 : DSS_Form<374, (outs),
+                        (ins u5imm:$ONE, u5imm:$STRM, G8RC:$rA, GPRC:$rB),
+                        "dststt $rA, $rB, $STRM", LdStGeneral /*FIXME*/, []>;
+
+def MFVSCR : VXForm_4<1540, (outs VRRC:$vD), (ins),
+                      "mfvscr $vD", LdStGeneral,
+                      [(set VRRC:$vD, (int_ppc_altivec_mfvscr))]>; 
+def MTVSCR : VXForm_5<1604, (outs), (ins VRRC:$vB),
+                      "mtvscr $vB", LdStGeneral,
+                      [(int_ppc_altivec_mtvscr VRRC:$vB)]>; 
+
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {  // Loads.
+def LVEBX: XForm_1<31,   7, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvebx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvebx xoaddr:$src))]>;
+def LVEHX: XForm_1<31,  39, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvehx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvehx xoaddr:$src))]>;
+def LVEWX: XForm_1<31,  71, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvewx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvewx xoaddr:$src))]>;
+def LVX  : XForm_1<31, 103, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvx $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvx xoaddr:$src))]>;
+def LVXL : XForm_1<31, 359, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvxl $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvxl xoaddr:$src))]>;
+}
+
+def LVSL : XForm_1<31,   6, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvsl $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvsl xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+def LVSR : XForm_1<31,  38, (outs VRRC:$vD), (ins memrr:$src),
+                   "lvsr $vD, $src", LdStGeneral,
+                   [(set VRRC:$vD, (int_ppc_altivec_lvsr xoaddr:$src))]>,
+                   PPC970_Unit_LSU;
+
+let PPC970_Unit = 2 in {   // Stores.
+def STVEBX: XForm_8<31, 135, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvebx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvebx VRRC:$rS, xoaddr:$dst)]>;
+def STVEHX: XForm_8<31, 167, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvehx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvehx VRRC:$rS, xoaddr:$dst)]>;
+def STVEWX: XForm_8<31, 199, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvewx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvewx VRRC:$rS, xoaddr:$dst)]>;
+def STVX  : XForm_8<31, 231, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvx $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvx VRRC:$rS, xoaddr:$dst)]>;
+def STVXL : XForm_8<31, 487, (outs), (ins VRRC:$rS, memrr:$dst),
+                   "stvxl $rS, $dst", LdStGeneral,
+                   [(int_ppc_altivec_stvxl VRRC:$rS, xoaddr:$dst)]>;
+}
+
+let PPC970_Unit = 5 in {  // VALU Operations.
+// VA-Form instructions.  3-input AltiVec ops.
+def VMADDFP : VAForm_1<46, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
+                       "vmaddfp $vD, $vA, $vC, $vB", VecFP,
+                       [(set VRRC:$vD, (fadd (fmul VRRC:$vA, VRRC:$vC),
+                                             VRRC:$vB))]>,
+                       Requires<[FPContractions]>;
+def VNMSUBFP: VAForm_1<47, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vC, VRRC:$vB),
+                       "vnmsubfp $vD, $vA, $vC, $vB", VecFP,
+                       [(set VRRC:$vD, (fsub V_immneg0,
+                                             (fsub (fmul VRRC:$vA, VRRC:$vC),
+                                                   VRRC:$vB)))]>,
+                       Requires<[FPContractions]>;
+
+def VMHADDSHS  : VA1a_Int<32, "vmhaddshs",  int_ppc_altivec_vmhaddshs>;
+def VMHRADDSHS : VA1a_Int<33, "vmhraddshs", int_ppc_altivec_vmhraddshs>;
+def VMLADDUHM  : VA1a_Int<34, "vmladduhm",  int_ppc_altivec_vmladduhm>;
+def VPERM      : VA1a_Int<43, "vperm",      int_ppc_altivec_vperm>;
+def VSEL       : VA1a_Int<42, "vsel",       int_ppc_altivec_vsel>;
+
+// Shuffles.
+def VSLDOI  : VAForm_2<44, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB, u5imm:$SH),
+                       "vsldoi $vD, $vA, $vB, $SH", VecFP,
+                       [(set VRRC:$vD, 
+                         (vsldoi_shuffle:$SH (v16i8 VRRC:$vA), VRRC:$vB))]>;
+
+// VX-Form instructions.  AltiVec arithmetic ops.
+def VADDFP : VXForm_1<10, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vaddfp $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (fadd VRRC:$vA, VRRC:$vB))]>;
+                      
+def VADDUBM : VXForm_1<0, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vaddubm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (add (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VADDUHM : VXForm_1<64, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vadduhm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (add (v8i16 VRRC:$vA), VRRC:$vB))]>;
+def VADDUWM : VXForm_1<128, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vadduwm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (add (v4i32 VRRC:$vA), VRRC:$vB))]>;
+                      
+def VADDCUW : VX1_Int<384, "vaddcuw", int_ppc_altivec_vaddcuw>;
+def VADDSBS : VX1_Int<768, "vaddsbs", int_ppc_altivec_vaddsbs>;
+def VADDSHS : VX1_Int<832, "vaddshs", int_ppc_altivec_vaddshs>;
+def VADDSWS : VX1_Int<896, "vaddsws", int_ppc_altivec_vaddsws>;
+def VADDUBS : VX1_Int<512, "vaddubs", int_ppc_altivec_vaddubs>;
+def VADDUHS : VX1_Int<576, "vadduhs", int_ppc_altivec_vadduhs>;
+def VADDUWS : VX1_Int<640, "vadduws", int_ppc_altivec_vadduws>;
+                             
+                             
+def VAND : VXForm_1<1028, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                    "vand $vD, $vA, $vB", VecFP,
+                    [(set VRRC:$vD, (and (v4i32 VRRC:$vA), VRRC:$vB))]>;
+def VANDC : VXForm_1<1092, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                     "vandc $vD, $vA, $vB", VecFP,
+                     [(set VRRC:$vD, (and (v4i32 VRRC:$vA), (vnot VRRC:$vB)))]>;
+
+def VCFSX  : VXForm_1<842, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vcfsx $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vcfsx VRRC:$vB, imm:$UIMM))]>;
+def VCFUX  : VXForm_1<778, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vcfux $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vcfux VRRC:$vB, imm:$UIMM))]>;
+def VCTSXS : VXForm_1<970, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vctsxs $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vctsxs VRRC:$vB, imm:$UIMM))]>;
+def VCTUXS : VXForm_1<906, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vctuxs $vD, $vB, $UIMM", VecFP,
+                      [(set VRRC:$vD,
+                             (int_ppc_altivec_vctuxs VRRC:$vB, imm:$UIMM))]>;
+def VEXPTEFP : VX2_Int<394, "vexptefp", int_ppc_altivec_vexptefp>;
+def VLOGEFP  : VX2_Int<458, "vlogefp",  int_ppc_altivec_vlogefp>;
+
+def VAVGSB : VX1_Int<1282, "vavgsb", int_ppc_altivec_vavgsb>;
+def VAVGSH : VX1_Int<1346, "vavgsh", int_ppc_altivec_vavgsh>;
+def VAVGSW : VX1_Int<1410, "vavgsw", int_ppc_altivec_vavgsw>;
+def VAVGUB : VX1_Int<1026, "vavgub", int_ppc_altivec_vavgub>;
+def VAVGUH : VX1_Int<1090, "vavguh", int_ppc_altivec_vavguh>;
+def VAVGUW : VX1_Int<1154, "vavguw", int_ppc_altivec_vavguw>;
+
+def VMAXFP : VX1_Int<1034, "vmaxfp", int_ppc_altivec_vmaxfp>;
+def VMAXSB : VX1_Int< 258, "vmaxsb", int_ppc_altivec_vmaxsb>;
+def VMAXSH : VX1_Int< 322, "vmaxsh", int_ppc_altivec_vmaxsh>;
+def VMAXSW : VX1_Int< 386, "vmaxsw", int_ppc_altivec_vmaxsw>;
+def VMAXUB : VX1_Int<   2, "vmaxub", int_ppc_altivec_vmaxub>;
+def VMAXUH : VX1_Int<  66, "vmaxuh", int_ppc_altivec_vmaxuh>;
+def VMAXUW : VX1_Int< 130, "vmaxuw", int_ppc_altivec_vmaxuw>;
+def VMINFP : VX1_Int<1098, "vminfp", int_ppc_altivec_vminfp>;
+def VMINSB : VX1_Int< 770, "vminsb", int_ppc_altivec_vminsb>;
+def VMINSH : VX1_Int< 834, "vminsh", int_ppc_altivec_vminsh>;
+def VMINSW : VX1_Int< 898, "vminsw", int_ppc_altivec_vminsw>;
+def VMINUB : VX1_Int< 514, "vminub", int_ppc_altivec_vminub>;
+def VMINUH : VX1_Int< 578, "vminuh", int_ppc_altivec_vminuh>;
+def VMINUW : VX1_Int< 642, "vminuw", int_ppc_altivec_vminuw>;
+
+def VMRGHB : VXForm_1< 12, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrghb $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vmrghb_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGHH : VXForm_1< 76, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrghh $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vmrghh_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGHW : VXForm_1<140, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrghw $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vmrghw_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGLB : VXForm_1<268, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrglb $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vmrglb_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGLH : VXForm_1<332, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrglh $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vmrglh_shuffle VRRC:$vA, VRRC:$vB))]>;
+def VMRGLW : VXForm_1<396, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vmrglw $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (vmrglw_shuffle VRRC:$vA, VRRC:$vB))]>;
+
+def VMSUMMBM : VA1a_Int<37, "vmsummbm", int_ppc_altivec_vmsummbm>;
+def VMSUMSHM : VA1a_Int<40, "vmsumshm", int_ppc_altivec_vmsumshm>;
+def VMSUMSHS : VA1a_Int<41, "vmsumshs", int_ppc_altivec_vmsumshs>;
+def VMSUMUBM : VA1a_Int<36, "vmsumubm", int_ppc_altivec_vmsumubm>;
+def VMSUMUHM : VA1a_Int<38, "vmsumuhm", int_ppc_altivec_vmsumuhm>;
+def VMSUMUHS : VA1a_Int<39, "vmsumuhs", int_ppc_altivec_vmsumuhs>;
+
+def VMULESB : VX1_Int<776, "vmulesb", int_ppc_altivec_vmulesb>;
+def VMULESH : VX1_Int<840, "vmulesh", int_ppc_altivec_vmulesh>;
+def VMULEUB : VX1_Int<520, "vmuleub", int_ppc_altivec_vmuleub>;
+def VMULEUH : VX1_Int<584, "vmuleuh", int_ppc_altivec_vmuleuh>;
+def VMULOSB : VX1_Int<264, "vmulosb", int_ppc_altivec_vmulosb>;
+def VMULOSH : VX1_Int<328, "vmulosh", int_ppc_altivec_vmulosh>;
+def VMULOUB : VX1_Int<  8, "vmuloub", int_ppc_altivec_vmuloub>;
+def VMULOUH : VX1_Int< 72, "vmulouh", int_ppc_altivec_vmulouh>;
+                       
+def VREFP     : VX2_Int<266, "vrefp",     int_ppc_altivec_vrefp>;
+def VRFIM     : VX2_Int<714, "vrfim",     int_ppc_altivec_vrfim>;
+def VRFIN     : VX2_Int<522, "vrfin",     int_ppc_altivec_vrfin>;
+def VRFIP     : VX2_Int<650, "vrfip",     int_ppc_altivec_vrfip>;
+def VRFIZ     : VX2_Int<586, "vrfiz",     int_ppc_altivec_vrfiz>;
+def VRSQRTEFP : VX2_Int<330, "vrsqrtefp", int_ppc_altivec_vrsqrtefp>;
+
+def VSUBCUW : VX1_Int<1408, "vsubcuw", int_ppc_altivec_vsubcuw>; // XO=1408 per ISA; 74 is vsubfp's encoding
+
+def VSUBFP  : VXForm_1<74, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsubfp $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (fsub VRRC:$vA, VRRC:$vB))]>;
+def VSUBUBM : VXForm_1<1024, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsububm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (sub (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VSUBUHM : VXForm_1<1088, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsubuhm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (sub (v8i16 VRRC:$vA), VRRC:$vB))]>;
+def VSUBUWM : VXForm_1<1152, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vsubuwm $vD, $vA, $vB", VecGeneral,
+                      [(set VRRC:$vD, (sub (v4i32 VRRC:$vA), VRRC:$vB))]>;
+                      
+def VSUBSBS : VX1_Int<1792, "vsubsbs" , int_ppc_altivec_vsubsbs>;
+def VSUBSHS : VX1_Int<1856, "vsubshs" , int_ppc_altivec_vsubshs>;
+def VSUBSWS : VX1_Int<1920, "vsubsws" , int_ppc_altivec_vsubsws>;
+def VSUBUBS : VX1_Int<1536, "vsububs" , int_ppc_altivec_vsububs>;
+def VSUBUHS : VX1_Int<1600, "vsubuhs" , int_ppc_altivec_vsubuhs>;
+def VSUBUWS : VX1_Int<1664, "vsubuws" , int_ppc_altivec_vsubuws>;
+def VSUMSWS : VX1_Int<1928, "vsumsws" , int_ppc_altivec_vsumsws>;
+def VSUM2SWS: VX1_Int<1672, "vsum2sws", int_ppc_altivec_vsum2sws>;
+def VSUM4SBS: VX1_Int<1800, "vsum4sbs", int_ppc_altivec_vsum4sbs>; // XO=1800 per ISA; 1672 duplicated vsum2sws
+def VSUM4SHS: VX1_Int<1608, "vsum4shs", int_ppc_altivec_vsum4shs>;
+def VSUM4UBS: VX1_Int<1544, "vsum4ubs", int_ppc_altivec_vsum4ubs>;
+
+def VNOR : VXForm_1<1284, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                    "vnor $vD, $vA, $vB", VecFP,
+                    [(set VRRC:$vD, (vnot (or (v4i32 VRRC:$vA), VRRC:$vB)))]>;
+def VOR : VXForm_1<1156, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vor $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (or (v4i32 VRRC:$vA), VRRC:$vB))]>;
+def VXOR : VXForm_1<1220, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                      "vxor $vD, $vA, $vB", VecFP,
+                      [(set VRRC:$vD, (xor (v4i32 VRRC:$vA), VRRC:$vB))]>;
+
+def VRLB   : VX1_Int<   4, "vrlb", int_ppc_altivec_vrlb>;
+def VRLH   : VX1_Int<  68, "vrlh", int_ppc_altivec_vrlh>;
+def VRLW   : VX1_Int< 132, "vrlw", int_ppc_altivec_vrlw>;
+
+def VSL    : VX1_Int< 452, "vsl" , int_ppc_altivec_vsl >;
+def VSLO   : VX1_Int<1036, "vslo", int_ppc_altivec_vslo>;
+def VSLB   : VX1_Int< 260, "vslb", int_ppc_altivec_vslb>;
+def VSLH   : VX1_Int< 324, "vslh", int_ppc_altivec_vslh>;
+def VSLW   : VX1_Int< 388, "vslw", int_ppc_altivec_vslw>;
+
+def VSPLTB : VXForm_1<524, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vspltb $vD, $vB, $UIMM", VecPerm,
+                      [(set VRRC:$vD,
+                        (vspltb_shuffle:$UIMM (v16i8 VRRC:$vB), (undef)))]>;
+def VSPLTH : VXForm_1<588, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vsplth $vD, $vB, $UIMM", VecPerm,
+                      [(set VRRC:$vD,
+                        (vsplth_shuffle:$UIMM (v16i8 VRRC:$vB), (undef)))]>;
+def VSPLTW : VXForm_1<652, (outs VRRC:$vD), (ins u5imm:$UIMM, VRRC:$vB),
+                      "vspltw $vD, $vB, $UIMM", VecPerm,
+                      [(set VRRC:$vD, 
+                        (vspltw_shuffle:$UIMM (v16i8 VRRC:$vB), (undef)))]>;
+
+def VSR    : VX1_Int< 708, "vsr"  , int_ppc_altivec_vsr>;
+def VSRO   : VX1_Int<1100, "vsro" , int_ppc_altivec_vsro>;
+def VSRAB  : VX1_Int< 772, "vsrab", int_ppc_altivec_vsrab>;
+def VSRAH  : VX1_Int< 836, "vsrah", int_ppc_altivec_vsrah>;
+def VSRAW  : VX1_Int< 900, "vsraw", int_ppc_altivec_vsraw>;
+def VSRB   : VX1_Int< 516, "vsrb" , int_ppc_altivec_vsrb>;
+def VSRH   : VX1_Int< 580, "vsrh" , int_ppc_altivec_vsrh>;
+def VSRW   : VX1_Int< 644, "vsrw" , int_ppc_altivec_vsrw>;
+
+
+def VSPLTISB : VXForm_3<780, (outs VRRC:$vD), (ins s5imm:$SIMM),
+                       "vspltisb $vD, $SIMM", VecPerm,
+                       [(set VRRC:$vD, (v16i8 vecspltisb:$SIMM))]>;
+def VSPLTISH : VXForm_3<844, (outs VRRC:$vD), (ins s5imm:$SIMM),
+                       "vspltish $vD, $SIMM", VecPerm,
+                       [(set VRRC:$vD, (v8i16 vecspltish:$SIMM))]>;
+def VSPLTISW : VXForm_3<908, (outs VRRC:$vD), (ins s5imm:$SIMM),
+                       "vspltisw $vD, $SIMM", VecPerm,
+                       [(set VRRC:$vD, (v4i32 vecspltisw:$SIMM))]>;
+
+// Vector Pack.
+def VPKPX   : VX1_Int<782, "vpkpx", int_ppc_altivec_vpkpx>;
+def VPKSHSS : VX1_Int<398, "vpkshss", int_ppc_altivec_vpkshss>;
+def VPKSHUS : VX1_Int<270, "vpkshus", int_ppc_altivec_vpkshus>;
+def VPKSWSS : VX1_Int<462, "vpkswss", int_ppc_altivec_vpkswss>;
+def VPKSWUS : VX1_Int<334, "vpkswus", int_ppc_altivec_vpkswus>;
+def VPKUHUM : VXForm_1<14, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                       "vpkuhum $vD, $vA, $vB", VecFP,
+                       [(set VRRC:$vD,
+                         (vpkuhum_shuffle (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VPKUHUS : VX1_Int<142, "vpkuhus", int_ppc_altivec_vpkuhus>;
+def VPKUWUM : VXForm_1<78, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),
+                       "vpkuwum $vD, $vA, $vB", VecFP,
+                       [(set VRRC:$vD,
+                         (vpkuwum_shuffle (v16i8 VRRC:$vA), VRRC:$vB))]>;
+def VPKUWUS : VX1_Int<206, "vpkuwus", int_ppc_altivec_vpkuwus>;
+
+// Vector Unpack.
+def VUPKHPX : VX2_Int<846, "vupkhpx", int_ppc_altivec_vupkhpx>;
+def VUPKHSB : VX2_Int<526, "vupkhsb", int_ppc_altivec_vupkhsb>;
+def VUPKHSH : VX2_Int<590, "vupkhsh", int_ppc_altivec_vupkhsh>;
+def VUPKLPX : VX2_Int<974, "vupklpx", int_ppc_altivec_vupklpx>;
+def VUPKLSB : VX2_Int<654, "vupklsb", int_ppc_altivec_vupklsb>;
+def VUPKLSH : VX2_Int<718, "vupklsh", int_ppc_altivec_vupklsh>;
+
+
+// Altivec Comparisons.
+
+class VCMP<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),asmstr,VecFPCompare,
+              [(set VRRC:$vD, (Ty (PPCvcmp VRRC:$vA, VRRC:$vB, xo)))]>;
+class VCMPo<bits<10> xo, string asmstr, ValueType Ty>
+  : VXRForm_1<xo, (outs VRRC:$vD), (ins VRRC:$vA, VRRC:$vB),asmstr,VecFPCompare,
+              [(set VRRC:$vD, (Ty (PPCvcmp_o VRRC:$vA, VRRC:$vB, xo)))]> {
+  let Defs = [CR6];
+  let RC = 1;
+}
+
+// f32 element comparisons.
+def VCMPBFP   : VCMP <966, "vcmpbfp $vD, $vA, $vB"  , v4f32>;
+def VCMPBFPo  : VCMPo<966, "vcmpbfp. $vD, $vA, $vB" , v4f32>;
+def VCMPEQFP  : VCMP <198, "vcmpeqfp $vD, $vA, $vB" , v4f32>;
+def VCMPEQFPo : VCMPo<198, "vcmpeqfp. $vD, $vA, $vB", v4f32>;
+def VCMPGEFP  : VCMP <454, "vcmpgefp $vD, $vA, $vB" , v4f32>;
+def VCMPGEFPo : VCMPo<454, "vcmpgefp. $vD, $vA, $vB", v4f32>;
+def VCMPGTFP  : VCMP <710, "vcmpgtfp $vD, $vA, $vB" , v4f32>;
+def VCMPGTFPo : VCMPo<710, "vcmpgtfp. $vD, $vA, $vB", v4f32>;
+
+// i8 element comparisons.
+def VCMPEQUB  : VCMP <  6, "vcmpequb $vD, $vA, $vB" , v16i8>;
+def VCMPEQUBo : VCMPo<  6, "vcmpequb. $vD, $vA, $vB", v16i8>;
+def VCMPGTSB  : VCMP <774, "vcmpgtsb $vD, $vA, $vB" , v16i8>;
+def VCMPGTSBo : VCMPo<774, "vcmpgtsb. $vD, $vA, $vB", v16i8>;
+def VCMPGTUB  : VCMP <518, "vcmpgtub $vD, $vA, $vB" , v16i8>;
+def VCMPGTUBo : VCMPo<518, "vcmpgtub. $vD, $vA, $vB", v16i8>;
+
+// i16 element comparisons.
+def VCMPEQUH  : VCMP < 70, "vcmpequh $vD, $vA, $vB" , v8i16>;
+def VCMPEQUHo : VCMPo< 70, "vcmpequh. $vD, $vA, $vB", v8i16>;
+def VCMPGTSH  : VCMP <838, "vcmpgtsh $vD, $vA, $vB" , v8i16>;
+def VCMPGTSHo : VCMPo<838, "vcmpgtsh. $vD, $vA, $vB", v8i16>;
+def VCMPGTUH  : VCMP <582, "vcmpgtuh $vD, $vA, $vB" , v8i16>;
+def VCMPGTUHo : VCMPo<582, "vcmpgtuh. $vD, $vA, $vB", v8i16>;
+
+// i32 element comparisons.
+def VCMPEQUW  : VCMP <134, "vcmpequw $vD, $vA, $vB" , v4i32>;
+def VCMPEQUWo : VCMPo<134, "vcmpequw. $vD, $vA, $vB", v4i32>;
+def VCMPGTSW  : VCMP <902, "vcmpgtsw $vD, $vA, $vB" , v4i32>;
+def VCMPGTSWo : VCMPo<902, "vcmpgtsw. $vD, $vA, $vB", v4i32>;
+def VCMPGTUW  : VCMP <646, "vcmpgtuw $vD, $vA, $vB" , v4i32>;
+def VCMPGTUWo : VCMPo<646, "vcmpgtuw. $vD, $vA, $vB", v4i32>;
+                      
+def V_SET0 : VXForm_setzero<1220, (outs VRRC:$vD), (ins),
+                      "vxor $vD, $vD, $vD", VecFP,
+                      [(set VRRC:$vD, (v4i32 immAllZerosV))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Additional Altivec Patterns
+//
+
+// DS* intrinsics
+def : Pat<(int_ppc_altivec_dssall), (DSSALL 1, 0, 0, 0)>;
+def : Pat<(int_ppc_altivec_dss imm:$STRM), (DSS 0, imm:$STRM, 0, 0)>;
+
+//  * 32-bit
+def : Pat<(int_ppc_altivec_dst GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstt GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTT 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstst GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTST 0, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dststt GPRC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTSTT 1, imm:$STRM, GPRC:$rA, GPRC:$rB)>;
+
+//  * 64-bit
+def : Pat<(int_ppc_altivec_dst G8RC:$rA, GPRC:$rB, imm:$STRM),
+          (DST64 0, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstt G8RC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTT64 1, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dstst G8RC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTST64 0, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+def : Pat<(int_ppc_altivec_dststt G8RC:$rA, GPRC:$rB, imm:$STRM),
+          (DSTSTT64 1, imm:$STRM, (i64 G8RC:$rA), GPRC:$rB)>;
+
+// Loads.
+def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
+
+// Stores.
+def : Pat<(store (v4i32 VRRC:$rS), xoaddr:$dst),
+          (STVX (v4i32 VRRC:$rS), xoaddr:$dst)>;
+
+// Bit conversions.
+def : Pat<(v16i8 (bitconvert (v8i16 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VRRC:$src))), (v16i8 VRRC:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VRRC:$src))), (v16i8 VRRC:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v16i8 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VRRC:$src))), (v8i16 VRRC:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VRRC:$src))), (v8i16 VRRC:$src)>;
+
+def : Pat<(v4i32 (bitconvert (v16i8 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VRRC:$src))), (v4i32 VRRC:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VRRC:$src))), (v4i32 VRRC:$src)>;
+
+def : Pat<(v4f32 (bitconvert (v16i8 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VRRC:$src))), (v4f32 VRRC:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VRRC:$src))), (v4f32 VRRC:$src)>;
+
+// Shuffles.
+
+// Match vsldoi(x,x), vpkuwum(x,x), vpkuhum(x,x)
+def:Pat<(vsldoi_unary_shuffle:$in (v16i8 VRRC:$vA), undef),
+        (VSLDOI VRRC:$vA, VRRC:$vA, (VSLDOI_unary_get_imm VRRC:$in))>;
+def:Pat<(vpkuwum_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VPKUWUM VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vpkuhum_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VPKUHUM VRRC:$vA, VRRC:$vA)>;
+
+// Match vmrg*(x,x)
+def:Pat<(vmrglb_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VMRGLB VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vmrglh_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VMRGLH VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vmrglw_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VMRGLW VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vmrghb_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VMRGHB VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vmrghh_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VMRGHH VRRC:$vA, VRRC:$vA)>;
+def:Pat<(vmrghw_unary_shuffle (v16i8 VRRC:$vA), undef),
+        (VMRGHW VRRC:$vA, VRRC:$vA)>;
+
+// Logical Operations
+def : Pat<(v4i32 (vnot VRRC:$vA)),      (VNOR VRRC:$vA, VRRC:$vA)>;
+def : Pat<(v4i32 (vnot_conv VRRC:$vA)), (VNOR VRRC:$vA, VRRC:$vA)>;
+
+def : Pat<(v4i32 (vnot_conv (or VRRC:$A, VRRC:$B))),
+          (VNOR VRRC:$A, VRRC:$B)>;
+def : Pat<(v4i32 (and VRRC:$A, (vnot_conv VRRC:$B))),
+          (VANDC VRRC:$A, VRRC:$B)>;
+
+def : Pat<(fmul VRRC:$vA, VRRC:$vB),
+          (VMADDFP VRRC:$vA, VRRC:$vB, (v4i32 (V_SET0)))>; 
+
+// Fused multiply add and multiply sub for packed float.  These are represented
+// separately from the real instructions above, for operations that must have
+// the additional precision, such as Newton-Rhapson (used by divide, sqrt)
+def : Pat<(PPCvmaddfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+def : Pat<(PPCvnmsubfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+
+def : Pat<(int_ppc_altivec_vmaddfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VMADDFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+def : Pat<(int_ppc_altivec_vnmsubfp VRRC:$A, VRRC:$B, VRRC:$C),
+          (VNMSUBFP VRRC:$A, VRRC:$B, VRRC:$C)>;
+
+def : Pat<(PPCvperm (v16i8 VRRC:$vA), VRRC:$vB, VRRC:$vC),
+          (VPERM VRRC:$vA, VRRC:$vB, VRRC:$vC)>;
+
+// Vector shifts
+def : Pat<(v16i8 (shl (v16i8 VRRC:$vA), (v16i8 VRRC:$vB))),
+          (v16i8 (VSLB VRRC:$vA, VRRC:$vB))>;
+def : Pat<(v8i16 (shl (v8i16 VRRC:$vA), (v8i16 VRRC:$vB))),
+          (v8i16 (VSLH VRRC:$vA, VRRC:$vB))>;
+def : Pat<(v4i32 (shl (v4i32 VRRC:$vA), (v4i32 VRRC:$vB))),
+          (v4i32 (VSLW VRRC:$vA, VRRC:$vB))>;
+
+def : Pat<(v16i8 (srl (v16i8 VRRC:$vA), (v16i8 VRRC:$vB))),
+          (v16i8 (VSRB VRRC:$vA, VRRC:$vB))>;
+def : Pat<(v8i16 (srl (v8i16 VRRC:$vA), (v8i16 VRRC:$vB))),
+          (v8i16 (VSRH VRRC:$vA, VRRC:$vB))>;
+def : Pat<(v4i32 (srl (v4i32 VRRC:$vA), (v4i32 VRRC:$vB))),
+          (v4i32 (VSRW VRRC:$vA, VRRC:$vB))>;
+
+def : Pat<(v16i8 (sra (v16i8 VRRC:$vA), (v16i8 VRRC:$vB))),
+          (v16i8 (VSRAB VRRC:$vA, VRRC:$vB))>;
+def : Pat<(v8i16 (sra (v8i16 VRRC:$vA), (v8i16 VRRC:$vB))),
+          (v8i16 (VSRAH VRRC:$vA, VRRC:$vB))>;
+def : Pat<(v4i32 (sra (v4i32 VRRC:$vA), (v4i32 VRRC:$vB))),
+          (v4i32 (VSRAW VRRC:$vA, VRRC:$vB))>;
diff --git a/lib/Target/PowerPC/PPCInstrBuilder.h b/lib/Target/PowerPC/PPCInstrBuilder.h
new file mode 100644
index 0000000..b424d11
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrBuilder.h
@@ -0,0 +1,43 @@
+//===-- PPCInstrBuilder.h - Aids for building PPC insts ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to simplify generating frame and constant pool
+// references.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Dest Reg, Base Reg, and either Reg Index or Immediate
+// Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_INSTRBUILDER_H
+#define POWERPC_INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+namespace llvm {
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference uses the FrameIndex as its base register until it is resolved.
+/// This allows a constant offset to be specified as well.
+///
+static inline const MachineInstrBuilder&
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0,
+                  bool mem = true) {
+  if (mem)
+    return MIB.addImm(Offset).addFrameIndex(FI);
+  else
+    return MIB.addFrameIndex(FI).addImm(Offset);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCInstrFormats.td b/lib/Target/PowerPC/PPCInstrFormats.td
new file mode 100644
index 0000000..54cebcd
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrFormats.td
@@ -0,0 +1,875 @@
+//===- PowerPCInstrFormats.td - PowerPC Instruction Formats --*- tablegen -*-=//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//
+// PowerPC instruction formats
+
+class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
+        : Instruction {
+  field bits<32> Inst;
+
+  bit PPC64 = 0;  // Default value, override with isPPC64
+
+  let Namespace = "PPC";
+  let Inst{0-5} = opcode;
+  let OutOperandList = OOL;
+  let InOperandList = IOL;
+  let AsmString = asmstr;
+  let Itinerary = itin;
+  
+  /// These fields correspond to the fields in PPCInstrInfo.h.  Any changes to
+  /// these must be reflected there!  See comments there for what these are.
+  bits<1> PPC970_First = 0;
+  bits<1> PPC970_Single = 0;
+  bits<1> PPC970_Cracked = 0;
+  bits<3> PPC970_Unit = 0;
+}
+
+class PPC970_DGroup_First   { bits<1> PPC970_First = 1;  }
+class PPC970_DGroup_Single  { bits<1> PPC970_Single = 1; }
+class PPC970_DGroup_Cracked { bits<1> PPC970_Cracked = 1; }
+class PPC970_MicroCode;
+
+class PPC970_Unit_Pseudo   { bits<3> PPC970_Unit = 0;   }
+class PPC970_Unit_FXU      { bits<3> PPC970_Unit = 1;   }
+class PPC970_Unit_LSU      { bits<3> PPC970_Unit = 2;   }
+class PPC970_Unit_FPU      { bits<3> PPC970_Unit = 3;   }
+class PPC970_Unit_CRU      { bits<3> PPC970_Unit = 4;   }
+class PPC970_Unit_VALU     { bits<3> PPC970_Unit = 5;   }
+class PPC970_Unit_VPERM    { bits<3> PPC970_Unit = 6;   }
+class PPC970_Unit_BRU      { bits<3> PPC970_Unit = 7;   }
+
+
+// 1.7.1 I-Form
+class IForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr,
+            InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  bits<24> LI;
+
+  let Inst{6-29}  = LI;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+// 1.7.2 B-Form
+class BForm<bits<6> opcode, bit aa, bit lk, dag OOL, dag IOL, string asmstr>
+  : I<opcode, OOL, IOL, asmstr, BrB> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  bits<14> BD;
+
+  bits<5> BI;
+  let BI{0-1} = BIBO{5-6};
+  let BI{2-4} = CR{0-2};
+
+  let Inst{6-10}  = BIBO{4-0};
+  let Inst{11-15} = BI;
+  let Inst{16-29} = BD;
+  let Inst{30}    = aa;
+  let Inst{31}    = lk;
+}
+
+
+// 1.7.4 D-Form
+class DForm_base<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<5>  B;
+  bits<16> C;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+class DForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> C;
+  bits<5>  B;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+
+class DForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : DForm_base<opcode, OOL, IOL, asmstr, itin, pattern>;
+
+class DForm_2_r0<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  A;
+  bits<16> B;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = 0;
+  let Inst{16-31} = B;
+}
+
+class DForm_4<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  B;
+  bits<5>  A;
+  bits<16> C;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = A;
+  let Inst{11-15} = B;
+  let Inst{16-31} = C;
+}
+              
+class DForm_4_zero<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin, list<dag> pattern>
+  : DForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+  let B = 0;
+  let C = 0;
+}
+
+class DForm_5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3>  BF;
+  bits<1>  L;
+  bits<5>  RA;
+  bits<16> I;
+
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-31} = I;
+}
+
+class DForm_5_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_5<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class DForm_6<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin> 
+  : DForm_5<opcode, OOL, IOL, asmstr, itin>;
+
+class DForm_6_ext<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+                  InstrItinClass itin>
+  : DForm_6<opcode, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+
+// 1.7.5 DS-Form
+class DSForm_1<bits<6> opcode, bits<2> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RST;
+  bits<14> DS;
+  bits<5>  RA;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = RA;
+  let Inst{16-29} = DS;
+  let Inst{30-31} = xo;
+}
+
+// 1.7.6 X-Form
+class XForm_base_r3xo<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RST;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// This is the same as XForm_base_r3xo, but the first two operands are swapped
+// when code is emitted.
+class XForm_base_r3xo_swapped
+        <bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+        InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RST;
+  bits<5> B;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+
+class XForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_6<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+}
+
+class XForm_8<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern>;
+
+class XForm_10<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+    let Pattern = pattern;
+}
+
+class XForm_11<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo_swapped<opcode, xo, OOL, IOL, asmstr, itin> {
+  let B = 0;
+  let Pattern = pattern;
+}
+
+class XForm_16<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<1> L; 
+  bits<5> RA;
+  bits<5> RB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9}     = 0;
+  let Inst{10}    = L;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_16_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                   InstrItinClass itin>
+  : XForm_16<opcode, xo, OOL, IOL, asmstr, itin> {
+  let L = PPC64;
+}
+
+class XForm_17<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<5> FRA;
+  bits<5> FRB;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  let Inst{6-10}  = 31;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_24_sync<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+               string asmstr, InstrItinClass itin, list<dag> pattern> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  let Pattern = pattern;
+  let Inst{6-10}  = 0;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XForm_25<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+class XForm_26<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let A = 0;
+}
+
+class XForm_28<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern> 
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// This is used for MFFS, MTFSB0, MTFSB1.  42 is arbitrary; this series of
+// numbers presumably relates to some document, but I haven't found it.
+class XForm_42<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RST;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+class XForm_43<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : XForm_base_r3xo<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let Pattern = pattern;
+  bits<5> FM;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FM;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// DCB_Form - Form X instruction, used for dcb* instructions.
+class DCB_Form<bits<10> xo, bits<5> immfield, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6-10}  = immfield;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+
+// DSS_Form - Form X instruction, used for AltiVec dss* instructions.
+class DSS_Form<bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      InstrItinClass itin, list<dag> pattern>
+  : I<31, OOL, IOL, asmstr, itin> {
+  bits<1> T;
+  bits<2> STRM;
+  bits<5> A;
+  bits<5> B;
+
+  let Pattern = pattern;
+
+  let Inst{6}     = T;
+  let Inst{7-8}   = 0;
+  let Inst{9-10}  = STRM;
+  let Inst{11-15} = A;
+  let Inst{16-20} = B;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.7 XL-Form
+class XLForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  bits<5> CRA;
+  bits<5> CRB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRA;
+  let Inst{16-20} = CRB;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_1_ext<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> CRD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = CRD;
+  let Inst{11-15} = CRD;
+  let Inst{16-20} = CRD;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XLForm_2<bits<6> opcode, bits<10> xo, bit lk, dag OOL, dag IOL, string asmstr, 
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> BO;
+  bits<5> BI;
+  bits<2> BH;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = BO;
+  let Inst{11-15} = BI;
+  let Inst{16-18} = 0;
+  let Inst{19-20} = BH;
+  let Inst{21-30} = xo;
+  let Inst{31}    = lk;
+}
+
+class XLForm_2_br<bits<6> opcode, bits<10> xo, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  bits<7> BIBO;  // 2 bits of BI and 5 bits of BO.
+  bits<3>  CR;
+  
+  let BO = BIBO{2-6};
+  let BI{0-1} = BIBO{0-1};
+  let BI{2-4} = CR;
+  let BH = 0;
+}
+
+
+class XLForm_2_ext<bits<6> opcode, bits<10> xo, bits<5> bo,  bits<5> bi, bit lk,
+                  dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XLForm_2<opcode, xo, lk, OOL, IOL, asmstr, itin, pattern> {
+  let BO = bo;
+  let BI = bi;
+  let BH = 0;
+}
+
+class XLForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<3> BF;
+  bits<3> BFA;
+  
+  let Inst{6-8}   = BF;
+  let Inst{9-10}  = 0;
+  let Inst{11-13} = BFA;
+  let Inst{14-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+// 1.7.8 XFX-Form
+class XFXForm_1<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+  bits<10> SPR;
+
+  let Inst{6-10}  = RT;
+  let Inst{11}    = SPR{4};
+  let Inst{12}    = SPR{3};
+  let Inst{13}    = SPR{2};
+  let Inst{14}    = SPR{1};
+  let Inst{15}    = SPR{0};
+  let Inst{16}    = SPR{9};
+  let Inst{17}    = SPR{8};
+  let Inst{18}    = SPR{7};
+  let Inst{19}    = SPR{6};
+  let Inst{20}    = SPR{5};
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_1_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                   dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+class XFXForm_3<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  RT;
+   
+  let Inst{6-10}  = RT;
+  let Inst{11-20} = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8>  FXM;
+  bits<5>  ST;
+   
+  let Inst{6-10}  = ST;
+  let Inst{11}    = 0;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_5a<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                 InstrItinClass itin> 
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5>  ST;
+  bits<8>  FXM;
+   
+  let Inst{6-10}  = ST;
+  let Inst{11}    = 1;
+  let Inst{12-19} = FXM;
+  let Inst{20}    = 0;
+  let Inst{21-30} = xo;
+  let Inst{31}    = 0;
+}
+
+class XFXForm_7<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin>
+  : XFXForm_1<opcode, xo, OOL, IOL, asmstr, itin>;
+
+class XFXForm_7_ext<bits<6> opcode, bits<10> xo, bits<10> spr, 
+                    dag OOL, dag IOL, string asmstr, InstrItinClass itin> 
+  : XFXForm_7<opcode, xo, OOL, IOL, asmstr, itin> {
+  let SPR = spr;
+}
+
+// XFL-Form - MTFSF
+// This is probably 1.7.9, but I don't have the reference that uses this
+// numbering scheme...
+class XFLForm<bits<6> opcode, bits<10> xo, dag OOL, dag IOL, string asmstr, 
+                      string cstr, InstrItinClass itin, list<dag>pattern>
+  : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<8> FM;
+  bits<5> RT;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+  let Constraints = cstr;
+
+  let Inst{6} = 0;
+  let Inst{7-14}  = FM;
+  let Inst{15} = 0;
+  let Inst{16-20} = RT;
+  let Inst{21-30} = xo;
+  let Inst{31}    = RC;
+}
+
+// 1.7.10 XS-Form - SRADI.
+class XSForm_1<bits<6> opcode, bits<9> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> A;
+  bits<5> RS;
+  bits<6> SH;
+
+  bit RC = 0;    // set by isDOT
+  let Pattern = pattern;
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = A;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+// 1.7.11 XO-Form
+class XOForm_1<bits<6> opcode, bits<9> xo, bit oe, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RT;
+  bits<5> RA;
+  bits<5> RB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RT;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21}    = oe;
+  let Inst{22-30} = xo;
+  let Inst{31}    = RC;  
+}
+
+class XOForm_3<bits<6> opcode, bits<9> xo, bit oe, 
+               dag OOL, dag IOL, string asmstr, InstrItinClass itin, list<dag> pattern>
+  : XOForm_1<opcode, xo, oe, OOL, IOL, asmstr, itin, pattern> {
+  let RB = 0;
+}
+
+// 1.7.12 A-Form
+class AForm_1<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr, 
+              InstrItinClass itin, list<dag> pattern>
+         : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> FRT;
+  bits<5> FRA;
+  bits<5> FRC;
+  bits<5> FRB;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = FRT;
+  let Inst{11-15} = FRA;
+  let Inst{16-20} = FRB;
+  let Inst{21-25} = FRC;
+  let Inst{26-30} = xo;
+  let Inst{31}    = RC;
+}
+
+class AForm_2<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRC = 0;
+}
+
+class AForm_3<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern> 
+  : AForm_1<opcode, xo, OOL, IOL, asmstr, itin, pattern> {
+  let FRB = 0;
+}
+
+// 1.7.13 M-Form
+class MForm_1<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<5> RB;
+  bits<5> MB;
+  bits<5> ME;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = RB;
+  let Inst{21-25} = MB;
+  let Inst{26-30} = ME;
+  let Inst{31}    = RC;
+}
+
+class MForm_2<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+              InstrItinClass itin, list<dag> pattern>
+  : MForm_1<opcode, OOL, IOL, asmstr, itin, pattern> {
+}
+
+// 1.7.14 MD-Form
+class MDForm_1<bits<6> opcode, bits<3> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<opcode, OOL, IOL, asmstr, itin> {
+  bits<5> RA;
+  bits<5> RS;
+  bits<6> SH;
+  bits<6> MBE;
+
+  let Pattern = pattern;
+
+  bit RC = 0;    // set by isDOT
+
+  let Inst{6-10}  = RS;
+  let Inst{11-15} = RA;
+  let Inst{16-20} = SH{4,3,2,1,0};
+  let Inst{21-26} = MBE{4,3,2,1,0,5};
+  let Inst{27-29} = xo;
+  let Inst{30}    = SH{5};
+  let Inst{31}    = RC;
+}
+
+
+
+// E-1 VA-Form
+
+// VAForm_1 - DACB ordering.
+class VAForm_1<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VC;
+  bits<5> VB;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+// VAForm_1a - DABC ordering.
+class VAForm_1a<bits<6> xo, dag OOL, dag IOL, string asmstr,
+                InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<5> VC;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-25} = VC;
+  let Inst{26-31} = xo;
+}
+
+class VAForm_2<bits<6> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bits<4> SH;
+
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = 0;
+  let Inst{22-25} = SH;
+  let Inst{26-31} = xo;
+}
+
+// E-2 VX-Form
+class VXForm_1<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_setzero<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : VXForm_1<xo, OOL, IOL, asmstr, itin, pattern> {
+  let VA = VD;
+  let VB = VD;
+}
+
+
+class VXForm_2<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+class VXForm_3<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> IMM;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = IMM;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_4 - VX instructions with "VD,0,0" register fields, like mfvscr.
+class VXForm_4<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = 0;
+  let Inst{21-31} = xo;
+}
+
+/// VXForm_5 - VX instructions with "0,0,VB" register fields, like mtvscr.
+class VXForm_5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VB;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = 0;
+  let Inst{11-15} = 0;
+  let Inst{16-20} = VB;
+  let Inst{21-31} = xo;
+}
+
+// E-4 VXR-Form
+class VXRForm_1<bits<10> xo, dag OOL, dag IOL, string asmstr,
+               InstrItinClass itin, list<dag> pattern>
+    : I<4, OOL, IOL, asmstr, itin> {
+  bits<5> VD;
+  bits<5> VA;
+  bits<5> VB;
+  bit RC = 0;
+  
+  let Pattern = pattern;
+  
+  let Inst{6-10}  = VD;
+  let Inst{11-15} = VA;
+  let Inst{16-20} = VB;
+  let Inst{21}    = RC;
+  let Inst{22-31} = xo;
+}
+
+//===----------------------------------------------------------------------===//
+class Pseudo<dag OOL, dag IOL, string asmstr, list<dag> pattern>
+    : I<0, OOL, IOL, asmstr, NoItinerary> {
+  let PPC64 = 0;
+  let Pattern = pattern;
+  let Inst{31-0} = 0;
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
new file mode 100644
index 0000000..af7d812
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -0,0 +1,768 @@
+//===- PPCInstrInfo.cpp - PowerPC32 Instruction Information -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCPredicates.h"
+#include "PPCGenInstrInfo.inc"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCAsmInfo.h"
+using namespace llvm;
+
+extern cl::opt<bool> EnablePPC32RS;  // FIXME (64-bit): See PPCRegisterInfo.cpp.
+extern cl::opt<bool> EnablePPC64RS;  // FIXME (64-bit): See PPCRegisterInfo.cpp.
+
+PPCInstrInfo::PPCInstrInfo(PPCTargetMachine &tm)
+  : TargetInstrInfoImpl(PPCInsts, array_lengthof(PPCInsts)), TM(tm),
+    RI(*TM.getSubtargetImpl(), *this) {}
+
+bool PPCInstrInfo::isMoveInstr(const MachineInstr& MI,
+                               unsigned& sourceReg,
+                               unsigned& destReg,
+                               unsigned& sourceSubIdx,
+                               unsigned& destSubIdx) const {
+  sourceSubIdx = destSubIdx = 0; // No sub-registers.
+
+  unsigned oc = MI.getOpcode();
+  if (oc == PPC::OR || oc == PPC::OR8 || oc == PPC::VOR ||
+      oc == PPC::OR4To8 || oc == PPC::OR8To4) {                // or r1, r2, r2
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           MI.getOperand(2).isReg() &&
+           "invalid PPC OR instruction!");
+    if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  } else if (oc == PPC::ADDI) {             // addi r1, r2, 0
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(2).isImm() &&
+           "invalid PPC ADDI instruction!");
+    if (MI.getOperand(1).isReg() && MI.getOperand(2).getImm() == 0) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  } else if (oc == PPC::ORI) {             // ori r1, r2, 0
+    assert(MI.getNumOperands() >= 3 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           MI.getOperand(2).isImm() &&
+           "invalid PPC ORI instruction!");
+    if (MI.getOperand(2).getImm() == 0) {
+      sourceReg = MI.getOperand(1).getReg();
+      destReg = MI.getOperand(0).getReg();
+      return true;
+    }
+  } else if (oc == PPC::FMRS || oc == PPC::FMRD ||
+             oc == PPC::FMRSD) {      // fmr r1, r2
+    assert(MI.getNumOperands() >= 2 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           "invalid PPC FMR instruction");
+    sourceReg = MI.getOperand(1).getReg();
+    destReg = MI.getOperand(0).getReg();
+    return true;
+  } else if (oc == PPC::MCRF) {             // mcrf cr1, cr2
+    assert(MI.getNumOperands() >= 2 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           "invalid PPC MCRF instruction");
+    sourceReg = MI.getOperand(1).getReg();
+    destReg = MI.getOperand(0).getReg();
+    return true;
+  }
+  return false;
+}
+
+unsigned PPCInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, 
+                                           int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::LD:
+  case PPC::LWZ:
+  case PPC::LFS:
+  case PPC::LFD:
+    if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+        MI->getOperand(2).isFI()) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+unsigned PPCInstrInfo::isStoreToStackSlot(const MachineInstr *MI, 
+                                          int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case PPC::STD:
+  case PPC::STW:
+  case PPC::STFS:
+  case PPC::STFD:
+    if (MI->getOperand(1).isImm() && !MI->getOperand(1).getImm() &&
+        MI->getOperand(2).isFI()) {
+      FrameIndex = MI->getOperand(2).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
// commuteInstruction - We can commute rlwimi instructions, but only if the
// rotate amt is zero.  We also have to munge the immediates a bit.
MachineInstr *
PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
  MachineFunction &MF = *MI->getParent()->getParent();

  // Normal instructions can be commuted the obvious way.
  if (MI->getOpcode() != PPC::RLWIMI)
    return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
  
  // Cannot commute if it has a non-zero rotate count.
  if (MI->getOperand(3).getImm() != 0)
    return 0;
  
  // If we have a zero rotate count, we have:
  //   M = mask(MB,ME)
  //   Op0 = (Op1 & ~M) | (Op2 & M)
  // Change this to:
  //   M = mask((ME+1)&31, (MB-1)&31)
  //   Op0 = (Op2 & ~M) | (Op1 & M)

  // Swap op1/op2
  unsigned Reg0 = MI->getOperand(0).getReg();
  unsigned Reg1 = MI->getOperand(1).getReg();
  unsigned Reg2 = MI->getOperand(2).getReg();
  bool Reg1IsKill = MI->getOperand(1).isKill();
  bool Reg2IsKill = MI->getOperand(2).isKill();
  bool ChangeReg0 = false;
  // If machine instrs are no longer in two-address forms, update
  // destination register as well.
  if (Reg0 == Reg1) {
    // Must be two address instruction!
    assert(MI->getDesc().getOperandConstraint(0, TOI::TIED_TO) &&
           "Expecting a two-address instruction!");
    // After the swap, Reg2 occupies the tied op1 slot and so also defines the
    // destination; it must not carry a kill flag.
    Reg2IsKill = false;
    ChangeReg0 = true;
  }

  // Masks.
  unsigned MB = MI->getOperand(4).getImm();
  unsigned ME = MI->getOperand(5).getImm();

  if (NewMI) {
    // Create a new instruction.
    // NOTE: this Reg0 intentionally shadows the outer one — when the
    // destination must change it becomes Reg2, otherwise it is re-read.
    unsigned Reg0 = ChangeReg0 ? Reg2 : MI->getOperand(0).getReg();
    bool Reg0IsDead = MI->getOperand(0).isDead();
    return BuildMI(MF, MI->getDebugLoc(), MI->getDesc())
      .addReg(Reg0, RegState::Define | getDeadRegState(Reg0IsDead))
      .addReg(Reg2, getKillRegState(Reg2IsKill))
      .addReg(Reg1, getKillRegState(Reg1IsKill))
      .addImm((ME+1) & 31)
      .addImm((MB-1) & 31);
  }

  // Otherwise mutate MI in place: swap the two source registers and their
  // kill flags.
  if (ChangeReg0)
    MI->getOperand(0).setReg(Reg2);
  MI->getOperand(2).setReg(Reg1);
  MI->getOperand(1).setReg(Reg2);
  MI->getOperand(2).setIsKill(Reg1IsKill);
  MI->getOperand(1).setIsKill(Reg2IsKill);
  
  // Swap the mask around.
  MI->getOperand(4).setImm((ME+1) & 31);
  MI->getOperand(5).setImm((MB-1) & 31);
  return MI;
}
+
+void PPCInstrInfo::insertNoop(MachineBasicBlock &MBB, 
+                              MachineBasicBlock::iterator MI) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  BuildMI(MBB, MI, DL, get(PPC::NOP));
+}
+
+
+// Branch analysis.
/// AnalyzeBranch - Examine the terminators of MBB and, when they form one of
/// the recognized patterns (fallthrough, unconditional B, conditional BCC,
/// BCC+B, or B+B), fill in TBB/FBB/Cond.  Returns true when the terminators
/// cannot be understood.  Cond receives the BCC predicate immediate followed
/// by its CR register operand.
bool PPCInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
  // If the block has no terminators, it just falls into the block after it.
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
    return false;

  // Get the last instruction in the block.
  MachineInstr *LastInst = I;
  
  // If there is only one terminator instruction, process it.
  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
    if (LastInst->getOpcode() == PPC::B) {
      if (!LastInst->getOperand(0).isMBB())
        return true;
      TBB = LastInst->getOperand(0).getMBB();
      return false;
    } else if (LastInst->getOpcode() == PPC::BCC) {
      if (!LastInst->getOperand(2).isMBB())
        return true;
      // Block ends with fall-through condbranch.
      TBB = LastInst->getOperand(2).getMBB();
      Cond.push_back(LastInst->getOperand(0));
      Cond.push_back(LastInst->getOperand(1));
      return false;
    }
    // Otherwise, don't know what this is.
    return true;
  }
  
  // Get the instruction before it if it's a terminator.
  MachineInstr *SecondLastInst = I;

  // If there are three terminators, we don't know what sort of block this is.
  if (SecondLastInst && I != MBB.begin() &&
      isUnpredicatedTerminator(--I))
    return true;
  
  // If the block ends with PPC::B and PPC::BCC, handle it.
  if (SecondLastInst->getOpcode() == PPC::BCC && 
      LastInst->getOpcode() == PPC::B) {
    if (!SecondLastInst->getOperand(2).isMBB() ||
        !LastInst->getOperand(0).isMBB())
      return true;
    TBB =  SecondLastInst->getOperand(2).getMBB();
    Cond.push_back(SecondLastInst->getOperand(0));
    Cond.push_back(SecondLastInst->getOperand(1));
    FBB = LastInst->getOperand(0).getMBB();
    return false;
  }
  
  // If the block ends with two PPC::Bs, handle it.  The second one is not
  // executed, so remove it.
  if (SecondLastInst->getOpcode() == PPC::B && 
      LastInst->getOpcode() == PPC::B) {
    if (!SecondLastInst->getOperand(0).isMBB())
      return true;
    TBB = SecondLastInst->getOperand(0).getMBB();
    I = LastInst;
    // Only delete the dead branch when the caller permits modification.
    if (AllowModify)
      I->eraseFromParent();
    return false;
  }

  // Otherwise, can't handle this.
  return true;
}
+
/// RemoveBranch - Delete the branch instructions at the end of MBB (an
/// optional trailing B or BCC, then an optional BCC before it) and return how
/// many were removed (0, 1, or 2).
unsigned PPCInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
  MachineBasicBlock::iterator I = MBB.end();
  if (I == MBB.begin()) return 0;
  --I;
  if (I->getOpcode() != PPC::B && I->getOpcode() != PPC::BCC)
    return 0;
  
  // Remove the branch.
  I->eraseFromParent();
  
  // Re-fetch the end iterator; the erase above invalidated I.
  I = MBB.end();

  if (I == MBB.begin()) return 1;
  --I;
  // Only a conditional branch can precede the branch we just removed.
  if (I->getOpcode() != PPC::BCC)
    return 1;
  
  // Remove the branch.
  I->eraseFromParent();
  return 2;
}
+
/// InsertBranch - Insert an unconditional (B), conditional (BCC), or two-way
/// (BCC + B) branch sequence at the end of MBB and return the number of
/// instructions inserted.  Cond is either empty (unconditional) or the
/// two-element {predicate imm, CR reg} form produced by AnalyzeBranch.
unsigned
PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                           MachineBasicBlock *FBB,
                           const SmallVectorImpl<MachineOperand> &Cond) const {
  // FIXME this should probably have a DebugLoc argument
  DebugLoc dl = DebugLoc::getUnknownLoc();
  // Shouldn't be a fall through.
  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
  assert((Cond.size() == 2 || Cond.size() == 0) && 
         "PPC branch conditions have two components!");
  
  // One-way branch.
  if (FBB == 0) {
    if (Cond.empty())   // Unconditional branch
      BuildMI(&MBB, dl, get(PPC::B)).addMBB(TBB);
    else                // Conditional branch
      BuildMI(&MBB, dl, get(PPC::BCC))
        .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
    return 1;
  }
  
  // Two-way Conditional Branch.
  BuildMI(&MBB, dl, get(PPC::BCC))
    .addImm(Cond[0].getImm()).addReg(Cond[1].getReg()).addMBB(TBB);
  BuildMI(&MBB, dl, get(PPC::B)).addMBB(FBB);
  return 2;
}
+
/// copyRegToReg - Emit a register-to-register copy of SrcReg into DestReg
/// before MI, choosing the move idiom for the register class (OR rD,rS,rS
/// acts as "mr" for GPRs; FMR/MCRF/VOR/CROR for the others).  Returns false
/// when the copy is not supported (cross-class, or an unhandled class).
bool PPCInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   unsigned DestReg, unsigned SrcReg,
                                   const TargetRegisterClass *DestRC,
                                   const TargetRegisterClass *SrcRC) const {
  if (DestRC != SrcRC) {
    // Not yet supported!
    return false;
  }

  DebugLoc DL = DebugLoc::getUnknownLoc();
  if (MI != MBB.end()) DL = MI->getDebugLoc();

  if (DestRC == PPC::GPRCRegisterClass) {
    BuildMI(MBB, MI, DL, get(PPC::OR), DestReg).addReg(SrcReg).addReg(SrcReg);
  } else if (DestRC == PPC::G8RCRegisterClass) {
    BuildMI(MBB, MI, DL, get(PPC::OR8), DestReg).addReg(SrcReg).addReg(SrcReg);
  } else if (DestRC == PPC::F4RCRegisterClass) {
    BuildMI(MBB, MI, DL, get(PPC::FMRS), DestReg).addReg(SrcReg);
  } else if (DestRC == PPC::F8RCRegisterClass) {
    BuildMI(MBB, MI, DL, get(PPC::FMRD), DestReg).addReg(SrcReg);
  } else if (DestRC == PPC::CRRCRegisterClass) {
    BuildMI(MBB, MI, DL, get(PPC::MCRF), DestReg).addReg(SrcReg);
  } else if (DestRC == PPC::VRRCRegisterClass) {
    BuildMI(MBB, MI, DL, get(PPC::VOR), DestReg).addReg(SrcReg).addReg(SrcReg);
  } else if (DestRC == PPC::CRBITRCRegisterClass) {
    BuildMI(MBB, MI, DL, get(PPC::CROR), DestReg).addReg(SrcReg).addReg(SrcReg);
  } else {
    // Attempt to copy register that is not GPR or FPR
    return false;
  }
  
  return true;
}
+
/// StoreRegToStackSlot - Append to NewMIs the instruction sequence that
/// spills SrcReg (of register class RC) to stack slot FrameIdx.  Returns
/// true when the sequence uses the SPILL_CR pseudo, so the caller must mark
/// the function as spilling the CR (see storeRegToStackSlot).
bool
PPCInstrInfo::StoreRegToStackSlot(MachineFunction &MF,
                                  unsigned SrcReg, bool isKill,
                                  int FrameIdx,
                                  const TargetRegisterClass *RC,
                                  SmallVectorImpl<MachineInstr*> &NewMIs) const{
  DebugLoc DL = DebugLoc::getUnknownLoc();
  if (RC == PPC::GPRCRegisterClass) {
    if (SrcReg != PPC::LR) {
      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
                                         .addReg(SrcReg,
                                                 getKillRegState(isKill)),
                                         FrameIdx));
    } else {
      // FIXME: this spills LR immediately to memory in one step.  To do this,
      // we use R11, which we know cannot be used in the prolog/epilog.  This is
      // a hack.
      NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFLR), PPC::R11));
      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
                                         .addReg(PPC::R11,
                                                 getKillRegState(isKill)),
                                         FrameIdx));
    }
  } else if (RC == PPC::G8RCRegisterClass) {
    if (SrcReg != PPC::LR8) {
      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
                                         .addReg(SrcReg,
                                                 getKillRegState(isKill)),
                                         FrameIdx));
    } else {
      // FIXME: this spills LR immediately to memory in one step.  To do this,
      // we use R11, which we know cannot be used in the prolog/epilog.  This is
      // a hack.
      NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFLR8), PPC::X11));
      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STD))
                                         .addReg(PPC::X11,
                                                 getKillRegState(isKill)),
                                         FrameIdx));
    }
  } else if (RC == PPC::F8RCRegisterClass) {
    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFD))
                                       .addReg(SrcReg,
                                               getKillRegState(isKill)),
                                       FrameIdx));
  } else if (RC == PPC::F4RCRegisterClass) {
    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STFS))
                                       .addReg(SrcReg,
                                               getKillRegState(isKill)),
                                       FrameIdx));
  } else if (RC == PPC::CRRCRegisterClass) {
    // With the register scavenger enabled, spill via the SPILL_CR pseudo,
    // which is expanded later when a scratch register is known.
    if ((EnablePPC32RS && !TM.getSubtargetImpl()->isPPC64()) ||
        (EnablePPC64RS && TM.getSubtargetImpl()->isPPC64())) {
      // FIXME (64-bit): Enable
      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::SPILL_CR))
                                         .addReg(SrcReg,
                                                 getKillRegState(isKill)),
                                         FrameIdx));
      return true;
    } else {
      // FIXME: We use R0 here, because it isn't available for RA.  We need to
      // store the CR in the low 4-bits of the saved value.  First, issue a MFCR
      // to save all of the CRBits.
      NewMIs.push_back(BuildMI(MF, DL, get(PPC::MFCR), PPC::R0));
    
      // If the saved register wasn't CR0, shift the bits left so that they are
      // in CR0's slot.
      if (SrcReg != PPC::CR0) {
        unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(SrcReg)*4;
        // rlwinm r0, r0, ShiftBits, 0, 31.
        NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), PPC::R0)
                       .addReg(PPC::R0).addImm(ShiftBits).addImm(0).addImm(31));
      }
    
      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::STW))
                                         .addReg(PPC::R0,
                                                 getKillRegState(isKill)),
                                         FrameIdx));
    }
  } else if (RC == PPC::CRBITRCRegisterClass) {
    // FIXME: We use CRi here because there is no mtcrf on a bit. Since the
    // backend currently only uses CR1EQ as an individual bit, this should
    // not cause any bug. If we need other uses of CR bits, the following
    // code may be invalid.
    // Map the CR bit back to its parent CR field and spill the whole field.
    unsigned Reg = 0;
    if (SrcReg == PPC::CR0LT || SrcReg == PPC::CR0GT ||
        SrcReg == PPC::CR0EQ || SrcReg == PPC::CR0UN)
      Reg = PPC::CR0;
    else if (SrcReg == PPC::CR1LT || SrcReg == PPC::CR1GT ||
             SrcReg == PPC::CR1EQ || SrcReg == PPC::CR1UN)
      Reg = PPC::CR1;
    else if (SrcReg == PPC::CR2LT || SrcReg == PPC::CR2GT ||
             SrcReg == PPC::CR2EQ || SrcReg == PPC::CR2UN)
      Reg = PPC::CR2;
    else if (SrcReg == PPC::CR3LT || SrcReg == PPC::CR3GT ||
             SrcReg == PPC::CR3EQ || SrcReg == PPC::CR3UN)
      Reg = PPC::CR3;
    else if (SrcReg == PPC::CR4LT || SrcReg == PPC::CR4GT ||
             SrcReg == PPC::CR4EQ || SrcReg == PPC::CR4UN)
      Reg = PPC::CR4;
    else if (SrcReg == PPC::CR5LT || SrcReg == PPC::CR5GT ||
             SrcReg == PPC::CR5EQ || SrcReg == PPC::CR5UN)
      Reg = PPC::CR5;
    else if (SrcReg == PPC::CR6LT || SrcReg == PPC::CR6GT ||
             SrcReg == PPC::CR6EQ || SrcReg == PPC::CR6UN)
      Reg = PPC::CR6;
    else if (SrcReg == PPC::CR7LT || SrcReg == PPC::CR7GT ||
             SrcReg == PPC::CR7EQ || SrcReg == PPC::CR7UN)
      Reg = PPC::CR7;

    return StoreRegToStackSlot(MF, Reg, isKill, FrameIdx, 
                               PPC::CRRCRegisterClass, NewMIs);

  } else if (RC == PPC::VRRCRegisterClass) {
    // We don't have indexed addressing for vector loads.  Emit:
    // R0 = ADDI FI#
    // STVX VAL, 0, R0
    // 
    // FIXME: We use R0 here, because it isn't available for RA.
    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
                                       FrameIdx, 0, 0));
    NewMIs.push_back(BuildMI(MF, DL, get(PPC::STVX))
                     .addReg(SrcReg, getKillRegState(isKill))
                     .addReg(PPC::R0)
                     .addReg(PPC::R0));
  } else {
    llvm_unreachable("Unknown regclass!");
  }

  return false;
}
+
+void
+PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  unsigned SrcReg, bool isKill, int FrameIdx,
+                                  const TargetRegisterClass *RC) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+
+  if (StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs)) {
+    PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+    FuncInfo->setSpillsCR();
+  }
+
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+}
+
+void
+PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC,
+                                   SmallVectorImpl<MachineInstr*> &NewMIs)const{
+  if (RC == PPC::GPRCRegisterClass) {
+    if (DestReg != PPC::LR) {
+      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
+                                                 DestReg), FrameIdx));
+    } else {
+      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ),
+                                                 PPC::R11), FrameIdx));
+      NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR)).addReg(PPC::R11));
+    }
+  } else if (RC == PPC::G8RCRegisterClass) {
+    if (DestReg != PPC::LR8) {
+      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD), DestReg),
+                                         FrameIdx));
+    } else {
+      NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LD),
+                                                 PPC::R11), FrameIdx));
+      NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTLR8)).addReg(PPC::R11));
+    }
+  } else if (RC == PPC::F8RCRegisterClass) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFD), DestReg),
+                                       FrameIdx));
+  } else if (RC == PPC::F4RCRegisterClass) {
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LFS), DestReg),
+                                       FrameIdx));
+  } else if (RC == PPC::CRRCRegisterClass) {
+    // FIXME: We use R0 here, because it isn't available for RA.
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::LWZ), PPC::R0),
+                                       FrameIdx));
+    
+    // If the reloaded register isn't CR0, shift the bits right so that they are
+    // in the right CR's slot.
+    if (DestReg != PPC::CR0) {
+      unsigned ShiftBits = PPCRegisterInfo::getRegisterNumbering(DestReg)*4;
+      // rlwinm r11, r11, 32-ShiftBits, 0, 31.
+      NewMIs.push_back(BuildMI(MF, DL, get(PPC::RLWINM), PPC::R0)
+                    .addReg(PPC::R0).addImm(32-ShiftBits).addImm(0).addImm(31));
+    }
+    
+    NewMIs.push_back(BuildMI(MF, DL, get(PPC::MTCRF), DestReg).addReg(PPC::R0));
+  } else if (RC == PPC::CRBITRCRegisterClass) {
+   
+    unsigned Reg = 0;
+    if (DestReg == PPC::CR0LT || DestReg == PPC::CR0GT ||
+        DestReg == PPC::CR0EQ || DestReg == PPC::CR0UN)
+      Reg = PPC::CR0;
+    else if (DestReg == PPC::CR1LT || DestReg == PPC::CR1GT ||
+             DestReg == PPC::CR1EQ || DestReg == PPC::CR1UN)
+      Reg = PPC::CR1;
+    else if (DestReg == PPC::CR2LT || DestReg == PPC::CR2GT ||
+             DestReg == PPC::CR2EQ || DestReg == PPC::CR2UN)
+      Reg = PPC::CR2;
+    else if (DestReg == PPC::CR3LT || DestReg == PPC::CR3GT ||
+             DestReg == PPC::CR3EQ || DestReg == PPC::CR3UN)
+      Reg = PPC::CR3;
+    else if (DestReg == PPC::CR4LT || DestReg == PPC::CR4GT ||
+             DestReg == PPC::CR4EQ || DestReg == PPC::CR4UN)
+      Reg = PPC::CR4;
+    else if (DestReg == PPC::CR5LT || DestReg == PPC::CR5GT ||
+             DestReg == PPC::CR5EQ || DestReg == PPC::CR5UN)
+      Reg = PPC::CR5;
+    else if (DestReg == PPC::CR6LT || DestReg == PPC::CR6GT ||
+             DestReg == PPC::CR6EQ || DestReg == PPC::CR6UN)
+      Reg = PPC::CR6;
+    else if (DestReg == PPC::CR7LT || DestReg == PPC::CR7GT ||
+             DestReg == PPC::CR7EQ || DestReg == PPC::CR7UN)
+      Reg = PPC::CR7;
+
+    return LoadRegFromStackSlot(MF, DL, Reg, FrameIdx, 
+                                PPC::CRRCRegisterClass, NewMIs);
+
+  } else if (RC == PPC::VRRCRegisterClass) {
+    // We don't have indexed addressing for vector loads.  Emit:
+    // R0 = ADDI FI#
+    // Dest = LVX 0, R0
+    // 
+    // FIXME: We use R0 here, because it isn't available for RA.
+    NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(PPC::ADDI), PPC::R0),
+                                       FrameIdx, 0, 0));
+    NewMIs.push_back(BuildMI(MF, DL, get(PPC::LVX),DestReg).addReg(PPC::R0)
+                     .addReg(PPC::R0));
+  } else {
+    llvm_unreachable("Unknown regclass!");
+  }
+}
+
+void
+PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned DestReg, int FrameIdx,
+                                   const TargetRegisterClass *RC) const {
+  MachineFunction &MF = *MBB.getParent();
+  SmallVector<MachineInstr*, 4> NewMIs;
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);
+  for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
+    MBB.insert(MI, NewMIs[i]);
+}
+
/// foldMemoryOperand - PowerPC (like most RISC's) can only fold spills into
/// copy instructions, turning them into load/store instructions.
MachineInstr *PPCInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                                  MachineInstr *MI,
                                           const SmallVectorImpl<unsigned> &Ops,
                                                  int FrameIndex) const {
  if (Ops.size() != 1) return NULL;

  // Make sure this is a reg-reg copy.  Note that we can't handle MCRF, because
  // it takes more than one instruction to store it.
  unsigned Opc = MI->getOpcode();
  unsigned OpNum = Ops[0];

  MachineInstr *NewMI = NULL;
  // OR rD,rS,rS with equal sources is the canonical "mr" register move.
  if ((Opc == PPC::OR &&
       MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) {
    if (OpNum == 0) {  // move -> store
      unsigned InReg = MI->getOperand(1).getReg();
      bool isKill = MI->getOperand(1).isKill();
      bool isUndef = MI->getOperand(1).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STW))
                                .addReg(InReg,
                                        getKillRegState(isKill) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    } else {           // move -> load
      unsigned OutReg = MI->getOperand(0).getReg();
      bool isDead = MI->getOperand(0).isDead();
      bool isUndef = MI->getOperand(0).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LWZ))
                                .addReg(OutReg,
                                        RegState::Define |
                                        getDeadRegState(isDead) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    }
  // Same pattern for the 64-bit move; folds to STD/LD.
  } else if ((Opc == PPC::OR8 &&
              MI->getOperand(1).getReg() == MI->getOperand(2).getReg())) {
    if (OpNum == 0) {  // move -> store
      unsigned InReg = MI->getOperand(1).getReg();
      bool isKill = MI->getOperand(1).isKill();
      bool isUndef = MI->getOperand(1).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STD))
                                .addReg(InReg,
                                        getKillRegState(isKill) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    } else {           // move -> load
      unsigned OutReg = MI->getOperand(0).getReg();
      bool isDead = MI->getOperand(0).isDead();
      bool isUndef = MI->getOperand(0).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LD))
                                .addReg(OutReg,
                                        RegState::Define |
                                        getDeadRegState(isDead) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    }
  // Double-precision FP move; folds to STFD/LFD.
  } else if (Opc == PPC::FMRD) {
    if (OpNum == 0) {  // move -> store
      unsigned InReg = MI->getOperand(1).getReg();
      bool isKill = MI->getOperand(1).isKill();
      bool isUndef = MI->getOperand(1).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STFD))
                                .addReg(InReg,
                                        getKillRegState(isKill) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    } else {           // move -> load
      unsigned OutReg = MI->getOperand(0).getReg();
      bool isDead = MI->getOperand(0).isDead();
      bool isUndef = MI->getOperand(0).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LFD))
                                .addReg(OutReg,
                                        RegState::Define |
                                        getDeadRegState(isDead) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    }
  // Single-precision FP move; folds to STFS/LFS.
  } else if (Opc == PPC::FMRS) {
    if (OpNum == 0) {  // move -> store
      unsigned InReg = MI->getOperand(1).getReg();
      bool isKill = MI->getOperand(1).isKill();
      bool isUndef = MI->getOperand(1).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::STFS))
                                .addReg(InReg,
                                        getKillRegState(isKill) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    } else {           // move -> load
      unsigned OutReg = MI->getOperand(0).getReg();
      bool isDead = MI->getOperand(0).isDead();
      bool isUndef = MI->getOperand(0).isUndef();
      NewMI = addFrameReference(BuildMI(MF, MI->getDebugLoc(), get(PPC::LFS))
                                .addReg(OutReg,
                                        RegState::Define |
                                        getDeadRegState(isDead) |
                                        getUndefRegState(isUndef)),
                                FrameIndex);
    }
  }

  // NULL when the instruction was not a foldable register move.
  return NewMI;
}
+
+bool PPCInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                  const SmallVectorImpl<unsigned> &Ops) const {
+  if (Ops.size() != 1) return false;
+
+  // Make sure this is a reg-reg copy.  Note that we can't handle MCRF, because
+  // it takes more than one instruction to store it.
+  unsigned Opc = MI->getOpcode();
+
+  if ((Opc == PPC::OR &&
+       MI->getOperand(1).getReg() == MI->getOperand(2).getReg()))
+    return true;
+  else if ((Opc == PPC::OR8 &&
+              MI->getOperand(1).getReg() == MI->getOperand(2).getReg()))
+    return true;
+  else if (Opc == PPC::FMRD || Opc == PPC::FMRS)
+    return true;
+
+  return false;
+}
+
+
+bool PPCInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
+  // Leave the CR# the same, but invert the condition.
+  Cond[0].setImm(PPC::InvertPredicate((PPC::Predicate)Cond[0].getImm()));
+  return false;
+}
+
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+///
+unsigned PPCInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  case PPC::INLINEASM: {       // Inline Asm: Variable size.
+    const MachineFunction *MF = MI->getParent()->getParent();
+    const char *AsmStr = MI->getOperand(0).getSymbolName();
+    return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo());
+  }
+  case PPC::DBG_LABEL:
+  case PPC::EH_LABEL:
+  case PPC::GC_LABEL:
+    return 0;
+  default:
+    return 4; // PowerPC instructions are all 4 bytes
+  }
+}
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
new file mode 100644
index 0000000..57facac
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -0,0 +1,157 @@
+//===- PPCInstrInfo.h - PowerPC Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_INSTRUCTIONINFO_H
+#define POWERPC32_INSTRUCTIONINFO_H
+
+#include "PPC.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "PPCRegisterInfo.h"
+
+namespace llvm {
+
+/// PPCII - This namespace holds all of the PowerPC target-specific
+/// per-instruction flags.  These must match the corresponding definitions in
+/// PPC.td and PPCInstrFormats.td.
namespace PPCII {
enum {
  // PPC970 Instruction Flags.  These flags describe the characteristics of the
  // PowerPC 970 (aka G5) dispatch groups and how they are formed out of
  // raw machine instructions.

  /// PPC970_First - This instruction starts a new dispatch group, so it will
  /// always be the first one in the group.
  PPC970_First = 0x1,
  
  /// PPC970_Single - This instruction starts a new dispatch group and
  /// terminates it, so it will be the sole instruction in the group.
  PPC970_Single = 0x2,

  /// PPC970_Cracked - This instruction is cracked into two pieces, requiring
  /// two dispatch pipes to be available to issue.
  PPC970_Cracked = 0x4,
  
  /// PPC970_Mask/Shift - This is a bitmask that selects the pipeline type that
  /// an instruction is issued to.  The unit is stored as a 3-bit field above
  /// the three boolean flags.
  PPC970_Shift = 3,
  PPC970_Mask = 0x07 << PPC970_Shift
};
enum PPC970_Unit {
  /// These are the various PPC970 execution unit pipelines.  Each instruction
  /// is one of these.  Values are pre-shifted so they can be OR'd directly
  /// into the flag word and extracted with PPC970_Mask.
  PPC970_Pseudo = 0 << PPC970_Shift,   // Pseudo instruction
  PPC970_FXU    = 1 << PPC970_Shift,   // Fixed Point (aka Integer/ALU) Unit
  PPC970_LSU    = 2 << PPC970_Shift,   // Load Store Unit
  PPC970_FPU    = 3 << PPC970_Shift,   // Floating Point Unit
  PPC970_CRU    = 4 << PPC970_Shift,   // Control Register Unit
  PPC970_VALU   = 5 << PPC970_Shift,   // Vector ALU
  PPC970_VPERM  = 6 << PPC970_Shift,   // Vector Permute Unit
  PPC970_BRU    = 7 << PPC970_Shift    // Branch Unit
};
}
+  
+  
/// PPCInstrInfo - PowerPC implementation of the target instruction info
/// hooks: branch analysis, register moves, spills/reloads, and spill folding.
class PPCInstrInfo : public TargetInstrInfoImpl {
  PPCTargetMachine &TM;
  const PPCRegisterInfo RI;

  // Build (into NewMIs) the spill sequence for SrcReg; returns true when the
  // SPILL_CR pseudo is used so the caller can mark the function as
  // CR-spilling.
  bool StoreRegToStackSlot(MachineFunction &MF,
                           unsigned SrcReg, bool isKill, int FrameIdx,
                           const TargetRegisterClass *RC,
                           SmallVectorImpl<MachineInstr*> &NewMIs) const;
  // Build (into NewMIs) the reload sequence for DestReg from FrameIdx.
  void LoadRegFromStackSlot(MachineFunction &MF, DebugLoc DL, 
                            unsigned DestReg, int FrameIdx,
                            const TargetRegisterClass *RC,
                            SmallVectorImpl<MachineInstr*> &NewMIs) const;
public:
  explicit PPCInstrInfo(PPCTargetMachine &TM);

  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
  /// such, whenever a client has an instance of instruction info, it should
  /// always be able to get register info as well (through this method).
  ///
  virtual const PPCRegisterInfo &getRegisterInfo() const { return RI; }

  /// Return true if the instruction is a register to register move and return
  /// the source and dest operands and their sub-register indices by reference.
  virtual bool isMoveInstr(const MachineInstr &MI,
                           unsigned &SrcReg, unsigned &DstReg,
                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;

  /// If MI is a direct load from a stack slot, return the loaded register and
  /// set FrameIndex; otherwise return 0.
  unsigned isLoadFromStackSlot(const MachineInstr *MI,
                               int &FrameIndex) const;
  /// If MI is a direct store to a stack slot, return the stored register and
  /// set FrameIndex; otherwise return 0.
  unsigned isStoreToStackSlot(const MachineInstr *MI,
                              int &FrameIndex) const;

  // commuteInstruction - We can commute rlwimi instructions, but only if the
  // rotate amt is zero.  We also have to munge the immediates a bit.
  virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
  
  /// insertNoop - Insert a NOP instruction before MI.
  virtual void insertNoop(MachineBasicBlock &MBB, 
                          MachineBasicBlock::iterator MI) const;


  // Branch analysis.
  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                             MachineBasicBlock *&FBB,
                             SmallVectorImpl<MachineOperand> &Cond,
                             bool AllowModify) const;
  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                MachineBasicBlock *FBB,
                            const SmallVectorImpl<MachineOperand> &Cond) const;
  virtual bool copyRegToReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
                            unsigned DestReg, unsigned SrcReg,
                            const TargetRegisterClass *DestRC,
                            const TargetRegisterClass *SrcRC) const;
  
  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MBBI,
                                   unsigned SrcReg, bool isKill, int FrameIndex,
                                   const TargetRegisterClass *RC) const;

  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MBBI,
                                    unsigned DestReg, int FrameIndex,
                                    const TargetRegisterClass *RC) const;
  
  /// foldMemoryOperand - PowerPC (like most RISC's) can only fold spills into
  /// copy instructions, turning them into load/store instructions.
  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                              MachineInstr* MI,
                                              const SmallVectorImpl<unsigned> &Ops,
                                              int FrameIndex) const;

  // Folding a load directly into an instruction is not supported; only
  // frame-index folding (above) is implemented.
  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                              MachineInstr* MI,
                                              const SmallVectorImpl<unsigned> &Ops,
                                              MachineInstr* LoadMI) const {
    return 0;
  }

  virtual bool canFoldMemoryOperand(const MachineInstr *MI,
                                    const SmallVectorImpl<unsigned> &Ops) const;
  
  virtual
  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
  
  /// GetInstSize - Return the number of bytes of code the specified
  /// instruction may be.  This returns the maximum number of bytes.
  ///
  virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
};
+
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
new file mode 100644
index 0000000..842f8ee
--- /dev/null
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -0,0 +1,1491 @@
+//===- PPCInstrInfo.td - The PowerPC Instruction Set -------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the subset of the 32-bit PowerPC instruction set, as used
+// by the PowerPC instruction selector.
+//
+//===----------------------------------------------------------------------===//
+
+include "PPCInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific type constraints.
+//
+// stfiwx: no results, two operands (f64 value, pointer).
+def SDT_PPCstfiwx : SDTypeProfile<0, 2, [ // stfiwx
+  SDTCisVT<0, f64>, SDTCisPtrTy<1>
+]>;
+// Call-sequence markers carry i32 stack-adjustment amounts.
+def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_PPCCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                         SDTCisVT<1, i32> ]>;
+// vperm: result and first two operands share a type; operand 3 is the
+// v16i8 permute control vector.
+def SDT_PPCvperm   : SDTypeProfile<1, 3, [
+  SDTCisVT<3, v16i8>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>
+]>;
+
+// vcmp: result and both compared operands share a type; operand 3 is an
+// i32 immediate (see the commented-out PPCcondbranch pattern below).
+def SDT_PPCvcmp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>
+]>;
+
+// Conditional branch: operand 0 is i32, operand 2 is the branch target.
+def SDT_PPCcondbr : SDTypeProfile<0, 3, [
+  SDTCisVT<0, i32>, SDTCisVT<2, OtherVT>
+]>;
+
+// Byte-reversed load/store: i32 value, pointer, and a VT operand giving
+// the memory type.
+def SDT_PPClbrx : SDTypeProfile<1, 2, [
+  SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+def SDT_PPCstbrx : SDTypeProfile<0, 3, [
+  SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisVT<2, OtherVT>
+]>;
+
+// Load-and-reserve / store-conditional: integer value plus pointer operand.
+def SDT_PPClarx : SDTypeProfile<1, 1, [
+  SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+def SDT_PPCstcx : SDTypeProfile<0, 2, [
+  SDTCisInt<0>, SDTCisPtrTy<1>
+]>;
+
+// Tail-call return: target pointer plus an i32 immediate.
+def SDT_PPCTC_ret : SDTypeProfile<0, 2, [
+  SDTCisPtrTy<0>, SDTCisVT<1, i32>
+]>;
+
+def SDT_PPCnop : SDTypeProfile<0, 0, []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific DAG Nodes.
+//
+
+// FP conversion nodes (unary FP type profile).
+def PPCfcfid  : SDNode<"PPCISD::FCFID" , SDTFPUnaryOp, []>;
+def PPCfctidz : SDNode<"PPCISD::FCTIDZ", SDTFPUnaryOp, []>;
+def PPCfctiwz : SDNode<"PPCISD::FCTIWZ", SDTFPUnaryOp, []>;
+def PPCstfiwx : SDNode<"PPCISD::STFIWX", SDT_PPCstfiwx,
+                       [SDNPHasChain, SDNPMayStore]>;
+
+// This sequence is used for long double->int conversions.  It changes the
+// bits in the FPSCR which is not modelled.  The in/out flag operands keep
+// the nodes of the sequence glued together so they are not reordered.
+def PPCmffs   : SDNode<"PPCISD::MFFS", SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>,
+                        [SDNPOutFlag]>;
+def PPCmtfsb0 : SDNode<"PPCISD::MTFSB0", SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+                       [SDNPInFlag, SDNPOutFlag]>;
+def PPCmtfsb1 : SDNode<"PPCISD::MTFSB1", SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+                       [SDNPInFlag, SDNPOutFlag]>;
+def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp,
+                       [SDNPInFlag, SDNPOutFlag]>;
+def PPCmtfsf  : SDNode<"PPCISD::MTFSF", SDTypeProfile<1, 3, 
+                       [SDTCisVT<0, f64>, SDTCisInt<1>, SDTCisVT<2, f64>,
+                        SDTCisVT<3, f64>]>,
+                       [SDNPInFlag]>;
+
+def PPCfsel   : SDNode<"PPCISD::FSEL",  
+   // Type constraint for fsel: FP result matching operands 2/3, f64 selector.
+   SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, 
+                        SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
+
+// NOTE(review): Hi/Lo appear to be the high/low halves of symbol addresses
+// (cf. the symbolHi/symbolLo operands below) -- confirm in PPCISelLowering.
+def PPChi       : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
+def PPClo       : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
+def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>;
+def PPCvmaddfp  : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
+def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
+
+def PPCvperm    : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
+
+// These nodes represent the 32-bit PPC shifts that operate on 6-bit shift
+// amounts.  These nodes are generated by the multi-precision shift code.
+def PPCsrl        : SDNode<"PPCISD::SRL"       , SDTIntShiftOp>;
+def PPCsra        : SDNode<"PPCISD::SRA"       , SDTIntShiftOp>;
+def PPCshl        : SDNode<"PPCISD::SHL"       , SDTIntShiftOp>;
+
+def PPCextsw_32   : SDNode<"PPCISD::EXTSW_32"  , SDTIntUnaryOp>;
+def PPCstd_32     : SDNode<"PPCISD::STD_32"    , SDTStore,
+                           [SDNPHasChain, SDNPMayStore]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_PPCCallSeqStart,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_PPCCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Calls take a variable number of operands; the first must be an integer.
+def SDT_PPCCall   : SDTypeProfile<0, -1, [SDTCisInt<0>]>;
+def PPCcall_Darwin : SDNode<"PPCISD::CALL_Darwin", SDT_PPCCall,
+                            [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCcall_SVR4  : SDNode<"PPCISD::CALL_SVR4", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCnop : SDNode<"PPCISD::NOP", SDT_PPCnop, [SDNPInFlag, SDNPOutFlag]>;
+def PPCload   : SDNode<"PPCISD::LOAD", SDTypeProfile<1, 1, []>,
+                       [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCload_toc : SDNode<"PPCISD::LOAD_TOC", SDTypeProfile<0, 1, []>,
+                          [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def PPCtoc_restore : SDNode<"PPCISD::TOC_RESTORE", SDTypeProfile<0, 0, []>,
+                            [SDNPHasChain, SDNPInFlag, SDNPOutFlag]>;
+def PPCmtctr      : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def PPCbctrl_Darwin  : SDNode<"PPCISD::BCTRL_Darwin", SDTNone,
+                              [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def PPCbctrl_SVR4  : SDNode<"PPCISD::BCTRL_SVR4", SDTNone,
+                            [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Return-with-flag node used by the BLR return instruction below.
+def retflag       : SDNode<"PPCISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInFlag]>;
+
+def PPCtc_return : SDNode<"PPCISD::TC_RETURN", SDT_PPCTC_ret,
+                        [SDNPHasChain,  SDNPOptInFlag]>;
+
+def PPCvcmp       : SDNode<"PPCISD::VCMP" , SDT_PPCvcmp, []>;
+def PPCvcmp_o     : SDNode<"PPCISD::VCMPo", SDT_PPCvcmp, [SDNPOutFlag]>;
+
+def PPCcondbranch : SDNode<"PPCISD::COND_BRANCH", SDT_PPCcondbr,
+                           [SDNPHasChain, SDNPOptInFlag]>;
+
+// Byte-reversed loads/stores (used by the lhbrx/lwbrx/st*brx instructions).
+def PPClbrx       : SDNode<"PPCISD::LBRX", SDT_PPClbrx,
+                           [SDNPHasChain, SDNPMayLoad]>;
+def PPCstbrx      : SDNode<"PPCISD::STBRX", SDT_PPCstbrx,
+                           [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to support atomic operations
+def PPClarx      : SDNode<"PPCISD::LARX", SDT_PPClarx,
+                          [SDNPHasChain, SDNPMayLoad]>;
+def PPCstcx      : SDNode<"PPCISD::STCX", SDT_PPCstcx,
+                          [SDNPHasChain, SDNPMayStore]>;
+
+// Instructions to support dynamic alloca.
+def SDTDynOp  : SDTypeProfile<1, 2, []>;
+def PPCdynalloc   : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC specific transformation functions and pattern fragments.
+//
+
+def SHL32 : SDNodeXForm<imm, [{
+  // Transformation function: 31 - imm
+  return getI32Imm(31 - N->getZExtValue());
+}]>;
+
+def SRL32 : SDNodeXForm<imm, [{
+  // Transformation function: 32 - imm (a zero shift amount maps to 0, not 32)
+  return N->getZExtValue() ? getI32Imm(32 - N->getZExtValue()) : getI32Imm(0);
+}]>;
+
+def LO16 : SDNodeXForm<imm, [{
+  // Transformation function: get the low 16 bits.
+  return getI32Imm((unsigned short)N->getZExtValue());
+}]>;
+
+def HI16 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return getI32Imm((unsigned)N->getZExtValue() >> 16);
+}]>;
+
+def HA16 : SDNodeXForm<imm, [{
+  // Transformation function: return the "high-adjusted" 16 bits -- the high
+  // half, incremented by one when the low half will sign-extend negative, so
+  // that (HA16(imm) << 16) + (signed short)LO16(imm) == imm.
+  // NOTE(review): Val is a 32-bit int, so a 64-bit immediate is truncated
+  // here -- presumably only used for 32-bit-sized values; confirm.
+  signed int Val = N->getZExtValue();
+  return getI32Imm((Val - (signed short)Val) >> 16);
+}]>;
+def MB : SDNodeXForm<imm, [{
+  // Transformation function: get the start bit of a mask
+  unsigned mb = 0, me;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(mb);
+}]>;
+
+def ME : SDNodeXForm<imm, [{
+  // Transformation function: get the end bit of a mask
+  unsigned mb, me = 0;
+  (void)isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  return getI32Imm(me);
+}]>;
+def maskimm32 : PatLeaf<(imm), [{
+  // maskImm predicate - True if immediate is a run of ones.
+  unsigned mb, me;
+  if (N->getValueType(0) == MVT::i32)
+    return isRunOfOnes((unsigned)N->getZExtValue(), mb, me);
+  else
+    return false;
+}]>;
+
+def immSExt16  : PatLeaf<(imm), [{
+  // immSExt16 predicate - True if the immediate fits in a 16-bit sign extended
+  // field.  Used by instructions like 'addi'.
+  if (N->getValueType(0) == MVT::i32)
+    return (int32_t)N->getZExtValue() == (short)N->getZExtValue();
+  else
+    return (int64_t)N->getZExtValue() == (short)N->getZExtValue();
+}]>;
+def immZExt16  : PatLeaf<(imm), [{
+  // immZExt16 predicate - True if the immediate fits in a 16-bit zero extended
+  // field.  Used by instructions like 'ori'.  The LO16 transform is applied
+  // to the matched immediate.
+  return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
+}], LO16>;
+
+// imm16Shifted* - These match immediates where the low 16-bits are zero.  There
+// are two forms: imm16ShiftedSExt and imm16ShiftedZExt.  These two forms are
+// identical in 32-bit mode, but in 64-bit mode, they return true if the
+// immediate fits into a sign/zero extended 32-bit immediate (with the low bits
+// clear).
+def imm16ShiftedZExt : PatLeaf<(imm), [{
+  // imm16ShiftedZExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'xoris'.
+  return (N->getZExtValue() & ~uint64_t(0xFFFF0000)) == 0;
+}], HI16>;
+
+def imm16ShiftedSExt : PatLeaf<(imm), [{
+  // imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
+  // immediate are set.  Used by instructions like 'addis'.  Identical to 
+  // imm16ShiftedZExt in 32-bit mode.
+  if (N->getZExtValue() & 0xFFFF) return false;
+  if (N->getValueType(0) == MVT::i32)
+    return true;
+  // For 64-bit, make sure it is sext right.
+  return N->getZExtValue() == (uint64_t)(int)N->getZExtValue();
+}], HI16>;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Flag Definitions.
+
+// Marks an instruction as 64-bit-only.
+class isPPC64 { bit PPC64 = 1; }
+// Marks a record ("dot") form: sets the RC bit and implicitly defines CR0.
+class isDOT   {
+  list<Register> Defs = [CR0];
+  bit RC  = 1;
+}
+
+// Attaches an operand tie-down constraint string (e.g. "$a = $b").
+class RegConstraint<string C> {
+  string Constraints = C;
+}
+// Excludes the named operand(s) from the instruction encoding.
+class NoEncode<string E> {
+  string DisableEncoding = E;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Operand Definitions.
+
+// Immediate operands; PrintMethod names the asm-printer routine used to
+// emit each operand.
+def s5imm   : Operand<i32> {
+  let PrintMethod = "printS5ImmOperand";
+}
+def u5imm   : Operand<i32> {
+  let PrintMethod = "printU5ImmOperand";
+}
+def u6imm   : Operand<i32> {
+  let PrintMethod = "printU6ImmOperand";
+}
+def s16imm  : Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+}
+def u16imm  : Operand<i32> {
+  let PrintMethod = "printU16ImmOperand";
+}
+def s16immX4  : Operand<i32> {   // Multiply imm by 4 before printing.
+  let PrintMethod = "printS16X4ImmOperand";
+}
+// Branch and call targets.
+def target : Operand<OtherVT> {
+  let PrintMethod = "printBranchOperand";
+}
+def calltarget : Operand<iPTR> {
+  let PrintMethod = "printCallOperand";
+}
+def aaddr : Operand<iPTR> {
+  let PrintMethod = "printAbsAddrOperand";
+}
+def piclabel: Operand<iPTR> {
+  let PrintMethod = "printPICLabel";
+}
+def symbolHi: Operand<i32> {
+  let PrintMethod = "printSymbolHi";
+}
+def symbolLo: Operand<i32> {
+  let PrintMethod = "printSymbolLo";
+}
+def crbitm: Operand<i8> {
+  let PrintMethod = "printcrbitm";
+}
+// Address operands
+def memri : Operand<iPTR> {
+  let PrintMethod = "printMemRegImm";
+  let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
+}
+def memrr : Operand<iPTR> {
+  let PrintMethod = "printMemRegReg";
+  let MIOperandInfo = (ops ptr_rc, ptr_rc);
+}
+def memrix : Operand<iPTR> {   // memri where the imm is shifted 2 bits.
+  let PrintMethod = "printMemRegImmShifted";
+  let MIOperandInfo = (ops i32imm:$imm, ptr_rc:$reg);
+}
+def tocentry : Operand<iPTR> {
+  let PrintMethod = "printTOCEntryLabel";
+  let MIOperandInfo = (ops i32imm:$imm);
+}
+
+// PowerPC Predicate operand.  20 = (0<<5)|20 = always, CR0 is a dummy reg
+// that doesn't matter.
+def pred : PredicateOperand<OtherVT, (ops imm, CRRC),
+                                     (ops (i32 20), (i32 zero_reg))> {
+  let PrintMethod = "printPredicateOperand";
+}
+
+// Define PowerPC specific addressing modes.  iaddr selects reg+imm, xaddr
+// selects reg+reg; xoaddr is the index-only form (SelectAddrIdxOnly) used by
+// instructions with no displacement field; ixaddr is the shifted-imm form.
+def iaddr  : ComplexPattern<iPTR, 2, "SelectAddrImm",    [], []>;
+def xaddr  : ComplexPattern<iPTR, 2, "SelectAddrIdx",    [], []>;
+def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
+def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmShift", [], []>; // "std"
+
+/// This is just the offset part of iaddr, used for preinc.
+def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Predicate Definitions.
+def FPContractions : Predicate<"!NoExcessFPPrecision">;
+def In32BitMode  : Predicate<"!PPCSubTarget.isPPC64()">;
+def In64BitMode  : Predicate<"PPCSubTarget.isPPC64()">;
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Definitions.
+
+// Pseudo-instructions:
+
+let hasCtrlDep = 1 in {
+// Call-frame setup/teardown markers; they def/use R1 (the PPC stack pointer).
+let Defs = [R1], Uses = [R1] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt),
+                              "${:comment} ADJCALLSTACKDOWN",
+                              [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2),
+                              "${:comment} ADJCALLSTACKUP",
+                              [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+def UPDATE_VRSAVE    : Pseudo<(outs GPRC:$rD), (ins GPRC:$rS),
+                              "UPDATE_VRSAVE $rD, $rS", []>;
+}
+
+// Dynamic stack allocation: takes the negated size and the frame-pointer
+// stack slot, returns the new allocation; adjusts R1.
+let Defs = [R1], Uses = [R1] in
+def DYNALLOC : Pseudo<(outs GPRC:$result), (ins GPRC:$negsize, memri:$fpsi),
+                       "${:comment} DYNALLOC $result, $negsize, $fpsi",
+                       [(set GPRC:$result,
+                             (PPCdynalloc GPRC:$negsize, iaddr:$fpsi))]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1,    // Expanded after instruction selection.
+    PPC970_Single = 1 in {
+  def SELECT_CC_I4 : Pseudo<(outs GPRC:$dst), (ins CRRC:$cond, GPRC:$T, GPRC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_I8 : Pseudo<(outs G8RC:$dst), (ins CRRC:$cond, G8RC:$T, G8RC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_F4  : Pseudo<(outs F4RC:$dst), (ins CRRC:$cond, F4RC:$T, F4RC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_F8  : Pseudo<(outs F8RC:$dst), (ins CRRC:$cond, F8RC:$T, F8RC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+  def SELECT_CC_VRRC: Pseudo<(outs VRRC:$dst), (ins CRRC:$cond, VRRC:$T, VRRC:$F,
+                              i32imm:$BROPC), "${:comment} SELECT_CC PSEUDO!",
+                              []>;
+}
+
+// SPILL_CR - Indicate that we're dumping the CR register, so we'll need to
+// scavenge a register for it.
+def SPILL_CR : Pseudo<(outs), (ins GPRC:$cond, memri:$F),
+                     "${:comment} SPILL_CR $cond $F", []>;
+
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
+  // Predicated return through LR.
+  let isReturn = 1, Uses = [LR, RM] in
+    def BLR : XLForm_2_br<19, 16, 0, (outs), (ins pred:$p),
+                          "b${p:cc}lr ${p:reg}", BrB, 
+                          [(retflag)]>;
+  // Indirect branch through CTR.
+  let isBranch = 1, isIndirectBranch = 1, Uses = [CTR] in
+    def BCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>;
+}
+
+// NOTE(review): emits "bl $label" to a PIC label and defines LR -- presumably
+// used to capture the current PC into LR; confirm against the asm printer.
+let Defs = [LR] in
+  def MovePCtoLR : Pseudo<(outs), (ins piclabel:$label), "bl $label", []>,
+                   PPC970_Unit_BRU;
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7 in {
+  let isBarrier = 1 in {
+  def B   : IForm<18, 0, 0, (outs), (ins target:$dst),
+                  "b $dst", BrB,
+                  [(br bb:$dst)]>;
+  }
+
+  // BCC represents an arbitrary conditional branch on a predicate.
+  // FIXME: should be able to write a pattern for PPCcondbranch, but can't use
+  // a two-value operand where a dag node expects two operands. :( 
+  def BCC : BForm<16, 0, 0, (outs), (ins pred:$cond, target:$dst),
+                  "b${cond:cc} ${cond:reg}, $dst"
+                  /*[(PPCcondbranch CRRC:$crS, imm:$opc, bb:$dst)]*/>;
+}
+
+// Darwin ABI Calls.
+let isCall = 1, PPC970_Unit = 7, 
+  // All calls clobber the non-callee saved registers...
+  Defs = [R0,R2,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR,CTR,
+          CR0,CR1,CR5,CR6,CR7,CARRY] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL_Darwin  : IForm<18, 0, 1,
+                           (outs), (ins calltarget:$func, variable_ops), 
+                           "bl $func", BrB, []>;  // See Pat patterns below.
+    def BLA_Darwin : IForm<18, 1, 1, 
+                          (outs), (ins aaddr:$func, variable_ops),
+                          "bla $func", BrB, [(PPCcall_Darwin (i32 imm:$func))]>;
+  }
+  let Uses = [CTR, RM] in {
+    def BCTRL_Darwin : XLForm_2_ext<19, 528, 20, 0, 1, 
+                                  (outs), (ins variable_ops),
+                                  "bctrl", BrB,
+                                  [(PPCbctrl_Darwin)]>, Requires<[In32BitMode]>;
+  }
+}
+
+// SVR4 ABI Calls.  Note: unlike the Darwin list above, R2 is not in the
+// clobber list here.
+let isCall = 1, PPC970_Unit = 7, 
+  // All calls clobber the non-callee saved registers...
+  Defs = [R0,R3,R4,R5,R6,R7,R8,R9,R10,R11,R12,
+          F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,
+          V0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,
+          LR,CTR,
+          CR0,CR1,CR5,CR6,CR7,CARRY] in {
+  // Convenient aliases for call instructions
+  let Uses = [RM] in {
+    def BL_SVR4  : IForm<18, 0, 1,
+                        (outs), (ins calltarget:$func, variable_ops), 
+                        "bl $func", BrB, []>;  // See Pat patterns below.
+    def BLA_SVR4 : IForm<18, 1, 1,
+                        (outs), (ins aaddr:$func, variable_ops),
+                        "bla $func", BrB,
+                        [(PPCcall_SVR4 (i32 imm:$func))]>;
+  }
+  let Uses = [CTR, RM] in {
+    def BCTRL_SVR4 : XLForm_2_ext<19, 528, 20, 0, 1,
+                                (outs), (ins variable_ops),
+                                "bctrl", BrB,
+                                [(PPCbctrl_SVR4)]>, Requires<[In32BitMode]>;
+  }
+}
+
+
+// Tail-call return pseudos: direct (#TC_RETURNd), absolute (#TC_RETURNa),
+// and register (#TC_RETURNr) forms.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNdi :Pseudo< (outs),
+                        (ins calltarget:$dst, i32imm:$offset, variable_ops),
+                 "#TC_RETURNd $dst $offset",
+                 []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNai :Pseudo<(outs), (ins aaddr:$func, i32imm:$offset, variable_ops),
+                 "#TC_RETURNa $func $offset",
+                 [(PPCtc_return (i32 imm:$func), imm:$offset)]>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [RM] in
+def TCRETURNri : Pseudo<(outs), (ins CTRRC:$dst, i32imm:$offset, variable_ops),
+                 "#TC_RETURNr $dst $offset",
+                 []>;
+
+
+// Tail-call branch through CTR; 32-bit mode only.
+let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7, isBranch = 1,
+    isIndirectBranch = 1, isCall = 1, isReturn = 1, Uses = [CTR, RM]  in
+def TAILBCTR : XLForm_2_ext<19, 528, 20, 0, 0, (outs), (ins), "bctr", BrB, []>,
+     Requires<[In32BitMode]>;
+
+
+
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILB   : IForm<18, 0, 0, (outs), (ins calltarget:$dst),
+                  "b $dst", BrB,
+                  []>;
+
+
+// NOTE(review): "ba" is the absolute form, but the AA field here is 0
+// (cf. BLA_SVR4, which uses IForm<18, 1, 1>) -- confirm the encoding.
+let isBranch = 1, isTerminator = 1, hasCtrlDep = 1, PPC970_Unit = 7,
+    isBarrier = 1, isCall = 1, isReturn = 1, Uses = [RM] in
+def TAILBA   : IForm<18, 0, 0, (outs), (ins aaddr:$dst),
+                  "ba $dst", BrB,
+                  []>;
+
+
+// DCB* instructions: data-cache block operations, each selected from the
+// corresponding int_ppc_dcb* intrinsic on an index-only address.
+def DCBA   : DCB_Form<758, 0, (outs), (ins memrr:$dst),
+                      "dcba $dst", LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBF   : DCB_Form<86, 0, (outs), (ins memrr:$dst),
+                      "dcbf $dst", LdStDCBF, [(int_ppc_dcbf xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBI   : DCB_Form<470, 0, (outs), (ins memrr:$dst),
+                      "dcbi $dst", LdStDCBF, [(int_ppc_dcbi xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBST  : DCB_Form<54, 0, (outs), (ins memrr:$dst),
+                      "dcbst $dst", LdStDCBF, [(int_ppc_dcbst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBT   : DCB_Form<278, 0, (outs), (ins memrr:$dst),
+                      "dcbt $dst", LdStDCBF, [(int_ppc_dcbt xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBTST : DCB_Form<246, 0, (outs), (ins memrr:$dst),
+                      "dcbtst $dst", LdStDCBF, [(int_ppc_dcbtst xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+def DCBZ   : DCB_Form<1014, 0, (outs), (ins memrr:$dst),
+                      "dcbz $dst", LdStDCBF, [(int_ppc_dcbz xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+// DCBZL shares opcode 1014 with DCBZ; the second field (the "L" bit) differs.
+def DCBZL  : DCB_Form<1014, 1, (outs), (ins memrr:$dst),
+                      "dcbzl $dst", LdStDCBF, [(int_ppc_dcbzl xoaddr:$dst)]>,
+                      PPC970_DGroup_Single;
+
+// Atomic operations
+let usesCustomInserter = 1 in {
+  // Expanded after instruction selection by the target's custom inserter
+  // (usesCustomInserter); presumably into PPClarx/PPCstcx retry loops --
+  // see PPCTargetLowering's EmitInstrWithCustomInserter to confirm.
+  let Uses = [CR0] in {
+    def ATOMIC_LOAD_ADD_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_ADD_I8 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_add_8 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_SUB_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_SUB_I8 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_sub_8 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_AND_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_AND_I8 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_and_8 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_OR_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_OR_I8 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_or_8 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_XOR_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_XOR_I8 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_xor_8 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_NAND_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_NAND_I8 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_nand_8 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_ADD_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_ADD_I16 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_add_16 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_SUB_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_SUB_I16 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_sub_16 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_AND_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_AND_I16 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_and_16 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_OR_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_OR_I16 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_or_16 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_XOR_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_XOR_I16 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_xor_16 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_NAND_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_NAND_I16 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_nand_16 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_ADD_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_ADD_I32 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_add_32 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_SUB_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_SUB_I32 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_sub_32 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_AND_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_AND_I32 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_and_32 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_OR_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_OR_I32 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_or_32 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_XOR_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_XOR_I32 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_xor_32 xoaddr:$ptr, GPRC:$incr))]>;
+    def ATOMIC_LOAD_NAND_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$incr),
+      "${:comment} ATOMIC_LOAD_NAND_I32 PSEUDO!",
+      [(set GPRC:$dst, (atomic_load_nand_32 xoaddr:$ptr, GPRC:$incr))]>;
+
+    def ATOMIC_CMP_SWAP_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new),
+      "${:comment} ATOMIC_CMP_SWAP_I8 PSEUDO!",
+      [(set GPRC:$dst, 
+                    (atomic_cmp_swap_8 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
+    def ATOMIC_CMP_SWAP_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new),
+      "${:comment} ATOMIC_CMP_SWAP_I16 PSEUDO!",
+      [(set GPRC:$dst, 
+                    (atomic_cmp_swap_16 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
+    def ATOMIC_CMP_SWAP_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$old, GPRC:$new),
+      "${:comment} ATOMIC_CMP_SWAP_I32 PSEUDO!",
+      [(set GPRC:$dst, 
+                    (atomic_cmp_swap_32 xoaddr:$ptr, GPRC:$old, GPRC:$new))]>;
+
+    def ATOMIC_SWAP_I8 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new),
+      "${:comment} ATOMIC_SWAP_I8 PSEUDO!",
+      [(set GPRC:$dst, (atomic_swap_8 xoaddr:$ptr, GPRC:$new))]>;
+    def ATOMIC_SWAP_I16 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new),
+      "${:comment} ATOMIC_SWAP_I16 PSEUDO!",
+      [(set GPRC:$dst, (atomic_swap_16 xoaddr:$ptr, GPRC:$new))]>;
+    def ATOMIC_SWAP_I32 : Pseudo<
+      (outs GPRC:$dst), (ins memrr:$ptr, GPRC:$new),
+      "${:comment} ATOMIC_SWAP_I32 PSEUDO!",
+      [(set GPRC:$dst, (atomic_swap_32 xoaddr:$ptr, GPRC:$new))]>;
+  }
+}
+
+// Instructions to support atomic operations
+def LWARX : XForm_1<31,  20, (outs GPRC:$rD), (ins memrr:$src),
+                   "lwarx $rD, $src", LdStLWARX,
+                   [(set GPRC:$rD, (PPClarx xoaddr:$src))]>;
+
+// Record form (isDOT): defines CR0.
+let Defs = [CR0] in
+def STWCX : XForm_1<31, 150, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stwcx. $rS, $dst", LdStSTWCX,
+                   [(PPCstcx GPRC:$rS, xoaddr:$dst)]>,
+                   isDOT;
+
+let isBarrier = 1, hasCtrlDep = 1 in
+def TRAP  : XForm_24<31, 4, (outs), (ins), "trap", LdStGeneral, [(trap)]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Load Instructions.
+//
+
+// Unindexed (r+i) Loads. 
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZ : DForm_1<34, (outs GPRC:$rD), (ins memri:$src),
+                  "lbz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (zextloadi8 iaddr:$src))]>;
+def LHA : DForm_1<42, (outs GPRC:$rD), (ins memri:$src),
+                  "lha $rD, $src", LdStLHA,
+                  [(set GPRC:$rD, (sextloadi16 iaddr:$src))]>,
+                  PPC970_DGroup_Cracked;
+def LHZ : DForm_1<40, (outs GPRC:$rD), (ins memri:$src),
+                  "lhz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (zextloadi16 iaddr:$src))]>;
+def LWZ : DForm_1<32, (outs GPRC:$rD), (ins memri:$src),
+                  "lwz $rD, $src", LdStGeneral,
+                  [(set GPRC:$rD, (load iaddr:$src))]>;
+
+def LFS : DForm_1<48, (outs F4RC:$rD), (ins memri:$src),
+                  "lfs $rD, $src", LdStLFDU,
+                  [(set F4RC:$rD, (load iaddr:$src))]>;
+def LFD : DForm_1<50, (outs F8RC:$rD), (ins memri:$src),
+                  "lfd $rD, $src", LdStLFD,
+                  [(set F8RC:$rD, (load iaddr:$src))]>;
+
+
+// Unindexed (r+i) Loads with Update (preinc).  The updated effective address
+// is returned in $ea_result, which is tied to the base register of $addr and
+// excluded from the encoding; the patterns are empty because these are
+// selected by the pre-increment load logic.
+let mayLoad = 1 in {
+def LBZU : DForm_1<35, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                   "lbzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHAU : DForm_1<43, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                   "lhau $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LHZU : DForm_1<41, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                   "lhzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LWZU : DForm_1<33, (outs GPRC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                   "lwzu $rD, $addr", LdStGeneral,
+                   []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+// Bugfix: opcodes 49/51 are the *update* forms, so they must print the
+// "lfsu"/"lfdu" mnemonics, not plain "lfs"/"lfd" (which would assemble to
+// the non-updating opcodes 48/50 and drop the base-register writeback).
+def LFSU : DForm_1<49, (outs F4RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                  "lfsu $rD, $addr", LdStLFDU,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+
+def LFDU : DForm_1<51, (outs F8RC:$rD, ptr_rc:$ea_result), (ins memri:$addr),
+                  "lfdu $rD, $addr", LdStLFD,
+                  []>, RegConstraint<"$addr.reg = $ea_result">,
+                   NoEncode<"$ea_result">;
+}
+}
+
+// Indexed (r+r) Loads.
+//
+let canFoldAsLoad = 1, PPC970_Unit = 2 in {
+def LBZX : XForm_1<31,  87, (outs GPRC:$rD), (ins memrr:$src),
+                   "lbzx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (zextloadi8 xaddr:$src))]>;
+def LHAX : XForm_1<31, 343, (outs GPRC:$rD), (ins memrr:$src),
+                   "lhax $rD, $src", LdStLHA,
+                   [(set GPRC:$rD, (sextloadi16 xaddr:$src))]>,
+                   PPC970_DGroup_Cracked;
+def LHZX : XForm_1<31, 279, (outs GPRC:$rD), (ins memrr:$src),
+                   "lhzx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (zextloadi16 xaddr:$src))]>;
+def LWZX : XForm_1<31,  23, (outs GPRC:$rD), (ins memrr:$src),
+                   "lwzx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (load xaddr:$src))]>;
+
+
+// Byte-reversed loads; the VT operand (i16/i32) gives the memory type.
+def LHBRX : XForm_1<31, 790, (outs GPRC:$rD), (ins memrr:$src),
+                   "lhbrx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (PPClbrx xoaddr:$src, i16))]>;
+def LWBRX : XForm_1<31,  534, (outs GPRC:$rD), (ins memrr:$src),
+                   "lwbrx $rD, $src", LdStGeneral,
+                   [(set GPRC:$rD, (PPClbrx xoaddr:$src, i32))]>;
+
+def LFSX   : XForm_25<31, 535, (outs F4RC:$frD), (ins memrr:$src),
+                      "lfsx $frD, $src", LdStLFDU,
+                      [(set F4RC:$frD, (load xaddr:$src))]>;
+def LFDX   : XForm_25<31, 599, (outs F8RC:$frD), (ins memrr:$src),
+                      "lfdx $frD, $src", LdStLFDU,
+                      [(set F8RC:$frD, (load xaddr:$src))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// PPC32 Store Instructions.
+//
+
+// Unindexed (r+i) Stores.
+// D-form stores selected from truncating/plain store nodes via the iaddr
+// (register + 16-bit displacement) complex pattern.
+let PPC970_Unit = 2 in {
+def STB  : DForm_1<38, (outs), (ins GPRC:$rS, memri:$src),
+                   "stb $rS, $src", LdStGeneral,
+                   [(truncstorei8 GPRC:$rS, iaddr:$src)]>;
+def STH  : DForm_1<44, (outs), (ins GPRC:$rS, memri:$src),
+                   "sth $rS, $src", LdStGeneral,
+                   [(truncstorei16 GPRC:$rS, iaddr:$src)]>;
+def STW  : DForm_1<36, (outs), (ins GPRC:$rS, memri:$src),
+                   "stw $rS, $src", LdStGeneral,
+                   [(store GPRC:$rS, iaddr:$src)]>;
+def STFS : DForm_1<52, (outs), (ins F4RC:$rS, memri:$dst),
+                   "stfs $rS, $dst", LdStUX,
+                   [(store F4RC:$rS, iaddr:$dst)]>;
+def STFD : DForm_1<54, (outs), (ins F8RC:$rS, memri:$dst),
+                   "stfd $rS, $dst", LdStUX,
+                   [(store F8RC:$rS, iaddr:$dst)]>;
+}
+
+// Unindexed (r+i) Stores with Update (preinc).
+// These select from the pre_store/pre_truncst* pre-increment nodes: the
+// store writes memory and the def's single output ($ea_res) is the updated
+// base address, tied to $ptrreg and not separately encoded.
+let PPC970_Unit = 2 in {
+def STBU  : DForm_1<39, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stbu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                          (pre_truncsti8 GPRC:$rS, ptr_rc:$ptrreg, 
+                                         iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STHU  : DForm_1<45, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "sthu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res,
+                        (pre_truncsti16 GPRC:$rS, ptr_rc:$ptrreg, 
+                                        iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+def STWU  : DForm_1<37, (outs ptr_rc:$ea_res), (ins GPRC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stwu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store GPRC:$rS, ptr_rc:$ptrreg, 
+                                                     iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+// Store Floating-Point Single with Update.  Primary opcode is 53 per the
+// Power ISA; the previous value (37) is STWU's opcode and produced a wrong
+// binary encoding for stfsu.
+def STFSU : DForm_1<53, (outs ptr_rc:$ea_res), (ins F4RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stfsu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store F4RC:$rS,  ptr_rc:$ptrreg, 
+                                          iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+// Store Floating-Point Double with Update.  Primary opcode is 55 per the
+// Power ISA; the previous value (37) is STWU's opcode and produced a wrong
+// binary encoding for stfdu.
+def STFDU : DForm_1<55, (outs ptr_rc:$ea_res), (ins F8RC:$rS,
+                             symbolLo:$ptroff, ptr_rc:$ptrreg),
+                    "stfdu $rS, $ptroff($ptrreg)", LdStGeneral,
+                    [(set ptr_rc:$ea_res, (pre_store F8RC:$rS, ptr_rc:$ptrreg, 
+                                          iaddroff:$ptroff))]>,
+                    RegConstraint<"$ptrreg = $ea_res">, NoEncode<"$ea_res">;
+}
+
+
+// Indexed (r+r) Stores.
+//
+let PPC970_Unit = 2 in {
+def STBX  : XForm_8<31, 215, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stbx $rS, $dst", LdStGeneral,
+                   [(truncstorei8 GPRC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STHX  : XForm_8<31, 407, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "sthx $rS, $dst", LdStGeneral,
+                   [(truncstorei16 GPRC:$rS, xaddr:$dst)]>, 
+                   PPC970_DGroup_Cracked;
+def STWX  : XForm_8<31, 151, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stwx $rS, $dst", LdStGeneral,
+                   [(store GPRC:$rS, xaddr:$dst)]>,
+                   PPC970_DGroup_Cracked;
+                   
+// STWUX has no selection pattern, so mayStore must be stated explicitly --
+// it cannot be inferred from an empty pattern list.
+let mayStore = 1 in {
+def STWUX : XForm_8<31, 183, (outs), (ins GPRC:$rS, GPRC:$rA, GPRC:$rB),
+                   "stwux $rS, $rA, $rB", LdStGeneral,
+                   []>;
+}
+// Byte-reversed stores, selected from PPCstbrx with an explicit memory VT
+// operand (i16/i32) to pick the width.
+def STHBRX: XForm_8<31, 918, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "sthbrx $rS, $dst", LdStGeneral,
+                   [(PPCstbrx GPRC:$rS, xoaddr:$dst, i16)]>, 
+                   PPC970_DGroup_Cracked;
+def STWBRX: XForm_8<31, 662, (outs), (ins GPRC:$rS, memrr:$dst),
+                   "stwbrx $rS, $dst", LdStGeneral,
+                   [(PPCstbrx GPRC:$rS, xoaddr:$dst, i32)]>,
+                   PPC970_DGroup_Cracked;
+
+def STFIWX: XForm_28<31, 983, (outs), (ins F8RC:$frS, memrr:$dst),
+                     "stfiwx $frS, $dst", LdStUX,
+                     [(PPCstfiwx F8RC:$frS, xoaddr:$dst)]>;
+                     
+def STFSX : XForm_28<31, 663, (outs), (ins F4RC:$frS, memrr:$dst),
+                     "stfsx $frS, $dst", LdStUX,
+                     [(store F4RC:$frS, xaddr:$dst)]>;
+def STFDX : XForm_28<31, 727, (outs), (ins F8RC:$frS, memrr:$dst),
+                     "stfdx $frS, $dst", LdStUX,
+                     [(store F8RC:$frS, xaddr:$dst)]>;
+}
+
+// Memory barrier, selected from the int_ppc_sync intrinsic; isBarrier keeps
+// code motion from crossing it.
+let isBarrier = 1 in
+def SYNC : XForm_24_sync<31, 598, (outs), (ins),
+                        "sync", LdStSync,
+                        [(int_ppc_sync)]>;
+
+//===----------------------------------------------------------------------===//
+// PPC32 Arithmetic Instructions.
+//
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ADDI   : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "addi $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (add GPRC:$rA, immSExt16:$imm))]>;
+// addic/addic. write the carry bit, so they must declare CARRY as a def.
+let Defs = [CARRY] in {
+def ADDIC  : DForm_2<12, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "addic $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (addc GPRC:$rA, immSExt16:$imm))]>,
+                     PPC970_DGroup_Cracked;
+def ADDICo : DForm_2<13, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "addic. $rD, $rA, $imm", IntGeneral,
+                     []>;
+}
+def ADDIS  : DForm_2<15, (outs GPRC:$rD), (ins GPRC:$rA, symbolHi:$imm),
+                     "addis $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (add GPRC:$rA, imm16ShiftedSExt:$imm))]>;
+// LA is the same encoding as ADDI (opcode 14); it exists to print the
+// idiomatic "la" form when materializing a global's low part.
+def LA     : DForm_2<14, (outs GPRC:$rD), (ins GPRC:$rA, symbolLo:$sym),
+                     "la $rD, $sym($rA)", IntGeneral,
+                     [(set GPRC:$rD, (add GPRC:$rA,
+                                          (PPClo tglobaladdr:$sym, 0)))]>;
+def MULLI  : DForm_2< 7, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "mulli $rD, $rA, $imm", IntMulLI,
+                     [(set GPRC:$rD, (mul GPRC:$rA, immSExt16:$imm))]>;
+let Defs = [CARRY] in {
+def SUBFIC : DForm_2< 8, (outs GPRC:$rD), (ins GPRC:$rA, s16imm:$imm),
+                     "subfic $rD, $rA, $imm", IntGeneral,
+                     [(set GPRC:$rD, (subc immSExt16:$imm, GPRC:$rA))]>;
+}
+
+// Immediate materialization: pure constants, so rematerializable instead of
+// spilled/reloaded.
+let isReMaterializable = 1 in {
+  def LI  : DForm_2_r0<14, (outs GPRC:$rD), (ins symbolLo:$imm),
+                       "li $rD, $imm", IntGeneral,
+                       [(set GPRC:$rD, immSExt16:$imm)]>;
+  def LIS : DForm_2_r0<15, (outs GPRC:$rD), (ins symbolHi:$imm),
+                       "lis $rD, $imm", IntGeneral,
+                       [(set GPRC:$rD, imm16ShiftedSExt:$imm)]>;
+}
+}
+
+// D-form logical-immediate and compare-immediate instructions.  The "o"
+// suffix / isDOT marks record forms (trailing '.') that set CR0.
+let PPC970_Unit = 1 in {  // FXU Operations.
+def ANDIo : DForm_4<28, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "andi. $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (and GPRC:$src1, immZExt16:$src2))]>,
+                    isDOT;
+def ANDISo : DForm_4<29, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "andis. $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (and GPRC:$src1,imm16ShiftedZExt:$src2))]>,
+                    isDOT;
+def ORI   : DForm_4<24, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "ori $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (or GPRC:$src1, immZExt16:$src2))]>;
+def ORIS  : DForm_4<25, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "oris $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (or GPRC:$src1, imm16ShiftedZExt:$src2))]>;
+def XORI  : DForm_4<26, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "xori $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (xor GPRC:$src1, immZExt16:$src2))]>;
+def XORIS : DForm_4<27, (outs GPRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                    "xoris $dst, $src1, $src2", IntGeneral,
+                    [(set GPRC:$dst, (xor GPRC:$src1,imm16ShiftedZExt:$src2))]>;
+// nop is the canonical "ori 0, 0, 0" encoding (opcode 24, all-zero fields).
+def NOP   : DForm_4_zero<24, (outs), (ins), "nop", IntGeneral,
+                         []>;
+def CMPWI : DForm_5_ext<11, (outs CRRC:$crD), (ins GPRC:$rA, s16imm:$imm),
+                        "cmpwi $crD, $rA, $imm", IntCompare>;
+def CMPLWI : DForm_6_ext<10, (outs CRRC:$dst), (ins GPRC:$src1, u16imm:$src2),
+                         "cmplwi $dst, $src1, $src2", IntCompare>;
+}
+
+
+// X-form register-register logical and shift instructions.  The composed
+// patterns (not/and, or/not, ...) let ISel fold an inverted operand into a
+// single nand/andc/nor/orc/eqv instruction.
+let PPC970_Unit = 1 in {  // FXU Operations.
+def NAND : XForm_6<31, 476, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "nand $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (not (and GPRC:$rS, GPRC:$rB)))]>;
+def AND  : XForm_6<31,  28, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "and $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (and GPRC:$rS, GPRC:$rB))]>;
+def ANDC : XForm_6<31,  60, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "andc $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (and GPRC:$rS, (not GPRC:$rB)))]>;
+def OR   : XForm_6<31, 444, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "or $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (or GPRC:$rS, GPRC:$rB))]>;
+def NOR  : XForm_6<31, 124, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "nor $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (not (or GPRC:$rS, GPRC:$rB)))]>;
+def ORC  : XForm_6<31, 412, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "orc $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (or GPRC:$rS, (not GPRC:$rB)))]>;
+def EQV  : XForm_6<31, 284, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "eqv $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (not (xor GPRC:$rS, GPRC:$rB)))]>;
+def XOR  : XForm_6<31, 316, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "xor $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (xor GPRC:$rS, GPRC:$rB))]>;
+// Variable shifts use the PPC-specific nodes (PPCshl/PPCsrl/PPCsra) rather
+// than the generic ones.
+def SLW  : XForm_6<31,  24, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "slw $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (PPCshl GPRC:$rS, GPRC:$rB))]>;
+def SRW  : XForm_6<31, 536, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "srw $rA, $rS, $rB", IntGeneral,
+                   [(set GPRC:$rA, (PPCsrl GPRC:$rS, GPRC:$rB))]>;
+// sraw sets the carry bit, so it defines CARRY.
+let Defs = [CARRY] in {
+def SRAW : XForm_6<31, 792, (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB),
+                   "sraw $rA, $rS, $rB", IntShift,
+                   [(set GPRC:$rA, (PPCsra GPRC:$rS, GPRC:$rB))]>;
+}
+}
+
+// Shift-immediate, count-leading-zeros, sign-extension and register-register
+// compare instructions.
+let PPC970_Unit = 1 in {  // FXU Operations.
+// srawi also sets the carry bit.
+let Defs = [CARRY] in {
+def SRAWI : XForm_10<31, 824, (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH), 
+                     "srawi $rA, $rS, $SH", IntShift,
+                     [(set GPRC:$rA, (sra GPRC:$rS, (i32 imm:$SH)))]>;
+}
+def CNTLZW : XForm_11<31,  26, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "cntlzw $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (ctlz GPRC:$rS))]>;
+def EXTSB  : XForm_11<31, 954, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "extsb $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (sext_inreg GPRC:$rS, i8))]>;
+def EXTSH  : XForm_11<31, 922, (outs GPRC:$rA), (ins GPRC:$rS),
+                      "extsh $rA, $rS", IntGeneral,
+                      [(set GPRC:$rA, (sext_inreg GPRC:$rS, i16))]>;
+
+def CMPW   : XForm_16_ext<31, 0, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB),
+                          "cmpw $crD, $rA, $rB", IntCompare>;
+def CMPLW  : XForm_16_ext<31, 32, (outs CRRC:$crD), (ins GPRC:$rA, GPRC:$rB),
+                          "cmplw $crD, $rA, $rB", IntCompare>;
+}
+let PPC970_Unit = 3 in {  // FPU Operations.
+//def FCMPO  : XForm_17<63, 32, (outs CRRC:$crD), (ins FPRC:$fA, FPRC:$fB),
+//                      "fcmpo $crD, $fA, $fB", FPCompare>;
+// FCMPUS/FCMPUD share one hardware encoding (63/0); they are split only so
+// the register class (F4RC vs F8RC) matches the operand width at ISel time.
+def FCMPUS : XForm_17<63, 0, (outs CRRC:$crD), (ins F4RC:$fA, F4RC:$fB),
+                      "fcmpu $crD, $fA, $fB", FPCompare>;
+def FCMPUD : XForm_17<63, 0, (outs CRRC:$crD), (ins F8RC:$fA, F8RC:$fB),
+                      "fcmpu $crD, $fA, $fB", FPCompare>;
+
+// These depend on the rounding mode register, hence Uses = [RM].
+let Uses = [RM] in {
+  def FCTIWZ : XForm_26<63, 15, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "fctiwz $frD, $frB", FPGeneral,
+                        [(set F8RC:$frD, (PPCfctiwz F8RC:$frB))]>;
+  def FRSP   : XForm_26<63, 12, (outs F4RC:$frD), (ins F8RC:$frB),
+                        "frsp $frD, $frB", FPGeneral,
+                        [(set F4RC:$frD, (fround F8RC:$frB))]>;
+  def FSQRT  : XForm_26<63, 22, (outs F8RC:$frD), (ins F8RC:$frB),
+                        "fsqrt $frD, $frB", FPSqrt,
+                        [(set F8RC:$frD, (fsqrt F8RC:$frB))]>;
+  def FSQRTS : XForm_26<59, 22, (outs F4RC:$frD), (ins F4RC:$frB),
+                        "fsqrts $frD, $frB", FPSqrt,
+                        [(set F4RC:$frD, (fsqrt F4RC:$frB))]>;
+  }
+}
+
+/// FMR is split into 3 versions, one for 4/8 byte FP, and one for extending.
+///
+/// Note that these are defined as pseudo-ops on the PPC970 because they are
+/// often coalesced away and we don't want the dispatch group builder to think
+/// that they will fill slots (which could cause the load of a LSU reject to
+/// sneak into a d-group with a store).
+// FMRS/FMRD deliberately have empty patterns: plain register copies are
+// handled by the coalescer, not by instruction selection.
+def FMRS   : XForm_26<63, 72, (outs F4RC:$frD), (ins F4RC:$frB),
+                      "fmr $frD, $frB", FPGeneral,
+                      []>,  // (set F4RC:$frD, F4RC:$frB)
+                      PPC970_Unit_Pseudo;
+def FMRD   : XForm_26<63, 72, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fmr $frD, $frB", FPGeneral,
+                      []>,  // (set F8RC:$frD, F8RC:$frB)
+                      PPC970_Unit_Pseudo;
+// FMRSD is the single->double fextend form and does carry a pattern.
+def FMRSD  : XForm_26<63, 72, (outs F8RC:$frD), (ins F4RC:$frB),
+                      "fmr $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fextend F4RC:$frB))]>,
+                      PPC970_Unit_Pseudo;
+
+let PPC970_Unit = 3 in {  // FPU Operations.
+// These are artificially split into two different forms, for 4/8 byte FP.
+// Each pair shares one hardware encoding; only the register class differs.
+def FABSS  : XForm_26<63, 264, (outs F4RC:$frD), (ins F4RC:$frB),
+                      "fabs $frD, $frB", FPGeneral,
+                      [(set F4RC:$frD, (fabs F4RC:$frB))]>;
+def FABSD  : XForm_26<63, 264, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fabs $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fabs F8RC:$frB))]>;
+def FNABSS : XForm_26<63, 136, (outs F4RC:$frD), (ins F4RC:$frB),
+                      "fnabs $frD, $frB", FPGeneral,
+                      [(set F4RC:$frD, (fneg (fabs F4RC:$frB)))]>;
+def FNABSD : XForm_26<63, 136, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fnabs $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fneg (fabs F8RC:$frB)))]>;
+def FNEGS  : XForm_26<63, 40, (outs F4RC:$frD), (ins F4RC:$frB),
+                      "fneg $frD, $frB", FPGeneral,
+                      [(set F4RC:$frD, (fneg F4RC:$frB))]>;
+def FNEGD  : XForm_26<63, 40, (outs F8RC:$frD), (ins F8RC:$frB),
+                      "fneg $frD, $frB", FPGeneral,
+                      [(set F8RC:$frD, (fneg F8RC:$frB))]>;
+}
+                      
+
+// XL-Form instructions.  condition register logical ops.
+//
+def MCRF   : XLForm_3<19, 0, (outs CRRC:$BF), (ins CRBITRC:$BFA),
+                      "mcrf $BF, $BFA", BrMCR>,
+             PPC970_DGroup_First, PPC970_Unit_CRU;
+
+def CREQV  : XLForm_1<19, 289, (outs CRBITRC:$CRD),
+                               (ins CRBITRC:$CRA, CRBITRC:$CRB),
+                      "creqv $CRD, $CRA, $CRB", BrCR,
+                      []>;
+
+def CROR  : XLForm_1<19, 449, (outs CRBITRC:$CRD),
+                               (ins CRBITRC:$CRA, CRBITRC:$CRB),
+                      "cror $CRD, $CRA, $CRB", BrCR,
+                      []>;
+
+// CRSET sets a CR bit to 1 via the "creqv bit, bit, bit" idiom (x eqv x = 1).
+def CRSET  : XLForm_1_ext<19, 289, (outs CRBITRC:$dst), (ins),
+              "creqv $dst, $dst, $dst", BrCR,
+              []>;
+
+// XFX-Form instructions.  Instructions that deal with SPRs.
+//
+// The Uses/Defs lists model CTR and LR, which are not ordinary allocatable
+// registers, so moves to/from them must declare their effects explicitly.
+let Uses = [CTR] in {
+def MFCTR : XFXForm_1_ext<31, 339, 9, (outs GPRC:$rT), (ins),
+                          "mfctr $rT", SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Defs = [CTR], Pattern = [(PPCmtctr GPRC:$rS)] in {
+def MTCTR : XFXForm_7_ext<31, 467, 9, (outs), (ins GPRC:$rS),
+                          "mtctr $rS", SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+let Defs = [LR] in {
+def MTLR  : XFXForm_7_ext<31, 467, 8, (outs), (ins GPRC:$rS),
+                          "mtlr $rS", SprMTSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+let Uses = [LR] in {
+def MFLR  : XFXForm_1_ext<31, 339, 8, (outs GPRC:$rT), (ins),
+                          "mflr $rT", SprMFSPR>,
+            PPC970_DGroup_First, PPC970_Unit_FXU;
+}
+
+// Move to/from VRSAVE: despite being a SPR, the VRSAVE register is renamed like
+// a GPR on the PPC970.  As such, copies in and out have the same performance
+// characteristics as an OR instruction.
+def MTVRSAVE : XFXForm_7_ext<31, 467, 256, (outs), (ins GPRC:$rS),
+                             "mtspr 256, $rS", IntGeneral>,
+               PPC970_DGroup_Single, PPC970_Unit_FXU;
+def MFVRSAVE : XFXForm_1_ext<31, 339, 256, (outs GPRC:$rT), (ins),
+                             "mfspr $rT, 256", IntGeneral>,
+               PPC970_DGroup_First, PPC970_Unit_FXU;
+
+def MTCRF : XFXForm_5<31, 144, (outs), (ins crbitm:$FXM, GPRC:$rS),
+                      "mtcrf $FXM, $rS", BrMCRX>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+// FIXME:  this Uses all the CR registers.  Marking it as such is 
+// necessary for DeadMachineInstructionElim to do the right thing.
+// However, marking it also exposes PR 2964, and causes crashes in
+// the Local RA because it doesn't like this sequence:
+//  vreg = MCRF  CR0
+//  MFCR  <kill of whatever preg got assigned to vreg>
+// For now DeadMachineInstructionElim is turned off, so don't do the marking.
+def MFCR  : XFXForm_3<31, 19, (outs GPRC:$rT), (ins), "mfcr $rT", SprMFCR>,
+            PPC970_MicroCode, PPC970_Unit_CRU;
+// NOTE(review): MFOCRF prints as "mfcr" with an extra FXM operand; looks
+// like it is intended to be the mfocrf single-field form -- confirm.
+def MFOCRF: XFXForm_5a<31, 19, (outs GPRC:$rT), (ins crbitm:$FXM),
+                       "mfcr $rT, $FXM", SprMFCR>,
+            PPC970_DGroup_First, PPC970_Unit_CRU;
+
+// Instructions to manipulate FPSCR.  Only long double handling uses these.
+// FPSCR is not modelled; we use the SDNode Flag to keep things in order.
+
+// RM (rounding mode) stands in for the unmodelled FPSCR in Uses/Defs so the
+// scheduler cannot reorder these relative to FP operations that read it.
+let Uses = [RM], Defs = [RM] in { 
+  def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
+                         "mtfsb0 $FM", IntMTFSB0,
+                        [(PPCmtfsb0 (i32 imm:$FM))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  def MTFSB1 : XForm_43<63, 38, (outs), (ins u5imm:$FM),
+                         "mtfsb1 $FM", IntMTFSB0,
+                        [(PPCmtfsb1 (i32 imm:$FM))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  // MTFSF does not actually produce an FP result.  We pretend it copies
+  // input reg B to the output.  If we didn't do this it would look like the
+  // instruction had no outputs (because we aren't modelling the FPSCR) and
+  // it would be deleted.
+  def MTFSF  : XFLForm<63, 711, (outs F8RC:$FRA),
+                                (ins i32imm:$FM, F8RC:$rT, F8RC:$FRB),
+                         "mtfsf $FM, $rT", "$FRB = $FRA", IntMTFSB0,
+                         [(set F8RC:$FRA, (PPCmtfsf (i32 imm:$FM), 
+                                                     F8RC:$rT, F8RC:$FRB))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+let Uses = [RM] in {
+  def MFFS   : XForm_42<63, 583, (outs F8RC:$rT), (ins), 
+                         "mffs $rT", IntMFFS,
+                         [(set F8RC:$rT, (PPCmffs))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+  // fadd forced to round-toward-zero semantics via the PPCfaddrtz node
+  // (used by long double lowering); same encoding as FADD (63/21).
+  def FADDrtz: AForm_2<63, 21,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fadd $FRT, $FRA, $FRB", FPGeneral,
+                      [(set F8RC:$FRT, (PPCfaddrtz F8RC:$FRA, F8RC:$FRB))]>,
+               PPC970_DGroup_Single, PPC970_Unit_FPU;
+}
+
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+
+// XO-Form instructions.  Arithmetic instructions that can set overflow bit
+//
+def ADD4  : XOForm_1<31, 266, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "add $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (add GPRC:$rA, GPRC:$rB))]>;
+// Carry-setting variants declare CARRY in Defs.
+let Defs = [CARRY] in {
+def ADDC  : XOForm_1<31, 10, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "addc $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (addc GPRC:$rA, GPRC:$rB))]>,
+                     PPC970_DGroup_Cracked;
+}
+def DIVW  : XOForm_1<31, 491, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "divw $rT, $rA, $rB", IntDivW,
+                     [(set GPRC:$rT, (sdiv GPRC:$rA, GPRC:$rB))]>,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def DIVWU : XOForm_1<31, 459, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "divwu $rT, $rA, $rB", IntDivW,
+                     [(set GPRC:$rT, (udiv GPRC:$rA, GPRC:$rB))]>,
+                     PPC970_DGroup_First, PPC970_DGroup_Cracked;
+def MULHW : XOForm_1<31, 75, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "mulhw $rT, $rA, $rB", IntMulHW,
+                     [(set GPRC:$rT, (mulhs GPRC:$rA, GPRC:$rB))]>;
+def MULHWU : XOForm_1<31, 11, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "mulhwu $rT, $rA, $rB", IntMulHWU,
+                     [(set GPRC:$rT, (mulhu GPRC:$rA, GPRC:$rB))]>;
+def MULLW : XOForm_1<31, 235, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "mullw $rT, $rA, $rB", IntMulHW,
+                     [(set GPRC:$rT, (mul GPRC:$rA, GPRC:$rB))]>;
+// Note subf computes $rB - $rA (operands reversed vs. the asm order).
+def SUBF  : XOForm_1<31, 40, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "subf $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (sub GPRC:$rB, GPRC:$rA))]>;
+let Defs = [CARRY] in {
+def SUBFC : XOForm_1<31, 8, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                     "subfc $rT, $rA, $rB", IntGeneral,
+                     [(set GPRC:$rT, (subc GPRC:$rB, GPRC:$rA))]>,
+                     PPC970_DGroup_Cracked;
+}
+def NEG    : XOForm_3<31, 104, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "neg $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (ineg GPRC:$rA))]>;
+// Extended (carry-consuming) arithmetic both reads and writes CARRY.
+let Uses = [CARRY], Defs = [CARRY] in {
+def ADDE  : XOForm_1<31, 138, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                      "adde $rT, $rA, $rB", IntGeneral,
+                      [(set GPRC:$rT, (adde GPRC:$rA, GPRC:$rB))]>;
+def ADDME  : XOForm_3<31, 234, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "addme $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (adde GPRC:$rA, immAllOnes))]>;
+def ADDZE  : XOForm_3<31, 202, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "addze $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (adde GPRC:$rA, 0))]>;
+def SUBFE : XOForm_1<31, 136, 0, (outs GPRC:$rT), (ins GPRC:$rA, GPRC:$rB),
+                      "subfe $rT, $rA, $rB", IntGeneral,
+                      [(set GPRC:$rT, (sube GPRC:$rB, GPRC:$rA))]>;
+def SUBFME : XOForm_3<31, 232, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "subfme $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (sube immAllOnes, GPRC:$rA))]>;
+def SUBFZE : XOForm_3<31, 200, 0, (outs GPRC:$rT), (ins GPRC:$rA),
+                      "subfze $rT, $rA", IntGeneral,
+                      [(set GPRC:$rT, (sube 0, GPRC:$rA))]>;
+}
+}
+
+// A-Form instructions.  Most of the instructions executed in the FPU are of
+// this type.
+//
+let PPC970_Unit = 3 in {  // FPU Operations.
+// Fused multiply-add/sub family.  These only match when FP contractions are
+// allowed (Requires<[FPContractions]>), since fusing changes rounding.  The
+// 63-opcode forms are double precision, the 59-opcode forms single.
+let Uses = [RM] in {
+  def FMADD : AForm_1<63, 29, 
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set F8RC:$FRT, (fadd (fmul F8RC:$FRA, F8RC:$FRC),
+                                             F8RC:$FRB))]>,
+                      Requires<[FPContractions]>;
+  def FMADDS : AForm_1<59, 29,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set F4RC:$FRT, (fadd (fmul F4RC:$FRA, F4RC:$FRC),
+                                             F4RC:$FRB))]>,
+                      Requires<[FPContractions]>;
+  def FMSUB : AForm_1<63, 28,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set F8RC:$FRT, (fsub (fmul F8RC:$FRA, F8RC:$FRC),
+                                             F8RC:$FRB))]>,
+                      Requires<[FPContractions]>;
+  def FMSUBS : AForm_1<59, 28,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set F4RC:$FRT, (fsub (fmul F4RC:$FRA, F4RC:$FRC),
+                                             F4RC:$FRB))]>,
+                      Requires<[FPContractions]>;
+  def FNMADD : AForm_1<63, 31,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fnmadd $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set F8RC:$FRT, (fneg (fadd (fmul F8RC:$FRA, F8RC:$FRC),
+                                                   F8RC:$FRB)))]>,
+                      Requires<[FPContractions]>;
+  def FNMADDS : AForm_1<59, 31,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fnmadds $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set F4RC:$FRT, (fneg (fadd (fmul F4RC:$FRA, F4RC:$FRC),
+                                                   F4RC:$FRB)))]>,
+                      Requires<[FPContractions]>;
+  def FNMSUB : AForm_1<63, 30,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                      "fnmsub $FRT, $FRA, $FRC, $FRB", FPFused,
+                      [(set F8RC:$FRT, (fneg (fsub (fmul F8RC:$FRA, F8RC:$FRC),
+                                                   F8RC:$FRB)))]>,
+                      Requires<[FPContractions]>;
+  def FNMSUBS : AForm_1<59, 30,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                      "fnmsubs $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                      [(set F4RC:$FRT, (fneg (fsub (fmul F4RC:$FRA, F4RC:$FRC),
+                                                   F4RC:$FRB)))]>,
+                      Requires<[FPContractions]>;
+}
+// FSEL is artificially split into 4 and 8-byte forms for the result.  To avoid
+// having 4 of these, force the comparison to always be an 8-byte double (code
+// should use an FMRSD if the input comparison value really wants to be a float)
+// and 4/8 byte forms for the result and operand type.
+def FSELD : AForm_1<63, 23,
+                    (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRC, F8RC:$FRB),
+                    "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F8RC:$FRT, (PPCfsel F8RC:$FRA,F8RC:$FRC,F8RC:$FRB))]>;
+def FSELS : AForm_1<63, 23,
+                     (outs F4RC:$FRT), (ins F8RC:$FRA, F4RC:$FRC, F4RC:$FRB),
+                     "fsel $FRT, $FRA, $FRC, $FRB", FPGeneral,
+                    [(set F4RC:$FRT, (PPCfsel F8RC:$FRA,F4RC:$FRC,F4RC:$FRB))]>;
+// Plain FP arithmetic; all read the rounding mode (RM).
+let Uses = [RM] in {
+  def FADD  : AForm_2<63, 21,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fadd $FRT, $FRA, $FRB", FPGeneral,
+                      [(set F8RC:$FRT, (fadd F8RC:$FRA, F8RC:$FRB))]>;
+  def FADDS : AForm_2<59, 21,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+                      "fadds $FRT, $FRA, $FRB", FPGeneral,
+                      [(set F4RC:$FRT, (fadd F4RC:$FRA, F4RC:$FRB))]>;
+  def FDIV  : AForm_2<63, 18,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fdiv $FRT, $FRA, $FRB", FPDivD,
+                      [(set F8RC:$FRT, (fdiv F8RC:$FRA, F8RC:$FRB))]>;
+  def FDIVS : AForm_2<59, 18,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+                      "fdivs $FRT, $FRA, $FRB", FPDivS,
+                      [(set F4RC:$FRT, (fdiv F4RC:$FRA, F4RC:$FRB))]>;
+  def FMUL  : AForm_3<63, 25,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fmul $FRT, $FRA, $FRB", FPFused,
+                      [(set F8RC:$FRT, (fmul F8RC:$FRA, F8RC:$FRB))]>;
+  def FMULS : AForm_3<59, 25,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+                      "fmuls $FRT, $FRA, $FRB", FPGeneral,
+                      [(set F4RC:$FRT, (fmul F4RC:$FRA, F4RC:$FRB))]>;
+  def FSUB  : AForm_2<63, 20,
+                      (outs F8RC:$FRT), (ins F8RC:$FRA, F8RC:$FRB),
+                      "fsub $FRT, $FRA, $FRB", FPGeneral,
+                      [(set F8RC:$FRT, (fsub F8RC:$FRA, F8RC:$FRB))]>;
+  def FSUBS : AForm_2<59, 20,
+                      (outs F4RC:$FRT), (ins F4RC:$FRA, F4RC:$FRB),
+                      "fsubs $FRT, $FRA, $FRB", FPGeneral,
+                      [(set F4RC:$FRT, (fsub F4RC:$FRA, F4RC:$FRB))]>;
+  }
+}
+
+let PPC970_Unit = 1 in {  // FXU Operations.
+// M-Form instructions.  rotate and mask instructions.
+//
+let isCommutable = 1 in {
+// RLWIMI can be commuted if the rotate amount is zero.
+// $rSi is the tied "insert into" input (same register as the output $rA);
+// it is not encoded separately.
+def RLWIMI : MForm_2<20,
+                     (outs GPRC:$rA), (ins GPRC:$rSi, GPRC:$rS, u5imm:$SH, u5imm:$MB, 
+                      u5imm:$ME), "rlwimi $rA, $rS, $SH, $MB, $ME", IntRotate,
+                      []>, PPC970_DGroup_Cracked, RegConstraint<"$rSi = $rA">,
+                      NoEncode<"$rSi">;
+}
+// The rotate-and-mask defs carry no patterns; they are selected explicitly
+// by the output patterns at the bottom of this file (shifts, rotl, masks).
+def RLWINM : MForm_2<21,
+                     (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm $rA, $rS, $SH, $MB, $ME", IntGeneral,
+                     []>;
+def RLWINMo : MForm_2<21,
+                     (outs GPRC:$rA), (ins GPRC:$rS, u5imm:$SH, u5imm:$MB, u5imm:$ME),
+                     "rlwinm. $rA, $rS, $SH, $MB, $ME", IntGeneral,
+                     []>, isDOT, PPC970_DGroup_Cracked;
+def RLWNM  : MForm_2<23,
+                     (outs GPRC:$rA), (ins GPRC:$rS, GPRC:$rB, u5imm:$MB, u5imm:$ME),
+                     "rlwnm $rA, $rS, $rB, $MB, $ME", IntGeneral,
+                     []>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// PowerPC Instruction Patterns
+//
+
+// Arbitrary immediate support.  Implement in terms of LIS/ORI.
+// LIS materializes the high 16 bits, ORI fills in the low 16 bits.
+def : Pat<(i32 imm:$imm),
+          (ORI (LIS (HI16 imm:$imm)), (LO16 imm:$imm))>;
+
+// Implement the 'not' operation with the NOR instruction.
+def NOT : Pat<(not GPRC:$in),
+              (NOR GPRC:$in, GPRC:$in)>;
+
+// ADD an arbitrary immediate.
+// Uses HA16 (not HI16) for the high half: ADDI sign-extends its immediate,
+// so the high part must be adjusted when the low 16 bits are negative.
+def : Pat<(add GPRC:$in, imm:$imm),
+          (ADDIS (ADDI GPRC:$in, (LO16 imm:$imm)), (HA16 imm:$imm))>;
+// OR an arbitrary immediate.
+// ORI/XORI zero-extend, so plain HI16 is correct here.
+def : Pat<(or GPRC:$in, imm:$imm),
+          (ORIS (ORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// XOR an arbitrary immediate.
+def : Pat<(xor GPRC:$in, imm:$imm),
+          (XORIS (XORI GPRC:$in, (LO16 imm:$imm)), (HI16 imm:$imm))>;
+// SUBFIC: immediate-minus-register maps onto subtract-from-immediate-carrying.
+def : Pat<(sub  immSExt16:$imm, GPRC:$in),
+          (SUBFIC GPRC:$in, imm:$imm)>;
+
+// SHL/SRL
+// Implemented as rotate-and-mask: SHL32/SRL32 are transforms computing the
+// mask end/rotate amount from the shift count.
+def : Pat<(shl GPRC:$in, (i32 imm:$imm)),
+          (RLWINM GPRC:$in, imm:$imm, 0, (SHL32 imm:$imm))>;
+def : Pat<(srl GPRC:$in, (i32 imm:$imm)),
+          (RLWINM GPRC:$in, (SRL32 imm:$imm), imm:$imm, 31)>;
+
+// ROTL: a rotate is a rotate-and-mask with the full 0..31 mask.
+def : Pat<(rotl GPRC:$in, GPRC:$sh),
+          (RLWNM GPRC:$in, GPRC:$sh, 0, 31)>;
+def : Pat<(rotl GPRC:$in, (i32 imm:$imm)),
+          (RLWINM GPRC:$in, imm:$imm, 0, 31)>;
+
+// RLWNM: fold an AND with a contiguous mask into the variable rotate.
+// MB/ME extract the mask-begin/mask-end bit positions from the immediate.
+def : Pat<(and (rotl GPRC:$in, GPRC:$sh), maskimm32:$imm),
+          (RLWNM GPRC:$in, GPRC:$sh, (MB maskimm32:$imm), (ME maskimm32:$imm))>;
+
+// Calls -- ABI-specific branch-and-link selections for direct calls.
+def : Pat<(PPCcall_Darwin (i32 tglobaladdr:$dst)),
+          (BL_Darwin tglobaladdr:$dst)>;
+def : Pat<(PPCcall_Darwin (i32 texternalsym:$dst)),
+          (BL_Darwin texternalsym:$dst)>;
+def : Pat<(PPCcall_SVR4 (i32 tglobaladdr:$dst)),
+          (BL_SVR4 tglobaladdr:$dst)>;
+def : Pat<(PPCcall_SVR4 (i32 texternalsym:$dst)),
+          (BL_SVR4 texternalsym:$dst)>;
+
+
+// Tail calls: direct (symbol) forms and register-indirect through CTR.
+def : Pat<(PPCtc_return (i32 tglobaladdr:$dst),  imm:$imm),
+          (TCRETURNdi tglobaladdr:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return (i32 texternalsym:$dst), imm:$imm),
+          (TCRETURNdi texternalsym:$dst, imm:$imm)>;
+
+def : Pat<(PPCtc_return CTRRC:$dst, imm:$imm),
+          (TCRETURNri CTRRC:$dst, imm:$imm)>;
+
+
+
+// Hi and Lo for Darwin Global Addresses.
+// PPChi/PPClo split a symbol address into halves; LIS/LI materialize them,
+// and the (add GPRC, PPChi ...) forms fold the high half into an ADDIS.
+def : Pat<(PPChi tglobaladdr:$in, 0), (LIS tglobaladdr:$in)>;
+def : Pat<(PPClo tglobaladdr:$in, 0), (LI tglobaladdr:$in)>;
+def : Pat<(PPChi tconstpool:$in, 0), (LIS tconstpool:$in)>;
+def : Pat<(PPClo tconstpool:$in, 0), (LI tconstpool:$in)>;
+def : Pat<(PPChi tjumptable:$in, 0), (LIS tjumptable:$in)>;
+def : Pat<(PPClo tjumptable:$in, 0), (LI tjumptable:$in)>;
+def : Pat<(PPChi tblockaddress:$in, 0), (LIS tblockaddress:$in)>;
+def : Pat<(PPClo tblockaddress:$in, 0), (LI tblockaddress:$in)>;
+def : Pat<(add GPRC:$in, (PPChi tglobaladdr:$g, 0)),
+          (ADDIS GPRC:$in, tglobaladdr:$g)>;
+def : Pat<(add GPRC:$in, (PPChi tconstpool:$g, 0)),
+          (ADDIS GPRC:$in, tconstpool:$g)>;
+def : Pat<(add GPRC:$in, (PPChi tjumptable:$g, 0)),
+          (ADDIS GPRC:$in, tjumptable:$g)>;
+def : Pat<(add GPRC:$in, (PPChi tblockaddress:$g, 0)),
+          (ADDIS GPRC:$in, tblockaddress:$g)>;
+
+// Fused negative multiply subtract, alternate pattern
+// (B - A*C) with the operands commuted relative to the instruction's own
+// pattern; only valid when FP contraction is allowed.
+def : Pat<(fsub F8RC:$B, (fmul F8RC:$A, F8RC:$C)),
+          (FNMSUB F8RC:$A, F8RC:$C, F8RC:$B)>,
+          Requires<[FPContractions]>;
+def : Pat<(fsub F4RC:$B, (fmul F4RC:$A, F4RC:$C)),
+          (FNMSUBS F4RC:$A, F4RC:$C, F4RC:$B)>,
+          Requires<[FPContractions]>;
+
+// Standard shifts.  These are represented separately from the real shifts above
+// so that we can distinguish between shifts that allow 5-bit and 6-bit shift
+// amounts.
+def : Pat<(sra GPRC:$rS, GPRC:$rB),
+          (SRAW GPRC:$rS, GPRC:$rB)>;
+def : Pat<(srl GPRC:$rS, GPRC:$rB),
+          (SRW GPRC:$rS, GPRC:$rB)>;
+def : Pat<(shl GPRC:$rS, GPRC:$rB),
+          (SLW GPRC:$rS, GPRC:$rB)>;
+
+// Extending loads.  i1/i8/i16 extloads use the zero-extending byte/halfword
+// loads; extloadf32 loads single precision and widens via FMRSD.
+def : Pat<(zextloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(zextloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi1 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi1 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi8 iaddr:$src),
+          (LBZ iaddr:$src)>;
+def : Pat<(extloadi8 xaddr:$src),
+          (LBZX xaddr:$src)>;
+def : Pat<(extloadi16 iaddr:$src),
+          (LHZ iaddr:$src)>;
+def : Pat<(extloadi16 xaddr:$src),
+          (LHZX xaddr:$src)>;
+def : Pat<(extloadf32 iaddr:$src),
+          (FMRSD (LFS iaddr:$src))>;
+def : Pat<(extloadf32 xaddr:$src),
+          (FMRSD (LFSX xaddr:$src))>;
+
+// Memory barriers
+// Every membarrier variant, whatever its flag operands, lowers to a full SYNC.
+def : Pat<(membarrier (i32 imm:$ll),
+                      (i32 imm:$ls),
+                      (i32 imm:$sl),
+                      (i32 imm:$ss),
+                      (i32 imm:$device)),
+           (SYNC)>;
+
+include "PPCInstrAltivec.td"
+include "PPCInstr64Bit.td"
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
new file mode 100644
index 0000000..daf4ec6
--- /dev/null
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -0,0 +1,446 @@
+//===-- PPCJITInfo.cpp - Implement the JIT interfaces for the PowerPC -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the 32-bit PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "PPCJITInfo.h"
+#include "PPCRelocations.h"
+#include "PPCTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/System/Memory.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// The JIT compiler entry point, registered by getLazyResolverFunction and
+// invoked from PPCCompilationCallbackC when a lazy stub is first executed.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Helpers that assemble raw 32-bit PowerPC instruction words.  The first
+// operand of each macro is the destination register; note that for ori/oris
+// the hardware encoding places the *source* (RS) in the high field and the
+// destination (RA) in the low field, which is why RS/RD appear swapped below.
+#define BUILD_ADDIS(RD,RS,IMM16) \
+  ((15 << 26) | ((RD) << 21) | ((RS) << 16) | ((IMM16) & 65535))
+#define BUILD_ORI(RD,RS,UIMM16) \
+  ((24 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+#define BUILD_ORIS(RD,RS,UIMM16) \
+  ((25 << 26) | ((RS) << 21) | ((RD) << 16) | ((UIMM16) & 65535))
+// rldicr with the 6-bit shift amount split across its two encoding fields.
+#define BUILD_RLDICR(RD,RS,SH,ME) \
+  ((30 << 26) | ((RS) << 21) | ((RD) << 16) | (((SH) & 31) << 11) | \
+   (((ME) & 63) << 6) | (1 << 2) | ((((SH) >> 5) & 1) << 1))
+// NOTE(review): this places SPR directly in the field, which only encodes
+// correctly for SPR numbers < 32 (fine for CTR = 9, the only use here).
+#define BUILD_MTSPR(RS,SPR)      \
+  ((31 << 26) | ((RS) << 21) | ((SPR) << 16) | (467 << 1))
+#define BUILD_BCCTRx(BO,BI,LINK) \
+  ((19 << 26) | ((BO) << 21) | ((BI) << 16) | (528 << 1) | ((LINK) & 1))
+// Unconditional relative branch; TARGET is a signed word offset.
+#define BUILD_B(TARGET, LINK) \
+  ((18 << 26) | (((TARGET) & 0x00FFFFFF) << 2) | ((LINK) & 1))
+
+// Pseudo-ops
+#define BUILD_LIS(RD,IMM16)    BUILD_ADDIS(RD,0,IMM16)
+#define BUILD_SLDI(RD,RS,IMM6) BUILD_RLDICR(RD,RS,IMM6,63-IMM6)
+#define BUILD_MTCTR(RS)        BUILD_MTSPR(RS,9)
+#define BUILD_BCTR(LINK)       BUILD_BCCTRx(20,0,LINK)      // BO=20: always
+
+// EmitBranchToAt - Overwrite the code at 'At' with a branch (or call, if
+// isCall) to 'To'.  A single b/bl is emitted when the target is within the
+// +/-32MB direct-branch range; otherwise the address is materialized into
+// r12 and we branch through CTR (4 words for 32-bit, 7 words for 64-bit),
+// clobbering r12 and CTR.  Callers must invalidate the icache afterwards.
+static void EmitBranchToAt(uint64_t At, uint64_t To, bool isCall, bool is64Bit){
+  intptr_t Offset = ((intptr_t)To - (intptr_t)At) >> 2;
+  unsigned *AtI = (unsigned*)(intptr_t)At;
+
+  if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+    AtI[0] = BUILD_B(Offset, isCall);     // b/bl target
+  } else if (!is64Bit) {
+    AtI[0] = BUILD_LIS(12, To >> 16);     // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To);       // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_MTCTR(12);             // mtctr r12
+    AtI[3] = BUILD_BCTR(isCall);          // bctr/bctrl
+  } else {
+    // 64-bit: build the full address 16 bits at a time.
+    AtI[0] = BUILD_LIS(12, To >> 48);      // lis r12, hi16(address)
+    AtI[1] = BUILD_ORI(12, 12, To >> 32);  // ori r12, r12, lo16(address)
+    AtI[2] = BUILD_SLDI(12, 12, 32);       // sldi r12, r12, 32
+    AtI[3] = BUILD_ORIS(12, 12, To >> 16); // oris r12, r12, hi16(address)
+    AtI[4] = BUILD_ORI(12, 12, To);        // ori r12, r12, lo16(address)
+    AtI[5] = BUILD_MTCTR(12);              // mtctr r12
+    AtI[6] = BUILD_BCTR(isCall);           // bctr/bctrl
+  }
+}
+
+extern "C" void PPC32CompilationCallback();
+extern "C" void PPC64CompilationCallback();
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+    !(defined(__ppc64__) || defined(__FreeBSD__))
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC32CompilationCallback\n"
+"_PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   PowerPC64 ABI linkage    -  24 bytes
+    //                 parameters -  32 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  32 bytes
+    "mflr r0\n"
+    "stw r0,  8(r1)\n"
+    "stwu r1, -208(r1)\n"
+    // Save all int arg registers
+    "stw r10, 204(r1)\n"    "stw r9,  200(r1)\n"
+    "stw r8,  196(r1)\n"    "stw r7,  192(r1)\n"
+    "stw r6,  188(r1)\n"    "stw r5,  184(r1)\n"
+    "stw r4,  180(r1)\n"    "stw r3,  176(r1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd f13, 168(r1)\n"   "stfd f12, 160(r1)\n"
+    "stfd f11, 152(r1)\n"   "stfd f10, 144(r1)\n"
+    "stfd f9,  136(r1)\n"   "stfd f8,  128(r1)\n"
+    "stfd f7,  120(r1)\n"   "stfd f6,  112(r1)\n"
+    "stfd f5,  104(r1)\n"   "stfd f4,   96(r1)\n"
+    "stfd f3,   88(r1)\n"   "stfd f2,   80(r1)\n"
+    "stfd f1,   72(r1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   r3, r0\n"
+    "lwz  r2, 208(r1)\n" // stub's frame
+    "lwz  r4, 8(r2)\n" // stub's lr
+    "li   r5, 0\n"       // 0 == 32 bit
+    "bl _PPCCompilationCallbackC\n"
+    "mtctr r3\n"
+    // Restore all int arg registers
+    "lwz r10, 204(r1)\n"    "lwz r9,  200(r1)\n"
+    "lwz r8,  196(r1)\n"    "lwz r7,  192(r1)\n"
+    "lwz r6,  188(r1)\n"    "lwz r5,  184(r1)\n"
+    "lwz r4,  180(r1)\n"    "lwz r3,  176(r1)\n"
+    // Restore all FP arg registers
+    "lfd f13, 168(r1)\n"    "lfd f12, 160(r1)\n"
+    "lfd f11, 152(r1)\n"    "lfd f10, 144(r1)\n"
+    "lfd f9,  136(r1)\n"    "lfd f8,  128(r1)\n"
+    "lfd f7,  120(r1)\n"    "lfd f6,  112(r1)\n"
+    "lfd f5,  104(r1)\n"    "lfd f4,   96(r1)\n"
+    "lfd f3,   88(r1)\n"    "lfd f2,   80(r1)\n"
+    "lfd f1,   72(r1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  r1, 208(r1)\n"
+    "lwz  r2, 8(r1)\n"
+    "mtlr r2\n"
+    "bctr\n"
+    );
+
+#elif defined(__PPC__) && !defined(__ppc64__)
+// Linux & FreeBSD / PPC 32 support
+
+// CompilationCallback stub - We can't use a C function with inline assembly in
+// it, because the prolog/epilog inserted by GCC won't work for us.  Instead,
+// write our own wrapper, which does things our way, so we have complete control
+// over register saving and restoring.
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl PPC32CompilationCallback\n"
+"PPC32CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 8 doubles f[1-8] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // FIXME: could shrink frame
+    // Set up a proper stack frame
+    // FIXME Layout
+    //   8 double registers       -  64 bytes
+    //   8 int registers          -  32 bytes
+    "mflr 0\n"
+    "stw 0,  4(1)\n"
+    "stwu 1, -104(1)\n"
+    // Save all int arg registers
+    "stw 10, 100(1)\n"   "stw 9,  96(1)\n"
+    "stw 8,  92(1)\n"    "stw 7,  88(1)\n"
+    "stw 6,  84(1)\n"    "stw 5,  80(1)\n"
+    "stw 4,  76(1)\n"    "stw 3,  72(1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd 8,  64(1)\n"
+    "stfd 7,  56(1)\n"   "stfd 6,  48(1)\n"
+    "stfd 5,  40(1)\n"   "stfd 4,  32(1)\n"
+    "stfd 3,  24(1)\n"   "stfd 2,  16(1)\n"
+    "stfd 1,  8(1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 0.
+    "mr   3, 0\n"
+    "lwz  5, 104(1)\n" // stub's frame
+    "lwz  4, 4(5)\n" // stub's lr
+    "li   5, 0\n"       // 0 == 32 bit
+    "bl PPCCompilationCallbackC\n"
+    "mtctr 3\n"
+    // Restore all int arg registers
+    "lwz 10, 100(1)\n"   "lwz 9,  96(1)\n"
+    "lwz 8,  92(1)\n"    "lwz 7,  88(1)\n"
+    "lwz 6,  84(1)\n"    "lwz 5,  80(1)\n"
+    "lwz 4,  76(1)\n"    "lwz 3,  72(1)\n"
+    // Restore all FP arg registers
+    "lfd 8,  64(1)\n"
+    "lfd 7,  56(1)\n"    "lfd 6,  48(1)\n"
+    "lfd 5,  40(1)\n"    "lfd 4,  32(1)\n"
+    "lfd 3,  24(1)\n"    "lfd 2,  16(1)\n"
+    "lfd 1,  8(1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "lwz  1, 104(1)\n"
+    "lwz  0, 4(1)\n"
+    "mtlr 0\n"
+    "bctr\n"
+    );
+#else
+// Fallback when the host is not a 32-bit PowerPC: the symbol must still
+// exist to link, but executing it is a hard error.
+void PPC32CompilationCallback() {
+  llvm_unreachable("This is not a power pc, you can't execute this!");
+}
+#endif
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+    defined(__ppc64__)
+asm(
+    ".text\n"
+    ".align 2\n"
+    ".globl _PPC64CompilationCallback\n"
+"_PPC64CompilationCallback:\n"
+    // Make space for 8 ints r[3-10] and 13 doubles f[1-13] and the 
+    // FIXME: need to save v[0-19] for altivec?
+    // Set up a proper stack frame
+    // Layout
+    //   PowerPC64 ABI linkage    -  48 bytes
+    //                 parameters -  64 bytes
+    //   13 double registers      - 104 bytes
+    //   8 int registers          -  64 bytes
+    "mflr r0\n"
+    "std r0,  16(r1)\n"
+    "stdu r1, -280(r1)\n"
+    // Save all int arg registers
+    "std r10, 272(r1)\n"    "std r9,  264(r1)\n"
+    "std r8,  256(r1)\n"    "std r7,  248(r1)\n"
+    "std r6,  240(r1)\n"    "std r5,  232(r1)\n"
+    "std r4,  224(r1)\n"    "std r3,  216(r1)\n"
+    // Save all call-clobbered FP regs.
+    "stfd f13, 208(r1)\n"    "stfd f12, 200(r1)\n"
+    "stfd f11, 192(r1)\n"    "stfd f10, 184(r1)\n"
+    "stfd f9,  176(r1)\n"    "stfd f8,  168(r1)\n"
+    "stfd f7,  160(r1)\n"    "stfd f6,  152(r1)\n"
+    "stfd f5,  144(r1)\n"    "stfd f4,  136(r1)\n"
+    "stfd f3,  128(r1)\n"    "stfd f2,  120(r1)\n"
+    "stfd f1,  112(r1)\n"
+    // Arguments to Compilation Callback:
+    // r3 - our lr (address of the call instruction in stub plus 4)
+    // r4 - stub's lr (address of instruction that called the stub plus 4)
+    // r5 - is64Bit - always 1.
+    "mr   r3, r0\n"
+    "ld   r2, 280(r1)\n" // stub's frame
+    "ld   r4, 16(r2)\n"  // stub's lr
+    "li   r5, 1\n"       // 1 == 64 bit
+    "bl _PPCCompilationCallbackC\n"
+    "mtctr r3\n"
+    // Restore all int arg registers
+    "ld r10, 272(r1)\n"    "ld r9,  264(r1)\n"
+    "ld r8,  256(r1)\n"    "ld r7,  248(r1)\n"
+    "ld r6,  240(r1)\n"    "ld r5,  232(r1)\n"
+    "ld r4,  224(r1)\n"    "ld r3,  216(r1)\n"
+    // Restore all FP arg registers
+    "lfd f13, 208(r1)\n"    "lfd f12, 200(r1)\n"
+    "lfd f11, 192(r1)\n"    "lfd f10, 184(r1)\n"
+    "lfd f9,  176(r1)\n"    "lfd f8,  168(r1)\n"
+    "lfd f7,  160(r1)\n"    "lfd f6,  152(r1)\n"
+    "lfd f5,  144(r1)\n"    "lfd f4,  136(r1)\n"
+    "lfd f3,  128(r1)\n"    "lfd f2,  120(r1)\n"
+    "lfd f1,  112(r1)\n"
+    // Pop 3 frames off the stack and branch to target
+    "ld  r1, 280(r1)\n"
+    "ld  r2, 16(r1)\n"
+    "mtlr r2\n"
+    "bctr\n"
+    );
+#else
+// Fallback when the host is not a 64-bit Darwin PowerPC: the symbol must
+// still exist to link, but executing it is a hard error.
+void PPC64CompilationCallback() {
+  llvm_unreachable("This is not a power pc, you can't execute this!");
+}
+#endif
+
+// PPCCompilationCallbackC - Invoked (through the assembly wrappers above) the
+// first time a lazy-compilation stub executes.  Compiles the function the
+// stub stands in for, back-patches the original call site when possible,
+// rewrites the stub into a direct branch, and returns the address of the
+// compiled code for the wrapper to jump to.
+//
+// StubCallAddrPlus4 - return address inside the stub (call instr + 4).
+// OrigCallAddrPlus4 - return address at the original caller (call instr + 4).
+// is64Bit          - selects the 64-bit stub layout when backing up to the
+//                    start of the stub.
+extern "C" void *PPCCompilationCallbackC(unsigned *StubCallAddrPlus4,
+                                         unsigned *OrigCallAddrPlus4,
+                                         bool is64Bit) {
+  // Adjust the pointer to the address of the call instruction in the stub
+  // emitted by emitFunctionStub, rather than the instruction after it.
+  unsigned *StubCallAddr = StubCallAddrPlus4 - 1;
+  unsigned *OrigCallAddr = OrigCallAddrPlus4 - 1;
+
+  void *Target = JITCompilerFunction(StubCallAddr);
+
+  // Check to see if *OrigCallAddr is a 'bl' instruction, and if we can rewrite
+  // it to branch directly to the destination.  If so, rewrite it so it does not
+  // need to go through the stub anymore.
+  unsigned OrigCallInst = *OrigCallAddr;
+  if ((OrigCallInst >> 26) == 18) {     // Direct call (opcode 18 = b/bl).
+    intptr_t Offset = ((intptr_t)Target - (intptr_t)OrigCallAddr) >> 2;
+
+    if (Offset >= -(1 << 23) && Offset < (1 << 23)) {   // In range?
+      // Clear the original target out, keeping only the opcode and AA/LK bits.
+      OrigCallInst &= (63 << 26) | 3;
+      // Fill in the new target.
+      OrigCallInst |= (Offset & ((1 << 24)-1)) << 2;
+      // Replace the call.
+      *OrigCallAddr = OrigCallInst;
+    }
+  }
+
+  // Assert that we are coming from a stub that was created with our
+  // emitFunctionStub.  Opcode 18 means the stub ended in a short 'bl', so the
+  // stub start is 3 setup instructions back; otherwise it must be the bctrl
+  // (opcode 19) ending the long-form stub (3 setup + 6 or 3 address words).
+  if ((*StubCallAddr >> 26) == 18)
+    StubCallAddr -= 3;
+  else {
+    assert((*StubCallAddr >> 26) == 19 && "Call in stub is not indirect!");
+    StubCallAddr -= is64Bit ? 9 : 6;
+  }
+
+  // Rewrite the stub with an unconditional branch to the target, for any users
+  // who took the address of the stub.
+  EmitBranchToAt((intptr_t)StubCallAddr, (intptr_t)Target, false, is64Bit);
+  sys::Memory::InvalidateInstructionCache(StubCallAddr, 7*4);
+
+  // Put the address of the target function to call and the address to return to
+  // after calling the target function in a place that is easy to get on the
+  // stack after we restore all regs.
+  return Target;
+}
+
+
+
+// getLazyResolverFunction - Record Fn as the compiler the compilation
+// callbacks should invoke, and return the callback matching the target's
+// word size.
+TargetJITInfo::LazyResolverFn
+PPCJITInfo::getLazyResolverFunction(JITCompilerFn Fn) {
+  JITCompilerFunction = Fn;
+  return is64Bit ? PPC64CompilationCallback : PPC32CompilationCallback;
+}
+
+// getStubLayout - Size/alignment reserved for stubs; must stay in sync with
+// what emitFunctionStub actually emits.
+TargetJITInfo::StubLayout PPCJITInfo::getStubLayout() {
+  // The stub contains up to 10 4-byte instructions, aligned at 4 bytes: 3
+  // instructions to save the caller's address if this is a lazy-compilation
+  // stub, plus a 1-, 4-, or 7-instruction sequence to load an arbitrary address
+  // into a register and jump through it.
+  StubLayout Result = {10*4, 4};
+  return Result;
+}
+
+#if (defined(__POWERPC__) || defined (__ppc__) || defined(_POWER)) && \
+defined(__APPLE__)
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+#endif
+
+// emitFunctionStub - Emit a stub for Fn into JCE's buffer and return its
+// address.  Known external targets get a plain branch (up to 7 words,
+// patched in by EmitBranchToAt); calls to the compilation callbacks get a
+// 3-instruction prologue that saves LR into the *caller's* frame so the
+// callback can find both return addresses, followed by the branch sequence.
+void *PPCJITInfo::emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE) {
+  // If this is just a call to an external function, emit a branch instead of a
+  // call.  The code is the same except for one bit of the last instruction.
+  if (Fn != (void*)(intptr_t)PPC32CompilationCallback && 
+      Fn != (void*)(intptr_t)PPC64CompilationCallback) {
+    void *Addr = (void*)JCE.getCurrentPCValue();
+    // Reserve 7 words, then patch the branch sequence over them.
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    JCE.emitWordBE(0);
+    EmitBranchToAt((intptr_t)Addr, (intptr_t)Fn, false, is64Bit);
+    sys::Memory::InvalidateInstructionCache(Addr, 7*4);
+    return Addr;
+  }
+
+  // Lazy-compilation stub.  The LR store offsets (96/40/36 past the new SP)
+  // land in the caller-frame LR save slot for each ABI, given the 80- or
+  // 32-byte stwu/stdu adjustment -- TODO confirm against the ABI documents.
+  void *Addr = (void*)JCE.getCurrentPCValue();
+  if (is64Bit) {
+    JCE.emitWordBE(0xf821ffb1);     // stdu r1,-80(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0xf9610060);     // std r11, 96(r1)
+  } else if (TM.getSubtargetImpl()->isDarwinABI()){
+    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0x91610028);     // stw r11, 40(r1)
+  } else {
+    JCE.emitWordBE(0x9421ffe0);     // stwu r1,-32(r1)
+    JCE.emitWordBE(0x7d6802a6);     // mflr r11
+    JCE.emitWordBE(0x91610024);     // stw r11, 36(r1)
+  }
+  intptr_t BranchAddr = (intptr_t)JCE.getCurrentPCValue();
+  // Reserve 7 words for the call sequence, then patch it in.
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  JCE.emitWordBE(0);
+  EmitBranchToAt(BranchAddr, (intptr_t)Fn, true, is64Bit);
+  sys::Memory::InvalidateInstructionCache(Addr, 10*4);
+  return Addr;
+}
+
+
+// relocate - Resolve NumRelocs relocations recorded against the emitted code
+// of Function, patching instruction words in place.  GOTBase is unused on
+// PPC (useGOT is 0).
+void PPCJITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    unsigned *RelocPos = (unsigned*)Function + MR->getMachineCodeOffset()/4;
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((PPC::RelocationType)MR->getRelocationType()) {
+    default: llvm_unreachable("Unknown relocation type!");
+    case PPC::reloc_pcrel_bx:
+      // PC-relative relocation for b and bl instructions.
+      // 24-bit signed word offset, shifted into bits 2-25.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 23) && ResultPtr < (1 << 23) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 24)-1))  << 2;
+      break;
+    case PPC::reloc_pcrel_bcx:
+      // PC-relative relocation for BLT,BLE,BEQ,BGE,BGT,BNE, or other
+      // bcx instructions.  Conditional branches only have a 14-bit field.
+      ResultPtr = (ResultPtr-(intptr_t)RelocPos) >> 2;
+      assert(ResultPtr >= -(1 << 13) && ResultPtr < (1 << 13) &&
+             "Relocation out of range!");
+      *RelocPos |= (ResultPtr & ((1 << 14)-1))  << 2;
+      break;
+    case PPC::reloc_absolute_high:     // high bits of ref -> low 16 of instr
+    case PPC::reloc_absolute_low: {    // low bits of ref  -> low 16 of instr
+      ResultPtr += MR->getConstantVal();
+
+      // If this is a high-part access, get the high-part.
+      if (MR->getRelocationType() == PPC::reloc_absolute_high) {
+        // If the low part will have a carry (really a borrow) from the low
+        // 16-bits into the high 16, add a bit to borrow from.
+        // The shifted-sign test checks bit 15 of ResultPtr.
+        if (((int)ResultPtr << 16) < 0)
+          ResultPtr += 1 << 16;
+        ResultPtr >>= 16;
+      }
+
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 65535;
+      unsigned HighBits = *RelocPos & ~65535;
+      *RelocPos = LowBits | HighBits;  // Slam into low 16-bits
+      break;
+    }
+    case PPC::reloc_absolute_low_ix: {  // low bits of ref  -> low 14 of instr
+      ResultPtr += MR->getConstantVal();
+      // Do the addition then mask, so the addition does not overflow the 16-bit
+      // immediate section of the instruction.
+      // The masks preserve the instruction's low 2 bits (0xFFFF0003), which
+      // are part of the opcode in these indexed/DS-style forms.
+      unsigned LowBits  = (*RelocPos + ResultPtr) & 0xFFFC;
+      unsigned HighBits = *RelocPos & 0xFFFF0003;
+      *RelocPos = LowBits | HighBits;  // Slam into low 14-bits.
+      break;
+    }
+    }
+  }
+}
+
+// replaceMachineCodeForFunction - Redirect Old so any call into it jumps to
+// New, by overwriting its first instructions with an unconditional branch
+// (up to 7 words; see EmitBranchToAt) and flushing the patched range from
+// the instruction cache.
+void PPCJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  EmitBranchToAt((intptr_t)Old, (intptr_t)New, false, is64Bit);
+  sys::Memory::InvalidateInstructionCache(Old, 7*4);
+}
diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h
new file mode 100644
index 0000000..47ead59
--- /dev/null
+++ b/lib/Target/PowerPC/PPCJITInfo.h
@@ -0,0 +1,49 @@
+//===- PPCJITInfo.h - PowerPC impl. of the JIT interface --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC_JITINFO_H
+#define POWERPC_JITINFO_H
+
+#include "llvm/Target/TargetJITInfo.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+
+namespace llvm {
+  class PPCTargetMachine;
+
+  /// PPCJITInfo - PowerPC implementation of the TargetJITInfo hooks used by
+  /// the JIT: stub emission, lazy resolution, relocation, and code patching.
+  class PPCJITInfo : public TargetJITInfo {
+  protected:
+    PPCTargetMachine &TM;
+    bool is64Bit;
+  public:
+    // Members are initialized in the ctor-initializer list; useGOT is
+    // inherited from TargetJITInfo and so is assigned in the body.
+    PPCJITInfo(PPCTargetMachine &tm, bool tmIs64Bit)
+      : TM(tm), is64Bit(tmIs64Bit) {
+      useGOT = 0;
+    }
+
+    /// getStubLayout - Size and alignment reserved for any stub that
+    /// emitFunctionStub may emit.
+    virtual StubLayout getStubLayout();
+
+    /// emitFunctionStub - Emit a (lazy-compilation or direct-branch) stub
+    /// for Fn into JCE's buffer and return its address.
+    virtual void *emitFunctionStub(const Function* F, void *Fn,
+                                   JITCodeEmitter &JCE);
+
+    /// getLazyResolverFunction - Record the JIT compiler entry point and
+    /// return the compilation callback matching the target's word size.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Resolve the relocations recorded for Function's code.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+  };
+
+#endif
diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
new file mode 100644
index 0000000..b37aee8
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMCAsmInfo.cpp
@@ -0,0 +1,59 @@
+//===-- PPCMCAsmInfo.cpp - PPC asm properties -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the MCAsmInfoDarwin properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCMCAsmInfo.h"
+using namespace llvm;
+
+// Darwin (Mac OS X) assembler properties; base defaults come from
+// MCAsmInfoDarwin, only PPC-specific overrides are set here.
+PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit) {
+  PCSymbol = ".";
+  CommentString = ";";
+  ExceptionsType = ExceptionHandling::Dwarf;
+
+  if (!is64Bit)
+    Data64bitsDirective = 0;      // We can't emit a 64-bit unit in PPC32 mode.
+  AssemblerDialect = 1;           // New-Style mnemonics.
+  SupportsDebugInformation= true; // Debug information.
+}
+
+// ELF (Linux/SVR4) assembler properties for PowerPC.
+PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
+  // ".comm align is in bytes but .align is pow-2."
+  AlignmentIsInBytes = false;
+
+  CommentString = "#";
+  GlobalPrefix = "";
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  
+  // Uses '.section' before '.bss' directive
+  UsesELFSectionDirectiveForBSS = true;  
+
+  // Debug Information
+  AbsoluteDebugSectionOffsets = true;
+  SupportsDebugInformation = true;
+
+  PCSymbol = ".";
+
+  // Set up DWARF directives
+  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+
+  // Exceptions handling
+  // DWARF EH is only enabled for 32-bit ELF here; 64-bit keeps the base
+  // class's default ExceptionsType.
+  if (!is64Bit)
+    ExceptionsType = ExceptionHandling::Dwarf;
+  AbsoluteEHSectionOffsets = false;
+    
+  ZeroDirective = "\t.space\t";
+  Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
+  HasLCOMMDirective = true;
+  AssemblerDialect = 0;           // Old-Style mnemonics.
+}
+
diff --git a/lib/Target/PowerPC/PPCMCAsmInfo.h b/lib/Target/PowerPC/PPCMCAsmInfo.h
new file mode 100644
index 0000000..96ae6fb
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMCAsmInfo.h
@@ -0,0 +1,31 @@
+//=====-- PPCMCAsmInfo.h - PPC asm properties -----------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MCAsmInfoDarwin class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPCTARGETASMINFO_H
+#define PPCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+
+  /// PPCMCAsmInfoDarwin - Assembler properties for Darwin (Mac OS X) PowerPC.
+  struct PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
+    explicit PPCMCAsmInfoDarwin(bool is64Bit);
+  };
+
+  /// PPCLinuxMCAsmInfo - Assembler properties for ELF (Linux/SVR4) PowerPC.
+  /// NOTE(review): despite the header's include list, this derives from plain
+  /// MCAsmInfo, pulled in transitively via MCAsmInfoDarwin.h.
+  struct PPCLinuxMCAsmInfo : public MCAsmInfo {
+    explicit PPCLinuxMCAsmInfo(bool is64Bit);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
new file mode 100644
index 0000000..b359dd3
--- /dev/null
+++ b/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -0,0 +1,104 @@
+//===-- PPCMachineFunctionInfo.h - Private data used for PowerPC --*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of MachineFunctionInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_MACHINE_FUNCTION_INFO_H
+#define PPC_MACHINE_FUNCTION_INFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// PPCFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private PowerPC target-specific information for each
+/// MachineFunction.
+class PPCFunctionInfo : public MachineFunctionInfo {
+private:
+  /// FramePointerSaveIndex - Frame index of where the old frame pointer is
+  /// stored.  Also used as an anchor for instructions that need to be altered
+  /// when using frame pointers (dyna_add, dyna_sub.)
+  int FramePointerSaveIndex;
+
+  /// ReturnAddrSaveIndex - Frame index of where the return address is stored.
+  ///
+  int ReturnAddrSaveIndex;
+
+  /// MustSaveLR - Indicates whether LR is defined (or clobbered) in the current
+  /// function.  This is only valid after the initial scan of the function by
+  /// PEI.
+  bool MustSaveLR;
+
+  /// SpillsCR - Indicates whether CR is spilled in the current function.
+  bool SpillsCR;
+
+  /// LRStoreRequired - The bool indicates whether there is some explicit use of
+  /// the LR/LR8 stack slot that is not obvious from scanning the code.  This
+  /// requires that the code generator produce a store of LR to the stack on
+  /// entry, even though LR may otherwise apparently not be used.
+  bool LRStoreRequired;
+
+  /// MinReservedArea - This is the frame size that is at least reserved in a
+  /// potential caller (parameter+linkage area).
+  unsigned MinReservedArea;
+
+  /// TailCallSPDelta - Stack pointer delta used when tail calling. Maximum
+  /// amount the stack pointer is adjusted to make the frame bigger for tail
+  /// calls. Used for creating an area before the register spill area.
+  int TailCallSPDelta;
+
+  /// HasFastCall - Does this function contain a fast call. Used to determine
+  /// how the caller's stack pointer should be calculated (epilog/dynamicalloc).
+  bool HasFastCall;
+
+public:
+  explicit PPCFunctionInfo(MachineFunction &MF)
+    : FramePointerSaveIndex(0),
+      ReturnAddrSaveIndex(0),
+      // MustSaveLR is only meaningful after PEI's initial scan, but it is
+      // initialized here so the member is never read uninitialized.
+      MustSaveLR(false),
+      SpillsCR(false),
+      LRStoreRequired(false),
+      MinReservedArea(0),
+      TailCallSPDelta(0),
+      HasFastCall(false) {}
+
+  int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
+  void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
+
+  int getReturnAddrSaveIndex() const { return ReturnAddrSaveIndex; }
+  void setReturnAddrSaveIndex(int idx) { ReturnAddrSaveIndex = idx; }
+
+  unsigned getMinReservedArea() const { return MinReservedArea; }
+  void setMinReservedArea(unsigned size) { MinReservedArea = size; }
+
+  int getTailCallSPDelta() const { return TailCallSPDelta; }
+  void setTailCallSPDelta(int size) { TailCallSPDelta = size; }
+
+  /// MustSaveLR - This is set when the prolog/epilog inserter does its initial
+  /// scan of the function. It is true if the LR/LR8 register is ever explicitly
+  /// defined/clobbered in the machine function (e.g. by calls and movpctolr,
+  /// which is used in PIC generation), or if the LR stack slot is explicitly
+  /// referenced by builtin_return_address.
+  void setMustSaveLR(bool U) { MustSaveLR = U; }
+  bool mustSaveLR() const    { return MustSaveLR; }
+
+  void setSpillsCR()       { SpillsCR = true; }
+  bool isCRSpilled() const { return SpillsCR; }
+
+  void setLRStoreRequired() { LRStoreRequired = true; }
+  bool isLRStoreRequired() const { return LRStoreRequired; }
+
+  void setHasFastCall() { HasFastCall = true; }
+  bool hasFastCall() const { return HasFastCall;}
+};
+
+} // end of namespace llvm
+
+
+#endif
diff --git a/lib/Target/PowerPC/PPCPerfectShuffle.h b/lib/Target/PowerPC/PPCPerfectShuffle.h
new file mode 100644
index 0000000..3164e33
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPerfectShuffle.h
@@ -0,0 +1,6586 @@
+//===-- PPCPerfectShuffle.h - Altivec Perfect Shuffle Table ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file, which was autogenerated by llvm-PerfectShuffle, contains data
+// for the optimal way to build a perfect shuffle without using vperm.
+//
+//===----------------------------------------------------------------------===//
+
+// 31 entries have cost 0
+// 292 entries have cost 1
+// 1384 entries have cost 2
+// 3061 entries have cost 3
+// 1733 entries have cost 4
+// 60 entries have cost 5
+
+// This table is 6561*4 = 26244 bytes in size.
+static const unsigned PerfectShuffleTable[6561+1] = {
+  202162278U,	// <0,0,0,0>: Cost 1 vspltisw0 LHS
+  1140850790U,	// <0,0,0,1>: Cost 2 vmrghw <0,0,0,0>, LHS
+  2617247181U,	// <0,0,0,2>: Cost 3 vsldoi4 <0,0,0,0>, <2,0,3,0>
+  2635163787U,	// <0,0,0,3>: Cost 3 vsldoi4 <3,0,0,0>, <3,0,0,0>
+  1543507254U,	// <0,0,0,4>: Cost 2 vsldoi4 <0,0,0,0>, RHS
+  2281701705U,	// <0,0,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,0,5>
+  2617250133U,	// <0,0,0,6>: Cost 3 vsldoi4 <0,0,0,0>, <6,0,7,0>
+  2659054575U,	// <0,0,0,7>: Cost 3 vsldoi4 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,0,u>: Cost 1 vspltisw0 LHS
+  1141686282U,	// <0,0,1,0>: Cost 2 vmrghw LHS, <0,0,1,1>
+  67944550U,	// <0,0,1,1>: Cost 1 vmrghw LHS, LHS
+  1685241958U,	// <0,0,1,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2215870716U,	// <0,0,1,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1141727570U,	// <0,0,1,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  2215428562U,	// <0,0,1,5>: Cost 3 vmrghw LHS, <0,5,6,7>
+  2215428589U,	// <0,0,1,6>: Cost 3 vmrghw LHS, <0,6,0,7>
+  2659062768U,	// <0,0,1,7>: Cost 3 vsldoi4 <7,0,0,1>, <7,0,0,1>
+  67945117U,	// <0,0,1,u>: Cost 1 vmrghw LHS, LHS
+  2684356045U,	// <0,0,2,0>: Cost 3 vsldoi8 <0,0,0,0>, <2,0,3,0>
+  2216009830U,	// <0,0,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2216009901U,	// <0,0,2,2>: Cost 3 vmrghw <0,2,1,2>, <0,2,1,2>
+  2698290853U,	// <0,0,2,3>: Cost 3 vsldoi8 <2,3,0,0>, <2,3,0,0>
+  3289751890U,	// <0,0,2,4>: Cost 4 vmrghw <0,2,1,2>, <0,4,1,5>
+  3758098275U,	// <0,0,2,5>: Cost 4 vsldoi8 <0,0,0,0>, <2,5,3,1>
+  2684356538U,	// <0,0,2,6>: Cost 3 vsldoi8 <0,0,0,0>, <2,6,3,7>
+  3758098410U,	// <0,0,2,7>: Cost 4 vsldoi8 <0,0,0,0>, <2,7,0,1>
+  2216010397U,	// <0,0,2,u>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2702272651U,	// <0,0,3,0>: Cost 3 vsldoi8 <3,0,0,0>, <3,0,0,0>
+  2216656998U,	// <0,0,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  3844669704U,	// <0,0,3,2>: Cost 4 vsldoi12 <3,2,3,0>, <0,3,2,3>
+  2216657148U,	// <0,0,3,3>: Cost 3 vmrghw <0,3,1,0>, <0,3,1,0>
+  2684357122U,	// <0,0,3,4>: Cost 3 vsldoi8 <0,0,0,0>, <3,4,5,6>
+  3732820066U,	// <0,0,3,5>: Cost 4 vsldoi4 <7,0,0,3>, <5,6,7,0>
+  3778005624U,	// <0,0,3,6>: Cost 4 vsldoi8 <3,3,0,0>, <3,6,0,7>
+  3374713464U,	// <0,0,3,7>: Cost 4 vmrglw <3,2,0,3>, <3,6,0,7>
+  2216657565U,	// <0,0,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217361408U,	// <0,0,4,0>: Cost 3 vmrghw <0,4,1,5>, <0,0,0,0>
+  1143619686U,	// <0,0,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  3291103405U,	// <0,0,4,2>: Cost 4 vmrghw <0,4,1,5>, <0,2,1,2>
+  3827269988U,	// <0,0,4,3>: Cost 4 vsldoi12 <0,3,1,0>, <0,4,3,5>
+  1143619922U,	// <0,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1610616118U,	// <0,0,4,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  3758099833U,	// <0,0,4,6>: Cost 4 vsldoi8 <0,0,0,0>, <4,6,5,2>
+  3854107016U,	// <0,0,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <0,4,7,5>
+  1143620253U,	// <0,0,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2284396544U,	// <0,0,5,0>: Cost 3 vmrglw <0,4,0,5>, <0,0,0,0>
+  2218025062U,	// <0,0,5,1>: Cost 3 vmrghw <0,5,1,5>, LHS
+  3758100203U,	// <0,0,5,2>: Cost 4 vsldoi8 <0,0,0,0>, <5,2,1,3>
+  3395966100U,	// <0,0,5,3>: Cost 4 vmrglw <6,7,0,5>, <7,2,0,3>
+  3804549052U,	// <0,0,5,4>: Cost 4 vsldoi8 <7,7,0,0>, <5,4,6,5>
+  2302314964U,	// <0,0,5,5>: Cost 3 vmrglw <3,4,0,5>, <3,4,0,5>
+  2785821138U,	// <0,0,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  3395966428U,	// <0,0,5,7>: Cost 4 vmrglw <6,7,0,5>, <7,6,0,7>
+  2787148260U,	// <0,0,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <0,5,u,7>
+  2684358997U,	// <0,0,6,0>: Cost 3 vsldoi8 <0,0,0,0>, <6,0,7,0>
+  2218631270U,	// <0,0,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2684359162U,	// <0,0,6,2>: Cost 3 vsldoi8 <0,0,0,0>, <6,2,7,3>
+  3758101042U,	// <0,0,6,3>: Cost 4 vsldoi8 <0,0,0,0>, <6,3,4,5>
+  3732843830U,	// <0,0,6,4>: Cost 4 vsldoi4 <7,0,0,6>, RHS
+  3758101227U,	// <0,0,6,5>: Cost 4 vsldoi8 <0,0,0,0>, <6,5,7,1>
+  2684359480U,	// <0,0,6,6>: Cost 3 vsldoi8 <0,0,0,0>, <6,6,6,6>
+  2724836173U,	// <0,0,6,7>: Cost 3 vsldoi8 <6,7,0,0>, <6,7,0,0>
+  2725499806U,	// <0,0,6,u>: Cost 3 vsldoi8 <6,u,0,0>, <6,u,0,0>
+  2726163439U,	// <0,0,7,0>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  2219311206U,	// <0,0,7,1>: Cost 3 vmrghw <0,7,1,0>, LHS
+  3868557900U,	// <0,0,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <0,7,2,3>
+  3377400112U,	// <0,0,7,3>: Cost 4 vmrglw <3,6,0,7>, <3,2,0,3>
+  2684360038U,	// <0,0,7,4>: Cost 3 vsldoi8 <0,0,0,0>, <7,4,5,6>
+  3732852834U,	// <0,0,7,5>: Cost 4 vsldoi4 <7,0,0,7>, <5,6,7,0>
+  3871507060U,	// <0,0,7,6>: Cost 4 vsldoi12 <7,6,7,0>, <0,7,6,7>
+  2303658616U,	// <0,0,7,7>: Cost 3 vmrglw <3,6,0,7>, <3,6,0,7>
+  2726163439U,	// <0,0,7,u>: Cost 3 vsldoi8 <7,0,0,0>, <7,0,0,0>
+  202162278U,	// <0,0,u,0>: Cost 1 vspltisw0 LHS
+  72589414U,	// <0,0,u,1>: Cost 1 vmrghw LHS, LHS
+  1685242525U,	// <0,0,u,2>: Cost 2 vsldoi12 <1,2,3,0>, LHS
+  2220073212U,	// <0,0,u,3>: Cost 3 vmrghw LHS, <0,3,1,0>
+  1146331474U,	// <0,0,u,4>: Cost 2 vmrghw LHS, <0,4,1,5>
+  1610619034U,	// <0,0,u,5>: Cost 2 vsldoi8 <0,0,0,0>, RHS
+  2785821138U,	// <0,0,u,6>: Cost 3 vsldoi12 <5,6,7,0>, <0,5,6,7>
+  2659120119U,	// <0,0,u,7>: Cost 3 vsldoi4 <7,0,0,u>, <7,0,0,u>
+  72589981U,	// <0,0,u,u>: Cost 1 vmrghw LHS, LHS
+  2698297344U,	// <0,1,0,0>: Cost 3 vsldoi8 <2,3,0,1>, <0,0,0,0>
+  1624555622U,	// <0,1,0,1>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  2758984428U,	// <0,1,0,2>: Cost 3 vsldoi12 <1,2,3,0>, <1,0,2,1>
+  2635237524U,	// <0,1,0,3>: Cost 3 vsldoi4 <3,0,1,0>, <3,0,1,0>
+  2693652818U,	// <0,1,0,4>: Cost 3 vsldoi8 <1,5,0,1>, <0,4,1,5>
+  2281701714U,	// <0,1,0,5>: Cost 3 vmrglw <0,0,0,0>, <0,4,1,5>
+  2698297846U,	// <0,1,0,6>: Cost 3 vsldoi8 <2,3,0,1>, <0,6,1,7>
+  2659128312U,	// <0,1,0,7>: Cost 3 vsldoi4 <7,0,1,0>, <7,0,1,0>
+  1624556189U,	// <0,1,0,u>: Cost 2 vsldoi8 <2,3,0,1>, LHS
+  1543585802U,	// <0,1,1,0>: Cost 2 vsldoi4 <0,0,1,1>, <0,0,1,1>
+  1141728052U,	// <0,1,1,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1141728150U,	// <0,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2295644334U,	// <0,1,1,3>: Cost 3 vmrglw <2,3,0,1>, <0,2,1,3>
+  1543589174U,	// <0,1,1,4>: Cost 2 vsldoi4 <0,0,1,1>, RHS
+  2290999634U,	// <0,1,1,5>: Cost 3 vmrglw <1,5,0,1>, <0,4,1,5>
+  2617332135U,	// <0,1,1,6>: Cost 3 vsldoi4 <0,0,1,1>, <6,1,7,1>
+  2617332720U,	// <0,1,1,7>: Cost 3 vsldoi4 <0,0,1,1>, <7,0,0,1>
+  1142171004U,	// <0,1,1,u>: Cost 2 vmrghw LHS, <1,u,3,0>
+  1561509990U,	// <0,1,2,0>: Cost 2 vsldoi4 <3,0,1,2>, LHS
+  2623308516U,	// <0,1,2,1>: Cost 3 vsldoi4 <1,0,1,2>, <1,0,1,2>
+  2698298984U,	// <0,1,2,2>: Cost 3 vsldoi8 <2,3,0,1>, <2,2,2,2>
+  835584U,	// <0,1,2,3>: Cost 0 copy LHS
+  1561513270U,	// <0,1,2,4>: Cost 2 vsldoi4 <3,0,1,2>, RHS
+  2647199304U,	// <0,1,2,5>: Cost 3 vsldoi4 <5,0,1,2>, <5,0,1,2>
+  2698299322U,	// <0,1,2,6>: Cost 3 vsldoi8 <2,3,0,1>, <2,6,3,7>
+  1585402874U,	// <0,1,2,7>: Cost 2 vsldoi4 <7,0,1,2>, <7,0,1,2>
+  835584U,	// <0,1,2,u>: Cost 0 copy LHS
+  2698299540U,	// <0,1,3,0>: Cost 3 vsldoi8 <2,3,0,1>, <3,0,1,0>
+  3290399540U,	// <0,1,3,1>: Cost 4 vmrghw <0,3,1,0>, <1,1,1,1>
+  2698299720U,	// <0,1,3,2>: Cost 3 vsldoi8 <2,3,0,1>, <3,2,3,0>
+  2698299804U,	// <0,1,3,3>: Cost 3 vsldoi8 <2,3,0,1>, <3,3,3,3>
+  2698299906U,	// <0,1,3,4>: Cost 3 vsldoi8 <2,3,0,1>, <3,4,5,6>
+  3832726521U,	// <0,1,3,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,3,5,0>
+  2724842160U,	// <0,1,3,6>: Cost 3 vsldoi8 <6,7,0,1>, <3,6,7,0>
+  2706926275U,	// <0,1,3,7>: Cost 3 vsldoi8 <3,7,0,1>, <3,7,0,1>
+  2698300190U,	// <0,1,3,u>: Cost 3 vsldoi8 <2,3,0,1>, <3,u,1,2>
+  2635268198U,	// <0,1,4,0>: Cost 3 vsldoi4 <3,0,1,4>, LHS
+  2217362228U,	// <0,1,4,1>: Cost 3 vmrghw <0,4,1,5>, <1,1,1,1>
+  2217362326U,	// <0,1,4,2>: Cost 3 vmrghw <0,4,1,5>, <1,2,3,0>
+  2635270296U,	// <0,1,4,3>: Cost 3 vsldoi4 <3,0,1,4>, <3,0,1,4>
+  2635271478U,	// <0,1,4,4>: Cost 3 vsldoi4 <3,0,1,4>, RHS
+  1624558902U,	// <0,1,4,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2659160910U,	// <0,1,4,6>: Cost 3 vsldoi4 <7,0,1,4>, <6,7,0,1>
+  2659161084U,	// <0,1,4,7>: Cost 3 vsldoi4 <7,0,1,4>, <7,0,1,4>
+  1624559145U,	// <0,1,4,u>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  3832726639U,	// <0,1,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,0,1>
+  2714889871U,	// <0,1,5,1>: Cost 3 vsldoi8 <5,1,0,1>, <5,1,0,1>
+  2302314646U,	// <0,1,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  3834717321U,	// <0,1,5,3>: Cost 4 vsldoi12 <1,5,3,0>, <1,5,3,0>
+  3832726679U,	// <0,1,5,4>: Cost 4 vsldoi12 <1,2,3,0>, <1,5,4,5>
+  2717544403U,	// <0,1,5,5>: Cost 3 vsldoi8 <5,5,0,1>, <5,5,0,1>
+  2718208036U,	// <0,1,5,6>: Cost 3 vsldoi8 <5,6,0,1>, <5,6,0,1>
+  3792613493U,	// <0,1,5,7>: Cost 4 vsldoi8 <5,7,0,1>, <5,7,0,1>
+  2719535302U,	// <0,1,5,u>: Cost 3 vsldoi8 <5,u,0,1>, <5,u,0,1>
+  2659172454U,	// <0,1,6,0>: Cost 3 vsldoi4 <7,0,1,6>, LHS
+  3832726735U,	// <0,1,6,1>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,1,7>
+  2724844026U,	// <0,1,6,2>: Cost 3 vsldoi8 <6,7,0,1>, <6,2,7,3>
+  3775361608U,	// <0,1,6,3>: Cost 4 vsldoi8 <2,u,0,1>, <6,3,7,0>
+  2659175734U,	// <0,1,6,4>: Cost 3 vsldoi4 <7,0,1,6>, RHS
+  3832726771U,	// <0,1,6,5>: Cost 4 vsldoi12 <1,2,3,0>, <1,6,5,7>
+  2724844344U,	// <0,1,6,6>: Cost 3 vsldoi8 <6,7,0,1>, <6,6,6,6>
+  1651102542U,	// <0,1,6,7>: Cost 2 vsldoi8 <6,7,0,1>, <6,7,0,1>
+  1651766175U,	// <0,1,6,u>: Cost 2 vsldoi8 <6,u,0,1>, <6,u,0,1>
+  2724844536U,	// <0,1,7,0>: Cost 3 vsldoi8 <6,7,0,1>, <7,0,1,0>
+  3377397770U,	// <0,1,7,1>: Cost 4 vmrglw <3,6,0,7>, <0,0,1,1>
+  2698302636U,	// <0,1,7,2>: Cost 3 vsldoi8 <2,3,0,1>, <7,2,3,0>
+  2728162531U,	// <0,1,7,3>: Cost 3 vsldoi8 <7,3,0,1>, <7,3,0,1>
+  2724844902U,	// <0,1,7,4>: Cost 3 vsldoi8 <6,7,0,1>, <7,4,5,6>
+  3377398098U,	// <0,1,7,5>: Cost 4 vmrglw <3,6,0,7>, <0,4,1,5>
+  2724845076U,	// <0,1,7,6>: Cost 3 vsldoi8 <6,7,0,1>, <7,6,7,0>
+  2724845164U,	// <0,1,7,7>: Cost 3 vsldoi8 <6,7,0,1>, <7,7,7,7>
+  2724845186U,	// <0,1,7,u>: Cost 3 vsldoi8 <6,7,0,1>, <7,u,1,2>
+  1561559142U,	// <0,1,u,0>: Cost 2 vsldoi4 <3,0,1,u>, LHS
+  1146331956U,	// <0,1,u,1>: Cost 2 vmrghw LHS, <1,1,1,1>
+  1146332054U,	// <0,1,u,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  835584U,	// <0,1,u,3>: Cost 0 copy LHS
+  1561562422U,	// <0,1,u,4>: Cost 2 vsldoi4 <3,0,1,u>, RHS
+  1624561818U,	// <0,1,u,5>: Cost 2 vsldoi8 <2,3,0,1>, RHS
+  2220074191U,	// <0,1,u,6>: Cost 3 vmrghw LHS, <1,6,1,7>
+  1585452032U,	// <0,1,u,7>: Cost 2 vsldoi4 <7,0,1,u>, <7,0,1,u>
+  835584U,	// <0,1,u,u>: Cost 0 copy LHS
+  2214593997U,	// <0,2,0,0>: Cost 3 vmrghw <0,0,0,0>, <2,0,3,0>
+  2214675999U,	// <0,2,0,1>: Cost 3 vmrghw <0,0,1,1>, <2,1,3,1>
+  2214594152U,	// <0,2,0,2>: Cost 3 vmrghw <0,0,0,0>, <2,2,2,2>
+  1207959654U,	// <0,2,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  3709054262U,	// <0,2,0,4>: Cost 4 vsldoi4 <3,0,2,0>, RHS
+  3375350836U,	// <0,2,0,5>: Cost 4 vmrglw <3,3,0,0>, <1,4,2,5>
+  2214594490U,	// <0,2,0,6>: Cost 3 vmrghw <0,0,0,0>, <2,6,3,7>
+  3288336362U,	// <0,2,0,7>: Cost 4 vmrghw <0,0,0,0>, <2,7,0,1>
+  1207959659U,	// <0,2,0,u>: Cost 2 vmrglw <0,0,0,0>, LHS
+  2215871994U,	// <0,2,1,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2215470623U,	// <0,2,1,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1141728872U,	// <0,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1141728934U,	// <0,2,1,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2215872323U,	// <0,2,1,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2215872405U,	// <0,2,1,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1141729210U,	// <0,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2215430122U,	// <0,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1141729368U,	// <0,2,1,u>: Cost 2 vmrghw LHS, <2,u,3,3>
+  3289736698U,	// <0,2,2,0>: Cost 4 vmrghw <0,2,1,0>, <2,0,u,0>
+  3289744927U,	// <0,2,2,1>: Cost 4 vmrghw <0,2,1,1>, <2,1,3,1>
+  2216011368U,	// <0,2,2,2>: Cost 3 vmrghw <0,2,1,2>, <2,2,2,2>
+  2216019622U,	// <0,2,2,3>: Cost 3 vmrghw <0,2,1,3>, <2,3,0,1>
+  3289769795U,	// <0,2,2,4>: Cost 4 vmrghw <0,2,1,4>, <2,4,u,5>
+  3289778069U,	// <0,2,2,5>: Cost 4 vmrghw <0,2,1,5>, <2,5,u,6>
+  2216044474U,	// <0,2,2,6>: Cost 3 vmrghw <0,2,1,6>, <2,6,3,7>
+  3732960259U,	// <0,2,2,7>: Cost 4 vsldoi4 <7,0,2,2>, <7,0,2,2>
+  2216061016U,	// <0,2,2,u>: Cost 3 vmrghw <0,2,1,u>, <2,u,3,3>
+  2758985382U,	// <0,2,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,0,1>
+  2758985392U,	// <0,2,3,1>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,1,2>
+  3290400360U,	// <0,2,3,2>: Cost 4 vmrghw <0,3,1,0>, <2,2,2,2>
+  2758985408U,	// <0,2,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,3,0>
+  2758985422U,	// <0,2,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,4,5>
+  2785822424U,	// <0,2,3,5>: Cost 3 vsldoi12 <5,6,7,0>, <2,3,5,6>
+  3290400698U,	// <0,2,3,6>: Cost 4 vmrghw <0,3,1,0>, <2,6,3,7>
+  2765915876U,	// <0,2,3,7>: Cost 3 vsldoi12 <2,3,7,0>, <2,3,7,0>
+  2758985453U,	// <0,2,3,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,3,u,0>
+  3291104762U,	// <0,2,4,0>: Cost 4 vmrghw <0,4,1,5>, <2,0,u,0>
+  2217362979U,	// <0,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2217363048U,	// <0,2,4,2>: Cost 3 vmrghw <0,4,1,5>, <2,2,2,2>
+  2217363110U,	// <0,2,4,3>: Cost 3 vmrghw <0,4,1,5>, <2,3,0,1>
+  3291105087U,	// <0,2,4,4>: Cost 4 vmrghw <0,4,1,5>, <2,4,u,1>
+  3291105173U,	// <0,2,4,5>: Cost 4 vmrghw <0,4,1,5>, <2,5,u,6>
+  2217363386U,	// <0,2,4,6>: Cost 3 vmrghw <0,4,1,5>, <2,6,3,7>
+  3788639688U,	// <0,2,4,7>: Cost 4 vsldoi8 <5,1,0,2>, <4,7,5,0>
+  2217363515U,	// <0,2,4,u>: Cost 3 vmrghw <0,4,1,5>, <2,u,0,1>
+  3376054371U,	// <0,2,5,0>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,0>
+  3788639888U,	// <0,2,5,1>: Cost 4 vsldoi8 <5,1,0,2>, <5,1,0,2>
+  3376055912U,	// <0,2,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,2,2,2>
+  2302312550U,	// <0,2,5,3>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3376054375U,	// <0,2,5,4>: Cost 4 vmrglw <3,4,0,5>, <0,1,2,4>
+  3374728244U,	// <0,2,5,5>: Cost 4 vmrglw <3,2,0,5>, <1,4,2,5>
+  3805229154U,	// <0,2,5,6>: Cost 4 vsldoi8 <7,u,0,2>, <5,6,7,0>
+  3376055512U,	// <0,2,5,7>: Cost 4 vmrglw <3,4,0,5>, <1,6,2,7>
+  2302312555U,	// <0,2,5,u>: Cost 3 vmrglw <3,4,0,5>, LHS
+  3709100134U,	// <0,2,6,0>: Cost 4 vsldoi4 <3,0,2,6>, LHS
+  3709100950U,	// <0,2,6,1>: Cost 4 vsldoi4 <3,0,2,6>, <1,2,3,0>
+  3709102010U,	// <0,2,6,2>: Cost 4 vsldoi4 <3,0,2,6>, <2,6,3,7>
+  2758985658U,	// <0,2,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,3,7>
+  3709103414U,	// <0,2,6,4>: Cost 4 vsldoi4 <3,0,2,6>, RHS
+  3732992098U,	// <0,2,6,5>: Cost 4 vsldoi4 <7,0,2,6>, <5,6,7,0>
+  3292374970U,	// <0,2,6,6>: Cost 4 vmrghw <0,6,0,7>, <2,6,3,7>
+  3798594383U,	// <0,2,6,7>: Cost 4 vsldoi8 <6,7,0,2>, <6,7,0,2>
+  2758985703U,	// <0,2,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <2,6,u,7>
+  3788641274U,	// <0,2,7,0>: Cost 4 vsldoi8 <5,1,0,2>, <7,0,1,2>
+  3377398508U,	// <0,2,7,1>: Cost 4 vmrglw <3,6,0,7>, <1,0,2,1>
+  3377398590U,	// <0,2,7,2>: Cost 4 vmrglw <3,6,0,7>, <1,1,2,2>
+  2303656038U,	// <0,2,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  3709111606U,	// <0,2,7,4>: Cost 4 vsldoi4 <3,0,2,7>, RHS
+  3377398836U,	// <0,2,7,5>: Cost 4 vmrglw <3,6,0,7>, <1,4,2,5>
+  3803903447U,	// <0,2,7,6>: Cost 4 vsldoi8 <7,6,0,2>, <7,6,0,2>
+  3293054954U,	// <0,2,7,7>: Cost 4 vmrghw <0,7,1,0>, <2,7,0,1>
+  2303656043U,	// <0,2,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2220074490U,	// <0,2,u,0>: Cost 3 vmrghw LHS, <2,0,u,0>
+  2220074527U,	// <0,2,u,1>: Cost 3 vmrghw LHS, <2,1,3,1>
+  1146332776U,	// <0,2,u,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1146332838U,	// <0,2,u,3>: Cost 2 vmrghw LHS, <2,3,0,1>
+  2220074819U,	// <0,2,u,4>: Cost 3 vmrghw LHS, <2,4,u,5>
+  2220074901U,	// <0,2,u,5>: Cost 3 vmrghw LHS, <2,5,u,6>
+  1146333114U,	// <0,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2220074986U,	// <0,2,u,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1146333243U,	// <0,2,u,u>: Cost 2 vmrghw LHS, <2,u,0,1>
+  2629410816U,	// <0,3,0,0>: Cost 3 vsldoi4 <2,0,3,0>, <0,0,0,0>
+  2753530006U,	// <0,3,0,1>: Cost 3 vsldoi12 <0,3,1,0>, <3,0,1,2>
+  2629412301U,	// <0,3,0,2>: Cost 3 vsldoi4 <2,0,3,0>, <2,0,3,0>
+  2214594972U,	// <0,3,0,3>: Cost 3 vmrghw <0,0,0,0>, <3,3,3,3>
+  2758985908U,	// <0,3,0,4>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,4,5>
+  3733016674U,	// <0,3,0,5>: Cost 4 vsldoi4 <7,0,3,0>, <5,6,7,0>
+  3777364488U,	// <0,3,0,6>: Cost 4 vsldoi8 <3,2,0,3>, <0,6,3,7>
+  2281703354U,	// <0,3,0,7>: Cost 3 vmrglw <0,0,0,0>, <2,6,3,7>
+  2758985941U,	// <0,3,0,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,0,u,2>
+  1141729430U,	// <0,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2215471334U,	// <0,3,1,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2215471425U,	// <0,3,1,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1141729692U,	// <0,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1141729794U,	// <0,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2215430738U,	// <0,3,1,5>: Cost 3 vmrghw LHS, <3,5,5,5>
+  2215430776U,	// <0,3,1,6>: Cost 3 vmrghw LHS, <3,6,0,7>
+  2295646138U,	// <0,3,1,7>: Cost 3 vmrglw <2,3,0,1>, <2,6,3,7>
+  1141730078U,	// <0,3,1,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2758986032U,	// <0,3,2,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,2,0,3>
+  3709141910U,	// <0,3,2,1>: Cost 4 vsldoi4 <3,0,3,2>, <1,2,3,0>
+  3289753921U,	// <0,3,2,2>: Cost 4 vmrghw <0,2,1,2>, <3,2,2,2>
+  2770929992U,	// <0,3,2,3>: Cost 3 vsldoi12 <3,2,3,0>, <3,2,3,0>
+  3289754114U,	// <0,3,2,4>: Cost 4 vmrghw <0,2,1,2>, <3,4,5,6>
+  3362095460U,	// <0,3,2,5>: Cost 5 vmrglw <1,1,0,2>, <0,4,3,5>
+  3832727910U,	// <0,3,2,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,2,6,3>
+  3365414842U,	// <0,3,2,7>: Cost 4 vmrglw <1,6,0,2>, <2,6,3,7>
+  2771298677U,	// <0,3,2,u>: Cost 3 vsldoi12 <3,2,u,0>, <3,2,u,0>
+  2216659094U,	// <0,3,3,0>: Cost 3 vmrghw <0,3,1,0>, <3,0,1,2>
+  3290409190U,	// <0,3,3,1>: Cost 4 vmrghw <0,3,1,1>, <3,1,1,1>
+  2703624496U,	// <0,3,3,2>: Cost 3 vsldoi8 <3,2,0,3>, <3,2,0,3>
+  2216683932U,	// <0,3,3,3>: Cost 3 vmrghw <0,3,1,3>, <3,3,3,3>
+  2216692226U,	// <0,3,3,4>: Cost 3 vmrghw <0,3,1,4>, <3,4,5,6>
+  3733041250U,	// <0,3,3,5>: Cost 4 vsldoi4 <7,0,3,3>, <5,6,7,0>
+  3832727988U,	// <0,3,3,6>: Cost 4 vsldoi12 <1,2,3,0>, <3,3,6,0>
+  3374712762U,	// <0,3,3,7>: Cost 4 vmrglw <3,2,0,3>, <2,6,3,7>
+  2216725278U,	// <0,3,3,u>: Cost 3 vmrghw <0,3,1,u>, <3,u,1,2>
+  2217363606U,	// <0,3,4,0>: Cost 3 vmrghw <0,4,1,5>, <3,0,1,2>
+  3291105510U,	// <0,3,4,1>: Cost 4 vmrghw <0,4,1,5>, <3,1,1,1>
+  3291105601U,	// <0,3,4,2>: Cost 4 vmrghw <0,4,1,5>, <3,2,2,2>
+  2217363868U,	// <0,3,4,3>: Cost 3 vmrghw <0,4,1,5>, <3,3,3,3>
+  2217363970U,	// <0,3,4,4>: Cost 3 vmrghw <0,4,1,5>, <3,4,5,6>
+  2758986242U,	// <0,3,4,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,4,5,6>
+  3727077685U,	// <0,3,4,6>: Cost 4 vsldoi4 <6,0,3,4>, <6,0,3,4>
+  3364767674U,	// <0,3,4,7>: Cost 4 vmrglw <1,5,0,4>, <2,6,3,7>
+  2217364254U,	// <0,3,4,u>: Cost 3 vmrghw <0,4,1,5>, <3,u,1,2>
+  3832728102U,	// <0,3,5,0>: Cost 4 vsldoi12 <1,2,3,0>, <3,5,0,6>
+  3405916003U,	// <0,3,5,1>: Cost 4 vmrglw <u,4,0,5>, <2,5,3,1>
+  3376055840U,	// <0,3,5,2>: Cost 4 vmrglw <3,4,0,5>, <2,1,3,2>
+  3376055679U,	// <0,3,5,3>: Cost 4 vmrglw <3,4,0,5>, <1,u,3,3>
+  3376055194U,	// <0,3,5,4>: Cost 4 vmrglw <3,4,0,5>, <1,2,3,4>
+  3859565138U,	// <0,3,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <3,5,5,5>
+  2727514210U,	// <0,3,5,6>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  3376056250U,	// <0,3,5,7>: Cost 4 vmrglw <3,4,0,5>, <2,6,3,7>
+  2727514210U,	// <0,3,5,u>: Cost 3 vsldoi8 <7,2,0,3>, <5,6,7,0>
+  2758986360U,	// <0,3,6,0>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  3709174678U,	// <0,3,6,1>: Cost 4 vsldoi4 <3,0,3,6>, <1,2,3,0>
+  3795284411U,	// <0,3,6,2>: Cost 4 vsldoi8 <6,2,0,3>, <6,2,0,3>
+  3709175980U,	// <0,3,6,3>: Cost 4 vsldoi4 <3,0,3,6>, <3,0,3,6>
+  3833096860U,	// <0,3,6,4>: Cost 4 vsldoi12 <1,2,u,0>, <3,6,4,7>
+  3376728235U,	// <0,3,6,5>: Cost 5 vmrglw <3,5,0,6>, <3,0,3,5>
+  3859565229U,	// <0,3,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <3,6,6,6>
+  2773879472U,	// <0,3,6,7>: Cost 3 vsldoi12 <3,6,7,0>, <3,6,7,0>
+  2758986360U,	// <0,3,6,u>: Cost 3 vsldoi12 <1,2,3,0>, <3,6,0,7>
+  2303656854U,	// <0,3,7,0>: Cost 3 vmrglw <3,6,0,7>, <1,2,3,0>
+  3807229018U,	// <0,3,7,1>: Cost 4 vsldoi8 <u,2,0,3>, <7,1,2,u>
+  2727515284U,	// <0,3,7,2>: Cost 3 vsldoi8 <7,2,0,3>, <7,2,0,3>
+  3377399410U,	// <0,3,7,3>: Cost 4 vmrglw <3,6,0,7>, <2,2,3,3>
+  3377398682U,	// <0,3,7,4>: Cost 4 vmrglw <3,6,0,7>, <1,2,3,4>
+  3801257409U,	// <0,3,7,5>: Cost 4 vsldoi8 <7,2,0,3>, <7,5,6,7>
+  3377399980U,	// <0,3,7,6>: Cost 4 vmrglw <3,6,0,7>, <3,0,3,6>
+  3375409082U,	// <0,3,7,7>: Cost 4 vmrglw <3,3,0,7>, <2,6,3,7>
+  2731497082U,	// <0,3,7,u>: Cost 3 vsldoi8 <7,u,0,3>, <7,u,0,3>
+  1146333334U,	// <0,3,u,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  2220075238U,	// <0,3,u,1>: Cost 3 vmrghw LHS, <3,1,1,1>
+  2220075329U,	// <0,3,u,2>: Cost 3 vmrghw LHS, <3,2,2,2>
+  1146333596U,	// <0,3,u,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1146333698U,	// <0,3,u,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2758986566U,	// <0,3,u,5>: Cost 3 vsldoi12 <1,2,3,0>, <3,u,5,6>
+  2803739472U,	// <0,3,u,6>: Cost 3 vsldoi12 <u,6,7,0>, <3,u,6,7>
+  2295703482U,	// <0,3,u,7>: Cost 3 vmrglw <2,3,0,u>, <2,6,3,7>
+  1146333982U,	// <0,3,u,u>: Cost 2 vmrghw LHS, <3,u,1,2>
+  2214595473U,	// <0,4,0,0>: Cost 3 vmrghw <0,0,0,0>, <4,0,5,0>
+  2693677158U,	// <0,4,0,1>: Cost 3 vsldoi8 <1,5,0,4>, LHS
+  3839437689U,	// <0,4,0,2>: Cost 4 vsldoi12 <2,3,4,0>, <4,0,2,3>
+  3709200559U,	// <0,4,0,3>: Cost 4 vsldoi4 <3,0,4,0>, <3,0,4,0>
+  2693677394U,	// <0,4,0,4>: Cost 3 vsldoi8 <1,5,0,4>, <0,4,1,5>
+  1140854070U,	// <0,4,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  3767419409U,	// <0,4,0,6>: Cost 4 vsldoi8 <1,5,0,4>, <0,6,4,7>
+  3854109604U,	// <0,4,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,0,7,1>
+  1140854313U,	// <0,4,0,u>: Cost 2 vmrghw <0,0,0,0>, RHS
+  1141689234U,	// <0,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2215431114U,	// <0,4,1,1>: Cost 3 vmrghw LHS, <4,1,2,3>
+  2215431221U,	// <0,4,1,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635466928U,	// <0,4,1,3>: Cost 3 vsldoi4 <3,0,4,1>, <3,0,4,1>
+  1141689552U,	// <0,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  67947830U,	// <0,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2215431545U,	// <0,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2659357716U,	// <0,4,1,7>: Cost 3 vsldoi4 <7,0,4,1>, <7,0,4,1>
+  67948073U,	// <0,4,1,u>: Cost 1 vmrghw LHS, RHS
+  3767420369U,	// <0,4,2,0>: Cost 4 vsldoi8 <1,5,0,4>, <2,0,3,4>
+  3767420451U,	// <0,4,2,1>: Cost 4 vsldoi8 <1,5,0,4>, <2,1,3,5>
+  3767420520U,	// <0,4,2,2>: Cost 4 vsldoi8 <1,5,0,4>, <2,2,2,2>
+  2698323625U,	// <0,4,2,3>: Cost 3 vsldoi8 <2,3,0,4>, <2,3,0,4>
+  3709218102U,	// <0,4,2,4>: Cost 4 vsldoi4 <3,0,4,2>, RHS
+  2216013110U,	// <0,4,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767420858U,	// <0,4,2,6>: Cost 4 vsldoi8 <1,5,0,4>, <2,6,3,7>
+  3774719981U,	// <0,4,2,7>: Cost 4 vsldoi8 <2,7,0,4>, <2,7,0,4>
+  2216013353U,	// <0,4,2,u>: Cost 3 vmrghw <0,2,1,2>, RHS
+  3767421078U,	// <0,4,3,0>: Cost 4 vsldoi8 <1,5,0,4>, <3,0,1,2>
+  3776710880U,	// <0,4,3,1>: Cost 4 vsldoi8 <3,1,0,4>, <3,1,0,4>
+  3833097325U,	// <0,4,3,2>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,2,4>
+  3767421340U,	// <0,4,3,3>: Cost 4 vsldoi8 <1,5,0,4>, <3,3,3,3>
+  3767421442U,	// <0,4,3,4>: Cost 4 vsldoi8 <1,5,0,4>, <3,4,5,6>
+  2216660278U,	// <0,4,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  3833097361U,	// <0,4,3,6>: Cost 5 vsldoi12 <1,2,u,0>, <4,3,6,4>
+  3780692678U,	// <0,4,3,7>: Cost 4 vsldoi8 <3,7,0,4>, <3,7,0,4>
+  2216660521U,	// <0,4,3,u>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2617573416U,	// <0,4,4,0>: Cost 3 vsldoi4 <0,0,4,4>, <0,0,4,4>
+  2217364450U,	// <0,4,4,1>: Cost 3 vmrghw <0,4,1,5>, <4,1,5,0>
+  3691316771U,	// <0,4,4,2>: Cost 4 vsldoi4 <0,0,4,4>, <2,1,3,5>
+  3709233331U,	// <0,4,4,3>: Cost 4 vsldoi4 <3,0,4,4>, <3,0,4,4>
+  2785823952U,	// <0,4,4,4>: Cost 3 vsldoi12 <5,6,7,0>, <4,4,4,4>
+  1143622966U,	// <0,4,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  3691319723U,	// <0,4,4,6>: Cost 4 vsldoi4 <0,0,4,4>, <6,1,7,5>
+  3854109932U,	// <0,4,4,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,4,7,5>
+  1143623209U,	// <0,4,4,u>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2635497574U,	// <0,4,5,0>: Cost 3 vsldoi4 <3,0,4,5>, LHS
+  2635498390U,	// <0,4,5,1>: Cost 3 vsldoi4 <3,0,4,5>, <1,2,3,0>
+  3709240936U,	// <0,4,5,2>: Cost 4 vsldoi4 <3,0,4,5>, <2,2,2,2>
+  2635499700U,	// <0,4,5,3>: Cost 3 vsldoi4 <3,0,4,5>, <3,0,4,5>
+  2635500854U,	// <0,4,5,4>: Cost 3 vsldoi4 <3,0,4,5>, RHS
+  2785824044U,	// <0,4,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <4,5,5,6>
+  1685245238U,	// <0,4,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659390488U,	// <0,4,5,7>: Cost 3 vsldoi4 <7,0,4,5>, <7,0,4,5>
+  1685245256U,	// <0,4,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  3839438161U,	// <0,4,6,0>: Cost 4 vsldoi12 <2,3,4,0>, <4,6,0,7>
+  3798610347U,	// <0,4,6,1>: Cost 4 vsldoi8 <6,7,0,4>, <6,1,7,5>
+  3798610426U,	// <0,4,6,2>: Cost 4 vsldoi8 <6,7,0,4>, <6,2,7,3>
+  3795956237U,	// <0,4,6,3>: Cost 4 vsldoi8 <6,3,0,4>, <6,3,0,4>
+  3733138742U,	// <0,4,6,4>: Cost 4 vsldoi4 <7,0,4,6>, RHS
+  2218634550U,	// <0,4,6,5>: Cost 3 vmrghw <0,6,0,7>, RHS
+  3798610744U,	// <0,4,6,6>: Cost 4 vsldoi8 <6,7,0,4>, <6,6,6,6>
+  2724868945U,	// <0,4,6,7>: Cost 3 vsldoi8 <6,7,0,4>, <6,7,0,4>
+  2725532578U,	// <0,4,6,u>: Cost 3 vsldoi8 <6,u,0,4>, <6,u,0,4>
+  3383371465U,	// <0,4,7,0>: Cost 4 vmrglw <4,6,0,7>, <2,3,4,0>
+  3800601668U,	// <0,4,7,1>: Cost 4 vsldoi8 <7,1,0,4>, <7,1,0,4>
+  3775386826U,	// <0,4,7,2>: Cost 5 vsldoi8 <2,u,0,4>, <7,2,6,3>
+  3801928934U,	// <0,4,7,3>: Cost 4 vsldoi8 <7,3,0,4>, <7,3,0,4>
+  3721202998U,	// <0,4,7,4>: Cost 4 vsldoi4 <5,0,4,7>, RHS
+  2780368328U,	// <0,4,7,5>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  3383372686U,	// <0,4,7,6>: Cost 5 vmrglw <4,6,0,7>, <4,0,4,6>
+  3854110170U,	// <0,4,7,7>: Cost 4 vsldoi12 <4,7,5,0>, <4,7,7,0>
+  2780368328U,	// <0,4,7,u>: Cost 3 vsldoi12 <4,7,5,0>, <4,7,5,0>
+  1146334098U,	// <0,4,u,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2220076002U,	// <0,4,u,1>: Cost 3 vmrghw LHS, <4,1,5,0>
+  2220076085U,	// <0,4,u,2>: Cost 3 vmrghw LHS, <4,2,5,2>
+  2635524279U,	// <0,4,u,3>: Cost 3 vsldoi4 <3,0,4,u>, <3,0,4,u>
+  1146334416U,	// <0,4,u,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  72592694U,	// <0,4,u,5>: Cost 1 vmrghw LHS, RHS
+  1685245481U,	// <0,4,u,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2659415067U,	// <0,4,u,7>: Cost 3 vsldoi4 <7,0,4,u>, <7,0,4,u>
+  72592937U,	// <0,4,u,u>: Cost 1 vmrghw LHS, RHS
+  2281704337U,	// <0,5,0,0>: Cost 3 vmrglw <0,0,0,0>, <4,0,5,0>
+  2704965734U,	// <0,5,0,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  3778707666U,	// <0,5,0,2>: Cost 4 vsldoi8 <3,4,0,5>, <0,2,5,3>
+  3778707708U,	// <0,5,0,3>: Cost 4 vsldoi8 <3,4,0,5>, <0,3,1,0>
+  2687050057U,	// <0,5,0,4>: Cost 3 vsldoi8 <0,4,0,5>, <0,4,0,5>
+  2214596612U,	// <0,5,0,5>: Cost 3 vmrghw <0,0,0,0>, <5,5,5,5>
+  2785824372U,	// <0,5,0,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,0,6,1>
+  3854110332U,	// <0,5,0,7>: Cost 4 vsldoi12 <4,7,5,0>, <5,0,7,0>
+  2704966301U,	// <0,5,0,u>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  1567768678U,	// <0,5,1,0>: Cost 2 vsldoi4 <4,0,5,1>, LHS
+  2312236570U,	// <0,5,1,1>: Cost 3 vmrglw <5,1,0,1>, <4,u,5,1>
+  2215431915U,	// <0,5,1,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641512598U,	// <0,5,1,3>: Cost 3 vsldoi4 <4,0,5,1>, <3,0,1,2>
+  1567771538U,	// <0,5,1,4>: Cost 2 vsldoi4 <4,0,5,1>, <4,0,5,1>
+  1141690372U,	// <0,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1141690466U,	// <0,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2641515514U,	// <0,5,1,7>: Cost 3 vsldoi4 <4,0,5,1>, <7,0,1,2>
+  1141690615U,	// <0,5,1,u>: Cost 2 vmrghw LHS, <5,u,5,5>
+  3772736973U,	// <0,5,2,0>: Cost 4 vsldoi8 <2,4,0,5>, <2,0,3,0>
+  3778709024U,	// <0,5,2,1>: Cost 4 vsldoi8 <3,4,0,5>, <2,1,3,2>
+  3778709096U,	// <0,5,2,2>: Cost 4 vsldoi8 <3,4,0,5>, <2,2,2,2>
+  3778709158U,	// <0,5,2,3>: Cost 4 vsldoi8 <3,4,0,5>, <2,3,0,1>
+  3772737275U,	// <0,5,2,4>: Cost 4 vsldoi8 <2,4,0,5>, <2,4,0,5>
+  3859566351U,	// <0,5,2,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,2,5,3>
+  3778709434U,	// <0,5,2,6>: Cost 4 vsldoi8 <3,4,0,5>, <2,6,3,7>
+  3805251562U,	// <0,5,2,7>: Cost 4 vsldoi8 <7,u,0,5>, <2,7,0,1>
+  3775391807U,	// <0,5,2,u>: Cost 4 vsldoi8 <2,u,0,5>, <2,u,0,5>
+  2704967830U,	// <0,5,3,0>: Cost 3 vsldoi8 <3,4,0,5>, <3,0,1,2>
+  3776719073U,	// <0,5,3,1>: Cost 4 vsldoi8 <3,1,0,5>, <3,1,0,5>
+  3777382706U,	// <0,5,3,2>: Cost 4 vsldoi8 <3,2,0,5>, <3,2,0,5>
+  3778709887U,	// <0,5,3,3>: Cost 4 vsldoi8 <3,4,0,5>, <3,3,0,1>
+  2704968148U,	// <0,5,3,4>: Cost 3 vsldoi8 <3,4,0,5>, <3,4,0,5>
+  3857428317U,	// <0,5,3,5>: Cost 4 vsldoi12 <5,3,5,0>, <5,3,5,0>
+  3364096514U,	// <0,5,3,6>: Cost 4 vmrglw <1,4,0,3>, <3,4,5,6>
+  3780700871U,	// <0,5,3,7>: Cost 4 vsldoi8 <3,7,0,5>, <3,7,0,5>
+  2707622680U,	// <0,5,3,u>: Cost 3 vsldoi8 <3,u,0,5>, <3,u,0,5>
+  2728856466U,	// <0,5,4,0>: Cost 3 vsldoi8 <7,4,0,5>, <4,0,5,1>
+  3697361674U,	// <0,5,4,1>: Cost 4 vsldoi4 <1,0,5,4>, <1,0,5,4>
+  3697362601U,	// <0,5,4,2>: Cost 4 vsldoi4 <1,0,5,4>, <2,3,0,4>
+  3364766635U,	// <0,5,4,3>: Cost 4 vmrglw <1,5,0,4>, <1,2,5,3>
+  2217365428U,	// <0,5,4,4>: Cost 3 vmrghw <0,4,1,5>, <5,4,5,6>
+  2704969014U,	// <0,5,4,5>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  2785824700U,	// <0,5,4,6>: Cost 3 vsldoi12 <5,6,7,0>, <5,4,6,5>
+  3364766963U,	// <0,5,4,7>: Cost 4 vmrglw <1,5,0,4>, <1,6,5,7>
+  2704969257U,	// <0,5,4,u>: Cost 3 vsldoi8 <3,4,0,5>, RHS
+  3846148050U,	// <0,5,5,0>: Cost 4 vsldoi12 <3,4,5,0>, <5,5,0,0>
+  2326203282U,	// <0,5,5,1>: Cost 3 vmrglw <7,4,0,5>, <4,0,5,1>
+  3291746027U,	// <0,5,5,2>: Cost 4 vmrghw <0,5,1,2>, <5,2,1,3>
+  3376054482U,	// <0,5,5,3>: Cost 4 vmrglw <3,4,0,5>, <0,2,5,3>
+  3790655366U,	// <0,5,5,4>: Cost 4 vsldoi8 <5,4,0,5>, <5,4,0,5>
+  2785824772U,	// <0,5,5,5>: Cost 3 vsldoi12 <5,6,7,0>, <5,5,5,5>
+  2724876386U,	// <0,5,5,6>: Cost 3 vsldoi8 <6,7,0,5>, <5,6,7,0>
+  3858903057U,	// <0,5,5,7>: Cost 4 vsldoi12 <5,5,7,0>, <5,5,7,0>
+  2736820484U,	// <0,5,5,u>: Cost 3 vsldoi8 <u,7,0,5>, <5,u,7,0>
+  2659467366U,	// <0,5,6,0>: Cost 3 vsldoi4 <7,0,5,6>, LHS
+  3859566643U,	// <0,5,6,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,6,1,7>
+  3798618618U,	// <0,5,6,2>: Cost 4 vsldoi8 <6,7,0,5>, <6,2,7,3>
+  3852857410U,	// <0,5,6,3>: Cost 4 vsldoi12 <4,5,6,0>, <5,6,3,4>
+  2659470646U,	// <0,5,6,4>: Cost 3 vsldoi4 <7,0,5,6>, RHS
+  2659471458U,	// <0,5,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  3832729696U,	// <0,5,6,6>: Cost 4 vsldoi12 <1,2,3,0>, <5,6,6,7>
+  1712083042U,	// <0,5,6,7>: Cost 2 vsldoi12 <5,6,7,0>, <5,6,7,0>
+  1712156779U,	// <0,5,6,u>: Cost 2 vsldoi12 <5,6,u,0>, <5,6,u,0>
+  2731512826U,	// <0,5,7,0>: Cost 3 vsldoi8 <7,u,0,5>, <7,0,1,2>
+  3859566717U,	// <0,5,7,1>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,1,0>
+  3798619284U,	// <0,5,7,2>: Cost 4 vsldoi8 <6,7,0,5>, <7,2,0,3>
+  3778712803U,	// <0,5,7,3>: Cost 4 vsldoi8 <3,4,0,5>, <7,3,0,1>
+  2728858936U,	// <0,5,7,4>: Cost 3 vsldoi8 <7,4,0,5>, <7,4,0,5>
+  3859566753U,	// <0,5,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <5,7,5,0>
+  3377398135U,	// <0,5,7,6>: Cost 4 vmrglw <3,6,0,7>, <0,4,5,6>
+  3798619686U,	// <0,5,7,7>: Cost 4 vsldoi8 <6,7,0,5>, <7,7,0,0>
+  2731513468U,	// <0,5,7,u>: Cost 3 vsldoi8 <7,u,0,5>, <7,u,0,5>
+  1567826022U,	// <0,5,u,0>: Cost 2 vsldoi4 <4,0,5,u>, LHS
+  2704971566U,	// <0,5,u,1>: Cost 3 vsldoi8 <3,4,0,5>, LHS
+  2220076779U,	// <0,5,u,2>: Cost 3 vmrghw LHS, <5,2,1,3>
+  2641569942U,	// <0,5,u,3>: Cost 3 vsldoi4 <4,0,5,u>, <3,0,1,2>
+  1567828889U,	// <0,5,u,4>: Cost 2 vsldoi4 <4,0,5,u>, <4,0,5,u>
+  1146335236U,	// <0,5,u,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1146335330U,	// <0,5,u,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  1713410308U,	// <0,5,u,7>: Cost 2 vsldoi12 <5,u,7,0>, <5,u,7,0>
+  1713484045U,	// <0,5,u,u>: Cost 2 vsldoi12 <5,u,u,0>, <5,u,u,0>
+  2214596949U,	// <0,6,0,0>: Cost 3 vmrghw <0,0,0,0>, <6,0,7,0>
+  2214678951U,	// <0,6,0,1>: Cost 3 vmrghw <0,0,1,1>, <6,1,7,1>
+  2214597114U,	// <0,6,0,2>: Cost 3 vmrghw <0,0,0,0>, <6,2,7,3>
+  3852857653U,	// <0,6,0,3>: Cost 4 vsldoi12 <4,5,6,0>, <6,0,3,4>
+  3832729919U,	// <0,6,0,4>: Cost 4 vsldoi12 <1,2,3,0>, <6,0,4,5>
+  3721293427U,	// <0,6,0,5>: Cost 4 vsldoi4 <5,0,6,0>, <5,0,6,0>
+  2214597432U,	// <0,6,0,6>: Cost 3 vmrghw <0,0,0,0>, <6,6,6,6>
+  1207962934U,	// <0,6,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  1207962935U,	// <0,6,0,u>: Cost 2 vmrglw <0,0,0,0>, RHS
+  2215432481U,	// <0,6,1,0>: Cost 3 vmrghw LHS, <6,0,1,2>
+  2215432615U,	// <0,6,1,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1141690874U,	// <0,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2215432754U,	// <0,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2215432817U,	// <0,6,1,4>: Cost 3 vmrghw LHS, <6,4,2,5>
+  2215432939U,	// <0,6,1,5>: Cost 3 vmrghw LHS, <6,5,7,1>
+  1141691192U,	// <0,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221905718U,	// <0,6,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  1221905719U,	// <0,6,1,u>: Cost 2 vmrglw <2,3,0,1>, RHS
+  3852857787U,	// <0,6,2,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,2,0,3>
+  3289764265U,	// <0,6,2,1>: Cost 4 vmrghw <0,2,1,3>, <6,1,7,3>
+  3289690618U,	// <0,6,2,2>: Cost 4 vmrghw <0,2,0,3>, <6,2,7,3>
+  3862589907U,	// <0,6,2,3>: Cost 4 vsldoi12 <6,2,3,0>, <6,2,3,0>
+  3733253430U,	// <0,6,2,4>: Cost 4 vsldoi4 <7,0,6,2>, RHS
+  3733254242U,	// <0,6,2,5>: Cost 4 vsldoi4 <7,0,6,2>, <5,6,7,0>
+  3777390522U,	// <0,6,2,6>: Cost 4 vsldoi8 <3,2,0,6>, <2,6,3,7>
+  2785825274U,	// <0,6,2,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,7,3>
+  2785825283U,	// <0,6,2,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,2,u,3>
+  3777390742U,	// <0,6,3,0>: Cost 4 vsldoi8 <3,2,0,6>, <3,0,1,2>
+  3863106066U,	// <0,6,3,1>: Cost 4 vsldoi12 <6,3,1,0>, <6,3,1,0>
+  3777390899U,	// <0,6,3,2>: Cost 4 vsldoi8 <3,2,0,6>, <3,2,0,6>
+  3290436146U,	// <0,6,3,3>: Cost 4 vmrghw <0,3,1,4>, <6,3,4,5>
+  3779381762U,	// <0,6,3,4>: Cost 4 vsldoi8 <3,5,0,6>, <3,4,5,6>
+  3779381798U,	// <0,6,3,5>: Cost 4 vsldoi8 <3,5,0,6>, <3,5,0,6>
+  3733262920U,	// <0,6,3,6>: Cost 4 vsldoi4 <7,0,6,3>, <6,3,7,0>
+  2300972342U,	// <0,6,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2300972343U,	// <0,6,3,u>: Cost 3 vmrglw <3,2,0,3>, RHS
+  3802606482U,	// <0,6,4,0>: Cost 4 vsldoi8 <7,4,0,6>, <4,0,5,1>
+  2217365931U,	// <0,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2217366010U,	// <0,6,4,2>: Cost 3 vmrghw <0,4,1,5>, <6,2,7,3>
+  3291107890U,	// <0,6,4,3>: Cost 4 vmrghw <0,4,1,5>, <6,3,4,5>
+  3291099805U,	// <0,6,4,4>: Cost 4 vmrghw <0,4,1,4>, <6,4,7,4>
+  3777391926U,	// <0,6,4,5>: Cost 4 vsldoi8 <3,2,0,6>, RHS
+  2217366328U,	// <0,6,4,6>: Cost 3 vmrghw <0,4,1,5>, <6,6,6,6>
+  2291027254U,	// <0,6,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  2291027255U,	// <0,6,4,u>: Cost 3 vmrglw <1,5,0,4>, RHS
+  3852858033U,	// <0,6,5,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,5,0,6>
+  3395964532U,	// <0,6,5,1>: Cost 4 vmrglw <6,7,0,5>, <5,0,6,1>
+  3864507069U,	// <0,6,5,2>: Cost 4 vsldoi12 <6,5,2,0>, <6,5,2,0>
+  3376056678U,	// <0,6,5,3>: Cost 5 vmrglw <3,4,0,5>, <3,2,6,3>
+  3721334070U,	// <0,6,5,4>: Cost 4 vsldoi4 <5,0,6,5>, RHS
+  3395964860U,	// <0,6,5,5>: Cost 4 vmrglw <6,7,0,5>, <5,4,6,5>
+  3864802017U,	// <0,6,5,6>: Cost 4 vsldoi12 <6,5,6,0>, <6,5,6,0>
+  2302315830U,	// <0,6,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  2302315831U,	// <0,6,5,u>: Cost 3 vmrglw <3,4,0,5>, RHS
+  3852858108U,	// <0,6,6,0>: Cost 4 vsldoi12 <4,5,6,0>, <6,6,0,0>
+  3398624745U,	// <0,6,6,1>: Cost 4 vmrglw <7,2,0,6>, <2,0,6,1>
+  2218668538U,	// <0,6,6,2>: Cost 3 vmrghw <0,6,1,2>, <6,2,7,3>
+  3292418610U,	// <0,6,6,3>: Cost 4 vmrghw <0,6,1,3>, <6,3,4,5>
+  3733286198U,	// <0,6,6,4>: Cost 4 vsldoi4 <7,0,6,6>, RHS
+  3797299889U,	// <0,6,6,5>: Cost 4 vsldoi8 <6,5,0,6>, <6,5,0,6>
+  2785825592U,	// <0,6,6,6>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,6,6>
+  2785825602U,	// <0,6,6,7>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,7,7>
+  2785825611U,	// <0,6,6,u>: Cost 3 vsldoi12 <5,6,7,0>, <6,6,u,7>
+  2785825614U,	// <0,6,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,0,1>
+  2758988632U,	// <0,6,7,1>: Cost 3 vsldoi12 <1,2,3,0>, <6,7,1,2>
+  3377400084U,	// <0,6,7,2>: Cost 4 vmrglw <3,6,0,7>, <3,1,6,2>
+  2792166248U,	// <0,6,7,3>: Cost 3 vsldoi12 <6,7,3,0>, <6,7,3,0>
+  2785825654U,	// <0,6,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,4,5>
+  2785825664U,	// <0,6,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  3859567493U,	// <0,6,7,6>: Cost 4 vsldoi12 <5,6,7,0>, <6,7,6,2>
+  2303659318U,	// <0,6,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303659319U,	// <0,6,7,u>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2785825695U,	// <0,6,u,0>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,0,1>
+  2220077479U,	// <0,6,u,1>: Cost 3 vmrghw LHS, <6,1,7,1>
+  1146335738U,	// <0,6,u,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2792829881U,	// <0,6,u,3>: Cost 3 vsldoi12 <6,u,3,0>, <6,u,3,0>
+  2785825735U,	// <0,6,u,4>: Cost 3 vsldoi12 <5,6,7,0>, <6,u,4,5>
+  2785825664U,	// <0,6,u,5>: Cost 3 vsldoi12 <5,6,7,0>, <6,7,5,6>
+  1146336056U,	// <0,6,u,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1221963062U,	// <0,6,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  1221963063U,	// <0,6,u,u>: Cost 2 vmrglw <2,3,0,u>, RHS
+  2653593600U,	// <0,7,0,0>: Cost 3 vsldoi4 <6,0,7,0>, <0,0,0,0>
+  2706309222U,	// <0,7,0,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  3709421498U,	// <0,7,0,2>: Cost 4 vsldoi4 <3,0,7,0>, <2,6,3,7>
+  2281705978U,	// <0,7,0,3>: Cost 3 vmrglw <0,0,0,0>, <6,2,7,3>
+  2785825816U,	// <0,7,0,4>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,4,5>
+  2785825826U,	// <0,7,0,5>: Cost 3 vsldoi12 <5,6,7,0>, <7,0,5,6>
+  2653598037U,	// <0,7,0,6>: Cost 3 vsldoi4 <6,0,7,0>, <6,0,7,0>
+  2214598252U,	// <0,7,0,7>: Cost 3 vmrghw <0,0,0,0>, <7,7,7,7>
+  2706309789U,	// <0,7,0,u>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  1141691386U,	// <0,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2215433290U,	// <0,7,1,1>: Cost 3 vmrghw LHS, <7,1,1,1>
+  2706310038U,	// <0,7,1,2>: Cost 3 vsldoi8 <3,6,0,7>, <1,2,3,0>
+  2322190842U,	// <0,7,1,3>: Cost 3 vmrglw <6,7,0,1>, <6,2,7,3>
+  1141691750U,	// <0,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2215433654U,	// <0,7,1,5>: Cost 3 vmrghw LHS, <7,5,5,5>
+  2653606230U,	// <0,7,1,6>: Cost 3 vsldoi4 <6,0,7,1>, <6,0,7,1>
+  1141692012U,	// <0,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1141692034U,	// <0,7,1,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  2785825940U,	// <0,7,2,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,2,0,3>
+  3768108576U,	// <0,7,2,1>: Cost 5 vsldoi8 <1,6,0,7>, <2,1,3,2>
+  3780052584U,	// <0,7,2,2>: Cost 4 vsldoi8 <3,6,0,7>, <2,2,2,2>
+  2794820780U,	// <0,7,2,3>: Cost 3 vsldoi12 <7,2,3,0>, <7,2,3,0>
+  3859641528U,	// <0,7,2,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,2,4,3>
+  3733327970U,	// <0,7,2,5>: Cost 4 vsldoi4 <7,0,7,2>, <5,6,7,0>
+  3778062266U,	// <0,7,2,6>: Cost 4 vsldoi8 <3,3,0,7>, <2,6,3,7>
+  3733328944U,	// <0,7,2,7>: Cost 4 vsldoi4 <7,0,7,2>, <7,0,7,2>
+  2795189465U,	// <0,7,2,u>: Cost 3 vsldoi12 <7,2,u,0>, <7,2,u,0>
+  2324861026U,	// <0,7,3,0>: Cost 3 vmrglw <7,2,0,3>, <5,6,7,0>
+  3780053233U,	// <0,7,3,1>: Cost 4 vsldoi8 <3,6,0,7>, <3,1,2,3>
+  3780053296U,	// <0,7,3,2>: Cost 4 vsldoi8 <3,6,0,7>, <3,2,0,3>
+  3778062725U,	// <0,7,3,3>: Cost 4 vsldoi8 <3,3,0,7>, <3,3,0,7>
+  3780053506U,	// <0,7,3,4>: Cost 4 vsldoi8 <3,6,0,7>, <3,4,5,6>
+  3803941469U,	// <0,7,3,5>: Cost 4 vsldoi8 <7,6,0,7>, <3,5,6,7>
+  2706311800U,	// <0,7,3,6>: Cost 3 vsldoi8 <3,6,0,7>, <3,6,0,7>
+  3398603586U,	// <0,7,3,7>: Cost 4 vmrglw <7,2,0,3>, <6,6,7,7>
+  2707639066U,	// <0,7,3,u>: Cost 3 vsldoi8 <3,u,0,7>, <3,u,0,7>
+  2217366522U,	// <0,7,4,0>: Cost 3 vmrghw <0,4,1,5>, <7,0,1,2>
+  3727369110U,	// <0,7,4,1>: Cost 4 vsldoi4 <6,0,7,4>, <1,2,3,0>
+  3291108500U,	// <0,7,4,2>: Cost 4 vmrghw <0,4,1,5>, <7,2,0,3>
+  3727370872U,	// <0,7,4,3>: Cost 4 vsldoi4 <6,0,7,4>, <3,6,0,7>
+  2217366886U,	// <0,7,4,4>: Cost 3 vmrghw <0,4,1,5>, <7,4,5,6>
+  2706312502U,	// <0,7,4,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  3786026321U,	// <0,7,4,6>: Cost 4 vsldoi8 <4,6,0,7>, <4,6,0,7>
+  2217367148U,	// <0,7,4,7>: Cost 3 vmrghw <0,4,1,5>, <7,7,7,7>
+  2706312745U,	// <0,7,4,u>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2322223202U,	// <0,7,5,0>: Cost 3 vmrglw <6,7,0,5>, <5,6,7,0>
+  3399946987U,	// <0,7,5,1>: Cost 4 vmrglw <7,4,0,5>, <6,5,7,1>
+  3291780244U,	// <0,7,5,2>: Cost 4 vmrghw <0,5,1,6>, <7,2,0,3>
+  3727378582U,	// <0,7,5,3>: Cost 4 vsldoi4 <6,0,7,5>, <3,0,1,2>
+  3727379766U,	// <0,7,5,4>: Cost 4 vsldoi4 <6,0,7,5>, RHS
+  3859568054U,	// <0,7,5,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,5,5,5>
+  2785826241U,	// <0,7,5,6>: Cost 3 vsldoi12 <5,6,7,0>, <7,5,6,7>
+  3395965762U,	// <0,7,5,7>: Cost 4 vmrglw <6,7,0,5>, <6,6,7,7>
+  2787153363U,	// <0,7,5,u>: Cost 3 vsldoi12 <5,u,7,0>, <7,5,u,7>
+  2785826268U,	// <0,7,6,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,6,0,7>
+  3780055420U,	// <0,7,6,1>: Cost 5 vsldoi8 <3,6,0,7>, <6,1,2,3>
+  3859568110U,	// <0,7,6,2>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,2,7>
+  3874534903U,	// <0,7,6,3>: Cost 4 vsldoi12 <u,2,3,0>, <7,6,3,7>
+  3859641856U,	// <0,7,6,4>: Cost 4 vsldoi12 <5,6,u,0>, <7,6,4,7>
+  3733360738U,	// <0,7,6,5>: Cost 4 vsldoi4 <7,0,7,6>, <5,6,7,0>
+  3859568145U,	// <0,7,6,6>: Cost 4 vsldoi12 <5,6,7,0>, <7,6,6,6>
+  2797770260U,	// <0,7,6,7>: Cost 3 vsldoi12 <7,6,7,0>, <7,6,7,0>
+  2797843997U,	// <0,7,6,u>: Cost 3 vsldoi12 <7,6,u,0>, <7,6,u,0>
+  2785826342U,	// <0,7,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,0,0>
+  3727393686U,	// <0,7,7,1>: Cost 4 vsldoi4 <6,0,7,7>, <1,2,3,0>
+  3868563003U,	// <0,7,7,2>: Cost 4 vsldoi12 <7,2,3,0>, <7,7,2,3>
+  3377397988U,	// <0,7,7,3>: Cost 4 vmrglw <3,6,0,7>, <0,2,7,3>
+  2219349350U,	// <0,7,7,4>: Cost 3 vmrghw <0,7,1,4>, <7,4,5,6>
+  3859568217U,	// <0,7,7,5>: Cost 4 vsldoi12 <5,6,7,0>, <7,7,5,6>
+  2730202588U,	// <0,7,7,6>: Cost 3 vsldoi8 <7,6,0,7>, <7,6,0,7>
+  2785826412U,	// <0,7,7,7>: Cost 3 vsldoi12 <5,6,7,0>, <7,7,7,7>
+  2731529854U,	// <0,7,7,u>: Cost 3 vsldoi8 <7,u,0,7>, <7,u,0,7>
+  1146336250U,	// <0,7,u,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  2706315054U,	// <0,7,u,1>: Cost 3 vsldoi8 <3,6,0,7>, LHS
+  2653660845U,	// <0,7,u,2>: Cost 3 vsldoi4 <6,0,7,u>, <2,3,0,u>
+  2322248186U,	// <0,7,u,3>: Cost 3 vmrglw <6,7,0,u>, <6,2,7,3>
+  1146336614U,	// <0,7,u,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2706315418U,	// <0,7,u,5>: Cost 3 vsldoi8 <3,6,0,7>, RHS
+  2653663581U,	// <0,7,u,6>: Cost 3 vsldoi4 <6,0,7,u>, <6,0,7,u>
+  1146336876U,	// <0,7,u,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1146336898U,	// <0,7,u,u>: Cost 2 vmrghw LHS, <7,u,1,2>
+  202162278U,	// <0,u,0,0>: Cost 1 vspltisw0 LHS
+  1624612966U,	// <0,u,0,1>: Cost 2 vsldoi8 <2,3,0,u>, LHS
+  2629780986U,	// <0,u,0,2>: Cost 3 vsldoi4 <2,0,u,0>, <2,0,u,0>
+  1207959708U,	// <0,u,0,3>: Cost 2 vmrglw <0,0,0,0>, LHS
+  1544097078U,	// <0,u,0,4>: Cost 2 vsldoi4 <0,0,u,0>, RHS
+  1140856986U,	// <0,u,0,5>: Cost 2 vmrghw <0,0,0,0>, RHS
+  2698355253U,	// <0,u,0,6>: Cost 3 vsldoi8 <2,3,0,u>, <0,6,u,7>
+  1207962952U,	// <0,u,0,7>: Cost 2 vmrglw <0,0,0,0>, RHS
+  202162278U,	// <0,u,0,u>: Cost 1 vspltisw0 LHS
+  1142134483U,	// <0,u,1,0>: Cost 2 vmrghw LHS, <u,0,1,2>
+  67950382U,	// <0,u,1,1>: Cost 1 vmrghw LHS, LHS
+  1142175624U,	// <0,u,1,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  1142175676U,	// <0,u,1,3>: Cost 2 vmrghw LHS, <u,3,0,1>
+  1142134847U,	// <0,u,1,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  67950746U,	// <0,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1142175952U,	// <0,u,1,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221905736U,	// <0,u,1,7>: Cost 2 vmrglw <2,3,0,1>, RHS
+  67950949U,	// <0,u,1,u>: Cost 1 vmrghw LHS, LHS
+  1562026086U,	// <0,u,2,0>: Cost 2 vsldoi4 <3,0,u,2>, LHS
+  2216015662U,	// <0,u,2,1>: Cost 3 vmrghw <0,2,1,2>, LHS
+  2698356328U,	// <0,u,2,2>: Cost 3 vsldoi8 <2,3,0,u>, <2,2,2,2>
+  835584U,	// <0,u,2,3>: Cost 0 copy LHS
+  1562029366U,	// <0,u,2,4>: Cost 2 vsldoi4 <3,0,u,2>, RHS
+  2216016026U,	// <0,u,2,5>: Cost 3 vmrghw <0,2,1,2>, RHS
+  2698356666U,	// <0,u,2,6>: Cost 3 vsldoi8 <2,3,0,u>, <2,6,3,7>
+  1585919033U,	// <0,u,2,7>: Cost 2 vsldoi4 <7,0,u,2>, <7,0,u,2>
+  835584U,	// <0,u,2,u>: Cost 0 copy LHS
+  2758989756U,	// <0,u,3,0>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,0,1>
+  2216662830U,	// <0,u,3,1>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2703665461U,	// <0,u,3,2>: Cost 3 vsldoi8 <3,2,0,u>, <3,2,0,u>
+  2758989782U,	// <0,u,3,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,3,0>
+  2758989796U,	// <0,u,3,4>: Cost 3 vsldoi12 <1,2,3,0>, <u,3,4,5>
+  2216663194U,	// <0,u,3,5>: Cost 3 vmrghw <0,3,1,0>, RHS
+  2706319993U,	// <0,u,3,6>: Cost 3 vsldoi8 <3,6,0,u>, <3,6,0,u>
+  2300972360U,	// <0,u,3,7>: Cost 3 vmrglw <3,2,0,3>, RHS
+  2216663397U,	// <0,u,3,u>: Cost 3 vmrghw <0,3,1,0>, LHS
+  2217367251U,	// <0,u,4,0>: Cost 3 vmrghw <0,4,1,5>, <u,0,1,2>
+  1143625518U,	// <0,u,4,1>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2217367432U,	// <0,u,4,2>: Cost 3 vmrghw <0,4,1,5>, <u,2,3,3>
+  2217367484U,	// <0,u,4,3>: Cost 3 vmrghw <0,4,1,5>, <u,3,0,1>
+  1143619922U,	// <0,u,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1143625882U,	// <0,u,4,5>: Cost 2 vmrghw <0,4,1,5>, RHS
+  2217367760U,	// <0,u,4,6>: Cost 3 vmrghw <0,4,1,5>, <u,6,3,7>
+  2291027272U,	// <0,u,4,7>: Cost 3 vmrglw <1,5,0,4>, RHS
+  1143626085U,	// <0,u,4,u>: Cost 2 vmrghw <0,4,1,5>, LHS
+  2635792486U,	// <0,u,5,0>: Cost 3 vsldoi4 <3,0,u,5>, LHS
+  2635793302U,	// <0,u,5,1>: Cost 3 vsldoi4 <3,0,u,5>, <1,2,3,0>
+  2302314646U,	// <0,u,5,2>: Cost 3 vmrglw <3,4,0,5>, <3,0,1,2>
+  2635794648U,	// <0,u,5,3>: Cost 3 vsldoi4 <3,0,u,5>, <3,0,u,5>
+  2635795766U,	// <0,u,5,4>: Cost 3 vsldoi4 <3,0,u,5>, RHS
+  2717601754U,	// <0,u,5,5>: Cost 3 vsldoi8 <5,5,0,u>, <5,5,0,u>
+  1685248154U,	// <0,u,5,6>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2302315848U,	// <0,u,5,7>: Cost 3 vmrglw <3,4,0,5>, RHS
+  1685248172U,	// <0,u,5,u>: Cost 2 vsldoi12 <1,2,3,0>, RHS
+  2759358645U,	// <0,u,6,0>: Cost 3 vsldoi12 <1,2,u,0>, <u,6,0,7>
+  2218637102U,	// <0,u,6,1>: Cost 3 vmrghw <0,6,0,7>, LHS
+  2724901370U,	// <0,u,6,2>: Cost 3 vsldoi8 <6,7,0,u>, <6,2,7,3>
+  2758990032U,	// <0,u,6,3>: Cost 3 vsldoi12 <1,2,3,0>, <u,6,3,7>
+  2659691830U,	// <0,u,6,4>: Cost 3 vsldoi4 <7,0,u,6>, RHS
+  2659471458U,	// <0,u,6,5>: Cost 3 vsldoi4 <7,0,5,6>, <5,6,7,0>
+  2724901688U,	// <0,u,6,6>: Cost 3 vsldoi8 <6,7,0,u>, <6,6,6,6>
+  1651159893U,	// <0,u,6,7>: Cost 2 vsldoi8 <6,7,0,u>, <6,7,0,u>
+  1651823526U,	// <0,u,6,u>: Cost 2 vsldoi8 <6,u,0,u>, <6,u,0,u>
+  2785827072U,	// <0,u,7,0>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,0,1>
+  2803964168U,	// <0,u,7,1>: Cost 3 vsldoi12 <u,7,1,0>, <u,7,1,0>
+  2727556249U,	// <0,u,7,2>: Cost 3 vsldoi8 <7,2,0,u>, <7,2,0,u>
+  2303656092U,	// <0,u,7,3>: Cost 3 vmrglw <3,6,0,7>, LHS
+  2785827112U,	// <0,u,7,4>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,4,5>
+  2785827122U,	// <0,u,7,5>: Cost 3 vsldoi12 <5,6,7,0>, <u,7,5,6>
+  2730210781U,	// <0,u,7,6>: Cost 3 vsldoi8 <7,6,0,u>, <7,6,0,u>
+  2303659336U,	// <0,u,7,7>: Cost 3 vmrglw <3,6,0,7>, RHS
+  2303656097U,	// <0,u,7,u>: Cost 3 vmrglw <3,6,0,7>, LHS
+  202162278U,	// <0,u,u,0>: Cost 1 vspltisw0 LHS
+  72595246U,	// <0,u,u,1>: Cost 1 vmrghw LHS, LHS
+  1146337160U,	// <0,u,u,2>: Cost 2 vmrghw LHS, <u,2,3,3>
+  835584U,	// <0,u,u,3>: Cost 0 copy LHS
+  1146337343U,	// <0,u,u,4>: Cost 2 vmrghw LHS, <u,4,5,6>
+  72595610U,	// <0,u,u,5>: Cost 1 vmrghw LHS, RHS
+  1146337488U,	// <0,u,u,6>: Cost 2 vmrghw LHS, <u,6,3,7>
+  1221963080U,	// <0,u,u,7>: Cost 2 vmrglw <2,3,0,u>, RHS
+  835584U,	// <0,u,u,u>: Cost 0 copy LHS
+  2756853760U,	// <1,0,0,0>: Cost 3 vsldoi12 <0,u,1,1>, <0,0,0,0>
+  1677803530U,	// <1,0,0,1>: Cost 2 vsldoi12 <0,0,1,1>, <0,0,1,1>
+  3759497387U,	// <1,0,0,2>: Cost 4 vsldoi8 <0,2,1,0>, <0,2,1,0>
+  2686419196U,	// <1,0,0,3>: Cost 3 vsldoi8 <0,3,1,0>, <0,3,1,0>
+  2751766565U,	// <1,0,0,4>: Cost 3 vsldoi12 <0,0,4,1>, <0,0,4,1>
+  2687746462U,	// <1,0,0,5>: Cost 3 vsldoi8 <0,5,1,0>, <0,5,1,0>
+  3776086518U,	// <1,0,0,6>: Cost 4 vsldoi8 <3,0,1,0>, <0,6,1,7>
+  2689073728U,	// <1,0,0,7>: Cost 3 vsldoi8 <0,7,1,0>, <0,7,1,0>
+  1678319689U,	// <1,0,0,u>: Cost 2 vsldoi12 <0,0,u,1>, <0,0,u,1>
+  2287091712U,	// <1,0,1,0>: Cost 3 vmrglw <0,u,1,1>, <0,0,0,0>
+  1147568230U,	// <1,0,1,1>: Cost 2 vmrghw <1,1,1,1>, LHS
+  1683112038U,	// <1,0,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  3294970108U,	// <1,0,1,3>: Cost 4 vmrghw <1,1,0,0>, <0,3,1,0>
+  2623892790U,	// <1,0,1,4>: Cost 3 vsldoi4 <1,1,0,1>, RHS
+  2647781007U,	// <1,0,1,5>: Cost 3 vsldoi4 <5,1,0,1>, <5,1,0,1>
+  2791948430U,	// <1,0,1,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  3721524218U,	// <1,0,1,7>: Cost 4 vsldoi4 <5,1,0,1>, <7,0,1,2>
+  1683112092U,	// <1,0,1,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2222112768U,	// <1,0,2,0>: Cost 3 vmrghw <1,2,3,0>, <0,0,0,0>
+  1148371046U,	// <1,0,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  3356862524U,	// <1,0,2,2>: Cost 4 vmrglw <0,2,1,2>, <2,u,0,2>
+  2702345894U,	// <1,0,2,3>: Cost 3 vsldoi8 <3,0,1,0>, <2,3,0,1>
+  2222113106U,	// <1,0,2,4>: Cost 3 vmrghw <1,2,3,0>, <0,4,1,5>
+  2299709908U,	// <1,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  3760162746U,	// <1,0,2,6>: Cost 4 vsldoi8 <0,3,1,0>, <2,6,3,7>
+  3369470584U,	// <1,0,2,7>: Cost 4 vmrglw <2,3,1,2>, <3,6,0,7>
+  1148371613U,	// <1,0,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  2686421142U,	// <1,0,3,0>: Cost 3 vsldoi8 <0,3,1,0>, <3,0,1,2>
+  2283128486U,	// <1,0,3,1>: Cost 3 vmrglw <0,2,1,3>, <2,3,0,1>
+  3296305326U,	// <1,0,3,2>: Cost 4 vmrghw <1,3,0,1>, <0,2,1,3>
+  3760163199U,	// <1,0,3,3>: Cost 4 vsldoi8 <0,3,1,0>, <3,3,0,1>
+  3760163330U,	// <1,0,3,4>: Cost 4 vsldoi8 <0,3,1,0>, <3,4,5,6>
+  3779406377U,	// <1,0,3,5>: Cost 4 vsldoi8 <3,5,1,0>, <3,5,1,0>
+  3865690416U,	// <1,0,3,6>: Cost 4 vsldoi12 <6,7,0,1>, <0,3,6,7>
+  3366824568U,	// <1,0,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,0,7>
+  2707655452U,	// <1,0,3,u>: Cost 3 vsldoi8 <3,u,1,0>, <3,u,1,0>
+  2734861202U,	// <1,0,4,0>: Cost 3 vsldoi8 <u,4,1,0>, <4,0,5,1>
+  2756854098U,	// <1,0,4,1>: Cost 3 vsldoi12 <0,u,1,1>, <0,4,1,5>
+  3830595931U,	// <1,0,4,2>: Cost 5 vsldoi12 <0,u,1,1>, <0,4,2,5>
+  3296968960U,	// <1,0,4,3>: Cost 4 vmrghw <1,4,0,1>, <0,3,1,4>
+  3830595949U,	// <1,0,4,4>: Cost 4 vsldoi12 <0,u,1,1>, <0,4,4,5>
+  2686422326U,	// <1,0,4,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  3297378806U,	// <1,0,4,6>: Cost 5 vmrghw <1,4,5,6>, <0,6,1,7>
+  3810594248U,	// <1,0,4,7>: Cost 4 vsldoi8 <u,7,1,0>, <4,7,5,0>
+  2686422569U,	// <1,0,4,u>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2284470272U,	// <1,0,5,0>: Cost 3 vmrglw <0,4,1,5>, <0,0,0,0>
+  2284471974U,	// <1,0,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,1>
+  3809267435U,	// <1,0,5,2>: Cost 4 vsldoi8 <u,5,1,0>, <5,2,1,3>
+  3297968384U,	// <1,0,5,3>: Cost 4 vmrghw <1,5,4,6>, <0,3,1,4>
+  2284471977U,	// <1,0,5,4>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,4>
+  3721555603U,	// <1,0,5,5>: Cost 4 vsldoi4 <5,1,0,5>, <5,1,0,5>
+  3792679010U,	// <1,0,5,6>: Cost 4 vsldoi8 <5,7,1,0>, <5,6,7,0>
+  3792679037U,	// <1,0,5,7>: Cost 4 vsldoi8 <5,7,1,0>, <5,7,1,0>
+  2284471981U,	// <1,0,5,u>: Cost 3 vmrglw <0,4,1,5>, <2,3,0,u>
+  3356893184U,	// <1,0,6,0>: Cost 4 vmrglw <0,2,1,6>, <0,0,0,0>
+  2224676966U,	// <1,0,6,1>: Cost 3 vmrghw <1,6,1,7>, LHS
+  3298295985U,	// <1,0,6,2>: Cost 4 vmrghw <1,6,0,1>, <0,2,1,6>
+  3298345212U,	// <1,0,6,3>: Cost 4 vmrghw <1,6,0,7>, <0,3,1,0>
+  2224972114U,	// <1,0,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  3808604907U,	// <1,0,6,5>: Cost 4 vsldoi8 <u,4,1,0>, <6,5,7,1>
+  3799978808U,	// <1,0,6,6>: Cost 4 vsldoi8 <7,0,1,0>, <6,6,6,6>
+  2726237006U,	// <1,0,6,7>: Cost 3 vsldoi8 <7,0,1,0>, <6,7,0,1>
+  2224677522U,	// <1,0,6,u>: Cost 3 vmrghw <1,6,1,7>, <0,u,1,1>
+  2726237176U,	// <1,0,7,0>: Cost 3 vsldoi8 <7,0,1,0>, <7,0,1,0>
+  2285815462U,	// <1,0,7,1>: Cost 3 vmrglw <0,6,1,7>, <2,3,0,1>
+  3805951193U,	// <1,0,7,2>: Cost 4 vsldoi8 <u,0,1,0>, <7,2,u,0>
+  3807941859U,	// <1,0,7,3>: Cost 4 vsldoi8 <u,3,1,0>, <7,3,0,1>
+  3799979366U,	// <1,0,7,4>: Cost 4 vsldoi8 <7,0,1,0>, <7,4,5,6>
+  3803297165U,	// <1,0,7,5>: Cost 4 vsldoi8 <7,5,1,0>, <7,5,1,0>
+  3799979540U,	// <1,0,7,6>: Cost 4 vsldoi8 <7,0,1,0>, <7,6,7,0>
+  3799979628U,	// <1,0,7,7>: Cost 4 vsldoi8 <7,0,1,0>, <7,7,7,7>
+  2731546240U,	// <1,0,7,u>: Cost 3 vsldoi8 <7,u,1,0>, <7,u,1,0>
+  2284494848U,	// <1,0,u,0>: Cost 3 vmrglw <0,4,1,u>, <0,0,0,0>
+  1683112594U,	// <1,0,u,1>: Cost 2 vsldoi12 <0,u,1,1>, <0,u,1,1>
+  1683112605U,	// <1,0,u,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  2734200772U,	// <1,0,u,3>: Cost 3 vsldoi8 <u,3,1,0>, <u,3,1,0>
+  2757075629U,	// <1,0,u,4>: Cost 3 vsldoi12 <0,u,4,1>, <0,u,4,1>
+  2686425242U,	// <1,0,u,5>: Cost 3 vsldoi8 <0,3,1,0>, RHS
+  2791948430U,	// <1,0,u,6>: Cost 3 vsldoi12 <6,7,0,1>, <0,1,6,7>
+  2736855304U,	// <1,0,u,7>: Cost 3 vsldoi8 <u,7,1,0>, <u,7,1,0>
+  1683112659U,	// <1,0,u,u>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1610694666U,	// <1,1,0,0>: Cost 2 vsldoi8 <0,0,1,1>, <0,0,1,1>
+  1616003174U,	// <1,1,0,1>: Cost 2 vsldoi8 <0,u,1,1>, LHS
+  2283767958U,	// <1,1,0,2>: Cost 3 vmrglw <0,3,1,0>, <3,0,1,2>
+  3357507596U,	// <1,1,0,3>: Cost 4 vmrglw <0,3,1,0>, <0,0,1,3>
+  2689745234U,	// <1,1,0,4>: Cost 3 vsldoi8 <0,u,1,1>, <0,4,1,5>
+  3357507922U,	// <1,1,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,1,5>
+  3294397647U,	// <1,1,0,6>: Cost 4 vmrghw <1,0,1,2>, <1,6,1,7>
+  3373433334U,	// <1,1,0,7>: Cost 4 vmrglw <3,0,1,0>, <0,6,1,7>
+  1616003730U,	// <1,1,0,u>: Cost 2 vsldoi8 <0,u,1,1>, <0,u,1,1>
+  1550221414U,	// <1,1,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,1,1>: Cost 1 vspltisw1 LHS
+  2287093910U,	// <1,1,1,2>: Cost 3 vmrglw <0,u,1,1>, <3,0,1,2>
+  2287092615U,	// <1,1,1,3>: Cost 3 vmrglw <0,u,1,1>, <1,2,1,3>
+  1550224694U,	// <1,1,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  2287092050U,	// <1,1,1,5>: Cost 3 vmrglw <0,u,1,1>, <0,4,1,5>
+  2689746127U,	// <1,1,1,6>: Cost 3 vsldoi8 <0,u,1,1>, <1,6,1,7>
+  2659800138U,	// <1,1,1,7>: Cost 3 vsldoi4 <7,1,1,1>, <7,1,1,1>
+  269271142U,	// <1,1,1,u>: Cost 1 vspltisw1 LHS
+  2222113516U,	// <1,1,2,0>: Cost 3 vmrghw <1,2,3,0>, <1,0,2,1>
+  2756854663U,	// <1,1,2,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,2,1,3>
+  1148371862U,	// <1,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689746598U,	// <1,1,2,3>: Cost 3 vsldoi8 <0,u,1,1>, <2,3,0,1>
+  2618002742U,	// <1,1,2,4>: Cost 3 vsldoi4 <0,1,1,2>, RHS
+  2299707730U,	// <1,1,2,5>: Cost 3 vmrglw <3,0,1,2>, <0,4,1,5>
+  2689746874U,	// <1,1,2,6>: Cost 3 vsldoi8 <0,u,1,1>, <2,6,3,7>
+  3361506511U,	// <1,1,2,7>: Cost 4 vmrglw <1,0,1,2>, <1,6,1,7>
+  1148371862U,	// <1,1,2,u>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689747094U,	// <1,1,3,0>: Cost 3 vsldoi8 <0,u,1,1>, <3,0,1,2>
+  2691074278U,	// <1,1,3,1>: Cost 3 vsldoi8 <1,1,1,1>, <3,1,1,1>
+  3356870806U,	// <1,1,3,2>: Cost 4 vmrglw <0,2,1,3>, <3,0,1,2>
+  2283126958U,	// <1,1,3,3>: Cost 3 vmrglw <0,2,1,3>, <0,2,1,3>
+  2689747458U,	// <1,1,3,4>: Cost 3 vsldoi8 <0,u,1,1>, <3,4,5,6>
+  3356868946U,	// <1,1,3,5>: Cost 4 vmrglw <0,2,1,3>, <0,4,1,5>
+  3811265144U,	// <1,1,3,6>: Cost 4 vsldoi8 <u,u,1,1>, <3,6,0,7>
+  3362841807U,	// <1,1,3,7>: Cost 4 vmrglw <1,2,1,3>, <1,6,1,7>
+  2689747742U,	// <1,1,3,u>: Cost 3 vsldoi8 <0,u,1,1>, <3,u,1,2>
+  2623987814U,	// <1,1,4,0>: Cost 3 vsldoi4 <1,1,1,4>, LHS
+  2758181931U,	// <1,1,4,1>: Cost 3 vsldoi12 <1,1,1,1>, <1,4,1,5>
+  2223408022U,	// <1,1,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  3697731734U,	// <1,1,4,3>: Cost 4 vsldoi4 <1,1,1,4>, <3,0,1,2>
+  2283798784U,	// <1,1,4,4>: Cost 3 vmrglw <0,3,1,4>, <0,3,1,4>
+  1616006454U,	// <1,1,4,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  3297379535U,	// <1,1,4,6>: Cost 4 vmrghw <1,4,5,6>, <1,6,1,7>
+  3373466102U,	// <1,1,4,7>: Cost 4 vmrglw <3,0,1,4>, <0,6,1,7>
+  1616006697U,	// <1,1,4,u>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2760762479U,	// <1,1,5,0>: Cost 3 vsldoi12 <1,5,0,1>, <1,5,0,1>
+  2284470282U,	// <1,1,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,1>
+  2284472470U,	// <1,1,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,1,2>
+  3358212270U,	// <1,1,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,1,3>
+  2284470285U,	// <1,1,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,1,4>
+  1210728786U,	// <1,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2737524834U,	// <1,1,5,6>: Cost 3 vsldoi8 <u,u,1,1>, <5,6,7,0>
+  3360867535U,	// <1,1,5,7>: Cost 4 vmrglw <0,u,1,5>, <1,6,1,7>
+  1210728786U,	// <1,1,5,u>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  3697746022U,	// <1,1,6,0>: Cost 4 vsldoi4 <1,1,1,6>, LHS
+  2756854991U,	// <1,1,6,1>: Cost 3 vsldoi12 <0,u,1,1>, <1,6,1,7>
+  2737525242U,	// <1,1,6,2>: Cost 3 vsldoi8 <u,u,1,1>, <6,2,7,3>
+  3839149281U,	// <1,1,6,3>: Cost 4 vsldoi12 <2,3,0,1>, <1,6,3,7>
+  3697749302U,	// <1,1,6,4>: Cost 4 vsldoi4 <1,1,1,6>, RHS
+  3356893522U,	// <1,1,6,5>: Cost 4 vmrglw <0,2,1,6>, <0,4,1,5>
+  2283151537U,	// <1,1,6,6>: Cost 3 vmrglw <0,2,1,6>, <0,2,1,6>
+  2791949566U,	// <1,1,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <1,6,7,0>
+  2792613127U,	// <1,1,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <1,6,u,0>
+  2737525754U,	// <1,1,7,0>: Cost 3 vsldoi8 <u,u,1,1>, <7,0,1,2>
+  2291786386U,	// <1,1,7,1>: Cost 3 vmrglw <1,6,1,7>, <0,u,1,1>
+  3365528292U,	// <1,1,7,2>: Cost 4 vmrglw <1,6,1,7>, <1,0,1,2>
+  3365528455U,	// <1,1,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,1,3>
+  2737526118U,	// <1,1,7,4>: Cost 3 vsldoi8 <u,u,1,1>, <7,4,5,6>
+  3365527890U,	// <1,1,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,1,5>
+  3365528377U,	// <1,1,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,1,1,6>
+  2291786959U,	// <1,1,7,7>: Cost 3 vmrglw <1,6,1,7>, <1,6,1,7>
+  2737526402U,	// <1,1,7,u>: Cost 3 vsldoi8 <u,u,1,1>, <7,u,1,2>
+  1550221414U,	// <1,1,u,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,1,u,1>: Cost 1 vspltisw1 LHS
+  1148371862U,	// <1,1,u,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  2689750972U,	// <1,1,u,3>: Cost 3 vsldoi8 <0,u,1,1>, <u,3,0,1>
+  1550224694U,	// <1,1,u,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1616009370U,	// <1,1,u,5>: Cost 2 vsldoi8 <0,u,1,1>, RHS
+  2689751248U,	// <1,1,u,6>: Cost 3 vsldoi8 <0,u,1,1>, <u,6,3,7>
+  2736863497U,	// <1,1,u,7>: Cost 3 vsldoi8 <u,7,1,1>, <u,7,1,1>
+  269271142U,	// <1,1,u,u>: Cost 1 vspltisw1 LHS
+  2702360576U,	// <1,2,0,0>: Cost 3 vsldoi8 <3,0,1,2>, <0,0,0,0>
+  1628618854U,	// <1,2,0,1>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2685771949U,	// <1,2,0,2>: Cost 3 vsldoi8 <0,2,1,2>, <0,2,1,2>
+  2283765862U,	// <1,2,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  2702360914U,	// <1,2,0,4>: Cost 3 vsldoi8 <3,0,1,2>, <0,4,1,5>
+  3788046813U,	// <1,2,0,5>: Cost 4 vsldoi8 <5,0,1,2>, <0,5,u,0>
+  2688426481U,	// <1,2,0,6>: Cost 3 vsldoi8 <0,6,1,2>, <0,6,1,2>
+  2726249024U,	// <1,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1628619421U,	// <1,2,0,u>: Cost 2 vsldoi8 <3,0,1,2>, LHS
+  2690417380U,	// <1,2,1,0>: Cost 3 vsldoi8 <1,0,1,2>, <1,0,1,2>
+  2702361396U,	// <1,2,1,1>: Cost 3 vsldoi8 <3,0,1,2>, <1,1,1,1>
+  2287093352U,	// <1,2,1,2>: Cost 3 vmrglw <0,u,1,1>, <2,2,2,2>
+  1213349990U,	// <1,2,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  3764159522U,	// <1,2,1,4>: Cost 4 vsldoi8 <1,0,1,2>, <1,4,0,5>
+  3295053672U,	// <1,2,1,5>: Cost 4 vmrghw <1,1,1,1>, <2,5,3,6>
+  2221311930U,	// <1,2,1,6>: Cost 3 vmrghw <1,1,1,1>, <2,6,3,7>
+  3799991593U,	// <1,2,1,7>: Cost 4 vsldoi8 <7,0,1,2>, <1,7,2,7>
+  1213349995U,	// <1,2,1,u>: Cost 2 vmrglw <0,u,1,1>, LHS
+  2624045158U,	// <1,2,2,0>: Cost 3 vsldoi4 <1,1,2,2>, LHS
+  2702362144U,	// <1,2,2,1>: Cost 3 vsldoi8 <3,0,1,2>, <2,1,3,2>
+  2283120232U,	// <1,2,2,2>: Cost 3 vmrglw <0,2,1,2>, <2,2,2,2>
+  1225965670U,	// <1,2,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2624048438U,	// <1,2,2,4>: Cost 3 vsldoi4 <1,1,2,2>, RHS
+  3356860763U,	// <1,2,2,5>: Cost 4 vmrglw <0,2,1,2>, <0,4,2,5>
+  2222114746U,	// <1,2,2,6>: Cost 3 vmrghw <1,2,3,0>, <2,6,3,7>
+  2299708632U,	// <1,2,2,7>: Cost 3 vmrglw <3,0,1,2>, <1,6,2,7>
+  1225965675U,	// <1,2,2,u>: Cost 2 vmrglw <3,0,1,2>, LHS
+  470597734U,	// <1,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544340276U,	// <1,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544341096U,	// <1,2,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544341916U,	// <1,2,3,3>: Cost 2 vsldoi4 LHS, <3,3,3,3>
+  470601014U,	// <1,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592119300U,	// <1,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592119802U,	// <1,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592120314U,	// <1,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470603566U,	// <1,2,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708335471U,	// <1,2,4,0>: Cost 3 vsldoi8 <4,0,1,2>, <4,0,1,2>
+  3838043908U,	// <1,2,4,1>: Cost 4 vsldoi12 <2,1,3,1>, <2,4,1,5>
+  3357541992U,	// <1,2,4,2>: Cost 4 vmrglw <0,3,1,4>, <2,2,2,2>
+  2283798630U,	// <1,2,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2726251728U,	// <1,2,4,4>: Cost 3 vsldoi8 <7,0,1,2>, <4,4,4,4>
+  1628622134U,	// <1,2,4,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  3297077178U,	// <1,2,4,6>: Cost 4 vmrghw <1,4,1,5>, <2,6,3,7>
+  2726251976U,	// <1,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1628622377U,	// <1,2,4,u>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  2714308168U,	// <1,2,5,0>: Cost 3 vsldoi8 <5,0,1,2>, <5,0,1,2>
+  3297633827U,	// <1,2,5,1>: Cost 4 vmrghw <1,5,0,1>, <2,1,3,5>
+  2284471912U,	// <1,2,5,2>: Cost 3 vmrglw <0,4,1,5>, <2,2,2,2>
+  1210728550U,	// <1,2,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  3776106420U,	// <1,2,5,4>: Cost 4 vsldoi8 <3,0,1,2>, <5,4,5,6>
+  2726252548U,	// <1,2,5,5>: Cost 3 vsldoi8 <7,0,1,2>, <5,5,5,5>
+  2726252642U,	// <1,2,5,6>: Cost 3 vsldoi8 <7,0,1,2>, <5,6,7,0>
+  3799994538U,	// <1,2,5,7>: Cost 4 vsldoi8 <7,0,1,2>, <5,7,6,0>
+  1210728555U,	// <1,2,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720280865U,	// <1,2,6,0>: Cost 3 vsldoi8 <6,0,1,2>, <6,0,1,2>
+  2702365096U,	// <1,2,6,1>: Cost 3 vsldoi8 <3,0,1,2>, <6,1,7,2>
+  2726253050U,	// <1,2,6,2>: Cost 3 vsldoi8 <7,0,1,2>, <6,2,7,3>
+  2283151462U,	// <1,2,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  3697823030U,	// <1,2,6,4>: Cost 4 vsldoi4 <1,1,2,6>, RHS
+  3298715497U,	// <1,2,6,5>: Cost 4 vmrghw <1,6,5,7>, <2,5,3,7>
+  2726253368U,	// <1,2,6,6>: Cost 3 vsldoi8 <7,0,1,2>, <6,6,6,6>
+  2724926296U,	// <1,2,6,7>: Cost 3 vsldoi8 <6,7,1,2>, <6,7,1,2>
+  2283151467U,	// <1,2,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652511738U,	// <1,2,7,0>: Cost 2 vsldoi8 <7,0,1,2>, <7,0,1,2>
+  3371500916U,	// <1,2,7,1>: Cost 4 vmrglw <2,6,1,7>, <1,u,2,1>
+  3365529192U,	// <1,2,7,2>: Cost 4 vmrglw <1,6,1,7>, <2,2,2,2>
+  2291785830U,	// <1,2,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2726253926U,	// <1,2,7,4>: Cost 3 vsldoi8 <7,0,1,2>, <7,4,5,6>
+  3788051845U,	// <1,2,7,5>: Cost 4 vsldoi8 <5,0,1,2>, <7,5,0,1>
+  3794023894U,	// <1,2,7,6>: Cost 4 vsldoi8 <6,0,1,2>, <7,6,0,1>
+  2726254119U,	// <1,2,7,7>: Cost 3 vsldoi8 <7,0,1,2>, <7,7,0,1>
+  1657820802U,	// <1,2,7,u>: Cost 2 vsldoi8 <7,u,1,2>, <7,u,1,2>
+  470638699U,	// <1,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1544381236U,	// <1,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544382056U,	// <1,2,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544382614U,	// <1,2,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  470641974U,	// <1,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1628625050U,	// <1,2,u,5>: Cost 2 vsldoi8 <3,0,1,2>, RHS
+  1592160762U,	// <1,2,u,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592161274U,	// <1,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  470644526U,	// <1,2,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2769389708U,	// <1,3,0,0>: Cost 3 vsldoi12 <3,0,0,1>, <3,0,0,1>
+  2685780070U,	// <1,3,0,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2685780142U,	// <1,3,0,2>: Cost 3 vsldoi8 <0,2,1,3>, <0,2,1,3>
+  2686443775U,	// <1,3,0,3>: Cost 3 vsldoi8 <0,3,1,3>, <0,3,1,3>
+  2769684656U,	// <1,3,0,4>: Cost 3 vsldoi12 <3,0,4,1>, <3,0,4,1>
+  3357507940U,	// <1,3,0,5>: Cost 4 vmrglw <0,3,1,0>, <0,4,3,5>
+  3759522294U,	// <1,3,0,6>: Cost 4 vsldoi8 <0,2,1,3>, <0,6,1,7>
+  3357509562U,	// <1,3,0,7>: Cost 4 vmrglw <0,3,1,0>, <2,6,3,7>
+  2685780637U,	// <1,3,0,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2287092630U,	// <1,3,1,0>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,0>
+  2221312230U,	// <1,3,1,1>: Cost 3 vmrghw <1,1,1,1>, <3,1,1,1>
+  2691752839U,	// <1,3,1,2>: Cost 3 vsldoi8 <1,2,1,3>, <1,2,1,3>
+  2287093362U,	// <1,3,1,3>: Cost 3 vmrglw <0,u,1,1>, <2,2,3,3>
+  2287092634U,	// <1,3,1,4>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,4>
+  3360835107U,	// <1,3,1,5>: Cost 4 vmrglw <0,u,1,1>, <2,1,3,5>
+  3759523041U,	// <1,3,1,6>: Cost 4 vsldoi8 <0,2,1,3>, <1,6,3,7>
+  2287093690U,	// <1,3,1,7>: Cost 3 vmrglw <0,u,1,1>, <2,6,3,7>
+  2287092638U,	// <1,3,1,u>: Cost 3 vmrglw <0,u,1,1>, <1,2,3,u>
+  2222114966U,	// <1,3,2,0>: Cost 3 vmrghw <1,2,3,0>, <3,0,1,2>
+  2222115057U,	// <1,3,2,1>: Cost 3 vmrghw <1,2,3,0>, <3,1,2,3>
+  2630092320U,	// <1,3,2,2>: Cost 3 vsldoi4 <2,1,3,2>, <2,1,3,2>
+  2685781670U,	// <1,3,2,3>: Cost 3 vsldoi8 <0,2,1,3>, <2,3,0,1>
+  2222115330U,	// <1,3,2,4>: Cost 3 vmrghw <1,2,3,0>, <3,4,5,6>
+  3373449572U,	// <1,3,2,5>: Cost 4 vmrglw <3,0,1,2>, <0,4,3,5>
+  2222115448U,	// <1,3,2,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2299709370U,	// <1,3,2,7>: Cost 3 vmrglw <3,0,1,2>, <2,6,3,7>
+  2222115614U,	// <1,3,2,u>: Cost 3 vmrghw <1,2,3,0>, <3,u,1,2>
+  2771380607U,	// <1,3,3,0>: Cost 3 vsldoi12 <3,3,0,1>, <3,3,0,1>
+  3356874468U,	// <1,3,3,1>: Cost 4 vmrglw <0,2,1,3>, <u,0,3,1>
+  3759524168U,	// <1,3,3,2>: Cost 4 vsldoi8 <0,2,1,3>, <3,2,3,0>
+  2283792796U,	// <1,3,3,3>: Cost 3 vmrglw <0,3,1,3>, <3,3,3,3>
+  3356869530U,	// <1,3,3,4>: Cost 4 vmrglw <0,2,1,3>, <1,2,3,4>
+  3721760428U,	// <1,3,3,5>: Cost 4 vsldoi4 <5,1,3,3>, <5,1,3,3>
+  3296496248U,	// <1,3,3,6>: Cost 4 vmrghw <1,3,2,6>, <3,6,0,7>
+  3356870586U,	// <1,3,3,7>: Cost 4 vmrglw <0,2,1,3>, <2,6,3,7>
+  2771970503U,	// <1,3,3,u>: Cost 3 vsldoi12 <3,3,u,1>, <3,3,u,1>
+  2772044240U,	// <1,3,4,0>: Cost 3 vsldoi12 <3,4,0,1>, <3,4,0,1>
+  3362186135U,	// <1,3,4,1>: Cost 4 vmrglw <1,1,1,4>, <1,2,3,1>
+  3297151280U,	// <1,3,4,2>: Cost 4 vmrghw <1,4,2,5>, <3,2,0,3>
+  3357542002U,	// <1,3,4,3>: Cost 4 vmrglw <0,3,1,4>, <2,2,3,3>
+  3357540626U,	// <1,3,4,4>: Cost 4 vmrglw <0,3,1,4>, <0,3,3,4>
+  2685783350U,	// <1,3,4,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  3357546622U,	// <1,3,4,6>: Cost 4 vmrglw <0,3,1,4>, <u,5,3,6>
+  3357542330U,	// <1,3,4,7>: Cost 4 vmrglw <0,3,1,4>, <2,6,3,7>
+  2685783593U,	// <1,3,4,u>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2284471190U,	// <1,3,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,0>
+  3358213015U,	// <1,3,5,1>: Cost 4 vmrglw <0,4,1,5>, <1,2,3,1>
+  2630116899U,	// <1,3,5,2>: Cost 3 vsldoi4 <2,1,3,5>, <2,1,3,5>
+  2284471922U,	// <1,3,5,3>: Cost 3 vmrglw <0,4,1,5>, <2,2,3,3>
+  2284471194U,	// <1,3,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,4>
+  2284471843U,	// <1,3,5,5>: Cost 3 vmrglw <0,4,1,5>, <2,1,3,5>
+  3358218366U,	// <1,3,5,6>: Cost 4 vmrglw <0,4,1,5>, <u,5,3,6>
+  2284472250U,	// <1,3,5,7>: Cost 3 vmrglw <0,4,1,5>, <2,6,3,7>
+  2284471198U,	// <1,3,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,3,u>
+  2224752790U,	// <1,3,6,0>: Cost 3 vmrghw <1,6,2,7>, <3,0,1,2>
+  3832736385U,	// <1,3,6,1>: Cost 4 vsldoi12 <1,2,3,1>, <3,6,1,7>
+  3703866916U,	// <1,3,6,2>: Cost 4 vsldoi4 <2,1,3,6>, <2,1,3,6>
+  3356894834U,	// <1,3,6,3>: Cost 4 vmrglw <0,2,1,6>, <2,2,3,3>
+  3356894106U,	// <1,3,6,4>: Cost 4 vmrglw <0,2,1,6>, <1,2,3,4>
+  3356894755U,	// <1,3,6,5>: Cost 5 vmrglw <0,2,1,6>, <2,1,3,5>
+  3356899130U,	// <1,3,6,6>: Cost 4 vmrglw <0,2,1,6>, <u,1,3,6>
+  2283153338U,	// <1,3,6,7>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2283153338U,	// <1,3,6,u>: Cost 3 vmrglw <0,2,1,6>, <2,6,3,7>
+  2774035139U,	// <1,3,7,0>: Cost 3 vsldoi12 <3,7,0,1>, <3,7,0,1>
+  3703874767U,	// <1,3,7,1>: Cost 4 vsldoi4 <2,1,3,7>, <1,6,1,7>
+  3703875109U,	// <1,3,7,2>: Cost 4 vsldoi4 <2,1,3,7>, <2,1,3,7>
+  3365529202U,	// <1,3,7,3>: Cost 4 vmrglw <1,6,1,7>, <2,2,3,3>
+  3365528474U,	// <1,3,7,4>: Cost 4 vmrglw <1,6,1,7>, <1,2,3,4>
+  3789387159U,	// <1,3,7,5>: Cost 4 vsldoi8 <5,2,1,3>, <7,5,2,1>
+  3865692927U,	// <1,3,7,6>: Cost 4 vsldoi12 <6,7,0,1>, <3,7,6,7>
+  3363538874U,	// <1,3,7,7>: Cost 4 vmrglw <1,3,1,7>, <2,6,3,7>
+  2774625035U,	// <1,3,7,u>: Cost 3 vsldoi12 <3,7,u,1>, <3,7,u,1>
+  2284495766U,	// <1,3,u,0>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,0>
+  2685785902U,	// <1,3,u,1>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2630141478U,	// <1,3,u,2>: Cost 3 vsldoi4 <2,1,3,u>, <2,1,3,u>
+  2283169880U,	// <1,3,u,3>: Cost 3 vmrglw <0,2,1,u>, <2,u,3,3>
+  2284495770U,	// <1,3,u,4>: Cost 3 vmrglw <0,4,1,u>, <1,2,3,4>
+  2685786266U,	// <1,3,u,5>: Cost 3 vsldoi8 <0,2,1,3>, RHS
+  2222115448U,	// <1,3,u,6>: Cost 3 vmrghw <1,2,3,0>, <3,6,0,7>
+  2284496826U,	// <1,3,u,7>: Cost 3 vmrglw <0,4,1,u>, <2,6,3,7>
+  2685786469U,	// <1,3,u,u>: Cost 3 vsldoi8 <0,2,1,3>, LHS
+  2684461069U,	// <1,4,0,0>: Cost 3 vsldoi8 <0,0,1,4>, <0,0,1,4>
+  2686451814U,	// <1,4,0,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  3759530159U,	// <1,4,0,2>: Cost 4 vsldoi8 <0,2,1,4>, <0,2,1,4>
+  2686451968U,	// <1,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2684461394U,	// <1,4,0,4>: Cost 3 vsldoi8 <0,0,1,4>, <0,4,1,5>
+  1701989266U,	// <1,4,0,5>: Cost 2 vsldoi12 <4,0,5,1>, <4,0,5,1>
+  3776119286U,	// <1,4,0,6>: Cost 4 vsldoi8 <3,0,1,4>, <0,6,1,7>
+  2689106500U,	// <1,4,0,7>: Cost 3 vsldoi8 <0,7,1,4>, <0,7,1,4>
+  1702210477U,	// <1,4,0,u>: Cost 2 vsldoi12 <4,0,u,1>, <4,0,u,1>
+  2221312914U,	// <1,4,1,0>: Cost 3 vmrghw <1,1,1,1>, <4,0,5,1>
+  2691097399U,	// <1,4,1,1>: Cost 3 vsldoi8 <1,1,1,4>, <1,1,1,4>
+  3760194454U,	// <1,4,1,2>: Cost 4 vsldoi8 <0,3,1,4>, <1,2,3,0>
+  3766166489U,	// <1,4,1,3>: Cost 4 vsldoi8 <1,3,1,4>, <1,3,1,4>
+  2334870736U,	// <1,4,1,4>: Cost 3 vmrglw <u,u,1,1>, <4,4,4,4>
+  1147571510U,	// <1,4,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  3760194794U,	// <1,4,1,6>: Cost 4 vsldoi8 <0,3,1,4>, <1,6,4,7>
+  3867315188U,	// <1,4,1,7>: Cost 4 vsldoi12 <7,0,4,1>, <4,1,7,0>
+  1147571753U,	// <1,4,1,u>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2222115730U,	// <1,4,2,0>: Cost 3 vmrghw <1,2,3,0>, <4,0,5,1>
+  2222115812U,	// <1,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  3760195176U,	// <1,4,2,2>: Cost 4 vsldoi8 <0,3,1,4>, <2,2,2,2>
+  2702378662U,	// <1,4,2,3>: Cost 3 vsldoi8 <3,0,1,4>, <2,3,0,1>
+  2323598544U,	// <1,4,2,4>: Cost 3 vmrglw <7,0,1,2>, <4,4,4,4>
+  1148374326U,	// <1,4,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  3760195514U,	// <1,4,2,6>: Cost 4 vsldoi8 <0,3,1,4>, <2,6,3,7>
+  3373451932U,	// <1,4,2,7>: Cost 4 vmrglw <3,0,1,2>, <3,6,4,7>
+  1148374569U,	// <1,4,2,u>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2702379160U,	// <1,4,3,0>: Cost 3 vsldoi8 <3,0,1,4>, <3,0,1,4>
+  3760195840U,	// <1,4,3,1>: Cost 4 vsldoi8 <0,3,1,4>, <3,1,4,0>
+  3776121160U,	// <1,4,3,2>: Cost 4 vsldoi8 <3,0,1,4>, <3,2,3,0>
+  3760195996U,	// <1,4,3,3>: Cost 4 vsldoi8 <0,3,1,4>, <3,3,3,3>
+  2686454274U,	// <1,4,3,4>: Cost 3 vsldoi8 <0,3,1,4>, <3,4,5,6>
+  3356870350U,	// <1,4,3,5>: Cost 4 vmrglw <0,2,1,3>, <2,3,4,5>
+  3800009392U,	// <1,4,3,6>: Cost 4 vsldoi8 <7,0,1,4>, <3,6,7,0>
+  3366824604U,	// <1,4,3,7>: Cost 5 vmrglw <1,u,1,3>, <3,6,4,7>
+  2707688224U,	// <1,4,3,u>: Cost 3 vsldoi8 <3,u,1,4>, <3,u,1,4>
+  2775731368U,	// <1,4,4,0>: Cost 3 vsldoi12 <4,0,5,1>, <4,4,0,0>
+  3830820018U,	// <1,4,4,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,4,1,1>
+  3691980454U,	// <1,4,4,2>: Cost 4 vsldoi4 <0,1,4,4>, <2,3,0,1>
+  3357541282U,	// <1,4,4,3>: Cost 4 vmrglw <0,3,1,4>, <1,2,4,3>
+  2781039824U,	// <1,4,4,4>: Cost 3 vsldoi12 <4,u,5,1>, <4,4,4,4>
+  2686455094U,	// <1,4,4,5>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  3357541528U,	// <1,4,4,6>: Cost 4 vmrglw <0,3,1,4>, <1,5,4,6>
+  3810627020U,	// <1,4,4,7>: Cost 4 vsldoi8 <u,7,1,4>, <4,7,5,4>
+  2686455337U,	// <1,4,4,u>: Cost 3 vsldoi8 <0,3,1,4>, RHS
+  2624217190U,	// <1,4,5,0>: Cost 3 vsldoi4 <1,1,4,5>, LHS
+  2284470309U,	// <1,4,5,1>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,1>
+  2618246822U,	// <1,4,5,2>: Cost 3 vsldoi4 <0,1,4,5>, <2,3,0,1>
+  3358212297U,	// <1,4,5,3>: Cost 4 vmrglw <0,4,1,5>, <0,2,4,3>
+  2284470312U,	// <1,4,5,4>: Cost 3 vmrglw <0,4,1,5>, <0,0,4,4>
+  2284470637U,	// <1,4,5,5>: Cost 3 vmrglw <0,4,1,5>, <0,4,4,5>
+  1683115318U,	// <1,4,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3721851898U,	// <1,4,5,7>: Cost 4 vsldoi4 <5,1,4,5>, <7,0,1,2>
+  1683115336U,	// <1,4,5,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  3794039075U,	// <1,4,6,0>: Cost 4 vsldoi8 <6,0,1,4>, <6,0,1,4>
+  3830820186U,	// <1,4,6,1>: Cost 4 vsldoi12 <0,u,4,1>, <4,6,1,7>
+  3800011258U,	// <1,4,6,2>: Cost 4 vsldoi8 <7,0,1,4>, <6,2,7,3>
+  3807973938U,	// <1,4,6,3>: Cost 4 vsldoi8 <u,3,1,4>, <6,3,4,5>
+  3298716880U,	// <1,4,6,4>: Cost 4 vmrghw <1,6,5,7>, <4,4,4,4>
+  2224680246U,	// <1,4,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  3800011576U,	// <1,4,6,6>: Cost 4 vsldoi8 <7,0,1,4>, <6,6,6,6>
+  2726269774U,	// <1,4,6,7>: Cost 3 vsldoi8 <7,0,1,4>, <6,7,0,1>
+  2224680489U,	// <1,4,6,u>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726269948U,	// <1,4,7,0>: Cost 3 vsldoi8 <7,0,1,4>, <7,0,1,4>
+  3383444141U,	// <1,4,7,1>: Cost 4 vmrglw <4,6,1,7>, <0,u,4,1>
+  3805983961U,	// <1,4,7,2>: Cost 4 vsldoi8 <u,0,1,4>, <7,2,u,0>
+  3807974667U,	// <1,4,7,3>: Cost 4 vsldoi8 <u,3,1,4>, <7,3,4,5>
+  2736887142U,	// <1,4,7,4>: Cost 3 vsldoi8 <u,7,1,4>, <7,4,5,6>
+  3365528403U,	// <1,4,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,1,4,5>
+  3800012308U,	// <1,4,7,6>: Cost 4 vsldoi8 <7,0,1,4>, <7,6,7,0>
+  3800012396U,	// <1,4,7,7>: Cost 4 vsldoi8 <7,0,1,4>, <7,7,7,7>
+  2731579012U,	// <1,4,7,u>: Cost 3 vsldoi8 <7,u,1,4>, <7,u,1,4>
+  2624241766U,	// <1,4,u,0>: Cost 3 vsldoi4 <1,1,4,u>, LHS
+  2686457646U,	// <1,4,u,1>: Cost 3 vsldoi8 <0,3,1,4>, LHS
+  2618271398U,	// <1,4,u,2>: Cost 3 vsldoi4 <0,1,4,u>, <2,3,0,1>
+  2734233544U,	// <1,4,u,3>: Cost 3 vsldoi8 <u,3,1,4>, <u,3,1,4>
+  2689775679U,	// <1,4,u,4>: Cost 3 vsldoi8 <0,u,1,4>, <u,4,5,6>
+  1152355638U,	// <1,4,u,5>: Cost 2 vmrghw <1,u,3,0>, RHS
+  1683115561U,	// <1,4,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2736888076U,	// <1,4,u,7>: Cost 3 vsldoi8 <u,7,1,4>, <u,7,1,4>
+  1683115579U,	// <1,4,u,u>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  2687123456U,	// <1,5,0,0>: Cost 3 vsldoi8 <0,4,1,5>, <0,0,0,0>
+  1613381734U,	// <1,5,0,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759538352U,	// <1,5,0,2>: Cost 4 vsldoi8 <0,2,1,5>, <0,2,1,5>
+  3760865532U,	// <1,5,0,3>: Cost 4 vsldoi8 <0,4,1,5>, <0,3,1,0>
+  1613381970U,	// <1,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2687787427U,	// <1,5,0,5>: Cost 3 vsldoi8 <0,5,1,5>, <0,5,1,5>
+  2781777524U,	// <1,5,0,6>: Cost 3 vsldoi12 <5,0,6,1>, <5,0,6,1>
+  3733828717U,	// <1,5,0,7>: Cost 4 vsldoi4 <7,1,5,0>, <7,1,5,0>
+  1613382301U,	// <1,5,0,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2781040271U,	// <1,5,1,0>: Cost 3 vsldoi12 <4,u,5,1>, <5,1,0,1>
+  2687124276U,	// <1,5,1,1>: Cost 3 vsldoi8 <0,4,1,5>, <1,1,1,1>
+  2687124374U,	// <1,5,1,2>: Cost 3 vsldoi8 <0,4,1,5>, <1,2,3,0>
+  3760866297U,	// <1,5,1,3>: Cost 4 vsldoi8 <0,4,1,5>, <1,3,5,0>
+  2693096491U,	// <1,5,1,4>: Cost 3 vsldoi8 <1,4,1,5>, <1,4,1,5>
+  2687124591U,	// <1,5,1,5>: Cost 3 vsldoi8 <0,4,1,5>, <1,5,0,1>
+  2687124723U,	// <1,5,1,6>: Cost 3 vsldoi8 <0,4,1,5>, <1,6,5,7>
+  3360834803U,	// <1,5,1,7>: Cost 4 vmrglw <0,u,1,1>, <1,6,5,7>
+  2687124860U,	// <1,5,1,u>: Cost 3 vsldoi8 <0,4,1,5>, <1,u,3,0>
+  2323598792U,	// <1,5,2,0>: Cost 3 vmrglw <7,0,1,2>, <4,7,5,0>
+  2687125027U,	// <1,5,2,1>: Cost 3 vsldoi8 <0,4,1,5>, <2,1,3,5>
+  2687125096U,	// <1,5,2,2>: Cost 3 vsldoi8 <0,4,1,5>, <2,2,2,2>
+  2687125158U,	// <1,5,2,3>: Cost 3 vsldoi8 <0,4,1,5>, <2,3,0,1>
+  2642185188U,	// <1,5,2,4>: Cost 3 vsldoi4 <4,1,5,2>, <4,1,5,2>
+  2323598554U,	// <1,5,2,5>: Cost 3 vmrglw <7,0,1,2>, <4,4,5,5>
+  2687125434U,	// <1,5,2,6>: Cost 3 vsldoi8 <0,4,1,5>, <2,6,3,7>
+  3373450483U,	// <1,5,2,7>: Cost 4 vmrglw <3,0,1,2>, <1,6,5,7>
+  2687125563U,	// <1,5,2,u>: Cost 3 vsldoi8 <0,4,1,5>, <2,u,0,1>
+  2687125654U,	// <1,5,3,0>: Cost 3 vsldoi8 <0,4,1,5>, <3,0,1,2>
+  2312990234U,	// <1,5,3,1>: Cost 3 vmrglw <5,2,1,3>, <4,u,5,1>
+  3760867649U,	// <1,5,3,2>: Cost 4 vsldoi8 <0,4,1,5>, <3,2,2,2>
+  2687125916U,	// <1,5,3,3>: Cost 3 vsldoi8 <0,4,1,5>, <3,3,3,3>
+  2687126018U,	// <1,5,3,4>: Cost 3 vsldoi8 <0,4,1,5>, <3,4,5,6>
+  3386731738U,	// <1,5,3,5>: Cost 4 vmrglw <5,2,1,3>, <4,4,5,5>
+  3356871170U,	// <1,5,3,6>: Cost 4 vmrglw <0,2,1,3>, <3,4,5,6>
+  3808643779U,	// <1,5,3,7>: Cost 4 vsldoi8 <u,4,1,5>, <3,7,0,1>
+  2687126302U,	// <1,5,3,u>: Cost 3 vsldoi8 <0,4,1,5>, <3,u,1,2>
+  2642198630U,	// <1,5,4,0>: Cost 3 vsldoi4 <4,1,5,4>, LHS
+  2687126498U,	// <1,5,4,1>: Cost 3 vsldoi8 <0,4,1,5>, <4,1,5,0>
+  3715941923U,	// <1,5,4,2>: Cost 4 vsldoi4 <4,1,5,4>, <2,1,3,5>
+  3709970701U,	// <1,5,4,3>: Cost 4 vsldoi4 <3,1,5,4>, <3,1,5,4>
+  2687126736U,	// <1,5,4,4>: Cost 3 vsldoi8 <0,4,1,5>, <4,4,4,4>
+  1613385014U,	// <1,5,4,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2283801090U,	// <1,5,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  3733861489U,	// <1,5,4,7>: Cost 4 vsldoi4 <7,1,5,4>, <7,1,5,4>
+  1613385257U,	// <1,5,4,u>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2624290918U,	// <1,5,5,0>: Cost 3 vsldoi4 <1,1,5,5>, LHS
+  2624291676U,	// <1,5,5,1>: Cost 3 vsldoi4 <1,1,5,5>, <1,1,5,5>
+  3698034211U,	// <1,5,5,2>: Cost 4 vsldoi4 <1,1,5,5>, <2,1,3,5>
+  2284471211U,	// <1,5,5,3>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,3>
+  2624294198U,	// <1,5,5,4>: Cost 3 vsldoi4 <1,1,5,5>, RHS
+  2284471132U,	// <1,5,5,5>: Cost 3 vmrglw <0,4,1,5>, <1,1,5,5>
+  2284472834U,	// <1,5,5,6>: Cost 3 vmrglw <0,4,1,5>, <3,4,5,6>
+  2284471539U,	// <1,5,5,7>: Cost 3 vmrglw <0,4,1,5>, <1,6,5,7>
+  2284471216U,	// <1,5,5,u>: Cost 3 vmrglw <0,4,1,5>, <1,2,5,u>
+  2785316900U,	// <1,5,6,0>: Cost 3 vsldoi12 <5,6,0,1>, <5,6,0,1>
+  2781040691U,	// <1,5,6,1>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,1,7>
+  2734903802U,	// <1,5,6,2>: Cost 3 vsldoi8 <u,4,1,5>, <6,2,7,3>
+  3848736834U,	// <1,5,6,3>: Cost 4 vsldoi12 <3,u,4,1>, <5,6,3,4>
+  3298717620U,	// <1,5,6,4>: Cost 4 vmrghw <1,6,5,7>, <5,4,5,6>
+  3298717700U,	// <1,5,6,5>: Cost 4 vmrghw <1,6,5,7>, <5,5,5,5>
+  2734904120U,	// <1,5,6,6>: Cost 3 vsldoi8 <u,4,1,5>, <6,6,6,6>
+  2781040738U,	// <1,5,6,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,7,0>
+  2781040747U,	// <1,5,6,u>: Cost 3 vsldoi12 <4,u,5,1>, <5,6,u,0>
+  2734904314U,	// <1,5,7,0>: Cost 3 vsldoi8 <u,4,1,5>, <7,0,1,2>
+  2315677210U,	// <1,5,7,1>: Cost 3 vmrglw <5,6,1,7>, <4,u,5,1>
+  3808646292U,	// <1,5,7,2>: Cost 4 vsldoi8 <u,4,1,5>, <7,2,0,3>
+  3808646371U,	// <1,5,7,3>: Cost 4 vsldoi8 <u,4,1,5>, <7,3,0,1>
+  2734904678U,	// <1,5,7,4>: Cost 3 vsldoi8 <u,4,1,5>, <7,4,5,6>
+  3389418714U,	// <1,5,7,5>: Cost 4 vmrglw <5,6,1,7>, <4,4,5,5>
+  3365528656U,	// <1,5,7,6>: Cost 4 vmrglw <1,6,1,7>, <1,4,5,6>
+  2734904940U,	// <1,5,7,7>: Cost 3 vsldoi8 <u,4,1,5>, <7,7,7,7>
+  2734904962U,	// <1,5,7,u>: Cost 3 vsldoi8 <u,4,1,5>, <7,u,1,2>
+  2687129299U,	// <1,5,u,0>: Cost 3 vsldoi8 <0,4,1,5>, <u,0,1,2>
+  1613387566U,	// <1,5,u,1>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  2687129480U,	// <1,5,u,2>: Cost 3 vsldoi8 <0,4,1,5>, <u,2,3,3>
+  2687129532U,	// <1,5,u,3>: Cost 3 vsldoi8 <0,4,1,5>, <u,3,0,1>
+  1661163546U,	// <1,5,u,4>: Cost 2 vsldoi8 <u,4,1,5>, <u,4,1,5>
+  1613387930U,	// <1,5,u,5>: Cost 2 vsldoi8 <0,4,1,5>, RHS
+  2687129808U,	// <1,5,u,6>: Cost 3 vsldoi8 <0,4,1,5>, <u,6,3,7>
+  2781040900U,	// <1,5,u,7>: Cost 3 vsldoi12 <4,u,5,1>, <5,u,7,0>
+  1613388133U,	// <1,5,u,u>: Cost 2 vsldoi8 <0,4,1,5>, LHS
+  3759546368U,	// <1,6,0,0>: Cost 4 vsldoi8 <0,2,1,6>, <0,0,0,0>
+  2685804646U,	// <1,6,0,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2685804721U,	// <1,6,0,2>: Cost 3 vsldoi8 <0,2,1,6>, <0,2,1,6>
+  3861270834U,	// <1,6,0,3>: Cost 4 vsldoi12 <6,0,3,1>, <6,0,3,1>
+  3759546706U,	// <1,6,0,4>: Cost 4 vsldoi8 <0,2,1,6>, <0,4,1,5>
+  2687795620U,	// <1,6,0,5>: Cost 3 vsldoi8 <0,5,1,6>, <0,5,1,6>
+  2688459253U,	// <1,6,0,6>: Cost 3 vsldoi8 <0,6,1,6>, <0,6,1,6>
+  2283769142U,	// <1,6,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  2685805213U,	// <1,6,0,u>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  3698073702U,	// <1,6,1,0>: Cost 4 vsldoi4 <1,1,6,1>, LHS
+  3759547188U,	// <1,6,1,1>: Cost 4 vsldoi8 <0,2,1,6>, <1,1,1,1>
+  2221314554U,	// <1,6,1,2>: Cost 3 vmrghw <1,1,1,1>, <6,2,7,3>
+  3759547401U,	// <1,6,1,3>: Cost 4 vsldoi8 <0,2,1,6>, <1,3,6,7>
+  3698076982U,	// <1,6,1,4>: Cost 4 vsldoi4 <1,1,6,1>, RHS
+  3767510141U,	// <1,6,1,5>: Cost 4 vsldoi8 <1,5,1,6>, <1,5,1,6>
+  2334872376U,	// <1,6,1,6>: Cost 3 vmrglw <u,u,1,1>, <6,6,6,6>
+  1213353270U,	// <1,6,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  1213353271U,	// <1,6,1,u>: Cost 2 vmrglw <0,u,1,1>, RHS
+  3704053862U,	// <1,6,2,0>: Cost 4 vsldoi4 <2,1,6,2>, LHS
+  3759547961U,	// <1,6,2,1>: Cost 4 vsldoi8 <0,2,1,6>, <2,1,6,0>
+  2222117370U,	// <1,6,2,2>: Cost 3 vmrghw <1,2,3,0>, <6,2,7,3>
+  3759548070U,	// <1,6,2,3>: Cost 4 vsldoi8 <0,2,1,6>, <2,3,0,1>
+  3704057142U,	// <1,6,2,4>: Cost 4 vsldoi4 <2,1,6,2>, RHS
+  3373451057U,	// <1,6,2,5>: Cost 4 vmrglw <3,0,1,2>, <2,4,6,5>
+  2685806522U,	// <1,6,2,6>: Cost 3 vsldoi8 <0,2,1,6>, <2,6,3,7>
+  1225968950U,	// <1,6,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1225968951U,	// <1,6,2,u>: Cost 2 vmrglw <3,0,1,2>, RHS
+  3759548566U,	// <1,6,3,0>: Cost 4 vsldoi8 <0,2,1,6>, <3,0,1,2>
+  3842912793U,	// <1,6,3,1>: Cost 4 vsldoi12 <2,u,6,1>, <6,3,1,7>
+  3759548774U,	// <1,6,3,2>: Cost 4 vsldoi8 <0,2,1,6>, <3,2,6,3>
+  3759548828U,	// <1,6,3,3>: Cost 4 vsldoi8 <0,2,1,6>, <3,3,3,3>
+  3759548930U,	// <1,6,3,4>: Cost 4 vsldoi8 <0,2,1,6>, <3,4,5,6>
+  3809315421U,	// <1,6,3,5>: Cost 4 vsldoi8 <u,5,1,6>, <3,5,6,7>
+  3386733368U,	// <1,6,3,6>: Cost 4 vmrglw <5,2,1,3>, <6,6,6,6>
+  2283130166U,	// <1,6,3,7>: Cost 3 vmrglw <0,2,1,3>, RHS
+  2283130167U,	// <1,6,3,u>: Cost 3 vmrglw <0,2,1,3>, RHS
+  3704070246U,	// <1,6,4,0>: Cost 4 vsldoi4 <2,1,6,4>, LHS
+  3862229608U,	// <1,6,4,1>: Cost 4 vsldoi12 <6,1,7,1>, <6,4,1,5>
+  3704071741U,	// <1,6,4,2>: Cost 4 vsldoi4 <2,1,6,4>, <2,1,6,4>
+  3721988610U,	// <1,6,4,3>: Cost 4 vsldoi4 <5,1,6,4>, <3,4,5,6>
+  3704073526U,	// <1,6,4,4>: Cost 4 vsldoi4 <2,1,6,4>, RHS
+  2685807926U,	// <1,6,4,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3865621141U,	// <1,6,4,6>: Cost 4 vsldoi12 <6,6,u,1>, <6,4,6,5>
+  2283801910U,	// <1,6,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  2685808169U,	// <1,6,4,u>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  3710050406U,	// <1,6,5,0>: Cost 4 vsldoi4 <3,1,6,5>, LHS
+  3710051571U,	// <1,6,5,1>: Cost 4 vsldoi4 <3,1,6,5>, <1,6,5,7>
+  3405989597U,	// <1,6,5,2>: Cost 4 vmrglw <u,4,1,5>, <2,3,6,2>
+  3358214502U,	// <1,6,5,3>: Cost 4 vmrglw <0,4,1,5>, <3,2,6,3>
+  3710053686U,	// <1,6,5,4>: Cost 4 vsldoi4 <3,1,6,5>, RHS
+  3721998025U,	// <1,6,5,5>: Cost 4 vsldoi4 <5,1,6,5>, <5,1,6,5>
+  2332250936U,	// <1,6,5,6>: Cost 3 vmrglw <u,4,1,5>, <6,6,6,6>
+  1210731830U,	// <1,6,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210731831U,	// <1,6,5,u>: Cost 2 vmrglw <0,4,1,5>, RHS
+  2791289597U,	// <1,6,6,0>: Cost 3 vsldoi12 <6,6,0,1>, <6,6,0,1>
+  3698115430U,	// <1,6,6,1>: Cost 4 vsldoi4 <1,1,6,6>, <1,1,6,6>
+  3698116538U,	// <1,6,6,2>: Cost 4 vsldoi4 <1,1,6,6>, <2,6,3,7>
+  3356894132U,	// <1,6,6,3>: Cost 4 vmrglw <0,2,1,6>, <1,2,6,3>
+  3698117942U,	// <1,6,6,4>: Cost 4 vsldoi4 <1,1,6,6>, RHS
+  3722006218U,	// <1,6,6,5>: Cost 4 vsldoi4 <5,1,6,6>, <5,1,6,6>
+  2781041464U,	// <1,6,6,6>: Cost 3 vsldoi12 <4,u,5,1>, <6,6,6,6>
+  2283154742U,	// <1,6,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283154743U,	// <1,6,6,u>: Cost 3 vmrglw <0,2,1,6>, RHS
+  1718211406U,	// <1,6,7,0>: Cost 2 vsldoi12 <6,7,0,1>, <6,7,0,1>
+  2792026967U,	// <1,6,7,1>: Cost 3 vsldoi12 <6,7,1,1>, <6,7,1,1>
+  2765411170U,	// <1,6,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <6,7,2,3>
+  3854783336U,	// <1,6,7,3>: Cost 4 vsldoi12 <4,u,5,1>, <6,7,3,0>
+  2781041526U,	// <1,6,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,7,4,5>
+  3365528664U,	// <1,6,7,5>: Cost 4 vmrglw <1,6,1,7>, <1,4,6,5>
+  2791953290U,	// <1,6,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <6,7,6,7>
+  2291789110U,	// <1,6,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1718801302U,	// <1,6,7,u>: Cost 2 vsldoi12 <6,7,u,1>, <6,7,u,1>
+  1718875039U,	// <1,6,u,0>: Cost 2 vsldoi12 <6,u,0,1>, <6,u,0,1>
+  2685810478U,	// <1,6,u,1>: Cost 3 vsldoi8 <0,2,1,6>, LHS
+  2792764337U,	// <1,6,u,2>: Cost 3 vsldoi12 <6,u,2,1>, <6,u,2,1>
+  3759552444U,	// <1,6,u,3>: Cost 4 vsldoi8 <0,2,1,6>, <u,3,0,1>
+  2781041607U,	// <1,6,u,4>: Cost 3 vsldoi12 <4,u,5,1>, <6,u,4,5>
+  2685810842U,	// <1,6,u,5>: Cost 3 vsldoi8 <0,2,1,6>, RHS
+  2689792208U,	// <1,6,u,6>: Cost 3 vsldoi8 <0,u,1,6>, <u,6,3,7>
+  1210756406U,	// <1,6,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  1210756407U,	// <1,6,u,u>: Cost 2 vmrglw <0,4,1,u>, RHS
+  2793280496U,	// <1,7,0,0>: Cost 3 vsldoi12 <7,0,0,1>, <7,0,0,1>
+  2694439014U,	// <1,7,0,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  3393343912U,	// <1,7,0,2>: Cost 4 vmrglw <6,3,1,0>, <6,1,7,2>
+  3397325306U,	// <1,7,0,3>: Cost 4 vmrglw <7,0,1,0>, <6,2,7,3>
+  2793575444U,	// <1,7,0,4>: Cost 3 vsldoi12 <7,0,4,1>, <7,0,4,1>
+  3722030797U,	// <1,7,0,5>: Cost 4 vsldoi4 <5,1,7,0>, <5,1,7,0>
+  2688467446U,	// <1,7,0,6>: Cost 3 vsldoi8 <0,6,1,7>, <0,6,1,7>
+  2689131079U,	// <1,7,0,7>: Cost 3 vsldoi8 <0,7,1,7>, <0,7,1,7>
+  2694439570U,	// <1,7,0,u>: Cost 3 vsldoi8 <1,6,1,7>, <0,u,1,1>
+  2654265354U,	// <1,7,1,0>: Cost 3 vsldoi4 <6,1,7,1>, <0,0,1,1>
+  2794017866U,	// <1,7,1,1>: Cost 3 vsldoi12 <7,1,1,1>, <7,1,1,1>
+  3768181639U,	// <1,7,1,2>: Cost 4 vsldoi8 <1,6,1,7>, <1,2,1,3>
+  2334872058U,	// <1,7,1,3>: Cost 3 vmrglw <u,u,1,1>, <6,2,7,3>
+  2654268726U,	// <1,7,1,4>: Cost 3 vsldoi4 <6,1,7,1>, RHS
+  3792069797U,	// <1,7,1,5>: Cost 4 vsldoi8 <5,6,1,7>, <1,5,6,1>
+  2694440143U,	// <1,7,1,6>: Cost 3 vsldoi8 <1,6,1,7>, <1,6,1,7>
+  2334872386U,	// <1,7,1,7>: Cost 3 vmrglw <u,u,1,1>, <6,6,7,7>
+  2695767409U,	// <1,7,1,u>: Cost 3 vsldoi8 <1,u,1,7>, <1,u,1,7>
+  2654273638U,	// <1,7,2,0>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2222117973U,	// <1,7,2,1>: Cost 3 vmrghw <1,2,3,0>, <7,1,2,3>
+  2299711912U,	// <1,7,2,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2654275734U,	// <1,7,2,3>: Cost 3 vsldoi4 <6,1,7,2>, <3,0,1,2>
+  2654276918U,	// <1,7,2,4>: Cost 3 vsldoi4 <6,1,7,2>, RHS
+  3385397675U,	// <1,7,2,5>: Cost 4 vmrglw <5,0,1,2>, <6,1,7,5>
+  2654278056U,	// <1,7,2,6>: Cost 3 vsldoi4 <6,1,7,2>, <6,1,7,2>
+  2323599627U,	// <1,7,2,7>: Cost 3 vmrglw <7,0,1,2>, <5,u,7,7>
+  2654279470U,	// <1,7,2,u>: Cost 3 vsldoi4 <6,1,7,2>, LHS
+  2795271395U,	// <1,7,3,0>: Cost 3 vsldoi12 <7,3,0,1>, <7,3,0,1>
+  3768183059U,	// <1,7,3,1>: Cost 4 vsldoi8 <1,6,1,7>, <3,1,6,1>
+  3728025254U,	// <1,7,3,2>: Cost 4 vsldoi4 <6,1,7,3>, <2,3,0,1>
+  3768183196U,	// <1,7,3,3>: Cost 4 vsldoi8 <1,6,1,7>, <3,3,3,3>
+  3768183298U,	// <1,7,3,4>: Cost 4 vsldoi8 <1,6,1,7>, <3,4,5,6>
+  3792071255U,	// <1,7,3,5>: Cost 4 vsldoi8 <5,6,1,7>, <3,5,6,1>
+  3780127361U,	// <1,7,3,6>: Cost 4 vsldoi8 <3,6,1,7>, <3,6,1,7>
+  3847779617U,	// <1,7,3,7>: Cost 4 vsldoi12 <3,7,0,1>, <7,3,7,0>
+  2795861291U,	// <1,7,3,u>: Cost 3 vsldoi12 <7,3,u,1>, <7,3,u,1>
+  2795935028U,	// <1,7,4,0>: Cost 3 vsldoi12 <7,4,0,1>, <7,4,0,1>
+  3728032975U,	// <1,7,4,1>: Cost 4 vsldoi4 <6,1,7,4>, <1,6,1,7>
+  3839153480U,	// <1,7,4,2>: Cost 4 vsldoi12 <2,3,0,1>, <7,4,2,3>
+  3397358074U,	// <1,7,4,3>: Cost 4 vmrglw <7,0,1,4>, <6,2,7,3>
+  3854783835U,	// <1,7,4,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,4,4,4>
+  2694442294U,	// <1,7,4,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  3786100058U,	// <1,7,4,6>: Cost 4 vsldoi8 <4,6,1,7>, <4,6,1,7>
+  3722065254U,	// <1,7,4,7>: Cost 4 vsldoi4 <5,1,7,4>, <7,4,5,6>
+  2694442537U,	// <1,7,4,u>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654298214U,	// <1,7,5,0>: Cost 3 vsldoi4 <6,1,7,5>, LHS
+  3854783893U,	// <1,7,5,1>: Cost 4 vsldoi12 <4,u,5,1>, <7,5,1,u>
+  3710126010U,	// <1,7,5,2>: Cost 4 vsldoi4 <3,1,7,5>, <2,6,3,7>
+  2332250618U,	// <1,7,5,3>: Cost 3 vmrglw <u,4,1,5>, <6,2,7,3>
+  2654301494U,	// <1,7,5,4>: Cost 3 vsldoi4 <6,1,7,5>, RHS
+  2284474795U,	// <1,7,5,5>: Cost 3 vmrglw <0,4,1,5>, <6,1,7,5>
+  2718330931U,	// <1,7,5,6>: Cost 3 vsldoi8 <5,6,1,7>, <5,6,1,7>
+  2332250946U,	// <1,7,5,7>: Cost 3 vmrglw <u,4,1,5>, <6,6,7,7>
+  2719658197U,	// <1,7,5,u>: Cost 3 vsldoi8 <5,u,1,7>, <5,u,1,7>
+  2332921954U,	// <1,7,6,0>: Cost 3 vmrglw <u,5,1,6>, <5,6,7,0>
+  3768185254U,	// <1,7,6,1>: Cost 4 vsldoi8 <1,6,1,7>, <6,1,7,0>
+  3710134202U,	// <1,7,6,2>: Cost 4 vsldoi4 <3,1,7,6>, <2,6,3,7>
+  3710134561U,	// <1,7,6,3>: Cost 4 vsldoi4 <3,1,7,6>, <3,1,7,6>
+  3710135606U,	// <1,7,6,4>: Cost 4 vsldoi4 <3,1,7,6>, RHS
+  3864884745U,	// <1,7,6,5>: Cost 4 vsldoi12 <6,5,7,1>, <7,6,5,7>
+  3854784017U,	// <1,7,6,6>: Cost 4 vsldoi12 <4,u,5,1>, <7,6,6,6>
+  2791953940U,	// <1,7,6,7>: Cost 3 vsldoi12 <6,7,0,1>, <7,6,7,0>
+  2792617501U,	// <1,7,6,u>: Cost 3 vsldoi12 <6,u,0,1>, <7,6,u,0>
+  2797925927U,	// <1,7,7,0>: Cost 3 vsldoi12 <7,7,0,1>, <7,7,0,1>
+  3365528426U,	// <1,7,7,1>: Cost 4 vmrglw <1,6,1,7>, <1,1,7,1>
+  3728058022U,	// <1,7,7,2>: Cost 4 vsldoi4 <6,1,7,7>, <2,3,0,1>
+  3365528509U,	// <1,7,7,3>: Cost 4 vmrglw <1,6,1,7>, <1,2,7,3>
+  3854784079U,	// <1,7,7,4>: Cost 4 vsldoi12 <4,u,5,1>, <7,7,4,5>
+  3722088148U,	// <1,7,7,5>: Cost 4 vsldoi4 <5,1,7,7>, <5,1,7,7>
+  3728060845U,	// <1,7,7,6>: Cost 4 vsldoi4 <6,1,7,7>, <6,1,7,7>
+  2781042284U,	// <1,7,7,7>: Cost 3 vsldoi12 <4,u,5,1>, <7,7,7,7>
+  2798515823U,	// <1,7,7,u>: Cost 3 vsldoi12 <7,7,u,1>, <7,7,u,1>
+  2654322705U,	// <1,7,u,0>: Cost 3 vsldoi4 <6,1,7,u>, <0,0,1,u>
+  2694444846U,	// <1,7,u,1>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  2299711912U,	// <1,7,u,2>: Cost 3 vmrglw <3,0,1,2>, <6,1,7,2>
+  2323649018U,	// <1,7,u,3>: Cost 3 vmrglw <7,0,1,u>, <6,2,7,3>
+  2654326070U,	// <1,7,u,4>: Cost 3 vsldoi4 <6,1,7,u>, RHS
+  2694445210U,	// <1,7,u,5>: Cost 3 vsldoi8 <1,6,1,7>, RHS
+  2654327214U,	// <1,7,u,6>: Cost 3 vsldoi4 <6,1,7,u>, <6,1,7,u>
+  2323649346U,	// <1,7,u,7>: Cost 3 vmrglw <7,0,1,u>, <6,6,7,7>
+  2694445413U,	// <1,7,u,u>: Cost 3 vsldoi8 <1,6,1,7>, LHS
+  1610752017U,	// <1,u,0,0>: Cost 2 vsldoi8 <0,0,1,u>, <0,0,1,u>
+  1613406310U,	// <1,u,0,1>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  2685821107U,	// <1,u,0,2>: Cost 3 vsldoi8 <0,2,1,u>, <0,2,1,u>
+  2283765916U,	// <1,u,0,3>: Cost 3 vmrglw <0,3,1,0>, LHS
+  1613406549U,	// <1,u,0,4>: Cost 2 vsldoi8 <0,4,1,u>, <0,4,1,u>
+  1725880054U,	// <1,u,0,5>: Cost 2 vsldoi12 <u,0,5,1>, <u,0,5,1>
+  2688475639U,	// <1,u,0,6>: Cost 3 vsldoi8 <0,6,1,u>, <0,6,1,u>
+  2283769160U,	// <1,u,0,7>: Cost 3 vmrglw <0,3,1,0>, RHS
+  1613406877U,	// <1,u,0,u>: Cost 2 vsldoi8 <0,4,1,u>, LHS
+  1550221414U,	// <1,u,1,0>: Cost 2 vsldoi4 <1,1,1,1>, LHS
+  269271142U,	// <1,u,1,1>: Cost 1 vspltisw1 LHS
+  1683117870U,	// <1,u,1,2>: Cost 2 vsldoi12 <0,u,1,1>, LHS
+  1213350044U,	// <1,u,1,3>: Cost 2 vmrglw <0,u,1,1>, LHS
+  1550224694U,	// <1,u,1,4>: Cost 2 vsldoi4 <1,1,1,1>, RHS
+  1147574426U,	// <1,u,1,5>: Cost 2 vmrghw <1,1,1,1>, RHS
+  2687149326U,	// <1,u,1,6>: Cost 3 vsldoi8 <0,4,1,u>, <1,6,u,7>
+  1213353288U,	// <1,u,1,7>: Cost 2 vmrglw <0,u,1,1>, RHS
+  269271142U,	// <1,u,1,u>: Cost 1 vspltisw1 LHS
+  2222118611U,	// <1,u,2,0>: Cost 3 vmrghw <1,2,3,0>, <u,0,1,2>
+  1148376878U,	// <1,u,2,1>: Cost 2 vmrghw <1,2,3,0>, LHS
+  1148371862U,	// <1,u,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  1225965724U,	// <1,u,2,3>: Cost 2 vmrglw <3,0,1,2>, LHS
+  2222118975U,	// <1,u,2,4>: Cost 3 vmrghw <1,2,3,0>, <u,4,5,6>
+  1148377242U,	// <1,u,2,5>: Cost 2 vmrghw <1,2,3,0>, RHS
+  2687150010U,	// <1,u,2,6>: Cost 3 vsldoi8 <0,4,1,u>, <2,6,3,7>
+  1225968968U,	// <1,u,2,7>: Cost 2 vmrglw <3,0,1,2>, RHS
+  1148377445U,	// <1,u,2,u>: Cost 2 vmrghw <1,2,3,0>, LHS
+  471040156U,	// <1,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1544782644U,	// <1,u,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1544783464U,	// <1,u,3,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544784022U,	// <1,u,3,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471043382U,	// <1,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1592561668U,	// <1,u,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1592562170U,	// <1,u,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1592562682U,	// <1,u,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  471045934U,	// <1,u,3,u>: Cost 1 vsldoi4 LHS, LHS
+  2708384629U,	// <1,u,4,0>: Cost 3 vsldoi8 <4,0,1,u>, <4,0,1,u>
+  2687151101U,	// <1,u,4,1>: Cost 3 vsldoi8 <0,4,1,u>, <4,1,u,0>
+  2223408022U,	// <1,u,4,2>: Cost 3 vmrghw <1,4,2,5>, <1,2,3,0>
+  2283798684U,	// <1,u,4,3>: Cost 3 vmrglw <0,3,1,4>, LHS
+  2642422785U,	// <1,u,4,4>: Cost 3 vsldoi4 <4,1,u,4>, <4,1,u,4>
+  1613409590U,	// <1,u,4,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2283801090U,	// <1,u,4,6>: Cost 3 vmrglw <0,3,1,4>, <3,4,5,6>
+  2283801928U,	// <1,u,4,7>: Cost 3 vmrglw <0,3,1,4>, RHS
+  1613409833U,	// <1,u,4,u>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  2284471235U,	// <1,u,5,0>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,0>
+  2284472046U,	// <1,u,5,1>: Cost 3 vmrglw <0,4,1,5>, <2,3,u,1>
+  2284472533U,	// <1,u,5,2>: Cost 3 vmrglw <0,4,1,5>, <3,0,u,2>
+  1210728604U,	// <1,u,5,3>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2284471239U,	// <1,u,5,4>: Cost 3 vmrglw <0,4,1,5>, <1,2,u,4>
+  1210728786U,	// <1,u,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  1683118234U,	// <1,u,5,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210731848U,	// <1,u,5,7>: Cost 2 vmrglw <0,4,1,5>, RHS
+  1210728609U,	// <1,u,5,u>: Cost 2 vmrglw <0,4,1,5>, LHS
+  2720330023U,	// <1,u,6,0>: Cost 3 vsldoi8 <6,0,1,u>, <6,0,1,u>
+  2757376190U,	// <1,u,6,1>: Cost 3 vsldoi12 <0,u,u,1>, <u,6,1,7>
+  2726302202U,	// <1,u,6,2>: Cost 3 vsldoi8 <7,0,1,u>, <6,2,7,3>
+  2283151516U,	// <1,u,6,3>: Cost 3 vmrglw <0,2,1,6>, LHS
+  2224972114U,	// <1,u,6,4>: Cost 3 vmrghw <1,6,5,7>, <0,4,1,5>
+  2224683162U,	// <1,u,6,5>: Cost 3 vmrghw <1,6,1,7>, RHS
+  2726302520U,	// <1,u,6,6>: Cost 3 vsldoi8 <7,0,1,u>, <6,6,6,6>
+  2283154760U,	// <1,u,6,7>: Cost 3 vmrglw <0,2,1,6>, RHS
+  2283151521U,	// <1,u,6,u>: Cost 3 vmrglw <0,2,1,6>, LHS
+  1652560896U,	// <1,u,7,0>: Cost 2 vsldoi8 <7,0,1,u>, <7,0,1,u>
+  2333590225U,	// <1,u,7,1>: Cost 3 vmrglw <u,6,1,7>, <0,u,u,1>
+  2765412628U,	// <1,u,7,2>: Cost 3 vsldoi12 <2,3,0,1>, <u,7,2,3>
+  2291785884U,	// <1,u,7,3>: Cost 3 vmrglw <1,6,1,7>, LHS
+  2781042984U,	// <1,u,7,4>: Cost 3 vsldoi12 <4,u,5,1>, <u,7,4,5>
+  3365527953U,	// <1,u,7,5>: Cost 4 vmrglw <1,6,1,7>, <0,4,u,5>
+  2791954748U,	// <1,u,7,6>: Cost 3 vsldoi12 <6,7,0,1>, <u,7,6,7>
+  2291789128U,	// <1,u,7,7>: Cost 3 vmrglw <1,6,1,7>, RHS
+  1657869960U,	// <1,u,7,u>: Cost 2 vsldoi8 <7,u,1,u>, <7,u,1,u>
+  471081121U,	// <1,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  269271142U,	// <1,u,u,1>: Cost 1 vspltisw1 LHS
+  1544824424U,	// <1,u,u,2>: Cost 2 vsldoi4 LHS, <2,2,2,2>
+  1544824982U,	// <1,u,u,3>: Cost 2 vsldoi4 LHS, <3,0,1,2>
+  471084342U,	// <1,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1613412506U,	// <1,u,u,5>: Cost 2 vsldoi8 <0,4,1,u>, RHS
+  1683118477U,	// <1,u,u,6>: Cost 2 vsldoi12 <0,u,1,1>, RHS
+  1210756424U,	// <1,u,u,7>: Cost 2 vmrglw <0,4,1,u>, RHS
+  471086894U,	// <1,u,u,u>: Cost 1 vsldoi4 LHS, LHS
+  2226757632U,	// <2,0,0,0>: Cost 3 vmrghw <2,0,3,0>, <0,0,0,0>
+  2226757734U,	// <2,0,0,1>: Cost 3 vmrghw <2,0,3,0>, LHS
+  3826622483U,	// <2,0,0,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,0,2,1>
+  3843211292U,	// <2,0,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,0,3,1>
+  3300499794U,	// <2,0,0,4>: Cost 4 vmrghw <2,0,3,0>, <0,4,1,5>
+  3356256724U,	// <2,0,0,5>: Cost 4 vmrglw <0,1,2,0>, <3,4,0,5>
+  3825664056U,	// <2,0,0,6>: Cost 4 vsldoi12 <0,0,6,2>, <0,0,6,2>
+  3762889289U,	// <2,0,0,7>: Cost 4 vsldoi8 <0,7,2,0>, <0,7,2,0>
+  2226758301U,	// <2,0,0,u>: Cost 3 vmrghw <2,0,3,0>, LHS
+  2227429386U,	// <2,0,1,0>: Cost 3 vmrghw <2,1,3,1>, <0,0,1,1>
+  2227429478U,	// <2,0,1,1>: Cost 3 vmrghw <2,1,3,1>, LHS
+  1691156582U,	// <2,0,1,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2666358997U,	// <2,0,1,3>: Cost 3 vsldoi4 <u,2,0,1>, <3,0,u,2>
+  2227462482U,	// <2,0,1,4>: Cost 3 vmrghw <2,1,3,5>, <0,4,1,5>
+  3722186464U,	// <2,0,1,5>: Cost 4 vsldoi4 <5,2,0,1>, <5,2,0,1>
+  3867099278U,	// <2,0,1,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,1,6,7>
+  3366881912U,	// <2,0,1,7>: Cost 4 vmrglw <1,u,2,1>, <3,6,0,7>
+  1691156636U,	// <2,0,1,u>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2228027392U,	// <2,0,2,0>: Cost 3 vmrghw <2,2,2,2>, <0,0,0,0>
+  1154285670U,	// <2,0,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  2228027565U,	// <2,0,2,2>: Cost 3 vmrghw <2,2,2,2>, <0,2,1,2>
+  3301769468U,	// <2,0,2,3>: Cost 4 vmrghw <2,2,2,2>, <0,3,1,0>
+  2228027730U,	// <2,0,2,4>: Cost 3 vmrghw <2,2,2,2>, <0,4,1,5>
+  3301769635U,	// <2,0,2,5>: Cost 4 vmrghw <2,2,2,2>, <0,5,1,5>
+  3780806586U,	// <2,0,2,6>: Cost 4 vsldoi8 <3,7,2,0>, <2,6,3,7>
+  3368880760U,	// <2,0,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,0,7>
+  1154286237U,	// <2,0,2,u>: Cost 2 vmrghw <2,2,2,2>, LHS
+  1213440000U,	// <2,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1213441702U,	// <2,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2228535470U,	// <2,0,3,2>: Cost 3 vmrghw <2,3,0,1>, <0,2,1,3>
+  2636515632U,	// <2,0,3,3>: Cost 3 vsldoi4 <3,2,0,3>, <3,2,0,3>
+  2287182962U,	// <2,0,3,4>: Cost 3 vmrglw LHS, <1,5,0,4>
+  2660405346U,	// <2,0,3,5>: Cost 3 vsldoi4 <7,2,0,3>, <5,6,7,0>
+  2228535798U,	// <2,0,3,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660406420U,	// <2,0,3,7>: Cost 3 vsldoi4 <7,2,0,3>, <7,2,0,3>
+  1213441709U,	// <2,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3368894464U,	// <2,0,4,0>: Cost 4 vmrglw <2,2,2,4>, <0,0,0,0>
+  2764898642U,	// <2,0,4,1>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,1,5>
+  3826622811U,	// <2,0,4,2>: Cost 4 vsldoi12 <0,2,1,2>, <0,4,2,5>
+  3843211620U,	// <2,0,4,3>: Cost 4 vsldoi12 <3,0,1,2>, <0,4,3,5>
+  3838640493U,	// <2,0,4,4>: Cost 4 vsldoi12 <2,2,2,2>, <0,4,4,5>
+  2732944694U,	// <2,0,4,5>: Cost 3 vsldoi8 <u,1,2,0>, RHS
+  3797396857U,	// <2,0,4,6>: Cost 4 vsldoi8 <6,5,2,0>, <4,6,5,2>
+  3867099528U,	// <2,0,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <0,4,7,5>
+  2764898705U,	// <2,0,4,u>: Cost 3 vsldoi12 <2,2,2,2>, <0,4,u,5>
+  3364257792U,	// <2,0,5,0>: Cost 4 vmrglw <1,4,2,5>, <0,0,0,0>
+  2230124646U,	// <2,0,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  3304235184U,	// <2,0,5,2>: Cost 4 vmrghw <2,5,u,6>, <0,2,1,5>
+  3364260144U,	// <2,0,5,3>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,3>
+  3303817554U,	// <2,0,5,4>: Cost 4 vmrghw <2,5,3,0>, <0,4,1,5>
+  3364260146U,	// <2,0,5,5>: Cost 4 vmrglw <1,4,2,5>, <3,2,0,5>
+  3867099602U,	// <2,0,5,6>: Cost 4 vsldoi12 <7,0,1,2>, <0,5,6,7>
+  3364260472U,	// <2,0,5,7>: Cost 4 vmrglw <1,4,2,5>, <3,6,0,7>
+  2230125213U,	// <2,0,5,u>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2230796288U,	// <2,0,6,0>: Cost 3 vmrghw <2,6,3,7>, <0,0,0,0>
+  1157054566U,	// <2,0,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  2230796465U,	// <2,0,6,2>: Cost 3 vmrghw <2,6,3,7>, <0,2,1,6>
+  3304538364U,	// <2,0,6,3>: Cost 4 vmrghw <2,6,3,7>, <0,3,1,0>
+  2230796626U,	// <2,0,6,4>: Cost 3 vmrghw <2,6,3,7>, <0,4,1,5>
+  3797398205U,	// <2,0,6,5>: Cost 4 vsldoi8 <6,5,2,0>, <6,5,2,0>
+  3304538614U,	// <2,0,6,6>: Cost 4 vmrghw <2,6,3,7>, <0,6,1,7>
+  3798725471U,	// <2,0,6,7>: Cost 4 vsldoi8 <6,7,2,0>, <6,7,2,0>
+  1157055133U,	// <2,0,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  3371573248U,	// <2,0,7,0>: Cost 4 vmrglw <2,6,2,7>, <0,0,0,0>
+  2231189606U,	// <2,0,7,1>: Cost 3 vmrghw <2,7,0,1>, LHS
+  3801380003U,	// <2,0,7,2>: Cost 4 vsldoi8 <7,2,2,0>, <7,2,2,0>
+  3802043636U,	// <2,0,7,3>: Cost 4 vsldoi8 <7,3,2,0>, <7,3,2,0>
+  3806688614U,	// <2,0,7,4>: Cost 4 vsldoi8 <u,1,2,0>, <7,4,5,6>
+  3356317308U,	// <2,0,7,5>: Cost 4 vmrglw <0,1,2,7>, <7,u,0,5>
+  3804034535U,	// <2,0,7,6>: Cost 4 vsldoi8 <7,6,2,0>, <7,6,2,0>
+  3806688876U,	// <2,0,7,7>: Cost 4 vsldoi8 <u,1,2,0>, <7,7,7,7>
+  2231190173U,	// <2,0,7,u>: Cost 3 vmrghw <2,7,0,1>, LHS
+  1208836096U,	// <2,0,u,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1208837798U,	// <2,0,u,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  1691157149U,	// <2,0,u,2>: Cost 2 vsldoi12 <2,2,2,2>, LHS
+  2636556597U,	// <2,0,u,3>: Cost 3 vsldoi4 <3,2,0,u>, <3,2,0,u>
+  2282579625U,	// <2,0,u,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2660446306U,	// <2,0,u,5>: Cost 3 vsldoi4 <7,2,0,u>, <5,6,7,0>
+  2228535798U,	// <2,0,u,6>: Cost 3 vmrghw <2,3,0,1>, <0,6,1,7>
+  2660447385U,	// <2,0,u,7>: Cost 3 vsldoi4 <7,2,0,u>, <7,2,0,u>
+  1208837805U,	// <2,0,u,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  3692388523U,	// <2,1,0,0>: Cost 4 vsldoi4 <0,2,1,0>, <0,2,1,0>
+  2757526244U,	// <2,1,0,1>: Cost 3 vsldoi12 <1,0,1,2>, <1,0,1,2>
+  2330290974U,	// <2,1,0,2>: Cost 3 vmrglw <u,1,2,0>, <3,u,1,2>
+  3843212020U,	// <2,1,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <1,0,3,0>
+  3692391734U,	// <2,1,0,4>: Cost 4 vsldoi4 <0,2,1,0>, RHS
+  3300533362U,	// <2,1,0,5>: Cost 4 vmrghw <2,0,3,4>, <1,5,0,4>
+  3794084337U,	// <2,1,0,6>: Cost 4 vsldoi8 <6,0,2,1>, <0,6,1,2>
+  3374170614U,	// <2,1,0,7>: Cost 5 vmrglw <3,1,2,0>, <0,6,1,7>
+  2758042403U,	// <2,1,0,u>: Cost 3 vsldoi12 <1,0,u,2>, <1,0,u,2>
+  2690482924U,	// <2,1,1,0>: Cost 3 vsldoi8 <1,0,2,1>, <1,0,2,1>
+  2764899124U,	// <2,1,1,1>: Cost 3 vsldoi12 <2,2,2,2>, <1,1,1,1>
+  2695791510U,	// <2,1,1,2>: Cost 3 vsldoi8 <1,u,2,1>, <1,2,3,0>
+  3362235271U,	// <2,1,1,3>: Cost 4 vmrglw <1,1,2,1>, <1,2,1,3>
+  3692399926U,	// <2,1,1,4>: Cost 4 vsldoi4 <0,2,1,1>, RHS
+  3832226649U,	// <2,1,1,5>: Cost 4 vsldoi12 <1,1,5,2>, <1,1,5,2>
+  3301205235U,	// <2,1,1,6>: Cost 4 vmrghw <2,1,3,5>, <1,6,5,7>
+  3768870179U,	// <2,1,1,7>: Cost 4 vsldoi8 <1,7,2,1>, <1,7,2,1>
+  2695791988U,	// <2,1,1,u>: Cost 3 vsldoi8 <1,u,2,1>, <1,u,2,1>
+  2618663085U,	// <2,1,2,0>: Cost 3 vsldoi4 <0,2,1,2>, <0,2,1,2>
+  2228028212U,	// <2,1,2,1>: Cost 3 vmrghw <2,2,2,2>, <1,1,1,1>
+  2618664552U,	// <2,1,2,2>: Cost 3 vsldoi4 <0,2,1,2>, <2,2,2,2>
+  2759000984U,	// <2,1,2,3>: Cost 3 vsldoi12 <1,2,3,2>, <1,2,3,2>
+  2618666294U,	// <2,1,2,4>: Cost 3 vsldoi4 <0,2,1,2>, RHS
+  2295136594U,	// <2,1,2,5>: Cost 3 vmrglw <2,2,2,2>, <0,4,1,5>
+  3769534376U,	// <2,1,2,6>: Cost 4 vsldoi8 <1,u,2,1>, <2,6,1,7>
+  2793358266U,	// <2,1,2,7>: Cost 3 vsldoi12 <7,0,1,2>, <1,2,7,0>
+  2618668846U,	// <2,1,2,u>: Cost 3 vsldoi4 <0,2,1,2>, LHS
+  2282536969U,	// <2,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208795146U,	// <2,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1213442198U,	// <2,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2287181998U,	// <2,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2618674486U,	// <2,1,3,4>: Cost 3 vsldoi4 <0,2,1,3>, RHS
+  1208795474U,	// <2,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2287182001U,	// <2,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287183055U,	// <2,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208795153U,	// <2,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  3692421295U,	// <2,1,4,0>: Cost 4 vsldoi4 <0,2,1,4>, <0,2,1,4>
+  3838641195U,	// <2,1,4,1>: Cost 4 vsldoi12 <2,2,2,2>, <1,4,1,5>
+  2330323742U,	// <2,1,4,2>: Cost 3 vmrglw <u,1,2,4>, <3,u,1,2>
+  3692423318U,	// <2,1,4,3>: Cost 5 vsldoi4 <0,2,1,4>, <3,0,1,2>
+  3692424502U,	// <2,1,4,4>: Cost 4 vsldoi4 <0,2,1,4>, RHS
+  2695793974U,	// <2,1,4,5>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3799395705U,	// <2,1,4,6>: Cost 4 vsldoi8 <6,u,2,1>, <4,6,5,2>
+  3368895695U,	// <2,1,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,1,7>
+  2695794217U,	// <2,1,4,u>: Cost 3 vsldoi8 <1,u,2,1>, RHS
+  3692429488U,	// <2,1,5,0>: Cost 4 vsldoi4 <0,2,1,5>, <0,2,1,5>
+  3364257802U,	// <2,1,5,1>: Cost 4 vmrglw <1,4,2,5>, <0,0,1,1>
+  3692431253U,	// <2,1,5,2>: Cost 4 vsldoi4 <0,2,1,5>, <2,5,u,6>
+  3692431874U,	// <2,1,5,3>: Cost 4 vsldoi4 <0,2,1,5>, <3,4,5,6>
+  3692432694U,	// <2,1,5,4>: Cost 4 vsldoi4 <0,2,1,5>, RHS
+  3364258130U,	// <2,1,5,5>: Cost 4 vmrglw <1,4,2,5>, <0,4,1,5>
+  3303875827U,	// <2,1,5,6>: Cost 4 vmrghw <2,5,3,7>, <1,6,5,7>
+  3867100333U,	// <2,1,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <1,5,7,0>
+  3692435246U,	// <2,1,5,u>: Cost 4 vsldoi4 <0,2,1,5>, LHS
+  2618695857U,	// <2,1,6,0>: Cost 3 vsldoi4 <0,2,1,6>, <0,2,1,6>
+  2230797108U,	// <2,1,6,1>: Cost 3 vmrghw <2,6,3,7>, <1,1,1,1>
+  2618697658U,	// <2,1,6,2>: Cost 3 vsldoi4 <0,2,1,6>, <2,6,3,7>
+  3692439702U,	// <2,1,6,3>: Cost 4 vsldoi4 <0,2,1,6>, <3,0,1,2>
+  2618699062U,	// <2,1,6,4>: Cost 3 vsldoi4 <0,2,1,6>, RHS
+  3364929874U,	// <2,1,6,5>: Cost 4 vmrglw <1,5,2,6>, <0,4,1,5>
+  3692442424U,	// <2,1,6,6>: Cost 4 vsldoi4 <0,2,1,6>, <6,6,6,6>
+  3798733664U,	// <2,1,6,7>: Cost 4 vsldoi8 <6,7,2,1>, <6,7,2,1>
+  2618701614U,	// <2,1,6,u>: Cost 3 vsldoi4 <0,2,1,6>, LHS
+  3799397370U,	// <2,1,7,0>: Cost 4 vsldoi8 <6,u,2,1>, <7,0,1,2>
+  3371573258U,	// <2,1,7,1>: Cost 4 vmrglw <2,6,2,7>, <0,0,1,1>
+  2330351234U,	// <2,1,7,2>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  3799397658U,	// <2,1,7,3>: Cost 4 vsldoi8 <6,u,2,1>, <7,3,6,2>
+  3799397734U,	// <2,1,7,4>: Cost 4 vsldoi8 <6,u,2,1>, <7,4,5,6>
+  3371573586U,	// <2,1,7,5>: Cost 4 vmrglw <2,6,2,7>, <0,4,1,5>
+  3799397870U,	// <2,1,7,6>: Cost 4 vsldoi8 <6,u,2,1>, <7,6,2,7>
+  3799397956U,	// <2,1,7,7>: Cost 4 vsldoi8 <6,u,2,1>, <7,7,3,3>
+  2330351234U,	// <2,1,7,u>: Cost 3 vmrglw <u,1,2,7>, <7,u,1,2>
+  2282577929U,	// <2,1,u,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1208836106U,	// <2,1,u,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1208838294U,	// <2,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282578094U,	// <2,1,u,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282577933U,	// <2,1,u,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1208836434U,	// <2,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282578097U,	// <2,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287224015U,	// <2,1,u,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1208836113U,	// <2,1,u,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  2226759117U,	// <2,2,0,0>: Cost 3 vmrghw <2,0,3,0>, <2,0,3,0>
+  1624047718U,	// <2,2,0,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  2697789613U,	// <2,2,0,2>: Cost 3 vsldoi8 <2,2,2,2>, <0,2,1,2>
+  2226767526U,	// <2,2,0,3>: Cost 3 vmrghw <2,0,3,1>, <2,3,0,1>
+  2697789778U,	// <2,2,0,4>: Cost 3 vsldoi8 <2,2,2,2>, <0,4,1,5>
+  3300657000U,	// <2,2,0,5>: Cost 4 vmrghw <2,0,5,1>, <2,5,3,6>
+  2226988986U,	// <2,2,0,6>: Cost 3 vmrghw <2,0,6,1>, <2,6,3,7>
+  3734271139U,	// <2,2,0,7>: Cost 4 vsldoi4 <7,2,2,0>, <7,2,2,0>
+  1624048285U,	// <2,2,0,u>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  3831268868U,	// <2,2,1,0>: Cost 4 vsldoi12 <1,0,1,2>, <2,1,0,1>
+  2293138804U,	// <2,2,1,1>: Cost 3 vmrglw <1,u,2,1>, <1,u,2,1>
+  2697790358U,	// <2,2,1,2>: Cost 3 vsldoi8 <2,2,2,2>, <1,2,3,0>
+  2293137510U,	// <2,2,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  3771532331U,	// <2,2,1,4>: Cost 4 vsldoi8 <2,2,2,2>, <1,4,1,5>
+  3767551106U,	// <2,2,1,5>: Cost 4 vsldoi8 <1,5,2,2>, <1,5,2,2>
+  3301173178U,	// <2,2,1,6>: Cost 4 vmrghw <2,1,3,1>, <2,6,3,7>
+  3372853169U,	// <2,2,1,7>: Cost 4 vmrglw <2,u,2,1>, <2,6,2,7>
+  2293137515U,	// <2,2,1,u>: Cost 3 vmrglw <1,u,2,1>, LHS
+  1556938854U,	// <2,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2295137733U,	// <2,2,2,1>: Cost 3 vmrglw <2,2,2,2>, <2,0,2,1>
+  336380006U,	// <2,2,2,2>: Cost 1 vspltisw2 LHS
+  1221394534U,	// <2,2,2,3>: Cost 2 vmrglw <2,2,2,2>, LHS
+  1556942134U,	// <2,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <2,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2228029370U,	// <2,2,2,6>: Cost 3 vmrghw <2,2,2,2>, <2,6,3,7>
+  2660545701U,	// <2,2,2,7>: Cost 3 vsldoi4 <7,2,2,2>, <7,2,2,2>
+  336380006U,	// <2,2,2,u>: Cost 1 vspltisw2 LHS
+  2697791638U,	// <2,2,3,0>: Cost 3 vsldoi8 <2,2,2,2>, <3,0,1,2>
+  2765489840U,	// <2,2,3,1>: Cost 3 vsldoi12 <2,3,1,2>, <2,3,1,2>
+  1213441640U,	// <2,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135053414U,	// <2,2,3,3>: Cost 1 vmrglw LHS, LHS
+  2697792002U,	// <2,2,3,4>: Cost 3 vsldoi8 <2,2,2,2>, <3,4,5,6>
+  2330313780U,	// <2,2,3,5>: Cost 3 vmrglw LHS, <1,4,2,5>
+  2287183549U,	// <2,2,3,6>: Cost 3 vmrglw LHS, <2,3,2,6>
+  2660553894U,	// <2,2,3,7>: Cost 3 vsldoi4 <7,2,2,3>, <7,2,2,3>
+  135053419U,	// <2,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2630697062U,	// <2,2,4,0>: Cost 3 vsldoi4 <2,2,2,4>, LHS
+  3771534282U,	// <2,2,4,1>: Cost 4 vsldoi8 <2,2,2,2>, <4,1,2,3>
+  2764900109U,	// <2,2,4,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,4,2,5>
+  2295152742U,	// <2,2,4,3>: Cost 3 vmrglw <2,2,2,4>, LHS
+  2295154282U,	// <2,2,4,4>: Cost 3 vmrglw <2,2,2,4>, <2,2,2,4>
+  1624050998U,	// <2,2,4,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  2229675962U,	// <2,2,4,6>: Cost 3 vmrghw <2,4,6,5>, <2,6,3,7>
+  3368896433U,	// <2,2,4,7>: Cost 4 vmrglw <2,2,2,4>, <2,6,2,7>
+  1624051241U,	// <2,2,4,u>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  3771534920U,	// <2,2,5,0>: Cost 4 vsldoi8 <2,2,2,2>, <5,0,1,2>
+  3364258540U,	// <2,2,5,1>: Cost 4 vmrglw <1,4,2,5>, <1,0,2,1>
+  2296489576U,	// <2,2,5,2>: Cost 3 vmrglw <2,4,2,5>, <2,2,2,2>
+  2290516070U,	// <2,2,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  3771535284U,	// <2,2,5,4>: Cost 4 vsldoi8 <2,2,2,2>, <5,4,5,6>
+  2290517044U,	// <2,2,5,5>: Cost 3 vmrglw <1,4,2,5>, <1,4,2,5>
+  2697793634U,	// <2,2,5,6>: Cost 3 vsldoi8 <2,2,2,2>, <5,6,7,0>
+  3370231729U,	// <2,2,5,7>: Cost 4 vmrglw <2,4,2,5>, <2,6,2,7>
+  2290516075U,	// <2,2,5,u>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2230797801U,	// <2,2,6,0>: Cost 3 vmrghw <2,6,3,7>, <2,0,6,1>
+  3304539679U,	// <2,2,6,1>: Cost 4 vmrghw <2,6,3,7>, <2,1,3,1>
+  2764900273U,	// <2,2,6,2>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,2,7>
+  2764900282U,	// <2,2,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <2,6,3,7>
+  2230798129U,	// <2,2,6,4>: Cost 3 vmrghw <2,6,3,7>, <2,4,6,5>
+  3304540008U,	// <2,2,6,5>: Cost 4 vmrghw <2,6,3,7>, <2,5,3,6>
+  1157056442U,	// <2,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725000033U,	// <2,2,6,7>: Cost 3 vsldoi8 <6,7,2,2>, <6,7,2,2>
+  1157056442U,	// <2,2,6,u>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2793359338U,	// <2,2,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <2,7,0,1>
+  3371574725U,	// <2,2,7,1>: Cost 4 vmrglw <2,6,2,7>, <2,0,2,1>
+  2297833064U,	// <2,2,7,2>: Cost 3 vmrglw <2,6,2,7>, <2,2,2,2>
+  2297831526U,	// <2,2,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  2697794918U,	// <2,2,7,4>: Cost 3 vsldoi8 <2,2,2,2>, <7,4,5,6>
+  3371575053U,	// <2,2,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,2,5>
+  3304933297U,	// <2,2,7,6>: Cost 4 vmrghw <2,7,0,1>, <2,6,2,7>
+  2297833393U,	// <2,2,7,7>: Cost 3 vmrglw <2,6,2,7>, <2,6,2,7>
+  2297831531U,	// <2,2,7,u>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1556938854U,	// <2,2,u,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1624053550U,	// <2,2,u,1>: Cost 2 vsldoi8 <2,2,2,2>, LHS
+  336380006U,	// <2,2,u,2>: Cost 1 vspltisw2 LHS
+  135094374U,	// <2,2,u,3>: Cost 1 vmrglw LHS, LHS
+  1556942134U,	// <2,2,u,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1624053914U,	// <2,2,u,5>: Cost 2 vsldoi8 <2,2,2,2>, RHS
+  1157056442U,	// <2,2,u,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2660594859U,	// <2,2,u,7>: Cost 3 vsldoi4 <7,2,2,u>, <7,2,2,u>
+  135094379U,	// <2,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611448320U,	// <2,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537706598U,	// <2,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2689835181U,	// <2,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2689835260U,	// <2,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611448658U,	// <2,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2732966354U,	// <2,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2732966390U,	// <2,3,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660603052U,	// <2,3,0,7>: Cost 3 vsldoi4 <7,2,3,0>, <7,2,3,0>
+  537707165U,	// <2,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689835748U,	// <2,3,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611449140U,	// <2,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611449238U,	// <2,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  3763577805U,	// <2,3,1,3>: Cost 4 vsldoi8 LHS, <1,3,0,1>
+  2689836112U,	// <2,3,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689836143U,	// <2,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689836239U,	// <2,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  3366881210U,	// <2,3,1,7>: Cost 4 vmrglw <1,u,2,1>, <2,6,3,7>
+  1616094588U,	// <2,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2689836493U,	// <2,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,3,0>
+  2685191711U,	// <2,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611449960U,	// <2,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611450022U,	// <2,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2689836822U,	// <2,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,3,5>
+  2689836904U,	// <2,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,3,6>
+  1611450298U,	// <2,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2295138234U,	// <2,3,2,7>: Cost 3 vmrglw <2,2,2,2>, <2,6,3,7>
+  1611450456U,	// <2,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,3,3>
+  1213440918U,	// <2,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282538527U,	// <2,3,3,1>: Cost 3 vmrglw LHS, <2,1,3,1>
+  1557022322U,	// <2,3,3,2>: Cost 2 vsldoi4 <2,2,3,3>, <2,2,3,3>
+  1208796786U,	// <2,3,3,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1213440922U,	// <2,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282538531U,	// <2,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2287188094U,	// <2,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1213441978U,	// <2,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  1208796791U,	// <2,3,3,u>: Cost 2 vmrglw LHS, <2,2,3,u>
+  1551056998U,	// <2,3,4,0>: Cost 2 vsldoi4 <1,2,3,4>, LHS
+  1551057818U,	// <2,3,4,1>: Cost 2 vsldoi4 <1,2,3,4>, <1,2,3,4>
+  2624800360U,	// <2,3,4,2>: Cost 3 vsldoi4 <1,2,3,4>, <2,2,2,2>
+  2624800918U,	// <2,3,4,3>: Cost 3 vsldoi4 <1,2,3,4>, <3,0,1,2>
+  1551060278U,	// <2,3,4,4>: Cost 2 vsldoi4 <1,2,3,4>, RHS
+  537709878U,	// <2,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2732969337U,	// <2,3,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2660635824U,	// <2,3,4,7>: Cost 3 vsldoi4 <7,2,3,4>, <7,2,3,4>
+  537710121U,	// <2,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689838664U,	// <2,3,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2732969615U,	// <2,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2732969707U,	// <2,3,5,2>: Cost 3 vsldoi8 LHS, <5,2,1,3>
+  3763580721U,	// <2,3,5,3>: Cost 4 vsldoi8 LHS, <5,3,0,1>
+  2689839028U,	// <2,3,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659228164U,	// <2,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659228258U,	// <2,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  3364259770U,	// <2,3,5,7>: Cost 4 vmrglw <1,4,2,5>, <2,6,3,7>
+  1659228420U,	// <2,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2230798486U,	// <2,3,6,0>: Cost 3 vmrghw <2,6,3,7>, <3,0,1,2>
+  2732970407U,	// <2,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1659228666U,	// <2,3,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2230798748U,	// <2,3,6,3>: Cost 3 vmrghw <2,6,3,7>, <3,3,3,3>
+  2230798850U,	// <2,3,6,4>: Cost 3 vmrghw <2,6,3,7>, <3,4,5,6>
+  2732970731U,	// <2,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659228984U,	// <2,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659229006U,	// <2,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1659229087U,	// <2,3,6,u>: Cost 2 vsldoi8 LHS, <6,u,0,1>
+  1659229178U,	// <2,3,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2726999125U,	// <2,3,7,1>: Cost 3 vsldoi8 <7,1,2,3>, <7,1,2,3>
+  2727662758U,	// <2,3,7,2>: Cost 3 vsldoi8 <7,2,2,3>, <7,2,2,3>
+  2732971235U,	// <2,3,7,3>: Cost 3 vsldoi8 LHS, <7,3,0,1>
+  1659229542U,	// <2,3,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2732971446U,	// <2,3,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2732971484U,	// <2,3,7,6>: Cost 3 vsldoi8 LHS, <7,6,0,7>
+  1659229804U,	// <2,3,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659229826U,	// <2,3,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1208837014U,	// <2,3,u,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  537712430U,	// <2,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1616099205U,	// <2,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,0>
+  1208837746U,	// <2,3,u,3>: Cost 2 vmrglw LHS, <2,2,3,3>
+  1208837018U,	// <2,3,u,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  537712794U,	// <2,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1616099536U,	// <2,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1208838074U,	// <2,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  537712997U,	// <2,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  3771547648U,	// <2,4,0,0>: Cost 4 vsldoi8 <2,2,2,4>, <0,0,0,0>
+  2697805926U,	// <2,4,0,1>: Cost 3 vsldoi8 <2,2,2,4>, LHS
+  3770884269U,	// <2,4,0,2>: Cost 4 vsldoi8 <2,1,2,4>, <0,2,1,2>
+  3806716164U,	// <2,4,0,3>: Cost 4 vsldoi8 <u,1,2,4>, <0,3,1,u>
+  3771547986U,	// <2,4,0,4>: Cost 4 vsldoi8 <2,2,2,4>, <0,4,1,5>
+  2226761014U,	// <2,4,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3853462427U,	// <2,4,0,6>: Cost 4 vsldoi12 <4,6,5,2>, <4,0,6,1>
+  3867102116U,	// <2,4,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,0,7,1>
+  2226761257U,	// <2,4,0,u>: Cost 3 vmrghw <2,0,3,0>, RHS
+  3849186231U,	// <2,4,1,0>: Cost 4 vsldoi12 <4,0,1,2>, <4,1,0,2>
+  3301207010U,	// <2,4,1,1>: Cost 4 vmrghw <2,1,3,5>, <4,1,5,0>
+  3766240150U,	// <2,4,1,2>: Cost 4 vsldoi8 <1,3,2,4>, <1,2,3,0>
+  3766240226U,	// <2,4,1,3>: Cost 4 vsldoi8 <1,3,2,4>, <1,3,2,4>
+  3301207248U,	// <2,4,1,4>: Cost 4 vmrghw <2,1,3,5>, <4,4,4,4>
+  2227432758U,	// <2,4,1,5>: Cost 3 vmrghw <2,1,3,1>, RHS
+  3758941400U,	// <2,4,1,6>: Cost 4 vsldoi8 <0,1,2,4>, <1,6,2,7>
+  3768894758U,	// <2,4,1,7>: Cost 4 vsldoi8 <1,7,2,4>, <1,7,2,4>
+  2227433001U,	// <2,4,1,u>: Cost 3 vmrghw <2,1,3,1>, RHS
+  2228030354U,	// <2,4,2,0>: Cost 3 vmrghw <2,2,2,2>, <4,0,5,1>
+  3770885657U,	// <2,4,2,1>: Cost 4 vsldoi8 <2,1,2,4>, <2,1,2,4>
+  2697807466U,	// <2,4,2,2>: Cost 3 vsldoi8 <2,2,2,4>, <2,2,2,4>
+  3368880468U,	// <2,4,2,3>: Cost 4 vmrglw <2,2,2,2>, <3,2,4,3>
+  2228030672U,	// <2,4,2,4>: Cost 3 vmrghw <2,2,2,2>, <4,4,4,4>
+  1154288950U,	// <2,4,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  3771549617U,	// <2,4,2,6>: Cost 4 vsldoi8 <2,2,2,4>, <2,6,2,7>
+  3368880796U,	// <2,4,2,7>: Cost 4 vmrglw <2,2,2,2>, <3,6,4,7>
+  1154289193U,	// <2,4,2,u>: Cost 2 vmrghw <2,2,2,2>, RHS
+  2636808294U,	// <2,4,3,0>: Cost 3 vsldoi4 <3,2,4,3>, LHS
+  2287181861U,	// <2,4,3,1>: Cost 3 vmrglw LHS, <0,0,4,1>
+  2228866102U,	// <2,4,3,2>: Cost 3 vmrghw <2,3,4,5>, <4,2,5,3>
+  2636810580U,	// <2,4,3,3>: Cost 3 vsldoi4 <3,2,4,3>, <3,2,4,3>
+  1256574160U,	// <2,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1213441742U,	// <2,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2228866430U,	// <2,4,3,6>: Cost 3 vmrghw <2,3,4,5>, <4,6,5,7>
+  2660701368U,	// <2,4,3,7>: Cost 3 vsldoi4 <7,2,4,3>, <7,2,4,3>
+  1213441745U,	// <2,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3704586342U,	// <2,4,4,0>: Cost 4 vsldoi4 <2,2,4,4>, LHS
+  3782831051U,	// <2,4,4,1>: Cost 4 vsldoi8 <4,1,2,4>, <4,1,2,4>
+  3704587900U,	// <2,4,4,2>: Cost 4 vsldoi4 <2,2,4,4>, <2,2,4,4>
+  3368896123U,	// <2,4,4,3>: Cost 4 vmrglw <2,2,2,4>, <2,2,4,3>
+  2793360592U,	// <2,4,4,4>: Cost 3 vsldoi12 <7,0,1,2>, <4,4,4,4>
+  2697809206U,	// <2,4,4,5>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  3303198078U,	// <2,4,4,6>: Cost 4 vmrghw <2,4,3,5>, <4,6,5,7>
+  3867102444U,	// <2,4,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,4,7,5>
+  2697809449U,	// <2,4,4,u>: Cost 3 vsldoi8 <2,2,2,4>, RHS
+  2630852710U,	// <2,4,5,0>: Cost 3 vsldoi4 <2,2,4,5>, LHS
+  2624881572U,	// <2,4,5,1>: Cost 3 vsldoi4 <1,2,4,5>, <1,2,4,5>
+  2630854269U,	// <2,4,5,2>: Cost 3 vsldoi4 <2,2,4,5>, <2,2,4,5>
+  2666686677U,	// <2,4,5,3>: Cost 3 vsldoi4 <u,2,4,5>, <3,0,u,2>
+  2630855990U,	// <2,4,5,4>: Cost 3 vsldoi4 <2,2,4,5>, RHS
+  2230127926U,	// <2,4,5,5>: Cost 3 vmrghw <2,5,3,6>, RHS
+  1691159862U,	// <2,4,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  3867102520U,	// <2,4,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,5,7,0>
+  1691159880U,	// <2,4,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230799250U,	// <2,4,6,0>: Cost 3 vmrghw <2,6,3,7>, <4,0,5,1>
+  3304541130U,	// <2,4,6,1>: Cost 4 vmrghw <2,6,3,7>, <4,1,2,3>
+  2230799417U,	// <2,4,6,2>: Cost 3 vmrghw <2,6,3,7>, <4,2,5,6>
+  3304541323U,	// <2,4,6,3>: Cost 4 vmrghw <2,6,3,7>, <4,3,5,7>
+  2230799568U,	// <2,4,6,4>: Cost 3 vmrghw <2,6,3,7>, <4,4,4,4>
+  1157057846U,	// <2,4,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3304541566U,	// <2,4,6,6>: Cost 4 vmrghw <2,6,3,7>, <4,6,5,7>
+  3798758243U,	// <2,4,6,7>: Cost 4 vsldoi8 <6,7,2,4>, <6,7,2,4>
+  1157058089U,	// <2,4,6,u>: Cost 2 vmrghw <2,6,3,7>, RHS
+  3806721018U,	// <2,4,7,0>: Cost 4 vsldoi8 <u,1,2,4>, <7,0,1,2>
+  3853831590U,	// <2,4,7,1>: Cost 4 vsldoi12 <4,7,1,2>, <4,7,1,2>
+  3801412775U,	// <2,4,7,2>: Cost 4 vsldoi8 <7,2,2,4>, <7,2,2,4>
+  3802076408U,	// <2,4,7,3>: Cost 4 vsldoi8 <7,3,2,4>, <7,3,2,4>
+  3401436368U,	// <2,4,7,4>: Cost 4 vmrglw <7,6,2,7>, <4,4,4,4>
+  2793360840U,	// <2,4,7,5>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,5,0>
+  3804067307U,	// <2,4,7,6>: Cost 4 vsldoi8 <7,6,2,4>, <7,6,2,4>
+  3867102682U,	// <2,4,7,7>: Cost 4 vsldoi12 <7,0,1,2>, <4,7,7,0>
+  2793360867U,	// <2,4,7,u>: Cost 3 vsldoi12 <7,0,1,2>, <4,7,u,0>
+  2630877286U,	// <2,4,u,0>: Cost 3 vsldoi4 <2,2,4,u>, LHS
+  2282580144U,	// <2,4,u,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2630878848U,	// <2,4,u,2>: Cost 3 vsldoi4 <2,2,4,u>, <2,2,4,u>
+  2636851545U,	// <2,4,u,3>: Cost 3 vsldoi4 <3,2,4,u>, <3,2,4,u>
+  1256615120U,	// <2,4,u,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1208837838U,	// <2,4,u,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  1691160105U,	// <2,4,u,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2660742333U,	// <2,4,u,7>: Cost 3 vsldoi4 <7,2,4,u>, <7,2,4,u>
+  1208837841U,	// <2,4,u,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  3766910976U,	// <2,5,0,0>: Cost 4 vsldoi8 <1,4,2,5>, <0,0,0,0>
+  2693169254U,	// <2,5,0,1>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3760939181U,	// <2,5,0,2>: Cost 4 vsldoi8 <0,4,2,5>, <0,2,1,2>
+  3843214936U,	// <2,5,0,3>: Cost 4 vsldoi12 <3,0,1,2>, <5,0,3,0>
+  3760939355U,	// <2,5,0,4>: Cost 4 vsldoi8 <0,4,2,5>, <0,4,2,5>
+  3867102827U,	// <2,5,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,5,1>
+  3867102836U,	// <2,5,0,6>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,6,1>
+  3867102844U,	// <2,5,0,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,0,7,0>
+  2693169821U,	// <2,5,0,u>: Cost 3 vsldoi8 <1,4,2,5>, LHS
+  3766911724U,	// <2,5,1,0>: Cost 4 vsldoi8 <1,4,2,5>, <1,0,2,1>
+  3766911796U,	// <2,5,1,1>: Cost 4 vsldoi8 <1,4,2,5>, <1,1,1,1>
+  2693170070U,	// <2,5,1,2>: Cost 3 vsldoi8 <1,4,2,5>, <1,2,3,0>
+  3384798262U,	// <2,5,1,3>: Cost 4 vmrglw <4,u,2,1>, <4,2,5,3>
+  2693170228U,	// <2,5,1,4>: Cost 3 vsldoi8 <1,4,2,5>, <1,4,2,5>
+  3301208068U,	// <2,5,1,5>: Cost 4 vmrghw <2,1,3,5>, <5,5,5,5>
+  3366879607U,	// <2,5,1,6>: Cost 4 vmrglw <1,u,2,1>, <0,4,5,6>
+  3867102925U,	// <2,5,1,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,1,7,0>
+  2695824760U,	// <2,5,1,u>: Cost 3 vsldoi8 <1,u,2,5>, <1,u,2,5>
+  2642845798U,	// <2,5,2,0>: Cost 3 vsldoi4 <4,2,5,2>, LHS
+  2295139218U,	// <2,5,2,1>: Cost 3 vmrglw <2,2,2,2>, <4,0,5,1>
+  2699142760U,	// <2,5,2,2>: Cost 3 vsldoi8 <2,4,2,5>, <2,2,2,2>
+  3766912678U,	// <2,5,2,3>: Cost 4 vsldoi8 <1,4,2,5>, <2,3,0,1>
+  2699142925U,	// <2,5,2,4>: Cost 3 vsldoi8 <2,4,2,5>, <2,4,2,5>
+  2228031492U,	// <2,5,2,5>: Cost 3 vmrghw <2,2,2,2>, <5,5,5,5>
+  2295138818U,	// <2,5,2,6>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,6>
+  3368879347U,	// <2,5,2,7>: Cost 4 vmrglw <2,2,2,2>, <1,6,5,7>
+  2295138820U,	// <2,5,2,u>: Cost 3 vmrglw <2,2,2,2>, <3,4,5,u>
+  2287184866U,	// <2,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256573842U,	// <2,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642855630U,	// <2,5,3,2>: Cost 3 vsldoi4 <4,2,5,3>, <2,3,4,5>
+  2287182763U,	// <2,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287184870U,	// <2,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256574170U,	// <2,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1213442562U,	// <2,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287183091U,	// <2,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1213442564U,	// <2,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3716604006U,	// <2,5,4,0>: Cost 4 vsldoi4 <4,2,5,4>, LHS
+  3716604822U,	// <2,5,4,1>: Cost 4 vsldoi4 <4,2,5,4>, <1,2,3,0>
+  3766914099U,	// <2,5,4,2>: Cost 4 vsldoi8 <1,4,2,5>, <4,2,5,0>
+  3368895403U,	// <2,5,4,3>: Cost 5 vmrglw <2,2,2,4>, <1,2,5,3>
+  3716607031U,	// <2,5,4,4>: Cost 4 vsldoi4 <4,2,5,4>, <4,2,5,4>
+  2693172534U,	// <2,5,4,5>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3363588610U,	// <2,5,4,6>: Cost 4 vmrglw <1,3,2,4>, <3,4,5,6>
+  3368895731U,	// <2,5,4,7>: Cost 5 vmrglw <2,2,2,4>, <1,6,5,7>
+  2693172777U,	// <2,5,4,u>: Cost 3 vsldoi8 <1,4,2,5>, RHS
+  3704668262U,	// <2,5,5,0>: Cost 4 vsldoi4 <2,2,5,5>, LHS
+  3704669078U,	// <2,5,5,1>: Cost 4 vsldoi4 <2,2,5,5>, <1,2,3,0>
+  3704669830U,	// <2,5,5,2>: Cost 4 vsldoi4 <2,2,5,5>, <2,2,5,5>
+  3364259460U,	// <2,5,5,3>: Cost 4 vmrglw <1,4,2,5>, <2,2,5,3>
+  3704671542U,	// <2,5,5,4>: Cost 4 vsldoi4 <2,2,5,5>, RHS
+  2793361412U,	// <2,5,5,5>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  3364258167U,	// <2,5,5,6>: Cost 4 vmrglw <1,4,2,5>, <0,4,5,6>
+  3867103249U,	// <2,5,5,7>: Cost 4 vsldoi12 <7,0,1,2>, <5,5,7,0>
+  2793361412U,	// <2,5,5,u>: Cost 3 vsldoi12 <7,0,1,2>, <5,5,5,5>
+  2642878566U,	// <2,5,6,0>: Cost 3 vsldoi4 <4,2,5,6>, LHS
+  3386166810U,	// <2,5,6,1>: Cost 4 vmrglw <5,1,2,6>, <4,u,5,1>
+  2723033594U,	// <2,5,6,2>: Cost 3 vsldoi8 <6,4,2,5>, <6,2,7,3>
+  3848523842U,	// <2,5,6,3>: Cost 4 vsldoi12 <3,u,1,2>, <5,6,3,4>
+  2723033713U,	// <2,5,6,4>: Cost 3 vsldoi8 <6,4,2,5>, <6,4,2,5>
+  2230800388U,	// <2,5,6,5>: Cost 3 vmrghw <2,6,3,7>, <5,5,5,5>
+  2230800482U,	// <2,5,6,6>: Cost 3 vmrghw <2,6,3,7>, <5,6,7,0>
+  2785841252U,	// <2,5,6,7>: Cost 3 vsldoi12 <5,6,7,2>, <5,6,7,2>
+  2785914989U,	// <2,5,6,u>: Cost 3 vsldoi12 <5,6,u,2>, <5,6,u,2>
+  3796775930U,	// <2,5,7,0>: Cost 4 vsldoi8 <6,4,2,5>, <7,0,1,2>
+  3800757335U,	// <2,5,7,1>: Cost 4 vsldoi8 <7,1,2,5>, <7,1,2,5>
+  3853463689U,	// <2,5,7,2>: Cost 4 vsldoi12 <4,6,5,2>, <5,7,2,3>
+  3796776218U,	// <2,5,7,3>: Cost 4 vsldoi8 <6,4,2,5>, <7,3,6,2>
+  3796776294U,	// <2,5,7,4>: Cost 4 vsldoi8 <6,4,2,5>, <7,4,5,6>
+  3803411867U,	// <2,5,7,5>: Cost 4 vsldoi8 <7,5,2,5>, <7,5,2,5>
+  3371575081U,	// <2,5,7,6>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,6>
+  3796776516U,	// <2,5,7,7>: Cost 4 vsldoi8 <6,4,2,5>, <7,7,3,3>
+  3371575083U,	// <2,5,7,u>: Cost 4 vmrglw <2,6,2,7>, <2,4,5,u>
+  2287225826U,	// <2,5,u,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1256614802U,	// <2,5,u,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2642896590U,	// <2,5,u,2>: Cost 3 vsldoi4 <4,2,5,u>, <2,3,4,5>
+  2287223723U,	// <2,5,u,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287225830U,	// <2,5,u,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1256615130U,	// <2,5,u,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1208838658U,	// <2,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287224051U,	// <2,5,u,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1208838660U,	// <2,5,u,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  3772227584U,	// <2,6,0,0>: Cost 4 vsldoi8 <2,3,2,6>, <0,0,0,0>
+  2698485862U,	// <2,6,0,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3759620282U,	// <2,6,0,2>: Cost 4 vsldoi8 <0,2,2,6>, <0,2,2,6>
+  3710675299U,	// <2,6,0,3>: Cost 4 vsldoi4 <3,2,6,0>, <3,2,6,0>
+  3767583058U,	// <2,6,0,4>: Cost 4 vsldoi8 <1,5,2,6>, <0,4,1,5>
+  3378153265U,	// <2,6,0,5>: Cost 5 vmrglw <3,7,2,0>, <2,4,6,5>
+  3865186637U,	// <2,6,0,6>: Cost 4 vsldoi12 <6,6,2,2>, <6,0,6,1>
+  2330291510U,	// <2,6,0,7>: Cost 3 vmrglw <u,1,2,0>, RHS
+  2698486429U,	// <2,6,0,u>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  3734569062U,	// <2,6,1,0>: Cost 4 vsldoi4 <7,2,6,1>, LHS
+  3764929346U,	// <2,6,1,1>: Cost 4 vsldoi8 <1,1,2,6>, <1,1,2,6>
+  3772228502U,	// <2,6,1,2>: Cost 4 vsldoi8 <2,3,2,6>, <1,2,3,0>
+  3734571158U,	// <2,6,1,3>: Cost 4 vsldoi4 <7,2,6,1>, <3,0,1,2>
+  3734572342U,	// <2,6,1,4>: Cost 4 vsldoi4 <7,2,6,1>, RHS
+  3767583878U,	// <2,6,1,5>: Cost 4 vsldoi8 <1,5,2,6>, <1,5,2,6>
+  3768247511U,	// <2,6,1,6>: Cost 4 vsldoi8 <1,6,2,6>, <1,6,2,6>
+  2293140790U,	// <2,6,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  2293140791U,	// <2,6,1,u>: Cost 3 vmrglw <1,u,2,1>, RHS
+  3704717414U,	// <2,6,2,0>: Cost 4 vsldoi4 <2,2,6,2>, LHS
+  3395424589U,	// <2,6,2,1>: Cost 4 vmrglw <6,6,2,2>, <6,0,6,1>
+  2228031993U,	// <2,6,2,2>: Cost 3 vmrghw <2,2,2,2>, <6,2,7,2>
+  2698487485U,	// <2,6,2,3>: Cost 3 vsldoi8 <2,3,2,6>, <2,3,2,6>
+  3704720694U,	// <2,6,2,4>: Cost 4 vsldoi4 <2,2,6,2>, RHS
+  3773556575U,	// <2,6,2,5>: Cost 4 vsldoi8 <2,5,2,6>, <2,5,2,6>
+  2698487738U,	// <2,6,2,6>: Cost 3 vsldoi8 <2,3,2,6>, <2,6,3,7>
+  1221397814U,	// <2,6,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  1221397815U,	// <2,6,2,u>: Cost 2 vmrglw <2,2,2,2>, RHS
+  2636955750U,	// <2,6,3,0>: Cost 3 vsldoi4 <3,2,6,3>, LHS
+  2330314217U,	// <2,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2636957626U,	// <2,6,3,2>: Cost 3 vsldoi4 <3,2,6,3>, <2,6,3,7>
+  2287184230U,	// <2,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636959030U,	// <2,6,3,4>: Cost 3 vsldoi4 <3,2,6,3>, RHS
+  2648903448U,	// <2,6,3,5>: Cost 3 vsldoi4 <5,2,6,3>, <5,2,6,3>
+  1256575800U,	// <2,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135056694U,	// <2,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135056695U,	// <2,6,3,u>: Cost 1 vmrglw LHS, RHS
+  3710705766U,	// <2,6,4,0>: Cost 4 vsldoi4 <3,2,6,4>, LHS
+  3698762677U,	// <2,6,4,1>: Cost 5 vsldoi4 <1,2,6,4>, <1,2,6,4>
+  3710707389U,	// <2,6,4,2>: Cost 4 vsldoi4 <3,2,6,4>, <2,3,2,6>
+  3710708071U,	// <2,6,4,3>: Cost 4 vsldoi4 <3,2,6,4>, <3,2,6,4>
+  3710709046U,	// <2,6,4,4>: Cost 4 vsldoi4 <3,2,6,4>, RHS
+  2698489142U,	// <2,6,4,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  3796782457U,	// <2,6,4,6>: Cost 4 vsldoi8 <6,4,2,6>, <4,6,5,2>
+  2295156022U,	// <2,6,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  2295156023U,	// <2,6,4,u>: Cost 3 vmrglw <2,2,2,4>, RHS
+  3303870753U,	// <2,6,5,0>: Cost 4 vmrghw <2,5,3,6>, <6,0,1,2>
+  3788820134U,	// <2,6,5,1>: Cost 4 vsldoi8 <5,1,2,6>, <5,1,2,6>
+  3779530520U,	// <2,6,5,2>: Cost 4 vsldoi8 <3,5,2,6>, <5,2,6,3>
+  3303871026U,	// <2,6,5,3>: Cost 4 vmrghw <2,5,3,6>, <6,3,4,5>
+  3303871117U,	// <2,6,5,4>: Cost 4 vmrghw <2,5,3,6>, <6,4,5,6>
+  3791474666U,	// <2,6,5,5>: Cost 4 vsldoi8 <5,5,2,6>, <5,5,2,6>
+  3792138299U,	// <2,6,5,6>: Cost 4 vsldoi8 <5,6,2,6>, <5,6,2,6>
+  2290519350U,	// <2,6,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2290519351U,	// <2,6,5,u>: Cost 3 vmrglw <1,4,2,5>, RHS
+  2631008358U,	// <2,6,6,0>: Cost 3 vsldoi4 <2,2,6,6>, LHS
+  3372893673U,	// <2,6,6,1>: Cost 4 vmrglw <2,u,2,6>, <2,0,6,1>
+  2791445264U,	// <2,6,6,2>: Cost 3 vsldoi12 <6,6,2,2>, <6,6,2,2>
+  2230800968U,	// <2,6,6,3>: Cost 3 vmrghw <2,6,3,7>, <6,3,7,0>
+  2631011638U,	// <2,6,6,4>: Cost 3 vsldoi4 <2,2,6,6>, RHS
+  3372894001U,	// <2,6,6,5>: Cost 4 vmrglw <2,u,2,6>, <2,4,6,5>
+  2793362232U,	// <2,6,6,6>: Cost 3 vsldoi12 <7,0,1,2>, <6,6,6,6>
+  2295835958U,	// <2,6,6,7>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2295835959U,	// <2,6,6,u>: Cost 3 vmrglw <2,3,2,6>, RHS
+  2793362254U,	// <2,6,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,0,1>
+  2792035160U,	// <2,6,7,1>: Cost 3 vsldoi12 <6,7,1,2>, <6,7,1,2>
+  2792108897U,	// <2,6,7,2>: Cost 3 vsldoi12 <6,7,2,2>, <6,7,2,2>
+  2769474408U,	// <2,6,7,3>: Cost 3 vsldoi12 <3,0,1,2>, <6,7,3,0>
+  2793362294U,	// <2,6,7,4>: Cost 3 vsldoi12 <7,0,1,2>, <6,7,4,5>
+  3371575089U,	// <2,6,7,5>: Cost 4 vmrglw <2,6,2,7>, <2,4,6,5>
+  2792403845U,	// <2,6,7,6>: Cost 3 vsldoi12 <6,7,6,2>, <6,7,6,2>
+  2297834806U,	// <2,6,7,7>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2297834807U,	// <2,6,7,u>: Cost 3 vmrglw <2,6,2,7>, RHS
+  2636996710U,	// <2,6,u,0>: Cost 3 vsldoi4 <3,2,6,u>, LHS
+  2698491694U,	// <2,6,u,1>: Cost 3 vsldoi8 <2,3,2,6>, LHS
+  2636998631U,	// <2,6,u,2>: Cost 3 vsldoi4 <3,2,6,u>, <2,6,u,7>
+  2282580326U,	// <2,6,u,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  2636999990U,	// <2,6,u,4>: Cost 3 vsldoi4 <3,2,6,u>, RHS
+  2698492058U,	// <2,6,u,5>: Cost 3 vsldoi8 <2,3,2,6>, RHS
+  1256616760U,	// <2,6,u,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135097654U,	// <2,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135097655U,	// <2,6,u,u>: Cost 1 vmrglw LHS, RHS
+  2666864742U,	// <2,7,0,0>: Cost 3 vsldoi4 <u,2,7,0>, LHS
+  1719620602U,	// <2,7,0,1>: Cost 2 vsldoi12 <7,0,1,2>, <7,0,1,2>
+  3768254637U,	// <2,7,0,2>: Cost 4 vsldoi8 <1,6,2,7>, <0,2,1,2>
+  3393417722U,	// <2,7,0,3>: Cost 4 vmrglw <6,3,2,0>, <6,2,7,3>
+  2666868022U,	// <2,7,0,4>: Cost 3 vsldoi4 <u,2,7,0>, RHS
+  3867104290U,	// <2,7,0,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,0,5,6>
+  3728667127U,	// <2,7,0,6>: Cost 4 vsldoi4 <6,2,7,0>, <6,2,7,0>
+  2666869817U,	// <2,7,0,7>: Cost 3 vsldoi4 <u,2,7,0>, <7,0,u,2>
+  1720136761U,	// <2,7,0,u>: Cost 2 vsldoi12 <7,0,u,2>, <7,0,u,2>
+  3728670822U,	// <2,7,1,0>: Cost 4 vsldoi4 <6,2,7,1>, LHS
+  3774227252U,	// <2,7,1,1>: Cost 4 vsldoi8 <2,6,2,7>, <1,1,1,1>
+  3774227350U,	// <2,7,1,2>: Cost 4 vsldoi8 <2,6,2,7>, <1,2,3,0>
+  2323001850U,	// <2,7,1,3>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  3728674102U,	// <2,7,1,4>: Cost 4 vsldoi4 <6,2,7,1>, RHS
+  3774227567U,	// <2,7,1,5>: Cost 5 vsldoi8 <2,6,2,7>, <1,5,0,1>
+  2694513880U,	// <2,7,1,6>: Cost 3 vsldoi8 <1,6,2,7>, <1,6,2,7>
+  3396744002U,	// <2,7,1,7>: Cost 4 vmrglw <6,u,2,1>, <6,6,7,7>
+  2323001850U,	// <2,7,1,u>: Cost 3 vmrglw <6,u,2,1>, <6,2,7,3>
+  2654937190U,	// <2,7,2,0>: Cost 3 vsldoi4 <6,2,7,2>, LHS
+  3728679732U,	// <2,7,2,1>: Cost 4 vsldoi4 <6,2,7,2>, <1,1,1,1>
+  2700486248U,	// <2,7,2,2>: Cost 3 vsldoi8 <2,6,2,7>, <2,2,2,2>
+  2321682938U,	// <2,7,2,3>: Cost 3 vmrglw <6,6,2,2>, <6,2,7,3>
+  2654940470U,	// <2,7,2,4>: Cost 3 vsldoi4 <6,2,7,2>, RHS
+  3859584196U,	// <2,7,2,5>: Cost 4 vsldoi12 <5,6,7,2>, <7,2,5,6>
+  2700486577U,	// <2,7,2,6>: Cost 3 vsldoi8 <2,6,2,7>, <2,6,2,7>
+  2228033132U,	// <2,7,2,7>: Cost 3 vmrghw <2,2,2,2>, <7,7,7,7>
+  2701813843U,	// <2,7,2,u>: Cost 3 vsldoi8 <2,u,2,7>, <2,u,2,7>
+  1581203558U,	// <2,7,3,0>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  2654946100U,	// <2,7,3,1>: Cost 3 vsldoi4 <6,2,7,3>, <1,1,1,1>
+  2637031354U,	// <2,7,3,2>: Cost 3 vsldoi4 <3,2,7,3>, <2,6,3,7>
+  1256575482U,	// <2,7,3,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581206838U,	// <2,7,3,4>: Cost 2 vsldoi4 <6,2,7,3>, RHS
+  2654949380U,	// <2,7,3,5>: Cost 3 vsldoi4 <6,2,7,3>, <5,5,5,5>
+  1581208058U,	// <2,7,3,6>: Cost 2 vsldoi4 <6,2,7,3>, <6,2,7,3>
+  1256575810U,	// <2,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581209390U,	// <2,7,3,u>: Cost 2 vsldoi4 <6,2,7,3>, LHS
+  3728695398U,	// <2,7,4,0>: Cost 4 vsldoi4 <6,2,7,4>, LHS
+  3869758782U,	// <2,7,4,1>: Cost 4 vsldoi12 <7,4,1,2>, <7,4,1,2>
+  3728696936U,	// <2,7,4,2>: Cost 4 vsldoi4 <6,2,7,4>, <2,2,2,2>
+  3393450490U,	// <2,7,4,3>: Cost 4 vmrglw <6,3,2,4>, <6,2,7,3>
+  3728698678U,	// <2,7,4,4>: Cost 4 vsldoi4 <6,2,7,4>, RHS
+  2700487990U,	// <2,7,4,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3728699899U,	// <2,7,4,6>: Cost 4 vsldoi4 <6,2,7,4>, <6,2,7,4>
+  3867104626U,	// <2,7,4,7>: Cost 4 vsldoi12 <7,0,1,2>, <7,4,7,0>
+  2700488233U,	// <2,7,4,u>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  3855160709U,	// <2,7,5,0>: Cost 4 vsldoi12 <5,0,1,2>, <7,5,0,1>
+  3728704406U,	// <2,7,5,1>: Cost 4 vsldoi4 <6,2,7,5>, <1,2,3,0>
+  3370233956U,	// <2,7,5,2>: Cost 4 vmrglw <2,4,2,5>, <5,6,7,2>
+  2320380410U,	// <2,7,5,3>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  3728706870U,	// <2,7,5,4>: Cost 4 vsldoi4 <6,2,7,5>, RHS
+  3867104694U,	// <2,7,5,5>: Cost 4 vsldoi12 <7,0,1,2>, <7,5,5,5>
+  3792146492U,	// <2,7,5,6>: Cost 4 vsldoi8 <5,6,2,7>, <5,6,2,7>
+  3394122562U,	// <2,7,5,7>: Cost 4 vmrglw <6,4,2,5>, <6,6,7,7>
+  2320380410U,	// <2,7,5,u>: Cost 3 vmrglw <6,4,2,5>, <6,2,7,3>
+  2230801402U,	// <2,7,6,0>: Cost 3 vmrghw <2,6,3,7>, <7,0,1,2>
+  3768258984U,	// <2,7,6,1>: Cost 4 vsldoi8 <1,6,2,7>, <6,1,7,2>
+  2730349050U,	// <2,7,6,2>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  3372894575U,	// <2,7,6,3>: Cost 4 vmrglw <2,u,2,6>, <3,2,7,3>
+  2230801766U,	// <2,7,6,4>: Cost 3 vmrghw <2,6,3,7>, <7,4,5,6>
+  3304543670U,	// <2,7,6,5>: Cost 4 vmrghw <2,6,3,7>, <7,5,5,5>
+  3728716285U,	// <2,7,6,6>: Cost 4 vsldoi4 <6,2,7,6>, <6,2,7,6>
+  2230802028U,	// <2,7,6,7>: Cost 3 vmrghw <2,6,3,7>, <7,7,7,7>
+  2730349050U,	// <2,7,6,u>: Cost 3 vsldoi8 <7,6,2,7>, <6,2,7,3>
+  2793362983U,	// <2,7,7,0>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,0,1>
+  3728721112U,	// <2,7,7,1>: Cost 4 vsldoi4 <6,2,7,7>, <1,6,2,7>
+  3371574933U,	// <2,7,7,2>: Cost 4 vmrglw <2,6,2,7>, <2,2,7,2>
+  2327695866U,	// <2,7,7,3>: Cost 3 vmrglw <7,6,2,7>, <6,2,7,3>
+  3728723254U,	// <2,7,7,4>: Cost 4 vsldoi4 <6,2,7,7>, RHS
+  3371574855U,	// <2,7,7,5>: Cost 5 vmrglw <2,6,2,7>, <2,1,7,5>
+  2730350062U,	// <2,7,7,6>: Cost 3 vsldoi8 <7,6,2,7>, <7,6,2,7>
+  2793363052U,	// <2,7,7,7>: Cost 3 vsldoi12 <7,0,1,2>, <7,7,7,7>
+  2798671471U,	// <2,7,7,u>: Cost 3 vsldoi12 <7,u,1,2>, <7,7,u,1>
+  1581244518U,	// <2,7,u,0>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1724929666U,	// <2,7,u,1>: Cost 2 vsldoi12 <7,u,1,2>, <7,u,1,2>
+  2637072314U,	// <2,7,u,2>: Cost 3 vsldoi4 <3,2,7,u>, <2,6,3,7>
+  1256616442U,	// <2,7,u,3>: Cost 2 vmrglw LHS, <6,2,7,3>
+  1581247798U,	// <2,7,u,4>: Cost 2 vsldoi4 <6,2,7,u>, RHS
+  2700490906U,	// <2,7,u,5>: Cost 3 vsldoi8 <2,6,2,7>, RHS
+  1581249023U,	// <2,7,u,6>: Cost 2 vsldoi4 <6,2,7,u>, <6,2,7,u>
+  1256616770U,	// <2,7,u,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1581250350U,	// <2,7,u,u>: Cost 2 vsldoi4 <6,2,7,u>, LHS
+  1611489280U,	// <2,u,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  537747563U,	// <2,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685231277U,	// <2,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685231356U,	// <2,u,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611489618U,	// <2,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2226763930U,	// <2,u,0,5>: Cost 3 vmrghw <2,0,3,0>, RHS
+  2733007350U,	// <2,u,0,6>: Cost 3 vsldoi8 LHS, <0,6,1,7>
+  2660971737U,	// <2,u,0,7>: Cost 3 vsldoi4 <7,2,u,0>, <7,2,u,0>
+  537748125U,	// <2,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  2689876708U,	// <2,u,1,0>: Cost 3 vsldoi8 LHS, <1,0,1,2>
+  1611490100U,	// <2,u,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611490198U,	// <2,u,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  2293137564U,	// <2,u,1,3>: Cost 3 vmrglw <1,u,2,1>, LHS
+  2689877072U,	// <2,u,1,4>: Cost 3 vsldoi8 LHS, <1,4,5,6>
+  2689877103U,	// <2,u,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2689877199U,	// <2,u,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2293140808U,	// <2,u,1,7>: Cost 3 vmrglw <1,u,2,1>, RHS
+  1616135548U,	// <2,u,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  1556938854U,	// <2,u,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  1154291502U,	// <2,u,2,1>: Cost 2 vmrghw <2,2,2,2>, LHS
+  336380006U,	// <2,u,2,2>: Cost 1 vspltisw2 LHS
+  1611490982U,	// <2,u,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  1556942134U,	// <2,u,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  1154291866U,	// <2,u,2,5>: Cost 2 vmrghw <2,2,2,2>, RHS
+  1611491258U,	// <2,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1221397832U,	// <2,u,2,7>: Cost 2 vmrglw <2,2,2,2>, RHS
+  336380006U,	// <2,u,2,u>: Cost 1 vspltisw2 LHS
+  1611491478U,	// <2,u,3,0>: Cost 2 vsldoi8 LHS, <3,0,1,2>
+  1213440073U,	// <2,u,3,1>: Cost 2 vmrglw LHS, <0,0,u,1>
+  1213442261U,	// <2,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135053468U,	// <2,u,3,3>: Cost 1 vmrglw LHS, LHS
+  1611491842U,	// <2,u,3,4>: Cost 2 vsldoi8 LHS, <3,4,5,6>
+  1213440401U,	// <2,u,3,5>: Cost 2 vmrglw LHS, <0,4,u,5>
+  1213442589U,	// <2,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135056712U,	// <2,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135053473U,	// <2,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1551425638U,	// <2,u,4,0>: Cost 2 vsldoi4 <1,2,u,4>, LHS
+  1551426503U,	// <2,u,4,1>: Cost 2 vsldoi4 <1,2,u,4>, <1,2,u,4>
+  2625169000U,	// <2,u,4,2>: Cost 3 vsldoi4 <1,2,u,4>, <2,2,2,2>
+  2625169558U,	// <2,u,4,3>: Cost 3 vsldoi4 <1,2,u,4>, <3,0,1,2>
+  1551428918U,	// <2,u,4,4>: Cost 2 vsldoi4 <1,2,u,4>, RHS
+  537750838U,	// <2,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2733010297U,	// <2,u,4,6>: Cost 3 vsldoi8 LHS, <4,6,5,2>
+  2295156040U,	// <2,u,4,7>: Cost 3 vmrglw <2,2,2,4>, RHS
+  537751081U,	// <2,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  2689879624U,	// <2,u,5,0>: Cost 3 vsldoi8 LHS, <5,0,1,2>
+  2230130478U,	// <2,u,5,1>: Cost 3 vmrghw <2,5,3,6>, LHS
+  2631149217U,	// <2,u,5,2>: Cost 3 vsldoi4 <2,2,u,5>, <2,2,u,5>
+  2290516124U,	// <2,u,5,3>: Cost 3 vmrglw <1,4,2,5>, LHS
+  2689879988U,	// <2,u,5,4>: Cost 3 vsldoi8 LHS, <5,4,5,6>
+  1659269124U,	// <2,u,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1691162778U,	// <2,u,5,6>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2290519368U,	// <2,u,5,7>: Cost 3 vmrglw <1,4,2,5>, RHS
+  1691162796U,	// <2,u,5,u>: Cost 2 vsldoi12 <2,2,2,2>, RHS
+  2230802131U,	// <2,u,6,0>: Cost 3 vmrghw <2,6,3,7>, <u,0,1,2>
+  1157060398U,	// <2,u,6,1>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659269626U,	// <2,u,6,2>: Cost 2 vsldoi8 LHS, <6,2,7,3>
+  2764904656U,	// <2,u,6,3>: Cost 3 vsldoi12 <2,2,2,2>, <u,6,3,7>
+  2230802495U,	// <2,u,6,4>: Cost 3 vmrghw <2,6,3,7>, <u,4,5,6>
+  1157060762U,	// <2,u,6,5>: Cost 2 vmrghw <2,6,3,7>, RHS
+  1659269944U,	// <2,u,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659269966U,	// <2,u,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1157060965U,	// <2,u,6,u>: Cost 2 vmrghw <2,6,3,7>, LHS
+  1659270138U,	// <2,u,7,0>: Cost 2 vsldoi8 LHS, <7,0,1,2>
+  2727040090U,	// <2,u,7,1>: Cost 3 vsldoi8 <7,1,2,u>, <7,1,2,u>
+  2727703723U,	// <2,u,7,2>: Cost 3 vsldoi8 <7,2,2,u>, <7,2,2,u>
+  2297831580U,	// <2,u,7,3>: Cost 3 vmrglw <2,6,2,7>, LHS
+  1659270502U,	// <2,u,7,4>: Cost 2 vsldoi8 LHS, <7,4,5,6>
+  2733012406U,	// <2,u,7,5>: Cost 3 vsldoi8 LHS, <7,5,5,5>
+  2730358255U,	// <2,u,7,6>: Cost 3 vsldoi8 <7,6,2,u>, <7,6,2,u>
+  1659270764U,	// <2,u,7,7>: Cost 2 vsldoi8 LHS, <7,7,7,7>
+  1659270786U,	// <2,u,7,u>: Cost 2 vsldoi8 LHS, <7,u,1,2>
+  1213481923U,	// <2,u,u,0>: Cost 2 vmrglw LHS, <1,2,u,0>
+  537753390U,	// <2,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  336380006U,	// <2,u,u,2>: Cost 1 vspltisw2 LHS
+  135094428U,	// <2,u,u,3>: Cost 1 vmrglw LHS, LHS
+  1213481927U,	// <2,u,u,4>: Cost 2 vmrglw LHS, <1,2,u,4>
+  537753754U,	// <2,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1208838685U,	// <2,u,u,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135097672U,	// <2,u,u,7>: Cost 1 vmrglw LHS, RHS
+  135094433U,	// <2,u,u,u>: Cost 1 vmrglw LHS, LHS
+  1678557184U,	// <3,0,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1678557194U,	// <3,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2631181989U,	// <3,0,0,2>: Cost 3 vsldoi4 <2,3,0,0>, <2,3,0,0>
+  2289223984U,	// <3,0,0,3>: Cost 3 vmrglw <1,2,3,0>, <3,2,0,3>
+  2756943909U,	// <3,0,0,4>: Cost 3 vsldoi12 LHS, <0,0,4,1>
+  3362965729U,	// <3,0,0,5>: Cost 4 vmrglw <1,2,3,0>, <3,1,0,5>
+  3362966054U,	// <3,0,0,6>: Cost 4 vmrglw <1,2,3,0>, <3,5,0,6>
+  2289224312U,	// <3,0,0,7>: Cost 3 vmrglw <1,2,3,0>, <3,6,0,7>
+  1683202121U,	// <3,0,0,u>: Cost 2 vsldoi12 LHS, <0,0,u,1>
+  1557446758U,	// <3,0,1,0>: Cost 2 vsldoi4 <2,3,0,1>, LHS
+  2752741467U,	// <3,0,1,1>: Cost 3 vsldoi12 LHS, <0,1,1,1>
+  604815462U,	// <3,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2631190676U,	// <3,0,1,3>: Cost 3 vsldoi4 <2,3,0,1>, <3,0,1,0>
+  1557450038U,	// <3,0,1,4>: Cost 2 vsldoi4 <2,3,0,1>, RHS
+  2667024388U,	// <3,0,1,5>: Cost 3 vsldoi4 <u,3,0,1>, <5,5,5,5>
+  2800074894U,	// <3,0,1,6>: Cost 3 vsldoi12 LHS, <0,1,6,7>
+  2661053667U,	// <3,0,1,7>: Cost 3 vsldoi4 <7,3,0,1>, <7,3,0,1>
+  604815516U,	// <3,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696521165U,	// <3,0,2,0>: Cost 3 vsldoi8 <2,0,3,0>, <2,0,3,0>
+  2752741549U,	// <3,0,2,1>: Cost 3 vsldoi12 LHS, <0,2,1,2>
+  2691876456U,	// <3,0,2,2>: Cost 3 vsldoi8 <1,2,3,0>, <2,2,2,2>
+  2691876518U,	// <3,0,2,3>: Cost 3 vsldoi8 <1,2,3,0>, <2,3,0,1>
+  3830685895U,	// <3,0,2,4>: Cost 4 vsldoi12 LHS, <0,2,4,1>
+  3765618536U,	// <3,0,2,5>: Cost 4 vsldoi8 <1,2,3,0>, <2,5,3,6>
+  2691876794U,	// <3,0,2,6>: Cost 3 vsldoi8 <1,2,3,0>, <2,6,3,7>
+  2701166596U,	// <3,0,2,7>: Cost 3 vsldoi8 <2,7,3,0>, <2,7,3,0>
+  2756944108U,	// <3,0,2,u>: Cost 3 vsldoi12 LHS, <0,2,u,2>
+  2691877014U,	// <3,0,3,0>: Cost 3 vsldoi8 <1,2,3,0>, <3,0,1,2>
+  1161003110U,	// <3,0,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2691877168U,	// <3,0,3,2>: Cost 3 vsldoi8 <1,2,3,0>, <3,2,0,3>
+  2691877246U,	// <3,0,3,3>: Cost 3 vsldoi8 <1,2,3,0>, <3,3,0,0>
+  2691877378U,	// <3,0,3,4>: Cost 3 vsldoi8 <1,2,3,0>, <3,4,5,6>
+  3765619238U,	// <3,0,3,5>: Cost 4 vsldoi8 <1,2,3,0>, <3,5,0,6>
+  2691877496U,	// <3,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  3368962680U,	// <3,0,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,0,7>
+  1161003677U,	// <3,0,3,u>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2289254400U,	// <3,0,4,0>: Cost 3 vmrglw <1,2,3,4>, <0,0,0,0>
+  1678557522U,	// <3,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2631214761U,	// <3,0,4,2>: Cost 3 vsldoi4 <2,3,0,4>, <2,3,0,4>
+  2235580672U,	// <3,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  2756944237U,	// <3,0,4,4>: Cost 3 vsldoi12 LHS, <0,4,4,5>
+  1618136374U,	// <3,0,4,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  3309322742U,	// <3,0,4,6>: Cost 4 vmrghw <3,4,5,6>, <0,6,1,7>
+  3362998904U,	// <3,0,4,7>: Cost 4 vmrglw <1,2,3,4>, <3,6,0,7>
+  1683202449U,	// <3,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  3765620296U,	// <3,0,5,0>: Cost 4 vsldoi8 <1,2,3,0>, <5,0,1,2>
+  2752299427U,	// <3,0,5,1>: Cost 3 vsldoi12 LHS, <0,5,1,5>
+  3789508346U,	// <3,0,5,2>: Cost 4 vsldoi8 <5,2,3,0>, <5,2,3,0>
+  3403486842U,	// <3,0,5,3>: Cost 4 vmrglw <u,0,3,5>, <7,u,0,3>
+  3765620660U,	// <3,0,5,4>: Cost 4 vsldoi8 <1,2,3,0>, <5,4,5,6>
+  2733682692U,	// <3,0,5,5>: Cost 3 vsldoi8 <u,2,3,0>, <5,5,5,5>
+  2800075218U,	// <3,0,5,6>: Cost 3 vsldoi12 LHS, <0,5,6,7>
+  3873817044U,	// <3,0,5,7>: Cost 4 vsldoi12 LHS, <0,5,7,0>
+  2800075234U,	// <3,0,5,u>: Cost 3 vsldoi12 LHS, <0,5,u,5>
+  2752299501U,	// <3,0,6,0>: Cost 3 vsldoi12 LHS, <0,6,0,7>
+  2236547174U,	// <3,0,6,1>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2733683194U,	// <3,0,6,2>: Cost 3 vsldoi8 <u,2,3,0>, <6,2,7,3>
+  3844473352U,	// <3,0,6,3>: Cost 4 vsldoi12 <3,2,0,3>, <0,6,3,7>
+  3310289234U,	// <3,0,6,4>: Cost 4 vmrghw <3,6,0,7>, <0,4,1,5>
+  3873817114U,	// <3,0,6,5>: Cost 4 vsldoi12 LHS, <0,6,5,7>
+  2733683512U,	// <3,0,6,6>: Cost 3 vsldoi8 <u,2,3,0>, <6,6,6,6>
+  2725057384U,	// <3,0,6,7>: Cost 3 vsldoi8 <6,7,3,0>, <6,7,3,0>
+  2236547741U,	// <3,0,6,u>: Cost 3 vmrghw <3,6,0,7>, LHS
+  2297905152U,	// <3,0,7,0>: Cost 3 vmrglw <2,6,3,7>, <0,0,0,0>
+  2297906854U,	// <3,0,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,1>
+  2727711916U,	// <3,0,7,2>: Cost 3 vsldoi8 <7,2,3,0>, <7,2,3,0>
+  3371649328U,	// <3,0,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,0,3>
+  2733684070U,	// <3,0,7,4>: Cost 3 vsldoi8 <u,2,3,0>, <7,4,5,6>
+  3734843490U,	// <3,0,7,5>: Cost 4 vsldoi4 <7,3,0,7>, <5,6,7,0>
+  3798799895U,	// <3,0,7,6>: Cost 4 vsldoi8 <6,7,3,0>, <7,6,7,3>
+  2733684332U,	// <3,0,7,7>: Cost 3 vsldoi8 <u,2,3,0>, <7,7,7,7>
+  2297906861U,	// <3,0,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,0,u>
+  1557504102U,	// <3,0,u,0>: Cost 2 vsldoi4 <2,3,0,u>, LHS
+  1678557842U,	// <3,0,u,1>: Cost 2 vsldoi12 LHS, <0,u,1,1>
+  604816029U,	// <3,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2691880892U,	// <3,0,u,3>: Cost 3 vsldoi8 <1,2,3,0>, <u,3,0,1>
+  1557507382U,	// <3,0,u,4>: Cost 2 vsldoi4 <2,3,0,u>, RHS
+  1618139290U,	// <3,0,u,5>: Cost 2 vsldoi8 <1,2,3,0>, RHS
+  2691881168U,	// <3,0,u,6>: Cost 3 vsldoi8 <1,2,3,0>, <u,6,3,7>
+  2661111018U,	// <3,0,u,7>: Cost 3 vsldoi4 <7,3,0,u>, <7,3,0,u>
+  604816083U,	// <3,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2619310332U,	// <3,1,0,0>: Cost 3 vsldoi4 <0,3,1,0>, <0,3,1,0>
+  2756944612U,	// <3,1,0,1>: Cost 3 vsldoi12 LHS, <1,0,1,2>
+  2289221724U,	// <3,1,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,1,2>
+  2619312278U,	// <3,1,0,3>: Cost 3 vsldoi4 <0,3,1,0>, <3,0,1,2>
+  2619313462U,	// <3,1,0,4>: Cost 3 vsldoi4 <0,3,1,0>, RHS
+  2289221970U,	// <3,1,0,5>: Cost 3 vmrglw <1,2,3,0>, <0,4,1,5>
+  2232599768U,	// <3,1,0,6>: Cost 3 vmrghw <3,0,1,2>, <1,6,2,7>
+  3362964687U,	// <3,1,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,1,7>
+  2619316014U,	// <3,1,0,u>: Cost 3 vsldoi4 <0,3,1,0>, LHS
+  2756944683U,	// <3,1,1,0>: Cost 3 vsldoi12 LHS, <1,1,0,1>
+  1678558004U,	// <3,1,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2691883927U,	// <3,1,1,2>: Cost 3 vsldoi8 <1,2,3,1>, <1,2,3,1>
+  3826631496U,	// <3,1,1,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,1,3,3>
+  2756944723U,	// <3,1,1,4>: Cost 3 vsldoi12 LHS, <1,1,4,5>
+  2756944732U,	// <3,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  3830686561U,	// <3,1,1,6>: Cost 4 vsldoi12 LHS, <1,1,6,1>
+  3734869228U,	// <3,1,1,7>: Cost 4 vsldoi4 <7,3,1,1>, <7,3,1,1>
+  1678558004U,	// <3,1,1,u>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2696529358U,	// <3,1,2,0>: Cost 3 vsldoi8 <2,0,3,1>, <2,0,3,1>
+  2756944775U,	// <3,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  2294548630U,	// <3,1,2,2>: Cost 3 vmrglw <2,1,3,2>, <3,0,1,2>
+  1678558102U,	// <3,1,2,3>: Cost 2 vsldoi12 LHS, <1,2,3,0>
+  2631273782U,	// <3,1,2,4>: Cost 3 vsldoi4 <2,3,1,2>, RHS
+  2756944811U,	// <3,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  3830686644U,	// <3,1,2,6>: Cost 4 vsldoi12 LHS, <1,2,6,3>
+  2800075706U,	// <3,1,2,7>: Cost 3 vsldoi12 LHS, <1,2,7,0>
+  1679000515U,	// <3,1,2,u>: Cost 2 vsldoi12 LHS, <1,2,u,0>
+  2619334911U,	// <3,1,3,0>: Cost 3 vsldoi4 <0,3,1,3>, <0,3,1,3>
+  2295218186U,	// <3,1,3,1>: Cost 3 vmrglw <2,2,3,3>, <0,0,1,1>
+  2293229718U,	// <3,1,3,2>: Cost 3 vmrglw <1,u,3,3>, <3,0,1,2>
+  2619337116U,	// <3,1,3,3>: Cost 3 vsldoi4 <0,3,1,3>, <3,3,3,3>
+  2619338038U,	// <3,1,3,4>: Cost 3 vsldoi4 <0,3,1,3>, RHS
+  2295218514U,	// <3,1,3,5>: Cost 3 vmrglw <2,2,3,3>, <0,4,1,5>
+  3830686729U,	// <3,1,3,6>: Cost 4 vsldoi12 LHS, <1,3,6,7>
+  3368961231U,	// <3,1,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,1,7>
+  2619340590U,	// <3,1,3,u>: Cost 3 vsldoi4 <0,3,1,3>, LHS
+  2619343104U,	// <3,1,4,0>: Cost 3 vsldoi4 <0,3,1,4>, <0,3,1,4>
+  2289254410U,	// <3,1,4,1>: Cost 3 vmrglw <1,2,3,4>, <0,0,1,1>
+  2289256598U,	// <3,1,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,1,2>
+  2619345410U,	// <3,1,4,3>: Cost 3 vsldoi4 <0,3,1,4>, <3,4,5,6>
+  2619346230U,	// <3,1,4,4>: Cost 3 vsldoi4 <0,3,1,4>, RHS
+  2756944976U,	// <3,1,4,5>: Cost 3 vsldoi12 LHS, <1,4,5,6>
+  3362996401U,	// <3,1,4,6>: Cost 4 vmrglw <1,2,3,4>, <0,2,1,6>
+  3362997455U,	// <3,1,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,1,7>
+  2619348782U,	// <3,1,4,u>: Cost 3 vsldoi4 <0,3,1,4>, LHS
+  2756945007U,	// <3,1,5,0>: Cost 3 vsldoi12 LHS, <1,5,0,1>
+  3830686840U,	// <3,1,5,1>: Cost 4 vsldoi12 LHS, <1,5,1,1>
+  3358361750U,	// <3,1,5,2>: Cost 4 vmrglw <0,4,3,5>, <3,0,1,2>
+  3830686857U,	// <3,1,5,3>: Cost 4 vsldoi12 LHS, <1,5,3,0>
+  2756945047U,	// <3,1,5,4>: Cost 3 vsldoi12 LHS, <1,5,4,5>
+  2294571346U,	// <3,1,5,5>: Cost 3 vmrglw <2,1,3,5>, <0,4,1,5>
+  3806105698U,	// <3,1,5,6>: Cost 4 vsldoi8 <u,0,3,1>, <5,6,7,0>
+  3873817774U,	// <3,1,5,7>: Cost 4 vsldoi12 LHS, <1,5,7,1>
+  2756945079U,	// <3,1,5,u>: Cost 3 vsldoi12 LHS, <1,5,u,1>
+  3830686912U,	// <3,1,6,0>: Cost 4 vsldoi12 LHS, <1,6,0,1>
+  2756945103U,	// <3,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2236547990U,	// <3,1,6,2>: Cost 3 vmrghw <3,6,0,7>, <1,2,3,0>
+  3826631905U,	// <3,1,6,3>: Cost 4 vsldoi12 <0,2,1,3>, <1,6,3,7>
+  3830686952U,	// <3,1,6,4>: Cost 4 vsldoi12 LHS, <1,6,4,5>
+  2756945139U,	// <3,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  3830686972U,	// <3,1,6,6>: Cost 4 vsldoi12 LHS, <1,6,6,7>
+  2800076030U,	// <3,1,6,7>: Cost 3 vsldoi12 LHS, <1,6,7,0>
+  2756945166U,	// <3,1,6,u>: Cost 3 vsldoi12 LHS, <1,6,u,7>
+  3699081318U,	// <3,1,7,0>: Cost 4 vsldoi4 <1,3,1,7>, LHS
+  2297905162U,	// <3,1,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,1>
+  2297907350U,	// <3,1,7,2>: Cost 3 vmrglw <2,6,3,7>, <3,0,1,2>
+  3365675182U,	// <3,1,7,3>: Cost 4 vmrglw <1,6,3,7>, <0,2,1,3>
+  3699084598U,	// <3,1,7,4>: Cost 4 vsldoi4 <1,3,1,7>, RHS
+  2297905490U,	// <3,1,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,1,5>
+  2297905329U,	// <3,1,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  3368330447U,	// <3,1,7,7>: Cost 4 vmrglw <2,1,3,7>, <1,6,1,7>
+  2297905169U,	// <3,1,7,u>: Cost 3 vmrglw <2,6,3,7>, <0,0,1,u>
+  2619375876U,	// <3,1,u,0>: Cost 3 vsldoi4 <0,3,1,u>, <0,3,1,u>
+  1678558004U,	// <3,1,u,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  2289289366U,	// <3,1,u,2>: Cost 3 vmrglw <1,2,3,u>, <3,0,1,2>
+  1679000956U,	// <3,1,u,3>: Cost 2 vsldoi12 LHS, <1,u,3,0>
+  2619378998U,	// <3,1,u,4>: Cost 3 vsldoi4 <0,3,1,u>, RHS
+  2756945297U,	// <3,1,u,5>: Cost 3 vsldoi12 LHS, <1,u,5,3>
+  2297905329U,	// <3,1,u,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,1,6>
+  2800076192U,	// <3,1,u,7>: Cost 3 vsldoi12 LHS, <1,u,7,0>
+  1683203497U,	// <3,1,u,u>: Cost 2 vsldoi12 LHS, <1,u,u,0>
+  3362964203U,	// <3,2,0,0>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,0>
+  2289222380U,	// <3,2,0,1>: Cost 3 vmrglw <1,2,3,0>, <1,0,2,1>
+  2289222462U,	// <3,2,0,2>: Cost 3 vmrglw <1,2,3,0>, <1,1,2,2>
+  1215479910U,	// <3,2,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3362964207U,	// <3,2,0,4>: Cost 4 vmrglw <1,2,3,0>, <1,0,2,4>
+  2289222708U,	// <3,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2232600506U,	// <3,2,0,6>: Cost 3 vmrghw <3,0,1,2>, <2,6,3,7>
+  3396142296U,	// <3,2,0,7>: Cost 4 vmrglw <6,7,3,0>, <1,6,2,7>
+  1215479915U,	// <3,2,0,u>: Cost 2 vmrglw <1,2,3,0>, LHS
+  3699105894U,	// <3,2,1,0>: Cost 4 vsldoi4 <1,3,2,1>, LHS
+  3765633844U,	// <3,2,1,1>: Cost 4 vsldoi8 <1,2,3,2>, <1,1,1,1>
+  2691892120U,	// <3,2,1,2>: Cost 3 vsldoi8 <1,2,3,2>, <1,2,3,2>
+  2752300575U,	// <3,2,1,3>: Cost 3 vsldoi12 LHS, <2,1,3,1>
+  3699109174U,	// <3,2,1,4>: Cost 4 vsldoi4 <1,3,2,1>, RHS
+  3830687280U,	// <3,2,1,5>: Cost 5 vsldoi12 LHS, <2,1,5,0>
+  3830687289U,	// <3,2,1,6>: Cost 4 vsldoi12 LHS, <2,1,6,0>
+  3874260548U,	// <3,2,1,7>: Cost 4 vsldoi12 LHS, <2,1,7,2>
+  2752742988U,	// <3,2,1,u>: Cost 3 vsldoi12 LHS, <2,1,u,1>
+  2631344230U,	// <3,2,2,0>: Cost 3 vsldoi4 <2,3,2,2>, LHS
+  2697201184U,	// <3,2,2,1>: Cost 3 vsldoi8 <2,1,3,2>, <2,1,3,2>
+  1678558824U,	// <3,2,2,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678558834U,	// <3,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  2631347510U,	// <3,2,2,4>: Cost 3 vsldoi4 <2,3,2,2>, RHS
+  3368953613U,	// <3,2,2,5>: Cost 4 vmrglw <2,2,3,2>, <2,4,2,5>
+  2234304442U,	// <3,2,2,6>: Cost 3 vmrghw <3,2,6,3>, <2,6,3,7>
+  3368953777U,	// <3,2,2,7>: Cost 4 vmrglw <2,2,3,2>, <2,6,2,7>
+  1679001247U,	// <3,2,2,u>: Cost 2 vsldoi12 LHS, <2,2,u,3>
+  1678558886U,	// <3,2,3,0>: Cost 2 vsldoi12 LHS, <2,3,0,1>
+  2752300719U,	// <3,2,3,1>: Cost 3 vsldoi12 LHS, <2,3,1,1>
+  2752300729U,	// <3,2,3,2>: Cost 3 vsldoi12 LHS, <2,3,2,2>
+  1221476454U,	// <3,2,3,3>: Cost 2 vmrglw <2,2,3,3>, LHS
+  1678558926U,	// <3,2,3,4>: Cost 2 vsldoi12 LHS, <2,3,4,5>
+  2800076503U,	// <3,2,3,5>: Cost 3 vsldoi12 LHS, <2,3,5,5>
+  2234746810U,	// <3,2,3,6>: Cost 3 vmrghw <3,3,3,3>, <2,6,3,7>
+  2800076516U,	// <3,2,3,7>: Cost 3 vsldoi12 LHS, <2,3,7,0>
+  1678558958U,	// <3,2,3,u>: Cost 2 vsldoi12 LHS, <2,3,u,1>
+  3699130470U,	// <3,2,4,0>: Cost 4 vsldoi4 <1,3,2,4>, LHS
+  3362996972U,	// <3,2,4,1>: Cost 4 vmrglw <1,2,3,4>, <1,0,2,1>
+  2289256040U,	// <3,2,4,2>: Cost 3 vmrglw <1,2,3,4>, <2,2,2,2>
+  1215512678U,	// <3,2,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3362998676U,	// <3,2,4,4>: Cost 4 vmrglw <1,2,3,4>, <3,3,2,4>
+  2691894582U,	// <3,2,4,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2235582394U,	// <3,2,4,6>: Cost 3 vmrghw <3,4,5,6>, <2,6,3,7>
+  3734967544U,	// <3,2,4,7>: Cost 4 vsldoi4 <7,3,2,4>, <7,3,2,4>
+  1215512683U,	// <3,2,4,u>: Cost 2 vmrglw <1,2,3,4>, LHS
+  3705110630U,	// <3,2,5,0>: Cost 4 vsldoi4 <2,3,2,5>, LHS
+  3368313985U,	// <3,2,5,1>: Cost 4 vmrglw <2,1,3,5>, <1,5,2,1>
+  3368314472U,	// <3,2,5,2>: Cost 4 vmrglw <2,1,3,5>, <2,2,2,2>
+  2756945768U,	// <3,2,5,3>: Cost 3 vsldoi12 LHS, <2,5,3,6>
+  3705113910U,	// <3,2,5,4>: Cost 4 vsldoi4 <2,3,2,5>, RHS
+  3310061416U,	// <3,2,5,5>: Cost 4 vmrghw <3,5,6,6>, <2,5,3,6>
+  3310135226U,	// <3,2,5,6>: Cost 4 vmrghw <3,5,7,6>, <2,6,3,7>
+  3370305457U,	// <3,2,5,7>: Cost 5 vmrglw <2,4,3,5>, <2,6,2,7>
+  2752743317U,	// <3,2,5,u>: Cost 3 vsldoi12 LHS, <2,5,u,6>
+  2631376998U,	// <3,2,6,0>: Cost 3 vsldoi4 <2,3,2,6>, LHS
+  3705119540U,	// <3,2,6,1>: Cost 4 vsldoi4 <2,3,2,6>, <1,1,1,1>
+  2631378621U,	// <3,2,6,2>: Cost 3 vsldoi4 <2,3,2,6>, <2,3,2,6>
+  1678559162U,	// <3,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2631380278U,	// <3,2,6,4>: Cost 3 vsldoi4 <2,3,2,6>, RHS
+  3370976956U,	// <3,2,6,5>: Cost 4 vmrglw <2,5,3,6>, <2,3,2,5>
+  2237065146U,	// <3,2,6,6>: Cost 3 vmrghw <3,6,7,7>, <2,6,3,7>
+  3798815594U,	// <3,2,6,7>: Cost 4 vsldoi8 <6,7,3,2>, <6,7,3,2>
+  1679001575U,	// <3,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  2800076778U,	// <3,2,7,0>: Cost 3 vsldoi12 LHS, <2,7,0,1>
+  3371647724U,	// <3,2,7,1>: Cost 4 vmrglw <2,6,3,7>, <1,0,2,1>
+  2297906792U,	// <3,2,7,2>: Cost 3 vmrglw <2,6,3,7>, <2,2,2,2>
+  1224163430U,	// <3,2,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  3705130294U,	// <3,2,7,4>: Cost 4 vsldoi4 <2,3,2,7>, RHS
+  3371648052U,	// <3,2,7,5>: Cost 4 vmrglw <2,6,3,7>, <1,4,2,5>
+  2297906877U,	// <3,2,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,3,2,6>
+  3371648702U,	// <3,2,7,7>: Cost 4 vmrglw <2,6,3,7>, <2,3,2,7>
+  1224163435U,	// <3,2,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1679001659U,	// <3,2,u,0>: Cost 2 vsldoi12 LHS, <2,u,0,1>
+  2752743492U,	// <3,2,u,1>: Cost 3 vsldoi12 LHS, <2,u,1,1>
+  1678558824U,	// <3,2,u,2>: Cost 2 vsldoi12 LHS, <2,2,2,2>
+  1678559320U,	// <3,2,u,3>: Cost 2 vsldoi12 LHS, <2,u,3,3>
+  1679001699U,	// <3,2,u,4>: Cost 2 vsldoi12 LHS, <2,u,4,5>
+  2691897498U,	// <3,2,u,5>: Cost 3 vsldoi8 <1,2,3,2>, RHS
+  2237908922U,	// <3,2,u,6>: Cost 3 vmrghw <3,u,1,2>, <2,6,3,7>
+  2800519289U,	// <3,2,u,7>: Cost 3 vsldoi12 LHS, <2,u,7,0>
+  1679001731U,	// <3,2,u,u>: Cost 2 vsldoi12 LHS, <2,u,u,1>
+  1215480726U,	// <3,3,0,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1678559382U,	// <3,3,0,1>: Cost 2 vsldoi12 LHS, <3,0,1,2>
+  2631403200U,	// <3,3,0,2>: Cost 3 vsldoi4 <2,3,3,0>, <2,3,3,0>
+  2289223282U,	// <3,3,0,3>: Cost 3 vmrglw <1,2,3,0>, <2,2,3,3>
+  2752301232U,	// <3,3,0,4>: Cost 3 vsldoi12 LHS, <3,0,4,1>
+  3362965027U,	// <3,3,0,5>: Cost 4 vmrglw <1,2,3,0>, <2,1,3,5>
+  3362965352U,	// <3,3,0,6>: Cost 4 vmrglw <1,2,3,0>, <2,5,3,6>
+  2289223610U,	// <3,3,0,7>: Cost 3 vmrglw <1,2,3,0>, <2,6,3,7>
+  1678559445U,	// <3,3,0,u>: Cost 2 vsldoi12 LHS, <3,0,u,2>
+  3830687964U,	// <3,3,1,0>: Cost 4 vsldoi12 LHS, <3,1,0,0>
+  2752301286U,	// <3,3,1,1>: Cost 3 vsldoi12 LHS, <3,1,1,1>
+  2752301297U,	// <3,3,1,2>: Cost 3 vsldoi12 LHS, <3,1,2,3>
+  2305157532U,	// <3,3,1,3>: Cost 3 vmrglw <3,u,3,1>, <3,3,3,3>
+  3830688000U,	// <3,3,1,4>: Cost 4 vsldoi12 LHS, <3,1,4,0>
+  3830688009U,	// <3,3,1,5>: Cost 4 vsldoi12 LHS, <3,1,5,0>
+  3830688019U,	// <3,3,1,6>: Cost 4 vsldoi12 LHS, <3,1,6,1>
+  3362973626U,	// <3,3,1,7>: Cost 4 vmrglw <1,2,3,1>, <2,6,3,7>
+  2752743719U,	// <3,3,1,u>: Cost 3 vsldoi12 LHS, <3,1,u,3>
+  2631417958U,	// <3,3,2,0>: Cost 3 vsldoi4 <2,3,3,2>, LHS
+  3826043193U,	// <3,3,2,1>: Cost 4 vsldoi12 LHS, <3,2,1,3>
+  1624131186U,	// <3,3,2,2>: Cost 2 vsldoi8 <2,2,3,3>, <2,2,3,3>
+  2752301384U,	// <3,3,2,3>: Cost 3 vsldoi12 LHS, <3,2,3,0>
+  2631421238U,	// <3,3,2,4>: Cost 3 vsldoi4 <2,3,3,2>, RHS
+  3826485602U,	// <3,3,2,5>: Cost 4 vsldoi12 LHS, <3,2,5,u>
+  2752301414U,	// <3,3,2,6>: Cost 3 vsldoi12 LHS, <3,2,6,3>
+  2771249519U,	// <3,3,2,7>: Cost 3 vsldoi12 <3,2,7,3>, <3,2,7,3>
+  1628112984U,	// <3,3,2,u>: Cost 2 vsldoi8 <2,u,3,3>, <2,u,3,3>
+  1563656294U,	// <3,3,3,0>: Cost 2 vsldoi4 <3,3,3,3>, LHS
+  2301855911U,	// <3,3,3,1>: Cost 3 vmrglw <3,3,3,3>, <3,0,3,1>
+  2697873730U,	// <3,3,3,2>: Cost 3 vsldoi8 <2,2,3,3>, <3,2,2,3>
+  403488870U,	// <3,3,3,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,3,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  2301856239U,	// <3,3,3,5>: Cost 3 vmrglw <3,3,3,3>, <3,4,3,5>
+  2697874067U,	// <3,3,3,6>: Cost 3 vsldoi8 <2,2,3,3>, <3,6,3,7>
+  2295220154U,	// <3,3,3,7>: Cost 3 vmrglw <2,2,3,3>, <2,6,3,7>
+  403488870U,	// <3,3,3,u>: Cost 1 vspltisw3 LHS
+  2289255318U,	// <3,3,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,3,0>
+  2631435162U,	// <3,3,4,1>: Cost 3 vsldoi4 <2,3,3,4>, <1,2,3,4>
+  2631435972U,	// <3,3,4,2>: Cost 3 vsldoi4 <2,3,3,4>, <2,3,3,4>
+  2289256050U,	// <3,3,4,3>: Cost 3 vmrglw <1,2,3,4>, <2,2,3,3>
+  1215513498U,	// <3,3,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679002114U,	// <3,3,4,5>: Cost 2 vsldoi12 LHS, <3,4,5,6>
+  3362998120U,	// <3,3,4,6>: Cost 4 vmrglw <1,2,3,4>, <2,5,3,6>
+  2289256378U,	// <3,3,4,7>: Cost 3 vmrglw <1,2,3,4>, <2,6,3,7>
+  1679002141U,	// <3,3,4,u>: Cost 2 vsldoi12 LHS, <3,4,u,6>
+  3831130657U,	// <3,3,5,0>: Cost 4 vsldoi12 LHS, <3,5,0,1>
+  3376277671U,	// <3,3,5,1>: Cost 4 vmrglw <3,4,3,5>, <3,0,3,1>
+  3771617012U,	// <3,3,5,2>: Cost 4 vsldoi8 <2,2,3,3>, <5,2,2,3>
+  2302536092U,	// <3,3,5,3>: Cost 3 vmrglw <3,4,3,5>, <3,3,3,3>
+  3831130697U,	// <3,3,5,4>: Cost 4 vsldoi12 LHS, <3,5,4,5>
+  2294572579U,	// <3,3,5,5>: Cost 3 vmrglw <2,1,3,5>, <2,1,3,5>
+  2800519773U,	// <3,3,5,6>: Cost 3 vsldoi12 LHS, <3,5,6,7>
+  3368314810U,	// <3,3,5,7>: Cost 4 vmrglw <2,1,3,5>, <2,6,3,7>
+  2800519791U,	// <3,3,5,u>: Cost 3 vsldoi12 LHS, <3,5,u,7>
+  2800077432U,	// <3,3,6,0>: Cost 3 vsldoi12 LHS, <3,6,0,7>
+  3310291185U,	// <3,3,6,1>: Cost 4 vmrghw <3,6,0,7>, <3,1,2,3>
+  2789165706U,	// <3,3,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <3,6,2,7>
+  2764982931U,	// <3,3,6,3>: Cost 3 vsldoi12 <2,2,3,3>, <3,6,3,7>
+  2800077468U,	// <3,3,6,4>: Cost 3 vsldoi12 LHS, <3,6,4,7>
+  3873819301U,	// <3,3,6,5>: Cost 4 vsldoi12 LHS, <3,6,5,7>
+  2297235304U,	// <3,3,6,6>: Cost 3 vmrglw <2,5,3,6>, <2,5,3,6>
+  2725081963U,	// <3,3,6,7>: Cost 3 vsldoi8 <6,7,3,3>, <6,7,3,3>
+  2725745596U,	// <3,3,6,u>: Cost 3 vsldoi8 <6,u,3,3>, <6,u,3,3>
+  2631458918U,	// <3,3,7,0>: Cost 3 vsldoi4 <2,3,3,7>, LHS
+  3705201460U,	// <3,3,7,1>: Cost 4 vsldoi4 <2,3,3,7>, <1,1,1,1>
+  2631460551U,	// <3,3,7,2>: Cost 3 vsldoi4 <2,3,3,7>, <2,3,3,7>
+  2297906802U,	// <3,3,7,3>: Cost 3 vmrglw <2,6,3,7>, <2,2,3,3>
+  2631462198U,	// <3,3,7,4>: Cost 3 vsldoi4 <2,3,3,7>, RHS
+  3371648547U,	// <3,3,7,5>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,5>
+  3371648548U,	// <3,3,7,6>: Cost 4 vmrglw <2,6,3,7>, <2,1,3,6>
+  1224165306U,	// <3,3,7,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1224165306U,	// <3,3,7,u>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  1215480726U,	// <3,3,u,0>: Cost 2 vmrglw <1,2,3,0>, <1,2,3,0>
+  1679002398U,	// <3,3,u,1>: Cost 2 vsldoi12 LHS, <3,u,1,2>
+  1659967368U,	// <3,3,u,2>: Cost 2 vsldoi8 <u,2,3,3>, <u,2,3,3>
+  403488870U,	// <3,3,u,3>: Cost 1 vspltisw3 LHS
+  1563659574U,	// <3,3,u,4>: Cost 2 vsldoi4 <3,3,3,3>, RHS
+  1679002438U,	// <3,3,u,5>: Cost 2 vsldoi12 LHS, <3,u,5,6>
+  2756946764U,	// <3,3,u,6>: Cost 3 vsldoi12 LHS, <3,u,6,3>
+  1224165306U,	// <3,3,u,7>: Cost 2 vmrglw <2,6,3,7>, <2,6,3,7>
+  403488870U,	// <3,3,u,u>: Cost 1 vspltisw3 LHS
+  2691907584U,	// <3,4,0,0>: Cost 3 vsldoi8 <1,2,3,4>, <0,0,0,0>
+  1618165862U,	// <3,4,0,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631476937U,	// <3,4,0,2>: Cost 3 vsldoi4 <2,3,4,0>, <2,3,4,0>
+  2232601732U,	// <3,4,0,3>: Cost 3 vmrghw <3,0,1,2>, <4,3,5,0>
+  2691907922U,	// <3,4,0,4>: Cost 3 vsldoi8 <1,2,3,4>, <0,4,1,5>
+  1158860086U,	// <3,4,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  3306343806U,	// <3,4,0,6>: Cost 4 vmrghw <3,0,1,2>, <4,6,5,7>
+  3366947484U,	// <3,4,0,7>: Cost 4 vmrglw <1,u,3,0>, <3,6,4,7>
+  1618166429U,	// <3,4,0,u>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  2631483494U,	// <3,4,1,0>: Cost 3 vsldoi4 <2,3,4,1>, LHS
+  2691908404U,	// <3,4,1,1>: Cost 3 vsldoi8 <1,2,3,4>, <1,1,1,1>
+  1618166682U,	// <3,4,1,2>: Cost 2 vsldoi8 <1,2,3,4>, <1,2,3,4>
+  3765650393U,	// <3,4,1,3>: Cost 4 vsldoi8 <1,2,3,4>, <1,3,1,4>
+  2631486774U,	// <3,4,1,4>: Cost 3 vsldoi4 <2,3,4,1>, RHS
+  2756946914U,	// <3,4,1,5>: Cost 3 vsldoi12 LHS, <4,1,5,0>
+  3765650639U,	// <3,4,1,6>: Cost 4 vsldoi8 <1,2,3,4>, <1,6,1,7>
+  3735090439U,	// <3,4,1,7>: Cost 4 vsldoi4 <7,3,4,1>, <7,3,4,1>
+  1622148480U,	// <3,4,1,u>: Cost 2 vsldoi8 <1,u,3,4>, <1,u,3,4>
+  3765650893U,	// <3,4,2,0>: Cost 4 vsldoi8 <1,2,3,4>, <2,0,3,0>
+  3831131154U,	// <3,4,2,1>: Cost 4 vsldoi12 LHS, <4,2,1,3>
+  2691909224U,	// <3,4,2,2>: Cost 3 vsldoi8 <1,2,3,4>, <2,2,2,2>
+  2691909286U,	// <3,4,2,3>: Cost 3 vsldoi8 <1,2,3,4>, <2,3,0,1>
+  2699208469U,	// <3,4,2,4>: Cost 3 vsldoi8 <2,4,3,4>, <2,4,3,4>
+  2233863478U,	// <3,4,2,5>: Cost 3 vmrghw <3,2,0,3>, RHS
+  2691909562U,	// <3,4,2,6>: Cost 3 vsldoi8 <1,2,3,4>, <2,6,3,7>
+  2701199368U,	// <3,4,2,7>: Cost 3 vsldoi8 <2,7,3,4>, <2,7,3,4>
+  2691909691U,	// <3,4,2,u>: Cost 3 vsldoi8 <1,2,3,4>, <2,u,0,1>
+  2691909782U,	// <3,4,3,0>: Cost 3 vsldoi8 <1,2,3,4>, <3,0,1,2>
+  3765651686U,	// <3,4,3,1>: Cost 4 vsldoi8 <1,2,3,4>, <3,1,1,1>
+  2691909972U,	// <3,4,3,2>: Cost 3 vsldoi8 <1,2,3,4>, <3,2,4,3>
+  2691910044U,	// <3,4,3,3>: Cost 3 vsldoi8 <1,2,3,4>, <3,3,3,3>
+  2691910096U,	// <3,4,3,4>: Cost 3 vsldoi8 <1,2,3,4>, <3,4,0,1>
+  1161006390U,	// <3,4,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691910300U,	// <3,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  3368962716U,	// <3,4,3,7>: Cost 4 vmrglw <2,2,3,3>, <3,6,4,7>
+  1161006633U,	// <3,4,3,u>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2631508070U,	// <3,4,4,0>: Cost 3 vsldoi4 <2,3,4,4>, LHS
+  2631508890U,	// <3,4,4,1>: Cost 3 vsldoi4 <2,3,4,4>, <1,2,3,4>
+  2631509709U,	// <3,4,4,2>: Cost 3 vsldoi4 <2,3,4,4>, <2,3,4,4>
+  2289256788U,	// <3,4,4,3>: Cost 3 vmrglw <1,2,3,4>, <3,2,4,3>
+  1726336208U,	// <3,4,4,4>: Cost 2 vsldoi12 LHS, <4,4,4,4>
+  1618169142U,	// <3,4,4,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  3362998858U,	// <3,4,4,6>: Cost 4 vmrglw <1,2,3,4>, <3,5,4,6>
+  2289257116U,	// <3,4,4,7>: Cost 3 vmrglw <1,2,3,4>, <3,6,4,7>
+  1618169385U,	// <3,4,4,u>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  1557774438U,	// <3,4,5,0>: Cost 2 vsldoi4 <2,3,4,5>, LHS
+  2631516980U,	// <3,4,5,1>: Cost 3 vsldoi4 <2,3,4,5>, <1,1,1,1>
+  1557776078U,	// <3,4,5,2>: Cost 2 vsldoi4 <2,3,4,5>, <2,3,4,5>
+  2631518358U,	// <3,4,5,3>: Cost 3 vsldoi4 <2,3,4,5>, <3,0,1,2>
+  1557777718U,	// <3,4,5,4>: Cost 2 vsldoi4 <2,3,4,5>, RHS
+  2296563406U,	// <3,4,5,5>: Cost 3 vmrglw <2,4,3,5>, <2,3,4,5>
+  604818742U,	// <3,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2661381387U,	// <3,4,5,7>: Cost 3 vsldoi4 <7,3,4,5>, <7,3,4,5>
+  604818760U,	// <3,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  3705266278U,	// <3,4,6,0>: Cost 4 vsldoi4 <2,3,4,6>, LHS
+  3831131482U,	// <3,4,6,1>: Cost 4 vsldoi12 LHS, <4,6,1,7>
+  2733715962U,	// <3,4,6,2>: Cost 3 vsldoi8 <u,2,3,4>, <6,2,7,3>
+  3844771180U,	// <3,4,6,3>: Cost 4 vsldoi12 <3,2,4,3>, <4,6,3,7>
+  2800078197U,	// <3,4,6,4>: Cost 3 vsldoi12 LHS, <4,6,4,7>
+  2236550454U,	// <3,4,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716280U,	// <3,4,6,6>: Cost 3 vsldoi8 <u,2,3,4>, <6,6,6,6>
+  2725090156U,	// <3,4,6,7>: Cost 3 vsldoi8 <6,7,3,4>, <6,7,3,4>
+  2236550697U,	// <3,4,6,u>: Cost 3 vmrghw <3,6,0,7>, RHS
+  2733716474U,	// <3,4,7,0>: Cost 3 vsldoi8 <u,2,3,4>, <7,0,1,2>
+  3371647013U,	// <3,4,7,1>: Cost 4 vmrglw <2,6,3,7>, <0,0,4,1>
+  2727744688U,	// <3,4,7,2>: Cost 3 vsldoi8 <7,2,3,4>, <7,2,3,4>
+  3371649364U,	// <3,4,7,3>: Cost 4 vmrglw <2,6,3,7>, <3,2,4,3>
+  2733716838U,	// <3,4,7,4>: Cost 3 vsldoi8 <u,2,3,4>, <7,4,5,6>
+  2297906894U,	// <3,4,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,5>
+  3371647180U,	// <3,4,7,6>: Cost 4 vmrglw <2,6,3,7>, <0,2,4,6>
+  2733717100U,	// <3,4,7,7>: Cost 3 vsldoi8 <u,2,3,4>, <7,7,7,7>
+  2297906897U,	// <3,4,7,u>: Cost 3 vmrglw <2,6,3,7>, <2,3,4,u>
+  1557799014U,	// <3,4,u,0>: Cost 2 vsldoi4 <2,3,4,u>, LHS
+  1618171694U,	// <3,4,u,1>: Cost 2 vsldoi8 <1,2,3,4>, LHS
+  1557800657U,	// <3,4,u,2>: Cost 2 vsldoi4 <2,3,4,u>, <2,3,4,u>
+  2691913660U,	// <3,4,u,3>: Cost 3 vsldoi8 <1,2,3,4>, <u,3,0,1>
+  1557802294U,	// <3,4,u,4>: Cost 2 vsldoi4 <2,3,4,u>, RHS
+  1618172058U,	// <3,4,u,5>: Cost 2 vsldoi8 <1,2,3,4>, RHS
+  604818985U,	// <3,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2661405966U,	// <3,4,u,7>: Cost 3 vsldoi4 <7,3,4,u>, <7,3,4,u>
+  604819003U,	// <3,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2643492966U,	// <3,5,0,0>: Cost 3 vsldoi4 <4,3,5,0>, LHS
+  2756947528U,	// <3,5,0,1>: Cost 3 vsldoi12 LHS, <5,0,1,2>
+  2331029019U,	// <3,5,0,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2643495062U,	// <3,5,0,3>: Cost 3 vsldoi4 <4,3,5,0>, <3,0,1,2>
+  2756947554U,	// <3,5,0,4>: Cost 3 vsldoi12 LHS, <5,0,4,1>
+  2800078443U,	// <3,5,0,5>: Cost 3 vsldoi12 LHS, <5,0,5,1>
+  2289224194U,	// <3,5,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,5,6>
+  3362964723U,	// <3,5,0,7>: Cost 4 vmrglw <1,2,3,0>, <1,6,5,7>
+  2756947590U,	// <3,5,0,u>: Cost 3 vsldoi12 LHS, <5,0,u,1>
+  2800078479U,	// <3,5,1,0>: Cost 3 vsldoi12 LHS, <5,1,0,1>
+  2333027218U,	// <3,5,1,1>: Cost 3 vmrglw <u,5,3,1>, <4,0,5,1>
+  2691916699U,	// <3,5,1,2>: Cost 3 vsldoi8 <1,2,3,5>, <1,2,3,5>
+  3832901294U,	// <3,5,1,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,1,3,5>
+  2800078519U,	// <3,5,1,4>: Cost 3 vsldoi12 LHS, <5,1,4,5>
+  3830689467U,	// <3,5,1,5>: Cost 4 vsldoi12 LHS, <5,1,5,0>
+  3830689481U,	// <3,5,1,6>: Cost 4 vsldoi12 LHS, <5,1,6,5>
+  3873820365U,	// <3,5,1,7>: Cost 4 vsldoi12 LHS, <5,1,7,0>
+  2800078551U,	// <3,5,1,u>: Cost 3 vsldoi12 LHS, <5,1,u,1>
+  3770967487U,	// <3,5,2,0>: Cost 4 vsldoi8 <2,1,3,5>, <2,0,1,4>
+  2697225763U,	// <3,5,2,1>: Cost 3 vsldoi8 <2,1,3,5>, <2,1,3,5>
+  3830689523U,	// <3,5,2,2>: Cost 4 vsldoi12 LHS, <5,2,2,2>
+  2699216590U,	// <3,5,2,3>: Cost 3 vsldoi8 <2,4,3,5>, <2,3,4,5>
+  2699216662U,	// <3,5,2,4>: Cost 3 vsldoi8 <2,4,3,5>, <2,4,3,5>
+  2783047439U,	// <3,5,2,5>: Cost 3 vsldoi12 <5,2,5,3>, <5,2,5,3>
+  2783121176U,	// <3,5,2,6>: Cost 3 vsldoi12 <5,2,6,3>, <5,2,6,3>
+  3856936737U,	// <3,5,2,7>: Cost 4 vsldoi12 <5,2,7,3>, <5,2,7,3>
+  2701871194U,	// <3,5,2,u>: Cost 3 vsldoi8 <2,u,3,5>, <2,u,3,5>
+  2643517542U,	// <3,5,3,0>: Cost 3 vsldoi4 <4,3,5,3>, LHS
+  2331052946U,	// <3,5,3,1>: Cost 3 vmrglw <u,2,3,3>, <4,0,5,1>
+  3699345010U,	// <3,5,3,2>: Cost 4 vsldoi4 <1,3,5,3>, <2,2,3,3>
+  2705189276U,	// <3,5,3,3>: Cost 3 vsldoi8 <3,4,3,5>, <3,3,3,3>
+  2705189359U,	// <3,5,3,4>: Cost 3 vsldoi8 <3,4,3,5>, <3,4,3,5>
+  2331053274U,	// <3,5,3,5>: Cost 3 vmrglw <u,2,3,3>, <4,4,5,5>
+  2295220738U,	// <3,5,3,6>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,6>
+  3368961267U,	// <3,5,3,7>: Cost 4 vmrglw <2,2,3,3>, <1,6,5,7>
+  2295220740U,	// <3,5,3,u>: Cost 3 vmrglw <2,2,3,3>, <3,4,5,u>
+  2643525734U,	// <3,5,4,0>: Cost 3 vsldoi4 <4,3,5,4>, LHS
+  2331061138U,	// <3,5,4,1>: Cost 3 vmrglw <u,2,3,4>, <4,0,5,1>
+  2235584280U,	// <3,5,4,2>: Cost 3 vmrghw <3,4,5,6>, <5,2,6,3>
+  2643528194U,	// <3,5,4,3>: Cost 3 vsldoi4 <4,3,5,4>, <3,4,5,6>
+  2735713498U,	// <3,5,4,4>: Cost 3 vsldoi8 <u,5,3,5>, <4,4,5,5>
+  2756947892U,	// <3,5,4,5>: Cost 3 vsldoi12 LHS, <5,4,5,6>
+  2289256962U,	// <3,5,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,5,6>
+  3362997491U,	// <3,5,4,7>: Cost 4 vmrglw <1,2,3,4>, <1,6,5,7>
+  2756947919U,	// <3,5,4,u>: Cost 3 vsldoi12 LHS, <5,4,u,6>
+  2800078803U,	// <3,5,5,0>: Cost 3 vsldoi12 LHS, <5,5,0,1>
+  2800078812U,	// <3,5,5,1>: Cost 3 vsldoi12 LHS, <5,5,1,1>
+  2631591639U,	// <3,5,5,2>: Cost 3 vsldoi4 <2,3,5,5>, <2,3,5,5>
+  3832901616U,	// <3,5,5,3>: Cost 4 vsldoi12 <1,2,5,3>, <5,5,3,3>
+  2800078843U,	// <3,5,5,4>: Cost 3 vsldoi12 LHS, <5,5,4,5>
+  1726337028U,	// <3,5,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078862U,	// <3,5,5,6>: Cost 3 vsldoi12 LHS, <5,5,6,6>
+  3368314099U,	// <3,5,5,7>: Cost 4 vmrglw <2,1,3,5>, <1,6,5,7>
+  1726337028U,	// <3,5,5,u>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2800078884U,	// <3,5,6,0>: Cost 3 vsldoi12 LHS, <5,6,0,1>
+  2800078899U,	// <3,5,6,1>: Cost 3 vsldoi12 LHS, <5,6,1,7>
+  2631599832U,	// <3,5,6,2>: Cost 3 vsldoi4 <2,3,5,6>, <2,3,5,6>
+  2800078914U,	// <3,5,6,3>: Cost 3 vsldoi12 LHS, <5,6,3,4>
+  2800078924U,	// <3,5,6,4>: Cost 3 vsldoi12 LHS, <5,6,4,5>
+  2800078935U,	// <3,5,6,5>: Cost 3 vsldoi12 LHS, <5,6,5,7>
+  2297235970U,	// <3,5,6,6>: Cost 3 vmrglw <2,5,3,6>, <3,4,5,6>
+  1726337122U,	// <3,5,6,7>: Cost 2 vsldoi12 LHS, <5,6,7,0>
+  1726337131U,	// <3,5,6,u>: Cost 2 vsldoi12 LHS, <5,6,u,0>
+  3699376230U,	// <3,5,7,0>: Cost 4 vsldoi4 <1,3,5,7>, LHS
+  2333739922U,	// <3,5,7,1>: Cost 3 vmrglw <u,6,3,7>, <4,0,5,1>
+  3699378106U,	// <3,5,7,2>: Cost 4 vsldoi4 <1,3,5,7>, <2,6,3,7>
+  3371647915U,	// <3,5,7,3>: Cost 4 vmrglw <2,6,3,7>, <1,2,5,3>
+  3699379510U,	// <3,5,7,4>: Cost 4 vsldoi4 <1,3,5,7>, RHS
+  2333740250U,	// <3,5,7,5>: Cost 3 vmrglw <u,6,3,7>, <4,4,5,5>
+  2297907714U,	// <3,5,7,6>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,6>
+  3370984691U,	// <3,5,7,7>: Cost 4 vmrglw <2,5,3,7>, <1,6,5,7>
+  2297907716U,	// <3,5,7,u>: Cost 3 vmrglw <2,6,3,7>, <3,4,5,u>
+  2800079046U,	// <3,5,u,0>: Cost 3 vsldoi12 LHS, <5,u,0,1>
+  2756948176U,	// <3,5,u,1>: Cost 3 vsldoi12 LHS, <5,u,1,2>
+  2331029019U,	// <3,5,u,2>: Cost 3 vmrglw <u,2,3,0>, <4,u,5,2>
+  2800079076U,	// <3,5,u,3>: Cost 3 vsldoi12 LHS, <5,u,3,4>
+  2800079085U,	// <3,5,u,4>: Cost 3 vsldoi12 LHS, <5,u,4,4>
+  1726337028U,	// <3,5,u,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  2289289730U,	// <3,5,u,6>: Cost 3 vmrglw <1,2,3,u>, <3,4,5,6>
+  1726337284U,	// <3,5,u,7>: Cost 2 vsldoi12 LHS, <5,u,7,0>
+  1726337293U,	// <3,5,u,u>: Cost 2 vsldoi12 LHS, <5,u,u,0>
+  3773628416U,	// <3,6,0,0>: Cost 4 vsldoi8 <2,5,3,6>, <0,0,0,0>
+  2699886694U,	// <3,6,0,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789167401U,	// <3,6,0,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,0,2,1>
+  3362965862U,	// <3,6,0,3>: Cost 4 vmrglw <1,2,3,0>, <3,2,6,3>
+  3773628754U,	// <3,6,0,4>: Cost 4 vsldoi8 <2,5,3,6>, <0,4,1,5>
+  3723284326U,	// <3,6,0,5>: Cost 4 vsldoi4 <5,3,6,0>, <5,3,6,0>
+  2800079181U,	// <3,6,0,6>: Cost 3 vsldoi12 LHS, <6,0,6,1>
+  1215483190U,	// <3,6,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1215483191U,	// <3,6,0,u>: Cost 2 vmrglw <1,2,3,0>, RHS
+  3873821032U,	// <3,6,1,0>: Cost 4 vsldoi12 LHS, <6,1,0,1>
+  3773629236U,	// <3,6,1,1>: Cost 4 vsldoi8 <2,5,3,6>, <1,1,1,1>
+  2691924892U,	// <3,6,1,2>: Cost 3 vsldoi8 <1,2,3,6>, <1,2,3,6>
+  3830690184U,	// <3,6,1,3>: Cost 5 vsldoi12 LHS, <6,1,3,6>
+  3873821072U,	// <3,6,1,4>: Cost 4 vsldoi12 LHS, <6,1,4,5>
+  3873821082U,	// <3,6,1,5>: Cost 4 vsldoi12 LHS, <6,1,5,6>
+  3403453240U,	// <3,6,1,6>: Cost 4 vmrglw <u,0,3,1>, <6,6,6,6>
+  2289233206U,	// <3,6,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2289233207U,	// <3,6,1,u>: Cost 3 vmrglw <1,2,3,1>, RHS
+  2661498982U,	// <3,6,2,0>: Cost 3 vsldoi4 <7,3,6,2>, LHS
+  3770975780U,	// <3,6,2,1>: Cost 4 vsldoi8 <2,1,3,6>, <2,1,3,6>
+  2631640797U,	// <3,6,2,2>: Cost 3 vsldoi4 <2,3,6,2>, <2,3,6,2>
+  3771639485U,	// <3,6,2,3>: Cost 4 vsldoi8 <2,2,3,6>, <2,3,2,6>
+  2661502262U,	// <3,6,2,4>: Cost 3 vsldoi4 <7,3,6,2>, RHS
+  2699888488U,	// <3,6,2,5>: Cost 3 vsldoi8 <2,5,3,6>, <2,5,3,6>
+  2661503482U,	// <3,6,2,6>: Cost 3 vsldoi4 <7,3,6,2>, <6,2,7,3>
+  1715425786U,	// <3,6,2,7>: Cost 2 vsldoi12 <6,2,7,3>, <6,2,7,3>
+  1715499523U,	// <3,6,2,u>: Cost 2 vsldoi12 <6,2,u,3>, <6,2,u,3>
+  3773630614U,	// <3,6,3,0>: Cost 4 vsldoi8 <2,5,3,6>, <3,0,1,2>
+  3372942825U,	// <3,6,3,1>: Cost 4 vmrglw <2,u,3,3>, <2,0,6,1>
+  2234749434U,	// <3,6,3,2>: Cost 3 vmrghw <3,3,3,3>, <6,2,7,3>
+  3368962406U,	// <3,6,3,3>: Cost 4 vmrglw <2,2,3,3>, <3,2,6,3>
+  2699889154U,	// <3,6,3,4>: Cost 3 vsldoi8 <2,5,3,6>, <3,4,5,6>
+  3773631068U,	// <3,6,3,5>: Cost 4 vsldoi8 <2,5,3,6>, <3,5,6,6>
+  2331054904U,	// <3,6,3,6>: Cost 3 vmrglw <u,2,3,3>, <6,6,6,6>
+  1221479734U,	// <3,6,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  1221479735U,	// <3,6,3,u>: Cost 2 vmrglw <2,2,3,3>, RHS
+  2235584801U,	// <3,6,4,0>: Cost 3 vmrghw <3,4,5,6>, <6,0,1,2>
+  3717342106U,	// <3,6,4,1>: Cost 4 vsldoi4 <4,3,6,4>, <1,2,3,4>
+  2789167729U,	// <3,6,4,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,4,2,5>
+  2235585074U,	// <3,6,4,3>: Cost 3 vmrghw <3,4,5,6>, <6,3,4,5>
+  2235585165U,	// <3,6,4,4>: Cost 3 vmrghw <3,4,5,6>, <6,4,5,6>
+  2699889974U,	// <3,6,4,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  2800079509U,	// <3,6,4,6>: Cost 3 vsldoi12 LHS, <6,4,6,5>
+  1215515958U,	// <3,6,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1215515959U,	// <3,6,4,u>: Cost 2 vmrglw <1,2,3,4>, RHS
+  3873821356U,	// <3,6,5,0>: Cost 4 vsldoi12 LHS, <6,5,0,1>
+  3372959209U,	// <3,6,5,1>: Cost 5 vmrglw <2,u,3,5>, <2,0,6,1>
+  3862909629U,	// <3,6,5,2>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,2,0>
+  3773632358U,	// <3,6,5,3>: Cost 4 vsldoi8 <2,5,3,6>, <5,3,6,0>
+  3873821396U,	// <3,6,5,4>: Cost 4 vsldoi12 LHS, <6,5,4,5>
+  3873821405U,	// <3,6,5,5>: Cost 4 vsldoi12 LHS, <6,5,5,5>
+  3862909672U,	// <3,6,5,6>: Cost 4 vsldoi12 <6,2,7,3>, <6,5,6,7>
+  2294574390U,	// <3,6,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2294574391U,	// <3,6,5,u>: Cost 3 vmrglw <2,1,3,5>, RHS
+  2800079613U,	// <3,6,6,0>: Cost 3 vsldoi12 LHS, <6,6,0,1>
+  3873821446U,	// <3,6,6,1>: Cost 4 vsldoi12 LHS, <6,6,1,1>
+  2789167888U,	// <3,6,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,6,2,2>
+  3844920090U,	// <3,6,6,3>: Cost 4 vsldoi12 <3,2,6,3>, <6,6,3,3>
+  2800079653U,	// <3,6,6,4>: Cost 3 vsldoi12 LHS, <6,6,4,5>
+  3723333484U,	// <3,6,6,5>: Cost 4 vsldoi4 <5,3,6,6>, <5,3,6,6>
+  1726337848U,	// <3,6,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726337858U,	// <3,6,6,7>: Cost 2 vsldoi12 LHS, <6,6,7,7>
+  1726337867U,	// <3,6,6,u>: Cost 2 vsldoi12 LHS, <6,6,u,7>
+  1726337870U,	// <3,6,7,0>: Cost 2 vsldoi12 LHS, <6,7,0,1>
+  2297906665U,	// <3,6,7,1>: Cost 3 vmrglw <2,6,3,7>, <2,0,6,1>
+  2792117090U,	// <3,6,7,2>: Cost 3 vsldoi12 <6,7,2,3>, <6,7,2,3>
+  2297907558U,	// <3,6,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,6,3>
+  1726337910U,	// <3,6,7,4>: Cost 2 vsldoi12 LHS, <6,7,4,5>
+  2297906993U,	// <3,6,7,5>: Cost 3 vmrglw <2,6,3,7>, <2,4,6,5>
+  2297906832U,	// <3,6,7,6>: Cost 3 vmrglw <2,6,3,7>, <2,2,6,6>
+  1224166710U,	// <3,6,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224166711U,	// <3,6,7,u>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1726337951U,	// <3,6,u,0>: Cost 2 vsldoi12 LHS, <6,u,0,1>
+  2699892526U,	// <3,6,u,1>: Cost 3 vsldoi8 <2,5,3,6>, LHS
+  2789168049U,	// <3,6,u,2>: Cost 3 vsldoi12 <6,2,7,3>, <6,u,2,1>
+  2792854460U,	// <3,6,u,3>: Cost 3 vsldoi12 <6,u,3,3>, <6,u,3,3>
+  1726337991U,	// <3,6,u,4>: Cost 2 vsldoi12 LHS, <6,u,4,5>
+  2699892890U,	// <3,6,u,5>: Cost 3 vsldoi8 <2,5,3,6>, RHS
+  1726337848U,	// <3,6,u,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1215548726U,	// <3,6,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  1215548727U,	// <3,6,u,u>: Cost 2 vmrglw <1,2,3,u>, RHS
+  2700558336U,	// <3,7,0,0>: Cost 3 vsldoi8 <2,6,3,7>, <0,0,0,0>
+  1626816614U,	// <3,7,0,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700558513U,	// <3,7,0,2>: Cost 3 vsldoi8 <2,6,3,7>, <0,2,1,6>
+  2331030010U,	// <3,7,0,3>: Cost 3 vmrglw <u,2,3,0>, <6,2,7,3>
+  2700558674U,	// <3,7,0,4>: Cost 3 vsldoi8 <2,6,3,7>, <0,4,1,5>
+  2800079906U,	// <3,7,0,5>: Cost 3 vsldoi12 LHS, <7,0,5,6>
+  2655588936U,	// <3,7,0,6>: Cost 3 vsldoi4 <6,3,7,0>, <6,3,7,0>
+  2800079919U,	// <3,7,0,7>: Cost 3 vsldoi12 LHS, <7,0,7,1>
+  1626817181U,	// <3,7,0,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  3774300899U,	// <3,7,1,0>: Cost 4 vsldoi8 <2,6,3,7>, <1,0,1,1>
+  2700559156U,	// <3,7,1,1>: Cost 3 vsldoi8 <2,6,3,7>, <1,1,1,1>
+  2700559254U,	// <3,7,1,2>: Cost 3 vsldoi8 <2,6,3,7>, <1,2,3,0>
+  3774301148U,	// <3,7,1,3>: Cost 4 vsldoi8 <2,6,3,7>, <1,3,1,7>
+  3774301227U,	// <3,7,1,4>: Cost 4 vsldoi8 <2,6,3,7>, <1,4,1,5>
+  3774301295U,	// <3,7,1,5>: Cost 4 vsldoi8 <2,6,3,7>, <1,5,0,1>
+  3768329441U,	// <3,7,1,6>: Cost 4 vsldoi8 <1,6,3,7>, <1,6,3,7>
+  3403453250U,	// <3,7,1,7>: Cost 4 vmrglw <u,0,3,1>, <6,6,7,7>
+  2700559740U,	// <3,7,1,u>: Cost 3 vsldoi8 <2,6,3,7>, <1,u,3,0>
+  2700559849U,	// <3,7,2,0>: Cost 3 vsldoi8 <2,6,3,7>, <2,0,6,1>
+  3770983973U,	// <3,7,2,1>: Cost 4 vsldoi8 <2,1,3,7>, <2,1,3,7>
+  2700559976U,	// <3,7,2,2>: Cost 3 vsldoi8 <2,6,3,7>, <2,2,2,2>
+  2698569415U,	// <3,7,2,3>: Cost 3 vsldoi8 <2,3,3,7>, <2,3,3,7>
+  2700560177U,	// <3,7,2,4>: Cost 3 vsldoi8 <2,6,3,7>, <2,4,6,5>
+  3773638505U,	// <3,7,2,5>: Cost 4 vsldoi8 <2,5,3,7>, <2,5,3,7>
+  1626818490U,	// <3,7,2,6>: Cost 2 vsldoi8 <2,6,3,7>, <2,6,3,7>
+  2795140307U,	// <3,7,2,7>: Cost 3 vsldoi12 <7,2,7,3>, <7,2,7,3>
+  1628145756U,	// <3,7,2,u>: Cost 2 vsldoi8 <2,u,3,7>, <2,u,3,7>
+  2700560534U,	// <3,7,3,0>: Cost 3 vsldoi8 <2,6,3,7>, <3,0,1,2>
+  3774302438U,	// <3,7,3,1>: Cost 4 vsldoi8 <2,6,3,7>, <3,1,1,1>
+  2700560742U,	// <3,7,3,2>: Cost 3 vsldoi8 <2,6,3,7>, <3,2,6,3>
+  2700560796U,	// <3,7,3,3>: Cost 3 vsldoi8 <2,6,3,7>, <3,3,3,3>
+  2700560898U,	// <3,7,3,4>: Cost 3 vsldoi8 <2,6,3,7>, <3,4,5,6>
+  3774302821U,	// <3,7,3,5>: Cost 4 vsldoi8 <2,6,3,7>, <3,5,7,6>
+  2700561079U,	// <3,7,3,6>: Cost 3 vsldoi8 <2,6,3,7>, <3,6,7,7>
+  2700561091U,	// <3,7,3,7>: Cost 3 vsldoi8 <2,6,3,7>, <3,7,0,1>
+  2700561182U,	// <3,7,3,u>: Cost 3 vsldoi8 <2,6,3,7>, <3,u,1,2>
+  2655617126U,	// <3,7,4,0>: Cost 3 vsldoi4 <6,3,7,4>, LHS
+  3774303178U,	// <3,7,4,1>: Cost 4 vsldoi8 <2,6,3,7>, <4,1,2,3>
+  2655619002U,	// <3,7,4,2>: Cost 3 vsldoi4 <6,3,7,4>, <2,6,3,7>
+  2331062778U,	// <3,7,4,3>: Cost 3 vmrglw <u,2,3,4>, <6,2,7,3>
+  2655620406U,	// <3,7,4,4>: Cost 3 vsldoi4 <6,3,7,4>, RHS
+  1626819894U,	// <3,7,4,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  2655621708U,	// <3,7,4,6>: Cost 3 vsldoi4 <6,3,7,4>, <6,3,7,4>
+  2800080247U,	// <3,7,4,7>: Cost 3 vsldoi12 LHS, <7,4,7,5>
+  1626820137U,	// <3,7,4,u>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  3774303816U,	// <3,7,5,0>: Cost 4 vsldoi8 <2,6,3,7>, <5,0,1,2>
+  3873822093U,	// <3,7,5,1>: Cost 4 vsldoi12 LHS, <7,5,1,0>
+  3774303998U,	// <3,7,5,2>: Cost 4 vsldoi8 <2,6,3,7>, <5,2,3,4>
+  3862910368U,	// <3,7,5,3>: Cost 4 vsldoi12 <6,2,7,3>, <7,5,3,1>
+  3774304180U,	// <3,7,5,4>: Cost 4 vsldoi8 <2,6,3,7>, <5,4,5,6>
+  2800080310U,	// <3,7,5,5>: Cost 3 vsldoi12 LHS, <7,5,5,5>
+  2800080321U,	// <3,7,5,6>: Cost 3 vsldoi12 LHS, <7,5,6,7>
+  3873822147U,	// <3,7,5,7>: Cost 4 vsldoi12 LHS, <7,5,7,0>
+  2800080339U,	// <3,7,5,u>: Cost 3 vsldoi12 LHS, <7,5,u,7>
+  2800080348U,	// <3,7,6,0>: Cost 3 vsldoi12 LHS, <7,6,0,7>
+  3873822181U,	// <3,7,6,1>: Cost 4 vsldoi12 LHS, <7,6,1,7>
+  2789168622U,	// <3,7,6,2>: Cost 3 vsldoi12 <6,2,7,3>, <7,6,2,7>
+  2700563016U,	// <3,7,6,3>: Cost 3 vsldoi8 <2,6,3,7>, <6,3,7,0>
+  2800080384U,	// <3,7,6,4>: Cost 3 vsldoi12 LHS, <7,6,4,7>
+  3862910472U,	// <3,7,6,5>: Cost 4 vsldoi12 <6,2,7,3>, <7,6,5,6>
+  2700563256U,	// <3,7,6,6>: Cost 3 vsldoi8 <2,6,3,7>, <6,6,6,6>
+  2800080404U,	// <3,7,6,7>: Cost 3 vsldoi12 LHS, <7,6,7,0>
+  2793149988U,	// <3,7,6,u>: Cost 3 vsldoi12 <6,u,7,3>, <7,6,u,7>
+  2637725798U,	// <3,7,7,0>: Cost 3 vsldoi4 <3,3,7,7>, LHS
+  3371649227U,	// <3,7,7,1>: Cost 4 vmrglw <2,6,3,7>, <3,0,7,1>
+  2637727674U,	// <3,7,7,2>: Cost 3 vsldoi4 <3,3,7,7>, <2,6,3,7>
+  2297907567U,	// <3,7,7,3>: Cost 3 vmrglw <2,6,3,7>, <3,2,7,3>
+  2637729078U,	// <3,7,7,4>: Cost 3 vsldoi4 <3,3,7,7>, RHS
+  3371649312U,	// <3,7,7,5>: Cost 4 vmrglw <2,6,3,7>, <3,1,7,5>
+  2655646287U,	// <3,7,7,6>: Cost 3 vsldoi4 <6,3,7,7>, <6,3,7,7>
+  1726338668U,	// <3,7,7,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1726338668U,	// <3,7,7,u>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  2700564179U,	// <3,7,u,0>: Cost 3 vsldoi8 <2,6,3,7>, <u,0,1,2>
+  1626822446U,	// <3,7,u,1>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  2700564357U,	// <3,7,u,2>: Cost 3 vsldoi8 <2,6,3,7>, <u,2,3,0>
+  2700564412U,	// <3,7,u,3>: Cost 3 vsldoi8 <2,6,3,7>, <u,3,0,1>
+  2700564543U,	// <3,7,u,4>: Cost 3 vsldoi8 <2,6,3,7>, <u,4,5,6>
+  1626822810U,	// <3,7,u,5>: Cost 2 vsldoi8 <2,6,3,7>, RHS
+  1662654672U,	// <3,7,u,6>: Cost 2 vsldoi8 <u,6,3,7>, <u,6,3,7>
+  1726338668U,	// <3,7,u,7>: Cost 2 vsldoi12 LHS, <7,7,7,7>
+  1626823013U,	// <3,7,u,u>: Cost 2 vsldoi8 <2,6,3,7>, LHS
+  1678557184U,	// <3,u,0,0>: Cost 2 vsldoi12 LHS, <0,0,0,0>
+  1679005395U,	// <3,u,0,1>: Cost 2 vsldoi12 LHS, <u,0,1,2>
+  2289221787U,	// <3,u,0,2>: Cost 3 vmrglw <1,2,3,0>, <0,1,u,2>
+  1215479964U,	// <3,u,0,3>: Cost 2 vmrglw <1,2,3,0>, LHS
+  2752747245U,	// <3,u,0,4>: Cost 3 vsldoi12 LHS, <u,0,4,1>
+  1158863002U,	// <3,u,0,5>: Cost 2 vmrghw <3,0,1,2>, RHS
+  2289224221U,	// <3,u,0,6>: Cost 3 vmrglw <1,2,3,0>, <3,4,u,6>
+  1215483208U,	// <3,u,0,7>: Cost 2 vmrglw <1,2,3,0>, RHS
+  1679005458U,	// <3,u,0,u>: Cost 2 vsldoi12 LHS, <u,0,u,2>
+  1558036582U,	// <3,u,1,0>: Cost 2 vsldoi4 <2,3,u,1>, LHS
+  1678558004U,	// <3,u,1,1>: Cost 2 vsldoi12 LHS, <1,1,1,1>
+  604821294U,	// <3,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2752747317U,	// <3,u,1,3>: Cost 3 vsldoi12 LHS, <u,1,3,1>
+  1558039862U,	// <3,u,1,4>: Cost 2 vsldoi4 <2,3,u,1>, RHS
+  2756949830U,	// <3,u,1,5>: Cost 3 vsldoi12 LHS, <u,1,5,0>
+  2800080726U,	// <3,u,1,6>: Cost 3 vsldoi12 LHS, <u,1,6,7>
+  2289233224U,	// <3,u,1,7>: Cost 3 vmrglw <1,2,3,1>, RHS
+  604821348U,	// <3,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696586709U,	// <3,u,2,0>: Cost 3 vsldoi8 <2,0,3,u>, <2,0,3,u>
+  2757392246U,	// <3,u,2,1>: Cost 3 vsldoi12 LHS, <u,2,1,3>
+  1624172151U,	// <3,u,2,2>: Cost 2 vsldoi8 <2,2,3,u>, <2,2,3,u>
+  1679005576U,	// <3,u,2,3>: Cost 2 vsldoi12 LHS, <u,2,3,3>
+  2631789878U,	// <3,u,2,4>: Cost 3 vsldoi4 <2,3,u,2>, RHS
+  2699904874U,	// <3,u,2,5>: Cost 3 vsldoi8 <2,5,3,u>, <2,5,3,u>
+  1626826683U,	// <3,u,2,6>: Cost 2 vsldoi8 <2,6,3,u>, <2,6,3,u>
+  1726338988U,	// <3,u,2,7>: Cost 2 vsldoi12 LHS, <u,2,7,3>
+  1683208117U,	// <3,u,2,u>: Cost 2 vsldoi12 LHS, <u,2,u,3>
+  1679005628U,	// <3,u,3,0>: Cost 2 vsldoi12 LHS, <u,3,0,1>
+  1161008942U,	// <3,u,3,1>: Cost 2 vmrghw <3,3,3,3>, LHS
+  2752747471U,	// <3,u,3,2>: Cost 3 vsldoi12 LHS, <u,3,2,2>
+  403488870U,	// <3,u,3,3>: Cost 1 vspltisw3 LHS
+  1679005668U,	// <3,u,3,4>: Cost 2 vsldoi12 LHS, <u,3,4,5>
+  1161009306U,	// <3,u,3,5>: Cost 2 vmrghw <3,3,3,3>, RHS
+  2691943104U,	// <3,u,3,6>: Cost 3 vsldoi8 <1,2,3,u>, <3,6,u,7>
+  1221479752U,	// <3,u,3,7>: Cost 2 vmrglw <2,2,3,3>, RHS
+  403488870U,	// <3,u,3,u>: Cost 1 vspltisw3 LHS
+  2289255363U,	// <3,u,4,0>: Cost 3 vmrglw <1,2,3,4>, <1,2,u,0>
+  1161844526U,	// <3,u,4,1>: Cost 2 vmrghw <3,4,5,6>, LHS
+  2289256661U,	// <3,u,4,2>: Cost 3 vmrglw <1,2,3,4>, <3,0,u,2>
+  1215512732U,	// <3,u,4,3>: Cost 2 vmrglw <1,2,3,4>, LHS
+  1215513498U,	// <3,u,4,4>: Cost 2 vmrglw <1,2,3,4>, <1,2,3,4>
+  1679005759U,	// <3,u,4,5>: Cost 2 vsldoi12 LHS, <u,4,5,6>
+  2289256989U,	// <3,u,4,6>: Cost 3 vmrglw <1,2,3,4>, <3,4,u,6>
+  1215515976U,	// <3,u,4,7>: Cost 2 vmrglw <1,2,3,4>, RHS
+  1679005786U,	// <3,u,4,u>: Cost 2 vsldoi12 LHS, <u,4,u,6>
+  1558069350U,	// <3,u,5,0>: Cost 2 vsldoi4 <2,3,u,5>, LHS
+  2631811892U,	// <3,u,5,1>: Cost 3 vsldoi4 <2,3,u,5>, <1,1,1,1>
+  1558071026U,	// <3,u,5,2>: Cost 2 vsldoi4 <2,3,u,5>, <2,3,u,5>
+  2752747646U,	// <3,u,5,3>: Cost 3 vsldoi12 LHS, <u,5,3,6>
+  1558072630U,	// <3,u,5,4>: Cost 2 vsldoi4 <2,3,u,5>, RHS
+  1726337028U,	// <3,u,5,5>: Cost 2 vsldoi12 LHS, <5,5,5,5>
+  604821658U,	// <3,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2294574408U,	// <3,u,5,7>: Cost 3 vmrglw <2,1,3,5>, RHS
+  604821676U,	// <3,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2631819366U,	// <3,u,6,0>: Cost 3 vsldoi4 <2,3,u,6>, LHS
+  2757392574U,	// <3,u,6,1>: Cost 3 vsldoi12 LHS, <u,6,1,7>
+  2631821043U,	// <3,u,6,2>: Cost 3 vsldoi4 <2,3,u,6>, <2,3,u,6>
+  1679005904U,	// <3,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  2631822646U,	// <3,u,6,4>: Cost 3 vsldoi4 <2,3,u,6>, RHS
+  2236553370U,	// <3,u,6,5>: Cost 3 vmrghw <3,6,0,7>, RHS
+  1726337848U,	// <3,u,6,6>: Cost 2 vsldoi12 LHS, <6,6,6,6>
+  1726339309U,	// <3,u,6,7>: Cost 2 vsldoi12 LHS, <u,6,7,0>
+  1683208445U,	// <3,u,6,u>: Cost 2 vsldoi12 LHS, <u,6,u,7>
+  1726339328U,	// <3,u,7,0>: Cost 2 vsldoi12 LHS, <u,7,0,1>
+  2297905225U,	// <3,u,7,1>: Cost 3 vmrglw <2,6,3,7>, <0,0,u,1>
+  2631829236U,	// <3,u,7,2>: Cost 3 vsldoi4 <2,3,u,7>, <2,3,u,7>
+  1224163484U,	// <3,u,7,3>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1726339368U,	// <3,u,7,4>: Cost 2 vsldoi12 LHS, <u,7,4,5>
+  2297905553U,	// <3,u,7,5>: Cost 3 vmrglw <2,6,3,7>, <0,4,u,5>
+  2297905392U,	// <3,u,7,6>: Cost 3 vmrglw <2,6,3,7>, <0,2,u,6>
+  1224166728U,	// <3,u,7,7>: Cost 2 vmrglw <2,6,3,7>, RHS
+  1224163489U,	// <3,u,7,u>: Cost 2 vmrglw <2,6,3,7>, LHS
+  1683208529U,	// <3,u,u,0>: Cost 2 vsldoi12 LHS, <u,u,0,1>
+  1679006043U,	// <3,u,u,1>: Cost 2 vsldoi12 LHS, <u,u,1,2>
+  604821861U,	// <3,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  403488870U,	// <3,u,u,3>: Cost 1 vspltisw3 LHS
+  1683208569U,	// <3,u,u,4>: Cost 2 vsldoi12 LHS, <u,u,4,5>
+  1679006083U,	// <3,u,u,5>: Cost 2 vsldoi12 LHS, <u,u,5,6>
+  604821901U,	// <3,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  1215548744U,	// <3,u,u,7>: Cost 2 vmrglw <1,2,3,u>, RHS
+  604821915U,	// <3,u,u,u>: Cost 1 vsldoi12 LHS, LHS
+  2759016448U,	// <4,0,0,0>: Cost 3 vsldoi12 <1,2,3,4>, <0,0,0,0>
+  1165115494U,	// <4,0,0,1>: Cost 2 vmrghw <4,0,5,1>, LHS
+  3717531337U,	// <4,0,0,2>: Cost 4 vsldoi4 <4,4,0,0>, <2,3,4,0>
+  3369675785U,	// <4,0,0,3>: Cost 4 vmrglw <2,3,4,0>, <4,2,0,3>
+  2751791144U,	// <4,0,0,4>: Cost 3 vsldoi12 <0,0,4,4>, <0,0,4,4>
+  2238857630U,	// <4,0,0,5>: Cost 3 vmrghw <4,0,5,1>, <0,5,1,0>
+  3312591341U,	// <4,0,0,6>: Cost 4 vmrghw <4,0,5,0>, <0,6,0,7>
+  3369676113U,	// <4,0,0,7>: Cost 4 vmrglw <2,3,4,0>, <4,6,0,7>
+  1165116061U,	// <4,0,0,u>: Cost 2 vmrghw <4,0,5,1>, LHS
+  2637824102U,	// <4,0,1,0>: Cost 3 vsldoi4 <3,4,0,1>, LHS
+  2637824922U,	// <4,0,1,1>: Cost 3 vsldoi4 <3,4,0,1>, <1,2,3,4>
+  1685274726U,	// <4,0,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637826512U,	// <4,0,1,3>: Cost 3 vsldoi4 <3,4,0,1>, <3,4,0,1>
+  2637827382U,	// <4,0,1,4>: Cost 3 vsldoi4 <3,4,0,1>, RHS
+  2661716070U,	// <4,0,1,5>: Cost 3 vsldoi4 <7,4,0,1>, <5,6,7,4>
+  3729486427U,	// <4,0,1,6>: Cost 4 vsldoi4 <6,4,0,1>, <6,4,0,1>
+  2661717300U,	// <4,0,1,7>: Cost 3 vsldoi4 <7,4,0,1>, <7,4,0,1>
+  1685274780U,	// <4,0,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  3711574118U,	// <4,0,2,0>: Cost 4 vsldoi4 <3,4,0,2>, LHS
+  2240200806U,	// <4,0,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  3771663992U,	// <4,0,2,2>: Cost 4 vsldoi8 <2,2,4,0>, <2,2,4,0>
+  2698585801U,	// <4,0,2,3>: Cost 3 vsldoi8 <2,3,4,0>, <2,3,4,0>
+  3373672105U,	// <4,0,2,4>: Cost 4 vmrglw <3,0,4,2>, <2,3,0,4>
+  3810813795U,	// <4,0,2,5>: Cost 4 vsldoi8 <u,7,4,0>, <2,5,3,1>
+  3772327866U,	// <4,0,2,6>: Cost 4 vsldoi8 <2,3,4,0>, <2,6,3,7>
+  3386280568U,	// <4,0,2,7>: Cost 5 vmrglw <5,1,4,2>, <3,6,0,7>
+  2701903966U,	// <4,0,2,u>: Cost 3 vsldoi8 <2,u,4,0>, <2,u,4,0>
+  3699638374U,	// <4,0,3,0>: Cost 4 vsldoi4 <1,4,0,3>, LHS
+  2753560832U,	// <4,0,3,1>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  3772328276U,	// <4,0,3,2>: Cost 4 vsldoi8 <2,3,4,0>, <3,2,4,3>
+  3827302674U,	// <4,0,3,3>: Cost 4 vsldoi12 <0,3,1,4>, <0,3,3,4>
+  3699641654U,	// <4,0,3,4>: Cost 4 vsldoi4 <1,4,0,3>, RHS
+  3779627588U,	// <4,0,3,5>: Cost 4 vsldoi8 <3,5,4,0>, <3,5,4,0>
+  3772328604U,	// <4,0,3,6>: Cost 4 vsldoi8 <2,3,4,0>, <3,6,4,7>
+  3780954854U,	// <4,0,3,7>: Cost 4 vsldoi8 <3,7,4,0>, <3,7,4,0>
+  2753560832U,	// <4,0,3,u>: Cost 3 vsldoi12 <0,3,1,4>, <0,3,1,4>
+  2725129106U,	// <4,0,4,0>: Cost 3 vsldoi8 <6,7,4,0>, <4,0,5,1>
+  1167720550U,	// <4,0,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  3839172953U,	// <4,0,4,2>: Cost 4 vsldoi12 <2,3,0,4>, <0,4,2,3>
+  3772329051U,	// <4,0,4,3>: Cost 4 vsldoi8 <2,3,4,0>, <4,3,0,4>
+  2241462610U,	// <4,0,4,4>: Cost 3 vmrghw <4,4,4,4>, <0,4,1,5>
+  2698587446U,	// <4,0,4,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  3772329297U,	// <4,0,4,6>: Cost 4 vsldoi8 <2,3,4,0>, <4,6,0,7>
+  3735483703U,	// <4,0,4,7>: Cost 4 vsldoi4 <7,4,0,4>, <7,4,0,4>
+  1167721117U,	// <4,0,4,u>: Cost 2 vmrghw <4,4,4,4>, LHS
+  1168556032U,	// <4,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  94814310U,	// <4,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2242298029U,	// <4,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2637859284U,	// <4,0,5,3>: Cost 3 vsldoi4 <3,4,0,5>, <3,4,0,5>
+  1168556370U,	// <4,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2242306530U,	// <4,0,5,5>: Cost 3 vmrghw RHS, <0,5,u,5>
+  2242298358U,	// <4,0,5,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661750072U,	// <4,0,5,7>: Cost 3 vsldoi4 <7,4,0,5>, <7,4,0,5>
+  94814877U,	// <4,0,5,u>: Cost 1 vmrghw RHS, LHS
+  3316580362U,	// <4,0,6,0>: Cost 4 vmrghw <4,6,5,1>, <0,0,1,1>
+  2242846822U,	// <4,0,6,1>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3798872570U,	// <4,0,6,2>: Cost 4 vsldoi8 <6,7,4,0>, <6,2,7,3>
+  3796218413U,	// <4,0,6,3>: Cost 4 vsldoi8 <6,3,4,0>, <6,3,4,0>
+  3834528273U,	// <4,0,6,4>: Cost 4 vsldoi12 <1,5,0,4>, <0,6,4,7>
+  3798872811U,	// <4,0,6,5>: Cost 4 vsldoi8 <6,7,4,0>, <6,5,7,1>
+  3316621876U,	// <4,0,6,6>: Cost 4 vmrghw <4,6,5,6>, <0,6,u,6>
+  2725131121U,	// <4,0,6,7>: Cost 3 vsldoi8 <6,7,4,0>, <6,7,4,0>
+  2242847389U,	// <4,0,6,u>: Cost 3 vmrghw <4,6,5,2>, LHS
+  3377692672U,	// <4,0,7,0>: Cost 4 vmrglw <3,6,4,7>, <0,0,0,0>
+  2243493990U,	// <4,0,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  3775648970U,	// <4,0,7,2>: Cost 5 vsldoi8 <2,u,4,0>, <7,2,6,3>
+  3802191110U,	// <4,0,7,3>: Cost 4 vsldoi8 <7,3,4,0>, <7,3,4,0>
+  3317236050U,	// <4,0,7,4>: Cost 4 vmrghw <4,7,5,0>, <0,4,1,5>
+  3803518376U,	// <4,0,7,5>: Cost 4 vsldoi8 <7,5,4,0>, <7,5,4,0>
+  3317236214U,	// <4,0,7,6>: Cost 5 vmrghw <4,7,5,0>, <0,6,1,7>
+  3798873708U,	// <4,0,7,7>: Cost 4 vsldoi8 <6,7,4,0>, <7,7,7,7>
+  2243494557U,	// <4,0,7,u>: Cost 3 vmrghw <4,7,5,0>, LHS
+  1170546688U,	// <4,0,u,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  96804966U,	// <4,0,u,1>: Cost 1 vmrghw RHS, LHS
+  1685275293U,	// <4,0,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2637883863U,	// <4,0,u,3>: Cost 3 vsldoi4 <3,4,0,u>, <3,4,0,u>
+  1170547026U,	// <4,0,u,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2698590362U,	// <4,0,u,5>: Cost 3 vsldoi8 <2,3,4,0>, RHS
+  2244289014U,	// <4,0,u,6>: Cost 3 vmrghw RHS, <0,6,1,7>
+  2661774651U,	// <4,0,u,7>: Cost 3 vsldoi4 <7,4,0,u>, <7,4,0,u>
+  96805533U,	// <4,0,u,u>: Cost 1 vmrghw RHS, LHS
+  2667749478U,	// <4,1,0,0>: Cost 3 vsldoi4 <u,4,1,0>, LHS
+  2689966182U,	// <4,1,0,1>: Cost 3 vsldoi8 <0,u,4,1>, LHS
+  2238571418U,	// <4,1,0,2>: Cost 3 vmrghw <4,0,1,2>, <1,2,3,4>
+  3711633880U,	// <4,1,0,3>: Cost 4 vsldoi4 <3,4,1,0>, <3,4,1,0>
+  2689966418U,	// <4,1,0,4>: Cost 3 vsldoi8 <0,u,4,1>, <0,4,1,5>
+  3361046866U,	// <4,1,0,5>: Cost 4 vmrglw <0,u,4,0>, <0,4,1,5>
+  3741495802U,	// <4,1,0,6>: Cost 4 vsldoi4 <u,4,1,0>, <6,2,7,3>
+  3741496314U,	// <4,1,0,7>: Cost 4 vsldoi4 <u,4,1,0>, <7,0,1,2>
+  2689966765U,	// <4,1,0,u>: Cost 3 vsldoi8 <0,u,4,1>, <0,u,4,1>
+  3764372222U,	// <4,1,1,0>: Cost 4 vsldoi8 <1,0,4,1>, <1,0,4,1>
+  2758206263U,	// <4,1,1,1>: Cost 3 vsldoi12 <1,1,1,4>, <1,1,1,4>
+  2698593178U,	// <4,1,1,2>: Cost 3 vsldoi8 <2,3,4,1>, <1,2,3,4>
+  3361057810U,	// <4,1,1,3>: Cost 4 vmrglw <0,u,4,1>, <4,2,1,3>
+  3827303250U,	// <4,1,1,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,1,4,4>
+  2287313234U,	// <4,1,1,5>: Cost 3 vmrglw <0,u,4,1>, <0,4,1,5>
+  3763709171U,	// <4,1,1,6>: Cost 4 vsldoi8 <0,u,4,1>, <1,6,5,7>
+  3361058138U,	// <4,1,1,7>: Cost 4 vmrglw <0,u,4,1>, <4,6,1,7>
+  2239759744U,	// <4,1,1,u>: Cost 3 vmrghw <4,1,u,3>, <1,u,3,4>
+  2637906022U,	// <4,1,2,0>: Cost 3 vsldoi4 <3,4,1,2>, LHS
+  2637906842U,	// <4,1,2,1>: Cost 3 vsldoi4 <3,4,1,2>, <1,2,3,4>
+  3763709544U,	// <4,1,2,2>: Cost 4 vsldoi8 <0,u,4,1>, <2,2,2,2>
+  1685275546U,	// <4,1,2,3>: Cost 2 vsldoi12 <1,2,3,4>, <1,2,3,4>
+  2637909302U,	// <4,1,2,4>: Cost 3 vsldoi4 <3,4,1,2>, RHS
+  3361063250U,	// <4,1,2,5>: Cost 4 vmrglw <0,u,4,2>, <0,4,1,5>
+  3763709882U,	// <4,1,2,6>: Cost 4 vsldoi8 <0,u,4,1>, <2,6,3,7>
+  3735541054U,	// <4,1,2,7>: Cost 4 vsldoi4 <7,4,1,2>, <7,4,1,2>
+  1685644231U,	// <4,1,2,u>: Cost 2 vsldoi12 <1,2,u,4>, <1,2,u,4>
+  2702575792U,	// <4,1,3,0>: Cost 3 vsldoi8 <3,0,4,1>, <3,0,4,1>
+  3832759257U,	// <4,1,3,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,3,1,4>
+  3833349090U,	// <4,1,3,2>: Cost 4 vsldoi12 <1,3,2,4>, <1,3,2,4>
+  3763710364U,	// <4,1,3,3>: Cost 4 vsldoi8 <0,u,4,1>, <3,3,3,3>
+  2707884546U,	// <4,1,3,4>: Cost 3 vsldoi8 <3,u,4,1>, <3,4,5,6>
+  3361071442U,	// <4,1,3,5>: Cost 4 vmrglw <0,u,4,3>, <0,4,1,5>
+  3772336796U,	// <4,1,3,6>: Cost 4 vsldoi8 <2,3,4,1>, <3,6,4,7>
+  3775654595U,	// <4,1,3,7>: Cost 5 vsldoi8 <2,u,4,1>, <3,7,0,1>
+  2707884856U,	// <4,1,3,u>: Cost 3 vsldoi8 <3,u,4,1>, <3,u,4,1>
+  2667782246U,	// <4,1,4,0>: Cost 3 vsldoi4 <u,4,1,4>, LHS
+  2241463092U,	// <4,1,4,1>: Cost 3 vmrghw <4,4,4,4>, <1,1,1,1>
+  2241553306U,	// <4,1,4,2>: Cost 3 vmrghw <4,4,5,6>, <1,2,3,4>
+  3827303484U,	// <4,1,4,3>: Cost 4 vsldoi12 <0,3,1,4>, <1,4,3,4>
+  2667785424U,	// <4,1,4,4>: Cost 3 vsldoi4 <u,4,1,4>, <4,4,4,4>
+  2689969462U,	// <4,1,4,5>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  3763711322U,	// <4,1,4,6>: Cost 4 vsldoi8 <0,u,4,1>, <4,6,1,7>
+  3867116636U,	// <4,1,4,7>: Cost 4 vsldoi12 <7,0,1,4>, <1,4,7,0>
+  2689969705U,	// <4,1,4,u>: Cost 3 vsldoi8 <0,u,4,1>, RHS
+  1546273106U,	// <4,1,5,0>: Cost 2 vsldoi4 <0,4,1,5>, <0,4,1,5>
+  1168556852U,	// <4,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1168556950U,	// <4,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2620016790U,	// <4,1,5,3>: Cost 3 vsldoi4 <0,4,1,5>, <3,0,1,2>
+  1546276150U,	// <4,1,5,4>: Cost 2 vsldoi4 <0,4,1,5>, RHS
+  2620018692U,	// <4,1,5,5>: Cost 3 vsldoi4 <0,4,1,5>, <5,5,5,5>
+  2242299087U,	// <4,1,5,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667795450U,	// <4,1,5,7>: Cost 3 vsldoi4 <u,4,1,5>, <7,0,1,2>
+  1546278702U,	// <4,1,5,u>: Cost 2 vsldoi4 <0,4,1,5>, LHS
+  3781628193U,	// <4,1,6,0>: Cost 4 vsldoi8 <3,u,4,1>, <6,0,1,2>
+  3832759503U,	// <4,1,6,1>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,1,7>
+  3316261786U,	// <4,1,6,2>: Cost 4 vmrghw <4,6,0,7>, <1,2,3,4>
+  3781628466U,	// <4,1,6,3>: Cost 4 vsldoi8 <3,u,4,1>, <6,3,4,5>
+  3827303658U,	// <4,1,6,4>: Cost 4 vsldoi12 <0,3,1,4>, <1,6,4,7>
+  3361096018U,	// <4,1,6,5>: Cost 4 vmrglw <0,u,4,6>, <0,4,1,5>
+  3788264248U,	// <4,1,6,6>: Cost 4 vsldoi8 <5,0,4,1>, <6,6,6,6>
+  3788264270U,	// <4,1,6,7>: Cost 4 vsldoi8 <5,0,4,1>, <6,7,0,1>
+  3832759566U,	// <4,1,6,u>: Cost 4 vsldoi12 <1,2,3,4>, <1,6,u,7>
+  2726466580U,	// <4,1,7,0>: Cost 3 vsldoi8 <7,0,4,1>, <7,0,4,1>
+  3377692682U,	// <4,1,7,1>: Cost 4 vmrglw <3,6,4,7>, <0,0,1,1>
+  3377694870U,	// <4,1,7,2>: Cost 4 vmrglw <3,6,4,7>, <3,0,1,2>
+  3802199303U,	// <4,1,7,3>: Cost 4 vsldoi8 <7,3,4,1>, <7,3,4,1>
+  2731775334U,	// <4,1,7,4>: Cost 3 vsldoi8 <7,u,4,1>, <7,4,5,6>
+  3377693010U,	// <4,1,7,5>: Cost 4 vmrglw <3,6,4,7>, <0,4,1,5>
+  3365749804U,	// <4,1,7,6>: Cost 5 vmrglw <1,6,4,7>, <1,4,1,6>
+  3788265068U,	// <4,1,7,7>: Cost 4 vsldoi8 <5,0,4,1>, <7,7,7,7>
+  2731775644U,	// <4,1,7,u>: Cost 3 vsldoi8 <7,u,4,1>, <7,u,4,1>
+  1546297685U,	// <4,1,u,0>: Cost 2 vsldoi4 <0,4,1,u>, <0,4,1,u>
+  1170547508U,	// <4,1,u,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1170547606U,	// <4,1,u,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  1689257344U,	// <4,1,u,3>: Cost 2 vsldoi12 <1,u,3,4>, <1,u,3,4>
+  1546300726U,	// <4,1,u,4>: Cost 2 vsldoi4 <0,4,1,u>, RHS
+  2284716370U,	// <4,1,u,5>: Cost 3 vmrglw <0,4,4,u>, <0,4,1,5>
+  2244289743U,	// <4,1,u,6>: Cost 3 vmrghw RHS, <1,6,1,7>
+  2667820026U,	// <4,1,u,7>: Cost 3 vsldoi4 <u,4,1,u>, <7,0,1,2>
+  1546303278U,	// <4,1,u,u>: Cost 2 vsldoi4 <0,4,1,u>, LHS
+  3729621094U,	// <4,2,0,0>: Cost 4 vsldoi4 <6,4,2,0>, LHS
+  3763716198U,	// <4,2,0,1>: Cost 4 vsldoi8 <0,u,4,2>, LHS
+  2238858856U,	// <4,2,0,2>: Cost 3 vmrghw <4,0,5,1>, <2,2,2,2>
+  2295930982U,	// <4,2,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3763716434U,	// <4,2,0,4>: Cost 4 vsldoi8 <0,u,4,2>, <0,4,1,5>
+  2238859107U,	// <4,2,0,5>: Cost 3 vmrghw <4,0,5,1>, <2,5,3,1>
+  2238859194U,	// <4,2,0,6>: Cost 3 vmrghw <4,0,5,1>, <2,6,3,7>
+  3312601066U,	// <4,2,0,7>: Cost 4 vmrghw <4,0,5,1>, <2,7,0,1>
+  2295930987U,	// <4,2,0,u>: Cost 3 vmrglw <2,3,4,0>, LHS
+  3699769446U,	// <4,2,1,0>: Cost 4 vsldoi4 <1,4,2,1>, LHS
+  3313255971U,	// <4,2,1,1>: Cost 4 vmrghw <4,1,5,0>, <2,1,3,5>
+  3361056360U,	// <4,2,1,2>: Cost 4 vmrglw <0,u,4,1>, <2,2,2,2>
+  2287312998U,	// <4,2,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3788932148U,	// <4,2,1,4>: Cost 4 vsldoi8 <5,1,4,2>, <1,4,2,5>
+  3313256290U,	// <4,2,1,5>: Cost 4 vmrghw <4,1,5,0>, <2,5,3,0>
+  3838289469U,	// <4,2,1,6>: Cost 4 vsldoi12 <2,1,6,4>, <2,1,6,4>
+  3369682865U,	// <4,2,1,7>: Cost 5 vmrglw <2,3,4,1>, <2,6,2,7>
+  2287313003U,	// <4,2,1,u>: Cost 3 vmrglw <0,u,4,1>, LHS
+  3838658133U,	// <4,2,2,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,0,1>
+  3711722394U,	// <4,2,2,1>: Cost 4 vsldoi4 <3,4,2,2>, <1,2,3,4>
+  2759018088U,	// <4,2,2,2>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,2,2>
+  2759018098U,	// <4,2,2,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,3,3>
+  3838658168U,	// <4,2,2,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,2,4,0>
+  3369027341U,	// <4,2,2,5>: Cost 4 vmrglw <2,2,4,2>, <2,4,2,5>
+  2240227258U,	// <4,2,2,6>: Cost 3 vmrghw <4,2,5,6>, <2,6,3,7>
+  3735614791U,	// <4,2,2,7>: Cost 4 vsldoi4 <7,4,2,2>, <7,4,2,2>
+  2759018143U,	// <4,2,2,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,2,u,3>
+  2759018150U,	// <4,2,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,0,1>
+  3831948975U,	// <4,2,3,1>: Cost 4 vsldoi12 <1,1,1,4>, <2,3,1,1>
+  3832759993U,	// <4,2,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <2,3,2,2>
+  2759018180U,	// <4,2,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,3,4>
+  2759018185U,	// <4,2,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,4,0>
+  3839542998U,	// <4,2,3,5>: Cost 4 vsldoi12 <2,3,5,4>, <2,3,5,4>
+  3314640826U,	// <4,2,3,6>: Cost 4 vmrghw <4,3,5,7>, <2,6,3,7>
+  2765948648U,	// <4,2,3,7>: Cost 3 vsldoi12 <2,3,7,4>, <2,3,7,4>
+  2759018222U,	// <4,2,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,3,u,1>
+  3838658295U,	// <4,2,4,0>: Cost 4 vsldoi12 <2,2,2,4>, <2,4,0,1>
+  3315205667U,	// <4,2,4,1>: Cost 4 vmrghw <4,4,4,4>, <2,1,3,5>
+  2241463912U,	// <4,2,4,2>: Cost 3 vmrghw <4,4,4,4>, <2,2,2,2>
+  1234829414U,	// <4,2,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2241464085U,	// <4,2,4,4>: Cost 3 vmrghw <4,4,4,4>, <2,4,3,4>
+  2241546087U,	// <4,2,4,5>: Cost 3 vmrghw <4,4,5,5>, <2,5,3,5>
+  2241464250U,	// <4,2,4,6>: Cost 3 vmrghw <4,4,4,4>, <2,6,3,7>
+  3741602873U,	// <4,2,4,7>: Cost 4 vsldoi4 <u,4,2,4>, <7,0,u,2>
+  1234829419U,	// <4,2,4,u>: Cost 2 vmrglw <4,4,4,4>, LHS
+  2626060390U,	// <4,2,5,0>: Cost 3 vsldoi4 <1,4,2,5>, LHS
+  2626061364U,	// <4,2,5,1>: Cost 3 vsldoi4 <1,4,2,5>, <1,4,2,5>
+  1168557672U,	// <4,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222230118U,	// <4,2,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  2626063670U,	// <4,2,5,4>: Cost 3 vsldoi4 <1,4,2,5>, RHS
+  2242299752U,	// <4,2,5,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1168558010U,	// <4,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2242299882U,	// <4,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1222230123U,	// <4,2,5,u>: Cost 2 vmrglw <2,3,4,5>, LHS
+  3711754342U,	// <4,2,6,0>: Cost 4 vsldoi4 <3,4,2,6>, LHS
+  3711755162U,	// <4,2,6,1>: Cost 4 vsldoi4 <3,4,2,6>, <1,2,3,4>
+  3838658481U,	// <4,2,6,2>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,2,7>
+  2759018426U,	// <4,2,6,3>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,3,7>
+  3838658499U,	// <4,2,6,4>: Cost 4 vsldoi12 <2,2,2,4>, <2,6,4,7>
+  3735646310U,	// <4,2,6,5>: Cost 4 vsldoi4 <7,4,2,6>, <5,6,7,4>
+  3316590522U,	// <4,2,6,6>: Cost 4 vmrghw <4,6,5,2>, <2,6,3,7>
+  3798889331U,	// <4,2,6,7>: Cost 4 vsldoi8 <6,7,4,2>, <6,7,4,2>
+  2759018471U,	// <4,2,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <2,6,u,7>
+  3874564074U,	// <4,2,7,0>: Cost 4 vsldoi12 <u,2,3,4>, <2,7,0,1>
+  3800880230U,	// <4,2,7,1>: Cost 4 vsldoi8 <7,1,4,2>, <7,1,4,2>
+  3371722344U,	// <4,2,7,2>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,2>
+  2303950950U,	// <4,2,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  3371722346U,	// <4,2,7,4>: Cost 4 vmrglw <2,6,4,7>, <2,2,2,4>
+  3371722509U,	// <4,2,7,5>: Cost 5 vmrglw <2,6,4,7>, <2,4,2,5>
+  3317237690U,	// <4,2,7,6>: Cost 4 vmrghw <4,7,5,0>, <2,6,3,7>
+  3317237738U,	// <4,2,7,7>: Cost 4 vmrghw <4,7,5,0>, <2,7,0,1>
+  2303950955U,	// <4,2,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2759018555U,	// <4,2,u,0>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,0,1>
+  2626085943U,	// <4,2,u,1>: Cost 3 vsldoi4 <1,4,2,u>, <1,4,2,u>
+  1170548328U,	// <4,2,u,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1222254694U,	// <4,2,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2759018595U,	// <4,2,u,4>: Cost 3 vsldoi12 <1,2,3,4>, <2,u,4,5>
+  2244290408U,	// <4,2,u,5>: Cost 3 vmrghw RHS, <2,5,3,6>
+  1170548666U,	// <4,2,u,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2769266813U,	// <4,2,u,7>: Cost 3 vsldoi12 <2,u,7,4>, <2,u,7,4>
+  1222254699U,	// <4,2,u,u>: Cost 2 vmrglw <2,3,4,u>, LHS
+  2238859414U,	// <4,3,0,0>: Cost 3 vmrghw <4,0,5,1>, <3,0,1,2>
+  2759018646U,	// <4,3,0,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,1,2>
+  3312314708U,	// <4,3,0,2>: Cost 4 vmrghw <4,0,1,2>, <3,2,4,3>
+  2238859676U,	// <4,3,0,3>: Cost 3 vmrghw <4,0,5,1>, <3,3,3,3>
+  2295931802U,	// <4,3,0,4>: Cost 3 vmrglw <2,3,4,0>, <1,2,3,4>
+  3735670886U,	// <4,3,0,5>: Cost 4 vsldoi4 <7,4,3,0>, <5,6,7,4>
+  3312315036U,	// <4,3,0,6>: Cost 4 vmrghw <4,0,1,2>, <3,6,4,7>
+  3369674682U,	// <4,3,0,7>: Cost 4 vmrglw <2,3,4,0>, <2,6,3,7>
+  2759018709U,	// <4,3,0,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,0,u,2>
+  3361055638U,	// <4,3,1,0>: Cost 4 vmrglw <0,u,4,1>, <1,2,3,0>
+  3831949542U,	// <4,3,1,1>: Cost 4 vsldoi12 <1,1,1,4>, <3,1,1,1>
+  2703917978U,	// <4,3,1,2>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3361056370U,	// <4,3,1,3>: Cost 4 vmrglw <0,u,4,1>, <2,2,3,3>
+  2295939994U,	// <4,3,1,4>: Cost 3 vmrglw <2,3,4,1>, <1,2,3,4>
+  3361056291U,	// <4,3,1,5>: Cost 4 vmrglw <0,u,4,1>, <2,1,3,5>
+  3378972520U,	// <4,3,1,6>: Cost 4 vmrglw <3,u,4,1>, <2,5,3,6>
+  3361056698U,	// <4,3,1,7>: Cost 4 vmrglw <0,u,4,1>, <2,6,3,7>
+  2703917978U,	// <4,3,1,u>: Cost 3 vsldoi8 <3,2,4,3>, <1,2,3,4>
+  3832760624U,	// <4,3,2,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,0,3>
+  3711796122U,	// <4,3,2,1>: Cost 4 vsldoi4 <3,4,3,2>, <1,2,3,4>
+  3832760641U,	// <4,3,2,2>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,2,2>
+  2770962764U,	// <4,3,2,3>: Cost 3 vsldoi12 <3,2,3,4>, <3,2,3,4>
+  2759018836U,	// <4,3,2,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,2,4,3>
+  3827304802U,	// <4,3,2,5>: Cost 5 vsldoi12 <0,3,1,4>, <3,2,5,u>
+  3832760678U,	// <4,3,2,6>: Cost 4 vsldoi12 <1,2,3,4>, <3,2,6,3>
+  3859597679U,	// <4,3,2,7>: Cost 4 vsldoi12 <5,6,7,4>, <3,2,7,3>
+  2771331449U,	// <4,3,2,u>: Cost 3 vsldoi12 <3,2,u,4>, <3,2,u,4>
+  2240841878U,	// <4,3,3,0>: Cost 3 vmrghw <4,3,5,0>, <3,0,1,2>
+  3776997635U,	// <4,3,3,1>: Cost 4 vsldoi8 <3,1,4,3>, <3,1,4,3>
+  2703919444U,	// <4,3,3,2>: Cost 3 vsldoi8 <3,2,4,3>, <3,2,4,3>
+  2759018908U,	// <4,3,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,3,3>
+  2759018918U,	// <4,3,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,3,4,4>
+  3386951446U,	// <4,3,3,5>: Cost 4 vmrglw <5,2,4,3>, <2,4,3,5>
+  3777661596U,	// <4,3,3,6>: Cost 4 vsldoi8 <3,2,4,3>, <3,6,4,7>
+  3375007674U,	// <4,3,3,7>: Cost 4 vmrglw <3,2,4,3>, <2,6,3,7>
+  2707901242U,	// <4,3,3,u>: Cost 3 vsldoi8 <3,u,4,3>, <3,u,4,3>
+  2759018960U,	// <4,3,4,0>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,0,1>
+  2759018970U,	// <4,3,4,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,1,2>
+  2632099605U,	// <4,3,4,2>: Cost 3 vsldoi4 <2,4,3,4>, <2,4,3,4>
+  2241464732U,	// <4,3,4,3>: Cost 3 vmrghw <4,4,4,4>, <3,3,3,3>
+  2759019000U,	// <4,3,4,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,4,5>
+  2753563138U,	// <4,3,4,5>: Cost 3 vsldoi12 <0,3,1,4>, <3,4,5,6>
+  3777662316U,	// <4,3,4,6>: Cost 4 vsldoi8 <3,2,4,3>, <4,6,3,7>
+  2308573114U,	// <4,3,4,7>: Cost 3 vmrglw <4,4,4,4>, <2,6,3,7>
+  2759019032U,	// <4,3,4,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,4,u,1>
+  1168558230U,	// <4,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2242300134U,	// <4,3,5,1>: Cost 3 vmrghw RHS, <3,1,1,1>
+  2632107798U,	// <4,3,5,2>: Cost 3 vsldoi4 <2,4,3,5>, <2,4,3,5>
+  1168558492U,	// <4,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1168558594U,	// <4,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2295973654U,	// <4,3,5,5>: Cost 3 vmrglw <2,3,4,5>, <2,4,3,5>
+  2242300536U,	// <4,3,5,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295973818U,	// <4,3,5,7>: Cost 3 vmrglw <2,3,4,5>, <2,6,3,7>
+  1168558878U,	// <4,3,5,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  3832760952U,	// <4,3,6,0>: Cost 4 vsldoi12 <1,2,3,4>, <3,6,0,7>
+  3711828890U,	// <4,3,6,1>: Cost 4 vsldoi4 <3,4,3,6>, <1,2,3,4>
+  3316484436U,	// <4,3,6,2>: Cost 4 vmrghw <4,6,3,7>, <3,2,4,3>
+  3711830512U,	// <4,3,6,3>: Cost 4 vsldoi4 <3,4,3,6>, <3,4,3,6>
+  2759019164U,	// <4,3,6,4>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3361097251U,	// <4,3,6,5>: Cost 5 vmrglw <0,u,4,6>, <2,1,3,5>
+  3316624045U,	// <4,3,6,6>: Cost 4 vmrghw <4,6,5,6>, <3,6,6,6>
+  2773912244U,	// <4,3,6,7>: Cost 3 vsldoi12 <3,6,7,4>, <3,6,7,4>
+  2759019164U,	// <4,3,6,u>: Cost 3 vsldoi12 <1,2,3,4>, <3,6,4,7>
+  3377693590U,	// <4,3,7,0>: Cost 4 vmrglw <3,6,4,7>, <1,2,3,0>
+  3365751680U,	// <4,3,7,1>: Cost 5 vmrglw <1,6,4,7>, <4,0,3,1>
+  2727810232U,	// <4,3,7,2>: Cost 3 vsldoi8 <7,2,4,3>, <7,2,4,3>
+  3377694322U,	// <4,3,7,3>: Cost 4 vmrglw <3,6,4,7>, <2,2,3,3>
+  2303951770U,	// <4,3,7,4>: Cost 3 vmrglw <3,6,4,7>, <1,2,3,4>
+  3741700198U,	// <4,3,7,5>: Cost 4 vsldoi4 <u,4,3,7>, <5,6,7,4>
+  3377695216U,	// <4,3,7,6>: Cost 4 vmrglw <3,6,4,7>, <3,4,3,6>
+  3375703994U,	// <4,3,7,7>: Cost 4 vmrglw <3,3,4,7>, <2,6,3,7>
+  2731792030U,	// <4,3,7,u>: Cost 3 vsldoi8 <7,u,4,3>, <7,u,4,3>
+  1170548886U,	// <4,3,u,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2759019294U,	// <4,3,u,1>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,1,2>
+  2632132377U,	// <4,3,u,2>: Cost 3 vsldoi4 <2,4,3,u>, <2,4,3,u>
+  1170549148U,	// <4,3,u,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1170549250U,	// <4,3,u,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  2759019334U,	// <4,3,u,5>: Cost 3 vsldoi12 <1,2,3,4>, <3,u,5,6>
+  2244291192U,	// <4,3,u,6>: Cost 3 vmrghw RHS, <3,6,0,7>
+  2295998394U,	// <4,3,u,7>: Cost 3 vmrglw <2,3,4,u>, <2,6,3,7>
+  1170549534U,	// <4,3,u,u>: Cost 2 vmrghw RHS, <3,u,1,2>
+  1165118354U,	// <4,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1637482598U,	// <4,4,0,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  3711854285U,	// <4,4,0,2>: Cost 4 vsldoi4 <3,4,4,0>, <2,3,4,4>
+  3827305344U,	// <4,4,0,3>: Cost 4 vsldoi12 <0,3,1,4>, <4,0,3,1>
+  2711224658U,	// <4,4,0,4>: Cost 3 vsldoi8 <4,4,4,4>, <0,4,1,5>
+  1165118774U,	// <4,4,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3312602489U,	// <4,4,0,6>: Cost 4 vmrghw <4,0,5,1>, <4,6,5,2>
+  3369675420U,	// <4,4,0,7>: Cost 4 vmrglw <2,3,4,0>, <3,6,4,7>
+  1165119017U,	// <4,4,0,u>: Cost 2 vmrghw <4,0,5,1>, RHS
+  3369682633U,	// <4,4,1,0>: Cost 4 vmrglw <2,3,4,1>, <2,3,4,0>
+  2287313581U,	// <4,4,1,1>: Cost 3 vmrglw <0,u,4,1>, <0,u,4,1>
+  2759019466U,	// <4,4,1,2>: Cost 3 vsldoi12 <1,2,3,4>, <4,1,2,3>
+  3369683284U,	// <4,4,1,3>: Cost 4 vmrglw <2,3,4,1>, <3,2,4,3>
+  2311204048U,	// <4,4,1,4>: Cost 3 vmrglw <4,u,4,1>, <4,4,4,4>
+  2239319350U,	// <4,4,1,5>: Cost 3 vmrghw <4,1,2,3>, RHS
+  3784967411U,	// <4,4,1,6>: Cost 4 vsldoi8 <4,4,4,4>, <1,6,5,7>
+  3369683612U,	// <4,4,1,7>: Cost 4 vmrglw <2,3,4,1>, <3,6,4,7>
+  2763000832U,	// <4,4,1,u>: Cost 3 vsldoi12 <1,u,3,4>, <4,1,u,3>
+  3711869030U,	// <4,4,2,0>: Cost 4 vsldoi4 <3,4,4,2>, LHS
+  3711869850U,	// <4,4,2,1>: Cost 4 vsldoi4 <3,4,4,2>, <1,2,3,4>
+  2240203830U,	// <4,4,2,2>: Cost 3 vmrghw <4,2,5,3>, <4,2,5,3>
+  2698618573U,	// <4,4,2,3>: Cost 3 vsldoi8 <2,3,4,4>, <2,3,4,4>
+  2711226133U,	// <4,4,2,4>: Cost 3 vsldoi8 <4,4,4,4>, <2,4,3,4>
+  2240204086U,	// <4,4,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2711226298U,	// <4,4,2,6>: Cost 3 vsldoi8 <4,4,4,4>, <2,6,3,7>
+  3832761416U,	// <4,4,2,7>: Cost 4 vsldoi12 <1,2,3,4>, <4,2,7,3>
+  2701936738U,	// <4,4,2,u>: Cost 3 vsldoi8 <2,u,4,4>, <2,u,4,4>
+  2711226518U,	// <4,4,3,0>: Cost 3 vsldoi8 <4,4,4,4>, <3,0,1,2>
+  3777005828U,	// <4,4,3,1>: Cost 4 vsldoi8 <3,1,4,4>, <3,1,4,4>
+  3832761453U,	// <4,4,3,2>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,2,4>
+  2301266260U,	// <4,4,3,3>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  2705254903U,	// <4,4,3,4>: Cost 3 vsldoi8 <3,4,4,4>, <3,4,4,4>
+  2240843062U,	// <4,4,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  3832761489U,	// <4,4,3,6>: Cost 4 vsldoi12 <1,2,3,4>, <4,3,6,4>
+  3375008412U,	// <4,4,3,7>: Cost 4 vmrglw <3,2,4,3>, <3,6,4,7>
+  2301266260U,	// <4,4,3,u>: Cost 3 vmrglw <3,2,4,3>, <3,2,4,3>
+  1570373734U,	// <4,4,4,0>: Cost 2 vsldoi4 <4,4,4,4>, LHS
+  2308574089U,	// <4,4,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,4,1>
+  2644117096U,	// <4,4,4,2>: Cost 3 vsldoi4 <4,4,4,4>, <2,2,2,2>
+  2638146039U,	// <4,4,4,3>: Cost 3 vsldoi4 <3,4,4,4>, <3,4,4,4>
+  229035318U,	// <4,4,4,4>: Cost 1 vspltisw0 RHS
+  1167723830U,	// <4,4,4,5>: Cost 2 vmrghw <4,4,4,4>, RHS
+  2644120058U,	// <4,4,4,6>: Cost 3 vsldoi4 <4,4,4,4>, <6,2,7,3>
+  2662036827U,	// <4,4,4,7>: Cost 3 vsldoi4 <7,4,4,4>, <7,4,4,4>
+  229035318U,	// <4,4,4,u>: Cost 1 vspltisw0 RHS
+  1168558994U,	// <4,4,5,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  2638152602U,	// <4,4,5,1>: Cost 3 vsldoi4 <3,4,4,5>, <1,2,3,4>
+  2242300981U,	// <4,4,5,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638154232U,	// <4,4,5,3>: Cost 3 vsldoi4 <3,4,4,5>, <3,4,4,5>
+  1168559322U,	// <4,4,5,4>: Cost 2 vmrghw RHS, <4,4,5,5>
+  94817590U,	// <4,4,5,5>: Cost 1 vmrghw RHS, RHS
+  1685278006U,	// <4,4,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2242309576U,	// <4,4,5,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  94817833U,	// <4,4,5,u>: Cost 1 vmrghw RHS, RHS
+  3316591506U,	// <4,4,6,0>: Cost 4 vmrghw <4,6,5,2>, <4,0,5,1>
+  3758428587U,	// <4,4,6,1>: Cost 4 vsldoi8 <0,0,4,4>, <6,1,7,5>
+  2711228922U,	// <4,4,6,2>: Cost 3 vsldoi8 <4,4,4,4>, <6,2,7,3>
+  3796251185U,	// <4,4,6,3>: Cost 4 vsldoi8 <6,3,4,4>, <6,3,4,4>
+  2711229085U,	// <4,4,6,4>: Cost 3 vsldoi8 <4,4,4,4>, <6,4,7,4>
+  2242850102U,	// <4,4,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2242850169U,	// <4,4,6,6>: Cost 3 vmrghw <4,6,5,2>, <4,6,5,2>
+  2725163893U,	// <4,4,6,7>: Cost 3 vsldoi8 <6,7,4,4>, <6,7,4,4>
+  2242850345U,	// <4,4,6,u>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2711229434U,	// <4,4,7,0>: Cost 3 vsldoi8 <4,4,4,4>, <7,0,1,2>
+  3377694410U,	// <4,4,7,1>: Cost 4 vmrglw <3,6,4,7>, <2,3,4,1>
+  3868593584U,	// <4,4,7,2>: Cost 4 vsldoi12 <7,2,3,4>, <4,7,2,3>
+  3377695060U,	// <4,4,7,3>: Cost 4 vmrglw <3,6,4,7>, <3,2,4,3>
+  2729145691U,	// <4,4,7,4>: Cost 3 vsldoi8 <7,4,4,4>, <7,4,4,4>
+  2243497270U,	// <4,4,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  3871542744U,	// <4,4,7,6>: Cost 4 vsldoi12 <7,6,7,4>, <4,7,6,7>
+  2303953564U,	// <4,4,7,7>: Cost 3 vmrglw <3,6,4,7>, <3,6,4,7>
+  2243497513U,	// <4,4,7,u>: Cost 3 vmrghw <4,7,5,0>, RHS
+  1170549650U,	// <4,4,u,0>: Cost 2 vmrghw RHS, <4,0,5,1>
+  1637488430U,	// <4,4,u,1>: Cost 2 vsldoi8 <4,4,4,4>, LHS
+  2244291637U,	// <4,4,u,2>: Cost 3 vmrghw RHS, <4,2,5,2>
+  2638178811U,	// <4,4,u,3>: Cost 3 vsldoi4 <3,4,4,u>, <3,4,4,u>
+  229035318U,	// <4,4,u,4>: Cost 1 vspltisw0 RHS
+  96808246U,	// <4,4,u,5>: Cost 1 vmrghw RHS, RHS
+  1685278249U,	// <4,4,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  2244292040U,	// <4,4,u,7>: Cost 3 vmrghw RHS, <4,7,5,0>
+  96808489U,	// <4,4,u,u>: Cost 1 vmrghw RHS, RHS
+  2698625024U,	// <4,5,0,0>: Cost 3 vsldoi8 <2,3,4,5>, <0,0,0,0>
+  1624883302U,	// <4,5,0,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2638186190U,	// <4,5,0,2>: Cost 3 vsldoi4 <3,4,5,0>, <2,3,4,5>
+  2638187004U,	// <4,5,0,3>: Cost 3 vsldoi4 <3,4,5,0>, <3,4,5,0>
+  2687345005U,	// <4,5,0,4>: Cost 3 vsldoi8 <0,4,4,5>, <0,4,4,5>
+  2238861316U,	// <4,5,0,5>: Cost 3 vmrghw <4,0,5,1>, <5,5,5,5>
+  2662077302U,	// <4,5,0,6>: Cost 3 vsldoi4 <7,4,5,0>, <6,7,4,5>
+  2662077792U,	// <4,5,0,7>: Cost 3 vsldoi4 <7,4,5,0>, <7,4,5,0>
+  1624883869U,	// <4,5,0,u>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  3361057762U,	// <4,5,1,0>: Cost 4 vmrglw <0,u,4,1>, <4,1,5,0>
+  2691326803U,	// <4,5,1,1>: Cost 3 vsldoi8 <1,1,4,5>, <1,1,4,5>
+  2698625942U,	// <4,5,1,2>: Cost 3 vsldoi8 <2,3,4,5>, <1,2,3,0>
+  3361055659U,	// <4,5,1,3>: Cost 4 vmrglw <0,u,4,1>, <1,2,5,3>
+  3761087567U,	// <4,5,1,4>: Cost 4 vsldoi8 <0,4,4,5>, <1,4,5,5>
+  2693981335U,	// <4,5,1,5>: Cost 3 vsldoi8 <1,5,4,5>, <1,5,4,5>
+  2305231362U,	// <4,5,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  3361055987U,	// <4,5,1,7>: Cost 4 vmrglw <0,u,4,1>, <1,6,5,7>
+  2695972234U,	// <4,5,1,u>: Cost 3 vsldoi8 <1,u,4,5>, <1,u,4,5>
+  2638200934U,	// <4,5,2,0>: Cost 3 vsldoi4 <3,4,5,2>, LHS
+  3761088035U,	// <4,5,2,1>: Cost 4 vsldoi8 <0,4,4,5>, <2,1,3,5>
+  2697963133U,	// <4,5,2,2>: Cost 3 vsldoi8 <2,2,4,5>, <2,2,4,5>
+  1624884942U,	// <4,5,2,3>: Cost 2 vsldoi8 <2,3,4,5>, <2,3,4,5>
+  2698626838U,	// <4,5,2,4>: Cost 3 vsldoi8 <2,3,4,5>, <2,4,3,5>
+  3772368744U,	// <4,5,2,5>: Cost 4 vsldoi8 <2,3,4,5>, <2,5,3,6>
+  2698627002U,	// <4,5,2,6>: Cost 3 vsldoi8 <2,3,4,5>, <2,6,3,7>
+  3775023122U,	// <4,5,2,7>: Cost 4 vsldoi8 <2,7,4,5>, <2,7,4,5>
+  1628203107U,	// <4,5,2,u>: Cost 2 vsldoi8 <2,u,4,5>, <2,u,4,5>
+  2698627222U,	// <4,5,3,0>: Cost 3 vsldoi8 <2,3,4,5>, <3,0,1,2>
+  3765070057U,	// <4,5,3,1>: Cost 4 vsldoi8 <1,1,4,5>, <3,1,1,4>
+  2698627404U,	// <4,5,3,2>: Cost 3 vsldoi8 <2,3,4,5>, <3,2,3,4>
+  2698627484U,	// <4,5,3,3>: Cost 3 vsldoi8 <2,3,4,5>, <3,3,3,3>
+  2698627580U,	// <4,5,3,4>: Cost 3 vsldoi8 <2,3,4,5>, <3,4,5,0>
+  3779668553U,	// <4,5,3,5>: Cost 4 vsldoi8 <3,5,4,5>, <3,5,4,5>
+  2725169844U,	// <4,5,3,6>: Cost 3 vsldoi8 <6,7,4,5>, <3,6,7,4>
+  2707253995U,	// <4,5,3,7>: Cost 3 vsldoi8 <3,7,4,5>, <3,7,4,5>
+  2698627870U,	// <4,5,3,u>: Cost 3 vsldoi8 <2,3,4,5>, <3,u,1,2>
+  2638217318U,	// <4,5,4,0>: Cost 3 vsldoi4 <3,4,5,4>, LHS
+  2308574098U,	// <4,5,4,1>: Cost 3 vmrglw <4,4,4,4>, <4,0,5,1>
+  2698628150U,	// <4,5,4,2>: Cost 3 vsldoi8 <2,3,4,5>, <4,2,5,3>
+  2638219776U,	// <4,5,4,3>: Cost 3 vsldoi4 <3,4,5,4>, <3,4,5,4>
+  2698628314U,	// <4,5,4,4>: Cost 3 vsldoi8 <2,3,4,5>, <4,4,5,5>
+  1624886582U,	// <4,5,4,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  2698628478U,	// <4,5,4,6>: Cost 3 vsldoi8 <2,3,4,5>, <4,6,5,7>
+  2662110564U,	// <4,5,4,7>: Cost 3 vsldoi4 <7,4,5,4>, <7,4,5,4>
+  1624886825U,	// <4,5,4,u>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1570455654U,	// <4,5,5,0>: Cost 2 vsldoi4 <4,4,5,5>, LHS
+  2312564250U,	// <4,5,5,1>: Cost 3 vmrglw <5,1,4,5>, <4,u,5,1>
+  2644199118U,	// <4,5,5,2>: Cost 3 vsldoi4 <4,4,5,5>, <2,3,4,5>
+  2295974966U,	// <4,5,5,3>: Cost 3 vmrglw <2,3,4,5>, <4,2,5,3>
+  1570458842U,	// <4,5,5,4>: Cost 2 vsldoi4 <4,4,5,5>, <4,4,5,5>
+  1168568324U,	// <4,5,5,5>: Cost 2 vmrghw RHS, <5,5,5,5>
+  1168568418U,	// <4,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2295975294U,	// <4,5,5,7>: Cost 3 vmrglw <2,3,4,5>, <4,6,5,7>
+  1168716036U,	// <4,5,5,u>: Cost 2 vmrghw RHS, <5,u,7,0>
+  1564491878U,	// <4,5,6,0>: Cost 2 vsldoi4 <3,4,5,6>, LHS
+  2626290768U,	// <4,5,6,1>: Cost 3 vsldoi4 <1,4,5,6>, <1,4,5,6>
+  2632263465U,	// <4,5,6,2>: Cost 3 vsldoi4 <2,4,5,6>, <2,4,5,6>
+  1564494338U,	// <4,5,6,3>: Cost 2 vsldoi4 <3,4,5,6>, <3,4,5,6>
+  1564495158U,	// <4,5,6,4>: Cost 2 vsldoi4 <3,4,5,6>, RHS
+  2638237464U,	// <4,5,6,5>: Cost 3 vsldoi4 <3,4,5,6>, <5,2,6,3>
+  2656154253U,	// <4,5,6,6>: Cost 3 vsldoi4 <6,4,5,6>, <6,4,5,6>
+  27705344U,	// <4,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,6,u>: Cost 0 copy RHS
+  2725172218U,	// <4,5,7,0>: Cost 3 vsldoi8 <6,7,4,5>, <7,0,1,2>
+  3859599489U,	// <4,5,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <5,7,1,4>
+  2698630320U,	// <4,5,7,2>: Cost 3 vsldoi8 <2,3,4,5>, <7,2,3,4>
+  2728490251U,	// <4,5,7,3>: Cost 3 vsldoi8 <7,3,4,5>, <7,3,4,5>
+  2725172576U,	// <4,5,7,4>: Cost 3 vsldoi8 <6,7,4,5>, <7,4,5,0>
+  3317239812U,	// <4,5,7,5>: Cost 4 vmrghw <4,7,5,0>, <5,5,5,5>
+  2725172760U,	// <4,5,7,6>: Cost 3 vsldoi8 <6,7,4,5>, <7,6,7,4>
+  2725172844U,	// <4,5,7,7>: Cost 3 vsldoi8 <6,7,4,5>, <7,7,7,7>
+  2725172866U,	// <4,5,7,u>: Cost 3 vsldoi8 <6,7,4,5>, <7,u,1,2>
+  1564508262U,	// <4,5,u,0>: Cost 2 vsldoi4 <3,4,5,u>, LHS
+  1624889134U,	// <4,5,u,1>: Cost 2 vsldoi8 <2,3,4,5>, LHS
+  2698631045U,	// <4,5,u,2>: Cost 3 vsldoi8 <2,3,4,5>, <u,2,3,0>
+  1564510724U,	// <4,5,u,3>: Cost 2 vsldoi4 <3,4,5,u>, <3,4,5,u>
+  1564511542U,	// <4,5,u,4>: Cost 2 vsldoi4 <3,4,5,u>, RHS
+  1624889498U,	// <4,5,u,5>: Cost 2 vsldoi8 <2,3,4,5>, RHS
+  1170550882U,	// <4,5,u,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  27705344U,	// <4,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,5,u,u>: Cost 0 copy RHS
+  3312595285U,	// <4,6,0,0>: Cost 4 vmrghw <4,0,5,0>, <6,0,7,0>
+  3763748966U,	// <4,6,0,1>: Cost 4 vsldoi8 <0,u,4,6>, LHS
+  2238861818U,	// <4,6,0,2>: Cost 3 vmrghw <4,0,5,1>, <6,2,7,3>
+  3767730432U,	// <4,6,0,3>: Cost 4 vsldoi8 <1,5,4,6>, <0,3,1,4>
+  3763749202U,	// <4,6,0,4>: Cost 4 vsldoi8 <0,u,4,6>, <0,4,1,5>
+  2238862059U,	// <4,6,0,5>: Cost 3 vmrghw <4,0,5,1>, <6,5,7,1>
+  2238862136U,	// <4,6,0,6>: Cost 3 vmrghw <4,0,5,1>, <6,6,6,6>
+  2295934262U,	// <4,6,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  2295934263U,	// <4,6,0,u>: Cost 3 vmrglw <2,3,4,0>, RHS
+  3378973999U,	// <4,6,1,0>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,0>
+  3378974648U,	// <4,6,1,1>: Cost 4 vmrglw <3,u,4,1>, <5,4,6,1>
+  3779675034U,	// <4,6,1,2>: Cost 4 vsldoi8 <3,5,4,6>, <1,2,3,4>
+  3378974002U,	// <4,6,1,3>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,3>
+  3378974003U,	// <4,6,1,4>: Cost 4 vmrglw <3,u,4,1>, <4,5,6,4>
+  3767731352U,	// <4,6,1,5>: Cost 4 vsldoi8 <1,5,4,6>, <1,5,4,6>
+  3378974734U,	// <4,6,1,6>: Cost 4 vmrglw <3,u,4,1>, <5,5,6,6>
+  2287316278U,	// <4,6,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  2287316279U,	// <4,6,1,u>: Cost 3 vmrglw <0,u,4,1>, RHS
+  3735904358U,	// <4,6,2,0>: Cost 4 vsldoi4 <7,4,6,2>, LHS
+  3763750435U,	// <4,6,2,1>: Cost 5 vsldoi8 <0,u,4,6>, <2,1,3,5>
+  3313938937U,	// <4,6,2,2>: Cost 4 vmrghw <4,2,5,2>, <6,2,7,2>
+  3772376782U,	// <4,6,2,3>: Cost 4 vsldoi8 <2,3,4,6>, <2,3,4,5>
+  3852890591U,	// <4,6,2,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,2,4,3>
+  3735908454U,	// <4,6,2,5>: Cost 4 vsldoi4 <7,4,6,2>, <5,6,7,4>
+  3801573306U,	// <4,6,2,6>: Cost 4 vsldoi8 <7,2,4,6>, <2,6,3,7>
+  2785858042U,	// <4,6,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,7,3>
+  2785858051U,	// <4,6,2,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,2,u,3>
+  3863065101U,	// <4,6,3,0>: Cost 4 vsldoi12 <6,3,0,4>, <6,3,0,4>
+  3314586024U,	// <4,6,3,1>: Cost 4 vmrghw <4,3,5,0>, <6,1,7,2>
+  3863212575U,	// <4,6,3,2>: Cost 4 vsldoi12 <6,3,2,4>, <6,3,2,4>
+  3863286312U,	// <4,6,3,3>: Cost 4 vsldoi12 <6,3,3,4>, <6,3,3,4>
+  3767732738U,	// <4,6,3,4>: Cost 4 vsldoi8 <1,5,4,6>, <3,4,5,6>
+  3779676746U,	// <4,6,3,5>: Cost 4 vsldoi8 <3,5,4,6>, <3,5,4,6>
+  3398898488U,	// <4,6,3,6>: Cost 4 vmrglw <7,2,4,3>, <6,6,6,6>
+  2301267254U,	// <4,6,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2301267255U,	// <4,6,3,u>: Cost 3 vmrglw <3,2,4,3>, RHS
+  3852890715U,	// <4,6,4,0>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,0,1>
+  3315208615U,	// <4,6,4,1>: Cost 4 vmrghw <4,4,4,4>, <6,1,7,1>
+  2241466874U,	// <4,6,4,2>: Cost 3 vmrghw <4,4,4,4>, <6,2,7,3>
+  3852890745U,	// <4,6,4,3>: Cost 4 vsldoi12 <4,5,6,4>, <6,4,3,4>
+  2241467037U,	// <4,6,4,4>: Cost 3 vmrghw <4,4,4,4>, <6,4,7,4>
+  2241549039U,	// <4,6,4,5>: Cost 3 vmrghw <4,4,5,5>, <6,5,7,5>
+  2241467192U,	// <4,6,4,6>: Cost 3 vmrghw <4,4,4,4>, <6,6,6,6>
+  1234832694U,	// <4,6,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  1234832695U,	// <4,6,4,u>: Cost 2 vmrglw <4,4,4,4>, RHS
+  2242302241U,	// <4,6,5,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2242310567U,	// <4,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1168568826U,	// <4,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2242302514U,	// <4,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2242302605U,	// <4,6,5,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2242310891U,	// <4,6,5,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1168569144U,	// <4,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222233398U,	// <4,6,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  1222233399U,	// <4,6,5,u>: Cost 2 vmrglw <2,3,4,5>, RHS
+  3316576545U,	// <4,6,6,0>: Cost 4 vmrghw <4,6,5,0>, <6,0,1,2>
+  3316584871U,	// <4,6,6,1>: Cost 4 vmrghw <4,6,5,1>, <6,1,7,1>
+  2242851322U,	// <4,6,6,2>: Cost 3 vmrghw <4,6,5,2>, <6,2,7,3>
+  3316601394U,	// <4,6,6,3>: Cost 4 vmrghw <4,6,5,3>, <6,3,4,5>
+  3852890916U,	// <4,6,6,4>: Cost 4 vsldoi12 <4,5,6,4>, <6,6,4,4>
+  3316617963U,	// <4,6,6,5>: Cost 4 vmrghw <4,6,5,5>, <6,5,7,1>
+  2242884408U,	// <4,6,6,6>: Cost 3 vmrghw <4,6,5,6>, <6,6,6,6>
+  2785858370U,	// <4,6,6,7>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,7,7>
+  2785858379U,	// <4,6,6,u>: Cost 3 vsldoi12 <5,6,7,4>, <6,6,u,7>
+  2785858382U,	// <4,6,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,0,1>
+  3859600215U,	// <4,6,7,1>: Cost 4 vsldoi12 <5,6,7,4>, <6,7,1,1>
+  3317240314U,	// <4,6,7,2>: Cost 4 vmrghw <4,7,5,0>, <6,2,7,3>
+  2792199020U,	// <4,6,7,3>: Cost 3 vsldoi12 <6,7,3,4>, <6,7,3,4>
+  2785858422U,	// <4,6,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <6,7,4,5>
+  3856651132U,	// <4,6,7,5>: Cost 4 vsldoi12 <5,2,3,4>, <6,7,5,2>
+  3317240632U,	// <4,6,7,6>: Cost 4 vmrghw <4,7,5,0>, <6,6,6,6>
+  2303954230U,	// <4,6,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303954231U,	// <4,6,7,u>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2244292897U,	// <4,6,u,0>: Cost 3 vmrghw RHS, <6,0,1,2>
+  2244293031U,	// <4,6,u,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1170551290U,	// <4,6,u,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2244293170U,	// <4,6,u,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2244293261U,	// <4,6,u,4>: Cost 3 vmrghw RHS, <6,4,5,6>
+  2244293355U,	// <4,6,u,5>: Cost 3 vmrghw RHS, <6,5,7,1>
+  1170551608U,	// <4,6,u,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1222257974U,	// <4,6,u,7>: Cost 2 vmrglw <2,3,4,u>, RHS
+  1222257975U,	// <4,6,u,u>: Cost 2 vmrglw <2,3,4,u>, RHS
+  2238862330U,	// <4,7,0,0>: Cost 3 vmrghw <4,0,5,1>, <7,0,1,2>
+  2706604134U,	// <4,7,0,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3312604308U,	// <4,7,0,2>: Cost 4 vmrghw <4,0,5,1>, <7,2,0,3>
+  3768402176U,	// <4,7,0,3>: Cost 4 vsldoi8 <1,6,4,7>, <0,3,1,4>
+  2238862648U,	// <4,7,0,4>: Cost 3 vmrghw <4,0,5,1>, <7,4,0,5>
+  3859600418U,	// <4,7,0,5>: Cost 4 vsldoi12 <5,6,7,4>, <7,0,5,6>
+  3729994393U,	// <4,7,0,6>: Cost 4 vsldoi4 <6,4,7,0>, <6,4,7,0>
+  2238862956U,	// <4,7,0,7>: Cost 3 vmrghw <4,0,5,1>, <7,7,7,7>
+  2706604701U,	// <4,7,0,u>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  3385610338U,	// <4,7,1,0>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,0>
+  3780346676U,	// <4,7,1,1>: Cost 4 vsldoi8 <3,6,4,7>, <1,1,1,1>
+  2706604954U,	// <4,7,1,2>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3385610746U,	// <4,7,1,3>: Cost 4 vmrglw <5,0,4,1>, <6,2,7,3>
+  3385610342U,	// <4,7,1,4>: Cost 4 vmrglw <5,0,4,1>, <5,6,7,4>
+  3385610667U,	// <4,7,1,5>: Cost 4 vmrglw <5,0,4,1>, <6,1,7,5>
+  3768403178U,	// <4,7,1,6>: Cost 4 vsldoi8 <1,6,4,7>, <1,6,4,7>
+  3385611074U,	// <4,7,1,7>: Cost 4 vmrglw <5,0,4,1>, <6,6,7,7>
+  2706604954U,	// <4,7,1,u>: Cost 3 vsldoi8 <3,6,4,7>, <1,2,3,4>
+  3859600532U,	// <4,7,2,0>: Cost 4 vsldoi12 <5,6,7,4>, <7,2,0,3>
+  3712091034U,	// <4,7,2,1>: Cost 5 vsldoi4 <3,4,7,2>, <1,2,3,4>
+  3774375528U,	// <4,7,2,2>: Cost 4 vsldoi8 <2,6,4,7>, <2,2,2,2>
+  2794853552U,	// <4,7,2,3>: Cost 3 vsldoi12 <7,2,3,4>, <7,2,3,4>
+  2785858744U,	// <4,7,2,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,2,4,3>
+  3735982182U,	// <4,7,2,5>: Cost 4 vsldoi4 <7,4,7,2>, <5,6,7,4>
+  3774375875U,	// <4,7,2,6>: Cost 4 vsldoi8 <2,6,4,7>, <2,6,4,7>
+  3735983476U,	// <4,7,2,7>: Cost 4 vsldoi4 <7,4,7,2>, <7,4,7,2>
+  2795222237U,	// <4,7,2,u>: Cost 3 vsldoi12 <7,2,u,4>, <7,2,u,4>
+  3780348054U,	// <4,7,3,0>: Cost 4 vsldoi8 <3,6,4,7>, <3,0,1,2>
+  3730015130U,	// <4,7,3,1>: Cost 4 vsldoi4 <6,4,7,3>, <1,2,3,4>
+  3780348244U,	// <4,7,3,2>: Cost 4 vsldoi8 <3,6,4,7>, <3,2,4,3>
+  3778357673U,	// <4,7,3,3>: Cost 4 vsldoi8 <3,3,4,7>, <3,3,4,7>
+  2325155942U,	// <4,7,3,4>: Cost 3 vmrglw <7,2,4,3>, <5,6,7,4>
+  3779684939U,	// <4,7,3,5>: Cost 5 vsldoi8 <3,5,4,7>, <3,5,4,7>
+  2706606748U,	// <4,7,3,6>: Cost 3 vsldoi8 <3,6,4,7>, <3,6,4,7>
+  3398898498U,	// <4,7,3,7>: Cost 4 vmrglw <7,2,4,3>, <6,6,7,7>
+  2707934014U,	// <4,7,3,u>: Cost 3 vsldoi8 <3,u,4,7>, <3,u,4,7>
+  2785858868U,	// <4,7,4,0>: Cost 3 vsldoi12 <5,6,7,4>, <7,4,0,1>
+  3780348874U,	// <4,7,4,1>: Cost 4 vsldoi8 <3,6,4,7>, <4,1,2,3>
+  3780349000U,	// <4,7,4,2>: Cost 4 vsldoi8 <3,6,4,7>, <4,2,7,3>
+  2308575738U,	// <4,7,4,3>: Cost 3 vmrglw <4,4,4,4>, <6,2,7,3>
+  2656283856U,	// <4,7,4,4>: Cost 3 vsldoi4 <6,4,7,4>, <4,4,4,4>
+  2706607414U,	// <4,7,4,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2656285341U,	// <4,7,4,6>: Cost 3 vsldoi4 <6,4,7,4>, <6,4,7,4>
+  2241468012U,	// <4,7,4,7>: Cost 3 vmrghw <4,4,4,4>, <7,7,7,7>
+  2706607657U,	// <4,7,4,u>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  1168569338U,	// <4,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2242311242U,	// <4,7,5,1>: Cost 3 vmrghw RHS, <7,1,1,1>
+  2242303178U,	// <4,7,5,2>: Cost 3 vmrghw RHS, <7,2,6,3>
+  2242311395U,	// <4,7,5,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1168569702U,	// <4,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2242311606U,	// <4,7,5,5>: Cost 3 vmrghw RHS, <7,5,5,5>
+  2242311662U,	// <4,7,5,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1168569964U,	// <4,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1168569986U,	// <4,7,5,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  3316593658U,	// <4,7,6,0>: Cost 4 vmrghw <4,6,5,2>, <7,0,1,2>
+  3316593738U,	// <4,7,6,1>: Cost 5 vmrghw <4,6,5,2>, <7,1,1,1>
+  3316634800U,	// <4,7,6,2>: Cost 4 vmrghw <4,6,5,7>, <7,2,3,4>
+  3386978810U,	// <4,7,6,3>: Cost 4 vmrglw <5,2,4,6>, <6,2,7,3>
+  2785859072U,	// <4,7,6,4>: Cost 3 vsldoi12 <5,6,7,4>, <7,6,4,7>
+  3736014950U,	// <4,7,6,5>: Cost 4 vsldoi4 <7,4,7,6>, <5,6,7,4>
+  3316594158U,	// <4,7,6,6>: Cost 4 vmrghw <4,6,5,2>, <7,6,2,7>
+  2797803032U,	// <4,7,6,7>: Cost 3 vsldoi12 <7,6,7,4>, <7,6,7,4>
+  2797876769U,	// <4,7,6,u>: Cost 3 vsldoi12 <7,6,u,4>, <7,6,u,4>
+  2243499002U,	// <4,7,7,0>: Cost 3 vmrghw <4,7,5,0>, <7,0,1,2>
+  3718103962U,	// <4,7,7,1>: Cost 4 vsldoi4 <4,4,7,7>, <1,2,3,4>
+  3317257418U,	// <4,7,7,2>: Cost 4 vmrghw <4,7,5,2>, <7,2,6,3>
+  3377695816U,	// <4,7,7,3>: Cost 4 vmrglw <3,6,4,7>, <4,2,7,3>
+  2243532134U,	// <4,7,7,4>: Cost 3 vmrghw <4,7,5,4>, <7,4,5,6>
+  3317282230U,	// <4,7,7,5>: Cost 4 vmrghw <4,7,5,5>, <7,5,5,5>
+  2730497536U,	// <4,7,7,6>: Cost 3 vsldoi8 <7,6,4,7>, <7,6,4,7>
+  2243556972U,	// <4,7,7,7>: Cost 3 vmrghw <4,7,5,7>, <7,7,7,7>
+  2243565186U,	// <4,7,7,u>: Cost 3 vmrghw <4,7,5,u>, <7,u,1,2>
+  1170551802U,	// <4,7,u,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2706609966U,	// <4,7,u,1>: Cost 3 vsldoi8 <3,6,4,7>, LHS
+  2244293797U,	// <4,7,u,2>: Cost 3 vmrghw RHS, <7,2,2,2>
+  2244293859U,	// <4,7,u,3>: Cost 3 vmrghw RHS, <7,3,0,1>
+  1170552166U,	// <4,7,u,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  2706610330U,	// <4,7,u,5>: Cost 3 vsldoi8 <3,6,4,7>, RHS
+  2244294126U,	// <4,7,u,6>: Cost 3 vmrghw RHS, <7,6,2,7>
+  1170552428U,	// <4,7,u,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1170552450U,	// <4,7,u,u>: Cost 2 vmrghw RHS, <7,u,1,2>
+  1165118354U,	// <4,u,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1624907878U,	// <4,u,0,1>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638407377U,	// <4,u,0,2>: Cost 3 vsldoi4 <3,4,u,0>, <2,3,4,u>
+  2295931036U,	// <4,u,0,3>: Cost 3 vmrglw <2,3,4,0>, LHS
+  2687369584U,	// <4,u,0,4>: Cost 3 vsldoi8 <0,4,4,u>, <0,4,4,u>
+  1165121690U,	// <4,u,0,5>: Cost 2 vmrghw <4,0,5,1>, RHS
+  2662298489U,	// <4,u,0,6>: Cost 3 vsldoi4 <7,4,u,0>, <6,7,4,u>
+  2295934280U,	// <4,u,0,7>: Cost 3 vmrglw <2,3,4,0>, RHS
+  1624908445U,	// <4,u,0,u>: Cost 2 vsldoi8 <2,3,4,u>, LHS
+  2638413926U,	// <4,u,1,0>: Cost 3 vsldoi4 <3,4,u,1>, LHS
+  2691351382U,	// <4,u,1,1>: Cost 3 vsldoi8 <1,1,4,u>, <1,1,4,u>
+  1685280558U,	// <4,u,1,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2287313052U,	// <4,u,1,3>: Cost 3 vmrglw <0,u,4,1>, LHS
+  2299257799U,	// <4,u,1,4>: Cost 3 vmrglw <2,u,4,1>, <1,2,u,4>
+  2694005914U,	// <4,u,1,5>: Cost 3 vsldoi8 <1,5,4,u>, <1,5,4,u>
+  2305231362U,	// <4,u,1,6>: Cost 3 vmrglw <3,u,4,1>, <3,4,5,6>
+  2287316296U,	// <4,u,1,7>: Cost 3 vmrglw <0,u,4,1>, RHS
+  1685280612U,	// <4,u,1,u>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  2638422118U,	// <4,u,2,0>: Cost 3 vsldoi4 <3,4,u,2>, LHS
+  2240206638U,	// <4,u,2,1>: Cost 3 vmrghw <4,2,5,3>, LHS
+  2697987712U,	// <4,u,2,2>: Cost 3 vsldoi8 <2,2,4,u>, <2,2,4,u>
+  1624909521U,	// <4,u,2,3>: Cost 2 vsldoi8 <2,3,4,u>, <2,3,4,u>
+  2759391121U,	// <4,u,2,4>: Cost 3 vsldoi12 <1,2,u,4>, <u,2,4,3>
+  2240207002U,	// <4,u,2,5>: Cost 3 vmrghw <4,2,5,3>, RHS
+  2698651578U,	// <4,u,2,6>: Cost 3 vsldoi8 <2,3,4,u>, <2,6,3,7>
+  2785859500U,	// <4,u,2,7>: Cost 3 vsldoi12 <5,6,7,4>, <u,2,7,3>
+  1628227686U,	// <4,u,2,u>: Cost 2 vsldoi8 <2,u,4,u>, <2,u,4,u>
+  2759022524U,	// <4,u,3,0>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,0,1>
+  2801342408U,	// <4,u,3,1>: Cost 3 vsldoi12 <u,3,1,4>, <u,3,1,4>
+  2703960409U,	// <4,u,3,2>: Cost 3 vsldoi8 <3,2,4,u>, <3,2,4,u>
+  2759022554U,	// <4,u,3,3>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,3,4>
+  2759022564U,	// <4,u,3,4>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,4,5>
+  2240845978U,	// <4,u,3,5>: Cost 3 vmrghw <4,3,5,0>, RHS
+  2706614941U,	// <4,u,3,6>: Cost 3 vsldoi8 <3,6,4,u>, <3,6,4,u>
+  2301267272U,	// <4,u,3,7>: Cost 3 vmrglw <3,2,4,3>, RHS
+  2759022596U,	// <4,u,3,u>: Cost 3 vsldoi12 <1,2,3,4>, <u,3,u,1>
+  1570668646U,	// <4,u,4,0>: Cost 2 vsldoi4 <4,4,u,4>, LHS
+  1167726382U,	// <4,u,4,1>: Cost 2 vmrghw <4,4,4,4>, LHS
+  2698652753U,	// <4,u,4,2>: Cost 3 vsldoi8 <2,3,4,u>, <4,2,u,3>
+  1234829468U,	// <4,u,4,3>: Cost 2 vmrglw <4,4,4,4>, LHS
+  229035318U,	// <4,u,4,4>: Cost 1 vspltisw0 RHS
+  1624911158U,	// <4,u,4,5>: Cost 2 vsldoi8 <2,3,4,u>, RHS
+  2698653081U,	// <4,u,4,6>: Cost 3 vsldoi8 <2,3,4,u>, <4,6,u,7>
+  1234832712U,	// <4,u,4,7>: Cost 2 vmrglw <4,4,4,4>, RHS
+  229035318U,	// <4,u,4,u>: Cost 1 vspltisw0 RHS
+  1168561875U,	// <4,u,5,0>: Cost 2 vmrghw RHS, <u,0,1,2>
+  94820142U,	// <4,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1168562053U,	// <4,u,5,2>: Cost 2 vmrghw RHS, <u,2,3,0>
+  1222230172U,	// <4,u,5,3>: Cost 2 vmrglw <2,3,4,5>, LHS
+  1168562239U,	// <4,u,5,4>: Cost 2 vmrghw RHS, <u,4,5,6>
+  94820506U,	// <4,u,5,5>: Cost 1 vmrghw RHS, RHS
+  1685280922U,	// <4,u,5,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  1222233416U,	// <4,u,5,7>: Cost 2 vmrglw <2,3,4,5>, RHS
+  94820709U,	// <4,u,5,u>: Cost 1 vmrghw RHS, LHS
+  1564713062U,	// <4,u,6,0>: Cost 2 vsldoi4 <3,4,u,6>, LHS
+  2626511979U,	// <4,u,6,1>: Cost 3 vsldoi4 <1,4,u,6>, <1,4,u,6>
+  2632484676U,	// <4,u,6,2>: Cost 3 vsldoi4 <2,4,u,6>, <2,4,u,6>
+  1564715549U,	// <4,u,6,3>: Cost 2 vsldoi4 <3,4,u,6>, <3,4,u,6>
+  1564716342U,	// <4,u,6,4>: Cost 2 vsldoi4 <3,4,u,6>, RHS
+  2242853018U,	// <4,u,6,5>: Cost 3 vmrghw <4,6,5,2>, RHS
+  2656375464U,	// <4,u,6,6>: Cost 3 vsldoi4 <6,4,u,6>, <6,4,u,6>
+  27705344U,	// <4,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,6,u>: Cost 0 copy RHS
+  2785859840U,	// <4,u,7,0>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,0,1>
+  2243499822U,	// <4,u,7,1>: Cost 3 vmrghw <4,7,5,0>, LHS
+  2727851197U,	// <4,u,7,2>: Cost 3 vsldoi8 <7,2,4,u>, <7,2,4,u>
+  2303951004U,	// <4,u,7,3>: Cost 3 vmrglw <3,6,4,7>, LHS
+  2785859880U,	// <4,u,7,4>: Cost 3 vsldoi12 <5,6,7,4>, <u,7,4,5>
+  2243500186U,	// <4,u,7,5>: Cost 3 vmrghw <4,7,5,0>, RHS
+  2730505729U,	// <4,u,7,6>: Cost 3 vsldoi8 <7,6,4,u>, <7,6,4,u>
+  2303954248U,	// <4,u,7,7>: Cost 3 vmrglw <3,6,4,7>, RHS
+  2303951009U,	// <4,u,7,u>: Cost 3 vmrglw <3,6,4,7>, LHS
+  1564729446U,	// <4,u,u,0>: Cost 2 vsldoi4 <3,4,u,u>, LHS
+  96810798U,	// <4,u,u,1>: Cost 1 vmrghw RHS, LHS
+  1685281125U,	// <4,u,u,2>: Cost 2 vsldoi12 <1,2,3,4>, LHS
+  1222254748U,	// <4,u,u,3>: Cost 2 vmrglw <2,3,4,u>, LHS
+  229035318U,	// <4,u,u,4>: Cost 1 vspltisw0 RHS
+  96811162U,	// <4,u,u,5>: Cost 1 vmrghw RHS, RHS
+  1685281165U,	// <4,u,u,6>: Cost 2 vsldoi12 <1,2,3,4>, RHS
+  27705344U,	// <4,u,u,7>: Cost 0 copy RHS
+  27705344U,	// <4,u,u,u>: Cost 0 copy RHS
+  2754232320U,	// <5,0,0,0>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,0,0>
+  2754232330U,	// <5,0,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,1,1>
+  3718194894U,	// <5,0,0,2>: Cost 4 vsldoi4 <4,5,0,0>, <2,3,4,5>
+  3376385762U,	// <5,0,0,3>: Cost 4 vmrglw <3,4,5,0>, <5,2,0,3>
+  2754232357U,	// <5,0,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,0,4,1>
+  3845816370U,	// <5,0,0,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,0,5,5>
+  3782353389U,	// <5,0,0,6>: Cost 4 vsldoi8 <4,0,5,0>, <0,6,0,7>
+  3376386090U,	// <5,0,0,7>: Cost 4 vmrglw <3,4,5,0>, <5,6,0,7>
+  2757402697U,	// <5,0,0,u>: Cost 3 vsldoi12 <0,u,u,5>, <0,0,u,1>
+  2626543718U,	// <5,0,1,0>: Cost 3 vsldoi4 <1,5,0,1>, LHS
+  2626544751U,	// <5,0,1,1>: Cost 3 vsldoi4 <1,5,0,1>, <1,5,0,1>
+  1680490598U,	// <5,0,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3766428665U,	// <5,0,1,3>: Cost 4 vsldoi8 <1,3,5,0>, <1,3,5,0>
+  2626546998U,	// <5,0,1,4>: Cost 3 vsldoi4 <1,5,0,1>, RHS
+  2650435539U,	// <5,0,1,5>: Cost 3 vsldoi4 <5,5,0,1>, <5,5,0,1>
+  3783017715U,	// <5,0,1,6>: Cost 4 vsldoi8 <4,1,5,0>, <1,6,5,7>
+  3385019000U,	// <5,0,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,0,7>
+  1680490652U,	// <5,0,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  3376398336U,	// <5,0,2,0>: Cost 4 vmrglw <3,4,5,2>, <0,0,0,0>
+  2245877862U,	// <5,0,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  3773064808U,	// <5,0,2,2>: Cost 4 vsldoi8 <2,4,5,0>, <2,2,2,2>
+  2705295054U,	// <5,0,2,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  3827974343U,	// <5,0,2,4>: Cost 4 vsldoi12 <0,4,1,5>, <0,2,4,1>
+  3845816530U,	// <5,0,2,5>: Cost 4 vsldoi12 <3,4,0,5>, <0,2,5,3>
+  3779037114U,	// <5,0,2,6>: Cost 4 vsldoi8 <3,4,5,0>, <2,6,3,7>
+  3810887658U,	// <5,0,2,7>: Cost 4 vsldoi8 <u,7,5,0>, <2,7,0,1>
+  2245878429U,	// <5,0,2,u>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2710603926U,	// <5,0,3,0>: Cost 3 vsldoi8 <4,3,5,0>, <3,0,1,2>
+  3827974396U,	// <5,0,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <0,3,1,0>
+  3779037516U,	// <5,0,3,2>: Cost 4 vsldoi8 <3,4,5,0>, <3,2,3,4>
+  3779037596U,	// <5,0,3,3>: Cost 4 vsldoi8 <3,4,5,0>, <3,3,3,3>
+  2705295868U,	// <5,0,3,4>: Cost 3 vsldoi8 <3,4,5,0>, <3,4,5,0>
+  3379726804U,	// <5,0,3,5>: Cost 4 vmrglw <4,0,5,3>, <3,4,0,5>
+  3802925748U,	// <5,0,3,6>: Cost 4 vsldoi8 <7,4,5,0>, <3,6,7,4>
+  3363138168U,	// <5,0,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,0,7>
+  2707950400U,	// <5,0,3,u>: Cost 3 vsldoi8 <3,u,5,0>, <3,u,5,0>
+  2626568294U,	// <5,0,4,0>: Cost 3 vsldoi4 <1,5,0,4>, LHS
+  1680490834U,	// <5,0,4,1>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  3828048219U,	// <5,0,4,2>: Cost 4 vsldoi12 <0,4,2,5>, <0,4,2,5>
+  2710604932U,	// <5,0,4,3>: Cost 3 vsldoi8 <4,3,5,0>, <4,3,5,0>
+  2754232685U,	// <5,0,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,4,4,5>
+  2705296694U,	// <5,0,4,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779038590U,	// <5,0,4,6>: Cost 4 vsldoi8 <3,4,5,0>, <4,6,5,7>
+  2713259464U,	// <5,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1680490834U,	// <5,0,4,u>: Cost 2 vsldoi12 <0,4,1,5>, <0,4,1,5>
+  2311307264U,	// <5,0,5,0>: Cost 3 vmrglw <4,u,5,5>, <0,0,0,0>
+  1174437990U,	// <5,0,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  3779038946U,	// <5,0,5,2>: Cost 4 vsldoi8 <3,4,5,0>, <5,2,0,3>
+  3845816752U,	// <5,0,5,3>: Cost 4 vsldoi12 <3,4,0,5>, <0,5,3,0>
+  2248180050U,	// <5,0,5,4>: Cost 3 vmrghw <5,5,5,5>, <0,4,1,5>
+  2248180194U,	// <5,0,5,5>: Cost 3 vmrghw <5,5,5,5>, <0,5,u,5>
+  3779039274U,	// <5,0,5,6>: Cost 4 vsldoi8 <3,4,5,0>, <5,6,0,7>
+  3385051768U,	// <5,0,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,0,7>
+  1174438557U,	// <5,0,5,u>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2302689280U,	// <5,0,6,0>: Cost 3 vmrglw <3,4,5,6>, <0,0,0,0>
+  1175208038U,	// <5,0,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  3787002362U,	// <5,0,6,2>: Cost 4 vsldoi8 <4,7,5,0>, <6,2,7,3>
+  3376432160U,	// <5,0,6,3>: Cost 4 vmrglw <3,4,5,6>, <1,4,0,3>
+  2248950098U,	// <5,0,6,4>: Cost 3 vmrghw <5,6,7,0>, <0,4,1,5>
+  2248950180U,	// <5,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  3376433702U,	// <5,0,6,6>: Cost 4 vmrglw <3,4,5,6>, <3,5,0,6>
+  2729186166U,	// <5,0,6,7>: Cost 3 vsldoi8 <7,4,5,0>, <6,7,4,5>
+  1175208605U,	// <5,0,6,u>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2713261050U,	// <5,0,7,0>: Cost 3 vsldoi8 <4,7,5,0>, <7,0,1,2>
+  3365823599U,	// <5,0,7,1>: Cost 4 vmrglw <1,6,5,7>, <1,5,0,1>
+  3808900317U,	// <5,0,7,2>: Cost 4 vsldoi8 <u,4,5,0>, <7,2,u,4>
+  3784348899U,	// <5,0,7,3>: Cost 4 vsldoi8 <4,3,5,0>, <7,3,0,1>
+  2729186656U,	// <5,0,7,4>: Cost 3 vsldoi8 <7,4,5,0>, <7,4,5,0>
+  3787003268U,	// <5,0,7,5>: Cost 4 vsldoi8 <4,7,5,0>, <7,5,0,0>
+  3802928664U,	// <5,0,7,6>: Cost 4 vsldoi8 <7,4,5,0>, <7,6,7,4>
+  3787003431U,	// <5,0,7,7>: Cost 4 vsldoi8 <4,7,5,0>, <7,7,0,1>
+  2731841188U,	// <5,0,7,u>: Cost 3 vsldoi8 <7,u,5,0>, <7,u,5,0>
+  2626601062U,	// <5,0,u,0>: Cost 3 vsldoi4 <1,5,0,u>, LHS
+  1683145366U,	// <5,0,u,1>: Cost 2 vsldoi12 <0,u,1,5>, <0,u,1,5>
+  1680491165U,	// <5,0,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2705295054U,	// <5,0,u,3>: Cost 3 vsldoi8 <3,4,5,0>, <2,3,4,5>
+  2754233005U,	// <5,0,u,4>: Cost 3 vsldoi12 <0,4,1,5>, <0,u,4,1>
+  2705299610U,	// <5,0,u,5>: Cost 3 vsldoi8 <3,4,5,0>, RHS
+  3779041488U,	// <5,0,u,6>: Cost 4 vsldoi8 <3,4,5,0>, <u,6,3,7>
+  2737150252U,	// <5,0,u,7>: Cost 3 vsldoi8 <u,7,5,0>, <u,7,5,0>
+  1680491219U,	// <5,0,u,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2713927680U,	// <5,1,0,0>: Cost 3 vsldoi8 <4,u,5,1>, <0,0,0,0>
+  1640185958U,	// <5,1,0,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2310607866U,	// <5,1,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  3787669756U,	// <5,1,0,3>: Cost 4 vsldoi8 <4,u,5,1>, <0,3,1,0>
+  2713928018U,	// <5,1,0,4>: Cost 3 vsldoi8 <4,u,5,1>, <0,4,1,5>
+  2306621778U,	// <5,1,0,5>: Cost 3 vmrglw <4,1,5,0>, <0,4,1,5>
+  3787670006U,	// <5,1,0,6>: Cost 4 vsldoi8 <4,u,5,1>, <0,6,1,7>
+  3736188301U,	// <5,1,0,7>: Cost 4 vsldoi4 <7,5,1,0>, <7,5,1,0>
+  1640186525U,	// <5,1,0,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2650505318U,	// <5,1,1,0>: Cost 3 vsldoi4 <5,5,1,1>, LHS
+  2754233140U,	// <5,1,1,1>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,1,1>
+  2311276694U,	// <5,1,1,2>: Cost 3 vmrglw <4,u,5,1>, <3,0,1,2>
+  2311278315U,	// <5,1,1,3>: Cost 3 vmrglw <4,u,5,1>, <5,2,1,3>
+  2758435667U,	// <5,1,1,4>: Cost 3 vsldoi12 <1,1,4,5>, <1,1,4,5>
+  2754233180U,	// <5,1,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,1,5,5>
+  3385016497U,	// <5,1,1,6>: Cost 4 vmrglw <4,u,5,1>, <0,2,1,6>
+  2311278643U,	// <5,1,1,7>: Cost 3 vmrglw <4,u,5,1>, <5,6,1,7>
+  2758730615U,	// <5,1,1,u>: Cost 3 vsldoi12 <1,1,u,5>, <1,1,u,5>
+  3700367462U,	// <5,1,2,0>: Cost 4 vsldoi4 <1,5,1,2>, LHS
+  3830629255U,	// <5,1,2,1>: Cost 4 vsldoi12 <0,u,1,5>, <1,2,1,3>
+  2713929320U,	// <5,1,2,2>: Cost 3 vsldoi8 <4,u,5,1>, <2,2,2,2>
+  2754233238U,	// <5,1,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,3,0>
+  2759099300U,	// <5,1,2,4>: Cost 3 vsldoi12 <1,2,4,5>, <1,2,4,5>
+  2754233259U,	// <5,1,2,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,5,3>
+  2713929658U,	// <5,1,2,6>: Cost 3 vsldoi8 <4,u,5,1>, <2,6,3,7>
+  3872359354U,	// <5,1,2,7>: Cost 4 vsldoi12 <7,u,0,5>, <1,2,7,0>
+  2754233283U,	// <5,1,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,2,u,0>
+  2713929878U,	// <5,1,3,0>: Cost 3 vsldoi8 <4,u,5,1>, <3,0,1,2>
+  3363135498U,	// <5,1,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,1,1>
+  3363137686U,	// <5,1,3,2>: Cost 4 vmrglw <1,2,5,3>, <3,0,1,2>
+  2713930140U,	// <5,1,3,3>: Cost 3 vsldoi8 <4,u,5,1>, <3,3,3,3>
+  2713930242U,	// <5,1,3,4>: Cost 3 vsldoi8 <4,u,5,1>, <3,4,5,6>
+  2289394002U,	// <5,1,3,5>: Cost 3 vmrglw <1,2,5,3>, <0,4,1,5>
+  3787672184U,	// <5,1,3,6>: Cost 4 vsldoi8 <4,u,5,1>, <3,6,0,7>
+  3787672259U,	// <5,1,3,7>: Cost 4 vsldoi8 <4,u,5,1>, <3,7,0,1>
+  2713930526U,	// <5,1,3,u>: Cost 3 vsldoi8 <4,u,5,1>, <3,u,1,2>
+  1634880402U,	// <5,1,4,0>: Cost 2 vsldoi8 <4,0,5,1>, <4,0,5,1>
+  2760205355U,	// <5,1,4,1>: Cost 3 vsldoi12 <1,4,1,5>, <1,4,1,5>
+  2760279092U,	// <5,1,4,2>: Cost 3 vsldoi12 <1,4,2,5>, <1,4,2,5>
+  3787672708U,	// <5,1,4,3>: Cost 4 vsldoi8 <4,u,5,1>, <4,3,5,0>
+  2713930960U,	// <5,1,4,4>: Cost 3 vsldoi8 <4,u,5,1>, <4,4,4,4>
+  1640189238U,	// <5,1,4,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  3786345848U,	// <5,1,4,6>: Cost 4 vsldoi8 <4,6,5,1>, <4,6,5,1>
+  3787009481U,	// <5,1,4,7>: Cost 4 vsldoi8 <4,7,5,1>, <4,7,5,1>
+  1640189466U,	// <5,1,4,u>: Cost 2 vsldoi8 <4,u,5,1>, <4,u,5,1>
+  2754233455U,	// <5,1,5,0>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,0,1>
+  2713931407U,	// <5,1,5,1>: Cost 3 vsldoi8 <4,u,5,1>, <5,1,0,1>
+  2713931499U,	// <5,1,5,2>: Cost 3 vsldoi8 <4,u,5,1>, <5,2,1,3>
+  3827975305U,	// <5,1,5,3>: Cost 4 vsldoi12 <0,4,1,5>, <1,5,3,0>
+  2754233495U,	// <5,1,5,4>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,4,5>
+  2288746834U,	// <5,1,5,5>: Cost 3 vmrglw <1,1,5,5>, <0,4,1,5>
+  2713931827U,	// <5,1,5,6>: Cost 3 vsldoi8 <4,u,5,1>, <5,6,1,7>
+  3787673725U,	// <5,1,5,7>: Cost 4 vsldoi8 <4,u,5,1>, <5,7,1,0>
+  2754233527U,	// <5,1,5,u>: Cost 3 vsldoi12 <0,4,1,5>, <1,5,u,1>
+  2668462182U,	// <5,1,6,0>: Cost 3 vsldoi4 <u,5,1,6>, LHS
+  2290746002U,	// <5,1,6,1>: Cost 3 vmrglw <1,4,5,6>, <0,u,1,1>
+  2302691478U,	// <5,1,6,2>: Cost 3 vmrglw <3,4,5,6>, <3,0,1,2>
+  3364488071U,	// <5,1,6,3>: Cost 4 vmrglw <1,4,5,6>, <1,2,1,3>
+  2302689536U,	// <5,1,6,4>: Cost 3 vmrglw <3,4,5,6>, <0,3,1,4>
+  2754233587U,	// <5,1,6,5>: Cost 3 vsldoi12 <0,4,1,5>, <1,6,5,7>
+  2713932600U,	// <5,1,6,6>: Cost 3 vsldoi8 <4,u,5,1>, <6,6,6,6>
+  2713932622U,	// <5,1,6,7>: Cost 3 vsldoi8 <4,u,5,1>, <6,7,0,1>
+  2302689297U,	// <5,1,6,u>: Cost 3 vmrglw <3,4,5,6>, <0,0,1,u>
+  2713932794U,	// <5,1,7,0>: Cost 3 vsldoi8 <4,u,5,1>, <7,0,1,2>
+  3365822474U,	// <5,1,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,1,1>
+  3365824662U,	// <5,1,7,2>: Cost 4 vmrglw <1,6,5,7>, <3,0,1,2>
+  3787674851U,	// <5,1,7,3>: Cost 4 vsldoi8 <4,u,5,1>, <7,3,0,1>
+  2713933158U,	// <5,1,7,4>: Cost 3 vsldoi8 <4,u,5,1>, <7,4,5,6>
+  2292080978U,	// <5,1,7,5>: Cost 3 vmrglw <1,6,5,7>, <0,4,1,5>
+  3365823613U,	// <5,1,7,6>: Cost 4 vmrglw <1,6,5,7>, <1,5,1,6>
+  2713933420U,	// <5,1,7,7>: Cost 3 vsldoi8 <4,u,5,1>, <7,7,7,7>
+  2713933442U,	// <5,1,7,u>: Cost 3 vsldoi8 <4,u,5,1>, <7,u,1,2>
+  1658771190U,	// <5,1,u,0>: Cost 2 vsldoi8 <u,0,5,1>, <u,0,5,1>
+  1640191790U,	// <5,1,u,1>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  2762933624U,	// <5,1,u,2>: Cost 3 vsldoi12 <1,u,2,5>, <1,u,2,5>
+  2754233724U,	// <5,1,u,3>: Cost 3 vsldoi12 <0,4,1,5>, <1,u,3,0>
+  2763081098U,	// <5,1,u,4>: Cost 3 vsldoi12 <1,u,4,5>, <1,u,4,5>
+  1640192154U,	// <5,1,u,5>: Cost 2 vsldoi8 <4,u,5,1>, RHS
+  2713934032U,	// <5,1,u,6>: Cost 3 vsldoi8 <4,u,5,1>, <u,6,3,7>
+  2713934080U,	// <5,1,u,7>: Cost 3 vsldoi8 <4,u,5,1>, <u,7,0,1>
+  1640192357U,	// <5,1,u,u>: Cost 2 vsldoi8 <4,u,5,1>, LHS
+  3779051520U,	// <5,2,0,0>: Cost 4 vsldoi8 <3,4,5,2>, <0,0,0,0>
+  2705309798U,	// <5,2,0,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  3838813637U,	// <5,2,0,2>: Cost 4 vsldoi12 <2,2,4,5>, <2,0,2,1>
+  2302640230U,	// <5,2,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3765117266U,	// <5,2,0,4>: Cost 4 vsldoi8 <1,1,5,2>, <0,4,1,5>
+  3381027892U,	// <5,2,0,5>: Cost 4 vmrglw <4,2,5,0>, <1,4,2,5>
+  3842794985U,	// <5,2,0,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,0,6,1>
+  3408232554U,	// <5,2,0,7>: Cost 4 vmrglw <u,7,5,0>, <0,1,2,7>
+  2302640235U,	// <5,2,0,u>: Cost 3 vmrglw <3,4,5,0>, LHS
+  3700432998U,	// <5,2,1,0>: Cost 4 vsldoi4 <1,5,2,1>, LHS
+  3765117785U,	// <5,2,1,1>: Cost 4 vsldoi8 <1,1,5,2>, <1,1,5,2>
+  2311276136U,	// <5,2,1,2>: Cost 3 vmrglw <4,u,5,1>, <2,2,2,2>
+  1237532774U,	// <5,2,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700436278U,	// <5,2,1,4>: Cost 4 vsldoi4 <1,5,2,1>, RHS
+  3381036084U,	// <5,2,1,5>: Cost 4 vmrglw <4,2,5,1>, <1,4,2,5>
+  3385018045U,	// <5,2,1,6>: Cost 4 vmrglw <4,u,5,1>, <2,3,2,6>
+  3385017560U,	// <5,2,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,2,7>
+  1237532779U,	// <5,2,1,u>: Cost 2 vmrglw <4,u,5,1>, LHS
+  3700441190U,	// <5,2,2,0>: Cost 4 vsldoi4 <1,5,2,2>, LHS
+  3700442242U,	// <5,2,2,1>: Cost 4 vsldoi4 <1,5,2,2>, <1,5,2,2>
+  2754233960U,	// <5,2,2,2>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,2,2>
+  2754233970U,	// <5,2,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,3,3>
+  2765071997U,	// <5,2,2,4>: Cost 3 vsldoi12 <2,2,4,5>, <2,2,4,5>
+  3834021508U,	// <5,2,2,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,2,5,3>
+  3842795152U,	// <5,2,2,6>: Cost 4 vsldoi12 <2,u,4,5>, <2,2,6,6>
+  3376402492U,	// <5,2,2,7>: Cost 4 vmrglw <3,4,5,2>, <5,6,2,7>
+  2754234015U,	// <5,2,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <2,2,u,3>
+  2754234022U,	// <5,2,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,3,0,1>
+  3827975855U,	// <5,2,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <2,3,1,1>
+  2644625102U,	// <5,2,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393766U,	// <5,2,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1691993806U,	// <5,2,3,4>: Cost 2 vsldoi12 <2,3,4,5>, <2,3,4,5>
+  2785052375U,	// <5,2,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <2,3,5,5>
+  3854812897U,	// <5,2,3,6>: Cost 4 vsldoi12 <4,u,5,5>, <2,3,6,6>
+  3802942187U,	// <5,2,3,7>: Cost 4 vsldoi8 <7,4,5,2>, <3,7,4,5>
+  1692288754U,	// <5,2,3,u>: Cost 2 vsldoi12 <2,3,u,5>, <2,3,u,5>
+  3839846139U,	// <5,2,4,0>: Cost 4 vsldoi12 <2,4,0,5>, <2,4,0,5>
+  2709294052U,	// <5,2,4,1>: Cost 3 vsldoi8 <4,1,5,2>, <4,1,5,2>
+  2766251789U,	// <5,2,4,2>: Cost 3 vsldoi12 <2,4,2,5>, <2,4,2,5>
+  2765735702U,	// <5,2,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,3,5>
+  3840141087U,	// <5,2,4,4>: Cost 4 vsldoi12 <2,4,4,5>, <2,4,4,5>
+  2705313078U,	// <5,2,4,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2712612217U,	// <5,2,4,6>: Cost 3 vsldoi8 <4,6,5,2>, <4,6,5,2>
+  3787017674U,	// <5,2,4,7>: Cost 4 vsldoi8 <4,7,5,2>, <4,7,5,2>
+  2765735747U,	// <5,2,4,u>: Cost 3 vsldoi12 <2,3,4,5>, <2,4,u,5>
+  3834021704U,	// <5,2,5,0>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,0,1>
+  3834021714U,	// <5,2,5,1>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,1,2>
+  2311308904U,	// <5,2,5,2>: Cost 3 vmrglw <4,u,5,5>, <2,2,2,2>
+  1237565542U,	// <5,2,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3834021744U,	// <5,2,5,4>: Cost 4 vsldoi12 <1,4,2,5>, <2,5,4,5>
+  3369124916U,	// <5,2,5,5>: Cost 4 vmrglw <2,2,5,5>, <1,4,2,5>
+  2248181690U,	// <5,2,5,6>: Cost 3 vmrghw <5,5,5,5>, <2,6,3,7>
+  3786354825U,	// <5,2,5,7>: Cost 4 vsldoi8 <4,6,5,2>, <5,7,2,3>
+  1237565547U,	// <5,2,5,u>: Cost 2 vmrglw <4,u,5,5>, LHS
+  3700473958U,	// <5,2,6,0>: Cost 4 vsldoi4 <1,5,2,6>, LHS
+  3700475014U,	// <5,2,6,1>: Cost 4 vsldoi4 <1,5,2,6>, <1,5,2,6>
+  2296718952U,	// <5,2,6,2>: Cost 3 vmrglw <2,4,5,6>, <2,2,2,2>
+  1228947558U,	// <5,2,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3700477238U,	// <5,2,6,4>: Cost 4 vsldoi4 <1,5,2,6>, RHS
+  3834021836U,	// <5,2,6,5>: Cost 4 vsldoi12 <1,4,2,5>, <2,6,5,7>
+  2248951738U,	// <5,2,6,6>: Cost 3 vmrghw <5,6,7,0>, <2,6,3,7>
+  3370461105U,	// <5,2,6,7>: Cost 4 vmrglw <2,4,5,6>, <2,6,2,7>
+  1228947563U,	// <5,2,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  3786355706U,	// <5,2,7,0>: Cost 4 vsldoi8 <4,6,5,2>, <7,0,1,2>
+  3783038037U,	// <5,2,7,1>: Cost 4 vsldoi8 <4,1,5,2>, <7,1,2,3>
+  3365824104U,	// <5,2,7,2>: Cost 4 vmrglw <1,6,5,7>, <2,2,2,2>
+  2292080742U,	// <5,2,7,3>: Cost 3 vmrglw <1,6,5,7>, LHS
+  3842131986U,	// <5,2,7,4>: Cost 4 vsldoi12 <2,7,4,5>, <2,7,4,5>
+  3371795508U,	// <5,2,7,5>: Cost 4 vmrglw <2,6,5,7>, <1,4,2,5>
+  3786356206U,	// <5,2,7,6>: Cost 4 vsldoi8 <4,6,5,2>, <7,6,2,7>
+  3786356332U,	// <5,2,7,7>: Cost 4 vsldoi8 <4,6,5,2>, <7,7,7,7>
+  2292080747U,	// <5,2,7,u>: Cost 3 vmrglw <1,6,5,7>, LHS
+  2754234427U,	// <5,2,u,0>: Cost 3 vsldoi12 <0,4,1,5>, <2,u,0,1>
+  2705315630U,	// <5,2,u,1>: Cost 3 vsldoi8 <3,4,5,2>, LHS
+  2296735336U,	// <5,2,u,2>: Cost 3 vmrglw <2,4,5,u>, <2,2,2,2>
+  1228963942U,	// <5,2,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  1695311971U,	// <5,2,u,4>: Cost 2 vsldoi12 <2,u,4,5>, <2,u,4,5>
+  2705315994U,	// <5,2,u,5>: Cost 3 vsldoi8 <3,4,5,2>, RHS
+  2769201269U,	// <5,2,u,6>: Cost 3 vsldoi12 <2,u,6,5>, <2,u,6,5>
+  3370477489U,	// <5,2,u,7>: Cost 4 vmrglw <2,4,5,u>, <2,6,2,7>
+  1695606919U,	// <5,2,u,u>: Cost 2 vsldoi12 <2,u,u,5>, <2,u,u,5>
+  3827976331U,	// <5,3,0,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,0,0,0>
+  2754234518U,	// <5,3,0,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,1,2>
+  3706472290U,	// <5,3,0,2>: Cost 4 vsldoi4 <2,5,3,0>, <2,5,3,0>
+  3700500630U,	// <5,3,0,3>: Cost 4 vsldoi4 <1,5,3,0>, <3,0,1,2>
+  2754234544U,	// <5,3,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,4,1>
+  3376383766U,	// <5,3,0,5>: Cost 4 vmrglw <3,4,5,0>, <2,4,3,5>
+  3769770513U,	// <5,3,0,6>: Cost 5 vsldoi8 <1,u,5,3>, <0,6,4,7>
+  3376383930U,	// <5,3,0,7>: Cost 4 vmrglw <3,4,5,0>, <2,6,3,7>
+  2754234581U,	// <5,3,0,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,0,u,2>
+  2311275414U,	// <5,3,1,0>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,0>
+  2305967971U,	// <5,3,1,1>: Cost 3 vmrglw <4,0,5,1>, <2,5,3,1>
+  2692047787U,	// <5,3,1,2>: Cost 3 vsldoi8 <1,2,5,3>, <1,2,5,3>
+  2311276146U,	// <5,3,1,3>: Cost 3 vmrglw <4,u,5,1>, <2,2,3,3>
+  2311275418U,	// <5,3,1,4>: Cost 3 vmrglw <4,u,5,1>, <1,2,3,4>
+  3765789807U,	// <5,3,1,5>: Cost 4 vsldoi8 <1,2,5,3>, <1,5,0,1>
+  3765789939U,	// <5,3,1,6>: Cost 4 vsldoi8 <1,2,5,3>, <1,6,5,7>
+  2311276474U,	// <5,3,1,7>: Cost 3 vmrglw <4,u,5,1>, <2,6,3,7>
+  2696029585U,	// <5,3,1,u>: Cost 3 vsldoi8 <1,u,5,3>, <1,u,5,3>
+  2311288709U,	// <5,3,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  3765790243U,	// <5,3,2,1>: Cost 4 vsldoi8 <1,2,5,3>, <2,1,3,5>
+  3827976513U,	// <5,3,2,2>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,2,2>
+  2765736268U,	// <5,3,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <3,2,3,4>
+  2246248962U,	// <5,3,2,4>: Cost 3 vmrghw <5,2,6,3>, <3,4,5,6>
+  3765790563U,	// <5,3,2,5>: Cost 4 vsldoi8 <1,2,5,3>, <2,5,3,1>
+  3827976550U,	// <5,3,2,6>: Cost 4 vsldoi12 <0,4,1,5>, <3,2,6,3>
+  3842795887U,	// <5,3,2,7>: Cost 4 vsldoi12 <2,u,4,5>, <3,2,7,3>
+  2769054073U,	// <5,3,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <3,2,u,4>
+  3827976575U,	// <5,3,3,0>: Cost 4 vsldoi12 <0,4,1,5>, <3,3,0,1>
+  3765790963U,	// <5,3,3,1>: Cost 4 vsldoi8 <1,2,5,3>, <3,1,2,5>
+  3839478162U,	// <5,3,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <3,3,2,2>
+  2754234780U,	// <5,3,3,3>: Cost 3 vsldoi12 <0,4,1,5>, <3,3,3,3>
+  2771708327U,	// <5,3,3,4>: Cost 3 vsldoi12 <3,3,4,5>, <3,3,4,5>
+  3363137059U,	// <5,3,3,5>: Cost 4 vmrglw <1,2,5,3>, <2,1,3,5>
+  3375081320U,	// <5,3,3,6>: Cost 4 vmrglw <3,2,5,3>, <2,5,3,6>
+  3363137466U,	// <5,3,3,7>: Cost 4 vmrglw <1,2,5,3>, <2,6,3,7>
+  2772003275U,	// <5,3,3,u>: Cost 3 vsldoi12 <3,3,u,5>, <3,3,u,5>
+  2772077012U,	// <5,3,4,0>: Cost 3 vsldoi12 <3,4,0,5>, <3,4,0,5>
+  3765791714U,	// <5,3,4,1>: Cost 4 vsldoi8 <1,2,5,3>, <4,1,5,0>
+  2709965878U,	// <5,3,4,2>: Cost 3 vsldoi8 <4,2,5,3>, <4,2,5,3>
+  2772298223U,	// <5,3,4,3>: Cost 3 vsldoi12 <3,4,3,5>, <3,4,3,5>
+  2772371960U,	// <5,3,4,4>: Cost 3 vsldoi12 <3,4,4,5>, <3,4,4,5>
+  2754234882U,	// <5,3,4,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,5,6>
+  3839478282U,	// <5,3,4,6>: Cost 4 vsldoi12 <2,3,4,5>, <3,4,6,5>
+  3376416698U,	// <5,3,4,7>: Cost 4 vmrglw <3,4,5,4>, <2,6,3,7>
+  2754234909U,	// <5,3,4,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,4,u,6>
+  2311308182U,	// <5,3,5,0>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,0>
+  3765792421U,	// <5,3,5,1>: Cost 4 vsldoi8 <1,2,5,3>, <5,1,2,5>
+  2715938575U,	// <5,3,5,2>: Cost 3 vsldoi8 <5,2,5,3>, <5,2,5,3>
+  2311308914U,	// <5,3,5,3>: Cost 3 vmrglw <4,u,5,5>, <2,2,3,3>
+  2311308186U,	// <5,3,5,4>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,4>
+  2248182354U,	// <5,3,5,5>: Cost 3 vmrghw <5,5,5,5>, <3,5,5,5>
+  3765792837U,	// <5,3,5,6>: Cost 4 vsldoi8 <1,2,5,3>, <5,6,3,7>
+  2311309242U,	// <5,3,5,7>: Cost 3 vmrglw <4,u,5,5>, <2,6,3,7>
+  2311308190U,	// <5,3,5,u>: Cost 3 vmrglw <4,u,5,5>, <1,2,3,u>
+  2632777830U,	// <5,3,6,0>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3706520372U,	// <5,3,6,1>: Cost 4 vsldoi4 <2,5,3,6>, <1,1,1,1>
+  2632779624U,	// <5,3,6,2>: Cost 3 vsldoi4 <2,5,3,6>, <2,5,3,6>
+  2632780290U,	// <5,3,6,3>: Cost 3 vsldoi4 <2,5,3,6>, <3,4,5,6>
+  2632781110U,	// <5,3,6,4>: Cost 3 vsldoi4 <2,5,3,6>, RHS
+  2248952413U,	// <5,3,6,5>: Cost 3 vmrghw <5,6,7,0>, <3,5,6,7>
+  2302691176U,	// <5,3,6,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302691258U,	// <5,3,6,7>: Cost 3 vmrglw <3,4,5,6>, <2,6,3,7>
+  2632783662U,	// <5,3,6,u>: Cost 3 vsldoi4 <2,5,3,6>, LHS
+  3365823382U,	// <5,3,7,0>: Cost 4 vmrglw <1,6,5,7>, <1,2,3,0>
+  3706529011U,	// <5,3,7,1>: Cost 4 vsldoi4 <2,5,3,7>, <1,6,5,7>
+  3706529641U,	// <5,3,7,2>: Cost 4 vsldoi4 <2,5,3,7>, <2,5,3,7>
+  3365824114U,	// <5,3,7,3>: Cost 4 vmrglw <1,6,5,7>, <2,2,3,3>
+  2774362859U,	// <5,3,7,4>: Cost 3 vsldoi12 <3,7,4,5>, <3,7,4,5>
+  3365824035U,	// <5,3,7,5>: Cost 4 vmrglw <1,6,5,7>, <2,1,3,5>
+  3383740183U,	// <5,3,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,4,3,6>
+  3363833786U,	// <5,3,7,7>: Cost 4 vmrglw <1,3,5,7>, <2,6,3,7>
+  2774657807U,	// <5,3,7,u>: Cost 3 vsldoi12 <3,7,u,5>, <3,7,u,5>
+  2632794214U,	// <5,3,u,0>: Cost 3 vsldoi4 <2,5,3,u>, LHS
+  2754235166U,	// <5,3,u,1>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,1,2>
+  2632796010U,	// <5,3,u,2>: Cost 3 vsldoi4 <2,5,3,u>, <2,5,3,u>
+  2632796676U,	// <5,3,u,3>: Cost 3 vsldoi4 <2,5,3,u>, <3,4,5,u>
+  2632797494U,	// <5,3,u,4>: Cost 3 vsldoi4 <2,5,3,u>, RHS
+  2754235206U,	// <5,3,u,5>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,5,6>
+  2302691176U,	// <5,3,u,6>: Cost 3 vmrglw <3,4,5,6>, <2,5,3,6>
+  2302707642U,	// <5,3,u,7>: Cost 3 vmrglw <3,4,5,u>, <2,6,3,7>
+  2754235229U,	// <5,3,u,u>: Cost 3 vsldoi12 <0,4,1,5>, <3,u,u,2>
+  3765133325U,	// <5,4,0,0>: Cost 4 vsldoi8 <1,1,5,4>, <0,0,1,4>
+  2705326182U,	// <5,4,0,1>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  3718489806U,	// <5,4,0,2>: Cost 4 vsldoi4 <4,5,4,0>, <2,3,4,5>
+  3718490624U,	// <5,4,0,3>: Cost 4 vsldoi4 <4,5,4,0>, <3,4,5,4>
+  2709307730U,	// <5,4,0,4>: Cost 3 vsldoi8 <4,1,5,4>, <0,4,1,5>
+  2302641870U,	// <5,4,0,5>: Cost 3 vmrglw <3,4,5,0>, <2,3,4,5>
+  3376383695U,	// <5,4,0,6>: Cost 5 vmrglw <3,4,5,0>, <2,3,4,6>
+  3384351018U,	// <5,4,0,7>: Cost 4 vmrglw <4,7,5,0>, <u,7,4,7>
+  2705326749U,	// <5,4,0,u>: Cost 3 vsldoi8 <3,4,5,4>, LHS
+  2305971057U,	// <5,4,1,0>: Cost 3 vmrglw <4,0,5,1>, <6,7,4,0>
+  3765134171U,	// <5,4,1,1>: Cost 4 vsldoi8 <1,1,5,4>, <1,1,5,4>
+  3766461338U,	// <5,4,1,2>: Cost 4 vsldoi8 <1,3,5,4>, <1,2,3,4>
+  3766461437U,	// <5,4,1,3>: Cost 4 vsldoi8 <1,3,5,4>, <1,3,5,4>
+  2311277776U,	// <5,4,1,4>: Cost 3 vmrglw <4,u,5,1>, <4,4,4,4>
+  2754235362U,	// <5,4,1,5>: Cost 3 vsldoi12 <0,4,1,5>, <4,1,5,0>
+  3783050483U,	// <5,4,1,6>: Cost 4 vsldoi8 <4,1,5,4>, <1,6,5,7>
+  3385019036U,	// <5,4,1,7>: Cost 4 vmrglw <4,u,5,1>, <3,6,4,7>
+  2311276241U,	// <5,4,1,u>: Cost 3 vmrglw <4,u,5,1>, <2,3,4,u>
+  3718504550U,	// <5,4,2,0>: Cost 4 vsldoi4 <4,5,4,2>, LHS
+  3783050787U,	// <5,4,2,1>: Cost 4 vsldoi8 <4,1,5,4>, <2,1,3,5>
+  3773097576U,	// <5,4,2,2>: Cost 4 vsldoi8 <2,4,5,4>, <2,2,2,2>
+  2705327822U,	// <5,4,2,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  3773097767U,	// <5,4,2,4>: Cost 4 vsldoi8 <2,4,5,4>, <2,4,5,4>
+  2765737014U,	// <5,4,2,5>: Cost 3 vsldoi12 <2,3,4,5>, <4,2,5,3>
+  3779069882U,	// <5,4,2,6>: Cost 4 vsldoi8 <3,4,5,4>, <2,6,3,7>
+  3376401052U,	// <5,4,2,7>: Cost 5 vmrglw <3,4,5,2>, <3,6,4,7>
+  2245881370U,	// <5,4,2,u>: Cost 3 vmrghw <5,2,1,3>, <4,u,5,1>
+  3779070102U,	// <5,4,3,0>: Cost 4 vsldoi8 <3,4,5,4>, <3,0,1,2>
+  3363135525U,	// <5,4,3,1>: Cost 4 vmrglw <1,2,5,3>, <0,0,4,1>
+  3779070284U,	// <5,4,3,2>: Cost 4 vsldoi8 <3,4,5,4>, <3,2,3,4>
+  3779070364U,	// <5,4,3,3>: Cost 4 vsldoi8 <3,4,5,4>, <3,3,3,3>
+  2705328640U,	// <5,4,3,4>: Cost 3 vsldoi8 <3,4,5,4>, <3,4,5,4>
+  2307311310U,	// <5,4,3,5>: Cost 3 vmrglw <4,2,5,3>, <2,3,4,5>
+  3866021012U,	// <5,4,3,6>: Cost 4 vsldoi12 <6,7,4,5>, <4,3,6,7>
+  3363138204U,	// <5,4,3,7>: Cost 5 vmrglw <1,2,5,3>, <3,6,4,7>
+  2707983172U,	// <5,4,3,u>: Cost 3 vsldoi8 <3,u,5,4>, <3,u,5,4>
+  2708646805U,	// <5,4,4,0>: Cost 3 vsldoi8 <4,0,5,4>, <4,0,5,4>
+  2709310438U,	// <5,4,4,1>: Cost 3 vsldoi8 <4,1,5,4>, <4,1,5,4>
+  3779071030U,	// <5,4,4,2>: Cost 4 vsldoi8 <3,4,5,4>, <4,2,5,3>
+  2710637704U,	// <5,4,4,3>: Cost 3 vsldoi8 <4,3,5,4>, <4,3,5,4>
+  2754235600U,	// <5,4,4,4>: Cost 3 vsldoi12 <0,4,1,5>, <4,4,4,4>
+  1704676570U,	// <5,4,4,5>: Cost 2 vsldoi12 <4,4,5,5>, <4,4,5,5>
+  3779071358U,	// <5,4,4,6>: Cost 4 vsldoi8 <3,4,5,4>, <4,6,5,7>
+  2713292236U,	// <5,4,4,7>: Cost 3 vsldoi8 <4,7,5,4>, <4,7,5,4>
+  1704897781U,	// <5,4,4,u>: Cost 2 vsldoi12 <4,4,u,5>, <4,4,u,5>
+  2626871398U,	// <5,4,5,0>: Cost 3 vsldoi4 <1,5,4,5>, LHS
+  2626872471U,	// <5,4,5,1>: Cost 3 vsldoi4 <1,5,4,5>, <1,5,4,5>
+  2765737230U,	// <5,4,5,2>: Cost 3 vsldoi12 <2,3,4,5>, <4,5,2,3>
+  3700615318U,	// <5,4,5,3>: Cost 4 vsldoi4 <1,5,4,5>, <3,0,1,2>
+  2626874678U,	// <5,4,5,4>: Cost 3 vsldoi4 <1,5,4,5>, RHS
+  1174441270U,	// <5,4,5,5>: Cost 2 vmrghw <5,5,5,5>, RHS
+  1680493878U,	// <5,4,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  3385051804U,	// <5,4,5,7>: Cost 4 vmrglw <4,u,5,5>, <3,6,4,7>
+  1680493896U,	// <5,4,5,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2248952722U,	// <5,4,6,0>: Cost 3 vmrghw <5,6,7,0>, <4,0,5,1>
+  2302692152U,	// <5,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  3382406107U,	// <5,4,6,2>: Cost 4 vmrglw <4,4,5,6>, <4,1,4,2>
+  3700623874U,	// <5,4,6,3>: Cost 4 vsldoi4 <1,5,4,6>, <3,4,5,6>
+  2248953040U,	// <5,4,6,4>: Cost 3 vmrghw <5,6,7,0>, <4,4,4,4>
+  1175211318U,	// <5,4,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3376432280U,	// <5,4,6,6>: Cost 4 vmrglw <3,4,5,6>, <1,5,4,6>
+  2729218934U,	// <5,4,6,7>: Cost 3 vsldoi8 <7,4,5,4>, <6,7,4,5>
+  1175211561U,	// <5,4,6,u>: Cost 2 vmrghw <5,6,7,0>, RHS
+  3787035642U,	// <5,4,7,0>: Cost 4 vsldoi8 <4,7,5,4>, <7,0,1,2>
+  3365822501U,	// <5,4,7,1>: Cost 4 vmrglw <1,6,5,7>, <0,0,4,1>
+  3808933085U,	// <5,4,7,2>: Cost 4 vsldoi8 <u,4,5,4>, <7,2,u,4>
+  3784381707U,	// <5,4,7,3>: Cost 4 vsldoi8 <4,3,5,4>, <7,3,4,5>
+  2713294182U,	// <5,4,7,4>: Cost 3 vsldoi8 <4,7,5,4>, <7,4,5,6>
+  2309998286U,	// <5,4,7,5>: Cost 3 vmrglw <4,6,5,7>, <2,3,4,5>
+  3383740111U,	// <5,4,7,6>: Cost 4 vmrglw <4,6,5,7>, <2,3,4,6>
+  3787036239U,	// <5,4,7,7>: Cost 4 vsldoi8 <4,7,5,4>, <7,7,4,5>
+  2731873960U,	// <5,4,7,u>: Cost 3 vsldoi8 <7,u,5,4>, <7,u,5,4>
+  2626895974U,	// <5,4,u,0>: Cost 3 vsldoi4 <1,5,4,u>, LHS
+  2626897050U,	// <5,4,u,1>: Cost 3 vsldoi4 <1,5,4,u>, <1,5,4,u>
+  2644813518U,	// <5,4,u,2>: Cost 3 vsldoi4 <4,5,4,u>, <2,3,4,5>
+  2705327822U,	// <5,4,u,3>: Cost 3 vsldoi8 <3,4,5,4>, <2,3,4,5>
+  2626899254U,	// <5,4,u,4>: Cost 3 vsldoi4 <1,5,4,u>, RHS
+  1707331102U,	// <5,4,u,5>: Cost 2 vsldoi12 <4,u,5,5>, <4,u,5,5>
+  1680494121U,	// <5,4,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2737183024U,	// <5,4,u,7>: Cost 3 vsldoi8 <u,7,5,4>, <u,7,5,4>
+  1680494139U,	// <5,4,u,u>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  2302642684U,	// <5,5,0,0>: Cost 3 vmrglw <3,4,5,0>, <3,4,5,0>
+  1640218726U,	// <5,5,0,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  3376384510U,	// <5,5,0,2>: Cost 4 vmrglw <3,4,5,0>, <3,4,5,2>
+  3376385078U,	// <5,5,0,3>: Cost 4 vmrglw <3,4,5,0>, <4,2,5,3>
+  2754236002U,	// <5,5,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <5,0,4,1>
+  2717942242U,	// <5,5,0,5>: Cost 3 vsldoi8 <5,5,5,5>, <0,5,u,5>
+  2244907106U,	// <5,5,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  3376385406U,	// <5,5,0,7>: Cost 4 vmrglw <3,4,5,0>, <4,6,5,7>
+  1640219293U,	// <5,5,0,u>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2305969365U,	// <5,5,1,0>: Cost 3 vmrglw <4,0,5,1>, <4,4,5,0>
+  1237536282U,	// <5,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2713961366U,	// <5,5,1,2>: Cost 3 vsldoi8 <4,u,5,5>, <1,2,3,0>
+  3766469630U,	// <5,5,1,3>: Cost 4 vsldoi8 <1,3,5,5>, <1,3,5,5>
+  2782326455U,	// <5,5,1,4>: Cost 3 vsldoi12 <5,1,4,5>, <5,1,4,5>
+  2311277786U,	// <5,5,1,5>: Cost 3 vmrglw <4,u,5,1>, <4,4,5,5>
+  2311277058U,	// <5,5,1,6>: Cost 3 vmrglw <4,u,5,1>, <3,4,5,6>
+  3385017587U,	// <5,5,1,7>: Cost 4 vmrglw <4,u,5,1>, <1,6,5,7>
+  1237536282U,	// <5,5,1,u>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  3376400892U,	// <5,5,2,0>: Cost 4 vmrglw <3,4,5,2>, <3,4,5,0>
+  3827977963U,	// <5,5,2,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,2,1,3>
+  2302659070U,	// <5,5,2,2>: Cost 3 vmrglw <3,4,5,2>, <3,4,5,2>
+  2765737726U,	// <5,5,2,3>: Cost 3 vsldoi12 <2,3,4,5>, <5,2,3,4>
+  3839479558U,	// <5,5,2,4>: Cost 4 vsldoi12 <2,3,4,5>, <5,2,4,3>
+  2781073167U,	// <5,5,2,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,2,5,3>
+  2713962426U,	// <5,5,2,6>: Cost 3 vsldoi8 <4,u,5,5>, <2,6,3,7>
+  3376401790U,	// <5,5,2,7>: Cost 4 vmrglw <3,4,5,2>, <4,6,5,7>
+  2769055531U,	// <5,5,2,u>: Cost 3 vsldoi12 <2,u,4,5>, <5,2,u,4>
+  2713962646U,	// <5,5,3,0>: Cost 3 vsldoi8 <4,u,5,5>, <3,0,1,2>
+  3765143786U,	// <5,5,3,1>: Cost 4 vsldoi8 <1,1,5,5>, <3,1,1,5>
+  3839479621U,	// <5,5,3,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,3,2,3>
+  2289394603U,	// <5,5,3,3>: Cost 3 vmrglw <1,2,5,3>, <1,2,5,3>
+  2713963010U,	// <5,5,3,4>: Cost 3 vsldoi8 <4,u,5,5>, <3,4,5,6>
+  2313285150U,	// <5,5,3,5>: Cost 3 vmrglw <5,2,5,3>, <4,u,5,5>
+  3363138050U,	// <5,5,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,5,6>
+  3363136755U,	// <5,5,3,7>: Cost 4 vmrglw <1,2,5,3>, <1,6,5,7>
+  2713963294U,	// <5,5,3,u>: Cost 3 vsldoi8 <4,u,5,5>, <3,u,1,2>
+  2713963410U,	// <5,5,4,0>: Cost 3 vsldoi8 <4,u,5,5>, <4,0,5,1>
+  3827978127U,	// <5,5,4,1>: Cost 4 vsldoi12 <0,4,1,5>, <5,4,1,5>
+  3839479704U,	// <5,5,4,2>: Cost 4 vsldoi12 <2,3,4,5>, <5,4,2,5>
+  3376417846U,	// <5,5,4,3>: Cost 4 vmrglw <3,4,5,4>, <4,2,5,3>
+  1637567706U,	// <5,5,4,4>: Cost 2 vsldoi8 <4,4,5,5>, <4,4,5,5>
+  1640222006U,	// <5,5,4,5>: Cost 2 vsldoi8 <4,u,5,5>, RHS
+  2310640998U,	// <5,5,4,6>: Cost 3 vmrglw <4,7,5,4>, <7,4,5,6>
+  3376418174U,	// <5,5,4,7>: Cost 4 vmrglw <3,4,5,4>, <4,6,5,7>
+  1640222238U,	// <5,5,4,u>: Cost 2 vsldoi8 <4,u,5,5>, <4,u,5,5>
+  1577091174U,	// <5,5,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  2311310226U,	// <5,5,5,1>: Cost 3 vmrglw <4,u,5,5>, <4,0,5,1>
+  2713964303U,	// <5,5,5,2>: Cost 3 vsldoi8 <4,u,5,5>, <5,2,5,3>
+  2311311119U,	// <5,5,5,3>: Cost 3 vmrglw <4,u,5,5>, <5,2,5,3>
+  1577094454U,	// <5,5,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,5,5>: Cost 1 vspltisw1 RHS
+  2311309826U,	// <5,5,5,6>: Cost 3 vmrglw <4,u,5,5>, <3,4,5,6>
+  2311311447U,	// <5,5,5,7>: Cost 3 vmrglw <4,u,5,5>, <5,6,5,7>
+  296144182U,	// <5,5,5,u>: Cost 1 vspltisw1 RHS
+  2248953460U,	// <5,5,6,0>: Cost 3 vmrghw <5,6,7,0>, <5,0,6,1>
+  2326580114U,	// <5,5,6,1>: Cost 3 vmrglw <7,4,5,6>, <4,0,5,1>
+  2713965050U,	// <5,5,6,2>: Cost 3 vsldoi8 <4,u,5,5>, <6,2,7,3>
+  3700697602U,	// <5,5,6,3>: Cost 4 vsldoi4 <1,5,5,6>, <3,4,5,6>
+  2785644620U,	// <5,5,6,4>: Cost 3 vsldoi12 <5,6,4,5>, <5,6,4,5>
+  2781073495U,	// <5,5,6,5>: Cost 3 vsldoi12 <4,u,5,5>, <5,6,5,7>
+  1228950018U,	// <5,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965390U,	// <5,5,6,7>: Cost 3 vsldoi8 <4,u,5,5>, <6,7,0,1>
+  1228950018U,	// <5,5,6,u>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713965562U,	// <5,5,7,0>: Cost 3 vsldoi8 <4,u,5,5>, <7,0,1,2>
+  3383741330U,	// <5,5,7,1>: Cost 4 vmrglw <4,6,5,7>, <4,0,5,1>
+  3718620878U,	// <5,5,7,2>: Cost 4 vsldoi4 <4,5,5,7>, <2,3,4,5>
+  3365823403U,	// <5,5,7,3>: Cost 4 vmrglw <1,6,5,7>, <1,2,5,3>
+  2713965926U,	// <5,5,7,4>: Cost 3 vsldoi8 <4,u,5,5>, <7,4,5,6>
+  2717947318U,	// <5,5,7,5>: Cost 3 vsldoi8 <5,5,5,5>, <7,5,5,5>
+  3365825026U,	// <5,5,7,6>: Cost 4 vmrglw <1,6,5,7>, <3,4,5,6>
+  2292081907U,	// <5,5,7,7>: Cost 3 vmrglw <1,6,5,7>, <1,6,5,7>
+  2713966210U,	// <5,5,7,u>: Cost 3 vsldoi8 <4,u,5,5>, <7,u,1,2>
+  1577091174U,	// <5,5,u,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1640224558U,	// <5,5,u,1>: Cost 2 vsldoi8 <4,u,5,5>, LHS
+  2713966469U,	// <5,5,u,2>: Cost 3 vsldoi8 <4,u,5,5>, <u,2,3,0>
+  2713966524U,	// <5,5,u,3>: Cost 3 vsldoi8 <4,u,5,5>, <u,3,0,1>
+  1577094454U,	// <5,5,u,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,5,u,5>: Cost 1 vspltisw1 RHS
+  1228950018U,	// <5,5,u,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  2713966848U,	// <5,5,u,7>: Cost 3 vsldoi8 <4,u,5,5>, <u,7,0,1>
+  296144182U,	// <5,5,u,u>: Cost 1 vspltisw1 RHS
+  2705342464U,	// <5,6,0,0>: Cost 3 vsldoi8 <3,4,5,6>, <0,0,0,0>
+  1631600742U,	// <5,6,0,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3773112493U,	// <5,6,0,2>: Cost 4 vsldoi8 <2,4,5,6>, <0,2,1,2>
+  2705342720U,	// <5,6,0,3>: Cost 3 vsldoi8 <3,4,5,6>, <0,3,1,4>
+  2705342802U,	// <5,6,0,4>: Cost 3 vsldoi8 <3,4,5,6>, <0,4,1,5>
+  3779084708U,	// <5,6,0,5>: Cost 4 vsldoi8 <3,4,5,6>, <0,5,1,6>
+  3779084790U,	// <5,6,0,6>: Cost 4 vsldoi8 <3,4,5,6>, <0,6,1,7>
+  2302643510U,	// <5,6,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631601309U,	// <5,6,0,u>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  3767141092U,	// <5,6,1,0>: Cost 4 vsldoi8 <1,4,5,6>, <1,0,1,2>
+  2705343284U,	// <5,6,1,1>: Cost 3 vsldoi8 <3,4,5,6>, <1,1,1,1>
+  2705343382U,	// <5,6,1,2>: Cost 3 vsldoi8 <3,4,5,6>, <1,2,3,0>
+  3779085282U,	// <5,6,1,3>: Cost 4 vsldoi8 <3,4,5,6>, <1,3,2,4>
+  2693399632U,	// <5,6,1,4>: Cost 3 vsldoi8 <1,4,5,6>, <1,4,5,6>
+  3767805089U,	// <5,6,1,5>: Cost 4 vsldoi8 <1,5,5,6>, <1,5,5,6>
+  2311279416U,	// <5,6,1,6>: Cost 3 vmrglw <4,u,5,1>, <6,6,6,6>
+  1237536054U,	// <5,6,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1237536055U,	// <5,6,1,u>: Cost 2 vmrglw <4,u,5,1>, RHS
+  3773113789U,	// <5,6,2,0>: Cost 4 vsldoi8 <2,4,5,6>, <2,0,1,2>
+  3779085855U,	// <5,6,2,1>: Cost 4 vsldoi8 <3,4,5,6>, <2,1,3,1>
+  2699372136U,	// <5,6,2,2>: Cost 3 vsldoi8 <2,4,5,6>, <2,2,2,2>
+  2705344166U,	// <5,6,2,3>: Cost 3 vsldoi8 <3,4,5,6>, <2,3,0,1>
+  2699372329U,	// <5,6,2,4>: Cost 3 vsldoi8 <2,4,5,6>, <2,4,5,6>
+  2705344360U,	// <5,6,2,5>: Cost 3 vsldoi8 <3,4,5,6>, <2,5,3,6>
+  2705344442U,	// <5,6,2,6>: Cost 3 vsldoi8 <3,4,5,6>, <2,6,3,7>
+  2302659894U,	// <5,6,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2702026861U,	// <5,6,2,u>: Cost 3 vsldoi8 <2,u,5,6>, <2,u,5,6>
+  2705344662U,	// <5,6,3,0>: Cost 3 vsldoi8 <3,4,5,6>, <3,0,1,2>
+  3767142661U,	// <5,6,3,1>: Cost 4 vsldoi8 <1,4,5,6>, <3,1,4,5>
+  3773114689U,	// <5,6,3,2>: Cost 4 vsldoi8 <2,4,5,6>, <3,2,2,2>
+  2705344924U,	// <5,6,3,3>: Cost 3 vsldoi8 <3,4,5,6>, <3,3,3,3>
+  1631603202U,	// <5,6,3,4>: Cost 2 vsldoi8 <3,4,5,6>, <3,4,5,6>
+  3842945597U,	// <5,6,3,5>: Cost 4 vsldoi12 <2,u,6,5>, <6,3,5,7>
+  3779086962U,	// <5,6,3,6>: Cost 4 vsldoi8 <3,4,5,6>, <3,6,0,1>
+  2289397046U,	// <5,6,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634257734U,	// <5,6,3,u>: Cost 2 vsldoi8 <3,u,5,6>, <3,u,5,6>
+  2644926566U,	// <5,6,4,0>: Cost 3 vsldoi4 <4,5,6,4>, LHS
+  3779087306U,	// <5,6,4,1>: Cost 4 vsldoi8 <3,4,5,6>, <4,1,2,3>
+  2790142577U,	// <5,6,4,2>: Cost 3 vsldoi12 <6,4,2,5>, <6,4,2,5>
+  2644929026U,	// <5,6,4,3>: Cost 3 vsldoi4 <4,5,6,4>, <3,4,5,6>
+  2711317723U,	// <5,6,4,4>: Cost 3 vsldoi8 <4,4,5,6>, <4,4,5,6>
+  1631604022U,	// <5,6,4,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  2712644989U,	// <5,6,4,6>: Cost 3 vsldoi8 <4,6,5,6>, <4,6,5,6>
+  2302676278U,	// <5,6,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631604265U,	// <5,6,4,u>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  3842945708U,	// <5,6,5,0>: Cost 4 vsldoi12 <2,u,6,5>, <6,5,0,1>
+  3767144133U,	// <5,6,5,1>: Cost 4 vsldoi8 <1,4,5,6>, <5,1,6,1>
+  2705346328U,	// <5,6,5,2>: Cost 3 vsldoi8 <3,4,5,6>, <5,2,6,3>
+  3779088207U,	// <5,6,5,3>: Cost 4 vsldoi8 <3,4,5,6>, <5,3,3,4>
+  2717290420U,	// <5,6,5,4>: Cost 3 vsldoi8 <5,4,5,6>, <5,4,5,6>
+  2705346574U,	// <5,6,5,5>: Cost 3 vsldoi8 <3,4,5,6>, <5,5,6,6>
+  2705346596U,	// <5,6,5,6>: Cost 3 vsldoi8 <3,4,5,6>, <5,6,0,1>
+  1237568822U,	// <5,6,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  1237568823U,	// <5,6,5,u>: Cost 2 vmrglw <4,u,5,5>, RHS
+  2650914918U,	// <5,6,6,0>: Cost 3 vsldoi4 <5,5,6,6>, LHS
+  3364490949U,	// <5,6,6,1>: Cost 4 vmrglw <1,4,5,6>, <5,1,6,1>
+  2248954362U,	// <5,6,6,2>: Cost 3 vmrghw <5,6,7,0>, <6,2,7,3>
+  2302693144U,	// <5,6,6,3>: Cost 3 vmrglw <3,4,5,6>, <5,2,6,3>
+  2650918198U,	// <5,6,6,4>: Cost 3 vsldoi4 <5,5,6,6>, RHS
+  2650918926U,	// <5,6,6,5>: Cost 3 vsldoi4 <5,5,6,6>, <5,5,6,6>
+  2302693390U,	// <5,6,6,6>: Cost 3 vmrglw <3,4,5,6>, <5,5,6,6>
+  1228950838U,	// <5,6,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228950839U,	// <5,6,6,u>: Cost 2 vmrglw <3,4,5,6>, RHS
+  497467494U,	// <5,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571210036U,	// <5,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571210856U,	// <5,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571211414U,	// <5,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497470774U,	// <5,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571213316U,	// <5,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571213818U,	// <5,6,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571214956U,	// <5,6,7,7>: Cost 2 vsldoi4 RHS, <7,7,7,7>
+  497473326U,	// <5,6,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497475686U,	// <5,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631606574U,	// <5,6,u,1>: Cost 2 vsldoi8 <3,4,5,6>, LHS
+  1571219048U,	// <5,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571219606U,	// <5,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497478967U,	// <5,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631606938U,	// <5,6,u,5>: Cost 2 vsldoi8 <3,4,5,6>, RHS
+  1571222010U,	// <5,6,u,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1228967222U,	// <5,6,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497481518U,	// <5,6,u,u>: Cost 1 vsldoi4 RHS, LHS
+  3768475648U,	// <5,7,0,0>: Cost 4 vsldoi8 <1,6,5,7>, <0,0,0,0>
+  2694733926U,	// <5,7,0,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  3718711395U,	// <5,7,0,2>: Cost 4 vsldoi4 <4,5,7,0>, <2,u,4,5>
+  3384349178U,	// <5,7,0,3>: Cost 4 vmrglw <4,7,5,0>, <6,2,7,3>
+  2694734162U,	// <5,7,0,4>: Cost 3 vsldoi8 <1,6,5,7>, <0,4,1,5>
+  3384347884U,	// <5,7,0,5>: Cost 4 vmrglw <4,7,5,0>, <4,4,7,5>
+  3730658026U,	// <5,7,0,6>: Cost 4 vsldoi4 <6,5,7,0>, <6,5,7,0>
+  3718714362U,	// <5,7,0,7>: Cost 4 vsldoi4 <4,5,7,0>, <7,0,1,2>
+  2694734493U,	// <5,7,0,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2311278690U,	// <5,7,1,0>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,0>
+  2305970923U,	// <5,7,1,1>: Cost 3 vmrglw <4,0,5,1>, <6,5,7,1>
+  3768476566U,	// <5,7,1,2>: Cost 4 vsldoi8 <1,6,5,7>, <1,2,3,0>
+  2311279098U,	// <5,7,1,3>: Cost 3 vmrglw <4,u,5,1>, <6,2,7,3>
+  2311278694U,	// <5,7,1,4>: Cost 3 vmrglw <4,u,5,1>, <5,6,7,4>
+  3768476783U,	// <5,7,1,5>: Cost 4 vsldoi8 <1,6,5,7>, <1,5,0,1>
+  2694735091U,	// <5,7,1,6>: Cost 3 vsldoi8 <1,6,5,7>, <1,6,5,7>
+  2311279426U,	// <5,7,1,7>: Cost 3 vmrglw <4,u,5,1>, <6,6,7,7>
+  2696062357U,	// <5,7,1,u>: Cost 3 vsldoi8 <1,u,5,7>, <1,u,5,7>
+  3383701602U,	// <5,7,2,0>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,0>
+  3768477219U,	// <5,7,2,1>: Cost 4 vsldoi8 <1,6,5,7>, <2,1,3,5>
+  3768477288U,	// <5,7,2,2>: Cost 4 vsldoi8 <1,6,5,7>, <2,2,2,2>
+  2309960186U,	// <5,7,2,3>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3383701606U,	// <5,7,2,4>: Cost 4 vmrglw <4,6,5,2>, <5,6,7,4>
+  3768477545U,	// <5,7,2,5>: Cost 4 vsldoi8 <1,6,5,7>, <2,5,3,7>
+  3766486970U,	// <5,7,2,6>: Cost 4 vsldoi8 <1,3,5,7>, <2,6,3,7>
+  3383702338U,	// <5,7,2,7>: Cost 4 vmrglw <4,6,5,2>, <6,6,7,7>
+  2309960186U,	// <5,7,2,u>: Cost 3 vmrglw <4,6,5,2>, <6,2,7,3>
+  3768477846U,	// <5,7,3,0>: Cost 4 vsldoi8 <1,6,5,7>, <3,0,1,2>
+  3768477975U,	// <5,7,3,1>: Cost 4 vsldoi8 <1,6,5,7>, <3,1,6,5>
+  3786393932U,	// <5,7,3,2>: Cost 4 vsldoi8 <4,6,5,7>, <3,2,3,4>
+  3768478108U,	// <5,7,3,3>: Cost 4 vsldoi8 <1,6,5,7>, <3,3,3,3>
+  2795599115U,	// <5,7,3,4>: Cost 3 vsldoi12 <7,3,4,5>, <7,3,4,5>
+  3385037470U,	// <5,7,3,5>: Cost 4 vmrglw <4,u,5,3>, <6,4,7,5>
+  3780422309U,	// <5,7,3,6>: Cost 4 vsldoi8 <3,6,5,7>, <3,6,5,7>
+  3848107301U,	// <5,7,3,7>: Cost 4 vsldoi12 <3,7,4,5>, <7,3,7,4>
+  2795894063U,	// <5,7,3,u>: Cost 3 vsldoi12 <7,3,u,5>, <7,3,u,5>
+  2795967800U,	// <5,7,4,0>: Cost 3 vsldoi12 <7,4,0,5>, <7,4,0,5>
+  3768478690U,	// <5,7,4,1>: Cost 4 vsldoi8 <1,6,5,7>, <4,1,5,0>
+  3718744163U,	// <5,7,4,2>: Cost 4 vsldoi4 <4,5,7,4>, <2,u,4,5>
+  3784404107U,	// <5,7,4,3>: Cost 4 vsldoi8 <4,3,5,7>, <4,3,5,7>
+  2796262748U,	// <5,7,4,4>: Cost 3 vsldoi12 <7,4,4,5>, <7,4,4,5>
+  2694737206U,	// <5,7,4,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2712653182U,	// <5,7,4,6>: Cost 3 vsldoi8 <4,6,5,7>, <4,6,5,7>
+  2713316815U,	// <5,7,4,7>: Cost 3 vsldoi8 <4,7,5,7>, <4,7,5,7>
+  2694737449U,	// <5,7,4,u>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2311311458U,	// <5,7,5,0>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,0>
+  3768479433U,	// <5,7,5,1>: Cost 4 vsldoi8 <1,6,5,7>, <5,1,6,5>
+  3768479521U,	// <5,7,5,2>: Cost 4 vsldoi8 <1,6,5,7>, <5,2,7,3>
+  2311311866U,	// <5,7,5,3>: Cost 3 vmrglw <4,u,5,5>, <6,2,7,3>
+  2311311462U,	// <5,7,5,4>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,4>
+  2248185270U,	// <5,7,5,5>: Cost 3 vmrghw <5,5,5,5>, <7,5,5,5>
+  2718625879U,	// <5,7,5,6>: Cost 3 vsldoi8 <5,6,5,7>, <5,6,5,7>
+  2311312194U,	// <5,7,5,7>: Cost 3 vmrglw <4,u,5,5>, <6,6,7,7>
+  2311311466U,	// <5,7,5,u>: Cost 3 vmrglw <4,u,5,5>, <5,6,7,u>
+  2248954874U,	// <5,7,6,0>: Cost 3 vmrghw <5,6,7,0>, <7,0,1,2>
+  3322696778U,	// <5,7,6,1>: Cost 4 vmrghw <5,6,7,0>, <7,1,1,1>
+  2248955028U,	// <5,7,6,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2656963074U,	// <5,7,6,3>: Cost 3 vsldoi4 <6,5,7,6>, <3,4,5,6>
+  2248955238U,	// <5,7,6,4>: Cost 3 vmrghw <5,6,7,0>, <7,4,5,6>
+  2248955329U,	// <5,7,6,5>: Cost 3 vmrghw <5,6,7,0>, <7,5,6,7>
+  2656965360U,	// <5,7,6,6>: Cost 3 vsldoi4 <6,5,7,6>, <6,5,7,6>
+  2248955500U,	// <5,7,6,7>: Cost 3 vmrghw <5,6,7,0>, <7,7,7,7>
+  2248955522U,	// <5,7,6,u>: Cost 3 vmrghw <5,6,7,0>, <7,u,1,2>
+  3718766694U,	// <5,7,7,0>: Cost 4 vsldoi4 <4,5,7,7>, LHS
+  3724739827U,	// <5,7,7,1>: Cost 4 vsldoi4 <5,5,7,7>, <1,6,5,7>
+  3718768739U,	// <5,7,7,2>: Cost 4 vsldoi4 <4,5,7,7>, <2,u,4,5>
+  3365826337U,	// <5,7,7,3>: Cost 4 vmrglw <1,6,5,7>, <5,2,7,3>
+  2798253647U,	// <5,7,7,4>: Cost 3 vsldoi12 <7,7,4,5>, <7,7,4,5>
+  3365826258U,	// <5,7,7,5>: Cost 4 vmrglw <1,6,5,7>, <5,1,7,5>
+  3730715377U,	// <5,7,7,6>: Cost 4 vsldoi4 <6,5,7,7>, <6,5,7,7>
+  2310665836U,	// <5,7,7,7>: Cost 3 vmrglw <4,7,5,7>, <7,7,7,7>
+  2798548595U,	// <5,7,7,u>: Cost 3 vsldoi12 <7,7,u,5>, <7,7,u,5>
+  2311336034U,	// <5,7,u,0>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,0>
+  2694739758U,	// <5,7,u,1>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2248955028U,	// <5,7,u,2>: Cost 3 vmrghw <5,6,7,0>, <7,2,0,3>
+  2311336442U,	// <5,7,u,3>: Cost 3 vmrglw <4,u,5,u>, <6,2,7,3>
+  2311336038U,	// <5,7,u,4>: Cost 3 vmrglw <4,u,5,u>, <5,6,7,4>
+  2694740122U,	// <5,7,u,5>: Cost 3 vsldoi8 <1,6,5,7>, RHS
+  2656981746U,	// <5,7,u,6>: Cost 3 vsldoi4 <6,5,7,u>, <6,5,7,u>
+  2311336770U,	// <5,7,u,7>: Cost 3 vmrglw <4,u,5,u>, <6,6,7,7>
+  2694740325U,	// <5,7,u,u>: Cost 3 vsldoi8 <1,6,5,7>, LHS
+  2705358848U,	// <5,u,0,0>: Cost 3 vsldoi8 <3,4,5,u>, <0,0,0,0>
+  1631617126U,	// <5,u,0,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2310607866U,	// <5,u,0,2>: Cost 3 vmrglw <4,7,5,0>, <7,0,1,2>
+  2302640284U,	// <5,u,0,3>: Cost 3 vmrglw <3,4,5,0>, LHS
+  2754238189U,	// <5,u,0,4>: Cost 3 vsldoi12 <0,4,1,5>, <u,0,4,1>
+  2305296114U,	// <5,u,0,5>: Cost 3 vmrglw <3,u,5,0>, <2,3,u,5>
+  2244907106U,	// <5,u,0,6>: Cost 3 vmrghw <5,0,6,1>, <5,6,7,0>
+  2302643528U,	// <5,u,0,7>: Cost 3 vmrglw <3,4,5,0>, RHS
+  1631617693U,	// <5,u,0,u>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  2627133542U,	// <5,u,1,0>: Cost 3 vsldoi4 <1,5,u,1>, LHS
+  1237536282U,	// <5,u,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  1680496430U,	// <5,u,1,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1237532828U,	// <5,u,1,3>: Cost 2 vmrglw <4,u,5,1>, LHS
+  2693416018U,	// <5,u,1,4>: Cost 3 vsldoi8 <1,4,5,u>, <1,4,5,u>
+  2756892486U,	// <5,u,1,5>: Cost 3 vsldoi12 <0,u,1,5>, <u,1,5,0>
+  2694743284U,	// <5,u,1,6>: Cost 3 vsldoi8 <1,6,5,u>, <1,6,5,u>
+  1237536072U,	// <5,u,1,7>: Cost 2 vmrglw <4,u,5,1>, RHS
+  1680496484U,	// <5,u,1,u>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  2311288709U,	// <5,u,2,0>: Cost 3 vmrglw <4,u,5,2>, <u,2,3,0>
+  2245883694U,	// <5,u,2,1>: Cost 3 vmrghw <5,2,1,3>, LHS
+  2699388520U,	// <5,u,2,2>: Cost 3 vsldoi8 <2,4,5,u>, <2,2,2,2>
+  2754238344U,	// <5,u,2,3>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,3,3>
+  2699388715U,	// <5,u,2,4>: Cost 3 vsldoi8 <2,4,5,u>, <2,4,5,u>
+  2757408666U,	// <5,u,2,5>: Cost 3 vsldoi12 <0,u,u,5>, <u,2,5,3>
+  2705360826U,	// <5,u,2,6>: Cost 3 vsldoi8 <3,4,5,u>, <2,6,3,7>
+  2302659912U,	// <5,u,2,7>: Cost 3 vmrglw <3,4,5,2>, RHS
+  2754238389U,	// <5,u,2,u>: Cost 3 vsldoi12 <0,4,1,5>, <u,2,u,3>
+  2754238396U,	// <5,u,3,0>: Cost 3 vsldoi12 <0,4,1,5>, <u,3,0,1>
+  3827980229U,	// <5,u,3,1>: Cost 4 vsldoi12 <0,4,1,5>, <u,3,1,1>
+  2644625102U,	// <5,u,3,2>: Cost 3 vsldoi4 <4,5,2,3>, <2,3,4,5>
+  2289393820U,	// <5,u,3,3>: Cost 3 vmrglw <1,2,5,3>, LHS
+  1631619588U,	// <5,u,3,4>: Cost 2 vsldoi8 <3,4,5,u>, <3,4,5,u>
+  2785056749U,	// <5,u,3,5>: Cost 3 vsldoi12 <5,5,5,5>, <u,3,5,5>
+  3363138077U,	// <5,u,3,6>: Cost 4 vmrglw <1,2,5,3>, <3,4,u,6>
+  2289397064U,	// <5,u,3,7>: Cost 3 vmrglw <1,2,5,3>, RHS
+  1634274120U,	// <5,u,3,u>: Cost 2 vsldoi8 <3,u,5,u>, <3,u,5,u>
+  1634937753U,	// <5,u,4,0>: Cost 2 vsldoi8 <4,0,5,u>, <4,0,5,u>
+  1728272410U,	// <5,u,4,1>: Cost 2 vsldoi12 <u,4,1,5>, <u,4,1,5>
+  2710006843U,	// <5,u,4,2>: Cost 3 vsldoi8 <4,2,5,u>, <4,2,5,u>
+  2765740076U,	// <5,u,4,3>: Cost 3 vsldoi12 <2,3,4,5>, <u,4,3,5>
+  1637592285U,	// <5,u,4,4>: Cost 2 vsldoi8 <4,4,5,u>, <4,4,5,u>
+  1631620406U,	// <5,u,4,5>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  2712661375U,	// <5,u,4,6>: Cost 3 vsldoi8 <4,6,5,u>, <4,6,5,u>
+  2302676296U,	// <5,u,4,7>: Cost 3 vmrglw <3,4,5,4>, RHS
+  1631620649U,	// <5,u,4,u>: Cost 2 vsldoi8 <3,4,5,u>, RHS
+  1577091174U,	// <5,u,5,0>: Cost 2 vsldoi4 <5,5,5,5>, LHS
+  1174443822U,	// <5,u,5,1>: Cost 2 vmrghw <5,5,5,5>, LHS
+  2766035058U,	// <5,u,5,2>: Cost 3 vsldoi12 <2,3,u,5>, <u,5,2,3>
+  1237565596U,	// <5,u,5,3>: Cost 2 vmrglw <4,u,5,5>, LHS
+  1577094454U,	// <5,u,5,4>: Cost 2 vsldoi4 <5,5,5,5>, RHS
+  296144182U,	// <5,u,5,5>: Cost 1 vspltisw1 RHS
+  1680496794U,	// <5,u,5,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1237568840U,	// <5,u,5,7>: Cost 2 vmrglw <4,u,5,5>, RHS
+  296144182U,	// <5,u,5,u>: Cost 1 vspltisw1 RHS
+  2633146470U,	// <5,u,6,0>: Cost 3 vsldoi4 <2,5,u,6>, LHS
+  1175213870U,	// <5,u,6,1>: Cost 2 vmrghw <5,6,7,0>, LHS
+  2633148309U,	// <5,u,6,2>: Cost 3 vsldoi4 <2,5,u,6>, <2,5,u,6>
+  1228947612U,	// <5,u,6,3>: Cost 2 vmrglw <3,4,5,6>, LHS
+  2633149750U,	// <5,u,6,4>: Cost 3 vsldoi4 <2,5,u,6>, RHS
+  1175214234U,	// <5,u,6,5>: Cost 2 vmrghw <5,6,7,0>, RHS
+  1228950018U,	// <5,u,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  1228950856U,	// <5,u,6,7>: Cost 2 vmrglw <3,4,5,6>, RHS
+  1228947617U,	// <5,u,6,u>: Cost 2 vmrglw <3,4,5,6>, LHS
+  497614950U,	// <5,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1571357492U,	// <5,u,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1571358312U,	// <5,u,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1571358870U,	// <5,u,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  497618248U,	// <5,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1571360772U,	// <5,u,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1571361274U,	// <5,u,7,6>: Cost 2 vsldoi4 RHS, <6,2,7,3>
+  1571361786U,	// <5,u,7,7>: Cost 2 vsldoi4 RHS, <7,0,1,2>
+  497620782U,	// <5,u,7,u>: Cost 1 vsldoi4 RHS, LHS
+  497623142U,	// <5,u,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631622958U,	// <5,u,u,1>: Cost 2 vsldoi8 <3,4,5,u>, LHS
+  1680496997U,	// <5,u,u,2>: Cost 2 vsldoi12 <0,4,1,5>, LHS
+  1228963996U,	// <5,u,u,3>: Cost 2 vmrglw <3,4,5,u>, LHS
+  497626441U,	// <5,u,u,4>: Cost 1 vsldoi4 RHS, RHS
+  296144182U,	// <5,u,u,5>: Cost 1 vspltisw1 RHS
+  1680497037U,	// <5,u,u,6>: Cost 2 vsldoi12 <0,4,1,5>, RHS
+  1228967240U,	// <5,u,u,7>: Cost 2 vmrglw <3,4,5,u>, RHS
+  497628974U,	// <5,u,u,u>: Cost 1 vsldoi4 RHS, LHS
+  2772451328U,	// <6,0,0,0>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,0,0>
+  2772451338U,	// <6,0,0,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,1,1>
+  3771146417U,	// <6,0,0,2>: Cost 4 vsldoi8 <2,1,6,0>, <0,2,1,6>
+  3383095739U,	// <6,0,0,3>: Cost 4 vmrglw <4,5,6,0>, <6,2,0,3>
+  3846193189U,	// <6,0,0,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,0,4,1>
+  3724832803U,	// <6,0,0,5>: Cost 4 vsldoi4 <5,6,0,0>, <5,6,0,0>
+  3383095985U,	// <6,0,0,6>: Cost 4 vmrglw <4,5,6,0>, <6,5,0,6>
+  3383096067U,	// <6,0,0,7>: Cost 4 vmrglw <4,5,6,0>, <6,6,0,7>
+  2772451401U,	// <6,0,0,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,0,u,1>
+  2651095142U,	// <6,0,1,0>: Cost 3 vsldoi4 <5,6,0,1>, LHS
+  2251612262U,	// <6,0,1,1>: Cost 3 vmrghw <6,1,7,1>, LHS
+  1698709606U,	// <6,0,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2651097602U,	// <6,0,1,3>: Cost 3 vsldoi4 <5,6,0,1>, <3,4,5,6>
+  2651098422U,	// <6,0,1,4>: Cost 3 vsldoi4 <5,6,0,1>, RHS
+  2651099172U,	// <6,0,1,5>: Cost 3 vsldoi4 <5,6,0,1>, <5,6,0,1>
+  2657071869U,	// <6,0,1,6>: Cost 3 vsldoi4 <6,6,0,1>, <6,6,0,1>
+  3724841978U,	// <6,0,1,7>: Cost 4 vsldoi4 <5,6,0,1>, <7,0,1,2>
+  1698709660U,	// <6,0,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2252292096U,	// <6,0,2,0>: Cost 3 vmrghw <6,2,7,3>, <0,0,0,0>
+  1178550374U,	// <6,0,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3826655418U,	// <6,0,2,2>: Cost 4 vsldoi12 <0,2,1,6>, <0,2,2,6>
+  3777783485U,	// <6,0,2,3>: Cost 4 vsldoi8 <3,2,6,0>, <2,3,2,6>
+  2252292434U,	// <6,0,2,4>: Cost 3 vmrghw <6,2,7,3>, <0,4,1,5>
+  3785746280U,	// <6,0,2,5>: Cost 4 vsldoi8 <4,5,6,0>, <2,5,3,6>
+  2252292593U,	// <6,0,2,6>: Cost 3 vmrghw <6,2,7,3>, <0,6,1,2>
+  3736794583U,	// <6,0,2,7>: Cost 4 vsldoi4 <7,6,0,2>, <7,6,0,2>
+  1178550941U,	// <6,0,2,u>: Cost 2 vmrghw <6,2,7,3>, LHS
+  3375153152U,	// <6,0,3,0>: Cost 4 vmrglw <3,2,6,3>, <0,0,0,0>
+  2772451584U,	// <6,0,3,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,1,4>
+  3777784163U,	// <6,0,3,2>: Cost 4 vsldoi8 <3,2,6,0>, <3,2,6,0>
+  3846193426U,	// <6,0,3,3>: Cost 4 vsldoi12 <3,4,5,6>, <0,3,3,4>
+  2712005122U,	// <6,0,3,4>: Cost 3 vsldoi8 <4,5,6,0>, <3,4,5,6>
+  3724857382U,	// <6,0,3,5>: Cost 4 vsldoi4 <5,6,0,3>, <5,6,0,3>
+  3802335864U,	// <6,0,3,6>: Cost 4 vsldoi8 <7,3,6,0>, <3,6,0,7>
+  3801672410U,	// <6,0,3,7>: Cost 4 vsldoi8 <7,2,6,0>, <3,7,2,6>
+  2772451647U,	// <6,0,3,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,3,u,4>
+  3383123968U,	// <6,0,4,0>: Cost 4 vmrglw <4,5,6,4>, <0,0,0,0>
+  2772451666U,	// <6,0,4,1>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,1,5>
+  3773803577U,	// <6,0,4,2>: Cost 4 vsldoi8 <2,5,6,0>, <4,2,5,6>
+  3724864002U,	// <6,0,4,3>: Cost 4 vsldoi4 <5,6,0,4>, <3,4,5,6>
+  3846193517U,	// <6,0,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <0,4,4,5>
+  2712005935U,	// <6,0,4,5>: Cost 3 vsldoi8 <4,5,6,0>, <4,5,6,0>
+  3327009265U,	// <6,0,4,6>: Cost 4 vmrghw <6,4,2,5>, <0,6,1,2>
+  3383126648U,	// <6,0,4,7>: Cost 5 vmrglw <4,5,6,4>, <3,6,0,7>
+  2772451729U,	// <6,0,4,u>: Cost 3 vsldoi12 <3,4,5,6>, <0,4,u,5>
+  3373178880U,	// <6,0,5,0>: Cost 4 vmrglw <2,u,6,5>, <0,0,0,0>
+  2254266470U,	// <6,0,5,1>: Cost 3 vmrghw <6,5,7,1>, LHS
+  3785748248U,	// <6,0,5,2>: Cost 4 vsldoi8 <4,5,6,0>, <5,2,6,3>
+  3790393190U,	// <6,0,5,3>: Cost 4 vsldoi8 <5,3,6,0>, <5,3,6,0>
+  3328000338U,	// <6,0,5,4>: Cost 4 vmrghw <6,5,7,0>, <0,4,1,5>
+  3785748494U,	// <6,0,5,5>: Cost 4 vsldoi8 <4,5,6,0>, <5,5,6,6>
+  3785748516U,	// <6,0,5,6>: Cost 4 vsldoi8 <4,5,6,0>, <5,6,0,1>
+  3379153528U,	// <6,0,5,7>: Cost 4 vmrglw <3,u,6,5>, <3,6,0,7>
+  2254267037U,	// <6,0,5,u>: Cost 3 vmrghw <6,5,7,1>, LHS
+  2254897152U,	// <6,0,6,0>: Cost 3 vmrghw <6,6,6,6>, <0,0,0,0>
+  1181155430U,	// <6,0,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  3785748923U,	// <6,0,6,2>: Cost 4 vsldoi8 <4,5,6,0>, <6,2,0,3>
+  3785749042U,	// <6,0,6,3>: Cost 4 vsldoi8 <4,5,6,0>, <6,3,4,5>
+  2254897490U,	// <6,0,6,4>: Cost 3 vmrghw <6,6,6,6>, <0,4,1,5>
+  3785749169U,	// <6,0,6,5>: Cost 4 vsldoi8 <4,5,6,0>, <6,5,0,6>
+  2724614962U,	// <6,0,6,6>: Cost 3 vsldoi8 <6,6,6,0>, <6,6,6,0>
+  3787739982U,	// <6,0,6,7>: Cost 4 vsldoi8 <4,u,6,0>, <6,7,0,1>
+  1181155997U,	// <6,0,6,u>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1235664896U,	// <6,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235666598U,	// <6,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  3712943720U,	// <6,0,7,2>: Cost 4 vsldoi4 <3,6,0,7>, <2,2,2,2>
+  2639202936U,	// <6,0,7,3>: Cost 3 vsldoi4 <3,6,0,7>, <3,6,0,7>
+  2639203638U,	// <6,0,7,4>: Cost 3 vsldoi4 <3,6,0,7>, RHS
+  2309409236U,	// <6,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  3712946517U,	// <6,0,7,6>: Cost 4 vsldoi4 <3,6,0,7>, <6,0,7,0>
+  2309409400U,	// <6,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235666605U,	// <6,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  1235673088U,	// <6,0,u,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235674790U,	// <6,0,u,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  1698710173U,	// <6,0,u,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2639211129U,	// <6,0,u,3>: Cost 3 vsldoi4 <3,6,0,u>, <3,6,0,u>
+  2639211830U,	// <6,0,u,4>: Cost 3 vsldoi4 <3,6,0,u>, RHS
+  2712008858U,	// <6,0,u,5>: Cost 3 vsldoi8 <4,5,6,0>, RHS
+  2657129220U,	// <6,0,u,6>: Cost 3 vsldoi4 <6,6,0,u>, <6,6,0,u>
+  2309417592U,	// <6,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1698710227U,	// <6,0,u,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  3775799296U,	// <6,1,0,0>: Cost 4 vsldoi8 <2,u,6,1>, <0,0,0,0>
+  2702057574U,	// <6,1,0,1>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3373143763U,	// <6,1,0,2>: Cost 4 vmrglw <2,u,6,0>, <u,0,1,2>
+  3695045122U,	// <6,1,0,3>: Cost 4 vsldoi4 <0,6,1,0>, <3,4,5,6>
+  3775799634U,	// <6,1,0,4>: Cost 4 vsldoi8 <2,u,6,1>, <0,4,1,5>
+  3383091538U,	// <6,1,0,5>: Cost 4 vmrglw <4,5,6,0>, <0,4,1,5>
+  3368493233U,	// <6,1,0,6>: Cost 4 vmrglw <2,1,6,0>, <0,2,1,6>
+  3362522319U,	// <6,1,0,7>: Cost 5 vmrglw <1,1,6,0>, <1,6,1,7>
+  2702058141U,	// <6,1,0,u>: Cost 3 vsldoi8 <2,u,6,1>, LHS
+  3834250027U,	// <6,1,1,0>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,0,1>
+  2772452148U,	// <6,1,1,1>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  3832038210U,	// <6,1,1,2>: Cost 4 vsldoi12 <1,1,2,6>, <1,1,2,6>
+  3373150660U,	// <6,1,1,3>: Cost 4 vmrglw <2,u,6,1>, <6,2,1,3>
+  3834250067U,	// <6,1,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <1,1,4,5>
+  3373146450U,	// <6,1,1,5>: Cost 4 vmrglw <2,u,6,1>, <0,4,1,5>
+  3826656102U,	// <6,1,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,1,6,6>
+  3362530511U,	// <6,1,1,7>: Cost 4 vmrglw <1,1,6,1>, <1,6,1,7>
+  2772452148U,	// <6,1,1,u>: Cost 3 vsldoi12 <3,4,5,6>, <1,1,1,1>
+  2669092966U,	// <6,1,2,0>: Cost 3 vsldoi4 <u,6,1,2>, LHS
+  2252292916U,	// <6,1,2,1>: Cost 3 vmrghw <6,2,7,3>, <1,1,1,1>
+  2252293014U,	// <6,1,2,2>: Cost 3 vmrghw <6,2,7,3>, <1,2,3,0>
+  2772452246U,	// <6,1,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,2,3,0>
+  2669096246U,	// <6,1,2,4>: Cost 3 vsldoi4 <u,6,1,2>, RHS
+  3846194091U,	// <6,1,2,5>: Cost 4 vsldoi12 <3,4,5,6>, <1,2,5,3>
+  2702059450U,	// <6,1,2,6>: Cost 3 vsldoi8 <2,u,6,1>, <2,6,3,7>
+  3870081978U,	// <6,1,2,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,2,7,0>
+  2702059633U,	// <6,1,2,u>: Cost 3 vsldoi8 <2,u,6,1>, <2,u,6,1>
+  3775801494U,	// <6,1,3,0>: Cost 4 vsldoi8 <2,u,6,1>, <3,0,1,2>
+  3777128723U,	// <6,1,3,1>: Cost 4 vsldoi8 <3,1,6,1>, <3,1,6,1>
+  3775801702U,	// <6,1,3,2>: Cost 4 vsldoi8 <2,u,6,1>, <3,2,6,3>
+  3775801756U,	// <6,1,3,3>: Cost 4 vsldoi8 <2,u,6,1>, <3,3,3,3>
+  3775801858U,	// <6,1,3,4>: Cost 4 vsldoi8 <2,u,6,1>, <3,4,5,6>
+  3375153490U,	// <6,1,3,5>: Cost 4 vmrglw <3,2,6,3>, <0,4,1,5>
+  3826656265U,	// <6,1,3,6>: Cost 4 vsldoi12 <0,2,1,6>, <1,3,6,7>
+  3775802051U,	// <6,1,3,7>: Cost 4 vsldoi8 <2,u,6,1>, <3,7,0,1>
+  3775802142U,	// <6,1,3,u>: Cost 4 vsldoi8 <2,u,6,1>, <3,u,1,2>
+  3846194206U,	// <6,1,4,0>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,0,1>
+  3846194219U,	// <6,1,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,1,5>
+  3846194228U,	// <6,1,4,2>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,2,5>
+  3846194236U,	// <6,1,4,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,3,4>
+  3846194246U,	// <6,1,4,4>: Cost 4 vsldoi12 <3,4,5,6>, <1,4,4,5>
+  2760508496U,	// <6,1,4,5>: Cost 3 vsldoi12 <1,4,5,6>, <1,4,5,6>
+  3368526001U,	// <6,1,4,6>: Cost 4 vmrglw <2,1,6,4>, <0,2,1,6>
+  3870082144U,	// <6,1,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <1,4,7,4>
+  2760729707U,	// <6,1,4,u>: Cost 3 vsldoi12 <1,4,u,6>, <1,4,u,6>
+  2714668660U,	// <6,1,5,0>: Cost 3 vsldoi8 <5,0,6,1>, <5,0,6,1>
+  3834619005U,	// <6,1,5,1>: Cost 4 vsldoi12 <1,5,1,6>, <1,5,1,6>
+  3834692742U,	// <6,1,5,2>: Cost 4 vsldoi12 <1,5,2,6>, <1,5,2,6>
+  3846194317U,	// <6,1,5,3>: Cost 4 vsldoi12 <3,4,5,6>, <1,5,3,4>
+  3834840216U,	// <6,1,5,4>: Cost 4 vsldoi12 <1,5,4,6>, <1,5,4,6>
+  3834913953U,	// <6,1,5,5>: Cost 4 vsldoi12 <1,5,5,6>, <1,5,5,6>
+  2719977570U,	// <6,1,5,6>: Cost 3 vsldoi8 <5,u,6,1>, <5,6,7,0>
+  3367208143U,	// <6,1,5,7>: Cost 4 vmrglw <1,u,6,5>, <1,6,1,7>
+  2719977724U,	// <6,1,5,u>: Cost 3 vsldoi8 <5,u,6,1>, <5,u,6,1>
+  2669125734U,	// <6,1,6,0>: Cost 3 vsldoi4 <u,6,1,6>, LHS
+  2254897972U,	// <6,1,6,1>: Cost 3 vmrghw <6,6,6,6>, <1,1,1,1>
+  2254898070U,	// <6,1,6,2>: Cost 3 vmrghw <6,6,6,6>, <1,2,3,0>
+  3775803929U,	// <6,1,6,3>: Cost 4 vsldoi8 <2,u,6,1>, <6,3,1,7>
+  2669129014U,	// <6,1,6,4>: Cost 3 vsldoi4 <u,6,1,6>, RHS
+  2322006354U,	// <6,1,6,5>: Cost 3 vmrglw <6,6,6,6>, <0,4,1,5>
+  2725950264U,	// <6,1,6,6>: Cost 3 vsldoi8 <6,u,6,1>, <6,6,6,6>
+  3793720142U,	// <6,1,6,7>: Cost 4 vsldoi8 <5,u,6,1>, <6,7,0,1>
+  2254898556U,	// <6,1,6,u>: Cost 3 vmrghw <6,6,6,6>, <1,u,3,0>
+  2627330150U,	// <6,1,7,0>: Cost 3 vsldoi4 <1,6,1,7>, LHS
+  1235664906U,	// <6,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235667094U,	// <6,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309406894U,	// <6,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2627333430U,	// <6,1,7,4>: Cost 3 vsldoi4 <1,6,1,7>, RHS
+  1235665234U,	// <6,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309406897U,	// <6,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309407222U,	// <6,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235664913U,	// <6,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  2627338342U,	// <6,1,u,0>: Cost 3 vsldoi4 <1,6,1,u>, LHS
+  1235673098U,	// <6,1,u,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235675286U,	// <6,1,u,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2772452732U,	// <6,1,u,3>: Cost 3 vsldoi12 <3,4,5,6>, <1,u,3,0>
+  2627341622U,	// <6,1,u,4>: Cost 3 vsldoi4 <1,6,1,u>, RHS
+  1235673426U,	// <6,1,u,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309415089U,	// <6,1,u,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309415414U,	// <6,1,u,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235673105U,	// <6,1,u,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  3324683725U,	// <6,2,0,0>: Cost 4 vmrghw <6,0,7,0>, <2,0,3,0>
+  2725290086U,	// <6,2,0,1>: Cost 3 vsldoi8 <6,7,6,2>, LHS
+  3771162801U,	// <6,2,0,2>: Cost 4 vsldoi8 <2,1,6,2>, <0,2,1,6>
+  2309349478U,	// <6,2,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3730951478U,	// <6,2,0,4>: Cost 4 vsldoi4 <6,6,2,0>, RHS
+  3840738784U,	// <6,2,0,5>: Cost 4 vsldoi12 <2,5,3,6>, <2,0,5,1>
+  3842655721U,	// <6,2,0,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,0,6,1>
+  3736925671U,	// <6,2,0,7>: Cost 4 vsldoi4 <7,6,2,0>, <7,6,2,0>
+  2309349483U,	// <6,2,0,u>: Cost 3 vmrglw <4,5,6,0>, LHS
+  3367840468U,	// <6,2,1,0>: Cost 4 vmrglw <2,0,6,1>, <3,7,2,0>
+  3325355551U,	// <6,2,1,1>: Cost 4 vmrghw <6,1,7,1>, <2,1,3,1>
+  3373147752U,	// <6,2,1,2>: Cost 4 vmrglw <2,u,6,1>, <2,2,2,2>
+  2299404390U,	// <6,2,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  3701099830U,	// <6,2,1,4>: Cost 5 vsldoi4 <1,6,2,1>, RHS
+  3767846054U,	// <6,2,1,5>: Cost 4 vsldoi8 <1,5,6,2>, <1,5,6,2>
+  3826656825U,	// <6,2,1,6>: Cost 4 vsldoi12 <0,2,1,6>, <2,1,6,0>
+  3373147838U,	// <6,2,1,7>: Cost 5 vmrglw <2,u,6,1>, <2,3,2,7>
+  2299404395U,	// <6,2,1,u>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2657222758U,	// <6,2,2,0>: Cost 3 vsldoi4 <6,6,2,2>, LHS
+  3771164219U,	// <6,2,2,1>: Cost 4 vsldoi8 <2,1,6,2>, <2,1,6,2>
+  2766481000U,	// <6,2,2,2>: Cost 3 vsldoi12 <2,4,5,6>, <2,2,2,2>
+  2772452978U,	// <6,2,2,3>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,3,3>
+  2657226038U,	// <6,2,2,4>: Cost 3 vsldoi4 <6,6,2,2>, RHS
+  3790407528U,	// <6,2,2,5>: Cost 4 vsldoi8 <5,3,6,2>, <2,5,3,6>
+  2252294074U,	// <6,2,2,6>: Cost 3 vmrghw <6,2,7,3>, <2,6,3,7>
+  2252294148U,	// <6,2,2,7>: Cost 3 vmrghw <6,2,7,3>, <2,7,3,0>
+  2772453023U,	// <6,2,2,u>: Cost 3 vsldoi12 <3,4,5,6>, <2,2,u,3>
+  2772453030U,	// <6,2,3,0>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,0,1>
+  3834250930U,	// <6,2,3,1>: Cost 4 vsldoi12 <1,4,5,6>, <2,3,1,4>
+  2765596349U,	// <6,2,3,2>: Cost 3 vsldoi12 <2,3,2,6>, <2,3,2,6>
+  2301411430U,	// <6,2,3,3>: Cost 3 vmrglw <3,2,6,3>, LHS
+  2772453070U,	// <6,2,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <2,3,4,5>
+  2765817560U,	// <6,2,3,5>: Cost 3 vsldoi12 <2,3,5,6>, <2,3,5,6>
+  2252933050U,	// <6,2,3,6>: Cost 3 vmrghw <6,3,7,0>, <2,6,3,7>
+  2796340968U,	// <6,2,3,7>: Cost 3 vsldoi12 <7,4,5,6>, <2,3,7,4>
+  2766038771U,	// <6,2,3,u>: Cost 3 vsldoi12 <2,3,u,6>, <2,3,u,6>
+  3725008998U,	// <6,2,4,0>: Cost 4 vsldoi4 <5,6,2,4>, LHS
+  3368530217U,	// <6,2,4,1>: Cost 5 vmrglw <2,1,6,4>, <6,0,2,1>
+  3840222989U,	// <6,2,4,2>: Cost 4 vsldoi12 <2,4,5,6>, <2,4,2,5>
+  2309382246U,	// <6,2,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  3725012278U,	// <6,2,4,4>: Cost 4 vsldoi4 <5,6,2,4>, RHS
+  2766481193U,	// <6,2,4,5>: Cost 3 vsldoi12 <2,4,5,6>, <2,4,5,6>
+  3842656049U,	// <6,2,4,6>: Cost 4 vsldoi12 <2,u,2,6>, <2,4,6,5>
+  3327010820U,	// <6,2,4,7>: Cost 4 vmrghw <6,4,2,5>, <2,7,3,0>
+  2766702404U,	// <6,2,4,u>: Cost 3 vsldoi12 <2,4,u,6>, <2,4,u,6>
+  3713073254U,	// <6,2,5,0>: Cost 4 vsldoi4 <3,6,2,5>, LHS
+  3789082310U,	// <6,2,5,1>: Cost 4 vsldoi8 <5,1,6,2>, <5,1,6,2>
+  3840665439U,	// <6,2,5,2>: Cost 4 vsldoi12 <2,5,2,6>, <2,5,2,6>
+  2766997352U,	// <6,2,5,3>: Cost 3 vsldoi12 <2,5,3,6>, <2,5,3,6>
+  3713076534U,	// <6,2,5,4>: Cost 4 vsldoi4 <3,6,2,5>, RHS
+  3791736842U,	// <6,2,5,5>: Cost 4 vsldoi8 <5,5,6,2>, <5,5,6,2>
+  3373180605U,	// <6,2,5,6>: Cost 4 vmrglw <2,u,6,5>, <2,3,2,6>
+  3793064108U,	// <6,2,5,7>: Cost 4 vsldoi8 <5,7,6,2>, <5,7,6,2>
+  2767366037U,	// <6,2,5,u>: Cost 3 vsldoi12 <2,5,u,6>, <2,5,u,6>
+  3701137510U,	// <6,2,6,0>: Cost 4 vsldoi4 <1,6,2,6>, LHS
+  3701138647U,	// <6,2,6,1>: Cost 4 vsldoi4 <1,6,2,6>, <1,6,2,6>
+  2254898792U,	// <6,2,6,2>: Cost 3 vmrghw <6,6,6,6>, <2,2,2,2>
+  1248264294U,	// <6,2,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  3701140790U,	// <6,2,6,4>: Cost 4 vsldoi4 <1,6,2,6>, RHS
+  3725029435U,	// <6,2,6,5>: Cost 4 vsldoi4 <5,6,2,6>, <5,6,2,6>
+  2254899130U,	// <6,2,6,6>: Cost 3 vmrghw <6,6,6,6>, <2,6,3,7>
+  2725294981U,	// <6,2,6,7>: Cost 3 vsldoi8 <6,7,6,2>, <6,7,6,2>
+  1248264299U,	// <6,2,6,u>: Cost 2 vmrglw <6,6,6,6>, LHS
+  2633375846U,	// <6,2,7,0>: Cost 3 vsldoi4 <2,6,2,7>, LHS
+  2309407468U,	// <6,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235666536U,	// <6,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161923174U,	// <6,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2633379126U,	// <6,2,7,4>: Cost 3 vsldoi4 <2,6,2,7>, RHS
+  2309407796U,	// <6,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309408445U,	// <6,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309407960U,	// <6,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161923179U,	// <6,2,7,u>: Cost 1 vmrglw RHS, LHS
+  2633384038U,	// <6,2,u,0>: Cost 3 vsldoi4 <2,6,2,u>, LHS
+  2309415660U,	// <6,2,u,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235674728U,	// <6,2,u,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  161931366U,	// <6,2,u,3>: Cost 1 vmrglw RHS, LHS
+  2633387318U,	// <6,2,u,4>: Cost 3 vsldoi4 <2,6,2,u>, RHS
+  2769135725U,	// <6,2,u,5>: Cost 3 vsldoi12 <2,u,5,6>, <2,u,5,6>
+  2309416637U,	// <6,2,u,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309416152U,	// <6,2,u,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  161931371U,	// <6,2,u,u>: Cost 1 vmrglw RHS, LHS
+  3777806336U,	// <6,3,0,0>: Cost 4 vsldoi8 <3,2,6,3>, <0,0,0,0>
+  2704064614U,	// <6,3,0,1>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  3765862577U,	// <6,3,0,2>: Cost 4 vsldoi8 <1,2,6,3>, <0,2,1,6>
+  3843393708U,	// <6,3,0,3>: Cost 4 vsldoi12 <3,0,3,6>, <3,0,3,6>
+  2250516994U,	// <6,3,0,4>: Cost 3 vmrghw <6,0,1,2>, <3,4,5,6>
+  3725054014U,	// <6,3,0,5>: Cost 4 vsldoi4 <5,6,3,0>, <5,6,3,0>
+  3383093096U,	// <6,3,0,6>: Cost 4 vmrglw <4,5,6,0>, <2,5,3,6>
+  3368495034U,	// <6,3,0,7>: Cost 4 vmrglw <2,1,6,0>, <2,6,3,7>
+  2704065181U,	// <6,3,0,u>: Cost 3 vsldoi8 <3,2,6,3>, LHS
+  2251622550U,	// <6,3,1,0>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  3777807156U,	// <6,3,1,1>: Cost 4 vsldoi8 <3,2,6,3>, <1,1,1,1>
+  3765863348U,	// <6,3,1,2>: Cost 4 vsldoi8 <1,2,6,3>, <1,2,6,3>
+  3373147762U,	// <6,3,1,3>: Cost 4 vmrglw <2,u,6,1>, <2,2,3,3>
+  3834251525U,	// <6,3,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <3,1,4,5>
+  3373147683U,	// <6,3,1,5>: Cost 5 vmrglw <2,u,6,1>, <2,1,3,5>
+  3391727545U,	// <6,3,1,6>: Cost 4 vmrglw <6,0,6,1>, <2,6,3,6>
+  2299406266U,	// <6,3,1,7>: Cost 3 vmrglw <2,u,6,1>, <2,6,3,7>
+  2251622550U,	// <6,3,1,u>: Cost 3 vmrghw <6,1,7,2>, <3,0,1,2>
+  2252294294U,	// <6,3,2,0>: Cost 3 vmrghw <6,2,7,3>, <3,0,1,2>
+  3326036198U,	// <6,3,2,1>: Cost 4 vmrghw <6,2,7,3>, <3,1,1,1>
+  3771836045U,	// <6,3,2,2>: Cost 4 vsldoi8 <2,2,6,3>, <2,2,6,3>
+  2252294556U,	// <6,3,2,3>: Cost 3 vmrghw <6,2,7,3>, <3,3,3,3>
+  2252294658U,	// <6,3,2,4>: Cost 3 vmrghw <6,2,7,3>, <3,4,5,6>
+  3840739677U,	// <6,3,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <3,2,5,3>
+  2704066490U,	// <6,3,2,6>: Cost 3 vsldoi8 <3,2,6,3>, <2,6,3,7>
+  3368511418U,	// <6,3,2,7>: Cost 4 vmrglw <2,1,6,2>, <2,6,3,7>
+  2252294942U,	// <6,3,2,u>: Cost 3 vmrghw <6,2,7,3>, <3,u,1,2>
+  3707158630U,	// <6,3,3,0>: Cost 4 vsldoi4 <2,6,3,3>, LHS
+  3765864692U,	// <6,3,3,1>: Cost 5 vsldoi8 <1,2,6,3>, <3,1,2,6>
+  2704066918U,	// <6,3,3,2>: Cost 3 vsldoi8 <3,2,6,3>, <3,2,6,3>
+  2772453788U,	// <6,3,3,3>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,3,3>
+  2772453799U,	// <6,3,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <3,3,4,5>
+  3789752888U,	// <6,3,3,5>: Cost 4 vsldoi8 <5,2,6,3>, <3,5,2,6>
+  3840739770U,	// <6,3,3,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,3,6,6>
+  2301413306U,	// <6,3,3,7>: Cost 3 vmrglw <3,2,6,3>, <2,6,3,7>
+  2775108043U,	// <6,3,3,u>: Cost 3 vsldoi12 <3,u,5,6>, <3,3,u,5>
+  2651340902U,	// <6,3,4,0>: Cost 3 vsldoi4 <5,6,3,4>, LHS
+  3846195674U,	// <6,3,4,1>: Cost 4 vsldoi12 <3,4,5,6>, <3,4,1,2>
+  3845974503U,	// <6,3,4,2>: Cost 4 vsldoi12 <3,4,2,6>, <3,4,2,6>
+  2651343362U,	// <6,3,4,3>: Cost 3 vsldoi4 <5,6,3,4>, <3,4,5,6>
+  2651344182U,	// <6,3,4,4>: Cost 3 vsldoi4 <5,6,3,4>, RHS
+  1698712066U,	// <6,3,4,5>: Cost 2 vsldoi12 <3,4,5,6>, <3,4,5,6>
+  3383125864U,	// <6,3,4,6>: Cost 4 vmrglw <4,5,6,4>, <2,5,3,6>
+  3368527802U,	// <6,3,4,7>: Cost 4 vmrglw <2,1,6,4>, <2,6,3,7>
+  1698933277U,	// <6,3,4,u>: Cost 2 vsldoi12 <3,4,u,6>, <3,4,u,6>
+  3373179798U,	// <6,3,5,0>: Cost 4 vmrglw <2,u,6,5>, <1,2,3,0>
+  3707176179U,	// <6,3,5,1>: Cost 5 vsldoi4 <2,6,3,5>, <1,6,5,7>
+  2716012312U,	// <6,3,5,2>: Cost 3 vsldoi8 <5,2,6,3>, <5,2,6,3>
+  3373180530U,	// <6,3,5,3>: Cost 4 vmrglw <2,u,6,5>, <2,2,3,3>
+  2254309890U,	// <6,3,5,4>: Cost 3 vmrghw <6,5,7,6>, <3,4,5,6>
+  3785773070U,	// <6,3,5,5>: Cost 4 vsldoi8 <4,5,6,3>, <5,5,6,6>
+  3840739932U,	// <6,3,5,6>: Cost 4 vsldoi12 <2,5,3,6>, <3,5,6,6>
+  2299439034U,	// <6,3,5,7>: Cost 3 vmrglw <2,u,6,5>, <2,6,3,7>
+  2719994110U,	// <6,3,5,u>: Cost 3 vsldoi8 <5,u,6,3>, <5,u,6,3>
+  2254899350U,	// <6,3,6,0>: Cost 3 vmrghw <6,6,6,6>, <3,0,1,2>
+  3328641254U,	// <6,3,6,1>: Cost 4 vmrghw <6,6,6,6>, <3,1,1,1>
+  2633443257U,	// <6,3,6,2>: Cost 3 vsldoi4 <2,6,3,6>, <2,6,3,6>
+  2254899612U,	// <6,3,6,3>: Cost 3 vmrghw <6,6,6,6>, <3,3,3,3>
+  2254899714U,	// <6,3,6,4>: Cost 3 vmrghw <6,6,6,6>, <3,4,5,6>
+  3785773772U,	// <6,3,6,5>: Cost 4 vsldoi8 <4,5,6,3>, <6,5,3,6>
+  2725966648U,	// <6,3,6,6>: Cost 3 vsldoi8 <6,u,6,3>, <6,6,6,6>
+  2322007994U,	// <6,3,6,7>: Cost 3 vmrglw <6,6,6,6>, <2,6,3,7>
+  2254899998U,	// <6,3,6,u>: Cost 3 vmrghw <6,6,6,6>, <3,u,1,2>
+  1559707750U,	// <6,3,7,0>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  2633450292U,	// <6,3,7,1>: Cost 3 vsldoi4 <2,6,3,7>, <1,1,1,1>
+  1559709626U,	// <6,3,7,2>: Cost 2 vsldoi4 <2,6,3,7>, <2,6,3,7>
+  1235666546U,	// <6,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559711030U,	// <6,3,7,4>: Cost 2 vsldoi4 <2,6,3,7>, RHS
+  2309408291U,	// <6,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2633454152U,	// <6,3,7,6>: Cost 3 vsldoi4 <2,6,3,7>, <6,3,7,0>
+  1235666874U,	// <6,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559713582U,	// <6,3,7,u>: Cost 2 vsldoi4 <2,6,3,7>, LHS
+  1559715942U,	// <6,3,u,0>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  2633458484U,	// <6,3,u,1>: Cost 3 vsldoi4 <2,6,3,u>, <1,1,1,1>
+  1559717819U,	// <6,3,u,2>: Cost 2 vsldoi4 <2,6,3,u>, <2,6,3,u>
+  1235674738U,	// <6,3,u,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1559719222U,	// <6,3,u,4>: Cost 2 vsldoi4 <2,6,3,u>, RHS
+  1701366598U,	// <6,3,u,5>: Cost 2 vsldoi12 <3,u,5,6>, <3,u,5,6>
+  2633462353U,	// <6,3,u,6>: Cost 3 vsldoi4 <2,6,3,u>, <6,3,u,0>
+  1235675066U,	// <6,3,u,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1559721774U,	// <6,3,u,u>: Cost 2 vsldoi4 <2,6,3,u>, LHS
+  3785777152U,	// <6,4,0,0>: Cost 4 vsldoi8 <4,5,6,4>, <0,0,0,0>
+  2712035430U,	// <6,4,0,1>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3771179185U,	// <6,4,0,2>: Cost 4 vsldoi8 <2,1,6,4>, <0,2,1,6>
+  3846196096U,	// <6,4,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <4,0,3,1>
+  3785777490U,	// <6,4,0,4>: Cost 4 vsldoi8 <4,5,6,4>, <0,4,1,5>
+  2250517814U,	// <6,4,0,5>: Cost 3 vmrghw <6,0,1,2>, RHS
+  3324259703U,	// <6,4,0,6>: Cost 4 vmrghw <6,0,1,2>, <4,6,5,0>
+  3383092458U,	// <6,4,0,7>: Cost 5 vmrglw <4,5,6,0>, <1,6,4,7>
+  2712035997U,	// <6,4,0,u>: Cost 3 vsldoi8 <4,5,6,4>, LHS
+  3325356946U,	// <6,4,1,0>: Cost 4 vmrghw <6,1,7,1>, <4,0,5,1>
+  3785777972U,	// <6,4,1,1>: Cost 4 vsldoi8 <4,5,6,4>, <1,1,1,1>
+  3846196170U,	// <6,4,1,2>: Cost 4 vsldoi12 <3,4,5,6>, <4,1,2,3>
+  3325365380U,	// <6,4,1,3>: Cost 4 vmrghw <6,1,7,2>, <4,3,5,0>
+  3852168155U,	// <6,4,1,4>: Cost 4 vsldoi12 <4,4,5,6>, <4,1,4,2>
+  2251615542U,	// <6,4,1,5>: Cost 3 vmrghw <6,1,7,1>, RHS
+  3325357432U,	// <6,4,1,6>: Cost 4 vmrghw <6,1,7,1>, <4,6,5,1>
+  3870084088U,	// <6,4,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <4,1,7,4>
+  2251615785U,	// <6,4,1,u>: Cost 3 vmrghw <6,1,7,1>, RHS
+  2252295058U,	// <6,4,2,0>: Cost 3 vmrghw <6,2,7,3>, <4,0,5,1>
+  3771180605U,	// <6,4,2,1>: Cost 4 vsldoi8 <2,1,6,4>, <2,1,6,4>
+  3785778792U,	// <6,4,2,2>: Cost 4 vsldoi8 <4,5,6,4>, <2,2,2,2>
+  3777816253U,	// <6,4,2,3>: Cost 4 vsldoi8 <3,2,6,4>, <2,3,2,6>
+  2252295376U,	// <6,4,2,4>: Cost 3 vmrghw <6,2,7,3>, <4,4,4,4>
+  1178553654U,	// <6,4,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  2252295545U,	// <6,4,2,6>: Cost 3 vmrghw <6,2,7,3>, <4,6,5,2>
+  3326037448U,	// <6,4,2,7>: Cost 4 vmrghw <6,2,7,3>, <4,7,5,0>
+  1178553897U,	// <6,4,2,u>: Cost 2 vmrghw <6,2,7,3>, RHS
+  3785779350U,	// <6,4,3,0>: Cost 4 vsldoi8 <4,5,6,4>, <3,0,1,2>
+  3383118648U,	// <6,4,3,1>: Cost 4 vmrglw <4,5,6,3>, <3,u,4,1>
+  3777816935U,	// <6,4,3,2>: Cost 4 vsldoi8 <3,2,6,4>, <3,2,6,4>
+  3785779612U,	// <6,4,3,3>: Cost 4 vsldoi8 <4,5,6,4>, <3,3,3,3>
+  2712037890U,	// <6,4,3,4>: Cost 3 vsldoi8 <4,5,6,4>, <3,4,5,6>
+  2252754230U,	// <6,4,3,5>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3784452764U,	// <6,4,3,6>: Cost 4 vsldoi8 <4,3,6,4>, <3,6,4,7>
+  3801705178U,	// <6,4,3,7>: Cost 4 vsldoi8 <7,2,6,4>, <3,7,2,6>
+  2252754473U,	// <6,4,3,u>: Cost 3 vmrghw <6,3,4,5>, RHS
+  3787770770U,	// <6,4,4,0>: Cost 4 vsldoi8 <4,u,6,4>, <4,0,5,1>
+  3383126840U,	// <6,4,4,1>: Cost 4 vmrglw <4,5,6,4>, <3,u,4,1>
+  3327380534U,	// <6,4,4,2>: Cost 4 vmrghw <6,4,7,5>, <4,2,5,3>
+  3784453265U,	// <6,4,4,3>: Cost 4 vsldoi8 <4,3,6,4>, <4,3,6,4>
+  2253630672U,	// <6,4,4,4>: Cost 3 vmrghw <6,4,7,4>, <4,4,4,4>
+  2778426587U,	// <6,4,4,5>: Cost 3 vsldoi12 <4,4,5,6>, <4,4,5,6>
+  3383128789U,	// <6,4,4,6>: Cost 4 vmrglw <4,5,6,4>, <6,5,4,6>
+  3381799580U,	// <6,4,4,7>: Cost 4 vmrglw <4,3,6,4>, <3,6,4,7>
+  2778647798U,	// <6,4,4,u>: Cost 3 vsldoi12 <4,4,u,6>, <4,4,u,6>
+  2651422822U,	// <6,4,5,0>: Cost 3 vsldoi4 <5,6,4,5>, LHS
+  3701277928U,	// <6,4,5,1>: Cost 4 vsldoi4 <1,6,4,5>, <1,6,4,5>
+  3701278650U,	// <6,4,5,2>: Cost 4 vsldoi4 <1,6,4,5>, <2,6,3,7>
+  2651425282U,	// <6,4,5,3>: Cost 3 vsldoi4 <5,6,4,5>, <3,4,5,6>
+  2651426102U,	// <6,4,5,4>: Cost 3 vsldoi4 <5,6,4,5>, RHS
+  2651426892U,	// <6,4,5,5>: Cost 3 vsldoi4 <5,6,4,5>, <5,6,4,5>
+  1698712886U,	// <6,4,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3725169658U,	// <6,4,5,7>: Cost 4 vsldoi4 <5,6,4,5>, <7,0,1,2>
+  1698712904U,	// <6,4,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2254900114U,	// <6,4,6,0>: Cost 3 vmrghw <6,6,6,6>, <4,0,5,1>
+  3389115192U,	// <6,4,6,1>: Cost 4 vmrglw <5,5,6,6>, <3,u,4,1>
+  3785781727U,	// <6,4,6,2>: Cost 4 vsldoi8 <4,5,6,4>, <6,2,4,3>
+  3785781810U,	// <6,4,6,3>: Cost 4 vsldoi8 <4,5,6,4>, <6,3,4,5>
+  2254900432U,	// <6,4,6,4>: Cost 3 vmrghw <6,6,6,6>, <4,4,4,4>
+  1181158710U,	// <6,4,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2254900605U,	// <6,4,6,6>: Cost 3 vmrghw <6,6,6,6>, <4,6,5,6>
+  3787772750U,	// <6,4,6,7>: Cost 4 vsldoi8 <4,u,6,4>, <6,7,0,1>
+  1181158953U,	// <6,4,6,u>: Cost 2 vmrghw <6,6,6,6>, RHS
+  2639495270U,	// <6,4,7,0>: Cost 3 vsldoi4 <3,6,4,7>, LHS
+  2639496090U,	// <6,4,7,1>: Cost 3 vsldoi4 <3,6,4,7>, <1,2,3,4>
+  3707267011U,	// <6,4,7,2>: Cost 4 vsldoi4 <2,6,4,7>, <2,6,4,7>
+  2639497884U,	// <6,4,7,3>: Cost 3 vsldoi4 <3,6,4,7>, <3,6,4,7>
+  1237658832U,	// <6,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235666638U,	// <6,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  3713241753U,	// <6,4,7,6>: Cost 4 vsldoi4 <3,6,4,7>, <6,4,7,0>
+  2309409436U,	// <6,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235666641U,	// <6,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  2639503462U,	// <6,4,u,0>: Cost 3 vsldoi4 <3,6,4,u>, LHS
+  2639504282U,	// <6,4,u,1>: Cost 3 vsldoi4 <3,6,4,u>, <1,2,3,4>
+  3701303226U,	// <6,4,u,2>: Cost 4 vsldoi4 <1,6,4,u>, <2,6,3,7>
+  2639506077U,	// <6,4,u,3>: Cost 3 vsldoi4 <3,6,4,u>, <3,6,4,u>
+  1235676368U,	// <6,4,u,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235674830U,	// <6,4,u,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  1698713129U,	// <6,4,u,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2309417628U,	// <6,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1698713147U,	// <6,4,u,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  3775832064U,	// <6,5,0,0>: Cost 4 vsldoi8 <2,u,6,5>, <0,0,0,0>
+  2702090342U,	// <6,5,0,1>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3775832241U,	// <6,5,0,2>: Cost 4 vsldoi8 <2,u,6,5>, <0,2,1,6>
+  3719227906U,	// <6,5,0,3>: Cost 4 vsldoi4 <4,6,5,0>, <3,4,5,6>
+  3775832402U,	// <6,5,0,4>: Cost 4 vsldoi8 <2,u,6,5>, <0,4,1,5>
+  3385085146U,	// <6,5,0,5>: Cost 4 vmrglw <4,u,6,0>, <4,4,5,5>
+  2309351938U,	// <6,5,0,6>: Cost 3 vmrglw <4,5,6,0>, <3,4,5,6>
+  3376459134U,	// <6,5,0,7>: Cost 5 vmrglw <3,4,6,0>, <4,6,5,7>
+  2702090909U,	// <6,5,0,u>: Cost 3 vsldoi8 <2,u,6,5>, LHS
+  3719233546U,	// <6,5,1,0>: Cost 4 vsldoi4 <4,6,5,1>, <0,0,1,1>
+  3775832884U,	// <6,5,1,1>: Cost 4 vsldoi8 <2,u,6,5>, <1,1,1,1>
+  3775832982U,	// <6,5,1,2>: Cost 4 vsldoi8 <2,u,6,5>, <1,2,3,0>
+  3846196909U,	// <6,5,1,3>: Cost 4 vsldoi12 <3,4,5,6>, <5,1,3,4>
+  3719236984U,	// <6,5,1,4>: Cost 4 vsldoi4 <4,6,5,1>, <4,6,5,1>
+  3856150209U,	// <6,5,1,5>: Cost 4 vsldoi12 <5,1,5,6>, <5,1,5,6>
+  3834252997U,	// <6,5,1,6>: Cost 4 vsldoi12 <1,4,5,6>, <5,1,6,1>
+  3870084817U,	// <6,5,1,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,1,7,4>
+  3769861532U,	// <6,5,1,u>: Cost 4 vsldoi8 <1,u,6,5>, <1,u,6,5>
+  2645500006U,	// <6,5,2,0>: Cost 3 vsldoi4 <4,6,5,2>, LHS
+  3719242548U,	// <6,5,2,1>: Cost 4 vsldoi4 <4,6,5,2>, <1,1,1,1>
+  3775833704U,	// <6,5,2,2>: Cost 4 vsldoi8 <2,u,6,5>, <2,2,2,2>
+  3775833766U,	// <6,5,2,3>: Cost 4 vsldoi8 <2,u,6,5>, <2,3,0,1>
+  2645503353U,	// <6,5,2,4>: Cost 3 vsldoi4 <4,6,5,2>, <4,6,5,2>
+  2252296196U,	// <6,5,2,5>: Cost 3 vmrghw <6,2,7,3>, <5,5,5,5>
+  2702092218U,	// <6,5,2,6>: Cost 3 vsldoi8 <2,u,6,5>, <2,6,3,7>
+  3719246842U,	// <6,5,2,7>: Cost 4 vsldoi4 <4,6,5,2>, <7,0,1,2>
+  2702092405U,	// <6,5,2,u>: Cost 3 vsldoi8 <2,u,6,5>, <2,u,6,5>
+  3775834262U,	// <6,5,3,0>: Cost 4 vsldoi8 <2,u,6,5>, <3,0,1,2>
+  3777161495U,	// <6,5,3,1>: Cost 4 vsldoi8 <3,1,6,5>, <3,1,6,5>
+  3775834470U,	// <6,5,3,2>: Cost 4 vsldoi8 <2,u,6,5>, <3,2,6,3>
+  3775834524U,	// <6,5,3,3>: Cost 4 vsldoi8 <2,u,6,5>, <3,3,3,3>
+  3775834626U,	// <6,5,3,4>: Cost 4 vsldoi8 <2,u,6,5>, <3,4,5,6>
+  3385109722U,	// <6,5,3,5>: Cost 4 vmrglw <4,u,6,3>, <4,4,5,5>
+  2309376514U,	// <6,5,3,6>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3775834819U,	// <6,5,3,7>: Cost 4 vsldoi8 <2,u,6,5>, <3,7,0,1>
+  2309376514U,	// <6,5,3,u>: Cost 3 vmrglw <4,5,6,3>, <3,4,5,6>
+  3719258214U,	// <6,5,4,0>: Cost 4 vsldoi4 <4,6,5,4>, LHS
+  3385117586U,	// <6,5,4,1>: Cost 4 vmrglw <4,u,6,4>, <4,0,5,1>
+  3327242008U,	// <6,5,4,2>: Cost 4 vmrghw <6,4,5,6>, <5,2,6,3>
+  3719260674U,	// <6,5,4,3>: Cost 4 vsldoi4 <4,6,5,4>, <3,4,5,6>
+  3719261563U,	// <6,5,4,4>: Cost 4 vsldoi4 <4,6,5,4>, <4,6,5,4>
+  2702093622U,	// <6,5,4,5>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  2309384706U,	// <6,5,4,6>: Cost 3 vmrglw <4,5,6,4>, <3,4,5,6>
+  3870085060U,	// <6,5,4,7>: Cost 4 vsldoi12 <7,4,5,6>, <5,4,7,4>
+  2702093865U,	// <6,5,4,u>: Cost 3 vsldoi8 <2,u,6,5>, RHS
+  3719266406U,	// <6,5,5,0>: Cost 4 vsldoi4 <4,6,5,5>, LHS
+  3789106889U,	// <6,5,5,1>: Cost 4 vsldoi8 <5,1,6,5>, <5,1,6,5>
+  3785789208U,	// <6,5,5,2>: Cost 4 vsldoi8 <4,5,6,5>, <5,2,6,3>
+  3373183950U,	// <6,5,5,3>: Cost 4 vmrglw <2,u,6,5>, <6,u,5,3>
+  2717355964U,	// <6,5,5,4>: Cost 3 vsldoi8 <5,4,6,5>, <5,4,6,5>
+  2791772164U,	// <6,5,5,5>: Cost 3 vsldoi12 <6,6,6,6>, <5,5,5,5>
+  2772455438U,	// <6,5,5,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,5,6,6>
+  3373183549U,	// <6,5,5,7>: Cost 4 vmrglw <2,u,6,5>, <6,3,5,7>
+  2720010496U,	// <6,5,5,u>: Cost 3 vsldoi8 <5,u,6,5>, <5,u,6,5>
+  2772455460U,	// <6,5,6,0>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,0,1>
+  2322008978U,	// <6,5,6,1>: Cost 3 vmrglw <6,6,6,6>, <4,0,5,1>
+  3840225335U,	// <6,5,6,2>: Cost 4 vsldoi12 <2,4,5,6>, <5,6,2,2>
+  2772455490U,	// <6,5,6,3>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,3,4>
+  2772455500U,	// <6,5,6,4>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,4,5>
+  2254901252U,	// <6,5,6,5>: Cost 3 vmrghw <6,6,6,6>, <5,5,5,5>
+  2772455520U,	// <6,5,6,6>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,6,7>
+  2785874024U,	// <6,5,6,7>: Cost 3 vsldoi12 <5,6,7,6>, <5,6,7,6>
+  2772455532U,	// <6,5,6,u>: Cost 3 vsldoi12 <3,4,5,6>, <5,6,u,1>
+  2627625062U,	// <6,5,7,0>: Cost 3 vsldoi4 <1,6,5,7>, LHS
+  1235667858U,	// <6,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309409278U,	// <6,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309407659U,	// <6,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627628342U,	// <6,5,7,4>: Cost 3 vsldoi4 <1,6,5,7>, RHS
+  1235668186U,	// <6,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235667458U,	// <6,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309407987U,	// <6,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235667460U,	// <6,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2627633254U,	// <6,5,u,0>: Cost 3 vsldoi4 <1,6,5,u>, LHS
+  1235676050U,	// <6,5,u,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309417470U,	// <6,5,u,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309415851U,	// <6,5,u,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2627636534U,	// <6,5,u,4>: Cost 3 vsldoi4 <1,6,5,u>, RHS
+  1235676378U,	// <6,5,u,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235675650U,	// <6,5,u,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309416179U,	// <6,5,u,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235675652U,	// <6,5,u,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  2309352751U,	// <6,6,0,0>: Cost 3 vmrglw <4,5,6,0>, <4,5,6,0>
+  1650917478U,	// <6,6,0,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  2250584570U,	// <6,6,0,2>: Cost 3 vmrghw <6,0,2,1>, <6,2,7,3>
+  3846197554U,	// <6,6,0,3>: Cost 4 vsldoi12 <3,4,5,6>, <6,0,3,1>
+  2724659538U,	// <6,6,0,4>: Cost 3 vsldoi8 <6,6,6,6>, <0,4,1,5>
+  3725275225U,	// <6,6,0,5>: Cost 4 vsldoi4 <5,6,6,0>, <5,6,6,0>
+  2791772493U,	// <6,6,0,6>: Cost 3 vsldoi12 <6,6,6,6>, <6,0,6,1>
+  2309352758U,	// <6,6,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  1650918045U,	// <6,6,0,u>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  3325358368U,	// <6,6,1,0>: Cost 4 vmrghw <6,1,7,1>, <6,0,1,1>
+  2299406449U,	// <6,6,1,1>: Cost 3 vmrglw <2,u,6,1>, <2,u,6,1>
+  2724660118U,	// <6,6,1,2>: Cost 3 vsldoi8 <6,6,6,6>, <1,2,3,0>
+  3373148518U,	// <6,6,1,3>: Cost 4 vmrglw <2,u,6,1>, <3,2,6,3>
+  3834253712U,	// <6,6,1,4>: Cost 4 vsldoi12 <1,4,5,6>, <6,1,4,5>
+  3373147953U,	// <6,6,1,5>: Cost 4 vmrglw <2,u,6,1>, <2,4,6,5>
+  2323297080U,	// <6,6,1,6>: Cost 3 vmrglw <6,u,6,1>, <6,6,6,6>
+  2299407670U,	// <6,6,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2299407671U,	// <6,6,1,u>: Cost 3 vmrglw <2,u,6,1>, RHS
+  2252296489U,	// <6,6,2,0>: Cost 3 vmrghw <6,2,7,3>, <6,0,2,1>
+  3326038394U,	// <6,6,2,1>: Cost 4 vmrghw <6,2,7,3>, <6,1,2,1>
+  1178554874U,	// <6,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724660902U,	// <6,6,2,3>: Cost 3 vsldoi8 <6,6,6,6>, <2,3,0,1>
+  2252296817U,	// <6,6,2,4>: Cost 3 vmrghw <6,2,7,3>, <6,4,2,5>
+  3840741864U,	// <6,6,2,5>: Cost 4 vsldoi12 <2,5,3,6>, <6,2,5,3>
+  2252296976U,	// <6,6,2,6>: Cost 3 vmrghw <6,2,7,3>, <6,6,2,2>
+  2785874426U,	// <6,6,2,7>: Cost 3 vsldoi12 <5,6,7,6>, <6,2,7,3>
+  1178554874U,	// <6,6,2,u>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2724661398U,	// <6,6,3,0>: Cost 3 vsldoi8 <6,6,6,6>, <3,0,1,2>
+  3375154665U,	// <6,6,3,1>: Cost 4 vmrglw <3,2,6,3>, <2,0,6,1>
+  3375154909U,	// <6,6,3,2>: Cost 4 vmrglw <3,2,6,3>, <2,3,6,2>
+  2301413734U,	// <6,6,3,3>: Cost 3 vmrglw <3,2,6,3>, <3,2,6,3>
+  2772455986U,	// <6,6,3,4>: Cost 3 vsldoi12 <3,4,5,6>, <6,3,4,5>
+  3375154993U,	// <6,6,3,5>: Cost 4 vmrglw <3,2,6,3>, <2,4,6,5>
+  2323313464U,	// <6,6,3,6>: Cost 3 vmrglw <6,u,6,3>, <6,6,6,6>
+  2301414710U,	// <6,6,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2301414711U,	// <6,6,3,u>: Cost 3 vmrglw <3,2,6,3>, RHS
+  2724662162U,	// <6,6,4,0>: Cost 3 vsldoi8 <6,6,6,6>, <4,0,5,1>
+  3326939559U,	// <6,6,4,1>: Cost 4 vmrghw <6,4,1,5>, <6,1,7,1>
+  2253271546U,	// <6,6,4,2>: Cost 3 vmrghw <6,4,2,5>, <6,2,7,3>
+  3383127346U,	// <6,6,4,3>: Cost 4 vmrglw <4,5,6,4>, <4,5,6,3>
+  2309385523U,	// <6,6,4,4>: Cost 3 vmrglw <4,5,6,4>, <4,5,6,4>
+  1650920758U,	// <6,6,4,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  2724662653U,	// <6,6,4,6>: Cost 3 vsldoi8 <6,6,6,6>, <4,6,5,6>
+  2309385526U,	// <6,6,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  1650921001U,	// <6,6,4,u>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  3725312102U,	// <6,6,5,0>: Cost 4 vsldoi4 <5,6,6,5>, LHS
+  3373180393U,	// <6,6,5,1>: Cost 4 vmrglw <2,u,6,5>, <2,0,6,1>
+  3791769368U,	// <6,6,5,2>: Cost 4 vsldoi8 <5,5,6,6>, <5,2,6,3>
+  3373181286U,	// <6,6,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,6,3>
+  3725315382U,	// <6,6,5,4>: Cost 4 vsldoi4 <5,6,6,5>, RHS
+  2299439221U,	// <6,6,5,5>: Cost 3 vmrglw <2,u,6,5>, <2,u,6,5>
+  2724663394U,	// <6,6,5,6>: Cost 3 vsldoi8 <6,6,6,6>, <5,6,7,0>
+  2299440438U,	// <6,6,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  2299440439U,	// <6,6,5,u>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1583808614U,	// <6,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <6,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2254574074U,	// <6,6,6,2>: Cost 3 vmrghw <6,6,2,2>, <6,2,7,3>
+  2322010609U,	// <6,6,6,3>: Cost 3 vmrglw <6,6,6,6>, <6,2,6,3>
+  1583811894U,	// <6,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2322010773U,	// <6,6,6,5>: Cost 3 vmrglw <6,6,6,6>, <6,4,6,5>
+  363253046U,	// <6,6,6,6>: Cost 1 vspltisw2 RHS
+  1248267574U,	// <6,6,6,7>: Cost 2 vmrglw <6,6,6,6>, RHS
+  363253046U,	// <6,6,6,u>: Cost 1 vspltisw2 RHS
+  2309410095U,	// <6,6,7,0>: Cost 3 vmrglw RHS, <4,5,6,0>
+  2309408233U,	// <6,6,7,1>: Cost 3 vmrglw RHS, <2,0,6,1>
+  2311402373U,	// <6,6,7,2>: Cost 3 vmrglw RHS, <6,7,6,2>
+  2309409126U,	// <6,6,7,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  2309410099U,	// <6,6,7,4>: Cost 3 vmrglw RHS, <4,5,6,4>
+  2309408561U,	// <6,6,7,5>: Cost 3 vmrglw RHS, <2,4,6,5>
+  1237660472U,	// <6,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  161926454U,	// <6,6,7,7>: Cost 1 vmrglw RHS, RHS
+  161926455U,	// <6,6,7,u>: Cost 1 vmrglw RHS, RHS
+  1583808614U,	// <6,6,u,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1650923310U,	// <6,6,u,1>: Cost 2 vsldoi8 <6,6,6,6>, LHS
+  1178554874U,	// <6,6,u,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2309417318U,	// <6,6,u,3>: Cost 3 vmrglw RHS, <3,2,6,3>
+  1583811894U,	// <6,6,u,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1650923674U,	// <6,6,u,5>: Cost 2 vsldoi8 <6,6,6,6>, RHS
+  363253046U,	// <6,6,u,6>: Cost 1 vspltisw2 RHS
+  161934646U,	// <6,6,u,7>: Cost 1 vmrglw RHS, RHS
+  161934647U,	// <6,6,u,u>: Cost 1 vmrglw RHS, RHS
+  1638318080U,	// <6,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564576358U,	// <6,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712060077U,	// <6,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712060156U,	// <6,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638318418U,	// <6,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577865314U,	// <6,7,0,5>: Cost 2 vsldoi4 <5,6,7,0>, <5,6,7,0>
+  2712060406U,	// <6,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2651608058U,	// <6,7,0,7>: Cost 3 vsldoi4 <5,6,7,0>, <7,0,1,2>
+  564576925U,	// <6,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712060643U,	// <6,7,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638318900U,	// <6,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638318998U,	// <6,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  3766559753U,	// <6,7,1,3>: Cost 4 vsldoi8 <1,3,6,7>, <1,3,6,7>
+  2712060971U,	// <6,7,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712061039U,	// <6,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712061135U,	// <6,7,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  3373148612U,	// <6,7,1,7>: Cost 4 vmrglw <2,u,6,1>, <3,3,7,7>
+  1638319484U,	// <6,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712061373U,	// <6,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712061471U,	// <6,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638319720U,	// <6,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638319782U,	// <6,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712061709U,	// <6,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712061800U,	// <6,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1638320058U,	// <6,7,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252297836U,	// <6,7,2,7>: Cost 3 vmrghw <6,2,7,3>, <7,7,7,7>
+  1638320187U,	// <6,7,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638320278U,	// <6,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712062182U,	// <6,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2712062256U,	// <6,7,3,2>: Cost 3 vsldoi8 RHS, <3,2,0,3>
+  1638320540U,	// <6,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638320642U,	// <6,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712062546U,	// <6,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712062584U,	// <6,7,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2712062659U,	// <6,7,3,7>: Cost 3 vsldoi8 RHS, <3,7,0,1>
+  1638320926U,	// <6,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638321042U,	// <6,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712062922U,	// <6,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712063029U,	// <6,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712063108U,	// <6,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638321360U,	// <6,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564579638U,	// <6,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712063357U,	// <6,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,6>
+  2712063439U,	// <6,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,7>
+  564579881U,	// <6,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712063560U,	// <6,7,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714054287U,	// <6,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712063742U,	// <6,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  3373181295U,	// <6,7,5,3>: Cost 4 vmrglw <2,u,6,5>, <3,2,7,3>
+  2712063924U,	// <6,7,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638322180U,	// <6,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638322274U,	// <6,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  3373181380U,	// <6,7,5,7>: Cost 4 vmrglw <2,u,6,5>, <3,3,7,7>
+  1640313092U,	// <6,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712064289U,	// <6,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712064423U,	// <6,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638322682U,	// <6,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712064562U,	// <6,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712064653U,	// <6,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712064747U,	// <6,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638323000U,	// <6,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638323022U,	// <6,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638323168U,	// <6,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,7,3>
+  1237659746U,	// <6,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309411158U,	// <6,7,7,1>: Cost 3 vmrglw RHS, <6,0,7,1>
+  2639718330U,	// <6,7,7,2>: Cost 3 vsldoi4 <3,6,7,7>, <2,6,3,7>
+  1235669498U,	// <6,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1237659750U,	// <6,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309411243U,	// <6,7,7,5>: Cost 3 vmrglw RHS, <6,1,7,5>
+  1583895362U,	// <6,7,7,6>: Cost 2 vsldoi4 <6,6,7,7>, <6,6,7,7>
+  1235669826U,	// <6,7,7,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  1235669503U,	// <6,7,7,u>: Cost 2 vmrglw RHS, <6,2,7,u>
+  1638323923U,	// <6,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564582190U,	// <6,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638324101U,	// <6,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638324156U,	// <6,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638324287U,	// <6,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564582554U,	// <6,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638324432U,	// <6,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  1235678018U,	// <6,7,u,7>: Cost 2 vmrglw RHS, <6,6,7,7>
+  564582757U,	// <6,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  1638326272U,	// <6,u,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564584550U,	// <6,u,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712068269U,	// <6,u,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2309349532U,	// <6,u,0,3>: Cost 3 vmrglw <4,5,6,0>, LHS
+  1638326610U,	// <6,u,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1577939051U,	// <6,u,0,5>: Cost 2 vsldoi4 <5,6,u,0>, <5,6,u,0>
+  2712068598U,	// <6,u,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2309352776U,	// <6,u,0,7>: Cost 3 vmrglw <4,5,6,0>, RHS
+  564585117U,	// <6,u,0,u>: Cost 1 vsldoi8 RHS, LHS
+  2712068835U,	// <6,u,1,0>: Cost 3 vsldoi8 RHS, <1,0,1,1>
+  1638327092U,	// <6,u,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1698715438U,	// <6,u,1,2>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2299404444U,	// <6,u,1,3>: Cost 3 vmrglw <2,u,6,1>, LHS
+  2712069163U,	// <6,u,1,4>: Cost 3 vsldoi8 RHS, <1,4,1,5>
+  2712069231U,	// <6,u,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2712069327U,	// <6,u,1,6>: Cost 3 vsldoi8 RHS, <1,6,1,7>
+  2299407688U,	// <6,u,1,7>: Cost 3 vmrglw <2,u,6,1>, RHS
+  1698715492U,	// <6,u,1,u>: Cost 2 vsldoi12 <3,4,5,6>, LHS
+  2712069565U,	// <6,u,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  1178556206U,	// <6,u,2,1>: Cost 2 vmrghw <6,2,7,3>, LHS
+  1638327912U,	// <6,u,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638327974U,	// <6,u,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712069901U,	// <6,u,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  1178556570U,	// <6,u,2,5>: Cost 2 vmrghw <6,2,7,3>, RHS
+  1638328250U,	// <6,u,2,6>: Cost 2 vsldoi8 RHS, <2,6,3,7>
+  2252298496U,	// <6,u,2,7>: Cost 3 vmrghw <6,2,7,3>, <u,7,0,1>
+  1638328379U,	// <6,u,2,u>: Cost 2 vsldoi8 RHS, <2,u,0,1>
+  1638328470U,	// <6,u,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712070374U,	// <6,u,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2704107883U,	// <6,u,3,2>: Cost 3 vsldoi8 <3,2,6,u>, <3,2,6,u>
+  1638328732U,	// <6,u,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638328834U,	// <6,u,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712070738U,	// <6,u,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  2712070776U,	// <6,u,3,6>: Cost 3 vsldoi8 RHS, <3,6,0,7>
+  2301414728U,	// <6,u,3,7>: Cost 3 vmrglw <3,2,6,3>, RHS
+  1638329118U,	// <6,u,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638329234U,	// <6,u,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712071114U,	// <6,u,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712071221U,	// <6,u,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2309382300U,	// <6,u,4,3>: Cost 3 vmrglw <4,5,6,4>, LHS
+  1638329552U,	// <6,u,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564587831U,	// <6,u,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712071545U,	// <6,u,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2309385544U,	// <6,u,4,7>: Cost 3 vmrglw <4,5,6,4>, RHS
+  564588073U,	// <6,u,4,u>: Cost 1 vsldoi8 RHS, RHS
+  2712071752U,	// <6,u,5,0>: Cost 3 vsldoi8 RHS, <5,0,1,2>
+  2714062479U,	// <6,u,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712071934U,	// <6,u,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2299437212U,	// <6,u,5,3>: Cost 3 vmrglw <2,u,6,5>, LHS
+  2712072116U,	// <6,u,5,4>: Cost 3 vsldoi8 RHS, <5,4,5,6>
+  1638330372U,	// <6,u,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1698715802U,	// <6,u,5,6>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  2299440456U,	// <6,u,5,7>: Cost 3 vmrglw <2,u,6,5>, RHS
+  1698715820U,	// <6,u,5,u>: Cost 2 vsldoi12 <3,4,5,6>, RHS
+  1583808614U,	// <6,u,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  1181161262U,	// <6,u,6,1>: Cost 2 vmrghw <6,6,6,6>, LHS
+  1638330874U,	// <6,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1248264348U,	// <6,u,6,3>: Cost 2 vmrglw <6,6,6,6>, LHS
+  1583811894U,	// <6,u,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  1181161626U,	// <6,u,6,5>: Cost 2 vmrghw <6,6,6,6>, RHS
+  363253046U,	// <6,u,6,6>: Cost 1 vspltisw2 RHS
+  1638331214U,	// <6,u,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  363253046U,	// <6,u,6,u>: Cost 1 vspltisw2 RHS
+  1560076390U,	// <6,u,7,0>: Cost 2 vsldoi4 <2,6,u,7>, LHS
+  1235664969U,	// <6,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1560078311U,	// <6,u,7,2>: Cost 2 vsldoi4 <2,6,u,7>, <2,6,u,7>
+  161923228U,	// <6,u,7,3>: Cost 1 vmrglw RHS, LHS
+  1560079670U,	// <6,u,7,4>: Cost 2 vsldoi4 <2,6,u,7>, RHS
+  1235665297U,	// <6,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235667485U,	// <6,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  161926472U,	// <6,u,7,7>: Cost 1 vmrglw RHS, RHS
+  161923233U,	// <6,u,7,u>: Cost 1 vmrglw RHS, LHS
+  1560084582U,	// <6,u,u,0>: Cost 2 vsldoi4 <2,6,u,u>, LHS
+  564590382U,	// <6,u,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1560086504U,	// <6,u,u,2>: Cost 2 vsldoi4 <2,6,u,u>, <2,6,u,u>
+  161931420U,	// <6,u,u,3>: Cost 1 vmrglw RHS, LHS
+  1560087862U,	// <6,u,u,4>: Cost 2 vsldoi4 <2,6,u,u>, RHS
+  564590746U,	// <6,u,u,5>: Cost 1 vsldoi8 RHS, RHS
+  363253046U,	// <6,u,u,6>: Cost 1 vspltisw2 RHS
+  161934664U,	// <6,u,u,7>: Cost 1 vmrglw RHS, RHS
+  161931425U,	// <6,u,u,u>: Cost 1 vmrglw RHS, LHS
+  1705426944U,	// <7,0,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705426954U,	// <7,0,0,1>: Cost 2 vsldoi12 RHS, <0,0,1,1>
+  3713550266U,	// <7,0,0,2>: Cost 4 vsldoi4 <3,7,0,0>, <2,6,3,7>
+  2316063892U,	// <7,0,0,3>: Cost 3 vmrglw <5,6,7,0>, <7,2,0,3>
+  2779168805U,	// <7,0,0,4>: Cost 3 vsldoi12 RHS, <0,0,4,1>
+  2663698530U,	// <7,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2657727309U,	// <7,0,0,6>: Cost 3 vsldoi4 <6,7,0,0>, <6,7,0,0>
+  2316064220U,	// <7,0,0,7>: Cost 3 vmrglw <5,6,7,0>, <7,6,0,7>
+  1705427017U,	// <7,0,0,u>: Cost 2 vsldoi12 RHS, <0,0,u,1>
+  1583988838U,	// <7,0,1,0>: Cost 2 vsldoi4 <6,7,0,1>, LHS
+  2779168859U,	// <7,0,1,1>: Cost 3 vsldoi12 RHS, <0,1,1,1>
+  631685222U,	// <7,0,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2639817411U,	// <7,0,1,3>: Cost 3 vsldoi4 <3,7,0,1>, <3,7,0,1>
+  1583992118U,	// <7,0,1,4>: Cost 2 vsldoi4 <6,7,0,1>, RHS
+  2657734660U,	// <7,0,1,5>: Cost 3 vsldoi4 <6,7,0,1>, <5,5,5,5>
+  1583993678U,	// <7,0,1,6>: Cost 2 vsldoi4 <6,7,0,1>, <6,7,0,1>
+  2657735672U,	// <7,0,1,7>: Cost 3 vsldoi4 <6,7,0,1>, <7,0,1,0>
+  631685276U,	// <7,0,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779168933U,	// <7,0,2,0>: Cost 3 vsldoi12 RHS, <0,2,0,3>
+  2767667377U,	// <7,0,2,1>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,1,6>
+  2718713448U,	// <7,0,2,2>: Cost 3 vsldoi8 <5,6,7,0>, <2,2,2,2>
+  2718713510U,	// <7,0,2,3>: Cost 3 vsldoi8 <5,6,7,0>, <2,3,0,1>
+  3841409228U,	// <7,0,2,4>: Cost 4 vsldoi12 <2,6,3,7>, <0,2,4,6>
+  3852910802U,	// <7,0,2,5>: Cost 4 vsldoi12 RHS, <0,2,5,3>
+  2718713786U,	// <7,0,2,6>: Cost 3 vsldoi8 <5,6,7,0>, <2,6,3,7>
+  3847160036U,	// <7,0,2,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,2,7,3>
+  2767667440U,	// <7,0,2,u>: Cost 3 vsldoi12 <2,6,3,7>, <0,2,u,6>
+  2718714006U,	// <7,0,3,0>: Cost 3 vsldoi8 <5,6,7,0>, <3,0,1,2>
+  2779169020U,	// <7,0,3,1>: Cost 3 vsldoi12 RHS, <0,3,1,0>
+  3852910853U,	// <7,0,3,2>: Cost 4 vsldoi12 RHS, <0,3,2,0>
+  2718714268U,	// <7,0,3,3>: Cost 3 vsldoi8 <5,6,7,0>, <3,3,3,3>
+  2718714370U,	// <7,0,3,4>: Cost 3 vsldoi8 <5,6,7,0>, <3,4,5,6>
+  2718714461U,	// <7,0,3,5>: Cost 3 vsldoi8 <5,6,7,0>, <3,5,6,7>
+  2706770608U,	// <7,0,3,6>: Cost 3 vsldoi8 <3,6,7,0>, <3,6,7,0>
+  3847160114U,	// <7,0,3,7>: Cost 4 vsldoi12 <3,6,0,7>, <0,3,7,0>
+  2779169083U,	// <7,0,3,u>: Cost 3 vsldoi12 RHS, <0,3,u,0>
+  2718714770U,	// <7,0,4,0>: Cost 3 vsldoi8 <5,6,7,0>, <4,0,5,1>
+  1705427282U,	// <7,0,4,1>: Cost 2 vsldoi12 RHS, <0,4,1,5>
+  3713583034U,	// <7,0,4,2>: Cost 4 vsldoi4 <3,7,0,4>, <2,6,3,7>
+  3713583814U,	// <7,0,4,3>: Cost 4 vsldoi4 <3,7,0,4>, <3,7,0,4>
+  2779169133U,	// <7,0,4,4>: Cost 3 vsldoi12 RHS, <0,4,4,5>
+  1644973366U,	// <7,0,4,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  2657760081U,	// <7,0,4,6>: Cost 3 vsldoi4 <6,7,0,4>, <6,7,0,4>
+  2259468868U,	// <7,0,4,7>: Cost 3 vmrghw <7,4,5,6>, <0,7,1,4>
+  1705427345U,	// <7,0,4,u>: Cost 2 vsldoi12 RHS, <0,4,u,5>
+  2718715508U,	// <7,0,5,0>: Cost 3 vsldoi8 <5,6,7,0>, <5,0,6,1>
+  2260123750U,	// <7,0,5,1>: Cost 3 vmrghw <7,5,5,5>, LHS
+  3792457451U,	// <7,0,5,2>: Cost 4 vsldoi8 <5,6,7,0>, <5,2,1,3>
+  3852911024U,	// <7,0,5,3>: Cost 4 vsldoi12 RHS, <0,5,3,0>
+  2718715836U,	// <7,0,5,4>: Cost 3 vsldoi8 <5,6,7,0>, <5,4,6,5>
+  2718715908U,	// <7,0,5,5>: Cost 3 vsldoi8 <5,6,7,0>, <5,5,5,5>
+  1644974178U,	// <7,0,5,6>: Cost 2 vsldoi8 <5,6,7,0>, <5,6,7,0>
+  3792457853U,	// <7,0,5,7>: Cost 4 vsldoi8 <5,6,7,0>, <5,7,1,0>
+  1646301444U,	// <7,0,5,u>: Cost 2 vsldoi8 <5,u,7,0>, <5,u,7,0>
+  2720706901U,	// <7,0,6,0>: Cost 3 vsldoi8 <6,0,7,0>, <6,0,7,0>
+  2779169270U,	// <7,0,6,1>: Cost 3 vsldoi12 RHS, <0,6,1,7>
+  2718716410U,	// <7,0,6,2>: Cost 3 vsldoi8 <5,6,7,0>, <6,2,7,3>
+  2722697800U,	// <7,0,6,3>: Cost 3 vsldoi8 <6,3,7,0>, <6,3,7,0>
+  3852911121U,	// <7,0,6,4>: Cost 4 vsldoi12 RHS, <0,6,4,7>
+  3852911130U,	// <7,0,6,5>: Cost 4 vsldoi12 RHS, <0,6,5,7>
+  2718716728U,	// <7,0,6,6>: Cost 3 vsldoi8 <5,6,7,0>, <6,6,6,6>
+  2718716750U,	// <7,0,6,7>: Cost 3 vsldoi8 <5,6,7,0>, <6,7,0,1>
+  2779169333U,	// <7,0,6,u>: Cost 3 vsldoi12 RHS, <0,6,u,7>
+  2718716922U,	// <7,0,7,0>: Cost 3 vsldoi8 <5,6,7,0>, <7,0,1,2>
+  1187872870U,	// <7,0,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2718717076U,	// <7,0,7,2>: Cost 3 vsldoi8 <5,6,7,0>, <7,2,0,3>
+  3847160408U,	// <7,0,7,3>: Cost 4 vsldoi12 <3,6,0,7>, <0,7,3,6>
+  2718717286U,	// <7,0,7,4>: Cost 3 vsldoi8 <5,6,7,0>, <7,4,5,6>
+  2718717377U,	// <7,0,7,5>: Cost 3 vsldoi8 <5,6,7,0>, <7,5,6,7>
+  2718717404U,	// <7,0,7,6>: Cost 3 vsldoi8 <5,6,7,0>, <7,6,0,7>
+  2718717478U,	// <7,0,7,7>: Cost 3 vsldoi8 <5,6,7,0>, <7,7,0,0>
+  1187873437U,	// <7,0,7,u>: Cost 2 vmrghw <7,7,7,7>, LHS
+  1584046182U,	// <7,0,u,0>: Cost 2 vsldoi4 <6,7,0,u>, LHS
+  1705427602U,	// <7,0,u,1>: Cost 2 vsldoi12 RHS, <0,u,1,1>
+  631685789U,	// <7,0,u,2>: Cost 1 vsldoi12 RHS, LHS
+  2639874762U,	// <7,0,u,3>: Cost 3 vsldoi4 <3,7,0,u>, <3,7,0,u>
+  1584049462U,	// <7,0,u,4>: Cost 2 vsldoi4 <6,7,0,u>, RHS
+  1644976282U,	// <7,0,u,5>: Cost 2 vsldoi8 <5,6,7,0>, RHS
+  1584051029U,	// <7,0,u,6>: Cost 2 vsldoi4 <6,7,0,u>, <6,7,0,u>
+  2718718208U,	// <7,0,u,7>: Cost 3 vsldoi8 <5,6,7,0>, <u,7,0,1>
+  631685843U,	// <7,0,u,u>: Cost 1 vsldoi12 RHS, LHS
+  2721374218U,	// <7,1,0,0>: Cost 3 vsldoi8 <6,1,7,1>, <0,0,1,1>
+  2779169507U,	// <7,1,0,1>: Cost 3 vsldoi12 RHS, <1,0,1,1>
+  2779169516U,	// <7,1,0,2>: Cost 3 vsldoi12 RHS, <1,0,2,1>
+  3852911348U,	// <7,1,0,3>: Cost 4 vsldoi12 RHS, <1,0,3,0>
+  2669743414U,	// <7,1,0,4>: Cost 3 vsldoi4 <u,7,1,0>, RHS
+  2316058962U,	// <7,1,0,5>: Cost 3 vmrglw <5,6,7,0>, <0,4,1,5>
+  2316059044U,	// <7,1,0,6>: Cost 3 vmrglw <5,6,7,0>, <0,5,1,6>
+  2669745146U,	// <7,1,0,7>: Cost 3 vsldoi4 <u,7,1,0>, <7,0,1,2>
+  2779169570U,	// <7,1,0,u>: Cost 3 vsldoi12 RHS, <1,0,u,1>
+  2779169579U,	// <7,1,1,0>: Cost 3 vsldoi12 RHS, <1,1,0,1>
+  1705427764U,	// <7,1,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169598U,	// <7,1,1,2>: Cost 3 vsldoi12 RHS, <1,1,2,2>
+  3713632972U,	// <7,1,1,3>: Cost 4 vsldoi4 <3,7,1,1>, <3,7,1,1>
+  2779169619U,	// <7,1,1,4>: Cost 3 vsldoi12 RHS, <1,1,4,5>
+  2779169628U,	// <7,1,1,5>: Cost 3 vsldoi12 RHS, <1,1,5,5>
+  2657809239U,	// <7,1,1,6>: Cost 3 vsldoi4 <6,7,1,1>, <6,7,1,1>
+  3835290474U,	// <7,1,1,7>: Cost 4 vsldoi12 <1,6,1,7>, <1,1,7,1>
+  1705427764U,	// <7,1,1,u>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779169660U,	// <7,1,2,0>: Cost 3 vsldoi12 RHS, <1,2,0,1>
+  2779169671U,	// <7,1,2,1>: Cost 3 vsldoi12 RHS, <1,2,1,3>
+  2779169680U,	// <7,1,2,2>: Cost 3 vsldoi12 RHS, <1,2,2,3>
+  1705427862U,	// <7,1,2,3>: Cost 2 vsldoi12 RHS, <1,2,3,0>
+  2779169700U,	// <7,1,2,4>: Cost 3 vsldoi12 RHS, <1,2,4,5>
+  2779169707U,	// <7,1,2,5>: Cost 3 vsldoi12 RHS, <1,2,5,3>
+  2657817432U,	// <7,1,2,6>: Cost 3 vsldoi4 <6,7,1,2>, <6,7,1,2>
+  2803057594U,	// <7,1,2,7>: Cost 3 vsldoi12 RHS, <1,2,7,0>
+  1705427907U,	// <7,1,2,u>: Cost 2 vsldoi12 RHS, <1,2,u,0>
+  3776538827U,	// <7,1,3,0>: Cost 4 vsldoi8 <3,0,7,1>, <3,0,7,1>
+  2319400970U,	// <7,1,3,1>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,1>
+  2316085398U,	// <7,1,3,2>: Cost 3 vmrglw <5,6,7,3>, <3,0,1,2>
+  3852911591U,	// <7,1,3,3>: Cost 4 vsldoi12 RHS, <1,3,3,0>
+  3852911600U,	// <7,1,3,4>: Cost 4 vsldoi12 RHS, <1,3,4,0>
+  2319401298U,	// <7,1,3,5>: Cost 3 vmrglw <6,2,7,3>, <0,4,1,5>
+  3833668617U,	// <7,1,3,6>: Cost 4 vsldoi12 <1,3,6,7>, <1,3,6,7>
+  3367265487U,	// <7,1,3,7>: Cost 4 vmrglw <1,u,7,3>, <1,6,1,7>
+  2319400977U,	// <7,1,3,u>: Cost 3 vmrglw <6,2,7,3>, <0,0,1,u>
+  2724031378U,	// <7,1,4,0>: Cost 3 vsldoi8 <6,5,7,1>, <4,0,5,1>
+  2779169835U,	// <7,1,4,1>: Cost 3 vsldoi12 RHS, <1,4,1,5>
+  2779169844U,	// <7,1,4,2>: Cost 3 vsldoi12 RHS, <1,4,2,5>
+  3852911672U,	// <7,1,4,3>: Cost 4 vsldoi12 RHS, <1,4,3,0>
+  2669776182U,	// <7,1,4,4>: Cost 3 vsldoi4 <u,7,1,4>, RHS
+  2779169872U,	// <7,1,4,5>: Cost 3 vsldoi12 RHS, <1,4,5,6>
+  3835290712U,	// <7,1,4,6>: Cost 4 vsldoi12 <1,6,1,7>, <1,4,6,5>
+  2669778278U,	// <7,1,4,7>: Cost 3 vsldoi4 <u,7,1,4>, <7,4,5,6>
+  2779169898U,	// <7,1,4,u>: Cost 3 vsldoi12 RHS, <1,4,u,5>
+  2779169903U,	// <7,1,5,0>: Cost 3 vsldoi12 RHS, <1,5,0,1>
+  3835585661U,	// <7,1,5,1>: Cost 4 vsldoi12 <1,6,5,7>, <1,5,1,6>
+  3841410182U,	// <7,1,5,2>: Cost 4 vsldoi12 <2,6,3,7>, <1,5,2,6>
+  3852911753U,	// <7,1,5,3>: Cost 4 vsldoi12 RHS, <1,5,3,0>
+  2779169943U,	// <7,1,5,4>: Cost 3 vsldoi12 RHS, <1,5,4,5>
+  2318754130U,	// <7,1,5,5>: Cost 3 vmrglw <6,1,7,5>, <0,4,1,5>
+  2718724195U,	// <7,1,5,6>: Cost 3 vsldoi8 <5,6,7,1>, <5,6,7,1>
+  3859178670U,	// <7,1,5,7>: Cost 4 vsldoi12 <5,6,1,7>, <1,5,7,1>
+  2779169975U,	// <7,1,5,u>: Cost 3 vsldoi12 RHS, <1,5,u,1>
+  2720715094U,	// <7,1,6,0>: Cost 3 vsldoi8 <6,0,7,1>, <6,0,7,1>
+  2761549007U,	// <7,1,6,1>: Cost 3 vsldoi12 <1,6,1,7>, <1,6,1,7>
+  2779170008U,	// <7,1,6,2>: Cost 3 vsldoi12 RHS, <1,6,2,7>
+  3835438305U,	// <7,1,6,3>: Cost 4 vsldoi12 <1,6,3,7>, <1,6,3,7>
+  3835512042U,	// <7,1,6,4>: Cost 4 vsldoi12 <1,6,4,7>, <1,6,4,7>
+  2761843955U,	// <7,1,6,5>: Cost 3 vsldoi12 <1,6,5,7>, <1,6,5,7>
+  3835659516U,	// <7,1,6,6>: Cost 4 vsldoi12 <1,6,6,7>, <1,6,6,7>
+  2803057918U,	// <7,1,6,7>: Cost 3 vsldoi12 RHS, <1,6,7,0>
+  2762065166U,	// <7,1,6,u>: Cost 3 vsldoi12 <1,6,u,7>, <1,6,u,7>
+  2669797478U,	// <7,1,7,0>: Cost 3 vsldoi4 <u,7,1,7>, LHS
+  2322087946U,	// <7,1,7,1>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,1>
+  2317448186U,	// <7,1,7,2>: Cost 3 vmrglw <5,u,7,7>, <7,0,1,2>
+  3395829934U,	// <7,1,7,3>: Cost 4 vmrglw <6,6,7,7>, <0,2,1,3>
+  2669800758U,	// <7,1,7,4>: Cost 3 vsldoi4 <u,7,1,7>, RHS
+  2322088274U,	// <7,1,7,5>: Cost 3 vmrglw <6,6,7,7>, <0,4,1,5>
+  3375923377U,	// <7,1,7,6>: Cost 4 vmrglw <3,3,7,7>, <0,2,1,6>
+  2731996780U,	// <7,1,7,7>: Cost 3 vsldoi8 <7,u,7,1>, <7,7,7,7>
+  2322087953U,	// <7,1,7,u>: Cost 3 vmrglw <6,6,7,7>, <0,0,1,u>
+  2779170146U,	// <7,1,u,0>: Cost 3 vsldoi12 RHS, <1,u,0,1>
+  1705427764U,	// <7,1,u,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  2779170164U,	// <7,1,u,2>: Cost 3 vsldoi12 RHS, <1,u,2,1>
+  1705428348U,	// <7,1,u,3>: Cost 2 vsldoi12 RHS, <1,u,3,0>
+  2779170186U,	// <7,1,u,4>: Cost 3 vsldoi12 RHS, <1,u,4,5>
+  2763171221U,	// <7,1,u,5>: Cost 3 vsldoi12 <1,u,5,7>, <1,u,5,7>
+  2657866590U,	// <7,1,u,6>: Cost 3 vsldoi4 <6,7,1,u>, <6,7,1,u>
+  2803058080U,	// <7,1,u,7>: Cost 3 vsldoi12 RHS, <1,u,7,0>
+  1705428393U,	// <7,1,u,u>: Cost 2 vsldoi12 RHS, <1,u,u,0>
+  3713695846U,	// <7,2,0,0>: Cost 4 vsldoi4 <3,7,2,0>, LHS
+  2779170237U,	// <7,2,0,1>: Cost 3 vsldoi12 RHS, <2,0,1,2>
+  2779170245U,	// <7,2,0,2>: Cost 3 vsldoi12 RHS, <2,0,2,1>
+  1242316902U,	// <7,2,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3713699126U,	// <7,2,0,4>: Cost 4 vsldoi4 <3,7,2,0>, RHS
+  3852912096U,	// <7,2,0,5>: Cost 4 vsldoi12 RHS, <2,0,5,1>
+  2767668713U,	// <7,2,0,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,0,6,1>
+  2256488426U,	// <7,2,0,7>: Cost 3 vmrghw <7,0,1,2>, <2,7,0,1>
+  1242316907U,	// <7,2,0,u>: Cost 2 vmrglw <5,6,7,0>, LHS
+  3852912132U,	// <7,2,1,0>: Cost 4 vsldoi12 RHS, <2,1,0,1>
+  3852912141U,	// <7,2,1,1>: Cost 4 vsldoi12 RHS, <2,1,1,1>
+  3852912149U,	// <7,2,1,2>: Cost 4 vsldoi12 RHS, <2,1,2,0>
+  2779170335U,	// <7,2,1,3>: Cost 3 vsldoi12 RHS, <2,1,3,1>
+  3852912172U,	// <7,2,1,4>: Cost 4 vsldoi12 RHS, <2,1,4,5>
+  3840747062U,	// <7,2,1,5>: Cost 5 vsldoi12 <2,5,3,7>, <2,1,5,6>
+  3841410617U,	// <7,2,1,6>: Cost 4 vsldoi12 <2,6,3,7>, <2,1,6,0>
+  3795125538U,	// <7,2,1,7>: Cost 4 vsldoi8 <6,1,7,2>, <1,7,2,0>
+  2779170380U,	// <7,2,1,u>: Cost 3 vsldoi12 RHS, <2,1,u,1>
+  2779170389U,	// <7,2,2,0>: Cost 3 vsldoi12 RHS, <2,2,0,1>
+  3852912222U,	// <7,2,2,1>: Cost 4 vsldoi12 RHS, <2,2,1,1>
+  1705428584U,	// <7,2,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705428594U,	// <7,2,2,3>: Cost 2 vsldoi12 RHS, <2,2,3,3>
+  2779170429U,	// <7,2,2,4>: Cost 3 vsldoi12 RHS, <2,2,4,5>
+  3852912259U,	// <7,2,2,5>: Cost 4 vsldoi12 RHS, <2,2,5,2>
+  2767668880U,	// <7,2,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,2,6,6>
+  3841336981U,	// <7,2,2,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,2,7,2>
+  1705428639U,	// <7,2,2,u>: Cost 2 vsldoi12 RHS, <2,2,u,3>
+  1705428646U,	// <7,2,3,0>: Cost 2 vsldoi12 RHS, <2,3,0,1>
+  2779170479U,	// <7,2,3,1>: Cost 3 vsldoi12 RHS, <2,3,1,1>
+  2767668925U,	// <7,2,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <2,3,2,6>
+  1245659238U,	// <7,2,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705428686U,	// <7,2,3,4>: Cost 2 vsldoi12 RHS, <2,3,4,5>
+  2779170519U,	// <7,2,3,5>: Cost 3 vsldoi12 RHS, <2,3,5,5>
+  2657899362U,	// <7,2,3,6>: Cost 3 vsldoi4 <6,7,2,3>, <6,7,2,3>
+  2319406574U,	// <7,2,3,7>: Cost 3 vmrglw <6,2,7,3>, <7,6,2,7>
+  1705428718U,	// <7,2,3,u>: Cost 2 vsldoi12 RHS, <2,3,u,1>
+  3713728614U,	// <7,2,4,0>: Cost 4 vsldoi4 <3,7,2,4>, LHS
+  3852912388U,	// <7,2,4,1>: Cost 4 vsldoi12 RHS, <2,4,1,5>
+  2779170573U,	// <7,2,4,2>: Cost 3 vsldoi12 RHS, <2,4,2,5>
+  1242349670U,	// <7,2,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3713731894U,	// <7,2,4,4>: Cost 4 vsldoi4 <3,7,2,4>, RHS
+  2779170601U,	// <7,2,4,5>: Cost 3 vsldoi12 RHS, <2,4,5,6>
+  2767669041U,	// <7,2,4,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,4,6,5>
+  3389834456U,	// <7,2,4,7>: Cost 4 vmrglw <5,6,7,4>, <1,6,2,7>
+  1242349675U,	// <7,2,4,u>: Cost 2 vmrglw <5,6,7,4>, LHS
+  3852912456U,	// <7,2,5,0>: Cost 4 vsldoi12 RHS, <2,5,0,1>
+  3852912466U,	// <7,2,5,1>: Cost 4 vsldoi12 RHS, <2,5,1,2>
+  3852912475U,	// <7,2,5,2>: Cost 4 vsldoi12 RHS, <2,5,2,2>
+  2779170664U,	// <7,2,5,3>: Cost 3 vsldoi12 RHS, <2,5,3,6>
+  3852912496U,	// <7,2,5,4>: Cost 4 vsldoi12 RHS, <2,5,4,5>
+  3792474116U,	// <7,2,5,5>: Cost 4 vsldoi8 <5,6,7,2>, <5,5,5,5>
+  2718732388U,	// <7,2,5,6>: Cost 3 vsldoi8 <5,6,7,2>, <5,6,7,2>
+  3841337228U,	// <7,2,5,7>: Cost 5 vsldoi12 <2,6,2,7>, <2,5,7,6>
+  2779170709U,	// <7,2,5,u>: Cost 3 vsldoi12 RHS, <2,5,u,6>
+  2640003174U,	// <7,2,6,0>: Cost 3 vsldoi4 <3,7,2,6>, LHS
+  2721386920U,	// <7,2,6,1>: Cost 3 vsldoi8 <6,1,7,2>, <6,1,7,2>
+  2767595441U,	// <7,2,6,2>: Cost 3 vsldoi12 <2,6,2,7>, <2,6,2,7>
+  1693927354U,	// <7,2,6,3>: Cost 2 vsldoi12 <2,6,3,7>, <2,6,3,7>
+  2640006454U,	// <7,2,6,4>: Cost 3 vsldoi4 <3,7,2,6>, RHS
+  3841558476U,	// <7,2,6,5>: Cost 4 vsldoi12 <2,6,5,7>, <2,6,5,7>
+  2657923941U,	// <7,2,6,6>: Cost 3 vsldoi4 <6,7,2,6>, <6,7,2,6>
+  3841337310U,	// <7,2,6,7>: Cost 4 vsldoi12 <2,6,2,7>, <2,6,7,7>
+  1694296039U,	// <7,2,6,u>: Cost 2 vsldoi12 <2,6,u,7>, <2,6,u,7>
+  2803058666U,	// <7,2,7,0>: Cost 3 vsldoi12 RHS, <2,7,0,1>
+  3852912632U,	// <7,2,7,1>: Cost 4 vsldoi12 RHS, <2,7,1,6>
+  2322089576U,	// <7,2,7,2>: Cost 3 vmrglw <6,6,7,7>, <2,2,2,2>
+  1248346214U,	// <7,2,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  3841337362U,	// <7,2,7,4>: Cost 4 vsldoi12 <2,6,2,7>, <2,7,4,5>
+  3395830836U,	// <7,2,7,5>: Cost 4 vmrglw <6,6,7,7>, <1,4,2,5>
+  2261616570U,	// <7,2,7,6>: Cost 3 vmrghw <7,7,7,7>, <2,6,3,7>
+  3371943857U,	// <7,2,7,7>: Cost 4 vmrglw <2,6,7,7>, <2,6,2,7>
+  1248346219U,	// <7,2,7,u>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705429051U,	// <7,2,u,0>: Cost 2 vsldoi12 RHS, <2,u,0,1>
+  2779170884U,	// <7,2,u,1>: Cost 3 vsldoi12 RHS, <2,u,1,1>
+  1705428584U,	// <7,2,u,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1695254620U,	// <7,2,u,3>: Cost 2 vsldoi12 <2,u,3,7>, <2,u,3,7>
+  1705429091U,	// <7,2,u,4>: Cost 2 vsldoi12 RHS, <2,u,4,5>
+  2779170924U,	// <7,2,u,5>: Cost 3 vsldoi12 RHS, <2,u,5,5>
+  2767669361U,	// <7,2,u,6>: Cost 3 vsldoi12 <2,6,3,7>, <2,u,6,1>
+  2803058809U,	// <7,2,u,7>: Cost 3 vsldoi12 RHS, <2,u,7,0>
+  1695623305U,	// <7,2,u,u>: Cost 2 vsldoi12 <2,u,u,7>, <2,u,u,7>
+  2779170955U,	// <7,3,0,0>: Cost 3 vsldoi12 RHS, <3,0,0,0>
+  1705429142U,	// <7,3,0,1>: Cost 2 vsldoi12 RHS, <3,0,1,2>
+  2634057732U,	// <7,3,0,2>: Cost 3 vsldoi4 <2,7,3,0>, <2,7,3,0>
+  2779170983U,	// <7,3,0,3>: Cost 3 vsldoi12 RHS, <3,0,3,1>
+  2779170992U,	// <7,3,0,4>: Cost 3 vsldoi12 RHS, <3,0,4,1>
+  3852912829U,	// <7,3,0,5>: Cost 4 vsldoi12 RHS, <3,0,5,5>
+  2657948520U,	// <7,3,0,6>: Cost 3 vsldoi4 <6,7,3,0>, <6,7,3,0>
+  2316060602U,	// <7,3,0,7>: Cost 3 vmrglw <5,6,7,0>, <2,6,3,7>
+  1705429205U,	// <7,3,0,u>: Cost 2 vsldoi12 RHS, <3,0,u,2>
+  3852912860U,	// <7,3,1,0>: Cost 4 vsldoi12 RHS, <3,1,0,0>
+  2779171046U,	// <7,3,1,1>: Cost 3 vsldoi12 RHS, <3,1,1,1>
+  2779171057U,	// <7,3,1,2>: Cost 3 vsldoi12 RHS, <3,1,2,3>
+  3852912887U,	// <7,3,1,3>: Cost 4 vsldoi12 RHS, <3,1,3,0>
+  3852912896U,	// <7,3,1,4>: Cost 4 vsldoi12 RHS, <3,1,4,0>
+  3852912905U,	// <7,3,1,5>: Cost 4 vsldoi12 RHS, <3,1,5,0>
+  3835291923U,	// <7,3,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <3,1,6,1>
+  3841411356U,	// <7,3,1,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,1,7,1>
+  2779171111U,	// <7,3,1,u>: Cost 3 vsldoi12 RHS, <3,1,u,3>
+  2779171120U,	// <7,3,2,0>: Cost 3 vsldoi12 RHS, <3,2,0,3>
+  3852912952U,	// <7,3,2,1>: Cost 4 vsldoi12 RHS, <3,2,1,2>
+  2779171137U,	// <7,3,2,2>: Cost 3 vsldoi12 RHS, <3,2,2,2>
+  2779171144U,	// <7,3,2,3>: Cost 3 vsldoi12 RHS, <3,2,3,0>
+  2779171156U,	// <7,3,2,4>: Cost 3 vsldoi12 RHS, <3,2,4,3>
+  3852912989U,	// <7,3,2,5>: Cost 4 vsldoi12 RHS, <3,2,5,3>
+  2767669606U,	// <7,3,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,6,3>
+  2767669615U,	// <7,3,2,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,2,7,3>
+  2779171189U,	// <7,3,2,u>: Cost 3 vsldoi12 RHS, <3,2,u,0>
+  2779171198U,	// <7,3,3,0>: Cost 3 vsldoi12 RHS, <3,3,0,0>
+  3852913032U,	// <7,3,3,1>: Cost 4 vsldoi12 RHS, <3,3,1,1>
+  2704140655U,	// <7,3,3,2>: Cost 3 vsldoi8 <3,2,7,3>, <3,2,7,3>
+  1705429404U,	// <7,3,3,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171238U,	// <7,3,3,4>: Cost 3 vsldoi12 RHS, <3,3,4,4>
+  3852913070U,	// <7,3,3,5>: Cost 4 vsldoi12 RHS, <3,3,5,3>
+  2657973099U,	// <7,3,3,6>: Cost 3 vsldoi4 <6,7,3,3>, <6,7,3,3>
+  2767669700U,	// <7,3,3,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,3,7,7>
+  1705429404U,	// <7,3,3,u>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2779171280U,	// <7,3,4,0>: Cost 3 vsldoi12 RHS, <3,4,0,1>
+  2779171290U,	// <7,3,4,1>: Cost 3 vsldoi12 RHS, <3,4,1,2>
+  2634090504U,	// <7,3,4,2>: Cost 3 vsldoi4 <2,7,3,4>, <2,7,3,4>
+  2779171311U,	// <7,3,4,3>: Cost 3 vsldoi12 RHS, <3,4,3,5>
+  2779171319U,	// <7,3,4,4>: Cost 3 vsldoi12 RHS, <3,4,4,4>
+  1705429506U,	// <7,3,4,5>: Cost 2 vsldoi12 RHS, <3,4,5,6>
+  2722057593U,	// <7,3,4,6>: Cost 3 vsldoi8 <6,2,7,3>, <4,6,5,2>
+  2316093370U,	// <7,3,4,7>: Cost 3 vmrglw <5,6,7,4>, <2,6,3,7>
+  1705429533U,	// <7,3,4,u>: Cost 2 vsldoi12 RHS, <3,4,u,6>
+  3852913185U,	// <7,3,5,0>: Cost 4 vsldoi12 RHS, <3,5,0,1>
+  3795799695U,	// <7,3,5,1>: Cost 4 vsldoi8 <6,2,7,3>, <5,1,0,1>
+  3852913203U,	// <7,3,5,2>: Cost 4 vsldoi12 RHS, <3,5,2,1>
+  3852913214U,	// <7,3,5,3>: Cost 4 vsldoi12 RHS, <3,5,3,3>
+  3852913225U,	// <7,3,5,4>: Cost 4 vsldoi12 RHS, <3,5,4,5>
+  2779171410U,	// <7,3,5,5>: Cost 3 vsldoi12 RHS, <3,5,5,5>
+  2718740581U,	// <7,3,5,6>: Cost 3 vsldoi8 <5,6,7,3>, <5,6,7,3>
+  3841411685U,	// <7,3,5,7>: Cost 4 vsldoi12 <2,6,3,7>, <3,5,7,6>
+  2720067847U,	// <7,3,5,u>: Cost 3 vsldoi8 <5,u,7,3>, <5,u,7,3>
+  2773420664U,	// <7,3,6,0>: Cost 3 vsldoi12 <3,6,0,7>, <3,6,0,7>
+  3847236225U,	// <7,3,6,1>: Cost 4 vsldoi12 <3,6,1,7>, <3,6,1,7>
+  1648316922U,	// <7,3,6,2>: Cost 2 vsldoi8 <6,2,7,3>, <6,2,7,3>
+  2773641875U,	// <7,3,6,3>: Cost 3 vsldoi12 <3,6,3,7>, <3,6,3,7>
+  2773715612U,	// <7,3,6,4>: Cost 3 vsldoi12 <3,6,4,7>, <3,6,4,7>
+  3847531173U,	// <7,3,6,5>: Cost 4 vsldoi12 <3,6,5,7>, <3,6,5,7>
+  2722059024U,	// <7,3,6,6>: Cost 3 vsldoi8 <6,2,7,3>, <6,6,2,2>
+  2767669943U,	// <7,3,6,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,6,7,7>
+  1652298720U,	// <7,3,6,u>: Cost 2 vsldoi8 <6,u,7,3>, <6,u,7,3>
+  2767669955U,	// <7,3,7,0>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,0,1>
+  3841411788U,	// <7,3,7,1>: Cost 4 vsldoi12 <2,6,3,7>, <3,7,1,1>
+  2767669978U,	// <7,3,7,2>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,2,6>
+  2722059546U,	// <7,3,7,3>: Cost 3 vsldoi8 <6,2,7,3>, <7,3,6,2>
+  2767669995U,	// <7,3,7,4>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,4,5>
+  3852913396U,	// <7,3,7,5>: Cost 4 vsldoi12 RHS, <3,7,5,5>
+  2722059758U,	// <7,3,7,6>: Cost 3 vsldoi8 <6,2,7,3>, <7,6,2,7>
+  2302183354U,	// <7,3,7,7>: Cost 3 vmrglw <3,3,7,7>, <2,6,3,7>
+  2767670027U,	// <7,3,7,u>: Cost 3 vsldoi12 <2,6,3,7>, <3,7,u,1>
+  2774747930U,	// <7,3,u,0>: Cost 3 vsldoi12 <3,u,0,7>, <3,u,0,7>
+  1705429790U,	// <7,3,u,1>: Cost 2 vsldoi12 RHS, <3,u,1,2>
+  1660262316U,	// <7,3,u,2>: Cost 2 vsldoi8 <u,2,7,3>, <u,2,7,3>
+  1705429404U,	// <7,3,u,3>: Cost 2 vsldoi12 RHS, <3,3,3,3>
+  2775042878U,	// <7,3,u,4>: Cost 3 vsldoi12 <3,u,4,7>, <3,u,4,7>
+  1705429830U,	// <7,3,u,5>: Cost 2 vsldoi12 RHS, <3,u,5,6>
+  2779171660U,	// <7,3,u,6>: Cost 3 vsldoi12 RHS, <3,u,6,3>
+  2767670101U,	// <7,3,u,7>: Cost 3 vsldoi12 <2,6,3,7>, <3,u,7,3>
+  1705429853U,	// <7,3,u,u>: Cost 2 vsldoi12 RHS, <3,u,u,2>
+  2718744576U,	// <7,4,0,0>: Cost 3 vsldoi8 <5,6,7,4>, <0,0,0,0>
+  1645002854U,	// <7,4,0,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  3852913527U,	// <7,4,0,2>: Cost 4 vsldoi12 RHS, <4,0,2,1>
+  3852913536U,	// <7,4,0,3>: Cost 4 vsldoi12 RHS, <4,0,3,1>
+  2316061904U,	// <7,4,0,4>: Cost 3 vmrglw <5,6,7,0>, <4,4,4,4>
+  1705429906U,	// <7,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658022257U,	// <7,4,0,6>: Cost 3 vsldoi4 <6,7,4,0>, <6,7,4,0>
+  2256489928U,	// <7,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1707420589U,	// <7,4,0,u>: Cost 2 vsldoi12 RHS, <4,0,u,1>
+  3852913590U,	// <7,4,1,0>: Cost 4 vsldoi12 RHS, <4,1,0,1>
+  2718745396U,	// <7,4,1,1>: Cost 3 vsldoi8 <5,6,7,4>, <1,1,1,1>
+  2779171786U,	// <7,4,1,2>: Cost 3 vsldoi12 RHS, <4,1,2,3>
+  3852913616U,	// <7,4,1,3>: Cost 4 vsldoi12 RHS, <4,1,3,0>
+  3852913627U,	// <7,4,1,4>: Cost 4 vsldoi12 RHS, <4,1,4,2>
+  2779171810U,	// <7,4,1,5>: Cost 3 vsldoi12 RHS, <4,1,5,0>
+  3792487631U,	// <7,4,1,6>: Cost 4 vsldoi8 <5,6,7,4>, <1,6,1,7>
+  3394456220U,	// <7,4,1,7>: Cost 4 vmrglw <6,4,7,1>, <3,6,4,7>
+  2779171837U,	// <7,4,1,u>: Cost 3 vsldoi12 RHS, <4,1,u,0>
+  3852913673U,	// <7,4,2,0>: Cost 4 vsldoi12 RHS, <4,2,0,3>
+  3852913682U,	// <7,4,2,1>: Cost 4 vsldoi12 RHS, <4,2,1,3>
+  2718746216U,	// <7,4,2,2>: Cost 3 vsldoi8 <5,6,7,4>, <2,2,2,2>
+  2718746278U,	// <7,4,2,3>: Cost 3 vsldoi8 <5,6,7,4>, <2,3,0,1>
+  2779171885U,	// <7,4,2,4>: Cost 3 vsldoi12 RHS, <4,2,4,3>
+  2779171893U,	// <7,4,2,5>: Cost 3 vsldoi12 RHS, <4,2,5,2>
+  2718746554U,	// <7,4,2,6>: Cost 3 vsldoi8 <5,6,7,4>, <2,6,3,7>
+  3847457864U,	// <7,4,2,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,2,7,3>
+  2779171921U,	// <7,4,2,u>: Cost 3 vsldoi12 RHS, <4,2,u,3>
+  2718746774U,	// <7,4,3,0>: Cost 3 vsldoi8 <5,6,7,4>, <3,0,1,2>
+  3852913762U,	// <7,4,3,1>: Cost 4 vsldoi12 RHS, <4,3,1,2>
+  3852913772U,	// <7,4,3,2>: Cost 4 vsldoi12 RHS, <4,3,2,3>
+  2718747036U,	// <7,4,3,3>: Cost 3 vsldoi8 <5,6,7,4>, <3,3,3,3>
+  2718747138U,	// <7,4,3,4>: Cost 3 vsldoi8 <5,6,7,4>, <3,4,5,6>
+  2779171972U,	// <7,4,3,5>: Cost 3 vsldoi12 RHS, <4,3,5,0>
+  2706803380U,	// <7,4,3,6>: Cost 3 vsldoi8 <3,6,7,4>, <3,6,7,4>
+  3847457946U,	// <7,4,3,7>: Cost 4 vsldoi12 <3,6,4,7>, <4,3,7,4>
+  2781162655U,	// <7,4,3,u>: Cost 3 vsldoi12 RHS, <4,3,u,0>
+  2718747538U,	// <7,4,4,0>: Cost 3 vsldoi8 <5,6,7,4>, <4,0,5,1>
+  3852913842U,	// <7,4,4,1>: Cost 4 vsldoi12 RHS, <4,4,1,1>
+  3852913852U,	// <7,4,4,2>: Cost 4 vsldoi12 RHS, <4,4,2,2>
+  2316096696U,	// <7,4,4,3>: Cost 3 vmrglw <5,6,7,4>, <7,2,4,3>
+  1705430224U,	// <7,4,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705430234U,	// <7,4,4,5>: Cost 2 vsldoi12 RHS, <4,4,5,5>
+  2658055029U,	// <7,4,4,6>: Cost 3 vsldoi4 <6,7,4,4>, <6,7,4,4>
+  2316097024U,	// <7,4,4,7>: Cost 3 vmrglw <5,6,7,4>, <7,6,4,7>
+  1707420917U,	// <7,4,4,u>: Cost 2 vsldoi12 RHS, <4,4,u,5>
+  1584316518U,	// <7,4,5,0>: Cost 2 vsldoi4 <6,7,4,5>, LHS
+  2658059060U,	// <7,4,5,1>: Cost 3 vsldoi4 <6,7,4,5>, <1,1,1,1>
+  2640144314U,	// <7,4,5,2>: Cost 3 vsldoi4 <3,7,4,5>, <2,6,3,7>
+  2640145131U,	// <7,4,5,3>: Cost 3 vsldoi4 <3,7,4,5>, <3,7,4,5>
+  1584319798U,	// <7,4,5,4>: Cost 2 vsldoi4 <6,7,4,5>, RHS
+  2779172134U,	// <7,4,5,5>: Cost 3 vsldoi12 RHS, <4,5,5,0>
+  631688502U,	// <7,4,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2658063354U,	// <7,4,5,7>: Cost 3 vsldoi4 <6,7,4,5>, <7,0,1,2>
+  631688520U,	// <7,4,5,u>: Cost 1 vsldoi12 RHS, RHS
+  3852914001U,	// <7,4,6,0>: Cost 4 vsldoi12 RHS, <4,6,0,7>
+  3852914010U,	// <7,4,6,1>: Cost 4 vsldoi12 RHS, <4,6,1,7>
+  2718749178U,	// <7,4,6,2>: Cost 3 vsldoi8 <5,6,7,4>, <6,2,7,3>
+  2722730572U,	// <7,4,6,3>: Cost 3 vsldoi8 <6,3,7,4>, <6,3,7,4>
+  2723394205U,	// <7,4,6,4>: Cost 3 vsldoi8 <6,4,7,4>, <6,4,7,4>
+  2779172221U,	// <7,4,6,5>: Cost 3 vsldoi12 RHS, <4,6,5,6>
+  2718749496U,	// <7,4,6,6>: Cost 3 vsldoi8 <5,6,7,4>, <6,6,6,6>
+  2718749518U,	// <7,4,6,7>: Cost 3 vsldoi8 <5,6,7,4>, <6,7,0,1>
+  2779172249U,	// <7,4,6,u>: Cost 3 vsldoi12 RHS, <4,6,u,7>
+  2718749690U,	// <7,4,7,0>: Cost 3 vsldoi8 <5,6,7,4>, <7,0,1,2>
+  3847458214U,	// <7,4,7,1>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,1,2>
+  2718749880U,	// <7,4,7,2>: Cost 3 vsldoi8 <5,6,7,4>, <7,2,4,3>
+  3847458236U,	// <7,4,7,3>: Cost 4 vsldoi12 <3,6,4,7>, <4,7,3,6>
+  2718750004U,	// <7,4,7,4>: Cost 3 vsldoi8 <5,6,7,4>, <7,4,0,1>
+  1187876150U,	// <7,4,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2718750208U,	// <7,4,7,6>: Cost 3 vsldoi8 <5,6,7,4>, <7,6,4,7>
+  2718750286U,	// <7,4,7,7>: Cost 3 vsldoi8 <5,6,7,4>, <7,7,4,4>
+  1187876393U,	// <7,4,7,u>: Cost 2 vmrghw <7,7,7,7>, RHS
+  1584341094U,	// <7,4,u,0>: Cost 2 vsldoi4 <6,7,4,u>, LHS
+  1645008686U,	// <7,4,u,1>: Cost 2 vsldoi8 <5,6,7,4>, LHS
+  2640168890U,	// <7,4,u,2>: Cost 3 vsldoi4 <3,7,4,u>, <2,6,3,7>
+  2640169710U,	// <7,4,u,3>: Cost 3 vsldoi4 <3,7,4,u>, <3,7,4,u>
+  1584344374U,	// <7,4,u,4>: Cost 2 vsldoi4 <6,7,4,u>, RHS
+  1705430554U,	// <7,4,u,5>: Cost 2 vsldoi12 RHS, <4,u,5,1>
+  631688745U,	// <7,4,u,6>: Cost 1 vsldoi12 RHS, RHS
+  2718750976U,	// <7,4,u,7>: Cost 3 vsldoi8 <5,6,7,4>, <u,7,0,1>
+  631688763U,	// <7,4,u,u>: Cost 1 vsldoi12 RHS, RHS
+  2646147174U,	// <7,5,0,0>: Cost 3 vsldoi4 <4,7,5,0>, LHS
+  2779172424U,	// <7,5,0,1>: Cost 3 vsldoi12 RHS, <5,0,1,2>
+  3852914258U,	// <7,5,0,2>: Cost 4 vsldoi12 RHS, <5,0,2,3>
+  3852914268U,	// <7,5,0,3>: Cost 4 vsldoi12 RHS, <5,0,3,4>
+  2779172450U,	// <7,5,0,4>: Cost 3 vsldoi12 RHS, <5,0,4,1>
+  2316061914U,	// <7,5,0,5>: Cost 3 vmrglw <5,6,7,0>, <4,4,5,5>
+  2316061186U,	// <7,5,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,5,6>
+  2646152186U,	// <7,5,0,7>: Cost 3 vsldoi4 <4,7,5,0>, <7,0,1,2>
+  2779172486U,	// <7,5,0,u>: Cost 3 vsldoi12 RHS, <5,0,u,1>
+  2781163151U,	// <7,5,1,0>: Cost 3 vsldoi12 RHS, <5,1,0,1>
+  2321378194U,	// <7,5,1,1>: Cost 3 vmrglw <6,5,7,1>, <4,0,5,1>
+  3852914339U,	// <7,5,1,2>: Cost 4 vsldoi12 RHS, <5,1,2,3>
+  3852914350U,	// <7,5,1,3>: Cost 4 vsldoi12 RHS, <5,1,3,5>
+  2781163191U,	// <7,5,1,4>: Cost 3 vsldoi12 RHS, <5,1,4,5>
+  3852914363U,	// <7,5,1,5>: Cost 4 vsldoi12 RHS, <5,1,5,0>
+  3835588297U,	// <7,5,1,6>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,6,5>
+  3835588306U,	// <7,5,1,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,1,7,5>
+  2781163223U,	// <7,5,1,u>: Cost 3 vsldoi12 RHS, <5,1,u,1>
+  3852914400U,	// <7,5,2,0>: Cost 4 vsldoi12 RHS, <5,2,0,1>
+  2781163243U,	// <7,5,2,1>: Cost 3 vsldoi12 RHS, <5,2,1,3>
+  3852914419U,	// <7,5,2,2>: Cost 4 vsldoi12 RHS, <5,2,2,2>
+  2779172606U,	// <7,5,2,3>: Cost 3 vsldoi12 RHS, <5,2,3,4>
+  3780552497U,	// <7,5,2,4>: Cost 4 vsldoi8 <3,6,7,5>, <2,4,6,5>
+  2781163279U,	// <7,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2779172632U,	// <7,5,2,6>: Cost 3 vsldoi12 RHS, <5,2,6,3>
+  3835588385U,	// <7,5,2,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,2,7,3>
+  2779172650U,	// <7,5,2,u>: Cost 3 vsldoi12 RHS, <5,2,u,3>
+  3852914481U,	// <7,5,3,0>: Cost 4 vsldoi12 RHS, <5,3,0,1>
+  2319403922U,	// <7,5,3,1>: Cost 3 vmrglw <6,2,7,3>, <4,0,5,1>
+  2319404409U,	// <7,5,3,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  3852914510U,	// <7,5,3,3>: Cost 4 vsldoi12 RHS, <5,3,3,3>
+  3779226131U,	// <7,5,3,4>: Cost 4 vsldoi8 <3,4,7,5>, <3,4,7,5>
+  2319404250U,	// <7,5,3,5>: Cost 3 vmrglw <6,2,7,3>, <4,4,5,5>
+  2319403522U,	// <7,5,3,6>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,6>
+  3852914547U,	// <7,5,3,7>: Cost 4 vsldoi12 RHS, <5,3,7,4>
+  2319403524U,	// <7,5,3,u>: Cost 3 vmrglw <6,2,7,3>, <3,4,5,u>
+  2646179942U,	// <7,5,4,0>: Cost 3 vsldoi4 <4,7,5,4>, LHS
+  2316094354U,	// <7,5,4,1>: Cost 3 vmrglw <5,6,7,4>, <4,0,5,1>
+  3852914582U,	// <7,5,4,2>: Cost 4 vsldoi12 RHS, <5,4,2,3>
+  3852914592U,	// <7,5,4,3>: Cost 4 vsldoi12 RHS, <5,4,3,4>
+  2646183372U,	// <7,5,4,4>: Cost 3 vsldoi4 <4,7,5,4>, <4,7,5,4>
+  2779172788U,	// <7,5,4,5>: Cost 3 vsldoi12 RHS, <5,4,5,6>
+  2316093954U,	// <7,5,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,5,6>
+  2646185318U,	// <7,5,4,7>: Cost 3 vsldoi4 <4,7,5,4>, <7,4,5,6>
+  2779172815U,	// <7,5,4,u>: Cost 3 vsldoi12 RHS, <5,4,u,6>
+  2781163475U,	// <7,5,5,0>: Cost 3 vsldoi12 RHS, <5,5,0,1>
+  2781163484U,	// <7,5,5,1>: Cost 3 vsldoi12 RHS, <5,5,1,1>
+  3852914662U,	// <7,5,5,2>: Cost 4 vsldoi12 RHS, <5,5,2,2>
+  3852914672U,	// <7,5,5,3>: Cost 4 vsldoi12 RHS, <5,5,3,3>
+  2781163515U,	// <7,5,5,4>: Cost 3 vsldoi12 RHS, <5,5,4,5>
+  1705431044U,	// <7,5,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172878U,	// <7,5,5,6>: Cost 3 vsldoi12 RHS, <5,5,6,6>
+  3835588632U,	// <7,5,5,7>: Cost 4 vsldoi12 <1,6,5,7>, <5,5,7,7>
+  1705431044U,	// <7,5,5,u>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779172900U,	// <7,5,6,0>: Cost 3 vsldoi12 RHS, <5,6,0,1>
+  2781163571U,	// <7,5,6,1>: Cost 3 vsldoi12 RHS, <5,6,1,7>
+  3852914743U,	// <7,5,6,2>: Cost 4 vsldoi12 RHS, <5,6,2,2>
+  2779172930U,	// <7,5,6,3>: Cost 3 vsldoi12 RHS, <5,6,3,4>
+  2779172940U,	// <7,5,6,4>: Cost 3 vsldoi12 RHS, <5,6,4,5>
+  2781163607U,	// <7,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  2779172960U,	// <7,5,6,6>: Cost 3 vsldoi12 RHS, <5,6,6,7>
+  1705431138U,	// <7,5,6,7>: Cost 2 vsldoi12 RHS, <5,6,7,0>
+  1705578603U,	// <7,5,6,u>: Cost 2 vsldoi12 RHS, <5,6,u,0>
+  2646204518U,	// <7,5,7,0>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2322090898U,	// <7,5,7,1>: Cost 3 vmrglw <6,6,7,7>, <4,0,5,1>
+  3719947880U,	// <7,5,7,2>: Cost 4 vsldoi4 <4,7,5,7>, <2,2,2,2>
+  3719948438U,	// <7,5,7,3>: Cost 4 vsldoi4 <4,7,5,7>, <3,0,1,2>
+  2646207951U,	// <7,5,7,4>: Cost 3 vsldoi4 <4,7,5,7>, <4,7,5,7>
+  2322091226U,	// <7,5,7,5>: Cost 3 vmrglw <6,6,7,7>, <4,4,5,5>
+  2322090498U,	// <7,5,7,6>: Cost 3 vmrglw <6,6,7,7>, <3,4,5,6>
+  2646210156U,	// <7,5,7,7>: Cost 3 vsldoi4 <4,7,5,7>, <7,7,7,7>
+  2646210350U,	// <7,5,7,u>: Cost 3 vsldoi4 <4,7,5,7>, LHS
+  2779173062U,	// <7,5,u,0>: Cost 3 vsldoi12 RHS, <5,u,0,1>
+  2779173072U,	// <7,5,u,1>: Cost 3 vsldoi12 RHS, <5,u,1,2>
+  2319404409U,	// <7,5,u,2>: Cost 3 vmrglw <6,2,7,3>, <4,6,5,2>
+  2779173092U,	// <7,5,u,3>: Cost 3 vsldoi12 RHS, <5,u,3,4>
+  2779173101U,	// <7,5,u,4>: Cost 3 vsldoi12 RHS, <5,u,4,4>
+  1705431044U,	// <7,5,u,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  2779173118U,	// <7,5,u,6>: Cost 3 vsldoi12 RHS, <5,u,6,3>
+  1705578756U,	// <7,5,u,7>: Cost 2 vsldoi12 RHS, <5,u,7,0>
+  1707421965U,	// <7,5,u,u>: Cost 2 vsldoi12 RHS, <5,u,u,0>
+  3852914966U,	// <7,6,0,0>: Cost 4 vsldoi12 RHS, <6,0,0,0>
+  2779173153U,	// <7,6,0,1>: Cost 3 vsldoi12 RHS, <6,0,1,2>
+  2256491002U,	// <7,6,0,2>: Cost 3 vmrghw <7,0,1,2>, <6,2,7,3>
+  3852914994U,	// <7,6,0,3>: Cost 4 vsldoi12 RHS, <6,0,3,1>
+  3852915003U,	// <7,6,0,4>: Cost 4 vsldoi12 RHS, <6,0,4,1>
+  2316062652U,	// <7,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316063544U,	// <7,6,0,6>: Cost 3 vmrglw <5,6,7,0>, <6,6,6,6>
+  1242320182U,	// <7,6,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1242320183U,	// <7,6,0,u>: Cost 2 vmrglw <5,6,7,0>, RHS
+  3852915048U,	// <7,6,1,0>: Cost 4 vsldoi12 RHS, <6,1,0,1>
+  3377866217U,	// <7,6,1,1>: Cost 4 vmrglw <3,6,7,1>, <2,0,6,1>
+  3852915068U,	// <7,6,1,2>: Cost 4 vsldoi12 RHS, <6,1,2,3>
+  3833672072U,	// <7,6,1,3>: Cost 5 vsldoi12 <1,3,6,7>, <6,1,3,6>
+  3852915088U,	// <7,6,1,4>: Cost 4 vsldoi12 RHS, <6,1,4,5>
+  3395122056U,	// <7,6,1,5>: Cost 4 vmrglw <6,5,7,1>, <6,7,6,5>
+  3389813560U,	// <7,6,1,6>: Cost 4 vmrglw <5,6,7,1>, <6,6,6,6>
+  2779173287U,	// <7,6,1,7>: Cost 3 vsldoi12 RHS, <6,1,7,1>
+  2779320752U,	// <7,6,1,u>: Cost 3 vsldoi12 RHS, <6,1,u,1>
+  2658181222U,	// <7,6,2,0>: Cost 3 vsldoi4 <6,7,6,2>, LHS
+  3852915140U,	// <7,6,2,1>: Cost 4 vsldoi12 RHS, <6,2,1,3>
+  2257973754U,	// <7,6,2,2>: Cost 3 vmrghw <7,2,3,3>, <6,2,7,3>
+  3841413589U,	// <7,6,2,3>: Cost 4 vsldoi12 <2,6,3,7>, <6,2,3,2>
+  2658184502U,	// <7,6,2,4>: Cost 3 vsldoi4 <6,7,6,2>, RHS
+  3852915176U,	// <7,6,2,5>: Cost 4 vsldoi12 RHS, <6,2,5,3>
+  2658186117U,	// <7,6,2,6>: Cost 3 vsldoi4 <6,7,6,2>, <6,7,6,2>
+  1705431546U,	// <7,6,2,7>: Cost 2 vsldoi12 RHS, <6,2,7,3>
+  1705579011U,	// <7,6,2,u>: Cost 2 vsldoi12 RHS, <6,2,u,3>
+  3714015334U,	// <7,6,3,0>: Cost 4 vsldoi4 <3,7,6,3>, LHS
+  3777243425U,	// <7,6,3,1>: Cost 4 vsldoi8 <3,1,7,6>, <3,1,7,6>
+  2319405957U,	// <7,6,3,2>: Cost 3 vmrglw <6,2,7,3>, <6,7,6,2>
+  3375229286U,	// <7,6,3,3>: Cost 4 vmrglw <3,2,7,3>, <3,2,6,3>
+  2779173426U,	// <7,6,3,4>: Cost 3 vsldoi12 RHS, <6,3,4,5>
+  3375228721U,	// <7,6,3,5>: Cost 4 vmrglw <3,2,7,3>, <2,4,6,5>
+  2319405880U,	// <7,6,3,6>: Cost 3 vmrglw <6,2,7,3>, <6,6,6,6>
+  1245662518U,	// <7,6,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1245662519U,	// <7,6,3,u>: Cost 2 vmrglw <6,2,7,3>, RHS
+  3852915291U,	// <7,6,4,0>: Cost 4 vsldoi12 RHS, <6,4,0,1>
+  3389834729U,	// <7,6,4,1>: Cost 4 vmrglw <5,6,7,4>, <2,0,6,1>
+  2259472890U,	// <7,6,4,2>: Cost 3 vmrghw <7,4,5,6>, <6,2,7,3>
+  3852915321U,	// <7,6,4,3>: Cost 4 vsldoi12 RHS, <6,4,3,4>
+  3852915330U,	// <7,6,4,4>: Cost 4 vsldoi12 RHS, <6,4,4,4>
+  2779173517U,	// <7,6,4,5>: Cost 3 vsldoi12 RHS, <6,4,5,6>
+  2316096312U,	// <7,6,4,6>: Cost 3 vmrglw <5,6,7,4>, <6,6,6,6>
+  1242352950U,	// <7,6,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1242352951U,	// <7,6,4,u>: Cost 2 vmrglw <5,6,7,4>, RHS
+  3852915372U,	// <7,6,5,0>: Cost 4 vsldoi12 RHS, <6,5,0,1>
+  3835294392U,	// <7,6,5,1>: Cost 5 vsldoi12 <1,6,1,7>, <6,5,1,4>
+  3852915395U,	// <7,6,5,2>: Cost 4 vsldoi12 RHS, <6,5,2,6>
+  3852915404U,	// <7,6,5,3>: Cost 4 vsldoi12 RHS, <6,5,3,6>
+  3852915412U,	// <7,6,5,4>: Cost 4 vsldoi12 RHS, <6,5,4,5>
+  3377899313U,	// <7,6,5,5>: Cost 4 vmrglw <3,6,7,5>, <2,4,6,5>
+  2718765160U,	// <7,6,5,6>: Cost 3 vsldoi8 <5,6,7,6>, <5,6,7,6>
+  2779173611U,	// <7,6,5,7>: Cost 3 vsldoi12 RHS, <6,5,7,1>
+  2779321076U,	// <7,6,5,u>: Cost 3 vsldoi12 RHS, <6,5,u,1>
+  2658213990U,	// <7,6,6,0>: Cost 3 vsldoi4 <6,7,6,6>, LHS
+  3852915462U,	// <7,6,6,1>: Cost 4 vsldoi12 RHS, <6,6,1,1>
+  2718765562U,	// <7,6,6,2>: Cost 3 vsldoi8 <5,6,7,6>, <6,2,7,3>
+  3714042622U,	// <7,6,6,3>: Cost 4 vsldoi4 <3,7,6,6>, <3,7,6,6>
+  2658217270U,	// <7,6,6,4>: Cost 3 vsldoi4 <6,7,6,6>, RHS
+  2724074224U,	// <7,6,6,5>: Cost 3 vsldoi8 <6,5,7,6>, <6,5,7,6>
+  1705431864U,	// <7,6,6,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705431874U,	// <7,6,6,7>: Cost 2 vsldoi12 RHS, <6,6,7,7>
+  1705579339U,	// <7,6,6,u>: Cost 2 vsldoi12 RHS, <6,6,u,7>
+  1705431886U,	// <7,6,7,0>: Cost 2 vsldoi12 RHS, <6,7,0,1>
+  2779173719U,	// <7,6,7,1>: Cost 3 vsldoi12 RHS, <6,7,1,1>
+  2779173729U,	// <7,6,7,2>: Cost 3 vsldoi12 RHS, <6,7,2,2>
+  2779173736U,	// <7,6,7,3>: Cost 3 vsldoi12 RHS, <6,7,3,0>
+  1705431926U,	// <7,6,7,4>: Cost 2 vsldoi12 RHS, <6,7,4,5>
+  2779173759U,	// <7,6,7,5>: Cost 3 vsldoi12 RHS, <6,7,5,5>
+  2779173765U,	// <7,6,7,6>: Cost 3 vsldoi12 RHS, <6,7,6,2>
+  1248349494U,	// <7,6,7,7>: Cost 2 vmrglw <6,6,7,7>, RHS
+  1705431958U,	// <7,6,7,u>: Cost 2 vsldoi12 RHS, <6,7,u,1>
+  1705579423U,	// <7,6,u,0>: Cost 2 vsldoi12 RHS, <6,u,0,1>
+  2779173801U,	// <7,6,u,1>: Cost 3 vsldoi12 RHS, <6,u,1,2>
+  2779321266U,	// <7,6,u,2>: Cost 3 vsldoi12 RHS, <6,u,2,2>
+  2779321273U,	// <7,6,u,3>: Cost 3 vsldoi12 RHS, <6,u,3,0>
+  1705579463U,	// <7,6,u,4>: Cost 2 vsldoi12 RHS, <6,u,4,5>
+  2779173841U,	// <7,6,u,5>: Cost 3 vsldoi12 RHS, <6,u,5,6>
+  1705431864U,	// <7,6,u,6>: Cost 2 vsldoi12 RHS, <6,6,6,6>
+  1705432032U,	// <7,6,u,7>: Cost 2 vsldoi12 RHS, <6,u,7,3>
+  1705579495U,	// <7,6,u,u>: Cost 2 vsldoi12 RHS, <6,u,u,1>
+  1242320994U,	// <7,7,0,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705432058U,	// <7,7,0,1>: Cost 2 vsldoi12 RHS, <7,0,1,2>
+  3841414146U,	// <7,7,0,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,0,2,1>
+  2316063226U,	// <7,7,0,3>: Cost 3 vmrglw <5,6,7,0>, <6,2,7,3>
+  2779173908U,	// <7,7,0,4>: Cost 3 vsldoi12 RHS, <7,0,4,1>
+  2658242658U,	// <7,7,0,5>: Cost 3 vsldoi4 <6,7,7,0>, <5,6,7,0>
+  2658243468U,	// <7,7,0,6>: Cost 3 vsldoi4 <6,7,7,0>, <6,7,7,0>
+  2316063554U,	// <7,7,0,7>: Cost 3 vmrglw <5,6,7,0>, <6,6,7,7>
+  1705432121U,	// <7,7,0,u>: Cost 2 vsldoi12 RHS, <7,0,u,2>
+  3852915777U,	// <7,7,1,0>: Cost 4 vsldoi12 RHS, <7,1,0,1>
+  2779173962U,	// <7,7,1,1>: Cost 3 vsldoi12 RHS, <7,1,1,1>
+  2779173973U,	// <7,7,1,2>: Cost 3 vsldoi12 RHS, <7,1,2,3>
+  3389813242U,	// <7,7,1,3>: Cost 4 vmrglw <5,6,7,1>, <6,2,7,3>
+  3852915813U,	// <7,7,1,4>: Cost 4 vsldoi12 RHS, <7,1,4,1>
+  3852915821U,	// <7,7,1,5>: Cost 4 vsldoi12 RHS, <7,1,5,0>
+  3835294839U,	// <7,7,1,6>: Cost 4 vsldoi12 <1,6,1,7>, <7,1,6,1>
+  2329343596U,	// <7,7,1,7>: Cost 3 vmrglw <7,u,7,1>, <7,7,7,7>
+  2779174027U,	// <7,7,1,u>: Cost 3 vsldoi12 RHS, <7,1,u,3>
+  2803061908U,	// <7,7,2,0>: Cost 3 vsldoi12 RHS, <7,2,0,3>
+  3852915869U,	// <7,7,2,1>: Cost 4 vsldoi12 RHS, <7,2,1,3>
+  2779174053U,	// <7,7,2,2>: Cost 3 vsldoi12 RHS, <7,2,2,2>
+  2779174060U,	// <7,7,2,3>: Cost 3 vsldoi12 RHS, <7,2,3,0>
+  2803061944U,	// <7,7,2,4>: Cost 3 vsldoi12 RHS, <7,2,4,3>
+  3852915905U,	// <7,7,2,5>: Cost 4 vsldoi12 RHS, <7,2,5,3>
+  2767672522U,	// <7,7,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <7,2,6,3>
+  2791855315U,	// <7,7,2,7>: Cost 3 vsldoi12 <6,6,7,7>, <7,2,7,3>
+  2768999644U,	// <7,7,2,u>: Cost 3 vsldoi12 <2,u,3,7>, <7,2,u,3>
+  2779174115U,	// <7,7,3,0>: Cost 3 vsldoi12 RHS, <7,3,0,1>
+  3852915948U,	// <7,7,3,1>: Cost 4 vsldoi12 RHS, <7,3,1,1>
+  3841414394U,	// <7,7,3,2>: Cost 4 vsldoi12 <2,6,3,7>, <7,3,2,6>
+  1245663738U,	// <7,7,3,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174155U,	// <7,7,3,4>: Cost 3 vsldoi12 RHS, <7,3,4,5>
+  3852915988U,	// <7,7,3,5>: Cost 4 vsldoi12 RHS, <7,3,5,5>
+  2706827959U,	// <7,7,3,6>: Cost 3 vsldoi8 <3,6,7,7>, <3,6,7,7>
+  2319405890U,	// <7,7,3,7>: Cost 3 vmrglw <6,2,7,3>, <6,6,7,7>
+  1245663738U,	// <7,7,3,u>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  2779174200U,	// <7,7,4,0>: Cost 3 vsldoi12 RHS, <7,4,0,5>
+  3852916030U,	// <7,7,4,1>: Cost 4 vsldoi12 RHS, <7,4,1,2>
+  3714099130U,	// <7,7,4,2>: Cost 4 vsldoi4 <3,7,7,4>, <2,6,3,7>
+  2316095994U,	// <7,7,4,3>: Cost 3 vmrglw <5,6,7,4>, <6,2,7,3>
+  1242353766U,	// <7,7,4,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705432422U,	// <7,7,4,5>: Cost 2 vsldoi12 RHS, <7,4,5,6>
+  2658276240U,	// <7,7,4,6>: Cost 3 vsldoi4 <6,7,7,4>, <6,7,7,4>
+  2316096322U,	// <7,7,4,7>: Cost 3 vmrglw <5,6,7,4>, <6,6,7,7>
+  1705432449U,	// <7,7,4,u>: Cost 2 vsldoi12 RHS, <7,4,u,6>
+  3852916101U,	// <7,7,5,0>: Cost 4 vsldoi12 RHS, <7,5,0,1>
+  3854906765U,	// <7,7,5,1>: Cost 4 vsldoi12 RHS, <7,5,1,0>
+  3852916121U,	// <7,7,5,2>: Cost 4 vsldoi12 RHS, <7,5,2,3>
+  3389846010U,	// <7,7,5,3>: Cost 4 vmrglw <5,6,7,5>, <6,2,7,3>
+  3852916141U,	// <7,7,5,4>: Cost 4 vsldoi12 RHS, <7,5,4,5>
+  2779174326U,	// <7,7,5,5>: Cost 3 vsldoi12 RHS, <7,5,5,5>
+  2779174337U,	// <7,7,5,6>: Cost 3 vsldoi12 RHS, <7,5,6,7>
+  2329376364U,	// <7,7,5,7>: Cost 3 vmrglw <7,u,7,5>, <7,7,7,7>
+  2779321811U,	// <7,7,5,u>: Cost 3 vsldoi12 RHS, <7,5,u,7>
+  2658287718U,	// <7,7,6,0>: Cost 3 vsldoi4 <6,7,7,6>, LHS
+  3852916197U,	// <7,7,6,1>: Cost 4 vsldoi12 RHS, <7,6,1,7>
+  2779174382U,	// <7,7,6,2>: Cost 3 vsldoi12 RHS, <7,6,2,7>
+  2316112378U,	// <7,7,6,3>: Cost 3 vmrglw <5,6,7,6>, <6,2,7,3>
+  2658290998U,	// <7,7,6,4>: Cost 3 vsldoi4 <6,7,7,6>, RHS
+  3852916233U,	// <7,7,6,5>: Cost 4 vsldoi12 RHS, <7,6,5,7>
+  1651004226U,	// <7,7,6,6>: Cost 2 vsldoi8 <6,6,7,7>, <6,6,7,7>
+  2779174420U,	// <7,7,6,7>: Cost 3 vsldoi12 RHS, <7,6,7,0>
+  1652331492U,	// <7,7,6,u>: Cost 2 vsldoi8 <6,u,7,7>, <6,u,7,7>
+  1590526054U,	// <7,7,7,0>: Cost 2 vsldoi4 <7,7,7,7>, LHS
+  2328728623U,	// <7,7,7,1>: Cost 3 vmrglw <7,7,7,7>, <7,0,7,1>
+  2724746451U,	// <7,7,7,2>: Cost 3 vsldoi8 <6,6,7,7>, <7,2,7,3>
+  2322092538U,	// <7,7,7,3>: Cost 3 vmrglw <6,6,7,7>, <6,2,7,3>
+  1590529334U,	// <7,7,7,4>: Cost 2 vsldoi4 <7,7,7,7>, RHS
+  2328728951U,	// <7,7,7,5>: Cost 3 vmrglw <7,7,7,7>, <7,4,7,5>
+  2724746770U,	// <7,7,7,6>: Cost 3 vsldoi8 <6,6,7,7>, <7,6,6,7>
+  430361910U,	// <7,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,7,u>: Cost 1 vspltisw3 RHS
+  1242320994U,	// <7,7,u,0>: Cost 2 vmrglw <5,6,7,0>, <5,6,7,0>
+  1705580162U,	// <7,7,u,1>: Cost 2 vsldoi12 RHS, <7,u,1,2>
+  2779321996U,	// <7,7,u,2>: Cost 3 vsldoi12 RHS, <7,u,2,3>
+  1245663738U,	// <7,7,u,3>: Cost 2 vmrglw <6,2,7,3>, <6,2,7,3>
+  1242353766U,	// <7,7,u,4>: Cost 2 vmrglw <5,6,7,4>, <5,6,7,4>
+  1705580202U,	// <7,7,u,5>: Cost 2 vsldoi12 RHS, <7,u,5,6>
+  1662949620U,	// <7,7,u,6>: Cost 2 vsldoi8 <u,6,7,7>, <u,6,7,7>
+  430361910U,	// <7,7,u,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,7,u,u>: Cost 1 vspltisw3 RHS
+  1705426944U,	// <7,u,0,0>: Cost 2 vsldoi12 RHS, <0,0,0,0>
+  1705432787U,	// <7,u,0,1>: Cost 2 vsldoi12 RHS, <u,0,1,2>
+  2316060885U,	// <7,u,0,2>: Cost 3 vmrglw <5,6,7,0>, <3,0,u,2>
+  1242316956U,	// <7,u,0,3>: Cost 2 vmrglw <5,6,7,0>, LHS
+  2779174637U,	// <7,u,0,4>: Cost 3 vsldoi12 RHS, <u,0,4,1>
+  1182750874U,	// <7,u,0,5>: Cost 2 vmrghw <7,0,1,2>, RHS
+  2316061213U,	// <7,u,0,6>: Cost 3 vmrglw <5,6,7,0>, <3,4,u,6>
+  1242320200U,	// <7,u,0,7>: Cost 2 vmrglw <5,6,7,0>, RHS
+  1705432850U,	// <7,u,0,u>: Cost 2 vsldoi12 RHS, <u,0,u,2>
+  1584578662U,	// <7,u,1,0>: Cost 2 vsldoi4 <6,7,u,1>, LHS
+  1705427764U,	// <7,u,1,1>: Cost 2 vsldoi12 RHS, <1,1,1,1>
+  631691054U,	// <7,u,1,2>: Cost 1 vsldoi12 RHS, LHS
+  2640407307U,	// <7,u,1,3>: Cost 3 vsldoi4 <3,7,u,1>, <3,7,u,1>
+  1584581942U,	// <7,u,1,4>: Cost 2 vsldoi4 <6,7,u,1>, RHS
+  2779174726U,	// <7,u,1,5>: Cost 3 vsldoi12 RHS, <u,1,5,0>
+  1584583574U,	// <7,u,1,6>: Cost 2 vsldoi4 <6,7,u,1>, <6,7,u,1>
+  2779322201U,	// <7,u,1,7>: Cost 3 vsldoi12 RHS, <u,1,7,1>
+  631691108U,	// <7,u,1,u>: Cost 1 vsldoi12 RHS, LHS
+  2779174763U,	// <7,u,2,0>: Cost 3 vsldoi12 RHS, <u,2,0,1>
+  2779174774U,	// <7,u,2,1>: Cost 3 vsldoi12 RHS, <u,2,1,3>
+  1705428584U,	// <7,u,2,2>: Cost 2 vsldoi12 RHS, <2,2,2,2>
+  1705432965U,	// <7,u,2,3>: Cost 2 vsldoi12 RHS, <u,2,3,0>
+  2779174801U,	// <7,u,2,4>: Cost 3 vsldoi12 RHS, <u,2,4,3>
+  2779174810U,	// <7,u,2,5>: Cost 3 vsldoi12 RHS, <u,2,5,3>
+  2767673251U,	// <7,u,2,6>: Cost 3 vsldoi12 <2,6,3,7>, <u,2,6,3>
+  1705580460U,	// <7,u,2,7>: Cost 2 vsldoi12 RHS, <u,2,7,3>
+  1705433010U,	// <7,u,2,u>: Cost 2 vsldoi12 RHS, <u,2,u,0>
+  1705433020U,	// <7,u,3,0>: Cost 2 vsldoi12 RHS, <u,3,0,1>
+  2779174853U,	// <7,u,3,1>: Cost 3 vsldoi12 RHS, <u,3,1,1>
+  2767673299U,	// <7,u,3,2>: Cost 3 vsldoi12 <2,6,3,7>, <u,3,2,6>
+  1245659292U,	// <7,u,3,3>: Cost 2 vmrglw <6,2,7,3>, LHS
+  1705433060U,	// <7,u,3,4>: Cost 2 vsldoi12 RHS, <u,3,4,5>
+  2779174893U,	// <7,u,3,5>: Cost 3 vsldoi12 RHS, <u,3,5,5>
+  2706836152U,	// <7,u,3,6>: Cost 3 vsldoi8 <3,6,7,u>, <3,6,7,u>
+  1245662536U,	// <7,u,3,7>: Cost 2 vmrglw <6,2,7,3>, RHS
+  1705433092U,	// <7,u,3,u>: Cost 2 vsldoi12 RHS, <u,3,u,1>
+  2779174925U,	// <7,u,4,0>: Cost 3 vsldoi12 RHS, <u,4,0,1>
+  1185732398U,	// <7,u,4,1>: Cost 2 vmrghw <7,4,5,6>, LHS
+  2316093653U,	// <7,u,4,2>: Cost 3 vmrglw <5,6,7,4>, <3,0,u,2>
+  1242349724U,	// <7,u,4,3>: Cost 2 vmrglw <5,6,7,4>, LHS
+  1705430224U,	// <7,u,4,4>: Cost 2 vsldoi12 RHS, <4,4,4,4>
+  1705433151U,	// <7,u,4,5>: Cost 2 vsldoi12 RHS, <u,4,5,6>
+  2316093981U,	// <7,u,4,6>: Cost 3 vmrglw <5,6,7,4>, <3,4,u,6>
+  1242352968U,	// <7,u,4,7>: Cost 2 vmrglw <5,6,7,4>, RHS
+  1705433178U,	// <7,u,4,u>: Cost 2 vsldoi12 RHS, <u,4,u,6>
+  1584611430U,	// <7,u,5,0>: Cost 2 vsldoi4 <6,7,u,5>, LHS
+  2781165670U,	// <7,u,5,1>: Cost 3 vsldoi12 RHS, <u,5,1,0>
+  2640439226U,	// <7,u,5,2>: Cost 3 vsldoi4 <3,7,u,5>, <2,6,3,7>
+  2640440079U,	// <7,u,5,3>: Cost 3 vsldoi4 <3,7,u,5>, <3,7,u,5>
+  1584614710U,	// <7,u,5,4>: Cost 2 vsldoi4 <6,7,u,5>, RHS
+  1705431044U,	// <7,u,5,5>: Cost 2 vsldoi12 RHS, <5,5,5,5>
+  631691418U,	// <7,u,5,6>: Cost 1 vsldoi12 RHS, RHS
+  2779322525U,	// <7,u,5,7>: Cost 3 vsldoi12 RHS, <u,5,7,1>
+  631691436U,	// <7,u,5,u>: Cost 1 vsldoi12 RHS, RHS
+  2779175087U,	// <7,u,6,0>: Cost 3 vsldoi12 RHS, <u,6,0,1>
+  2779175102U,	// <7,u,6,1>: Cost 3 vsldoi12 RHS, <u,6,1,7>
+  1648357887U,	// <7,u,6,2>: Cost 2 vsldoi8 <6,2,7,u>, <6,2,7,u>
+  1705433296U,	// <7,u,6,3>: Cost 2 vsldoi12 RHS, <u,6,3,7>
+  2779175127U,	// <7,u,6,4>: Cost 3 vsldoi12 RHS, <u,6,4,5>
+  2779175138U,	// <7,u,6,5>: Cost 3 vsldoi12 RHS, <u,6,5,7>
+  1651012419U,	// <7,u,6,6>: Cost 2 vsldoi8 <6,6,7,u>, <6,6,7,u>
+  1705580788U,	// <7,u,6,7>: Cost 2 vsldoi12 RHS, <u,6,7,7>
+  1705433341U,	// <7,u,6,u>: Cost 2 vsldoi12 RHS, <u,6,u,7>
+  1705580800U,	// <7,u,7,0>: Cost 2 vsldoi12 RHS, <u,7,0,1>
+  1187878702U,	// <7,u,7,1>: Cost 2 vmrghw <7,7,7,7>, LHS
+  2768042263U,	// <7,u,7,2>: Cost 3 vsldoi12 <2,6,u,7>, <u,7,2,6>
+  1248346268U,	// <7,u,7,3>: Cost 2 vmrglw <6,6,7,7>, LHS
+  1705580840U,	// <7,u,7,4>: Cost 2 vsldoi12 RHS, <u,7,4,5>
+  1187879066U,	// <7,u,7,5>: Cost 2 vmrghw <7,7,7,7>, RHS
+  2779322679U,	// <7,u,7,6>: Cost 3 vsldoi12 RHS, <u,7,6,2>
+  430361910U,	// <7,u,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <7,u,7,u>: Cost 1 vspltisw3 RHS
+  1705433425U,	// <7,u,u,0>: Cost 2 vsldoi12 RHS, <u,u,0,1>
+  1705433435U,	// <7,u,u,1>: Cost 2 vsldoi12 RHS, <u,u,1,2>
+  631691621U,	// <7,u,u,2>: Cost 1 vsldoi12 RHS, LHS
+  1705433451U,	// <7,u,u,3>: Cost 2 vsldoi12 RHS, <u,u,3,0>
+  1705433465U,	// <7,u,u,4>: Cost 2 vsldoi12 RHS, <u,u,4,5>
+  1705433475U,	// <7,u,u,5>: Cost 2 vsldoi12 RHS, <u,u,5,6>
+  631691661U,	// <7,u,u,6>: Cost 1 vsldoi12 RHS, RHS
+  430361910U,	// <7,u,u,7>: Cost 1 vspltisw3 RHS
+  631691675U,	// <7,u,u,u>: Cost 1 vsldoi12 RHS, LHS
+  202162278U,	// <u,0,0,0>: Cost 1 vspltisw0 LHS
+  1678598154U,	// <u,0,0,1>: Cost 2 vsldoi12 LHS, <0,0,1,1>
+  2634500154U,	// <u,0,0,2>: Cost 3 vsldoi4 <2,u,0,0>, <2,u,0,0>
+  2289596269U,	// <u,0,0,3>: Cost 3 vmrglw <1,2,u,0>, <u,2,0,3>
+  1548815670U,	// <u,0,0,4>: Cost 2 vsldoi4 <0,u,0,0>, RHS
+  2663698530U,	// <u,0,0,5>: Cost 3 vsldoi4 <7,7,0,0>, <5,6,7,0>
+  2658390942U,	// <u,0,0,6>: Cost 3 vsldoi4 <6,u,0,0>, <6,u,0,0>
+  2289596597U,	// <u,0,0,7>: Cost 3 vmrglw <1,2,u,0>, <u,6,0,7>
+  202162278U,	// <u,0,0,u>: Cost 1 vspltisw0 LHS
+  1560764518U,	// <u,0,1,0>: Cost 2 vsldoi4 <2,u,0,1>, LHS
+  115720294U,	// <u,0,1,1>: Cost 1 vmrghw LHS, LHS
+  604856427U,	// <u,0,1,2>: Cost 1 vsldoi12 LHS, LHS
+  2634508438U,	// <u,0,1,3>: Cost 3 vsldoi4 <2,u,0,1>, <3,0,1,2>
+  1560767798U,	// <u,0,1,4>: Cost 2 vsldoi4 <2,u,0,1>, RHS
+  2652426438U,	// <u,0,1,5>: Cost 3 vsldoi4 <5,u,0,1>, <5,u,0,1>
+  1584657311U,	// <u,0,1,6>: Cost 2 vsldoi4 <6,u,0,1>, <6,u,0,1>
+  2658399226U,	// <u,0,1,7>: Cost 3 vsldoi4 <6,u,0,1>, <7,0,1,2>
+  604856476U,	// <u,0,1,u>: Cost 1 vsldoi12 LHS, LHS
+  2696889850U,	// <u,0,2,0>: Cost 3 vsldoi8 <2,0,u,0>, <2,0,u,0>
+  1190174822U,	// <u,0,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  2692245096U,	// <u,0,2,2>: Cost 3 vsldoi8 <1,2,u,0>, <2,2,2,2>
+  2692245158U,	// <u,0,2,3>: Cost 3 vsldoi8 <1,2,u,0>, <2,3,0,1>
+  2263916882U,	// <u,0,2,4>: Cost 3 vmrghw <u,2,3,0>, <0,4,1,5>
+  2299709908U,	// <u,0,2,5>: Cost 3 vmrglw <3,0,1,2>, <3,4,0,5>
+  2692245434U,	// <u,0,2,6>: Cost 3 vsldoi8 <1,2,u,0>, <2,6,3,7>
+  2701535281U,	// <u,0,2,7>: Cost 3 vsldoi8 <2,7,u,0>, <2,7,u,0>
+  1190175389U,	// <u,0,2,u>: Cost 2 vmrghw <u,2,3,0>, LHS
+  1209237504U,	// <u,0,3,0>: Cost 2 vmrglw LHS, <0,0,0,0>
+  1209239206U,	// <u,0,3,1>: Cost 2 vmrglw LHS, <2,3,0,1>
+  2704189813U,	// <u,0,3,2>: Cost 3 vsldoi8 <3,2,u,0>, <3,2,u,0>
+  2692245916U,	// <u,0,3,3>: Cost 3 vsldoi8 <1,2,u,0>, <3,3,3,3>
+  2282981033U,	// <u,0,3,4>: Cost 3 vmrglw LHS, <2,3,0,4>
+  2664386658U,	// <u,0,3,5>: Cost 3 vsldoi4 <7,u,0,3>, <5,6,7,0>
+  2691877496U,	// <u,0,3,6>: Cost 3 vsldoi8 <1,2,3,0>, <3,6,0,7>
+  2664388218U,	// <u,0,3,7>: Cost 3 vsldoi4 <7,u,0,3>, <7,u,0,3>
+  1209239213U,	// <u,0,3,u>: Cost 2 vmrglw LHS, <2,3,0,u>
+  2289623040U,	// <u,0,4,0>: Cost 3 vmrglw <1,2,u,4>, <0,0,0,0>
+  1678598482U,	// <u,0,4,1>: Cost 2 vsldoi12 LHS, <0,4,1,5>
+  2634532926U,	// <u,0,4,2>: Cost 3 vsldoi4 <2,u,0,4>, <2,u,0,4>
+  2235580672U,	// <u,0,4,3>: Cost 3 vmrghw <3,4,5,6>, <0,3,1,4>
+  1143619922U,	// <u,0,4,4>: Cost 2 vmrghw <0,4,1,5>, <0,4,1,5>
+  1618505014U,	// <u,0,4,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  2658423714U,	// <u,0,4,6>: Cost 3 vsldoi4 <6,u,0,4>, <6,u,0,4>
+  2713259464U,	// <u,0,4,7>: Cost 3 vsldoi8 <4,7,5,0>, <4,7,5,0>
+  1683243409U,	// <u,0,4,u>: Cost 2 vsldoi12 LHS, <0,4,u,5>
+  1192443904U,	// <u,0,5,0>: Cost 2 vmrghw RHS, <0,0,0,0>
+  118702182U,	// <u,0,5,1>: Cost 1 vmrghw RHS, LHS
+  2266185901U,	// <u,0,5,2>: Cost 3 vmrghw RHS, <0,2,1,2>
+  2640513816U,	// <u,0,5,3>: Cost 3 vsldoi4 <3,u,0,5>, <3,u,0,5>
+  1192444242U,	// <u,0,5,4>: Cost 2 vmrghw RHS, <0,4,1,5>
+  2718789636U,	// <u,0,5,5>: Cost 3 vsldoi8 <5,6,u,0>, <5,5,5,5>
+  1645047915U,	// <u,0,5,6>: Cost 2 vsldoi8 <5,6,u,0>, <5,6,u,0>
+  2664404604U,	// <u,0,5,7>: Cost 3 vsldoi4 <7,u,0,5>, <7,u,0,5>
+  118702749U,	// <u,0,5,u>: Cost 1 vmrghw RHS, LHS
+  2302910464U,	// <u,0,6,0>: Cost 3 vmrglw <3,4,u,6>, <0,0,0,0>
+  1192886374U,	// <u,0,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  2718790138U,	// <u,0,6,2>: Cost 3 vsldoi8 <5,6,u,0>, <6,2,7,3>
+  2722771537U,	// <u,0,6,3>: Cost 3 vsldoi8 <6,3,u,0>, <6,3,u,0>
+  2266628434U,	// <u,0,6,4>: Cost 3 vmrghw <u,6,3,7>, <0,4,1,5>
+  2248950180U,	// <u,0,6,5>: Cost 3 vmrghw <5,6,7,0>, <0,5,1,6>
+  2718790456U,	// <u,0,6,6>: Cost 3 vsldoi8 <5,6,u,0>, <6,6,6,6>
+  2718790478U,	// <u,0,6,7>: Cost 3 vsldoi8 <5,6,u,0>, <6,7,0,1>
+  1192886941U,	// <u,0,6,u>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1235812352U,	// <u,0,7,0>: Cost 2 vmrglw RHS, <0,0,0,0>
+  1235814054U,	// <u,0,7,1>: Cost 2 vmrglw RHS, <2,3,0,1>
+  2728080601U,	// <u,0,7,2>: Cost 3 vsldoi8 <7,2,u,0>, <7,2,u,0>
+  2640530202U,	// <u,0,7,3>: Cost 3 vsldoi4 <3,u,0,7>, <3,u,0,7>
+  2640530742U,	// <u,0,7,4>: Cost 3 vsldoi4 <3,u,0,7>, RHS
+  2309556692U,	// <u,0,7,5>: Cost 3 vmrglw RHS, <3,4,0,5>
+  2730735133U,	// <u,0,7,6>: Cost 3 vsldoi8 <7,6,u,0>, <7,6,u,0>
+  2309556856U,	// <u,0,7,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  1235814061U,	// <u,0,7,u>: Cost 2 vmrglw RHS, <2,3,0,u>
+  202162278U,	// <u,0,u,0>: Cost 1 vspltisw0 LHS
+  120365158U,	// <u,0,u,1>: Cost 1 vmrghw LHS, LHS
+  604856989U,	// <u,0,u,2>: Cost 1 vsldoi12 LHS, LHS
+  2692249532U,	// <u,0,u,3>: Cost 3 vsldoi8 <1,2,u,0>, <u,3,0,1>
+  1560825142U,	// <u,0,u,4>: Cost 2 vsldoi4 <2,u,0,u>, RHS
+  1618507930U,	// <u,0,u,5>: Cost 2 vsldoi8 <1,2,u,0>, RHS
+  1584714662U,	// <u,0,u,6>: Cost 2 vsldoi4 <6,u,0,u>, <6,u,0,u>
+  2309565048U,	// <u,0,u,7>: Cost 3 vmrglw RHS, <3,6,0,7>
+  604857043U,	// <u,0,u,u>: Cost 1 vsldoi12 LHS, LHS
+  1611210825U,	// <u,1,0,0>: Cost 2 vsldoi8 <0,0,u,1>, <0,0,u,1>
+  1616519270U,	// <u,1,0,1>: Cost 2 vsldoi8 <0,u,u,1>, LHS
+  2287605459U,	// <u,1,0,2>: Cost 3 vmrglw <0,u,u,0>, <u,0,1,2>
+  2640546588U,	// <u,1,0,3>: Cost 3 vsldoi4 <3,u,1,0>, <3,u,1,0>
+  2622631222U,	// <u,1,0,4>: Cost 3 vsldoi4 <0,u,1,0>, RHS
+  2289590610U,	// <u,1,0,5>: Cost 3 vmrglw <1,2,u,0>, <0,4,1,5>
+  2664436630U,	// <u,1,0,6>: Cost 3 vsldoi4 <7,u,1,0>, <6,7,u,1>
+  2664437376U,	// <u,1,0,7>: Cost 3 vsldoi4 <7,u,1,0>, <7,u,1,0>
+  1616519889U,	// <u,1,0,u>: Cost 2 vsldoi8 <0,u,u,1>, <0,u,u,1>
+  1548894866U,	// <u,1,1,0>: Cost 2 vsldoi4 <0,u,1,1>, <0,u,1,1>
+  269271142U,	// <u,1,1,1>: Cost 1 vspltisw1 LHS
+  1189462934U,	// <u,1,1,2>: Cost 2 vmrghw LHS, <1,2,3,0>
+  2622638230U,	// <u,1,1,3>: Cost 3 vsldoi4 <0,u,1,1>, <3,0,1,2>
+  1548897590U,	// <u,1,1,4>: Cost 2 vsldoi4 <0,u,1,1>, RHS
+  2756985692U,	// <u,1,1,5>: Cost 3 vsldoi12 LHS, <1,1,5,5>
+  2658472872U,	// <u,1,1,6>: Cost 3 vsldoi4 <6,u,1,1>, <6,u,1,1>
+  2287614142U,	// <u,1,1,7>: Cost 3 vmrglw <0,u,u,1>, <u,6,1,7>
+  269271142U,	// <u,1,1,u>: Cost 1 vspltisw1 LHS
+  1566818406U,	// <u,1,2,0>: Cost 2 vsldoi4 <3,u,1,2>, LHS
+  2756985735U,	// <u,1,2,1>: Cost 3 vsldoi12 LHS, <1,2,1,3>
+  1148371862U,	// <u,1,2,2>: Cost 2 vmrghw <1,2,3,0>, <1,2,3,0>
+  835584U,	// <u,1,2,3>: Cost 0 copy LHS
+  1566821686U,	// <u,1,2,4>: Cost 2 vsldoi4 <3,u,1,2>, RHS
+  2756985771U,	// <u,1,2,5>: Cost 3 vsldoi12 LHS, <1,2,5,3>
+  2690262970U,	// <u,1,2,6>: Cost 3 vsldoi8 <0,u,u,1>, <2,6,3,7>
+  1590711938U,	// <u,1,2,7>: Cost 2 vsldoi4 <7,u,1,2>, <7,u,1,2>
+  835584U,	// <u,1,2,u>: Cost 0 copy LHS
+  2282979337U,	// <u,1,3,0>: Cost 3 vmrglw LHS, <0,0,1,0>
+  1209237514U,	// <u,1,3,1>: Cost 2 vmrglw LHS, <0,0,1,1>
+  1209239702U,	// <u,1,3,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  2282979502U,	// <u,1,3,3>: Cost 3 vmrglw LHS, <0,2,1,3>
+  2282979341U,	// <u,1,3,4>: Cost 3 vmrglw LHS, <0,0,1,4>
+  1209237842U,	// <u,1,3,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2282979505U,	// <u,1,3,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  2287625423U,	// <u,1,3,7>: Cost 3 vmrglw LHS, <1,6,1,7>
+  1209237521U,	// <u,1,3,u>: Cost 2 vmrglw LHS, <0,0,1,u>
+  1635101613U,	// <u,1,4,0>: Cost 2 vsldoi8 <4,0,u,1>, <4,0,u,1>
+  2289623050U,	// <u,1,4,1>: Cost 3 vmrglw <1,2,u,4>, <0,0,1,1>
+  2289625238U,	// <u,1,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,1,2>
+  2640579360U,	// <u,1,4,3>: Cost 3 vsldoi4 <3,u,1,4>, <3,u,1,4>
+  2622663990U,	// <u,1,4,4>: Cost 3 vsldoi4 <0,u,1,4>, RHS
+  1616522550U,	// <u,1,4,5>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  2664469398U,	// <u,1,4,6>: Cost 3 vsldoi4 <7,u,1,4>, <6,7,u,1>
+  2664470148U,	// <u,1,4,7>: Cost 3 vsldoi4 <7,u,1,4>, <7,u,1,4>
+  1616522793U,	// <u,1,4,u>: Cost 2 vsldoi8 <0,u,u,1>, RHS
+  1548927638U,	// <u,1,5,0>: Cost 2 vsldoi4 <0,u,1,5>, <0,u,1,5>
+  1192444724U,	// <u,1,5,1>: Cost 2 vmrghw RHS, <1,1,1,1>
+  1192444822U,	// <u,1,5,2>: Cost 2 vmrghw RHS, <1,2,3,0>
+  2622670998U,	// <u,1,5,3>: Cost 3 vsldoi4 <0,u,1,5>, <3,0,1,2>
+  1548930358U,	// <u,1,5,4>: Cost 2 vsldoi4 <0,u,1,5>, RHS
+  1210728786U,	// <u,1,5,5>: Cost 2 vmrglw <0,4,1,5>, <0,4,1,5>
+  2714153058U,	// <u,1,5,6>: Cost 3 vsldoi8 <4,u,u,1>, <5,6,7,0>
+  2670449658U,	// <u,1,5,7>: Cost 3 vsldoi4 <u,u,1,5>, <7,0,1,2>
+  1548932910U,	// <u,1,5,u>: Cost 2 vsldoi4 <0,u,1,5>, LHS
+  2622677655U,	// <u,1,6,0>: Cost 3 vsldoi4 <0,u,1,6>, <0,u,1,6>
+  2756986063U,	// <u,1,6,1>: Cost 3 vsldoi12 LHS, <1,6,1,7>
+  2302912662U,	// <u,1,6,2>: Cost 3 vmrglw <3,4,u,6>, <3,0,1,2>
+  3696421014U,	// <u,1,6,3>: Cost 4 vsldoi4 <0,u,1,6>, <3,0,1,2>
+  2622680374U,	// <u,1,6,4>: Cost 3 vsldoi4 <0,u,1,6>, RHS
+  2756986099U,	// <u,1,6,5>: Cost 3 vsldoi12 LHS, <1,6,5,7>
+  2714153784U,	// <u,1,6,6>: Cost 3 vsldoi8 <4,u,u,1>, <6,6,6,6>
+  1651692438U,	// <u,1,6,7>: Cost 2 vsldoi8 <6,7,u,1>, <6,7,u,1>
+  1652356071U,	// <u,1,6,u>: Cost 2 vsldoi8 <6,u,u,1>, <6,u,u,1>
+  2628657254U,	// <u,1,7,0>: Cost 3 vsldoi4 <1,u,1,7>, LHS
+  1235812362U,	// <u,1,7,1>: Cost 2 vmrglw RHS, <0,0,1,1>
+  1235814550U,	// <u,1,7,2>: Cost 2 vmrglw RHS, <3,0,1,2>
+  2309554350U,	// <u,1,7,3>: Cost 3 vmrglw RHS, <0,2,1,3>
+  2628660534U,	// <u,1,7,4>: Cost 3 vsldoi4 <1,u,1,7>, RHS
+  1235812690U,	// <u,1,7,5>: Cost 2 vmrglw RHS, <0,4,1,5>
+  2309554353U,	// <u,1,7,6>: Cost 3 vmrglw RHS, <0,2,1,6>
+  2309554678U,	// <u,1,7,7>: Cost 3 vmrglw RHS, <0,6,1,7>
+  1235812369U,	// <u,1,7,u>: Cost 2 vmrglw RHS, <0,0,1,u>
+  1548952217U,	// <u,1,u,0>: Cost 2 vsldoi4 <0,u,1,u>, <0,u,1,u>
+  269271142U,	// <u,1,u,1>: Cost 1 vspltisw1 LHS
+  1209280662U,	// <u,1,u,2>: Cost 2 vmrglw LHS, <3,0,1,2>
+  835584U,	// <u,1,u,3>: Cost 0 copy LHS
+  1548954934U,	// <u,1,u,4>: Cost 2 vsldoi4 <0,u,1,u>, RHS
+  1209278802U,	// <u,1,u,5>: Cost 2 vmrglw LHS, <0,4,1,5>
+  2283020465U,	// <u,1,u,6>: Cost 3 vmrglw LHS, <0,2,1,6>
+  1590761096U,	// <u,1,u,7>: Cost 2 vsldoi4 <7,u,1,u>, <7,u,1,u>
+  835584U,	// <u,1,u,u>: Cost 0 copy LHS
+  2702876672U,	// <u,2,0,0>: Cost 3 vsldoi8 <3,0,u,2>, <0,0,0,0>
+  1629134950U,	// <u,2,0,1>: Cost 2 vsldoi8 <3,0,u,2>, LHS
+  2289591912U,	// <u,2,0,2>: Cost 3 vmrglw <1,2,u,0>, <2,2,2,2>
+  1215848550U,	// <u,2,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2702877010U,	// <u,2,0,4>: Cost 3 vsldoi8 <3,0,u,2>, <0,4,1,5>
+  2289222708U,	// <u,2,0,5>: Cost 3 vmrglw <1,2,3,0>, <1,4,2,5>
+  2779178473U,	// <u,2,0,6>: Cost 3 vsldoi12 RHS, <2,0,6,1>
+  2726249024U,	// <u,2,0,7>: Cost 3 vsldoi8 <7,0,1,2>, <0,7,1,0>
+  1215848555U,	// <u,2,0,u>: Cost 2 vmrglw <1,2,u,0>, LHS
+  2690933539U,	// <u,2,1,0>: Cost 3 vsldoi8 <1,0,u,2>, <1,0,u,2>
+  2628683124U,	// <u,2,1,1>: Cost 3 vsldoi4 <1,u,2,1>, <1,u,2,1>
+  1189463656U,	// <u,2,1,2>: Cost 2 vmrghw LHS, <2,2,2,2>
+  1213866086U,	// <u,2,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  2628685110U,	// <u,2,1,4>: Cost 3 vsldoi4 <1,u,2,1>, RHS
+  2263205736U,	// <u,2,1,5>: Cost 3 vmrghw LHS, <2,5,3,6>
+  1189463994U,	// <u,2,1,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  2263205866U,	// <u,2,1,7>: Cost 3 vmrghw LHS, <2,7,0,1>
+  1213866091U,	// <u,2,1,u>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1556938854U,	// <u,2,2,0>: Cost 2 vsldoi4 <2,2,2,2>, LHS
+  2697569869U,	// <u,2,2,1>: Cost 3 vsldoi8 <2,1,u,2>, <2,1,u,2>
+  336380006U,	// <u,2,2,2>: Cost 1 vspltisw2 LHS
+  1678599794U,	// <u,2,2,3>: Cost 2 vsldoi12 LHS, <2,2,3,3>
+  1556942134U,	// <u,2,2,4>: Cost 2 vsldoi4 <2,2,2,2>, RHS
+  2295138061U,	// <u,2,2,5>: Cost 3 vmrglw <2,2,2,2>, <2,4,2,5>
+  2702878650U,	// <u,2,2,6>: Cost 3 vsldoi8 <3,0,u,2>, <2,6,3,7>
+  2300229831U,	// <u,2,2,7>: Cost 3 vmrglw <3,0,u,2>, <u,6,2,7>
+  336380006U,	// <u,2,2,u>: Cost 1 vspltisw2 LHS
+  475243165U,	// <u,2,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1548985140U,	// <u,2,3,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  1209239144U,	// <u,2,3,2>: Cost 2 vmrglw LHS, <2,2,2,2>
+  135495782U,	// <u,2,3,3>: Cost 1 vmrglw LHS, LHS
+  475245878U,	// <u,2,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1596764164U,	// <u,2,3,5>: Cost 2 vsldoi4 LHS, <5,5,5,5>
+  1596764666U,	// <u,2,3,6>: Cost 2 vsldoi4 LHS, <6,2,7,3>
+  1596765178U,	// <u,2,3,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135495787U,	// <u,2,3,u>: Cost 1 vmrglw LHS, LHS
+  2708851630U,	// <u,2,4,0>: Cost 3 vsldoi8 <4,0,u,2>, <4,0,u,2>
+  2217362979U,	// <u,2,4,1>: Cost 3 vmrghw <0,4,1,5>, <2,1,3,5>
+  2289624680U,	// <u,2,4,2>: Cost 3 vmrglw <1,2,u,4>, <2,2,2,2>
+  1215881318U,	// <u,2,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2726767824U,	// <u,2,4,4>: Cost 3 vsldoi8 <7,0,u,2>, <4,4,4,4>
+  1629138230U,	// <u,2,4,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  2779178801U,	// <u,2,4,6>: Cost 3 vsldoi12 RHS, <2,4,6,5>
+  2726251976U,	// <u,2,4,7>: Cost 3 vsldoi8 <7,0,1,2>, <4,7,5,0>
+  1215881323U,	// <u,2,4,u>: Cost 2 vmrglw <1,2,u,4>, LHS
+  2628714598U,	// <u,2,5,0>: Cost 3 vsldoi4 <1,u,2,5>, LHS
+  2628715896U,	// <u,2,5,1>: Cost 3 vsldoi4 <1,u,2,5>, <1,u,2,5>
+  1192445544U,	// <u,2,5,2>: Cost 2 vmrghw RHS, <2,2,2,2>
+  1213898854U,	// <u,2,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2628717878U,	// <u,2,5,4>: Cost 3 vsldoi4 <1,u,2,5>, RHS
+  2726768644U,	// <u,2,5,5>: Cost 3 vsldoi8 <7,0,u,2>, <5,5,5,5>
+  1192445882U,	// <u,2,5,6>: Cost 2 vmrghw RHS, <2,6,3,7>
+  2266187754U,	// <u,2,5,7>: Cost 3 vmrghw RHS, <2,7,0,1>
+  1213898859U,	// <u,2,5,u>: Cost 2 vmrglw <0,u,u,5>, LHS
+  2634694758U,	// <u,2,6,0>: Cost 3 vsldoi4 <2,u,2,6>, LHS
+  2721460657U,	// <u,2,6,1>: Cost 3 vsldoi8 <6,1,u,2>, <6,1,u,2>
+  2296940136U,	// <u,2,6,2>: Cost 3 vmrglw <2,4,u,6>, <2,2,2,2>
+  1678600122U,	// <u,2,6,3>: Cost 2 vsldoi12 LHS, <2,6,3,7>
+  2634698038U,	// <u,2,6,4>: Cost 3 vsldoi4 <2,u,2,6>, RHS
+  3370682125U,	// <u,2,6,5>: Cost 4 vmrglw <2,4,u,6>, <2,4,2,5>
+  1157056442U,	// <u,2,6,6>: Cost 2 vmrghw <2,6,3,7>, <2,6,3,7>
+  2725442455U,	// <u,2,6,7>: Cost 3 vsldoi8 <6,7,u,2>, <6,7,u,2>
+  1678600167U,	// <u,2,6,u>: Cost 2 vsldoi12 LHS, <2,6,u,7>
+  1653027897U,	// <u,2,7,0>: Cost 2 vsldoi8 <7,0,u,2>, <7,0,u,2>
+  2309554924U,	// <u,2,7,1>: Cost 3 vmrglw RHS, <1,0,2,1>
+  1235813992U,	// <u,2,7,2>: Cost 2 vmrglw RHS, <2,2,2,2>
+  162070630U,	// <u,2,7,3>: Cost 1 vmrglw RHS, LHS
+  2634706230U,	// <u,2,7,4>: Cost 3 vsldoi4 <2,u,2,7>, RHS
+  2309555252U,	// <u,2,7,5>: Cost 3 vmrglw RHS, <1,4,2,5>
+  2309555901U,	// <u,2,7,6>: Cost 3 vmrglw RHS, <2,3,2,6>
+  2309555416U,	// <u,2,7,7>: Cost 3 vmrglw RHS, <1,6,2,7>
+  162070635U,	// <u,2,7,u>: Cost 1 vmrglw RHS, LHS
+  475284130U,	// <u,2,u,0>: Cost 1 vsldoi4 LHS, LHS
+  1549026100U,	// <u,2,u,1>: Cost 2 vsldoi4 LHS, <1,1,1,1>
+  336380006U,	// <u,2,u,2>: Cost 1 vspltisw2 LHS
+  135536742U,	// <u,2,u,3>: Cost 1 vmrglw LHS, LHS
+  475286838U,	// <u,2,u,4>: Cost 1 vsldoi4 LHS, RHS
+  1629141146U,	// <u,2,u,5>: Cost 2 vsldoi8 <3,0,u,2>, RHS
+  1194108858U,	// <u,2,u,6>: Cost 2 vmrghw LHS, <2,6,3,7>
+  1596806138U,	// <u,2,u,7>: Cost 2 vsldoi4 LHS, <7,0,1,2>
+  135536747U,	// <u,2,u,u>: Cost 1 vmrglw LHS, LHS
+  1611890688U,	// <u,3,0,0>: Cost 2 vsldoi8 LHS, <0,0,0,0>
+  538149020U,	// <u,3,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685632685U,	// <u,3,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  2685632764U,	// <u,3,0,3>: Cost 3 vsldoi8 LHS, <0,3,1,0>
+  1611891026U,	// <u,3,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  2733408722U,	// <u,3,0,5>: Cost 3 vsldoi8 LHS, <0,5,6,7>
+  2658612153U,	// <u,3,0,6>: Cost 3 vsldoi4 <6,u,3,0>, <6,u,3,0>
+  2289592250U,	// <u,3,0,7>: Cost 3 vmrglw <1,2,u,0>, <2,6,3,7>
+  538149533U,	// <u,3,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1189464214U,	// <u,3,1,0>: Cost 2 vmrghw LHS, <3,0,1,2>
+  1611891508U,	// <u,3,1,1>: Cost 2 vsldoi8 LHS, <1,1,1,1>
+  1611891606U,	// <u,3,1,2>: Cost 2 vsldoi8 LHS, <1,2,3,0>
+  1189464476U,	// <u,3,1,3>: Cost 2 vmrghw LHS, <3,3,3,3>
+  1189464578U,	// <u,3,1,4>: Cost 2 vmrghw LHS, <3,4,5,6>
+  2690278511U,	// <u,3,1,5>: Cost 3 vsldoi8 LHS, <1,5,0,1>
+  2690278607U,	// <u,3,1,6>: Cost 3 vsldoi8 LHS, <1,6,1,7>
+  2287609786U,	// <u,3,1,7>: Cost 3 vmrglw <0,u,u,1>, <2,6,3,7>
+  1611892092U,	// <u,3,1,u>: Cost 2 vsldoi8 LHS, <1,u,3,0>
+  2685634042U,	// <u,3,2,0>: Cost 3 vsldoi8 LHS, <2,0,u,0>
+  2685634079U,	// <u,3,2,1>: Cost 3 vsldoi8 LHS, <2,1,3,1>
+  1611892328U,	// <u,3,2,2>: Cost 2 vsldoi8 LHS, <2,2,2,2>
+  1611892390U,	// <u,3,2,3>: Cost 2 vsldoi8 LHS, <2,3,0,1>
+  2685634371U,	// <u,3,2,4>: Cost 3 vsldoi8 LHS, <2,4,u,5>
+  2685634453U,	// <u,3,2,5>: Cost 3 vsldoi8 LHS, <2,5,u,6>
+  1611892666U,	// <u,3,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  2300225466U,	// <u,3,2,7>: Cost 3 vmrglw <3,0,u,2>, <2,6,3,7>
+  1611892795U,	// <u,3,2,u>: Cost 2 vsldoi8 LHS, <2,u,0,1>
+  1209238422U,	// <u,3,3,0>: Cost 2 vmrglw LHS, <1,2,3,0>
+  2282980247U,	// <u,3,3,1>: Cost 3 vmrglw LHS, <1,2,3,1>
+  1561004120U,	// <u,3,3,2>: Cost 2 vsldoi4 <2,u,3,3>, <2,u,3,3>
+  403488870U,	// <u,3,3,3>: Cost 1 vspltisw3 LHS
+  1209238426U,	// <u,3,3,4>: Cost 2 vmrglw LHS, <1,2,3,4>
+  2282980899U,	// <u,3,3,5>: Cost 3 vmrglw LHS, <2,1,3,5>
+  2282985598U,	// <u,3,3,6>: Cost 3 vmrglw LHS, <u,5,3,6>
+  1209239482U,	// <u,3,3,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  403488870U,	// <u,3,3,u>: Cost 1 vspltisw3 LHS
+  1555038310U,	// <u,3,4,0>: Cost 2 vsldoi4 <1,u,3,4>, LHS
+  1555039616U,	// <u,3,4,1>: Cost 2 vsldoi4 <1,u,3,4>, <1,u,3,4>
+  2628781672U,	// <u,3,4,2>: Cost 3 vsldoi4 <1,u,3,4>, <2,2,2,2>
+  2289624690U,	// <u,3,4,3>: Cost 3 vmrglw <1,2,u,4>, <2,2,3,3>
+  1555041590U,	// <u,3,4,4>: Cost 2 vsldoi4 <1,u,3,4>, RHS
+  538152246U,	// <u,3,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2658644925U,	// <u,3,4,6>: Cost 3 vsldoi4 <6,u,3,4>, <6,u,3,4>
+  2289625018U,	// <u,3,4,7>: Cost 3 vmrglw <1,2,u,4>, <2,6,3,7>
+  538152489U,	// <u,3,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1192446102U,	// <u,3,5,0>: Cost 2 vmrghw RHS, <3,0,1,2>
+  2733411983U,	// <u,3,5,1>: Cost 3 vsldoi8 LHS, <5,1,0,1>
+  2634762330U,	// <u,3,5,2>: Cost 3 vsldoi4 <2,u,3,5>, <2,u,3,5>
+  1192446364U,	// <u,3,5,3>: Cost 2 vmrghw RHS, <3,3,3,3>
+  1192446466U,	// <u,3,5,4>: Cost 2 vmrghw RHS, <3,4,5,6>
+  1659670532U,	// <u,3,5,5>: Cost 2 vsldoi8 LHS, <5,5,5,5>
+  1659670626U,	// <u,3,5,6>: Cost 2 vsldoi8 LHS, <5,6,7,0>
+  2287642554U,	// <u,3,5,7>: Cost 3 vmrglw <0,u,u,5>, <2,6,3,7>
+  1659670788U,	// <u,3,5,u>: Cost 2 vsldoi8 LHS, <5,u,7,0>
+  2634768486U,	// <u,3,6,0>: Cost 3 vsldoi4 <2,u,3,6>, LHS
+  2733412775U,	// <u,3,6,1>: Cost 3 vsldoi8 LHS, <6,1,7,1>
+  1648390659U,	// <u,3,6,2>: Cost 2 vsldoi8 <6,2,u,3>, <6,2,u,3>
+  2634770973U,	// <u,3,6,3>: Cost 3 vsldoi4 <2,u,3,6>, <3,4,u,6>
+  2634771766U,	// <u,3,6,4>: Cost 3 vsldoi4 <2,u,3,6>, RHS
+  2733413099U,	// <u,3,6,5>: Cost 3 vsldoi8 LHS, <6,5,7,1>
+  1659671352U,	// <u,3,6,6>: Cost 2 vsldoi8 LHS, <6,6,6,6>
+  1659671374U,	// <u,3,6,7>: Cost 2 vsldoi8 LHS, <6,7,0,1>
+  1652372457U,	// <u,3,6,u>: Cost 2 vsldoi8 <6,u,u,3>, <6,u,u,3>
+  1561034854U,	// <u,3,7,0>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  2634777396U,	// <u,3,7,1>: Cost 3 vsldoi4 <2,u,3,7>, <1,1,1,1>
+  1561036892U,	// <u,3,7,2>: Cost 2 vsldoi4 <2,u,3,7>, <2,u,3,7>
+  1235814002U,	// <u,3,7,3>: Cost 2 vmrglw RHS, <2,2,3,3>
+  1561038134U,	// <u,3,7,4>: Cost 2 vsldoi4 <2,u,3,7>, RHS
+  2309555747U,	// <u,3,7,5>: Cost 3 vmrglw RHS, <2,1,3,5>
+  2309556072U,	// <u,3,7,6>: Cost 3 vmrglw RHS, <2,5,3,6>
+  1235814330U,	// <u,3,7,7>: Cost 2 vmrglw RHS, <2,6,3,7>
+  1561040686U,	// <u,3,7,u>: Cost 2 vsldoi4 <2,u,3,7>, LHS
+  1611896531U,	// <u,3,u,0>: Cost 2 vsldoi8 LHS, <u,0,1,2>
+  538154798U,	// <u,3,u,1>: Cost 1 vsldoi8 LHS, LHS
+  1611896712U,	// <u,3,u,2>: Cost 2 vsldoi8 LHS, <u,2,3,3>
+  403488870U,	// <u,3,u,3>: Cost 1 vspltisw3 LHS
+  1611896895U,	// <u,3,u,4>: Cost 2 vsldoi8 LHS, <u,4,5,6>
+  538155162U,	// <u,3,u,5>: Cost 1 vsldoi8 LHS, RHS
+  1611897040U,	// <u,3,u,6>: Cost 2 vsldoi8 LHS, <u,6,3,7>
+  1209280442U,	// <u,3,u,7>: Cost 2 vmrglw LHS, <2,6,3,7>
+  538155365U,	// <u,3,u,u>: Cost 1 vsldoi8 LHS, LHS
+  1165118354U,	// <u,4,0,0>: Cost 2 vmrghw <4,0,5,1>, <4,0,5,1>
+  1618534502U,	// <u,4,0,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  2634795102U,	// <u,4,0,2>: Cost 3 vsldoi4 <2,u,4,0>, <2,u,4,0>
+  2686451968U,	// <u,4,0,3>: Cost 3 vsldoi8 <0,3,1,4>, <0,3,1,4>
+  2692276562U,	// <u,4,0,4>: Cost 3 vsldoi8 <1,2,u,4>, <0,4,1,5>
+  1705438098U,	// <u,4,0,5>: Cost 2 vsldoi12 RHS, <4,0,5,1>
+  2658685890U,	// <u,4,0,6>: Cost 3 vsldoi4 <6,u,4,0>, <6,u,4,0>
+  2256489928U,	// <u,4,0,7>: Cost 3 vmrghw <7,0,1,2>, <4,7,5,0>
+  1618535069U,	// <u,4,0,u>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1189464978U,	// <u,4,1,0>: Cost 2 vmrghw LHS, <4,0,5,1>
+  2692277044U,	// <u,4,1,1>: Cost 3 vsldoi8 <1,2,u,4>, <1,1,1,1>
+  1618535367U,	// <u,4,1,2>: Cost 2 vsldoi8 <1,2,u,4>, <1,2,u,4>
+  2640775992U,	// <u,4,1,3>: Cost 3 vsldoi4 <3,u,4,1>, <3,u,4,1>
+  1189465296U,	// <u,4,1,4>: Cost 2 vmrghw LHS, <4,4,4,4>
+  115723574U,	// <u,4,1,5>: Cost 1 vmrghw LHS, RHS
+  2263207289U,	// <u,4,1,6>: Cost 3 vmrghw LHS, <4,6,5,2>
+  2664666780U,	// <u,4,1,7>: Cost 3 vsldoi4 <7,u,4,1>, <7,u,4,1>
+  115723817U,	// <u,4,1,u>: Cost 1 vmrghw LHS, RHS
+  2263919506U,	// <u,4,2,0>: Cost 3 vmrghw <u,2,3,0>, <4,0,5,1>
+  2222115812U,	// <u,4,2,1>: Cost 3 vmrghw <1,2,3,0>, <4,1,5,2>
+  2692277864U,	// <u,4,2,2>: Cost 3 vsldoi8 <1,2,u,4>, <2,2,2,2>
+  2692277926U,	// <u,4,2,3>: Cost 3 vsldoi8 <1,2,u,4>, <2,3,0,1>
+  2324114640U,	// <u,4,2,4>: Cost 3 vmrglw <7,0,u,2>, <4,4,4,4>
+  1190178102U,	// <u,4,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278202U,	// <u,4,2,6>: Cost 3 vsldoi8 <1,2,u,4>, <2,6,3,7>
+  2701568053U,	// <u,4,2,7>: Cost 3 vsldoi8 <2,7,u,4>, <2,7,u,4>
+  1190178345U,	// <u,4,2,u>: Cost 2 vmrghw <u,2,3,0>, RHS
+  2692278422U,	// <u,4,3,0>: Cost 3 vsldoi8 <1,2,u,4>, <3,0,1,2>
+  2282981552U,	// <u,4,3,1>: Cost 3 vmrglw LHS, <3,0,4,1>
+  2704222585U,	// <u,4,3,2>: Cost 3 vsldoi8 <3,2,u,4>, <3,2,u,4>
+  2692278684U,	// <u,4,3,3>: Cost 3 vsldoi8 <1,2,u,4>, <3,3,3,3>
+  1257016528U,	// <u,4,3,4>: Cost 2 vmrglw LHS, <4,4,4,4>
+  1209239246U,	// <u,4,3,5>: Cost 2 vmrglw LHS, <2,3,4,5>
+  2691910300U,	// <u,4,3,6>: Cost 3 vsldoi8 <1,2,3,4>, <3,6,4,7>
+  2664683166U,	// <u,4,3,7>: Cost 3 vsldoi4 <7,u,4,3>, <7,u,4,3>
+  1209239249U,	// <u,4,3,u>: Cost 2 vmrglw LHS, <2,3,4,u>
+  1573027942U,	// <u,4,4,0>: Cost 2 vsldoi4 <4,u,4,4>, LHS
+  2634826695U,	// <u,4,4,1>: Cost 3 vsldoi4 <2,u,4,4>, <1,2,u,4>
+  2634827874U,	// <u,4,4,2>: Cost 3 vsldoi4 <2,u,4,4>, <2,u,4,4>
+  2289629073U,	// <u,4,4,3>: Cost 3 vmrglw <1,2,u,4>, <u,2,4,3>
+  229035318U,	// <u,4,4,4>: Cost 1 vspltisw0 RHS
+  1618537782U,	// <u,4,4,5>: Cost 2 vsldoi8 <1,2,u,4>, RHS
+  2658718662U,	// <u,4,4,6>: Cost 3 vsldoi4 <6,u,4,4>, <6,u,4,4>
+  2289629401U,	// <u,4,4,7>: Cost 3 vmrglw <1,2,u,4>, <u,6,4,7>
+  229035318U,	// <u,4,4,u>: Cost 1 vspltisw0 RHS
+  1561092198U,	// <u,4,5,0>: Cost 2 vsldoi4 <2,u,4,5>, LHS
+  2628863370U,	// <u,4,5,1>: Cost 3 vsldoi4 <1,u,4,5>, <1,u,4,5>
+  1561094243U,	// <u,4,5,2>: Cost 2 vsldoi4 <2,u,4,5>, <2,u,4,5>
+  2634836118U,	// <u,4,5,3>: Cost 3 vsldoi4 <2,u,4,5>, <3,0,1,2>
+  1561095478U,	// <u,4,5,4>: Cost 2 vsldoi4 <2,u,4,5>, RHS
+  118705462U,	// <u,4,5,5>: Cost 1 vmrghw RHS, RHS
+  604859702U,	// <u,4,5,6>: Cost 1 vsldoi12 LHS, RHS
+  2658726906U,	// <u,4,5,7>: Cost 3 vsldoi4 <6,u,4,5>, <7,0,1,2>
+  604859720U,	// <u,4,5,u>: Cost 1 vsldoi12 LHS, RHS
+  2266631058U,	// <u,4,6,0>: Cost 3 vmrghw <u,6,3,7>, <4,0,5,1>
+  2302692152U,	// <u,4,6,1>: Cost 3 vmrglw <3,4,5,6>, <3,u,4,1>
+  2718822906U,	// <u,4,6,2>: Cost 3 vsldoi8 <5,6,u,4>, <6,2,7,3>
+  2722804309U,	// <u,4,6,3>: Cost 3 vsldoi8 <6,3,u,4>, <6,3,u,4>
+  2723467942U,	// <u,4,6,4>: Cost 3 vsldoi8 <6,4,u,4>, <6,4,u,4>
+  1192889654U,	// <u,4,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2718823224U,	// <u,4,6,6>: Cost 3 vsldoi8 <5,6,u,4>, <6,6,6,6>
+  2718823246U,	// <u,4,6,7>: Cost 3 vsldoi8 <5,6,u,4>, <6,7,0,1>
+  1192889897U,	// <u,4,6,u>: Cost 2 vmrghw <u,6,3,7>, RHS
+  2640822374U,	// <u,4,7,0>: Cost 3 vsldoi4 <3,u,4,7>, LHS
+  2640823194U,	// <u,4,7,1>: Cost 3 vsldoi4 <3,u,4,7>, <1,2,3,4>
+  2728113373U,	// <u,4,7,2>: Cost 3 vsldoi8 <7,2,u,4>, <7,2,u,4>
+  2640825150U,	// <u,4,7,3>: Cost 3 vsldoi4 <3,u,4,7>, <3,u,4,7>
+  1235815632U,	// <u,4,7,4>: Cost 2 vmrglw RHS, <4,4,4,4>
+  1235814094U,	// <u,4,7,5>: Cost 2 vmrglw RHS, <2,3,4,5>
+  2730767905U,	// <u,4,7,6>: Cost 3 vsldoi8 <7,6,u,4>, <7,6,u,4>
+  2309556892U,	// <u,4,7,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  1235814097U,	// <u,4,7,u>: Cost 2 vmrglw RHS, <2,3,4,u>
+  1561116774U,	// <u,4,u,0>: Cost 2 vsldoi4 <2,u,4,u>, LHS
+  1618540334U,	// <u,4,u,1>: Cost 2 vsldoi8 <1,2,u,4>, LHS
+  1561118822U,	// <u,4,u,2>: Cost 2 vsldoi4 <2,u,4,u>, <2,u,4,u>
+  2692282300U,	// <u,4,u,3>: Cost 3 vsldoi8 <1,2,u,4>, <u,3,0,1>
+  229035318U,	// <u,4,u,4>: Cost 1 vspltisw0 RHS
+  120368438U,	// <u,4,u,5>: Cost 1 vmrghw LHS, RHS
+  604859945U,	// <u,4,u,6>: Cost 1 vsldoi12 LHS, RHS
+  2309565084U,	// <u,4,u,7>: Cost 3 vmrglw RHS, <3,6,4,7>
+  604859963U,	// <u,4,u,u>: Cost 1 vsldoi12 LHS, RHS
+  2690293760U,	// <u,5,0,0>: Cost 3 vsldoi8 <0,u,u,5>, <0,0,0,0>
+  1616552038U,	// <u,5,0,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2640840434U,	// <u,5,0,2>: Cost 3 vsldoi4 <3,u,5,0>, <2,3,u,5>
+  2640841536U,	// <u,5,0,3>: Cost 3 vsldoi4 <3,u,5,0>, <3,u,5,0>
+  1613381970U,	// <u,5,0,4>: Cost 2 vsldoi8 <0,4,1,5>, <0,4,1,5>
+  2316135642U,	// <u,5,0,5>: Cost 3 vmrglw <5,6,u,0>, <4,4,5,5>
+  2289592834U,	// <u,5,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,5,6>
+  2664732324U,	// <u,5,0,7>: Cost 3 vsldoi4 <7,u,5,0>, <7,u,5,0>
+  1616552661U,	// <u,5,0,u>: Cost 2 vsldoi8 <0,u,u,5>, <0,u,u,5>
+  1573077094U,	// <u,5,1,0>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  1237536282U,	// <u,5,1,1>: Cost 2 vmrglw <4,u,5,1>, <4,u,5,1>
+  2690294678U,	// <u,5,1,2>: Cost 3 vsldoi8 <0,u,u,5>, <1,2,3,0>
+  2646821014U,	// <u,5,1,3>: Cost 3 vsldoi4 <4,u,5,1>, <3,0,1,2>
+  1573080602U,	// <u,5,1,4>: Cost 2 vsldoi4 <4,u,5,1>, <4,u,5,1>
+  1189466116U,	// <u,5,1,5>: Cost 2 vmrghw LHS, <5,5,5,5>
+  1189466210U,	// <u,5,1,6>: Cost 2 vmrghw LHS, <5,6,7,0>
+  2646823930U,	// <u,5,1,7>: Cost 3 vsldoi4 <4,u,5,1>, <7,0,1,2>
+  1573082926U,	// <u,5,1,u>: Cost 2 vsldoi4 <4,u,5,1>, LHS
+  2640855142U,	// <u,5,2,0>: Cost 3 vsldoi4 <3,u,5,2>, LHS
+  2697594448U,	// <u,5,2,1>: Cost 3 vsldoi8 <2,1,u,5>, <2,1,u,5>
+  2690295400U,	// <u,5,2,2>: Cost 3 vsldoi8 <0,u,u,5>, <2,2,2,2>
+  1625179890U,	// <u,5,2,3>: Cost 2 vsldoi8 <2,3,u,5>, <2,3,u,5>
+  2699585347U,	// <u,5,2,4>: Cost 3 vsldoi8 <2,4,u,5>, <2,4,u,5>
+  2781171471U,	// <u,5,2,5>: Cost 3 vsldoi12 RHS, <5,2,5,3>
+  2690295738U,	// <u,5,2,6>: Cost 3 vsldoi8 <0,u,u,5>, <2,6,3,7>
+  3775318070U,	// <u,5,2,7>: Cost 4 vsldoi8 <2,7,u,5>, <2,7,u,5>
+  1628498055U,	// <u,5,2,u>: Cost 2 vsldoi8 <2,u,u,5>, <2,u,u,5>
+  2287627234U,	// <u,5,3,0>: Cost 3 vmrglw LHS, <4,1,5,0>
+  1257016210U,	// <u,5,3,1>: Cost 2 vmrglw LHS, <4,0,5,1>
+  2646836942U,	// <u,5,3,2>: Cost 3 vsldoi4 <4,u,5,3>, <2,3,4,5>
+  2287625131U,	// <u,5,3,3>: Cost 3 vmrglw LHS, <1,2,5,3>
+  2287627238U,	// <u,5,3,4>: Cost 3 vmrglw LHS, <4,1,5,4>
+  1257016538U,	// <u,5,3,5>: Cost 2 vmrglw LHS, <4,4,5,5>
+  1209240066U,	// <u,5,3,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  2287625459U,	// <u,5,3,7>: Cost 3 vmrglw LHS, <1,6,5,7>
+  1209240068U,	// <u,5,3,u>: Cost 2 vmrglw LHS, <3,4,5,u>
+  2640871526U,	// <u,5,4,0>: Cost 3 vsldoi4 <3,u,5,4>, LHS
+  2316168082U,	// <u,5,4,1>: Cost 3 vmrglw <5,6,u,4>, <4,0,5,1>
+  2640873202U,	// <u,5,4,2>: Cost 3 vsldoi4 <3,u,5,4>, <2,3,u,5>
+  2640874308U,	// <u,5,4,3>: Cost 3 vsldoi4 <3,u,5,4>, <3,u,5,4>
+  1637788917U,	// <u,5,4,4>: Cost 2 vsldoi8 <4,4,u,5>, <4,4,u,5>
+  1616555318U,	// <u,5,4,5>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  2287638591U,	// <u,5,4,6>: Cost 3 vmrglw <0,u,u,4>, <u,4,5,6>
+  2664765096U,	// <u,5,4,7>: Cost 3 vsldoi4 <7,u,5,4>, <7,u,5,4>
+  1616555561U,	// <u,5,4,u>: Cost 2 vsldoi8 <0,u,u,5>, RHS
+  1573109862U,	// <u,5,5,0>: Cost 2 vsldoi4 <4,u,5,5>, LHS
+  2646852404U,	// <u,5,5,1>: Cost 3 vsldoi4 <4,u,5,5>, <1,1,1,1>
+  2646853224U,	// <u,5,5,2>: Cost 3 vsldoi4 <4,u,5,5>, <2,2,2,2>
+  2287646618U,	// <u,5,5,3>: Cost 3 vmrglw <0,u,u,5>, <u,2,5,3>
+  1573113374U,	// <u,5,5,4>: Cost 2 vsldoi4 <4,u,5,5>, <4,u,5,5>
+  296144182U,	// <u,5,5,5>: Cost 1 vspltisw1 RHS
+  1192448098U,	// <u,5,5,6>: Cost 2 vmrghw RHS, <5,6,7,0>
+  2287646946U,	// <u,5,5,7>: Cost 3 vmrglw <0,u,u,5>, <u,6,5,7>
+  296144182U,	// <u,5,5,u>: Cost 1 vspltisw1 RHS
+  1567146086U,	// <u,5,6,0>: Cost 2 vsldoi4 <3,u,5,6>, LHS
+  2628945300U,	// <u,5,6,1>: Cost 3 vsldoi4 <1,u,5,6>, <1,u,5,6>
+  2634917997U,	// <u,5,6,2>: Cost 3 vsldoi4 <2,u,5,6>, <2,u,5,6>
+  1567148870U,	// <u,5,6,3>: Cost 2 vsldoi4 <3,u,5,6>, <3,u,5,6>
+  1567149366U,	// <u,5,6,4>: Cost 2 vsldoi4 <3,u,5,6>, RHS
+  2781171799U,	// <u,5,6,5>: Cost 3 vsldoi12 RHS, <5,6,5,7>
+  1228950018U,	// <u,5,6,6>: Cost 2 vmrglw <3,4,5,6>, <3,4,5,6>
+  27705344U,	// <u,5,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,6,u>: Cost 0 copy RHS
+  2628952166U,	// <u,5,7,0>: Cost 3 vsldoi4 <1,u,5,7>, LHS
+  1235815314U,	// <u,5,7,1>: Cost 2 vmrglw RHS, <4,0,5,1>
+  2309556734U,	// <u,5,7,2>: Cost 3 vmrglw RHS, <3,4,5,2>
+  2309555115U,	// <u,5,7,3>: Cost 3 vmrglw RHS, <1,2,5,3>
+  2628955446U,	// <u,5,7,4>: Cost 3 vsldoi4 <1,u,5,7>, RHS
+  1235815642U,	// <u,5,7,5>: Cost 2 vmrglw RHS, <4,4,5,5>
+  1235814914U,	// <u,5,7,6>: Cost 2 vmrglw RHS, <3,4,5,6>
+  2309555443U,	// <u,5,7,7>: Cost 3 vmrglw RHS, <1,6,5,7>
+  1235814916U,	// <u,5,7,u>: Cost 2 vmrglw RHS, <3,4,5,u>
+  1567162470U,	// <u,5,u,0>: Cost 2 vsldoi4 <3,u,5,u>, LHS
+  1616557870U,	// <u,5,u,1>: Cost 2 vsldoi8 <0,u,u,5>, LHS
+  2690299781U,	// <u,5,u,2>: Cost 3 vsldoi8 <0,u,u,5>, <u,2,3,0>
+  1567165256U,	// <u,5,u,3>: Cost 2 vsldoi4 <3,u,5,u>, <3,u,5,u>
+  1567165750U,	// <u,5,u,4>: Cost 2 vsldoi4 <3,u,5,u>, RHS
+  296144182U,	// <u,5,u,5>: Cost 1 vspltisw1 RHS
+  1209281026U,	// <u,5,u,6>: Cost 2 vmrglw LHS, <3,4,5,6>
+  27705344U,	// <u,5,u,7>: Cost 0 copy RHS
+  27705344U,	// <u,5,u,u>: Cost 0 copy RHS
+  2705563648U,	// <u,6,0,0>: Cost 3 vsldoi8 <3,4,u,6>, <0,0,0,0>
+  1631821926U,	// <u,6,0,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  2262462970U,	// <u,6,0,2>: Cost 3 vmrghw <u,0,1,2>, <6,2,7,3>
+  2646886941U,	// <u,6,0,3>: Cost 3 vsldoi4 <4,u,6,0>, <3,4,u,6>
+  2705563986U,	// <u,6,0,4>: Cost 3 vsldoi8 <3,4,u,6>, <0,4,1,5>
+  2316062652U,	// <u,6,0,5>: Cost 3 vmrglw <5,6,7,0>, <5,4,6,5>
+  2316137272U,	// <u,6,0,6>: Cost 3 vmrglw <5,6,u,0>, <6,6,6,6>
+  1215851830U,	// <u,6,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  1215851831U,	// <u,6,0,u>: Cost 2 vmrglw <1,2,u,0>, RHS
+  2634948710U,	// <u,6,1,0>: Cost 3 vsldoi4 <2,u,6,1>, LHS
+  2705564468U,	// <u,6,1,1>: Cost 3 vsldoi8 <3,4,u,6>, <1,1,1,1>
+  1189466618U,	// <u,6,1,2>: Cost 2 vmrghw LHS, <6,2,7,3>
+  2263208498U,	// <u,6,1,3>: Cost 3 vmrghw LHS, <6,3,4,5>
+  2693620843U,	// <u,6,1,4>: Cost 3 vsldoi8 <1,4,u,6>, <1,4,u,6>
+  2652868860U,	// <u,6,1,5>: Cost 3 vsldoi4 <5,u,6,1>, <5,u,6,1>
+  1189466936U,	// <u,6,1,6>: Cost 2 vmrghw LHS, <6,6,6,6>
+  1213869366U,	// <u,6,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  1213869367U,	// <u,6,1,u>: Cost 2 vmrglw <0,u,u,1>, RHS
+  2658844774U,	// <u,6,2,0>: Cost 3 vsldoi4 <6,u,6,2>, LHS
+  3771344465U,	// <u,6,2,1>: Cost 4 vsldoi8 <2,1,u,6>, <2,1,u,6>
+  1178554874U,	// <u,6,2,2>: Cost 2 vmrghw <6,2,7,3>, <6,2,7,3>
+  2698929907U,	// <u,6,2,3>: Cost 3 vsldoi8 <2,3,u,6>, <2,3,u,6>
+  2699593540U,	// <u,6,2,4>: Cost 3 vsldoi8 <2,4,u,6>, <2,4,u,6>
+  2700257173U,	// <u,6,2,5>: Cost 3 vsldoi8 <2,5,u,6>, <2,5,u,6>
+  2705565626U,	// <u,6,2,6>: Cost 3 vsldoi8 <3,4,u,6>, <2,6,3,7>
+  1226485046U,	// <u,6,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  1226485047U,	// <u,6,2,u>: Cost 2 vmrglw <3,0,u,2>, RHS
+  2705565846U,	// <u,6,3,0>: Cost 3 vsldoi8 <3,4,u,6>, <3,0,1,2>
+  2330756585U,	// <u,6,3,1>: Cost 3 vmrglw LHS, <2,0,6,1>
+  2330756829U,	// <u,6,3,2>: Cost 3 vmrglw LHS, <2,3,6,2>
+  2282981734U,	// <u,6,3,3>: Cost 3 vmrglw LHS, <3,2,6,3>
+  1631824413U,	// <u,6,3,4>: Cost 2 vsldoi8 <3,4,u,6>, <3,4,u,6>
+  2652885246U,	// <u,6,3,5>: Cost 3 vsldoi4 <5,u,6,3>, <5,u,6,3>
+  1257018168U,	// <u,6,3,6>: Cost 2 vmrglw LHS, <6,6,6,6>
+  135499062U,	// <u,6,3,7>: Cost 1 vmrglw LHS, RHS
+  135499063U,	// <u,6,3,u>: Cost 1 vmrglw LHS, RHS
+  2646917222U,	// <u,6,4,0>: Cost 3 vsldoi4 <4,u,6,4>, LHS
+  2217365931U,	// <u,6,4,1>: Cost 3 vmrghw <0,4,1,5>, <6,1,7,5>
+  2790167156U,	// <u,6,4,2>: Cost 3 vsldoi12 <6,4,2,u>, <6,4,2,u>
+  2646919709U,	// <u,6,4,3>: Cost 3 vsldoi4 <4,u,6,4>, <3,4,u,6>
+  2711538934U,	// <u,6,4,4>: Cost 3 vsldoi8 <4,4,u,6>, <4,4,u,6>
+  1631825206U,	// <u,6,4,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  2316170040U,	// <u,6,4,6>: Cost 3 vmrglw <5,6,u,4>, <6,6,6,6>
+  1215884598U,	// <u,6,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  1215884599U,	// <u,6,4,u>: Cost 2 vmrglw <1,2,u,4>, RHS
+  2634981478U,	// <u,6,5,0>: Cost 3 vsldoi4 <2,u,6,5>, LHS
+  2266190247U,	// <u,6,5,1>: Cost 3 vmrghw RHS, <6,1,7,1>
+  1192448506U,	// <u,6,5,2>: Cost 2 vmrghw RHS, <6,2,7,3>
+  2266190386U,	// <u,6,5,3>: Cost 3 vmrghw RHS, <6,3,4,5>
+  2634984758U,	// <u,6,5,4>: Cost 3 vsldoi4 <2,u,6,5>, RHS
+  2652901632U,	// <u,6,5,5>: Cost 3 vsldoi4 <5,u,6,5>, <5,u,6,5>
+  1192448824U,	// <u,6,5,6>: Cost 2 vmrghw RHS, <6,6,6,6>
+  1213902134U,	// <u,6,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1213902135U,	// <u,6,5,u>: Cost 2 vmrglw <0,u,u,5>, RHS
+  1583808614U,	// <u,6,6,0>: Cost 2 vsldoi4 <6,6,6,6>, LHS
+  2322010445U,	// <u,6,6,1>: Cost 3 vmrglw <6,6,6,6>, <6,0,6,1>
+  2718839290U,	// <u,6,6,2>: Cost 3 vsldoi8 <5,6,u,6>, <6,2,7,3>
+  2670823965U,	// <u,6,6,3>: Cost 3 vsldoi4 <u,u,6,6>, <3,4,u,6>
+  1583811894U,	// <u,6,6,4>: Cost 2 vsldoi4 <6,6,6,6>, RHS
+  2724147961U,	// <u,6,6,5>: Cost 3 vsldoi8 <6,5,u,6>, <6,5,u,6>
+  363253046U,	// <u,6,6,6>: Cost 1 vspltisw2 RHS
+  1229172022U,	// <u,6,6,7>: Cost 2 vmrglw <3,4,u,6>, RHS
+  363253046U,	// <u,6,6,u>: Cost 1 vspltisw2 RHS
+  499458150U,	// <u,6,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1573200692U,	// <u,6,7,1>: Cost 2 vsldoi4 RHS, <1,1,1,1>
+  1573201512U,	// <u,6,7,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573202070U,	// <u,6,7,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499461673U,	// <u,6,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1573203972U,	// <u,6,7,5>: Cost 2 vsldoi4 RHS, <5,5,5,5>
+  1235817272U,	// <u,6,7,6>: Cost 2 vmrglw RHS, <6,6,6,6>
+  162073910U,	// <u,6,7,7>: Cost 1 vmrglw RHS, RHS
+  162073911U,	// <u,6,7,u>: Cost 1 vmrglw RHS, RHS
+  499466342U,	// <u,6,u,0>: Cost 1 vsldoi4 RHS, LHS
+  1631827758U,	// <u,6,u,1>: Cost 2 vsldoi8 <3,4,u,6>, LHS
+  1573209704U,	// <u,6,u,2>: Cost 2 vsldoi4 RHS, <2,2,2,2>
+  1573210262U,	// <u,6,u,3>: Cost 2 vsldoi4 RHS, <3,0,1,2>
+  499469866U,	// <u,6,u,4>: Cost 1 vsldoi4 RHS, RHS
+  1631828122U,	// <u,6,u,5>: Cost 2 vsldoi8 <3,4,u,6>, RHS
+  363253046U,	// <u,6,u,6>: Cost 1 vspltisw2 RHS
+  135540022U,	// <u,6,u,7>: Cost 1 vmrglw LHS, RHS
+  135540023U,	// <u,6,u,u>: Cost 1 vmrglw LHS, RHS
+  1638465536U,	// <u,7,0,0>: Cost 2 vsldoi8 RHS, <0,0,0,0>
+  564723814U,	// <u,7,0,1>: Cost 1 vsldoi8 RHS, LHS
+  2712207533U,	// <u,7,0,2>: Cost 3 vsldoi8 RHS, <0,2,1,2>
+  2712207612U,	// <u,7,0,3>: Cost 3 vsldoi8 RHS, <0,3,1,0>
+  1638465874U,	// <u,7,0,4>: Cost 2 vsldoi8 RHS, <0,4,1,5>
+  1579192580U,	// <u,7,0,5>: Cost 2 vsldoi4 <5,u,7,0>, <5,u,7,0>
+  2712207862U,	// <u,7,0,6>: Cost 3 vsldoi8 RHS, <0,6,1,7>
+  2316137282U,	// <u,7,0,7>: Cost 3 vmrglw <5,6,u,0>, <6,6,7,7>
+  564724381U,	// <u,7,0,u>: Cost 1 vsldoi8 RHS, LHS
+  1189467130U,	// <u,7,1,0>: Cost 2 vmrghw LHS, <7,0,1,2>
+  1638466356U,	// <u,7,1,1>: Cost 2 vsldoi8 RHS, <1,1,1,1>
+  1638466454U,	// <u,7,1,2>: Cost 2 vsldoi8 RHS, <1,2,3,0>
+  2311500282U,	// <u,7,1,3>: Cost 3 vmrglw <4,u,u,1>, <6,2,7,3>
+  1189467494U,	// <u,7,1,4>: Cost 2 vmrghw LHS, <7,4,5,6>
+  2712208495U,	// <u,7,1,5>: Cost 3 vsldoi8 RHS, <1,5,0,1>
+  2694956302U,	// <u,7,1,6>: Cost 3 vsldoi8 <1,6,u,7>, <1,6,u,7>
+  1189467756U,	// <u,7,1,7>: Cost 2 vmrghw LHS, <7,7,7,7>
+  1638466940U,	// <u,7,1,u>: Cost 2 vsldoi8 RHS, <1,u,3,0>
+  2712208829U,	// <u,7,2,0>: Cost 3 vsldoi8 RHS, <2,0,1,2>
+  2712208927U,	// <u,7,2,1>: Cost 3 vsldoi8 RHS, <2,1,3,1>
+  1638467176U,	// <u,7,2,2>: Cost 2 vsldoi8 RHS, <2,2,2,2>
+  1638467238U,	// <u,7,2,3>: Cost 2 vsldoi8 RHS, <2,3,0,1>
+  2712209165U,	// <u,7,2,4>: Cost 3 vsldoi8 RHS, <2,4,2,5>
+  2712209256U,	// <u,7,2,5>: Cost 3 vsldoi8 RHS, <2,5,3,6>
+  1627187175U,	// <u,7,2,6>: Cost 2 vsldoi8 <2,6,u,7>, <2,6,u,7>
+  2324116290U,	// <u,7,2,7>: Cost 3 vmrglw <7,0,u,2>, <6,6,7,7>
+  1628514441U,	// <u,7,2,u>: Cost 2 vsldoi8 <2,u,u,7>, <2,u,u,7>
+  1638467734U,	// <u,7,3,0>: Cost 2 vsldoi8 RHS, <3,0,1,2>
+  2712209638U,	// <u,7,3,1>: Cost 3 vsldoi8 RHS, <3,1,1,1>
+  2700929387U,	// <u,7,3,2>: Cost 3 vsldoi8 <2,6,u,7>, <3,2,6,u>
+  1638467996U,	// <u,7,3,3>: Cost 2 vsldoi8 RHS, <3,3,3,3>
+  1638468098U,	// <u,7,3,4>: Cost 2 vsldoi8 RHS, <3,4,5,6>
+  2712210002U,	// <u,7,3,5>: Cost 3 vsldoi8 RHS, <3,5,5,5>
+  1585189856U,	// <u,7,3,6>: Cost 2 vsldoi4 <6,u,7,3>, <6,u,7,3>
+  1257018178U,	// <u,7,3,7>: Cost 2 vmrglw LHS, <6,6,7,7>
+  1638468382U,	// <u,7,3,u>: Cost 2 vsldoi8 RHS, <3,u,1,2>
+  1638468498U,	// <u,7,4,0>: Cost 2 vsldoi8 RHS, <4,0,5,1>
+  2712210378U,	// <u,7,4,1>: Cost 3 vsldoi8 RHS, <4,1,2,3>
+  2712210485U,	// <u,7,4,2>: Cost 3 vsldoi8 RHS, <4,2,5,2>
+  2712210564U,	// <u,7,4,3>: Cost 3 vsldoi8 RHS, <4,3,5,0>
+  1638468816U,	// <u,7,4,4>: Cost 2 vsldoi8 RHS, <4,4,4,4>
+  564727112U,	// <u,7,4,5>: Cost 1 vsldoi8 RHS, RHS
+  2712210809U,	// <u,7,4,6>: Cost 3 vsldoi8 RHS, <4,6,5,2>
+  2712210888U,	// <u,7,4,7>: Cost 3 vsldoi8 RHS, <4,7,5,0>
+  564727337U,	// <u,7,4,u>: Cost 1 vsldoi8 RHS, RHS
+  1192449018U,	// <u,7,5,0>: Cost 2 vmrghw RHS, <7,0,1,2>
+  2714201743U,	// <u,7,5,1>: Cost 3 vsldoi8 RHS, <5,1,0,1>
+  2712211198U,	// <u,7,5,2>: Cost 3 vsldoi8 RHS, <5,2,3,4>
+  2311533050U,	// <u,7,5,3>: Cost 3 vmrglw <4,u,u,5>, <6,2,7,3>
+  1192449382U,	// <u,7,5,4>: Cost 2 vmrghw RHS, <7,4,5,6>
+  1638469636U,	// <u,7,5,5>: Cost 2 vsldoi8 RHS, <5,5,5,5>
+  1638469730U,	// <u,7,5,6>: Cost 2 vsldoi8 RHS, <5,6,7,0>
+  1192449644U,	// <u,7,5,7>: Cost 2 vmrghw RHS, <7,7,7,7>
+  1638469892U,	// <u,7,5,u>: Cost 2 vsldoi8 RHS, <5,u,7,0>
+  2712211745U,	// <u,7,6,0>: Cost 3 vsldoi8 RHS, <6,0,1,2>
+  2712211879U,	// <u,7,6,1>: Cost 3 vsldoi8 RHS, <6,1,7,1>
+  1638470138U,	// <u,7,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  2712212018U,	// <u,7,6,3>: Cost 3 vsldoi8 RHS, <6,3,4,5>
+  2712212109U,	// <u,7,6,4>: Cost 3 vsldoi8 RHS, <6,4,5,6>
+  2712212203U,	// <u,7,6,5>: Cost 3 vsldoi8 RHS, <6,5,7,1>
+  1638470456U,	// <u,7,6,6>: Cost 2 vsldoi8 RHS, <6,6,6,6>
+  1638470478U,	// <u,7,6,7>: Cost 2 vsldoi8 RHS, <6,7,0,1>
+  1638470559U,	// <u,7,6,u>: Cost 2 vsldoi8 RHS, <6,u,0,1>
+  1235816546U,	// <u,7,7,0>: Cost 2 vmrglw RHS, <5,6,7,0>
+  2309558371U,	// <u,7,7,1>: Cost 3 vmrglw RHS, <5,6,7,1>
+  2641045434U,	// <u,7,7,2>: Cost 3 vsldoi4 <3,u,7,7>, <2,6,3,7>
+  1235816954U,	// <u,7,7,3>: Cost 2 vmrglw RHS, <6,2,7,3>
+  1235816550U,	// <u,7,7,4>: Cost 2 vmrglw RHS, <5,6,7,4>
+  2309558375U,	// <u,7,7,5>: Cost 3 vmrglw RHS, <5,6,7,5>
+  1585222628U,	// <u,7,7,6>: Cost 2 vsldoi4 <6,u,7,7>, <6,u,7,7>
+  430361910U,	// <u,7,7,7>: Cost 1 vspltisw3 RHS
+  430361910U,	// <u,7,7,u>: Cost 1 vspltisw3 RHS
+  1638471379U,	// <u,7,u,0>: Cost 2 vsldoi8 RHS, <u,0,1,2>
+  564729646U,	// <u,7,u,1>: Cost 1 vsldoi8 RHS, LHS
+  1638471557U,	// <u,7,u,2>: Cost 2 vsldoi8 RHS, <u,2,3,0>
+  1638471612U,	// <u,7,u,3>: Cost 2 vsldoi8 RHS, <u,3,0,1>
+  1638471743U,	// <u,7,u,4>: Cost 2 vsldoi8 RHS, <u,4,5,6>
+  564730010U,	// <u,7,u,5>: Cost 1 vsldoi8 RHS, RHS
+  1638471888U,	// <u,7,u,6>: Cost 2 vsldoi8 RHS, <u,6,3,7>
+  430361910U,	// <u,7,u,7>: Cost 1 vspltisw3 RHS
+  564730213U,	// <u,7,u,u>: Cost 1 vsldoi8 RHS, LHS
+  202162278U,	// <u,u,0,0>: Cost 1 vspltisw0 LHS
+  538189985U,	// <u,u,0,1>: Cost 1 vsldoi8 LHS, LHS
+  2685673645U,	// <u,u,0,2>: Cost 3 vsldoi8 LHS, <0,2,1,2>
+  1215848604U,	// <u,u,0,3>: Cost 2 vmrglw <1,2,u,0>, LHS
+  1611931986U,	// <u,u,0,4>: Cost 2 vsldoi8 LHS, <0,4,1,5>
+  1579266317U,	// <u,u,0,5>: Cost 2 vsldoi4 <5,u,u,0>, <5,u,u,0>
+  2289592861U,	// <u,u,0,6>: Cost 3 vmrglw <1,2,u,0>, <3,4,u,6>
+  1215851848U,	// <u,u,0,7>: Cost 2 vmrglw <1,2,u,0>, RHS
+  538190493U,	// <u,u,0,u>: Cost 1 vsldoi8 LHS, LHS
+  1549411025U,	// <u,u,1,0>: Cost 2 vsldoi4 <0,u,u,1>, <0,u,u,1>
+  115726126U,	// <u,u,1,1>: Cost 1 vmrghw LHS, LHS
+  604862254U,	// <u,u,1,2>: Cost 1 vsldoi12 LHS, LHS
+  1213866140U,	// <u,u,1,3>: Cost 2 vmrglw <0,u,u,1>, LHS
+  1549413686U,	// <u,u,1,4>: Cost 2 vsldoi4 <0,u,u,1>, RHS
+  115726490U,	// <u,u,1,5>: Cost 1 vmrghw LHS, RHS
+  1585247207U,	// <u,u,1,6>: Cost 2 vsldoi4 <6,u,u,1>, <6,u,u,1>
+  1213869384U,	// <u,u,1,7>: Cost 2 vmrglw <0,u,u,1>, RHS
+  604862308U,	// <u,u,1,u>: Cost 1 vsldoi12 LHS, LHS
+  1567334502U,	// <u,u,2,0>: Cost 2 vsldoi4 <3,u,u,2>, LHS
+  1190180654U,	// <u,u,2,1>: Cost 2 vmrghw <u,2,3,0>, LHS
+  336380006U,	// <u,u,2,2>: Cost 1 vspltisw2 LHS
+  835584U,	// <u,u,2,3>: Cost 0 copy LHS
+  1567337782U,	// <u,u,2,4>: Cost 2 vsldoi4 <3,u,u,2>, RHS
+  1190181018U,	// <u,u,2,5>: Cost 2 vmrghw <u,2,3,0>, RHS
+  1611933626U,	// <u,u,2,6>: Cost 2 vsldoi8 LHS, <2,6,3,7>
+  1226485064U,	// <u,u,2,7>: Cost 2 vmrglw <3,0,u,2>, RHS
+  835584U,	// <u,u,2,u>: Cost 0 copy LHS
+  475685587U,	// <u,u,3,0>: Cost 1 vsldoi4 LHS, LHS
+  1209239278U,	// <u,u,3,1>: Cost 2 vmrglw LHS, <2,3,u,1>
+  1209239765U,	// <u,u,3,2>: Cost 2 vmrglw LHS, <3,0,u,2>
+  135495836U,	// <u,u,3,3>: Cost 1 vmrglw LHS, LHS
+  475688246U,	// <u,u,3,4>: Cost 1 vsldoi4 LHS, RHS
+  1209239282U,	// <u,u,3,5>: Cost 2 vmrglw LHS, <2,3,u,5>
+  1209240093U,	// <u,u,3,6>: Cost 2 vmrglw LHS, <3,4,u,6>
+  135499080U,	// <u,u,3,7>: Cost 1 vmrglw LHS, RHS
+  135495841U,	// <u,u,3,u>: Cost 1 vmrglw LHS, LHS
+  1555406950U,	// <u,u,4,0>: Cost 2 vsldoi4 <1,u,u,4>, LHS
+  1555408301U,	// <u,u,4,1>: Cost 2 vsldoi4 <1,u,u,4>, <1,u,u,4>
+  2289625301U,	// <u,u,4,2>: Cost 3 vmrglw <1,2,u,4>, <3,0,u,2>
+  1215881372U,	// <u,u,4,3>: Cost 2 vmrglw <1,2,u,4>, LHS
+  229035318U,	// <u,u,4,4>: Cost 1 vspltisw0 RHS
+  538193206U,	// <u,u,4,5>: Cost 1 vsldoi8 LHS, RHS
+  2289625629U,	// <u,u,4,6>: Cost 3 vmrglw <1,2,u,4>, <3,4,u,6>
+  1215884616U,	// <u,u,4,7>: Cost 2 vmrglw <1,2,u,4>, RHS
+  538193449U,	// <u,u,4,u>: Cost 1 vsldoi8 LHS, RHS
+  1549443797U,	// <u,u,5,0>: Cost 2 vsldoi4 <0,u,u,5>, <0,u,u,5>
+  118708014U,	// <u,u,5,1>: Cost 1 vmrghw RHS, LHS
+  1561389191U,	// <u,u,5,2>: Cost 2 vsldoi4 <2,u,u,5>, <2,u,u,5>
+  1213898908U,	// <u,u,5,3>: Cost 2 vmrglw <0,u,u,5>, LHS
+  1549446454U,	// <u,u,5,4>: Cost 2 vsldoi4 <0,u,u,5>, RHS
+  118708378U,	// <u,u,5,5>: Cost 1 vmrghw RHS, RHS
+  604862618U,	// <u,u,5,6>: Cost 1 vsldoi12 LHS, RHS
+  1213902152U,	// <u,u,5,7>: Cost 2 vmrglw <0,u,u,5>, RHS
+  604862636U,	// <u,u,5,u>: Cost 1 vsldoi12 LHS, RHS
+  1567367270U,	// <u,u,6,0>: Cost 2 vsldoi4 <3,u,u,6>, LHS
+  1192892206U,	// <u,u,6,1>: Cost 2 vmrghw <u,6,3,7>, LHS
+  1638478330U,	// <u,u,6,2>: Cost 2 vsldoi8 RHS, <6,2,7,3>
+  1679046864U,	// <u,u,6,3>: Cost 2 vsldoi12 LHS, <u,6,3,7>
+  1567370550U,	// <u,u,6,4>: Cost 2 vsldoi4 <3,u,u,6>, RHS
+  1192892570U,	// <u,u,6,5>: Cost 2 vmrghw <u,6,3,7>, RHS
+  363253046U,	// <u,u,6,6>: Cost 1 vspltisw2 RHS
+  27705344U,	// <u,u,6,7>: Cost 0 copy RHS
+  27705344U,	// <u,u,6,u>: Cost 0 copy RHS
+  499605606U,	// <u,u,7,0>: Cost 1 vsldoi4 RHS, LHS
+  1235812425U,	// <u,u,7,1>: Cost 2 vmrglw RHS, <0,0,u,1>
+  1561405577U,	// <u,u,7,2>: Cost 2 vsldoi4 <2,u,u,7>, <2,u,u,7>
+  162070684U,	// <u,u,7,3>: Cost 1 vmrglw RHS, LHS
+  499609147U,	// <u,u,7,4>: Cost 1 vsldoi4 RHS, RHS
+  1235812753U,	// <u,u,7,5>: Cost 2 vmrglw RHS, <0,4,u,5>
+  1235814941U,	// <u,u,7,6>: Cost 2 vmrglw RHS, <3,4,u,6>
+  162073928U,	// <u,u,7,7>: Cost 1 vmrglw RHS, RHS
+  162070689U,	// <u,u,7,u>: Cost 1 vmrglw RHS, LHS
+  475726552U,	// <u,u,u,0>: Cost 1 vsldoi4 LHS, LHS
+  538195758U,	// <u,u,u,1>: Cost 1 vsldoi8 LHS, LHS
+  604862821U,	// <u,u,u,2>: Cost 1 vsldoi12 LHS, LHS
+  835584U,	// <u,u,u,3>: Cost 0 copy LHS
+  475729206U,	// <u,u,u,4>: Cost 1 vsldoi4 LHS, RHS
+  538196122U,	// <u,u,u,5>: Cost 1 vsldoi8 LHS, RHS
+  604862861U,	// <u,u,u,6>: Cost 1 vsldoi12 LHS, RHS
+  27705344U,	// <u,u,u,7>: Cost 0 copy RHS
+  835584U,	// <u,u,u,u>: Cost 0 copy LHS
+  0
+};
diff --git a/lib/Target/PowerPC/PPCPredicates.cpp b/lib/Target/PowerPC/PPCPredicates.cpp
new file mode 100644
index 0000000..12bb0a1
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPredicates.cpp
@@ -0,0 +1,31 @@
+//===-- PPCPredicates.cpp - PPC Branch Predicate Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCPredicates.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+using namespace llvm;
+
+PPC::Predicate PPC::InvertPredicate(PPC::Predicate Opcode) {
+  switch (Opcode) {
+  default: llvm_unreachable("Unknown PPC branch opcode!");
+  case PPC::PRED_EQ: return PPC::PRED_NE;
+  case PPC::PRED_NE: return PPC::PRED_EQ;
+  case PPC::PRED_LT: return PPC::PRED_GE;
+  case PPC::PRED_GE: return PPC::PRED_LT;
+  case PPC::PRED_GT: return PPC::PRED_LE;
+  case PPC::PRED_LE: return PPC::PRED_GT;
+  case PPC::PRED_NU: return PPC::PRED_UN;
+  case PPC::PRED_UN: return PPC::PRED_NU;
+  }
+}
diff --git a/lib/Target/PowerPC/PPCPredicates.h b/lib/Target/PowerPC/PPCPredicates.h
new file mode 100644
index 0000000..b2c8315
--- /dev/null
+++ b/lib/Target/PowerPC/PPCPredicates.h
@@ -0,0 +1,39 @@
+//===-- PPCPredicates.h - PPC Branch Predicate Information ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the PowerPC branch predicates.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_POWERPC_PPCPREDICATES_H
+#define LLVM_TARGET_POWERPC_PPCPREDICATES_H
+
+#include "PPC.h"
+
+namespace llvm {
+namespace PPC {
+  /// Predicate - These are "(BI << 5) | BO"  for various predicates.
+  enum Predicate {
+    PRED_ALWAYS = (0 << 5) | 20,
+    PRED_LT     = (0 << 5) | 12,
+    PRED_LE     = (1 << 5) |  4,
+    PRED_EQ     = (2 << 5) | 12,
+    PRED_GE     = (0 << 5) |  4,
+    PRED_GT     = (1 << 5) | 12,
+    PRED_NE     = (2 << 5) |  4,
+    PRED_UN     = (3 << 5) | 12,
+    PRED_NU     = (3 << 5) |  4
+  };
+  
+  /// Invert the specified predicate.  != -> ==, < -> >=.
+  Predicate InvertPredicate(Predicate Opcode);
+}
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
new file mode 100644
index 0000000..20e77e7
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -0,0 +1,1745 @@
+//===- PPCRegisterInfo.cpp - PowerPC Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reginfo"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCMachineFunctionInfo.h"
+#include "PPCRegisterInfo.h"
+#include "PPCFrameInfo.h"
+#include "PPCSubtarget.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include <cstdlib>
+using namespace llvm;
+
+// FIXME This disables some code that aligns the stack to a boundary
+// bigger than the default (16 bytes on Darwin) when there is a stack local
+// of greater alignment.  This does not currently work, because the delta
+// between old and new stack pointers is added to offsets that reference
+// incoming parameters after the prolog is generated, and the code that 
+// does that doesn't handle a variable delta.  You don't want to do that
+// anyway; a better approach is to reserve another register that retains
+// the incoming stack pointer, and reference parameters relative to that.
+#define ALIGN_STACK 0
+
+// Command-line gates for the (still experimental) PPC register scavenger.
+// NOTE(review): these globals have external linkage; consider making them
+// static (file-local).
+// FIXME (64-bit): Eventually enable by default.
+cl::opt<bool> EnablePPC32RS("enable-ppc32-regscavenger",
+                            cl::init(false),
+                            cl::desc("Enable PPC32 register scavenger"),
+                            cl::Hidden);
+cl::opt<bool> EnablePPC64RS("enable-ppc64-regscavenger",
+                            cl::init(false),
+                            cl::desc("Enable PPC64 register scavenger"),
+                            cl::Hidden);
+// True when scavenging is enabled for the current subtarget's word size.
+// Expects a `Subtarget` member to be in scope at the expansion site.
+#define EnableRegisterScavenging \
+  ((EnablePPC32RS && !Subtarget.isPPC64()) || \
+   (EnablePPC64RS && Subtarget.isPPC64()))
+
+// FIXME (64-bit): Should be inlined.
+bool
+PPCRegisterInfo::requiresRegisterScavenging(const MachineFunction &) const {
+  // Scavenging is opt-in via -enable-ppc32/64-regscavenger until it is
+  // robust enough to become the default.
+  return Subtarget.isPPC64() ? EnablePPC64RS : EnablePPC32RS;
+}
+
+/// getRegisterNumbering - Given the enum value for some register, e.g.
+/// PPC::F14, return the number that it corresponds to (e.g. 14).
+unsigned PPCRegisterInfo::getRegisterNumbering(unsigned RegEnum) {
+  using namespace PPC;
+  // Each row groups the GPR, 64-bit GPR, FPR, vector and CR-field registers
+  // that share one hardware register number.  The CR *bit* registers are
+  // numbered by bit position instead: CR0LT..CR0UN map to 0..3, CR1LT..CR1UN
+  // to 4..7, and so on up to CR7UN = 31.
+  switch (RegEnum) {
+  case 0: return 0; // The "no register" sentinel.
+  case R0 :  case X0 :  case F0 :  case V0 : case CR0:  case CR0LT: return  0;
+  case R1 :  case X1 :  case F1 :  case V1 : case CR1:  case CR0GT: return  1;
+  case R2 :  case X2 :  case F2 :  case V2 : case CR2:  case CR0EQ: return  2;
+  case R3 :  case X3 :  case F3 :  case V3 : case CR3:  case CR0UN: return  3;
+  case R4 :  case X4 :  case F4 :  case V4 : case CR4:  case CR1LT: return  4;
+  case R5 :  case X5 :  case F5 :  case V5 : case CR5:  case CR1GT: return  5;
+  case R6 :  case X6 :  case F6 :  case V6 : case CR6:  case CR1EQ: return  6;
+  case R7 :  case X7 :  case F7 :  case V7 : case CR7:  case CR1UN: return  7;
+  case R8 :  case X8 :  case F8 :  case V8 : case CR2LT: return  8;
+  case R9 :  case X9 :  case F9 :  case V9 : case CR2GT: return  9;
+  case R10:  case X10:  case F10:  case V10: case CR2EQ: return 10;
+  case R11:  case X11:  case F11:  case V11: case CR2UN: return 11;
+  case R12:  case X12:  case F12:  case V12: case CR3LT: return 12;
+  case R13:  case X13:  case F13:  case V13: case CR3GT: return 13;
+  case R14:  case X14:  case F14:  case V14: case CR3EQ: return 14;
+  case R15:  case X15:  case F15:  case V15: case CR3UN: return 15;
+  case R16:  case X16:  case F16:  case V16: case CR4LT: return 16;
+  case R17:  case X17:  case F17:  case V17: case CR4GT: return 17;
+  case R18:  case X18:  case F18:  case V18: case CR4EQ: return 18;
+  case R19:  case X19:  case F19:  case V19: case CR4UN: return 19;
+  case R20:  case X20:  case F20:  case V20: case CR5LT: return 20;
+  case R21:  case X21:  case F21:  case V21: case CR5GT: return 21;
+  case R22:  case X22:  case F22:  case V22: case CR5EQ: return 22;
+  case R23:  case X23:  case F23:  case V23: case CR5UN: return 23;
+  case R24:  case X24:  case F24:  case V24: case CR6LT: return 24;
+  case R25:  case X25:  case F25:  case V25: case CR6GT: return 25;
+  case R26:  case X26:  case F26:  case V26: case CR6EQ: return 26;
+  case R27:  case X27:  case F27:  case V27: case CR6UN: return 27;
+  case R28:  case X28:  case F28:  case V28: case CR7LT: return 28;
+  case R29:  case X29:  case F29:  case V29: case CR7GT: return 29;
+  case R30:  case X30:  case F30:  case V30: case CR7EQ: return 30;
+  case R31:  case X31:  case F31:  case V31: case CR7UN: return 31;
+  default:
+    llvm_unreachable("Unhandled reg in PPCRegisterInfo::getRegisterNumbering!");
+  }
+}
+
+PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
+                                 const TargetInstrInfo &tii)
+  : PPCGenRegisterInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+    Subtarget(ST), TII(tii) {
+  // ImmToIdxMap maps each immediate-offset (D/DS-form) memory or add opcode
+  // to its register-indexed (X-form) equivalent.  eliminateFrameIndex
+  // consults this table when a frame offset does not fit in the 16-bit
+  // displacement field and the offset must be materialized in a register.
+  ImmToIdxMap[PPC::LD]   = PPC::LDX;    ImmToIdxMap[PPC::STD]  = PPC::STDX;
+  ImmToIdxMap[PPC::LBZ]  = PPC::LBZX;   ImmToIdxMap[PPC::STB]  = PPC::STBX;
+  ImmToIdxMap[PPC::LHZ]  = PPC::LHZX;   ImmToIdxMap[PPC::LHA]  = PPC::LHAX;
+  ImmToIdxMap[PPC::LWZ]  = PPC::LWZX;   ImmToIdxMap[PPC::LWA]  = PPC::LWAX;
+  ImmToIdxMap[PPC::LFS]  = PPC::LFSX;   ImmToIdxMap[PPC::LFD]  = PPC::LFDX;
+  ImmToIdxMap[PPC::STH]  = PPC::STHX;   ImmToIdxMap[PPC::STW]  = PPC::STWX;
+  ImmToIdxMap[PPC::STFS] = PPC::STFSX;  ImmToIdxMap[PPC::STFD] = PPC::STFDX;
+  ImmToIdxMap[PPC::ADDI] = PPC::ADD4;
+
+  // 64-bit
+  ImmToIdxMap[PPC::LHA8] = PPC::LHAX8; ImmToIdxMap[PPC::LBZ8] = PPC::LBZX8;
+  ImmToIdxMap[PPC::LHZ8] = PPC::LHZX8; ImmToIdxMap[PPC::LWZ8] = PPC::LWZX8;
+  ImmToIdxMap[PPC::STB8] = PPC::STBX8; ImmToIdxMap[PPC::STH8] = PPC::STHX8;
+  ImmToIdxMap[PPC::STW8] = PPC::STWX8; ImmToIdxMap[PPC::STDU] = PPC::STDUX;
+  ImmToIdxMap[PPC::ADDI8] = PPC::ADD8; ImmToIdxMap[PPC::STD_32] = PPC::STDX_32;
+}
+
+/// getPointerRegClass - Return the register class to use to hold pointers.
+/// This is used for addressing modes.
+const TargetRegisterClass *
+PPCRegisterInfo::getPointerRegClass(unsigned Kind) const {
+  // Pointers live in 64-bit GPRs (G8RC) on PPC64 and in 32-bit GPRs (GPRC)
+  // otherwise; the addressing-mode Kind is not distinguished here.
+  return Subtarget.isPPC64() ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+}
+
+/// getCalleeSavedRegs - Return a null-terminated list of the callee-saved
+/// registers for the active ABI (Darwin vs. SVR4) and word size (32- vs.
+/// 64-bit).  The entry order must be kept in sync, entry for entry, with the
+/// parallel arrays in getCalleeSavedRegClasses below.
+const unsigned*
+PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  // 32-bit Darwin calling convention.
+  static const unsigned Darwin32_CalleeSavedRegs[] = {
+              PPC::R13, PPC::R14, PPC::R15,
+    PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+    PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+    PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+    PPC::R28, PPC::R29, PPC::R30, PPC::R31,
+
+    PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+    PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+    PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+    PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+    PPC::F30, PPC::F31,
+    
+    PPC::CR2, PPC::CR3, PPC::CR4,
+    PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+    PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+    PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+    
+    PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+    PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+    PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+    
+    PPC::LR,  0
+  };
+
+  // 32-bit SVR4 calling convention.
+  static const unsigned SVR4_CalleeSavedRegs[] = {
+                        PPC::R14, PPC::R15,
+    PPC::R16, PPC::R17, PPC::R18, PPC::R19,
+    PPC::R20, PPC::R21, PPC::R22, PPC::R23,
+    PPC::R24, PPC::R25, PPC::R26, PPC::R27,
+    PPC::R28, PPC::R29, PPC::R30, PPC::R31,
+
+    PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+    PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+    PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+    PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+    PPC::F30, PPC::F31,
+    
+    PPC::CR2, PPC::CR3, PPC::CR4,
+    
+    PPC::VRSAVE,
+    
+    PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+    PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+    PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+    
+    PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+    PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+    PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+    
+    0
+  };
+  // 64-bit Darwin calling convention.
+  static const unsigned Darwin64_CalleeSavedRegs[] = {
+    PPC::X14, PPC::X15,
+    PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+    PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+    PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+    PPC::X28, PPC::X29, PPC::X30, PPC::X31,
+    
+    PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+    PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+    PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+    PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+    PPC::F30, PPC::F31,
+    
+    PPC::CR2, PPC::CR3, PPC::CR4,
+    PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+    PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+    PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+    
+    PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+    PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+    PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+    
+    PPC::LR8,  0
+  };
+
+  // 64-bit SVR4 calling convention.
+  static const unsigned SVR4_64_CalleeSavedRegs[] = {
+    PPC::X14, PPC::X15,
+    PPC::X16, PPC::X17, PPC::X18, PPC::X19,
+    PPC::X20, PPC::X21, PPC::X22, PPC::X23,
+    PPC::X24, PPC::X25, PPC::X26, PPC::X27,
+    PPC::X28, PPC::X29, PPC::X30, PPC::X31,
+
+    PPC::F14, PPC::F15, PPC::F16, PPC::F17,
+    PPC::F18, PPC::F19, PPC::F20, PPC::F21,
+    PPC::F22, PPC::F23, PPC::F24, PPC::F25,
+    PPC::F26, PPC::F27, PPC::F28, PPC::F29,
+    PPC::F30, PPC::F31,
+
+    PPC::CR2, PPC::CR3, PPC::CR4,
+
+    PPC::VRSAVE,
+
+    PPC::V20, PPC::V21, PPC::V22, PPC::V23,
+    PPC::V24, PPC::V25, PPC::V26, PPC::V27,
+    PPC::V28, PPC::V29, PPC::V30, PPC::V31,
+
+    PPC::CR2LT, PPC::CR2GT, PPC::CR2EQ, PPC::CR2UN,
+    PPC::CR3LT, PPC::CR3GT, PPC::CR3EQ, PPC::CR3UN,
+    PPC::CR4LT, PPC::CR4GT, PPC::CR4EQ, PPC::CR4UN,
+
+    0
+  };
+  
+  if (Subtarget.isDarwinABI())
+    return Subtarget.isPPC64() ? Darwin64_CalleeSavedRegs :
+                                 Darwin32_CalleeSavedRegs;
+
+  return Subtarget.isPPC64() ? SVR4_64_CalleeSavedRegs : SVR4_CalleeSavedRegs;
+}
+
+/// getCalleeSavedRegClasses - Return register classes parallel, entry for
+/// entry, to the register lists returned by getCalleeSavedRegs above; the
+/// two functions' arrays must be kept in sync.
+const TargetRegisterClass* const*
+PPCRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  // 32-bit Darwin calling convention.
+  static const TargetRegisterClass * const Darwin32_CalleeSavedRegClasses[] = {
+                       &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    
+    &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+    
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    
+    &PPC::GPRCRegClass, 0
+  };
+  
+  // 32-bit SVR4 calling convention.
+  static const TargetRegisterClass * const SVR4_CalleeSavedRegClasses[] = {
+                                          &PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+    &PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,&PPC::GPRCRegClass,
+
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    
+    &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+    
+    &PPC::VRSAVERCRegClass,
+    
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    
+    0
+  };
+  
+  // 64-bit Darwin calling convention.
+  static const TargetRegisterClass * const Darwin64_CalleeSavedRegClasses[] = {
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    
+    &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+    
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass, 
+    
+    &PPC::G8RCRegClass, 0
+  };
+
+  // 64-bit SVR4 calling convention.
+  static const TargetRegisterClass * const SVR4_64_CalleeSavedRegClasses[] = {
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+    &PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,&PPC::G8RCRegClass,
+
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,&PPC::F8RCRegClass,
+    &PPC::F8RCRegClass,&PPC::F8RCRegClass,
+
+    &PPC::CRRCRegClass,&PPC::CRRCRegClass,&PPC::CRRCRegClass,
+
+    &PPC::VRSAVERCRegClass,
+
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+    &PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,&PPC::VRRCRegClass,
+
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,&PPC::CRBITRCRegClass,
+    &PPC::CRBITRCRegClass,
+
+    0
+  };
+  
+  if (Subtarget.isDarwinABI())
+    return Subtarget.isPPC64() ? Darwin64_CalleeSavedRegClasses :
+                                 Darwin32_CalleeSavedRegClasses;
+  
+  return Subtarget.isPPC64() ? SVR4_64_CalleeSavedRegClasses
+                             : SVR4_CalleeSavedRegClasses;
+}
+
+// needsFP - Return true if the specified function should have a dedicated frame
+// pointer register.  This is true if the function has variable sized allocas or
+// if frame pointer elimination is disabled.
+//
+static bool needsFP(const MachineFunction &MF) {
+  if (NoFramePointerElim)
+    return true;
+  if (MF.getFrameInfo()->hasVarSizedObjects())
+    return true;
+  // Guaranteed tail-call optimization may also force a frame pointer.
+  return GuaranteedTailCallOpt && MF.getInfo<PPCFunctionInfo>()->hasFastCall();
+}
+
+// spillsCR - True when this function had to spill the condition register.
+static bool spillsCR(const MachineFunction &MF) {
+  return MF.getInfo<PPCFunctionInfo>()->isCRSpilled();
+}
+
+BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(PPC::R0);  // Emergency scratch; addi/addis also read R0 as 0.
+  Reserved.set(PPC::R1);  // Stack pointer.
+  Reserved.set(PPC::LR);  // Link register (32-bit view).
+  Reserved.set(PPC::LR8); // Link register (64-bit view).
+  Reserved.set(PPC::RM);  // Floating-point rounding-mode register.
+
+  // The SVR4 ABI reserves r2 and r13
+  if (Subtarget.isSVR4ABI()) {
+    Reserved.set(PPC::R2);  // System-reserved register
+    Reserved.set(PPC::R13); // Small Data Area pointer register
+  }
+  
+  // On PPC64, r13 is the thread pointer. Never allocate this register.
+  // Note that this is over conservative, as it also prevents allocation of R31
+  // when the FP is not needed.
+  if (Subtarget.isPPC64()) {
+    Reserved.set(PPC::R13);
+    Reserved.set(PPC::R31);
+
+    if (!EnableRegisterScavenging)
+      Reserved.set(PPC::R0);    // FIXME (64-bit): Remove
+
+    Reserved.set(PPC::X0);
+    Reserved.set(PPC::X1);
+    Reserved.set(PPC::X13);
+    Reserved.set(PPC::X31);
+
+    // The 64-bit SVR4 ABI reserves r2 for the TOC pointer.
+    if (Subtarget.isSVR4ABI()) {
+      Reserved.set(PPC::X2);
+    }
+  }
+
+  // R31 doubles as the frame pointer when one is required.
+  if (needsFP(MF))
+    Reserved.set(PPC::R31);
+
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+// hasFP - Return true if the specified function actually has a dedicated frame
+// pointer register.  This is true if the function needs a frame pointer and has
+// a non-zero stack size.
+bool PPCRegisterInfo::hasFP(const MachineFunction &MF) const {
+  // A frame pointer is actually used only when one is needed AND the frame
+  // is non-empty.
+  return needsFP(MF) && MF.getFrameInfo()->getStackSize() != 0;
+}
+
+/// MustSaveLR - Return true if this function requires that we save the LR
+/// register onto the stack in the prolog and restore it in the epilog of the
+/// function.
+static bool MustSaveLR(const MachineFunction &MF, unsigned LR) {
+  // A save/restore of LR is needed if there is any def of LR (calls,
+  // including the PIC setup sequence, define it) or if the LR stack slot
+  // is used (e.g. for __builtin_return_address).  LR comes in 32- and
+  // 64-bit flavors; the caller passes the appropriate one.
+  if (MF.getInfo<PPCFunctionInfo>()->isLRStoreRequired())
+    return true;
+  return MF.getRegInfo().def_begin(LR) != MF.getRegInfo().def_end();
+}
+
+
+
+/// eliminateCallFramePseudoInstr - PPC reserves the maximum call frame in the
+/// prologue, so the ADJCALLSTACKDOWN/ADJCALLSTACKUP pseudos normally need no
+/// code and are simply deleted.  The exception is guaranteed tail-call
+/// optimization, where the amount the callee popped on return must be
+/// subtracted back off the stack pointer.
+void PPCRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (GuaranteedTailCallOpt && I->getOpcode() == PPC::ADJCALLSTACKUP) {
+    // Add (actually subtract) back the amount the callee popped on return.
+    if (int CalleeAmt =  I->getOperand(1).getImm()) {
+      bool is64Bit = Subtarget.isPPC64();
+      CalleeAmt *= -1;
+      unsigned StackReg = is64Bit ? PPC::X1 : PPC::R1;
+      unsigned TmpReg = is64Bit ? PPC::X0 : PPC::R0;
+      unsigned ADDIInstr = is64Bit ? PPC::ADDI8 : PPC::ADDI;
+      unsigned ADDInstr = is64Bit ? PPC::ADD8 : PPC::ADD4;
+      unsigned LISInstr = is64Bit ? PPC::LIS8 : PPC::LIS;
+      unsigned ORIInstr = is64Bit ? PPC::ORI8 : PPC::ORI;
+      MachineInstr *MI = I;
+      DebugLoc dl = MI->getDebugLoc();
+
+      if (isInt16(CalleeAmt)) {
+        // The amount fits in addi's 16-bit signed immediate field.
+        BuildMI(MBB, I, dl, TII.get(ADDIInstr), StackReg).addReg(StackReg).
+          addImm(CalleeAmt);
+      } else {
+        // Materialize the full 32-bit amount with lis/ori, then add it to SP.
+        MachineBasicBlock::iterator MBBI = I;
+        BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+          .addImm(CalleeAmt >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+          .addReg(TmpReg, RegState::Kill)
+          .addImm(CalleeAmt & 0xFFFF);
+        // NOTE(review): the first addReg(StackReg) below is the destination
+        // operand of the add; confirm whether it should carry
+        // RegState::Define (cf. the ADDI above, which names the destination
+        // explicitly in BuildMI).
+        BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
+          .addReg(StackReg)
+          .addReg(StackReg)
+          .addReg(TmpReg);
+      }
+    }
+  }
+  // Simply discard ADJCALLSTACKDOWN, ADJCALLSTACKUP instructions.
+  MBB.erase(I);
+}
+
+/// findScratchRegister - Find a 'free' PPC register. Try for a call-clobbered
+/// register first and then a spilled callee-saved register if that fails.
+/// findScratchRegister - Find a 'free' PPC register. Try for a call-clobbered
+/// register first and then a spilled callee-saved register if that fails.
+static
+unsigned findScratchRegister(MachineBasicBlock::iterator II, RegScavenger *RS,
+                             const TargetRegisterClass *RC, int SPAdj) {
+  assert(RS && "Register scavenging must be on");
+  // Prefer a register that is simply unused at this point; fall back to
+  // actual scavenging only when none is available.
+  // FIXME: move ARM callee-saved reg scan to target independent code, then
+  // search for already spilled CS register here.
+  if (unsigned Reg = RS->FindUnusedReg(RC))
+    return Reg;
+  return RS->scavengeRegister(RC, II, SPAdj);
+}
+
+/// lowerDynamicAlloc - Generate the code for allocating an object in the
+/// current frame.  The sequence of code with be in the general form
+///
+///   addi   R0, SP, \#frameSize ; get the address of the previous frame
+///   stwxu  R0, SP, Rnegsize   ; add and update the SP with the negated size
+///   addi   Rnew, SP, \#maxCalFrameSize ; get the top of the allocation
+///
+void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II,
+                                        int SPAdj, RegScavenger *RS) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Determine whether 64-bit pointers are used.
+  bool LP64 = Subtarget.isPPC64();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Get the maximum call stack size.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  // Get the total frame size.
+  unsigned FrameSize = MFI->getStackSize();
+  
+  // Get stack alignments.
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  assert(MaxAlign <= TargetAlign &&
+         "Dynamic alloca with large aligns not supported");
+
+  // Determine the previous frame's address.  If FrameSize can't be
+  // represented as 16 bits or we need special alignment, then we load the
+  // previous frame's address from 0(SP).  Why not do an addis of the hi? 
+  // Because R0 is our only safe tmp register and addi/addis treat R0 as zero. 
+  // Constructing the constant and adding would take 3 instructions. 
+  // Fortunately, a frame greater than 32K is rare.
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = LP64 ? G8RC : GPRC;
+
+  // FIXME (64-bit): Use "findScratchRegister"
+  unsigned Reg;
+  if (EnableRegisterScavenging)
+    Reg = findScratchRegister(II, RS, RC, SPAdj);
+  else
+    Reg = PPC::R0;
+  
+  // Dynamic allocas force a frame pointer (see needsFP), so R31 holds the
+  // post-prologue SP value here and R31 + FrameSize is the previous frame.
+  // NOTE(review): confirm the use of R31 (rather than R1) is intentional on
+  // this small-frame path.
+  if (MaxAlign < TargetAlign && isInt16(FrameSize)) {
+    BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
+      .addReg(PPC::R31)
+      .addImm(FrameSize);
+  } else if (LP64) {
+    if (EnableRegisterScavenging) // FIXME (64-bit): Use "true" part.
+      BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
+        .addImm(0)
+        .addReg(PPC::X1);
+    else
+      BuildMI(MBB, II, dl, TII.get(PPC::LD), PPC::X0)
+        .addImm(0)
+        .addReg(PPC::X1);
+  } else {
+    BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
+      .addImm(0)
+      .addReg(PPC::R1);
+  }
+  
+  // Grow the stack and update the stack pointer link, then determine the
+  // address of new allocated space.
+  if (LP64) {
+    if (EnableRegisterScavenging) // FIXME (64-bit): Use "true" part.
+      BuildMI(MBB, II, dl, TII.get(PPC::STDUX))
+        .addReg(Reg, RegState::Kill)
+        .addReg(PPC::X1)
+        .addReg(MI.getOperand(1).getReg())
+    else
+      BuildMI(MBB, II, dl, TII.get(PPC::STDUX))
+        .addReg(PPC::X0, RegState::Kill)
+        .addReg(PPC::X1)
+        .addReg(MI.getOperand(1).getReg());
+
+    if (!MI.getOperand(1).isKill())
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+        .addReg(PPC::X1)
+        .addImm(maxCallFrameSize);
+    else
+      // Implicitly kill the register.
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+        .addReg(PPC::X1)
+        .addImm(maxCallFrameSize)
+        .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+  } else {
+    BuildMI(MBB, II, dl, TII.get(PPC::STWUX))
+      .addReg(Reg, RegState::Kill)
+      .addReg(PPC::R1)
+      .addReg(MI.getOperand(1).getReg());
+
+    if (!MI.getOperand(1).isKill())
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+        .addReg(PPC::R1)
+        .addImm(maxCallFrameSize);
+    else
+      // Implicitly kill the register.
+      BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+        .addReg(PPC::R1)
+        .addImm(maxCallFrameSize)
+        .addReg(MI.getOperand(1).getReg(), RegState::ImplicitKill);
+  }
+  
+  // Discard the DYNALLOC instruction.
+  MBB.erase(II);
+}
+
+/// lowerCRSpilling - Generate the code for spilling a CR register. Instead of
+/// reserving a whole register (R0), we scrounge for one here. This generates
+/// code like this:
+///
+///   mfcr rA                  ; Move the conditional register into GPR rA.
+///   rlwinm rA, rA, SB, 0, 31 ; Shift the bits left so they are in CR0's slot.
+///   stw rA, FI               ; Store rA to the frame.
+///
+void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
+                                      unsigned FrameIndex, int SPAdj,
+                                      RegScavenger *RS) const {
+  // Get the instruction.
+  MachineInstr &MI = *II;       // ; SPILL_CR <SrcReg>, <offset>, <FI>
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Scrounge a free GPR to hold the mfcr result.  This path is only reached
+  // with the register scavenger enabled (see the guard in
+  // eliminateFrameIndex), so RS is valid here.
+  const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+  const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+  const TargetRegisterClass *RC = Subtarget.isPPC64() ? G8RC : GPRC;
+  unsigned Reg = findScratchRegister(II, RS, RC, SPAdj);
+
+  // We need to store the CR in the low 4-bits of the saved value. First, issue
+  // an MFCR to save all of the CRBits. Add an implicit kill of the CR.
+  if (!MI.getOperand(0).isKill())
+    BuildMI(MBB, II, dl, TII.get(PPC::MFCR), Reg);
+  else
+    // Implicitly kill the CR register.
+    BuildMI(MBB, II, dl, TII.get(PPC::MFCR), Reg)
+      .addReg(MI.getOperand(0).getReg(), RegState::ImplicitKill);
+    
+  // If the saved register wasn't CR0, shift the bits left so that they are in
+  // CR0's slot.
+  unsigned SrcReg = MI.getOperand(0).getReg();
+  if (SrcReg != PPC::CR0)
+    // rlwinm rA, rA, ShiftBits, 0, 31.  Each CR field is 4 bits wide, so
+    // rotating left by (field number * 4) moves it into CR0's position.
+    BuildMI(MBB, II, dl, TII.get(PPC::RLWINM), Reg)
+      .addReg(Reg, RegState::Kill)
+      .addImm(PPCRegisterInfo::getRegisterNumbering(SrcReg) * 4)
+      .addImm(0)
+      .addImm(31);
+
+  // NOTE(review): operand 1's immediate is used here as the kill flag for the
+  // spilled value, although the pseudo's comment above labels it <offset> --
+  // confirm the operand layout.
+  addFrameReference(BuildMI(MBB, II, dl, TII.get(PPC::STW))
+                    .addReg(Reg, getKillRegState(MI.getOperand(1).getImm())),
+                    FrameIndex);
+
+  // Discard the pseudo instruction.
+  MBB.erase(II);
+}
+
+unsigned
+PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, int *Value,
+                                     RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  // Get the instruction.
+  MachineInstr &MI = *II;
+  // Get the instruction's basic block.
+  MachineBasicBlock &MBB = *MI.getParent();
+  // Get the basic block's function.
+  MachineFunction &MF = *MBB.getParent();
+  // Get the frame info.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  DebugLoc dl = MI.getDebugLoc();
+
+  // Find out which operand is the frame index.
+  unsigned FIOperandNo = 0;
+  while (!MI.getOperand(FIOperandNo).isFI()) {
+    ++FIOperandNo;
+    assert(FIOperandNo != MI.getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+  }
+  // Take into account whether it's an add or mem instruction
+  unsigned OffsetOperandNo = (FIOperandNo == 2) ? 1 : 2;
+  if (MI.isInlineAsm())
+    OffsetOperandNo = FIOperandNo-1;
+
+  // Get the frame index.
+  int FrameIndex = MI.getOperand(FIOperandNo).getIndex();
+
+  // Get the frame pointer save index.  Users of this index are primarily
+  // DYNALLOC instructions.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  int FPSI = FI->getFramePointerSaveIndex();
+  // Get the instruction opcode.
+  unsigned OpC = MI.getOpcode();
+  
+  // Special case for dynamic alloca.
+  if (FPSI && FrameIndex == FPSI &&
+      (OpC == PPC::DYNALLOC || OpC == PPC::DYNALLOC8)) {
+    lowerDynamicAlloc(II, SPAdj, RS);
+    return 0;
+  }
+
+  // Special case for pseudo-op SPILL_CR.
+  if (EnableRegisterScavenging) // FIXME (64-bit): Enable by default.
+    if (OpC == PPC::SPILL_CR) {
+      lowerCRSpilling(II, FrameIndex, SPAdj, RS);
+      return 0;
+    }
+
+  // Replace the FrameIndex with base register with GPR1 (SP) or GPR31 (FP).
+  MI.getOperand(FIOperandNo).ChangeToRegister(hasFP(MF) ? PPC::R31 : PPC::R1,
+                                              false);
+
+  // Figure out if the offset in the instruction is shifted right two bits. This
+  // is true for instructions like "STD", which the machine implicitly adds two
+  // low zeros to.
+  bool isIXAddr = false;
+  switch (OpC) {
+  case PPC::LWA:
+  case PPC::LD:
+  case PPC::STD:
+  case PPC::STD_32:
+    isIXAddr = true;
+    break;
+  }
+  
+  // Now add the frame object offset to the offset from r1.
+  int Offset = MFI->getObjectOffset(FrameIndex);
+  if (!isIXAddr)
+    Offset += MI.getOperand(OffsetOperandNo).getImm();
+  else
+    Offset += MI.getOperand(OffsetOperandNo).getImm() << 2;
+
+  // If we're not using a Frame Pointer that has been set to the value of the
+  // SP before having the stack size subtracted from it, then add the stack size
+  // to Offset to get the correct offset.
+  Offset += MFI->getStackSize();
+
+  // If we can, encode the offset directly into the instruction.  If this is a
+  // normal PPC "ri" instruction, any 16-bit value can be safely encoded.  If
+  // this is a PPC64 "ix" instruction, only a 16-bit value with the low two bits
+  // clear can be encoded.  This is extremely uncommon, because normally you
+  // only "std" to a stack slot that is at least 4-byte aligned, but it can
+  // happen in invalid code.
+  if (isInt16(Offset) && (!isIXAddr || (Offset & 3) == 0)) {
+    if (isIXAddr)
+      Offset >>= 2;    // The actual encoded value has the low two bits zero.
+    MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset);
+    return 0;
+  }
+
+  // The offset doesn't fit into a single register, scavenge one to build the
+  // offset in.
+  // FIXME: figure out what SPAdj is doing here.
+
+  // FIXME (64-bit): Use "findScratchRegister".
+  unsigned SReg;
+  if (EnableRegisterScavenging)
+    SReg = findScratchRegister(II, RS, &PPC::GPRCRegClass, SPAdj);
+  else
+    SReg = PPC::R0;
+
+  // Insert a set of rA with the full offset value before the ld, st, or add
+  BuildMI(MBB, II, dl, TII.get(PPC::LIS), SReg)
+    .addImm(Offset >> 16);
+  BuildMI(MBB, II, dl, TII.get(PPC::ORI), SReg)
+    .addReg(SReg, RegState::Kill)
+    .addImm(Offset);
+
+  // Convert into indexed form of the instruction:
+  // 
+  //   sth 0:rA, 1:imm 2:(rB) ==> sthx 0:rA, 2:rB, 1:r0
+  //   addi 0:rA 1:rB, 2, imm ==> add 0:rA, 1:rB, 2:r0
+  unsigned OperandBase;
+
+  if (OpC != TargetOpcode::INLINEASM) {
+    assert(ImmToIdxMap.count(OpC) &&
+           "No indexed form of load or store available!");
+    unsigned NewOpcode = ImmToIdxMap.find(OpC)->second;
+    MI.setDesc(TII.get(NewOpcode));
+    OperandBase = 1;
+  } else {
+    OperandBase = OffsetOperandNo;
+  }
+    
+  unsigned StackReg = MI.getOperand(FIOperandNo).getReg();
+  MI.getOperand(OperandBase).ChangeToRegister(StackReg, false);
+  MI.getOperand(OperandBase + 1).ChangeToRegister(SReg, false);
+  return 0;
+}
+
/// VRRegNo - Map from a numbered VR register to its enum value.
/// VRRegNo[i] is the enum for vector register Vi (0 <= i < 32); used to
/// translate bit positions in the VRSAVE mask to physical registers.
static const unsigned short VRRegNo[] = {
 PPC::V0 , PPC::V1 , PPC::V2 , PPC::V3 , PPC::V4 , PPC::V5 , PPC::V6 , PPC::V7 ,
 PPC::V8 , PPC::V9 , PPC::V10, PPC::V11, PPC::V12, PPC::V13, PPC::V14, PPC::V15,
 PPC::V16, PPC::V17, PPC::V18, PPC::V19, PPC::V20, PPC::V21, PPC::V22, PPC::V23,
 PPC::V24, PPC::V25, PPC::V26, PPC::V27, PPC::V28, PPC::V29, PPC::V30, PPC::V31
};
+
/// RemoveVRSaveCode - We have found that this function does not need any code
/// to manipulate the VRSAVE register, even though it uses vector registers.
/// This can happen when the only registers used are known to be live in or out
/// of the function.  Remove all of the VRSAVE related code from the function.
/// MI is the UPDATE_VRSAVE instruction in the entry block; the matching
/// MFVRSAVE precedes it and an MTVRSAVE immediately follows it.
static void RemoveVRSaveCode(MachineInstr *MI) {
  MachineBasicBlock *Entry = MI->getParent();
  MachineFunction *MF = Entry->getParent();

  // We know that the MTVRSAVE instruction immediately follows MI.  Remove it.
  MachineBasicBlock::iterator MBBI = MI;
  ++MBBI;
  assert(MBBI != Entry->end() && MBBI->getOpcode() == PPC::MTVRSAVE);
  MBBI->eraseFromParent();

  bool RemovedAllMTVRSAVEs = true;
  // See if we can find and remove the MTVRSAVE instruction from all of the
  // epilog blocks.
  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
    // If last instruction is a return instruction, add an epilogue
    if (!I->empty() && I->back().getDesc().isReturn()) {
      bool FoundIt = false;
      // Walk the block backwards; erase the last MTVRSAVE found, if any.
      // The decrement happens before the check so begin() is still visited.
      for (MBBI = I->end(); MBBI != I->begin(); ) {
        --MBBI;
        if (MBBI->getOpcode() == PPC::MTVRSAVE) {
          MBBI->eraseFromParent();  // remove it.
          FoundIt = true;
          break;
        }
      }
      // Only if every returning block had its MTVRSAVE removed is it safe
      // to also drop the MFVRSAVE read in the entry block below.
      RemovedAllMTVRSAVEs &= FoundIt;
    }
  }

  // If we found and removed all MTVRSAVE instructions, remove the read of
  // VRSAVE as well.
  if (RemovedAllMTVRSAVEs) {
    MBBI = MI;
    assert(MBBI != Entry->begin() && "UPDATE_VRSAVE is first instr in block?");
    --MBBI;
    assert(MBBI->getOpcode() == PPC::MFVRSAVE && "VRSAVE instrs wandered?");
    MBBI->eraseFromParent();
  }

  // Finally, nuke the UPDATE_VRSAVE.
  MI->eraseFromParent();
}
+
+// HandleVRSaveUpdate - MI is the UPDATE_VRSAVE instruction introduced by the
+// instruction selector.  Based on the vector registers that have been used,
+// transform this into the appropriate ORI instruction.
+static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
+  MachineFunction *MF = MI->getParent()->getParent();
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned UsedRegMask = 0;
+  for (unsigned i = 0; i != 32; ++i)
+    if (MF->getRegInfo().isPhysRegUsed(VRRegNo[i]))
+      UsedRegMask |= 1 << (31-i);
+  
+  // Live in and live out values already must be in the mask, so don't bother
+  // marking them.
+  for (MachineRegisterInfo::livein_iterator
+       I = MF->getRegInfo().livein_begin(),
+       E = MF->getRegInfo().livein_end(); I != E; ++I) {
+    unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(I->first);
+    if (VRRegNo[RegNo] == I->first)        // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+  }
+  for (MachineRegisterInfo::liveout_iterator
+       I = MF->getRegInfo().liveout_begin(),
+       E = MF->getRegInfo().liveout_end(); I != E; ++I) {
+    unsigned RegNo = PPCRegisterInfo::getRegisterNumbering(*I);
+    if (VRRegNo[RegNo] == *I)              // If this really is a vector reg.
+      UsedRegMask &= ~(1 << (31-RegNo));   // Doesn't need to be marked.
+  }
+  
+  // If no registers are used, turn this into a copy.
+  if (UsedRegMask == 0) {
+    // Remove all VRSAVE code.
+    RemoveVRSaveCode(MI);
+    return;
+  }
+
+  unsigned SrcReg = MI->getOperand(1).getReg();
+  unsigned DstReg = MI->getOperand(0).getReg();
+
+  if ((UsedRegMask & 0xFFFF) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask);
+  } else if ((UsedRegMask & 0xFFFF0000) == UsedRegMask) {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+  } else {
+    if (DstReg != SrcReg)
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg)
+        .addImm(UsedRegMask >> 16);
+    else
+      BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORIS), DstReg)
+        .addReg(SrcReg, RegState::Kill)
+        .addImm(UsedRegMask >> 16);
+
+    BuildMI(*MI->getParent(), MI, dl, TII.get(PPC::ORI), DstReg)
+      .addReg(DstReg, RegState::Kill)
+      .addImm(UsedRegMask & 0xFFFF);
+  }
+  
+  // Remove the old UPDATE_VRSAVE instruction.
+  MI->eraseFromParent();
+}
+
+/// determineFrameLayout - Determine the size of the frame and maximum call
+/// frame size.
+void PPCRegisterInfo::determineFrameLayout(MachineFunction &MF) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Get the number of bytes to allocate from the FrameInfo
+  unsigned FrameSize = MFI->getStackSize();
+  
+  // Get the alignments provided by the target, and the maximum alignment
+  // (if any) of the fixed frame objects.
+  unsigned MaxAlign = MFI->getMaxAlignment();
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned AlignMask = TargetAlign - 1;  //
+
+  // If we are a leaf function, and use up to 224 bytes of stack space,
+  // don't have a frame pointer, calls, or dynamic alloca then we do not need
+  // to adjust the stack pointer (we fit in the Red Zone).
+  bool DisableRedZone = MF.getFunction()->hasFnAttr(Attribute::NoRedZone);
+  // FIXME SVR4 The 32-bit SVR4 ABI has no red zone.
+  if (!DisableRedZone &&
+      FrameSize <= 224 &&                          // Fits in red zone.
+      !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
+      !MFI->hasCalls() &&                          // No calls.
+      (!ALIGN_STACK || MaxAlign <= TargetAlign)) { // No special alignment.
+    // No need for frame
+    MFI->setStackSize(0);
+    return;
+  }
+  
+  // Get the maximum call frame size of all the calls.
+  unsigned maxCallFrameSize = MFI->getMaxCallFrameSize();
+  
+  // Maximum call frame needs to be at least big enough for linkage and 8 args.
+  unsigned minCallFrameSize =
+    PPCFrameInfo::getMinCallFrameSize(Subtarget.isPPC64(), 
+                                      Subtarget.isDarwinABI());
+  maxCallFrameSize = std::max(maxCallFrameSize, minCallFrameSize);
+
+  // If we have dynamic alloca then maxCallFrameSize needs to be aligned so
+  // that allocations will be aligned.
+  if (MFI->hasVarSizedObjects())
+    maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+  
+  // Update maximum call frame size.
+  MFI->setMaxCallFrameSize(maxCallFrameSize);
+  
+  // Include call frame size in total.
+  FrameSize += maxCallFrameSize;
+  
+  // Make sure the frame is aligned.
+  FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+
+  // Update frame info.
+  MFI->setStackSize(FrameSize);
+}
+
+void
+PPCRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                      RegScavenger *RS) const {
+  //  Save and clear the LR state.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  unsigned LR = getRARegister();
+  FI->setMustSaveLR(MustSaveLR(MF, LR));
+  MF.getRegInfo().setPhysRegUnused(LR);
+
+  //  Save R31 if necessary
+  int FPSI = FI->getFramePointerSaveIndex();
+  bool isPPC64 = Subtarget.isPPC64();
+  bool isDarwinABI  = Subtarget.isDarwinABI();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+ 
+  // If the frame pointer save index hasn't been defined yet.
+  if (!FPSI && needsFP(MF)) {
+    // Find out what the fix offset of the frame pointer save area.
+    int FPOffset = PPCFrameInfo::getFramePointerSaveOffset(isPPC64,
+                                                           isDarwinABI);
+    // Allocate the frame index for frame pointer save area.
+    FPSI = MF.getFrameInfo()->CreateFixedObject(isPPC64? 8 : 4, FPOffset,
+                                                true, false);
+    // Save the result.
+    FI->setFramePointerSaveIndex(FPSI);                      
+  }
+
+  // Reserve stack space to move the linkage area to in case of a tail call.
+  int TCSPDelta = 0;
+  if (GuaranteedTailCallOpt && (TCSPDelta = FI->getTailCallSPDelta()) < 0) {
+    MF.getFrameInfo()->CreateFixedObject(-1 * TCSPDelta, TCSPDelta,
+                                         true, false);
+  }
+  
+  // Reserve a slot closest to SP or frame pointer if we have a dynalloc or
+  // a large stack, which will require scavenging a register to materialize a
+  // large offset.
+  // FIXME: this doesn't actually check stack size, so is a bit pessimistic
+  // FIXME: doesn't detect whether or not we need to spill vXX, which requires
+  //        r0 for now.
+
+  if (EnableRegisterScavenging) // FIXME (64-bit): Enable.
+    if (needsFP(MF) || spillsCR(MF)) {
+      const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+      const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+      const TargetRegisterClass *RC = isPPC64 ? G8RC : GPRC;
+      RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                         RC->getAlignment(),
+                                                         false));
+    }
+}
+
/// processFunctionBeforeFrameFinalized - For the SVR4 ABI, assign final
/// frame offsets to the callee-saved register spill slots.  The save areas
/// are stacked downward from the back chain word of the previous frame in
/// the order: FPRs, GPRs/G8Rs (plus the FP save slot), CR, VRSAVE, VRs.
/// LowerBound tracks the running bottom (a non-positive offset) of the
/// areas laid out so far.
void
PPCRegisterInfo::processFunctionBeforeFrameFinalized(MachineFunction &MF)
                                                     const {
  // Early exit if not using the SVR4 ABI.
  if (!Subtarget.isSVR4ABI()) {
    return;
  }

  // Get callee saved register information.
  MachineFrameInfo *FFI = MF.getFrameInfo();
  const std::vector<CalleeSavedInfo> &CSI = FFI->getCalleeSavedInfo();

  // Early exit if no callee saved registers are modified!
  if (CSI.empty() && !needsFP(MF)) {
    return;
  }

  // Lowest-numbered register spilled in each class.  Registers Min..31 are
  // saved, which determines the size of each save area below.
  unsigned MinGPR = PPC::R31;
  unsigned MinG8R = PPC::X31;
  unsigned MinFPR = PPC::F31;
  unsigned MinVR = PPC::V31;
  
  bool HasGPSaveArea = false;
  bool HasG8SaveArea = false;
  bool HasFPSaveArea = false;
  bool HasCRSaveArea = false;
  bool HasVRSAVESaveArea = false;
  bool HasVRSaveArea = false;
  
  SmallVector<CalleeSavedInfo, 18> GPRegs;
  SmallVector<CalleeSavedInfo, 18> G8Regs;
  SmallVector<CalleeSavedInfo, 18> FPRegs;
  SmallVector<CalleeSavedInfo, 18> VRegs;
  
  // Bucket the callee-saved registers by register class and remember the
  // minimum register number seen in each class.
  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
    unsigned Reg = CSI[i].getReg();
    const TargetRegisterClass *RC = CSI[i].getRegClass();
    
    if (RC == PPC::GPRCRegisterClass) {
      HasGPSaveArea = true;
      
      GPRegs.push_back(CSI[i]);
      
      if (Reg < MinGPR) {
        MinGPR = Reg;
      }
    } else if (RC == PPC::G8RCRegisterClass) {
      HasG8SaveArea = true;

      G8Regs.push_back(CSI[i]);

      if (Reg < MinG8R) {
        MinG8R = Reg;
      }
    } else if (RC == PPC::F8RCRegisterClass) {
      HasFPSaveArea = true;
      
      FPRegs.push_back(CSI[i]);
      
      if (Reg < MinFPR) {
        MinFPR = Reg;
      }
// FIXME SVR4: Disable CR save area for now.
    } else if (   RC == PPC::CRBITRCRegisterClass
               || RC == PPC::CRRCRegisterClass) {
//      HasCRSaveArea = true;
    } else if (RC == PPC::VRSAVERCRegisterClass) {
      HasVRSAVESaveArea = true;
    } else if (RC == PPC::VRRCRegisterClass) {
      HasVRSaveArea = true;
      
      VRegs.push_back(CSI[i]);
      
      if (Reg < MinVR) {
        MinVR = Reg;
      }
    } else {
      llvm_unreachable("Unknown RegisterClass!");
    }
  }

  PPCFunctionInfo *PFI = MF.getInfo<PPCFunctionInfo>();
  
  // Running bottom of the save areas laid out so far (offset from the frame
  // base); each area is placed at LowerBound and then LowerBound decreases.
  int64_t LowerBound = 0;

  // Take into account stack space reserved for tail calls.
  int TCSPDelta = 0;
  if (GuaranteedTailCallOpt && (TCSPDelta = PFI->getTailCallSPDelta()) < 0) {
    LowerBound = TCSPDelta;
  }

  // The Floating-point register save area is right below the back chain word
  // of the previous stack frame.
  if (HasFPSaveArea) {
    for (unsigned i = 0, e = FPRegs.size(); i != e; ++i) {
      int FI = FPRegs[i].getFrameIdx();
      
      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
    }
    
    // Each FPR slot is 8 bytes; registers MinFPR..F31 are saved.
    LowerBound -= (31 - getRegisterNumbering(MinFPR) + 1) * 8; 
  }

  // Check whether the frame pointer register is allocated. If so, make sure it
  // is spilled to the correct offset.
  if (needsFP(MF)) {
    HasGPSaveArea = true;
    
    int FI = PFI->getFramePointerSaveIndex();
    assert(FI && "No Frame Pointer Save Slot!");
    
    FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
  }
  
  // General register save area starts right below the Floating-point
  // register save area.
  if (HasGPSaveArea || HasG8SaveArea) {
    // Move general register save area spill slots down, taking into account
    // the size of the Floating-point register save area.
    for (unsigned i = 0, e = GPRegs.size(); i != e; ++i) {
      int FI = GPRegs[i].getFrameIdx();
      
      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
    }
    
    // Move general register save area spill slots down, taking into account
    // the size of the Floating-point register save area.
    for (unsigned i = 0, e = G8Regs.size(); i != e; ++i) {
      int FI = G8Regs[i].getFrameIdx();

      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
    }

    unsigned MinReg = std::min<unsigned>(getRegisterNumbering(MinGPR),
                                         getRegisterNumbering(MinG8R));

    // Slot size is 8 bytes on PPC64, 4 bytes on PPC32.
    if (Subtarget.isPPC64()) {
      LowerBound -= (31 - MinReg + 1) * 8;
    } else {
      LowerBound -= (31 - MinReg + 1) * 4;
    }
  }
  
  // The CR save area is below the general register save area.
  if (HasCRSaveArea) {
    // FIXME SVR4: Is it actually possible to have multiple elements in CSI
    //             which have the CR/CRBIT register class?
    // Adjust the frame index of the CR spill slot.
    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
      const TargetRegisterClass *RC = CSI[i].getRegClass();
    
      if (RC == PPC::CRBITRCRegisterClass || RC == PPC::CRRCRegisterClass) {
        int FI = CSI[i].getFrameIdx();

        FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
      }
    }
    
    LowerBound -= 4; // The CR save area is always 4 bytes long.
  }
  
  if (HasVRSAVESaveArea) {
    // FIXME SVR4: Is it actually possible to have multiple elements in CSI
    //             which have the VRSAVE register class?
    // Adjust the frame index of the VRSAVE spill slot.
    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
      const TargetRegisterClass *RC = CSI[i].getRegClass();
    
      if (RC == PPC::VRSAVERCRegisterClass) {
        int FI = CSI[i].getFrameIdx();

        FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
      }
    }
    
    LowerBound -= 4; // The VRSAVE save area is always 4 bytes long.
  }
  
  if (HasVRSaveArea) {
    // Insert alignment padding, we need 16-byte alignment.
    LowerBound = (LowerBound - 15) & ~(15);
    
    for (unsigned i = 0, e = VRegs.size(); i != e; ++i) {
      int FI = VRegs[i].getFrameIdx();
      
      FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
    }
  }
}
+
/// emitPrologue - Insert prologue code into the entry block: process any
/// UPDATE_VRSAVE, save LR and (if needed) the frame pointer, adjust the
/// stack pointer, and emit the debug-info "machine moves" describing the
/// frame layout.
void
PPCRegisterInfo::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
  DebugLoc dl = DebugLoc::getUnknownLoc();
  // Frame moves (unwind/debug info) are needed when we have debug info, the
  // function may throw, or unwind tables are mandated.
  bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
       !MF.getFunction()->doesNotThrow() ||
       UnwindTablesMandatory;
  
  // Prepare for frame info.
  unsigned FrameLabelId = 0;

  // Scan the prolog, looking for an UPDATE_VRSAVE instruction.  If we find it,
  // process it.
  for (unsigned i = 0; MBBI != MBB.end(); ++i, ++MBBI) {
    if (MBBI->getOpcode() == PPC::UPDATE_VRSAVE) {
      HandleVRSaveUpdate(MBBI, TII);
      break;
    }
  }
  
  // Move MBBI back to the beginning of the function.
  MBBI = MBB.begin();

  // Work out frame sizes.
  determineFrameLayout(MF);
  unsigned FrameSize = MFI->getStackSize();
  
  int NegFrameSize = -FrameSize;
  
  // Get processor type.
  bool isPPC64 = Subtarget.isPPC64();
  // Get operating system
  bool isDarwinABI = Subtarget.isDarwinABI();
  // Check if the link register (LR) must be saved.
  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
  bool MustSaveLR = FI->mustSaveLR();
  // Do we have a frame pointer for this function?
  bool HasFP = hasFP(MF) && FrameSize;
  
  int LROffset = PPCFrameInfo::getReturnSaveOffset(isPPC64, isDarwinABI);

  // Locate the frame pointer save slot: SVR4 uses the frame index allocated
  // earlier; Darwin uses a fixed ABI offset.
  int FPOffset = 0;
  if (HasFP) {
    if (Subtarget.isSVR4ABI()) {
      MachineFrameInfo *FFI = MF.getFrameInfo();
      int FPIndex = FI->getFramePointerSaveIndex();
      assert(FPIndex && "No Frame Pointer Save Slot!");
      FPOffset = FFI->getObjectOffset(FPIndex);
    } else {
      FPOffset = PPCFrameInfo::getFramePointerSaveOffset(isPPC64, isDarwinABI);
    }
  }

  // Save LR and/or the frame pointer into the caller's frame.  STD encodes
  // its displacement shifted right two bits, hence the divisions by 4.
  if (isPPC64) {
    if (MustSaveLR)
      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR8), PPC::X0);
      
    if (HasFP)
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
        .addReg(PPC::X31)
        .addImm(FPOffset/4)
        .addReg(PPC::X1);
    
    if (MustSaveLR)
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STD))
        .addReg(PPC::X0)
        .addImm(LROffset / 4)
        .addReg(PPC::X1);
  } else {
    if (MustSaveLR)
      BuildMI(MBB, MBBI, dl, TII.get(PPC::MFLR), PPC::R0);
      
    if (HasFP)
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
        .addReg(PPC::R31)
        .addImm(FPOffset)
        .addReg(PPC::R1);

    if (MustSaveLR)
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STW))
        .addReg(PPC::R0)
        .addImm(LROffset)
        .addReg(PPC::R1);
  }
  
  // Skip if a leaf routine.
  if (!FrameSize) return;
  
  // Get stack alignments.
  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
  unsigned MaxAlign = MFI->getMaxAlignment();

  // Adjust stack pointer: r1 += NegFrameSize.
  // If there is a preferred stack alignment, align R1 now
  if (!isPPC64) {
    // PPC32.
    if (ALIGN_STACK && MaxAlign > TargetAlign) {
      // Over-aligned frame: compute R0 = R1 & (MaxAlign-1) via rlwinm, then
      // R0 = NegFrameSize - R0 via subfic, and stwux both stores the back
      // chain and sets R1 = R1 + R0, i.e. the new SP rounded down to a
      // MaxAlign boundary.
      assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!");
      assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!");

      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), PPC::R0)
        .addReg(PPC::R1)
        .addImm(0)
        .addImm(32 - Log2_32(MaxAlign))
        .addImm(31);
      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC) ,PPC::R0)
        .addReg(PPC::R0, RegState::Kill)
        .addImm(NegFrameSize);
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX))
        .addReg(PPC::R1)
        .addReg(PPC::R1)
        .addReg(PPC::R0);
    } else if (isInt16(NegFrameSize)) {
      // Small frame: single store-with-update covers the whole adjustment.
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWU), PPC::R1)
        .addReg(PPC::R1)
        .addImm(NegFrameSize)
        .addReg(PPC::R1);
    } else {
      // Large frame: materialize NegFrameSize in R0 (lis/ori), then stwux.
      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
        .addImm(NegFrameSize >> 16);
      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
        .addReg(PPC::R0, RegState::Kill)
        .addImm(NegFrameSize & 0xFFFF);
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STWUX))
        .addReg(PPC::R1)
        .addReg(PPC::R1)
        .addReg(PPC::R0);
    }
  } else {    // PPC64.
    if (ALIGN_STACK && MaxAlign > TargetAlign) {
      // Same alignment trick as the PPC32 case, with 64-bit opcodes.
      assert(isPowerOf2_32(MaxAlign)&&isInt16(MaxAlign)&&"Invalid alignment!");
      assert(isInt16(NegFrameSize) && "Unhandled stack size and alignment!");

      BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), PPC::X0)
        .addReg(PPC::X1)
        .addImm(0)
        .addImm(64 - Log2_32(MaxAlign));
      BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBFIC8), PPC::X0)
        .addReg(PPC::X0)
        .addImm(NegFrameSize);
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX))
        .addReg(PPC::X1)
        .addReg(PPC::X1)
        .addReg(PPC::X0);
    } else if (isInt16(NegFrameSize)) {
      // STDU's immediate is encoded shifted right two bits, hence the /4.
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDU), PPC::X1)
        .addReg(PPC::X1)
        .addImm(NegFrameSize / 4)
        .addReg(PPC::X1);
    } else {
      BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
        .addImm(NegFrameSize >> 16);
      BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
        .addReg(PPC::X0, RegState::Kill)
        .addImm(NegFrameSize & 0xFFFF);
      BuildMI(MBB, MBBI, dl, TII.get(PPC::STDUX))
        .addReg(PPC::X1)
        .addReg(PPC::X1)
        .addReg(PPC::X0);
    }
  }

  std::vector<MachineMove> &Moves = MMI->getFrameMoves();
  
  // Add the "machine moves" for the instructions we generated above, but in
  // reverse order.
  if (needsFrameMoves) {
    // Mark effective beginning of when frame pointer becomes valid.
    FrameLabelId = MMI->NextLabelID();
    BuildMI(MBB, MBBI, dl, TII.get(PPC::DBG_LABEL)).addImm(FrameLabelId);
  
    // Show update of SP.
    if (NegFrameSize) {
      MachineLocation SPDst(MachineLocation::VirtualFP);
      MachineLocation SPSrc(MachineLocation::VirtualFP, NegFrameSize);
      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
    } else {
      // NOTE(review): FrameSize != 0 here (leaf case returned above), so
      // NegFrameSize is non-zero and this branch appears unreachable; also
      // it names R31/X31 rather than R1/X1 as "SP" — verify intent.
      MachineLocation SP(isPPC64 ? PPC::X31 : PPC::R31);
      Moves.push_back(MachineMove(FrameLabelId, SP, SP));
    }
    
    if (HasFP) {
      MachineLocation FPDst(MachineLocation::VirtualFP, FPOffset);
      MachineLocation FPSrc(isPPC64 ? PPC::X31 : PPC::R31);
      Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc));
    }

    if (MustSaveLR) {
      MachineLocation LRDst(MachineLocation::VirtualFP, LROffset);
      MachineLocation LRSrc(isPPC64 ? PPC::LR8 : PPC::LR);
      Moves.push_back(MachineMove(FrameLabelId, LRDst, LRSrc));
    }
  }

  unsigned ReadyLabelId = 0;

  // If there is a frame pointer, copy R1 into R31
  if (HasFP) {
    if (!isPPC64) {
      // "or rD, rA, rA" is the canonical register copy.
      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR), PPC::R31)
        .addReg(PPC::R1)
        .addReg(PPC::R1);
    } else {
      BuildMI(MBB, MBBI, dl, TII.get(PPC::OR8), PPC::X31)
        .addReg(PPC::X1)
        .addReg(PPC::X1);
    }

    if (needsFrameMoves) {
      ReadyLabelId = MMI->NextLabelID();

      // Mark effective beginning of when frame pointer is ready.
      BuildMI(MBB, MBBI, dl, TII.get(PPC::DBG_LABEL)).addImm(ReadyLabelId);

      MachineLocation FPDst(HasFP ? (isPPC64 ? PPC::X31 : PPC::R31) :
                                    (isPPC64 ? PPC::X1 : PPC::R1));
      MachineLocation FPSrc(MachineLocation::VirtualFP);
      Moves.push_back(MachineMove(ReadyLabelId, FPDst, FPSrc));
    }
  }

  if (needsFrameMoves) {
    // Use the later label (frame pointer ready) when one was emitted.
    unsigned LabelId = HasFP ? ReadyLabelId : FrameLabelId;

    // Add callee saved registers to move list.
    const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
      int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
      unsigned Reg = CSI[I].getReg();
      // LR is described separately above; RM has no memory slot to describe.
      if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
      MachineLocation CSSrc(Reg);
      Moves.push_back(MachineMove(LabelId, CSDst, CSSrc));
    }
  }
}
+
+void PPCRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                   MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  unsigned RetOpcode = MBBI->getOpcode();
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+
+  assert( (RetOpcode == PPC::BLR ||
+           RetOpcode == PPC::TCRETURNri ||
+           RetOpcode == PPC::TCRETURNdi ||
+           RetOpcode == PPC::TCRETURNai ||
+           RetOpcode == PPC::TCRETURNri8 ||
+           RetOpcode == PPC::TCRETURNdi8 ||
+           RetOpcode == PPC::TCRETURNai8) &&
+         "Can only insert epilog into returning blocks");
+
+  // Get alignment info so we know how to restore r1
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned TargetAlign = MF.getTarget().getFrameInfo()->getStackAlignment();
+  unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Get the number of bytes allocated from the FrameInfo.
+  int FrameSize = MFI->getStackSize();
+
+  // Get processor type.
+  bool isPPC64 = Subtarget.isPPC64();
+  // Get operating system
+  bool isDarwinABI = Subtarget.isDarwinABI();
+  // Check if the link register (LR) has been saved.
+  PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+  bool MustSaveLR = FI->mustSaveLR();
+  // Do we have a frame pointer for this function?
+  bool HasFP = hasFP(MF) && FrameSize;
+  
+  int LROffset = PPCFrameInfo::getReturnSaveOffset(isPPC64, isDarwinABI);
+
+  int FPOffset = 0;
+  if (HasFP) {
+    if (Subtarget.isSVR4ABI()) {
+      MachineFrameInfo *FFI = MF.getFrameInfo();
+      int FPIndex = FI->getFramePointerSaveIndex();
+      assert(FPIndex && "No Frame Pointer Save Slot!");
+      FPOffset = FFI->getObjectOffset(FPIndex);
+    } else {
+      FPOffset = PPCFrameInfo::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+    }
+  }
+  
+  bool UsesTCRet =  RetOpcode == PPC::TCRETURNri ||
+    RetOpcode == PPC::TCRETURNdi ||
+    RetOpcode == PPC::TCRETURNai ||
+    RetOpcode == PPC::TCRETURNri8 ||
+    RetOpcode == PPC::TCRETURNdi8 ||
+    RetOpcode == PPC::TCRETURNai8;
+
+  if (UsesTCRet) {
+    int MaxTCRetDelta = FI->getTailCallSPDelta();
+    MachineOperand &StackAdjust = MBBI->getOperand(1);
+    assert(StackAdjust.isImm() && "Expecting immediate value.");
+    // Adjust stack pointer.
+    int StackAdj = StackAdjust.getImm();
+    int Delta = StackAdj - MaxTCRetDelta;
+    assert((Delta >= 0) && "Delta must be positive");
+    if (MaxTCRetDelta>0)
+      FrameSize += (StackAdj +Delta);
+    else
+      FrameSize += StackAdj;
+  }
+
+  if (FrameSize) {
+    // The loaded (or persistent) stack pointer value is offset by the 'stwu'
+    // on entry to the function.  Add this offset back now.
+    if (!isPPC64) {
+      // If this function contained a fastcc call and GuaranteedTailCallOpt is
+      // enabled (=> hasFastCall()==true) the fastcc call might contain a tail
+      // call which invalidates the stack pointer value in SP(0). So we use the
+      // value of R31 in this case.
+      if (FI->hasFastCall() && isInt16(FrameSize)) {
+        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
+          .addReg(PPC::R31).addImm(FrameSize);
+      } else if(FI->hasFastCall()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS), PPC::R0)
+          .addImm(FrameSize >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI), PPC::R0)
+          .addReg(PPC::R0, RegState::Kill)
+          .addImm(FrameSize & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD4))
+          .addReg(PPC::R1)
+          .addReg(PPC::R31)
+          .addReg(PPC::R0);
+      } else if (isInt16(FrameSize) &&
+                 (!ALIGN_STACK || TargetAlign >= MaxAlign) &&
+                 !MFI->hasVarSizedObjects()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI), PPC::R1)
+          .addReg(PPC::R1).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ),PPC::R1)
+          .addImm(0).addReg(PPC::R1);
+      }
+    } else {
+      if (FI->hasFastCall() && isInt16(FrameSize)) {
+        assert(hasFP(MF) && "Expecting a valid the frame pointer.");
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
+          .addReg(PPC::X31).addImm(FrameSize);
+      } else if(FI->hasFastCall()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LIS8), PPC::X0)
+          .addImm(FrameSize >> 16);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ORI8), PPC::X0)
+          .addReg(PPC::X0, RegState::Kill)
+          .addImm(FrameSize & 0xFFFF);
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADD8))
+          .addReg(PPC::X1)
+          .addReg(PPC::X31)
+          .addReg(PPC::X0);
+      } else if (isInt16(FrameSize) && TargetAlign >= MaxAlign &&
+            !MFI->hasVarSizedObjects()) {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::ADDI8), PPC::X1)
+           .addReg(PPC::X1).addImm(FrameSize);
+      } else {
+        BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X1)
+           .addImm(0).addReg(PPC::X1);
+      }
+    }
+  }
+
+  if (isPPC64) {
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X0)
+        .addImm(LROffset/4).addReg(PPC::X1);
+        
+    if (HasFP)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LD), PPC::X31)
+        .addImm(FPOffset/4).addReg(PPC::X1);
+        
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR8)).addReg(PPC::X0);
+  } else {
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R0)
+          .addImm(LROffset).addReg(PPC::R1);
+        
+    if (HasFP)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ), PPC::R31)
+          .addImm(FPOffset).addReg(PPC::R1);
+          
+    if (MustSaveLR)
+      BuildMI(MBB, MBBI, dl, TII.get(PPC::MTLR)).addReg(PPC::R0);
+  }
+
+  // Callee pop calling convention. Pop parameter/linkage area. Used for tail
+  // call optimization
+  if (GuaranteedTailCallOpt && RetOpcode == PPC::BLR &&
+      MF.getFunction()->getCallingConv() == CallingConv::Fast) {
+     PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
+     unsigned CallerAllocatedAmt = FI->getMinReservedArea();
+     unsigned StackReg = isPPC64 ? PPC::X1 : PPC::R1;
+     unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+     unsigned TmpReg = isPPC64 ? PPC::X0 : PPC::R0;
+     unsigned ADDIInstr = isPPC64 ? PPC::ADDI8 : PPC::ADDI;
+     unsigned ADDInstr = isPPC64 ? PPC::ADD8 : PPC::ADD4;
+     unsigned LISInstr = isPPC64 ? PPC::LIS8 : PPC::LIS;
+     unsigned ORIInstr = isPPC64 ? PPC::ORI8 : PPC::ORI;
+
+     if (CallerAllocatedAmt && isInt16(CallerAllocatedAmt)) {
+       BuildMI(MBB, MBBI, dl, TII.get(ADDIInstr), StackReg)
+         .addReg(StackReg).addImm(CallerAllocatedAmt);
+     } else {
+       BuildMI(MBB, MBBI, dl, TII.get(LISInstr), TmpReg)
+          .addImm(CallerAllocatedAmt >> 16);
+       BuildMI(MBB, MBBI, dl, TII.get(ORIInstr), TmpReg)
+          .addReg(TmpReg, RegState::Kill)
+          .addImm(CallerAllocatedAmt & 0xFFFF);
+       BuildMI(MBB, MBBI, dl, TII.get(ADDInstr))
+          .addReg(StackReg)
+          .addReg(FPReg)
+          .addReg(TmpReg);
+     }
+  } else if (RetOpcode == PPC::TCRETURNdi) {
+    MBBI = prior(MBB.end());
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri) {
+    MBBI = prior(MBB.end());
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR));
+  } else if (RetOpcode == PPC::TCRETURNai) {
+    MBBI = prior(MBB.end());
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA)).addImm(JumpTarget.getImm());
+  } else if (RetOpcode == PPC::TCRETURNdi8) {
+    MBBI = prior(MBB.end());
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+      addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+  } else if (RetOpcode == PPC::TCRETURNri8) {
+    MBBI = prior(MBB.end());
+    assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBCTR8));
+  } else if (RetOpcode == PPC::TCRETURNai8) {
+    MBBI = prior(MBB.end());
+    MachineOperand &JumpTarget = MBBI->getOperand(0);
+    BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILBA8)).addImm(JumpTarget.getImm());
+  }
+}
+
+unsigned PPCRegisterInfo::getRARegister() const {
+  return !Subtarget.isPPC64() ? PPC::LR : PPC::LR8;
+}
+
+unsigned PPCRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  if (!Subtarget.isPPC64())
+    return hasFP(MF) ? PPC::R31 : PPC::R1;
+  else
+    return hasFP(MF) ? PPC::X31 : PPC::X1;
+}
+
+void PPCRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+                                                                         const {
+  // Initial state of the frame pointer is R1.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(PPC::R1, 0);
+  Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+unsigned PPCRegisterInfo::getEHExceptionRegister() const {
+  return !Subtarget.isPPC64() ? PPC::R3 : PPC::X3;
+}
+
+unsigned PPCRegisterInfo::getEHHandlerRegister() const {
+  return !Subtarget.isPPC64() ? PPC::R4 : PPC::X4;
+}
+
+int PPCRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  // FIXME: Most probably the DWARF numbers differ between Linux and Darwin
+  return PPCGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+#include "PPCGenRegisterInfo.inc"
+
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
new file mode 100644
index 0000000..3aeed80
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -0,0 +1,98 @@
+//===- PPCRegisterInfo.h - PowerPC Register Information Impl -----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the PowerPC implementation of the TargetRegisterInfo
+// class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPC32_REGISTERINFO_H
+#define POWERPC32_REGISTERINFO_H
+
+#include "PPC.h"
+#include "PPCGenRegisterInfo.h.inc"
+#include <map>
+
+namespace llvm {
+class PPCSubtarget;
+class TargetInstrInfo;
+class Type;
+
+class PPCRegisterInfo : public PPCGenRegisterInfo {
+  std::map<unsigned, unsigned> ImmToIdxMap;
+  const PPCSubtarget &Subtarget;
+  const TargetInstrInfo &TII;
+public:
+  PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
+  
+  /// getRegisterNumbering - Given the enum value for some register, e.g.
+  /// PPC::F14, return the number that it corresponds to (e.g. 14).
+  static unsigned getRegisterNumbering(unsigned RegEnum);
+
+  /// getPointerRegClass - Return the register class to use to hold pointers.
+  /// This is used for addressing modes.
+  virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const;  
+
+  /// Code Generation virtual methods...
+  const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+  const TargetRegisterClass* const*
+  getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// targetHandlesStackFrameRounding - Returns true if the target is
+  /// responsible for rounding up the stack frame (probably at emitPrologue
+  /// time).
+  bool targetHandlesStackFrameRounding() const { return true; }
+
+  /// requiresRegisterScavenging - We require a register scavenger.
+  /// FIXME (64-bit): Should be inlined.
+  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  void lowerDynamicAlloc(MachineBasicBlock::iterator II,
+                         int SPAdj, RegScavenger *RS) const;
+  void lowerCRSpilling(MachineBasicBlock::iterator II, unsigned FrameIndex,
+                       int SPAdj, RegScavenger *RS) const;
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  /// determineFrameLayout - Determine the size of the frame and maximum call
+  /// frame size.
+  void determineFrameLayout(MachineFunction &MF) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
new file mode 100644
index 0000000..049e893
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -0,0 +1,387 @@
+//===- PPCRegisterInfo.td - The PowerPC Register File ------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+class PPCReg<string n> : Register<n> {
+  let Namespace = "PPC";
+}
+
+// We identify all our registers with a 5-bit ID, for consistency's sake.
+
+// GPR - One of the 32 32-bit general-purpose registers
+class GPR<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+// GP8 - One of the 32 64-bit general-purpose registers
+class GP8<GPR SubReg, string n> : PPCReg<n> {
+  field bits<5> Num = SubReg.Num;
+  let SubRegs = [SubReg];
+}
+
+// SPR - One of the 32-bit special-purpose registers
+class SPR<bits<10> num, string n> : PPCReg<n> {
+  field bits<10> Num = num;
+}
+
+// FPR - One of the 32 64-bit floating-point registers
+class FPR<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+// VR - One of the 32 128-bit vector registers
+class VR<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+// CR - One of the 8 4-bit condition registers
+class CR<bits<3> num, string n, list<Register> subregs> : PPCReg<n> {
+  field bits<3> Num = num;
+  let SubRegs = subregs;
+}
+
+// CRBIT - One of the 32 1-bit condition register fields
+class CRBIT<bits<5> num, string n> : PPCReg<n> {
+  field bits<5> Num = num;
+}
+
+
+// General-purpose registers
+def R0  : GPR< 0,  "r0">, DwarfRegNum<[0]>;
+def R1  : GPR< 1,  "r1">, DwarfRegNum<[1]>;
+def R2  : GPR< 2,  "r2">, DwarfRegNum<[2]>;
+def R3  : GPR< 3,  "r3">, DwarfRegNum<[3]>;
+def R4  : GPR< 4,  "r4">, DwarfRegNum<[4]>;
+def R5  : GPR< 5,  "r5">, DwarfRegNum<[5]>;
+def R6  : GPR< 6,  "r6">, DwarfRegNum<[6]>;
+def R7  : GPR< 7,  "r7">, DwarfRegNum<[7]>;
+def R8  : GPR< 8,  "r8">, DwarfRegNum<[8]>;
+def R9  : GPR< 9,  "r9">, DwarfRegNum<[9]>;
+def R10 : GPR<10, "r10">, DwarfRegNum<[10]>;
+def R11 : GPR<11, "r11">, DwarfRegNum<[11]>;
+def R12 : GPR<12, "r12">, DwarfRegNum<[12]>;
+def R13 : GPR<13, "r13">, DwarfRegNum<[13]>;
+def R14 : GPR<14, "r14">, DwarfRegNum<[14]>;
+def R15 : GPR<15, "r15">, DwarfRegNum<[15]>;
+def R16 : GPR<16, "r16">, DwarfRegNum<[16]>;
+def R17 : GPR<17, "r17">, DwarfRegNum<[17]>;
+def R18 : GPR<18, "r18">, DwarfRegNum<[18]>;
+def R19 : GPR<19, "r19">, DwarfRegNum<[19]>;
+def R20 : GPR<20, "r20">, DwarfRegNum<[20]>;
+def R21 : GPR<21, "r21">, DwarfRegNum<[21]>;
+def R22 : GPR<22, "r22">, DwarfRegNum<[22]>;
+def R23 : GPR<23, "r23">, DwarfRegNum<[23]>;
+def R24 : GPR<24, "r24">, DwarfRegNum<[24]>;
+def R25 : GPR<25, "r25">, DwarfRegNum<[25]>;
+def R26 : GPR<26, "r26">, DwarfRegNum<[26]>;
+def R27 : GPR<27, "r27">, DwarfRegNum<[27]>;
+def R28 : GPR<28, "r28">, DwarfRegNum<[28]>;
+def R29 : GPR<29, "r29">, DwarfRegNum<[29]>;
+def R30 : GPR<30, "r30">, DwarfRegNum<[30]>;
+def R31 : GPR<31, "r31">, DwarfRegNum<[31]>;
+
+// 64-bit General-purpose registers
+def X0  : GP8< R0,  "r0">, DwarfRegNum<[0]>;
+def X1  : GP8< R1,  "r1">, DwarfRegNum<[1]>;
+def X2  : GP8< R2,  "r2">, DwarfRegNum<[2]>;
+def X3  : GP8< R3,  "r3">, DwarfRegNum<[3]>;
+def X4  : GP8< R4,  "r4">, DwarfRegNum<[4]>;
+def X5  : GP8< R5,  "r5">, DwarfRegNum<[5]>;
+def X6  : GP8< R6,  "r6">, DwarfRegNum<[6]>;
+def X7  : GP8< R7,  "r7">, DwarfRegNum<[7]>;
+def X8  : GP8< R8,  "r8">, DwarfRegNum<[8]>;
+def X9  : GP8< R9,  "r9">, DwarfRegNum<[9]>;
+def X10 : GP8<R10, "r10">, DwarfRegNum<[10]>;
+def X11 : GP8<R11, "r11">, DwarfRegNum<[11]>;
+def X12 : GP8<R12, "r12">, DwarfRegNum<[12]>;
+def X13 : GP8<R13, "r13">, DwarfRegNum<[13]>;
+def X14 : GP8<R14, "r14">, DwarfRegNum<[14]>;
+def X15 : GP8<R15, "r15">, DwarfRegNum<[15]>;
+def X16 : GP8<R16, "r16">, DwarfRegNum<[16]>;
+def X17 : GP8<R17, "r17">, DwarfRegNum<[17]>;
+def X18 : GP8<R18, "r18">, DwarfRegNum<[18]>;
+def X19 : GP8<R19, "r19">, DwarfRegNum<[19]>;
+def X20 : GP8<R20, "r20">, DwarfRegNum<[20]>;
+def X21 : GP8<R21, "r21">, DwarfRegNum<[21]>;
+def X22 : GP8<R22, "r22">, DwarfRegNum<[22]>;
+def X23 : GP8<R23, "r23">, DwarfRegNum<[23]>;
+def X24 : GP8<R24, "r24">, DwarfRegNum<[24]>;
+def X25 : GP8<R25, "r25">, DwarfRegNum<[25]>;
+def X26 : GP8<R26, "r26">, DwarfRegNum<[26]>;
+def X27 : GP8<R27, "r27">, DwarfRegNum<[27]>;
+def X28 : GP8<R28, "r28">, DwarfRegNum<[28]>;
+def X29 : GP8<R29, "r29">, DwarfRegNum<[29]>;
+def X30 : GP8<R30, "r30">, DwarfRegNum<[30]>;
+def X31 : GP8<R31, "r31">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0  : FPR< 0,  "f0">, DwarfRegNum<[32]>;
+def F1  : FPR< 1,  "f1">, DwarfRegNum<[33]>;
+def F2  : FPR< 2,  "f2">, DwarfRegNum<[34]>;
+def F3  : FPR< 3,  "f3">, DwarfRegNum<[35]>;
+def F4  : FPR< 4,  "f4">, DwarfRegNum<[36]>;
+def F5  : FPR< 5,  "f5">, DwarfRegNum<[37]>;
+def F6  : FPR< 6,  "f6">, DwarfRegNum<[38]>;
+def F7  : FPR< 7,  "f7">, DwarfRegNum<[39]>;
+def F8  : FPR< 8,  "f8">, DwarfRegNum<[40]>;
+def F9  : FPR< 9,  "f9">, DwarfRegNum<[41]>;
+def F10 : FPR<10, "f10">, DwarfRegNum<[42]>;
+def F11 : FPR<11, "f11">, DwarfRegNum<[43]>;
+def F12 : FPR<12, "f12">, DwarfRegNum<[44]>;
+def F13 : FPR<13, "f13">, DwarfRegNum<[45]>;
+def F14 : FPR<14, "f14">, DwarfRegNum<[46]>;
+def F15 : FPR<15, "f15">, DwarfRegNum<[47]>;
+def F16 : FPR<16, "f16">, DwarfRegNum<[48]>;
+def F17 : FPR<17, "f17">, DwarfRegNum<[49]>;
+def F18 : FPR<18, "f18">, DwarfRegNum<[50]>;
+def F19 : FPR<19, "f19">, DwarfRegNum<[51]>;
+def F20 : FPR<20, "f20">, DwarfRegNum<[52]>;
+def F21 : FPR<21, "f21">, DwarfRegNum<[53]>;
+def F22 : FPR<22, "f22">, DwarfRegNum<[54]>;
+def F23 : FPR<23, "f23">, DwarfRegNum<[55]>;
+def F24 : FPR<24, "f24">, DwarfRegNum<[56]>;
+def F25 : FPR<25, "f25">, DwarfRegNum<[57]>;
+def F26 : FPR<26, "f26">, DwarfRegNum<[58]>;
+def F27 : FPR<27, "f27">, DwarfRegNum<[59]>;
+def F28 : FPR<28, "f28">, DwarfRegNum<[60]>;
+def F29 : FPR<29, "f29">, DwarfRegNum<[61]>;
+def F30 : FPR<30, "f30">, DwarfRegNum<[62]>;
+def F31 : FPR<31, "f31">, DwarfRegNum<[63]>;
+
+// Vector registers
+def V0  : VR< 0,  "v0">, DwarfRegNum<[77]>;
+def V1  : VR< 1,  "v1">, DwarfRegNum<[78]>;
+def V2  : VR< 2,  "v2">, DwarfRegNum<[79]>;
+def V3  : VR< 3,  "v3">, DwarfRegNum<[80]>;
+def V4  : VR< 4,  "v4">, DwarfRegNum<[81]>;
+def V5  : VR< 5,  "v5">, DwarfRegNum<[82]>;
+def V6  : VR< 6,  "v6">, DwarfRegNum<[83]>;
+def V7  : VR< 7,  "v7">, DwarfRegNum<[84]>;
+def V8  : VR< 8,  "v8">, DwarfRegNum<[85]>;
+def V9  : VR< 9,  "v9">, DwarfRegNum<[86]>;
+def V10 : VR<10, "v10">, DwarfRegNum<[87]>;
+def V11 : VR<11, "v11">, DwarfRegNum<[88]>;
+def V12 : VR<12, "v12">, DwarfRegNum<[89]>;
+def V13 : VR<13, "v13">, DwarfRegNum<[90]>;
+def V14 : VR<14, "v14">, DwarfRegNum<[91]>;
+def V15 : VR<15, "v15">, DwarfRegNum<[92]>;
+def V16 : VR<16, "v16">, DwarfRegNum<[93]>;
+def V17 : VR<17, "v17">, DwarfRegNum<[94]>;
+def V18 : VR<18, "v18">, DwarfRegNum<[95]>;
+def V19 : VR<19, "v19">, DwarfRegNum<[96]>;
+def V20 : VR<20, "v20">, DwarfRegNum<[97]>;
+def V21 : VR<21, "v21">, DwarfRegNum<[98]>;
+def V22 : VR<22, "v22">, DwarfRegNum<[99]>;
+def V23 : VR<23, "v23">, DwarfRegNum<[100]>;
+def V24 : VR<24, "v24">, DwarfRegNum<[101]>;
+def V25 : VR<25, "v25">, DwarfRegNum<[102]>;
+def V26 : VR<26, "v26">, DwarfRegNum<[103]>;
+def V27 : VR<27, "v27">, DwarfRegNum<[104]>;
+def V28 : VR<28, "v28">, DwarfRegNum<[105]>;
+def V29 : VR<29, "v29">, DwarfRegNum<[106]>;
+def V30 : VR<30, "v30">, DwarfRegNum<[107]>;
+def V31 : VR<31, "v31">, DwarfRegNum<[108]>;
+
+// Condition register bits
+def CR0LT : CRBIT< 0, "0">, DwarfRegNum<[0]>;
+def CR0GT : CRBIT< 1, "1">, DwarfRegNum<[0]>;
+def CR0EQ : CRBIT< 2, "2">, DwarfRegNum<[0]>;
+def CR0UN : CRBIT< 3, "3">, DwarfRegNum<[0]>;
+def CR1LT : CRBIT< 4, "4">, DwarfRegNum<[0]>;
+def CR1GT : CRBIT< 5, "5">, DwarfRegNum<[0]>;
+def CR1EQ : CRBIT< 6, "6">, DwarfRegNum<[0]>;
+def CR1UN : CRBIT< 7, "7">, DwarfRegNum<[0]>;
+def CR2LT : CRBIT< 8, "8">, DwarfRegNum<[0]>;
+def CR2GT : CRBIT< 9, "9">, DwarfRegNum<[0]>;
+def CR2EQ : CRBIT<10, "10">, DwarfRegNum<[0]>;
+def CR2UN : CRBIT<11, "11">, DwarfRegNum<[0]>;
+def CR3LT : CRBIT<12, "12">, DwarfRegNum<[0]>;
+def CR3GT : CRBIT<13, "13">, DwarfRegNum<[0]>;
+def CR3EQ : CRBIT<14, "14">, DwarfRegNum<[0]>;
+def CR3UN : CRBIT<15, "15">, DwarfRegNum<[0]>;
+def CR4LT : CRBIT<16, "16">, DwarfRegNum<[0]>;
+def CR4GT : CRBIT<17, "17">, DwarfRegNum<[0]>;
+def CR4EQ : CRBIT<18, "18">, DwarfRegNum<[0]>;
+def CR4UN : CRBIT<19, "19">, DwarfRegNum<[0]>;
+def CR5LT : CRBIT<20, "20">, DwarfRegNum<[0]>;
+def CR5GT : CRBIT<21, "21">, DwarfRegNum<[0]>;
+def CR5EQ : CRBIT<22, "22">, DwarfRegNum<[0]>;
+def CR5UN : CRBIT<23, "23">, DwarfRegNum<[0]>;
+def CR6LT : CRBIT<24, "24">, DwarfRegNum<[0]>;
+def CR6GT : CRBIT<25, "25">, DwarfRegNum<[0]>;
+def CR6EQ : CRBIT<26, "26">, DwarfRegNum<[0]>;
+def CR6UN : CRBIT<27, "27">, DwarfRegNum<[0]>;
+def CR7LT : CRBIT<28, "28">, DwarfRegNum<[0]>;
+def CR7GT : CRBIT<29, "29">, DwarfRegNum<[0]>;
+def CR7EQ : CRBIT<30, "30">, DwarfRegNum<[0]>;
+def CR7UN : CRBIT<31, "31">, DwarfRegNum<[0]>;
+
+// Condition registers
+def CR0 : CR<0, "cr0", [CR0LT, CR0GT, CR0EQ, CR0UN]>, DwarfRegNum<[68]>;
+def CR1 : CR<1, "cr1", [CR1LT, CR1GT, CR1EQ, CR1UN]>, DwarfRegNum<[69]>;
+def CR2 : CR<2, "cr2", [CR2LT, CR2GT, CR2EQ, CR2UN]>, DwarfRegNum<[70]>;
+def CR3 : CR<3, "cr3", [CR3LT, CR3GT, CR3EQ, CR3UN]>, DwarfRegNum<[71]>;
+def CR4 : CR<4, "cr4", [CR4LT, CR4GT, CR4EQ, CR4UN]>, DwarfRegNum<[72]>;
+def CR5 : CR<5, "cr5", [CR5LT, CR5GT, CR5EQ, CR5UN]>, DwarfRegNum<[73]>;
+def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74]>;
+def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75]>;
+
+def : SubRegSet<1, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0LT, CR1LT, CR2LT, CR3LT, CR4LT, CR5LT, CR6LT, CR7LT]>;
+def : SubRegSet<2, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0GT, CR1GT, CR2GT, CR3GT, CR4GT, CR5GT, CR6GT, CR7GT]>;
+def : SubRegSet<3, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0EQ, CR1EQ, CR2EQ, CR3EQ, CR4EQ, CR5EQ, CR6EQ, CR7EQ]>;
+def : SubRegSet<4, [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7],
+                   [CR0UN, CR1UN, CR2UN, CR3UN, CR4UN, CR5UN, CR6UN, CR7UN]>;
+
+// Link register
+def LR  : SPR<8, "lr">, DwarfRegNum<[65]>;
+//let Aliases = [LR] in
+def LR8 : SPR<8, "lr">, DwarfRegNum<[65]>;
+
+// Count register
+def CTR  : SPR<9, "ctr">, DwarfRegNum<[66]>;
+def CTR8 : SPR<9, "ctr">, DwarfRegNum<[66]>;
+
+// VRsave register
+def VRSAVE: SPR<256, "VRsave">, DwarfRegNum<[107]>;
+
+// Carry bit.  In the architecture this is really bit 0 of the XER register
+// (which really is SPR register 1);  this is the only bit interesting to a
+// compiler.
+def CARRY: SPR<1, "ca">, DwarfRegNum<[0]>;
+
+// FP rounding mode:  bits 30 and 31 of the FP status and control register
+// This is not allocated as a normal register; it appears only in
+// Uses and Defs.  The ABI says it needs to be preserved by a function,
+// but this is not achieved by saving and restoring it as with
+// most registers; it has to be done in code. To make this work, all the
+// return and call instructions are described as Uses of RM, so instructions
+// that do nothing but change RM will not get deleted.
+// Also, in the architecture it is not really a SPR; 512 is arbitrary.
+def RM: SPR<512, "**ROUNDING MODE**">, DwarfRegNum<[0]>;
+
+/// Register classes
+// Allocate volatiles first
+// then nonvolatiles in reverse order since stmw/lmw save from rN to r31
+def GPRC : RegisterClass<"PPC", [i32], 32,
+     [R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12,
+      R30, R29, R28, R27, R26, R25, R24, R23, R22, R21, R20, R19, R18, R17,
+      R16, R15, R14, R13, R31, R0, R1, LR]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GPRCClass::iterator
+    GPRCClass::allocation_order_begin(const MachineFunction &MF) const {
+      // 32-bit SVR4 ABI: r2 is reserved for the OS.
+      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
+      if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin())
+        return begin()+1;
+
+      return begin();
+    }
+    GPRCClass::iterator
+    GPRCClass::allocation_order_end(const MachineFunction &MF) const {
+      // On PPC64, r13 is the thread pointer.  Never allocate this register.
+      // Note that this is overconservative, as it also prevents allocation of
+      // R31 when the FP is not needed.
+      // When using the 32-bit SVR4 ABI, r13 is reserved for the Small Data Area
+      // pointer.
+      const PPCSubtarget &Subtarget
+        = MF.getTarget().getSubtarget<PPCSubtarget>();
+         
+      if (Subtarget.isPPC64() || Subtarget.isSVR4ABI())
+        return end()-5;  // don't allocate R13, R31, R0, R1, LR
+        
+      if (needsFP(MF))
+        return end()-4;  // don't allocate R31, R0, R1, LR
+      else
+        return end()-3;  // don't allocate R0, R1, LR
+    }
+  }];
+}
+def G8RC : RegisterClass<"PPC", [i64], 64,
+     [X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12,
+      X30, X29, X28, X27, X26, X25, X24, X23, X22, X21, X20, X19, X18, X17,
+      X16, X15, X14, X31, X13, X0, X1, LR8]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    G8RCClass::iterator
+    G8RCClass::allocation_order_begin(const MachineFunction &MF) const {
+      // 64-bit SVR4 ABI: r2 is reserved for the TOC pointer.
+      if (!MF.getTarget().getSubtarget<PPCSubtarget>().isDarwin())
+        return begin()+1;
+
+      return begin();
+    }
+    G8RCClass::iterator
+    G8RCClass::allocation_order_end(const MachineFunction &MF) const {
+      if (needsFP(MF))
+        return end()-5;
+      else
+        return end()-4;
+    }
+  }];
+}
+
+// Allocate volatiles first, then non-volatiles in reverse order. With the SVR4
+// ABI the size of the Floating-point register save area is determined by the
+// allocated non-volatile register with the lowest register number, as FP
+// register N is spilled to offset 8 * (32 - N) below the back chain word of the
+// previous stack frame. By allocating non-volatiles in reverse order we make
+// sure that the Floating-point register save area is always as small as
+// possible because there aren't any unused spill slots.
+def F8RC : RegisterClass<"PPC", [f64], 64, [F0, F1, F2, F3, F4, F5, F6, F7,
+  F8, F9, F10, F11, F12, F13, F31, F30, F29, F28, F27, F26, F25, F24, F23,
+  F22, F21, F20, F19, F18, F17, F16, F15, F14]>;
+def F4RC : RegisterClass<"PPC", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7,
+  F8, F9, F10, F11, F12, F13, F31, F30, F29, F28, F27, F26, F25, F24, F23,
+  F22, F21, F20, F19, F18, F17, F16, F15, F14]>;
+
+def VRRC : RegisterClass<"PPC", [v16i8,v8i16,v4i32,v4f32], 128,
+ [V2, V3, V4, V5, V0, V1, 
+  V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V31, V30,
+  V29, V28, V27, V26, V25, V24, V23, V22, V21, V20]>;
+
+def CRBITRC : RegisterClass<"PPC", [i32], 32,
+  [CR0LT, CR0GT, CR0EQ, CR0UN,
+   CR1LT, CR1GT, CR1EQ, CR1UN,
+   CR2LT, CR2GT, CR2EQ, CR2UN,
+   CR3LT, CR3GT, CR3EQ, CR3UN,
+   CR4LT, CR4GT, CR4EQ, CR4UN,
+   CR5LT, CR5GT, CR5EQ, CR5UN,
+   CR6LT, CR6GT, CR6EQ, CR6UN,
+   CR7LT, CR7GT, CR7EQ, CR7UN
+  ]>
+{
+  let CopyCost = -1;
+}
+
+def CRRC : RegisterClass<"PPC", [i32], 32, [CR0, CR1, CR5, CR6, CR7, CR2, 
+  CR3, CR4]>
+{
+  let SubRegClassList = [CRBITRC, CRBITRC, CRBITRC, CRBITRC];
+}
+
+def CTRRC : RegisterClass<"PPC", [i32], 32, [CTR]>;
+def CTRRC8 : RegisterClass<"PPC", [i64], 64, [CTR8]>;
+def VRSAVERC : RegisterClass<"PPC", [i32], 32, [VRSAVE]>;
+def CARRYRC : RegisterClass<"PPC", [i32], 32, [CARRY]> {
+  let CopyCost = -1;
+}
diff --git a/lib/Target/PowerPC/PPCRelocations.h b/lib/Target/PowerPC/PPCRelocations.h
new file mode 100644
index 0000000..a33e7e0
--- /dev/null
+++ b/lib/Target/PowerPC/PPCRelocations.h
@@ -0,0 +1,56 @@
+//===- PPCRelocations.h - PPC32 Code Relocations ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PowerPC 32-bit target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC32RELOCATIONS_H
+#define PPC32RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+// Hack to rid us of a PPC pre-processor symbol which is erroneously
+// defined in a PowerPC header file (bug in Linux/PPC)
+#ifdef PPC
+#undef PPC
+#endif
+
+namespace llvm {
+  namespace PPC {
+    enum RelocationType {
+      // reloc_vanilla - A standard relocation, where the address of the
+      // relocated object completely overwrites the address of the relocation.
+      reloc_vanilla,
+    
+      // reloc_pcrel_bx - PC relative relocation, for the b or bl instructions.
+      reloc_pcrel_bx,
+
+      // reloc_pcrel_bcx - PC relative relocation, for BLT,BLE,BEQ,BGE,BGT,BNE,
+      // and other bcx instructions.
+      reloc_pcrel_bcx,
+
+      // reloc_absolute_high - Absolute relocation, for the loadhi instruction
+      // (which is really addis).  Add the high 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_high,
+
+      // reloc_absolute_low - Absolute relocation, for the la instruction (which
+      // is really an addi).  Add the low 16-bits of the specified global
+      // address into the low 16-bits of the instruction.
+      reloc_absolute_low,
+      
+      // reloc_absolute_low_ix - Absolute relocation for the 64-bit load/store
+      // instructions, which have two implicit zero bits.
+      reloc_absolute_low_ix
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/PowerPC/PPCSchedule.td b/lib/Target/PowerPC/PPCSchedule.td
new file mode 100644
index 0000000..d589414
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSchedule.td
@@ -0,0 +1,508 @@
+//===- PPCSchedule.td - PowerPC Scheduling Definitions -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Functional units across PowerPC chip sets
+//
+def BPU    : FuncUnit; // Branch unit
+def SLU    : FuncUnit; // Store/load unit
+def SRU    : FuncUnit; // Special register unit
+def IU1    : FuncUnit; // Integer unit 1 (simple)
+def IU2    : FuncUnit; // Integer unit 2 (complex)
+def IU3    : FuncUnit; // Integer unit 3 (7450 simple)
+def IU4    : FuncUnit; // Integer unit 4 (7450 simple)
+def FPU1   : FuncUnit; // Floating-point unit 1
+def FPU2   : FuncUnit; // Floating-point unit 2
+def VPU    : FuncUnit; // Vector permutation unit
+def VIU1   : FuncUnit; // Vector integer unit 1 (simple)
+def VIU2   : FuncUnit; // Vector integer unit 2 (complex)
+def VFPU   : FuncUnit; // Vector floating-point unit
+
+
+//===----------------------------------------------------------------------===//
+// Instruction Itinerary classes used for PowerPC.  The opcode-to-class table
+// at the bottom of this file lists which opcodes map to each class.
+//
+// Integer instructions.
+def IntGeneral   : InstrItinClass;
+def IntCompare   : InstrItinClass;
+def IntDivD      : InstrItinClass;
+def IntDivW      : InstrItinClass;
+def IntMFFS      : InstrItinClass;
+def IntMFVSCR    : InstrItinClass;
+def IntMTFSB0    : InstrItinClass;
+def IntMTSRD     : InstrItinClass;
+def IntMulHD     : InstrItinClass;
+def IntMulHW     : InstrItinClass;
+def IntMulHWU    : InstrItinClass;
+def IntMulLI     : InstrItinClass;
+def IntRFID      : InstrItinClass;
+def IntRotateD   : InstrItinClass;
+def IntRotate    : InstrItinClass;
+def IntShift     : InstrItinClass;
+def IntTrapD     : InstrItinClass;
+def IntTrapW     : InstrItinClass;
+// Branch and condition-register instructions.
+def BrB          : InstrItinClass;
+def BrCR         : InstrItinClass;
+def BrMCR        : InstrItinClass;
+def BrMCRX       : InstrItinClass;
+// Load/store instructions.
+def LdStDCBA     : InstrItinClass;
+def LdStDCBF     : InstrItinClass;
+def LdStDCBI     : InstrItinClass;
+def LdStGeneral  : InstrItinClass;
+def LdStDSS      : InstrItinClass;
+def LdStICBI     : InstrItinClass;
+def LdStUX       : InstrItinClass;
+def LdStLD       : InstrItinClass;
+def LdStLDARX    : InstrItinClass;
+def LdStLFD      : InstrItinClass;
+def LdStLFDU     : InstrItinClass;
+def LdStLHA      : InstrItinClass;
+def LdStLMW      : InstrItinClass;
+def LdStLVecX    : InstrItinClass;
+def LdStLWA      : InstrItinClass;
+def LdStLWARX    : InstrItinClass;
+def LdStSLBIA    : InstrItinClass;
+def LdStSLBIE    : InstrItinClass;
+def LdStSTD      : InstrItinClass;
+def LdStSTDCX    : InstrItinClass;
+def LdStSTVEBX   : InstrItinClass;
+def LdStSTWCX    : InstrItinClass;
+def LdStSync     : InstrItinClass;
+// Special/system register instructions.
+def SprISYNC     : InstrItinClass;
+def SprMFSR      : InstrItinClass;
+def SprMTMSR     : InstrItinClass;
+def SprMTSR      : InstrItinClass;
+def SprTLBSYNC   : InstrItinClass;
+def SprMFCR      : InstrItinClass;
+def SprMFMSR     : InstrItinClass;
+def SprMFSPR     : InstrItinClass;
+def SprMFTB      : InstrItinClass;
+def SprMTSPR     : InstrItinClass;
+def SprMTSRIN    : InstrItinClass;
+def SprRFI       : InstrItinClass;
+def SprSC        : InstrItinClass;
+// Floating-point instructions.
+def FPGeneral    : InstrItinClass;
+def FPCompare    : InstrItinClass;
+def FPDivD       : InstrItinClass;
+def FPDivS       : InstrItinClass;
+def FPFused      : InstrItinClass;
+def FPRes        : InstrItinClass;
+def FPSqrt       : InstrItinClass;
+// Vector instructions.
+def VecGeneral   : InstrItinClass;
+def VecFP        : InstrItinClass;
+def VecFPCompare : InstrItinClass;
+def VecComplex   : InstrItinClass;
+def VecPerm      : InstrItinClass;
+def VecFPRound   : InstrItinClass;
+def VecVSL       : InstrItinClass;
+def VecVSR       : InstrItinClass;
+
+//===----------------------------------------------------------------------===//
+// Processor instruction itineraries.
+
+include "PPCScheduleG3.td"
+include "PPCScheduleG4.td"
+include "PPCScheduleG4Plus.td"
+include "PPCScheduleG5.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction to itinerary class map - When adding new opcodes to the
+// supported set, refer to the following table to determine the itinerary
+// class to which the opcode belongs.
+//
+//    opcode     itinerary class
+//    ======     ===============
+//    add        IntGeneral
+//    addc       IntGeneral
+//    adde       IntGeneral
+//    addi       IntGeneral
+//    addic      IntGeneral
+//    addic.     IntGeneral
+//    addis      IntGeneral
+//    addme      IntGeneral
+//    addze      IntGeneral
+//    and        IntGeneral
+//    andc       IntGeneral
+//    andi.      IntGeneral
+//    andis.     IntGeneral
+//    b          BrB
+//    bc         BrB
+//    bcctr      BrB
+//    bclr       BrB
+//    cmp        IntCompare
+//    cmpi       IntCompare
+//    cmpl       IntCompare
+//    cmpli      IntCompare
+//    cntlzd     IntRotateD
+//    cntlzw     IntGeneral
+//    crand      BrCR
+//    crandc     BrCR
+//    creqv      BrCR
+//    crnand     BrCR
+//    crnor      BrCR
+//    cror       BrCR
+//    crorc      BrCR
+//    crxor      BrCR
+//    dcba       LdStDCBA
+//    dcbf       LdStDCBF
+//    dcbi       LdStDCBI
+//    dcbst      LdStDCBF
+//    dcbt       LdStGeneral
+//    dcbtst     LdStGeneral
+//    dcbz       LdStDCBF
+//    divd       IntDivD
+//    divdu      IntDivD
+//    divw       IntDivW
+//    divwu      IntDivW
+//    dss        LdStDSS
+//    dst        LdStDSS
+//    dstst      LdStDSS
+//    eciwx      LdStGeneral
+//    ecowx      LdStGeneral
+//    eieio      LdStGeneral
+//    eqv        IntGeneral
+//    extsb      IntGeneral
+//    extsh      IntGeneral
+//    extsw      IntRotateD
+//    fabs       FPGeneral
+//    fadd       FPGeneral
+//    fadds      FPGeneral
+//    fcfid      FPGeneral
+//    fcmpo      FPCompare
+//    fcmpu      FPCompare
+//    fctid      FPGeneral
+//    fctidz     FPGeneral
+//    fctiw      FPGeneral
+//    fctiwz     FPGeneral
+//    fdiv       FPDivD
+//    fdivs      FPDivS
+//    fmadd      FPFused
+//    fmadds     FPGeneral
+//    fmr        FPGeneral
+//    fmsub      FPFused
+//    fmsubs     FPGeneral
+//    fmul       FPFused
+//    fmuls      FPGeneral
+//    fnabs      FPGeneral
+//    fneg       FPGeneral
+//    fnmadd     FPFused
+//    fnmadds    FPGeneral
+//    fnmsub     FPFused
+//    fnmsubs    FPGeneral
+//    fres       FPRes
+//    frsp       FPGeneral
+//    frsqrte    FPGeneral
+//    fsel       FPGeneral
+//    fsqrt      FPSqrt
+//    fsqrts     FPSqrt
+//    fsub       FPGeneral
+//    fsubs      FPGeneral
+//    icbi       LdStICBI
+//    isync      SprISYNC
+//    lbz        LdStGeneral
+//    lbzu       LdStGeneral
+//    lbzux      LdStUX
+//    lbzx       LdStGeneral
+//    ld         LdStLD
+//    ldarx      LdStLDARX
+//    ldu        LdStLD
+//    ldux       LdStLD
+//    ldx        LdStLD
+//    lfd        LdStLFD
+//    lfdu       LdStLFDU
+//    lfdux      LdStLFDU
+//    lfdx       LdStLFDU
+//    lfs        LdStLFDU
+//    lfsu       LdStLFDU
+//    lfsux      LdStLFDU
+//    lfsx       LdStLFDU
+//    lha        LdStLHA
+//    lhau       LdStLHA
+//    lhaux      LdStLHA
+//    lhax       LdStLHA
+//    lhbrx      LdStGeneral
+//    lhz        LdStGeneral
+//    lhzu       LdStGeneral
+//    lhzux      LdStUX
+//    lhzx       LdStGeneral
+//    lmw        LdStLMW
+//    lswi       LdStLMW
+//    lswx       LdStLMW
+//    lvebx      LdStLVecX
+//    lvehx      LdStLVecX
+//    lvewx      LdStLVecX
+//    lvsl       LdStLVecX
+//    lvsr       LdStLVecX
+//    lvx        LdStLVecX
+//    lvxl       LdStLVecX
+//    lwa        LdStLWA
+//    lwarx      LdStLWARX
+//    lwaux      LdStLHA
+//    lwax       LdStLHA
+//    lwbrx      LdStGeneral
+//    lwz        LdStGeneral
+//    lwzu       LdStGeneral
+//    lwzux      LdStUX
+//    lwzx       LdStGeneral
+//    mcrf       BrMCR
+//    mcrfs      FPGeneral
+//    mcrxr      BrMCRX
+//    mfcr       SprMFCR
+//    mffs       IntMFFS
+//    mfmsr      SprMFMSR
+//    mfspr      SprMFSPR
+//    mfsr       SprMFSR
+//    mfsrin     SprMFSR
+//    mftb       SprMFTB
+//    mfvscr     IntMFVSCR
+//    mtcrf      BrMCRX
+//    mtfsb0     IntMTFSB0
+//    mtfsb1     IntMTFSB0
+//    mtfsf      IntMTFSB0
+//    mtfsfi     IntMTFSB0
+//    mtmsr      SprMTMSR
+//    mtmsrd     LdStLD
+//    mtspr      SprMTSPR
+//    mtsr       SprMTSR
+//    mtsrd      IntMTSRD
+//    mtsrdin    IntMTSRD
+//    mtsrin     SprMTSRIN
+//    mtvscr     IntMFVSCR
+//    mulhd      IntMulHD
+//    mulhdu     IntMulHD
+//    mulhw      IntMulHW
+//    mulhwu     IntMulHWU
+//    mulld      IntMulHD
+//    mulli      IntMulLI
+//    mullw      IntMulHW
+//    nand       IntGeneral
+//    neg        IntGeneral
+//    nor        IntGeneral
+//    or         IntGeneral
+//    orc        IntGeneral
+//    ori        IntGeneral
+//    oris       IntGeneral
+//    rfi        SprRFI
+//    rfid       IntRFID
+//    rldcl      IntRotateD
+//    rldcr      IntRotateD
+//    rldic      IntRotateD
+//    rldicl     IntRotateD
+//    rldicr     IntRotateD
+//    rldimi     IntRotateD
+//    rlwimi     IntRotate
+//    rlwinm     IntGeneral
+//    rlwnm      IntGeneral
+//    sc         SprSC
+//    slbia      LdStSLBIA
+//    slbie      LdStSLBIE
+//    sld        IntRotateD
+//    slw        IntGeneral
+//    srad       IntRotateD
+//    sradi      IntRotateD
+//    sraw       IntShift
+//    srawi      IntShift
+//    srd        IntRotateD
+//    srw        IntGeneral
+//    stb        LdStGeneral
+//    stbu       LdStGeneral
+//    stbux      LdStGeneral
+//    stbx       LdStGeneral
+//    std        LdStSTD
+//    stdcx.     LdStSTDCX
+//    stdu       LdStSTD
+//    stdux      LdStSTD
+//    stdx       LdStSTD
+//    stfd       LdStUX
+//    stfdu      LdStUX
+//    stfdux     LdStUX
+//    stfdx      LdStUX
+//    stfiwx     LdStUX
+//    stfs       LdStUX
+//    stfsu      LdStUX
+//    stfsux     LdStUX
+//    stfsx      LdStUX
+//    sth        LdStGeneral
+//    sthbrx     LdStGeneral
+//    sthu       LdStGeneral
+//    sthux      LdStGeneral
+//    sthx       LdStGeneral
+//    stmw       LdStLMW
+//    stswi      LdStLMW
+//    stswx      LdStLMW
+//    stvebx     LdStSTVEBX
+//    stvehx     LdStSTVEBX
+//    stvewx     LdStSTVEBX
+//    stvx       LdStSTVEBX
+//    stvxl      LdStSTVEBX
+//    stw        LdStGeneral
+//    stwbrx     LdStGeneral
+//    stwcx.     LdStSTWCX
+//    stwu       LdStGeneral
+//    stwux      LdStGeneral
+//    stwx       LdStGeneral
+//    subf       IntGeneral
+//    subfc      IntGeneral
+//    subfe      IntGeneral
+//    subfic     IntGeneral
+//    subfme     IntGeneral
+//    subfze     IntGeneral
+//    sync       LdStSync
+//    td         IntTrapD
+//    tdi        IntTrapD
+//    tlbia      LdStSLBIA
+//    tlbie      LdStDCBF
+//    tlbsync    SprTLBSYNC
+//    tw         IntTrapW
+//    twi        IntTrapW
+//    vaddcuw    VecGeneral
+//    vaddfp     VecFP
+//    vaddsbs    VecGeneral
+//    vaddshs    VecGeneral
+//    vaddsws    VecGeneral
+//    vaddubm    VecGeneral
+//    vaddubs    VecGeneral
+//    vadduhm    VecGeneral
+//    vadduhs    VecGeneral
+//    vadduwm    VecGeneral
+//    vadduws    VecGeneral
+//    vand       VecGeneral
+//    vandc      VecGeneral
+//    vavgsb     VecGeneral
+//    vavgsh     VecGeneral
+//    vavgsw     VecGeneral
+//    vavgub     VecGeneral
+//    vavguh     VecGeneral
+//    vavguw     VecGeneral
+//    vcfsx      VecFP
+//    vcfux      VecFP
+//    vcmpbfp    VecFPCompare
+//    vcmpeqfp   VecFPCompare
+//    vcmpequb   VecGeneral
+//    vcmpequh   VecGeneral
+//    vcmpequw   VecGeneral
+//    vcmpgefp   VecFPCompare
+//    vcmpgtfp   VecFPCompare
+//    vcmpgtsb   VecGeneral
+//    vcmpgtsh   VecGeneral
+//    vcmpgtsw   VecGeneral
+//    vcmpgtub   VecGeneral
+//    vcmpgtuh   VecGeneral
+//    vcmpgtuw   VecGeneral
+//    vctsxs     VecFP
+//    vctuxs     VecFP
+//    vexptefp   VecFP
+//    vlogefp    VecFP
+//    vmaddfp    VecFP
+//    vmaxfp     VecFPCompare
+//    vmaxsb     VecGeneral
+//    vmaxsh     VecGeneral
+//    vmaxsw     VecGeneral
+//    vmaxub     VecGeneral
+//    vmaxuh     VecGeneral
+//    vmaxuw     VecGeneral
+//    vmhaddshs  VecComplex
+//    vmhraddshs VecComplex
+//    vminfp     VecFPCompare
+//    vminsb     VecGeneral
+//    vminsh     VecGeneral
+//    vminsw     VecGeneral
+//    vminub     VecGeneral
+//    vminuh     VecGeneral
+//    vminuw     VecGeneral
+//    vmladduhm  VecComplex
+//    vmrghb     VecPerm
+//    vmrghh     VecPerm
+//    vmrghw     VecPerm
+//    vmrglb     VecPerm
+//    vmrglh     VecPerm
+//    vmrglw     VecPerm
+//    vmsubfp    VecFP
+//    vmsummbm   VecComplex
+//    vmsumshm   VecComplex
+//    vmsumshs   VecComplex
+//    vmsumubm   VecComplex
+//    vmsumuhm   VecComplex
+//    vmsumuhs   VecComplex
+//    vmulesb    VecComplex
+//    vmulesh    VecComplex
+//    vmuleub    VecComplex
+//    vmuleuh    VecComplex
+//    vmulosb    VecComplex
+//    vmulosh    VecComplex
+//    vmuloub    VecComplex
+//    vmulouh    VecComplex
+//    vnor       VecGeneral
+//    vor        VecGeneral
+//    vperm      VecPerm
+//    vpkpx      VecPerm
+//    vpkshss    VecPerm
+//    vpkshus    VecPerm
+//    vpkswss    VecPerm
+//    vpkswus    VecPerm
+//    vpkuhum    VecPerm
+//    vpkuhus    VecPerm
+//    vpkuwum    VecPerm
+//    vpkuwus    VecPerm
+//    vrefp      VecFPRound
+//    vrfim      VecFPRound
+//    vrfin      VecFPRound
+//    vrfip      VecFPRound
+//    vrfiz      VecFPRound
+//    vrlb       VecGeneral
+//    vrlh       VecGeneral
+//    vrlw       VecGeneral
+//    vrsqrtefp  VecFP
+//    vsel       VecGeneral
+//    vsl        VecVSL
+//    vslb       VecGeneral
+//    vsldoi     VecPerm
+//    vslh       VecGeneral
+//    vslo       VecPerm
+//    vslw       VecGeneral
+//    vspltb     VecPerm
+//    vsplth     VecPerm
+//    vspltisb   VecPerm
+//    vspltish   VecPerm
+//    vspltisw   VecPerm
+//    vspltw     VecPerm
+//    vsr        VecVSR
+//    vsrab      VecGeneral
+//    vsrah      VecGeneral
+//    vsraw      VecGeneral
+//    vsrb       VecGeneral
+//    vsrh       VecGeneral
+//    vsro       VecPerm
+//    vsrw       VecGeneral
+//    vsubcuw    VecGeneral
+//    vsubfp     VecFP
+//    vsubsbs    VecGeneral
+//    vsubshs    VecGeneral
+//    vsubsws    VecGeneral
+//    vsububm    VecGeneral
+//    vsububs    VecGeneral
+//    vsubuhm    VecGeneral
+//    vsubuhs    VecGeneral
+//    vsubuwm    VecGeneral
+//    vsubuws    VecGeneral
+//    vsum2sws   VecComplex
+//    vsum4sbs   VecComplex
+//    vsum4shs   VecComplex
+//    vsum4ubs   VecComplex
+//    vsumsws    VecComplex
+//    vupkhpx    VecPerm
+//    vupkhsb    VecPerm
+//    vupkhsh    VecPerm
+//    vupklpx    VecPerm
+//    vupklsb    VecPerm
+//    vupklsh    VecPerm
+//    vxor       VecGeneral
+//    xor        IntGeneral
+//    xori       IntGeneral
+//    xoris      IntGeneral
+//
diff --git a/lib/Target/PowerPC/PPCScheduleG3.td b/lib/Target/PowerPC/PPCScheduleG3.td
new file mode 100644
index 0000000..f72194d
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG3.td
@@ -0,0 +1,63 @@
+//===- PPCScheduleG3.td - PPC G3 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G3 (750) processor.
+//
+//===----------------------------------------------------------------------===//
+
+
+// Itinerary data for the G3 (750) processor: for each itinerary class, the
+// functional unit(s) that can execute it and the number of cycles occupied.
+def G3Itineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntDivW     , [InstrStage<19, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<6, [IU1]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU1]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [SRU]>]>,
+  InstrItinData<LdStDCBA    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<8, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<2, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<10, [FPU1]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG4.td b/lib/Target/PowerPC/PPCScheduleG4.td
new file mode 100644
index 0000000..92ed20f
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG4.td
@@ -0,0 +1,73 @@
+//===- PPCScheduleG4.td - PPC G4 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4 (7400) processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Itinerary data for the G4 (7400) processor: for each itinerary class, the
+// functional unit(s) that can execute it and the number of cycles occupied.
+def G4Itineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntDivW     , [InstrStage<19, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<3, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<6, [IU1]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU1]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<1, [SRU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<1, [SRU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStUX      , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<34, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<8, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<8, [SRU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [SRU]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<1, [SRU]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprRFI      , [InstrStage<2, [SRU]>]>,
+  InstrItinData<SprSC       , [InstrStage<2, [SRU]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<31, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<17, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<1, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<10, [FPU1]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecComplex  , [InstrStage<3, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<1, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecVSL      , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecVSR      , [InstrStage<1, [VIU1]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG4Plus.td b/lib/Target/PowerPC/PPCScheduleG4Plus.td
new file mode 100644
index 0000000..7474ba4
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG4Plus.td
@@ -0,0 +1,76 @@
+//===- PPCScheduleG4Plus.td - PPC G4+ Scheduling Defs. -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G4+ (7450) processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Itinerary data for the G4+ (7450) processor: for each itinerary class, the
+// functional unit(s) that can execute it and the number of cycles occupied.
+def G4PlusItineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntCompare  , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntDivW     , [InstrStage<23, [IU2]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<2, [VFPU]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<4, [IU2]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<4, [IU2]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<IntRotate   , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<2, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<2, [IU2]>]>,
+  InstrItinData<BrMCR       , [InstrStage<2, [IU2]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<2, [IU2]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDCBI    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<LdStUX      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<37, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWA     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<SprMFSR     , [InstrStage<4, [IU2]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<4, [IU2]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<5, [IU2]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMTSRIN   , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprRFI      , [InstrStage<1, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<SprSC       , [InstrStage<0, [IU1, IU2, IU3, IU4]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPCompare   , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPDivD      , [InstrStage<35, [FPU1]>]>,
+  InstrItinData<FPDivS      , [InstrStage<21, [FPU1]>]>,
+  InstrItinData<FPFused     , [InstrStage<5, [FPU1]>]>,
+  InstrItinData<FPRes       , [InstrStage<14, [FPU1]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<1, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<4, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+  InstrItinData<VecComplex  , [InstrStage<4, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<2, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<4, [VIU1]>]>,
+  InstrItinData<VecVSL      , [InstrStage<2, [VPU]>]>,
+  InstrItinData<VecVSR      , [InstrStage<2, [VPU]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCScheduleG5.td b/lib/Target/PowerPC/PPCScheduleG5.td
new file mode 100644
index 0000000..d282147
--- /dev/null
+++ b/lib/Target/PowerPC/PPCScheduleG5.td
@@ -0,0 +1,83 @@
+//===- PPCScheduleG5.td - PPC G5 Scheduling Definitions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the G5 (970) processor.
+//
+//===----------------------------------------------------------------------===//
+
+// Itinerary data for the G5 (970) processor: for each itinerary class, the
+// functional unit(s) that can execute it and the number of cycles occupied.
+def G5Itineraries : ProcessorItineraries<[
+  InstrItinData<IntGeneral  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntCompare  , [InstrStage<3, [IU1, IU2]>]>,
+  InstrItinData<IntDivD     , [InstrStage<68, [IU1]>]>,
+  InstrItinData<IntDivW     , [InstrStage<36, [IU1]>]>,
+  InstrItinData<IntMFFS     , [InstrStage<6, [IU2]>]>,
+  InstrItinData<IntMFVSCR   , [InstrStage<1, [VFPU]>]>,
+  InstrItinData<IntMTFSB0   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<IntMulHD    , [InstrStage<7, [IU1, IU2]>]>,
+  InstrItinData<IntMulHW    , [InstrStage<5, [IU1, IU2]>]>,
+  InstrItinData<IntMulHWU   , [InstrStage<5, [IU1, IU2]>]>,
+  InstrItinData<IntMulLI    , [InstrStage<4, [IU1, IU2]>]>,
+  InstrItinData<IntRFID     , [InstrStage<1, [IU2]>]>,
+  InstrItinData<IntRotateD  , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntRotate   , [InstrStage<4, [IU1, IU2]>]>,
+  InstrItinData<IntShift    , [InstrStage<2, [IU1, IU2]>]>,
+  InstrItinData<IntTrapD    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<IntTrapW    , [InstrStage<1, [IU1, IU2]>]>,
+  InstrItinData<BrB         , [InstrStage<1, [BPU]>]>,
+  InstrItinData<BrCR        , [InstrStage<4, [BPU]>]>,
+  InstrItinData<BrMCR       , [InstrStage<2, [BPU]>]>,
+  InstrItinData<BrMCRX      , [InstrStage<3, [BPU]>]>,
+  InstrItinData<LdStDCBF    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStGeneral , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStDSS     , [InstrStage<10, [SLU]>]>,
+  InstrItinData<LdStICBI    , [InstrStage<40, [SLU]>]>,
+  InstrItinData<LdStUX      , [InstrStage<4, [SLU]>]>,
+  InstrItinData<LdStLD      , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLDARX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStLFD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLFDU    , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLHA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLMW     , [InstrStage<64, [SLU]>]>,
+  InstrItinData<LdStLVecX   , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStLWA     , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStLWARX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSLBIA   , [InstrStage<40, [SLU]>]>, // needs work
+  InstrItinData<LdStSLBIE   , [InstrStage<2, [SLU]>]>,
+  InstrItinData<LdStSTD     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<LdStSTDCX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSTVEBX  , [InstrStage<5, [SLU]>]>,
+  InstrItinData<LdStSTWCX   , [InstrStage<11, [SLU]>]>,
+  InstrItinData<LdStSync    , [InstrStage<35, [SLU]>]>,
+  InstrItinData<SprISYNC    , [InstrStage<40, [SLU]>]>, // needs work
+  InstrItinData<SprMFSR     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMTMSR    , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMTSR     , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprTLBSYNC  , [InstrStage<3, [SLU]>]>,
+  InstrItinData<SprMFCR     , [InstrStage<2, [IU2]>]>,
+  InstrItinData<SprMFMSR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFSPR    , [InstrStage<3, [IU2]>]>,
+  InstrItinData<SprMFTB     , [InstrStage<10, [IU2]>]>,
+  InstrItinData<SprMTSPR    , [InstrStage<8, [IU2]>]>,
+  InstrItinData<SprSC       , [InstrStage<1, [IU2]>]>,
+  InstrItinData<FPGeneral   , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPCompare   , [InstrStage<8, [FPU1, FPU2]>]>,
+  InstrItinData<FPDivD      , [InstrStage<33, [FPU1, FPU2]>]>,
+  InstrItinData<FPDivS      , [InstrStage<33, [FPU1, FPU2]>]>,
+  InstrItinData<FPFused     , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPRes       , [InstrStage<6, [FPU1, FPU2]>]>,
+  InstrItinData<FPSqrt      , [InstrStage<40, [FPU1, FPU2]>]>,
+  InstrItinData<VecGeneral  , [InstrStage<2, [VIU1]>]>,
+  InstrItinData<VecFP       , [InstrStage<8, [VFPU]>]>,
+  InstrItinData<VecFPCompare, [InstrStage<2, [VFPU]>]>,
+  InstrItinData<VecComplex  , [InstrStage<5, [VIU2]>]>,
+  InstrItinData<VecPerm     , [InstrStage<3, [VPU]>]>,
+  InstrItinData<VecFPRound  , [InstrStage<8, [VFPU]>]>,
+  InstrItinData<VecVSL      , [InstrStage<2, [VIU1]>]>,
+  InstrItinData<VecVSR      , [InstrStage<3, [VPU]>]>
+]>;
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
new file mode 100644
index 0000000..40914ba
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -0,0 +1,138 @@
+//===- PowerPCSubtarget.cpp - PPC Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PPC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPCSubtarget.h"
+#include "PPC.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetMachine.h"
+#include "PPCGenSubtarget.inc"
+#include <cstdlib>
+using namespace llvm;
+
+#if defined(__APPLE__)
+#include <mach/mach.h>
+#include <mach/mach_host.h>
+#include <mach/host_info.h>
+#include <mach/machine.h>
+
+/// GetCurrentPowerPCCPU - Returns the name of the host's PowerPC CPU.
+static const char *GetCurrentPowerPCCPU() {
+  host_basic_info_data_t hostInfo;
+  mach_msg_type_number_t infoCount;
+
+  infoCount = HOST_BASIC_INFO_COUNT;
+  host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, 
+            &infoCount);
+            
+  if (hostInfo.cpu_type != CPU_TYPE_POWERPC) return "generic";
+
+  switch(hostInfo.cpu_subtype) {
+  case CPU_SUBTYPE_POWERPC_601:   return "601";
+  case CPU_SUBTYPE_POWERPC_602:   return "602";
+  case CPU_SUBTYPE_POWERPC_603:   return "603";
+  case CPU_SUBTYPE_POWERPC_603e:  return "603e";
+  case CPU_SUBTYPE_POWERPC_603ev: return "603ev";
+  case CPU_SUBTYPE_POWERPC_604:   return "604";
+  case CPU_SUBTYPE_POWERPC_604e:  return "604e";
+  case CPU_SUBTYPE_POWERPC_620:   return "620";
+  case CPU_SUBTYPE_POWERPC_750:   return "750";
+  case CPU_SUBTYPE_POWERPC_7400:  return "7400";
+  case CPU_SUBTYPE_POWERPC_7450:  return "7450";
+  case CPU_SUBTYPE_POWERPC_970:   return "970";
+  default: ;
+  }
+  
+  return "generic";
+}
+#endif
+
+
+PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &FS,
+                           bool is64Bit)
+  : StackAlignment(16)
+  , DarwinDirective(PPC::DIR_NONE)
+  , IsGigaProcessor(false)
+  , Has64BitSupport(false)
+  , Use64BitRegs(false)
+  , IsPPC64(is64Bit)
+  , HasAltivec(false)
+  , HasFSQRT(false)
+  , HasSTFIWX(false)
+  , HasLazyResolverStubs(false)
+  , DarwinVers(0) {
+
+  // Determine default and user specified characteristics
+  std::string CPU = "generic";
+#if defined(__APPLE__)
+  CPU = GetCurrentPowerPCCPU();
+#endif
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+
+  // If we are generating code for ppc64, verify that options make sense.
+  if (is64Bit) {
+    Has64BitSupport = true;
+    // Silently force 64-bit register use on ppc64.
+    Use64BitRegs = true;
+  }
+  
+  // If the user requested use of 64-bit regs, but the cpu selected doesn't
+  // support it, ignore.
+  if (use64BitRegs() && !has64BitSupport())
+    Use64BitRegs = false;
+  
+  // Set the boolean corresponding to the current target triple, or the default
+  // if one cannot be determined, to true.
+  if (TT.length() > 7) {
+    // Determine which version of darwin this is.
+    size_t DarwinPos = TT.find("-darwin");
+    if (DarwinPos != std::string::npos) {
+      if (isdigit(TT[DarwinPos+7]))
+        DarwinVers = atoi(&TT[DarwinPos+7]);
+      else
+        DarwinVers = 8;  // Minimum supported darwin is Tiger.
+    }
+  }
+
+  // Set up darwin-specific properties.
+  if (isDarwin())
+    HasLazyResolverStubs = true;
+}
+
+/// SetJITMode - This is called to inform the subtarget info that we are
+/// producing code for the JIT.
+void PPCSubtarget::SetJITMode() {
+  // JIT mode doesn't want lazy resolver stubs, it knows exactly where
+  // everything is.  This matters for PPC64, which codegens in PIC mode without
+  // stubs.
+  HasLazyResolverStubs = false;
+}
+
+
+/// hasLazyResolverStub - Return true if accesses to the specified global have
+/// to go through a dyld lazy resolution stub.  This means that an extra load
+/// is required to get the address of the global.
+bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV,
+                                       const TargetMachine &TM) const {
+  // We never have stubs if HasLazyResolverStubs=false or if in static mode.
+  if (!HasLazyResolverStubs || TM.getRelocationModel() == Reloc::Static)
+    return false;
+  // If symbol visibility is hidden, the extra load is not needed if
+  // the symbol is definitely defined in the current translation unit.
+  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+  if (GV->hasHiddenVisibility() && !isDecl && !GV->hasCommonLinkage())
+    return false;
+  return GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
+         GV->hasCommonLinkage() || isDecl;
+}
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
new file mode 100644
index 0000000..75fcf62
--- /dev/null
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -0,0 +1,147 @@
+//=====-- PPCSubtarget.h - Define Subtarget for the PPC -------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef POWERPCSUBTARGET_H
+#define POWERPCSUBTARGET_H
+
+#include "llvm/Target/TargetInstrItineraries.h"
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+// GCC #defines PPC on Linux but we use it as our namespace name
+#undef PPC
+
+namespace llvm {
+
+namespace PPC {
+  // -m directive values.
+  enum {
+    DIR_NONE,
+    DIR_32,
+    DIR_601, 
+    DIR_602, 
+    DIR_603, 
+    DIR_7400,
+    DIR_750, 
+    DIR_970, 
+    DIR_64  
+  };
+}
+
+class GlobalValue;
+class TargetMachine;
+  
+class PPCSubtarget : public TargetSubtarget {
+protected:
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned StackAlignment;
+  
+  /// Selected instruction itineraries (one entry per itinerary class.)
+  InstrItineraryData InstrItins;
+  
+  /// Which cpu directive was used.
+  unsigned DarwinDirective;
+
+  /// Used by the ISel to turn on optimizations for POWER4-derived architectures
+  bool IsGigaProcessor;
+  bool Has64BitSupport;
+  bool Use64BitRegs;
+  bool IsPPC64;
+  bool HasAltivec;
+  bool HasFSQRT;
+  bool HasSTFIWX;
+  bool HasLazyResolverStubs;
+  
+  /// DarwinVers - Nonzero if this is a darwin platform; if so, holds the numeric
+  /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
+  unsigned char DarwinVers; // Is any darwin-ppc platform.
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  PPCSubtarget(const std::string &TT, const std::string &FS, bool is64Bit);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+
+  
+  /// SetJITMode - This is called to inform the subtarget info that we are
+  /// producing code for the JIT.
+  void SetJITMode();
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return StackAlignment; }
+  
+  /// getDarwinDirective - Returns the -m directive specified for the cpu.
+  ///
+  unsigned getDarwinDirective() const { return DarwinDirective; }
+  
+  /// getInstrItineraryData - Return the instruction itineraries based on subtarget 
+  /// selection.
+  const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
+
+  /// getTargetDataString - Return the pointer size and type alignment
+  /// properties of this subtarget.
+  const char *getTargetDataString() const {
+    // Note, the alignment values for f64 and i64 on ppc64 in Darwin
+    // documentation are wrong; these are correct (i.e. "what gcc does").
+    return isPPC64() ? "E-p:64:64-f64:64:64-i64:64:64-f128:64:128-n32:64"
+                     : "E-p:32:32-f64:32:64-i64:32:64-f128:64:128-n32";
+  }
+
+  /// isPPC64 - Return true if we are generating code for 64-bit pointer mode.
+  ///
+  bool isPPC64() const { return IsPPC64; }
+  
+  /// has64BitSupport - Return true if the selected CPU supports 64-bit
+  /// instructions, regardless of whether we are in 32-bit or 64-bit mode.
+  bool has64BitSupport() const { return Has64BitSupport; }
+  
+  /// use64BitRegs - Return true if in 64-bit mode or if we should use 64-bit
+  /// registers in 32-bit mode when possible.  This can only be true if
+  /// has64BitSupport() returns true.
+  bool use64BitRegs() const { return Use64BitRegs; }
+  
+  /// hasLazyResolverStub - Return true if accesses to the specified global have
+  /// to go through a dyld lazy resolution stub.  This means that an extra load
+  /// is required to get the address of the global.
+  bool hasLazyResolverStub(const GlobalValue *GV, 
+                           const TargetMachine &TM) const;
+  
+  // Specific obvious features.
+  bool hasFSQRT() const { return HasFSQRT; }
+  bool hasSTFIWX() const { return HasSTFIWX; }
+  bool hasAltivec() const { return HasAltivec; }
+  bool isGigaProcessor() const { return IsGigaProcessor; }
+
+  /// isDarwin - True if this is any darwin platform.
+  bool isDarwin() const { return DarwinVers != 0; }
+  /// isDarwin9 - True if this is darwin9 (leopard, 10.5) or above.
+  bool isDarwin9() const { return DarwinVers >= 9; }
+
+  /// getDarwinVers - Return the darwin version number, 8 = tiger, 9 = leopard.
+  unsigned getDarwinVers() const { return DarwinVers; }
+
+  bool isDarwinABI() const { return isDarwin(); }
+  bool isSVR4ABI() const { return !isDarwin(); }
+
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
new file mode 100644
index 0000000..22eecd4
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -0,0 +1,146 @@
+//===-- PPCTargetMachine.cpp - Define TargetMachine for PowerPC -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Top-level implementation for the PowerPC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCMCAsmInfo.h"
+#include "PPCTargetMachine.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+static const MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  bool isPPC64 = TheTriple.getArch() == Triple::ppc64;
+  if (TheTriple.getOS() == Triple::Darwin)
+    return new PPCMCAsmInfoDarwin(isPPC64);
+  return new PPCLinuxMCAsmInfo(isPPC64);
+  
+}
+
+extern "C" void LLVMInitializePowerPCTarget() {
+  // Register the targets
+  RegisterTargetMachine<PPC32TargetMachine> A(ThePPC32Target);  
+  RegisterTargetMachine<PPC64TargetMachine> B(ThePPC64Target);
+  
+  RegisterAsmInfoFn C(ThePPC32Target, createMCAsmInfo);
+  RegisterAsmInfoFn D(ThePPC64Target, createMCAsmInfo);
+}
+
+
+PPCTargetMachine::PPCTargetMachine(const Target &T, const std::string &TT,
+                                   const std::string &FS, bool is64Bit)
+  : LLVMTargetMachine(T, TT),
+    Subtarget(TT, FS, is64Bit),
+    DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this),
+    FrameInfo(*this, is64Bit), JITInfo(*this, is64Bit), TLInfo(*this),
+    InstrItins(Subtarget.getInstrItineraryData()) {
+
+  if (getRelocationModel() == Reloc::Default) {
+    if (Subtarget.isDarwin())
+      setRelocationModel(Reloc::DynamicNoPIC);
+    else
+      setRelocationModel(Reloc::Static);
+  }
+}
+
+/// Override this for PowerPC.  Tail merging happily breaks up instruction issue
+/// groups, which typically degrades performance.
+bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; }
+
+PPC32TargetMachine::PPC32TargetMachine(const Target &T, const std::string &TT, 
+                                       const std::string &FS) 
+  : PPCTargetMachine(T, TT, FS, false) {
+}
+
+
+PPC64TargetMachine::PPC64TargetMachine(const Target &T, const std::string &TT, 
+                                       const std::string &FS)
+  : PPCTargetMachine(T, TT, FS, true) {
+}
+
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool PPCTargetMachine::addInstSelector(PassManagerBase &PM,
+                                       CodeGenOpt::Level OptLevel) {
+  // Install an instruction selector.
+  PM.add(createPPCISelDag(*this));
+  return false;
+}
+
+bool PPCTargetMachine::addPreEmitPass(PassManagerBase &PM,
+                                      CodeGenOpt::Level OptLevel) {
+  // Must run branch selection immediately preceding the asm printer.
+  PM.add(createPPCBranchSelectionPass());
+  return false;
+}
+
+bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                      CodeGenOpt::Level OptLevel,
+                                      JITCodeEmitter &JCE) {
+  // The JIT should use the static relocation model in ppc32 mode, PIC in ppc64.
+  // FIXME: This should be moved to TargetJITInfo!!
+  if (Subtarget.isPPC64()) {
+    // We use PIC codegen in ppc64 mode, because otherwise we'd have to use many
+    // instructions to materialize arbitrary global variable + function +
+    // constant pool addresses.
+    setRelocationModel(Reloc::PIC_);
+    // Temporary workaround for the inability of PPC64 JIT to handle jump
+    // tables.
+    DisableJumpTables = true;      
+  } else {
+    setRelocationModel(Reloc::Static);
+  }
+  
+  // Inform the subtarget that we are in JIT mode.  FIXME: does this break macho
+  // writing?
+  Subtarget.SetJITMode();
+  
+  // Machine code emitter pass for PowerPC.
+  PM.add(createPPCJITCodeEmitterPass(*this, JCE));
+
+  return false;
+}
+
+/// getLSDAEncoding - Returns the LSDA pointer encoding. The choices are 4-byte,
+/// 8-byte, and target default. The CIE is hard-coded to indicate that the LSDA
+/// pointer in the FDE section is an "sdata4", and should be encoded as a 4-byte
+/// pointer by default. However, some systems may require a different size due
+/// to bugs or other conditions. We will default to a 4-byte encoding unless the
+/// system tells us otherwise.
+///
+/// The issue is when the CIE says there is an LSDA. That mandates that every
+/// FDE have an LSDA slot. But if the function does not need an LSDA, there
+/// needs to be some way to signify there is none. The LSDA is encoded as
+/// pc-rel. But you don't look for some magic value after adding the pc. You
+/// have to look for a zero before adding the pc. The problem is that the size
+/// of the zero to look for depends on the encoding. The unwinder bug in SL is
+/// that it always checks for a pointer-size zero. So on x86_64 it looks for 8
+/// bytes of zero. If you have an LSDA, it works fine since the 8-bytes are
+/// non-zero so it goes ahead and then reads the value based on the encoding.
+/// But if you use sdata4 and there is no LSDA, then the test for zero gives a
+/// false negative and the unwinder thinks there is an LSDA.
+///
+/// FIXME: This call-back isn't good! We should be using the correct encoding
+/// regardless of the system. However, there are some systems which have bugs
+/// that prevent this from occurring.
+DwarfLSDAEncoding::Encoding PPCTargetMachine::getLSDAEncoding() const {
+  if (Subtarget.isDarwin() && Subtarget.getDarwinVers() != 10)
+    return DwarfLSDAEncoding::Default;
+
+  return DwarfLSDAEncoding::EightByte;
+}
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
new file mode 100644
index 0000000..a654435
--- /dev/null
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -0,0 +1,98 @@
+//===-- PPCTargetMachine.h - Define TargetMachine for PowerPC -----*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the PowerPC specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef PPC_TARGETMACHINE_H
+#define PPC_TARGETMACHINE_H
+
+#include "PPCFrameInfo.h"
+#include "PPCSubtarget.h"
+#include "PPCJITInfo.h"
+#include "PPCInstrInfo.h"
+#include "PPCISelLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+
+namespace llvm {
+class PassManager;
+class GlobalValue;
+
+/// PPCTargetMachine - Common code between 32-bit and 64-bit PowerPC targets.
+///
+class PPCTargetMachine : public LLVMTargetMachine {
+  PPCSubtarget        Subtarget;
+  const TargetData    DataLayout;       // Calculates type size & alignment
+  PPCInstrInfo        InstrInfo;
+  PPCFrameInfo        FrameInfo;
+  PPCJITInfo          JITInfo;
+  PPCTargetLowering   TLInfo;
+  InstrItineraryData  InstrItins;
+
+public:
+  PPCTargetMachine(const Target &T, const std::string &TT,
+                   const std::string &FS, bool is64Bit);
+
+  virtual const PPCInstrInfo     *getInstrInfo() const { return &InstrInfo; }
+  virtual const PPCFrameInfo     *getFrameInfo() const { return &FrameInfo; }
+  virtual       PPCJITInfo       *getJITInfo()         { return &JITInfo; }
+  virtual       PPCTargetLowering *getTargetLowering() const { 
+   return const_cast<PPCTargetLowering*>(&TLInfo); 
+  }
+  virtual const PPCRegisterInfo  *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  
+  virtual const TargetData    *getTargetData() const    { return &DataLayout; }
+  virtual const PPCSubtarget  *getSubtargetImpl() const { return &Subtarget; }
+  virtual const InstrItineraryData getInstrItineraryData() const {  
+    return InstrItins;
+  }
+
+  /// getLSDAEncoding - Returns the LSDA pointer encoding. The choices are
+  /// 4-byte, 8-byte, and target default. The CIE is hard-coded to indicate that
+  /// the LSDA pointer in the FDE section is an "sdata4", and should be encoded
+  /// as a 4-byte pointer by default. However, some systems may require a
+  /// different size due to bugs or other conditions. We will default to a
+  /// 4-byte encoding unless the system tells us otherwise.
+  ///
+  /// FIXME: This call-back isn't good! We should be using the correct encoding
+  /// regardless of the system. However, there are some systems which have bugs
+  /// that prevent this from occurring.
+  virtual DwarfLSDAEncoding::Encoding getLSDAEncoding() const;
+
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+                              JITCodeEmitter &JCE);
+  virtual bool getEnableTailMergeDefault() const;
+};
+
+/// PPC32TargetMachine - PowerPC 32-bit target machine.
+///
+class PPC32TargetMachine : public PPCTargetMachine {
+public:
+  PPC32TargetMachine(const Target &T, const std::string &TT,
+                     const std::string &FS);
+};
+
+/// PPC64TargetMachine - PowerPC 64-bit target machine.
+///
+class PPC64TargetMachine : public PPCTargetMachine {
+public:
+  PPC64TargetMachine(const Target &T, const std::string &TT,
+                     const std::string &FS);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/PowerPC/README.txt b/lib/Target/PowerPC/README.txt
new file mode 100644
index 0000000..8f265cf
--- /dev/null
+++ b/lib/Target/PowerPC/README.txt
@@ -0,0 +1,897 @@
+//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
+
+TODO:
+* gpr0 allocation
+* implement do-loop -> bdnz transform
+* lmw/stmw pass a la arm load store optimizer for prolog/epilog
+
+===-------------------------------------------------------------------------===
+
+On PPC64, this:
+
+long f2 (long x) { return 0xfffffff000000000UL; }
+long f3 (long x) { return 0x1ffffffffUL; }
+
+could compile into:
+
+_f2:
+	li r3,-1
+	rldicr r3,r3,0,27
+	blr
+_f3:
+	li r3,-1
+	rldicl r3,r3,0,31
+	blr
+
+we produce:
+
+_f2:
+	lis r2, 4095
+	ori r2, r2, 65535
+	sldi r3, r2, 36
+	blr 
+_f3:
+	li r2, 1
+	sldi r2, r2, 32
+	oris r2, r2, 65535
+	ori r3, r2, 65535
+	blr 
+
+
+===-------------------------------------------------------------------------===
+
+Support 'update' load/store instructions.  These are cracked on the G5, but are
+still a codesize win.
+
+With preinc enabled, this:
+
+long *%test4(long *%X, long *%dest) {
+        %Y = getelementptr long* %X, int 4
+        %A = load long* %Y
+        store long %A, long* %dest
+        ret long* %Y
+}
+
+compiles to:
+
+_test4:
+        mr r2, r3
+        lwzu r5, 32(r2)
+        lwz r3, 36(r3)
+        stw r5, 0(r4)
+        stw r3, 4(r4)
+        mr r3, r2
+        blr 
+
+with -sched=list-burr, I get:
+
+_test4:
+        lwz r2, 36(r3)
+        lwzu r5, 32(r3)
+        stw r2, 4(r4)
+        stw r5, 0(r4)
+        blr 
+
+===-------------------------------------------------------------------------===
+
+We compile the hottest inner loop of viterbi to:
+
+        li r6, 0
+        b LBB1_84       ;bb432.i
+LBB1_83:        ;bb420.i
+        lbzx r8, r5, r7
+        addi r6, r7, 1
+        stbx r8, r4, r7
+LBB1_84:        ;bb432.i
+        mr r7, r6
+        cmplwi cr0, r7, 143
+        bne cr0, LBB1_83        ;bb420.i
+
+The CBE manages to produce:
+
+	li r0, 143
+	mtctr r0
+loop:
+	lbzx r2, r2, r11
+	stbx r0, r2, r9
+	addi r2, r2, 1
+	bdz later
+	b loop
+
+This could be much better (bdnz instead of bdz) but it still beats us.  If we
+produced this with bdnz, the loop would be a single dispatch group.
+
+===-------------------------------------------------------------------------===
+
+Compile:
+
+void foo(int *P) {
+ if (P)  *P = 0;
+}
+
+into:
+
+_foo:
+        cmpwi cr0,r3,0
+        beqlr cr0
+        li r0,0
+        stw r0,0(r3)
+        blr
+
+This is effectively a simple form of predication.
+
+===-------------------------------------------------------------------------===
+
+Lump the constant pool for each function into ONE pic object, and reference
+pieces of it as offsets from the start.  For functions like this (contrived
+to have lots of constants obviously):
+
+double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
+
+We generate:
+
+_X:
+        lis r2, ha16(.CPI_X_0)
+        lfd f0, lo16(.CPI_X_0)(r2)
+        lis r2, ha16(.CPI_X_1)
+        lfd f2, lo16(.CPI_X_1)(r2)
+        fmadd f0, f1, f0, f2
+        lis r2, ha16(.CPI_X_2)
+        lfd f1, lo16(.CPI_X_2)(r2)
+        lis r2, ha16(.CPI_X_3)
+        lfd f2, lo16(.CPI_X_3)(r2)
+        fmadd f1, f0, f1, f2
+        blr
+
+It would be better to materialize .CPI_X into a register, then use immediates
+off of the register to avoid the lis's.  This is even more important in PIC 
+mode.
+
+Note that this (and the static variable version) is discussed here for GCC:
+http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
+
+Here's another example (the sgn function):
+double testf(double a) {
+       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+it produces a BB like this:
+LBB1_1: ; cond_true
+        lis r2, ha16(LCPI1_0)
+        lfs f0, lo16(LCPI1_0)(r2)
+        lis r2, ha16(LCPI1_1)
+        lis r3, ha16(LCPI1_2)
+        lfs f2, lo16(LCPI1_2)(r3)
+        lfs f3, lo16(LCPI1_1)(r2)
+        fsub f0, f0, f1
+        fsel f1, f0, f2, f3
+        blr 
+
+===-------------------------------------------------------------------------===
+
+PIC Code Gen IPO optimization:
+
+Squish small scalar globals together into a single global struct, allowing the 
+address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
+of the GOT on targets with one).
+
+Note that this is discussed here for GCC:
+http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
+
+===-------------------------------------------------------------------------===
+
+Implement Newton-Raphson method for improving estimate instructions to the
+correct accuracy, and implementing divide as multiply by reciprocal when it has
+more than one use.  Itanium would want this too.
+
+===-------------------------------------------------------------------------===
+
+Compile offsets from allocas:
+
+int *%test() {
+        %X = alloca { int, int }
+        %Y = getelementptr {int,int}* %X, int 0, uint 1
+        ret int* %Y
+}
+
+into a single add, not two:
+
+_test:
+        addi r2, r1, -8
+        addi r3, r2, 4
+        blr
+
+--> important for C++.
+
+===-------------------------------------------------------------------------===
+
+No loads or stores of the constants should be needed:
+
+struct foo { double X, Y; };
+void xxx(struct foo F);
+void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
+
+===-------------------------------------------------------------------------===
+
+Darwin Stub removal:
+
+We still generate calls to foo$stub, and stubs, on Darwin.  This is not
+necessary when building with the Leopard (10.5) or later linker, as stubs are
+generated by ld when necessary.  Parameterizing this based on the deployment
+target (-mmacosx-version-min) is probably enough.  x86-32 does this right, see
+its logic.
+
+===-------------------------------------------------------------------------===
+
+Darwin Stub LICM optimization:
+
+Loops like this:
+  
+  for (...)  bar();
+
+Have to go through an indirect stub if bar is external or linkonce.  It would 
+be better to compile it as:
+
+     fp = &bar;
+     for (...)  fp();
+
+which only computes the address of bar once (instead of each time through the 
+stub).  This is Darwin specific and would have to be done in the code generator.
+Probably not a win on x86.
+
+===-------------------------------------------------------------------------===
+
+Simple IPO for argument passing, change:
+  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
+
+the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
+of arguments get assigned to r3 through r10. That is, if you have a function
+foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
+argument bytes for r4 and r5. The trick then would be to shuffle the argument
+order for functions we can internalize so that the maximum number of 
+integers/pointers get passed in regs before you see any of the fp arguments.
+
+Instead of implementing this, it would actually probably be easier to just 
+implement a PPC fastcc, where we could do whatever we wanted to the CC, 
+including having this work sanely.
+
+===-------------------------------------------------------------------------===
+
+Fix Darwin FP-In-Integer Registers ABI
+
+Darwin passes doubles in structures in integer registers, which is very very 
+bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation 
+that percolates these things out of functions.
+
+Check out how horrible this is:
+http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
+
+This is an extension of "interprocedural CC unmunging" that can't be done with
+just fastcc.
+
+===-------------------------------------------------------------------------===
+
+Compile this:
+
+int foo(int a) {
+  int b = (a < 8);
+  if (b) {
+    return b * 3;     // ignore the fact that this is always 3.
+  } else {
+    return 2;
+  }
+}
+
+into something not this:
+
+_foo:
+1)      cmpwi cr7, r3, 8
+        mfcr r2, 1
+        rlwinm r2, r2, 29, 31, 31
+1)      cmpwi cr0, r3, 7
+        bgt cr0, LBB1_2 ; UnifiedReturnBlock
+LBB1_1: ; then
+        rlwinm r2, r2, 0, 31, 31
+        mulli r3, r2, 3
+        blr
+LBB1_2: ; UnifiedReturnBlock
+        li r3, 2
+        blr
+
+In particular, the two compares (marked 1) could be shared by reversing one.
+This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
+same operands (but backwards) exists.  In this case, this wouldn't save us 
+anything though, because the compares still wouldn't be shared.
+
+===-------------------------------------------------------------------------===
+
+We should custom expand setcc instead of pretending that we have it.  That
+would allow us to expose the access of the crbit after the mfcr, allowing
+that access to be trivially folded into other ops.  A simple example:
+
+int foo(int a, int b) { return (a < b) << 4; }
+
+compiles into:
+
+_foo:
+        cmpw cr7, r3, r4
+        mfcr r2, 1
+        rlwinm r2, r2, 29, 31, 31
+        slwi r3, r2, 4
+        blr
+
+===-------------------------------------------------------------------------===
+
+Fold add and sub with constant into non-extern, non-weak addresses so this:
+
+static int a;
+void bar(int b) { a = b; }
+void foo(unsigned char *c) {
+  *c = a;
+}
+
+So that 
+
+_foo:
+        lis r2, ha16(_a)
+        la r2, lo16(_a)(r2)
+        lbz r2, 3(r2)
+        stb r2, 0(r3)
+        blr
+
+Becomes
+
+_foo:
+        lis r2, ha16(_a+3)
+        lbz r2, lo16(_a+3)(r2)
+        stb r2, 0(r3)
+        blr
+
+===-------------------------------------------------------------------------===
+
+We generate really bad code for this:
+
+int f(signed char *a, _Bool b, _Bool c) {
+   signed char t = 0;
+  if (b)  t = *a;
+  if (c)  *a = t;
+}
+
+===-------------------------------------------------------------------------===
+
+This:
+int test(unsigned *P) { return *P >> 24; }
+
+Should compile to:
+
+_test:
+        lbz r3,0(r3)
+        blr
+
+not:
+
+_test:
+        lwz r2, 0(r3)
+        srwi r3, r2, 24
+        blr
+
+===-------------------------------------------------------------------------===
+
+On the G5, logical CR operations are more expensive in their three
+address form: ops that read/write the same register are half as expensive as
+those that read from two registers that are different from their destination.
+
+We should model this with two separate instructions.  The isel should generate
+the "two address" form of the instructions.  When the register allocator 
+detects that it needs to insert a copy due to the two-addressness of the CR
+logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
+we can convert to the "three address" instruction, to save code space.
+
+This only matters when we start generating cr logical ops.
+
+===-------------------------------------------------------------------------===
+
+We should compile these two functions to the same thing:
+
+#include <stdlib.h>
+void f(int a, int b, int *P) {
+  *P = (a-b)>=0?(a-b):(b-a);
+}
+void g(int a, int b, int *P) {
+  *P = abs(a-b);
+}
+
+Further, they should compile to something better than:
+
+_g:
+        subf r2, r4, r3
+        subfic r3, r2, 0
+        cmpwi cr0, r2, -1
+        bgt cr0, LBB2_2 ; entry
+LBB2_1: ; entry
+        mr r2, r3
+LBB2_2: ; entry
+        stw r2, 0(r5)
+        blr
+
+GCC produces:
+
+_g:
+        subf r4,r4,r3
+        srawi r2,r4,31
+        xor r0,r2,r4
+        subf r0,r2,r0
+        stw r0,0(r5)
+        blr
+
+... which is much nicer.
+
+This theoretically may help improve twolf slightly (used in dimbox.c:142?).
+
+===-------------------------------------------------------------------------===
+
+PR5945: This: 
+define i32 @clamp0g(i32 %a) {
+entry:
+        %cmp = icmp slt i32 %a, 0
+        %sel = select i1 %cmp, i32 0, i32 %a
+        ret i32 %sel
+}
+
+Is compiled to this with the PowerPC (32-bit) backend:
+
+_clamp0g:
+        cmpwi cr0, r3, 0
+        li r2, 0
+        blt cr0, LBB1_2
+; BB#1:                                                     ; %entry
+        mr r2, r3
+LBB1_2:                                                     ; %entry
+        mr r3, r2
+        blr
+
+This could be reduced to the much simpler:
+
+_clamp0g:
+        srawi r2, r3, 31
+        andc r3, r3, r2
+        blr
+
+===-------------------------------------------------------------------------===
+
+int foo(int N, int ***W, int **TK, int X) {
+  int t, i;
+  
+  for (t = 0; t < N; ++t)
+    for (i = 0; i < 4; ++i)
+      W[t / X][i][t % X] = TK[i][t];
+      
+  return 5;
+}
+
+We generate relatively atrocious code for this loop compared to gcc.
+
+We could also strength reduce the rem and the div:
+http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
+
+===-------------------------------------------------------------------------===
+
+float foo(float X) { return (int)(X); }
+
+Currently produces:
+
+_foo:
+        fctiwz f0, f1
+        stfd f0, -8(r1)
+        lwz r2, -4(r1)
+        extsw r2, r2
+        std r2, -16(r1)
+        lfd f0, -16(r1)
+        fcfid f0, f0
+        frsp f1, f0
+        blr
+
+We could use a target dag combine to turn the lwz/extsw into an lwa when the 
+lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
+win only.
+
+===-------------------------------------------------------------------------===
+
+We generate ugly code for this:
+
+void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
+  unsigned code = 0;
+  if(dx < -dw) code |= 1;
+  if(dx > dw)  code |= 2;
+  if(dy < -dw) code |= 4;
+  if(dy > dw)  code |= 8;
+  if(dz < -dw) code |= 16;
+  if(dz > dw)  code |= 32;
+  *ret = code;
+}
+
+===-------------------------------------------------------------------------===
+
+Complete the signed i32 to FP conversion code using 64-bit registers
+transformation, good for PI.  See PPCISelLowering.cpp, this comment:
+
+     // FIXME: disable this lowered code.  This generates 64-bit register values,
+     // and we don't model the fact that the top part is clobbered by calls.  We
+     // need to flag these together so that the value isn't live across a call.
+     //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+Also, if the registers are spilled to the stack, we have to ensure that all
+64-bits of them are saved/restored, otherwise we will miscompile the code.  It
+sounds like we need to get the 64-bit register classes going.
+
+===-------------------------------------------------------------------------===
+
+%struct.B = type { i8, [3 x i8] }
+
+define void @bar(%struct.B* %b) {
+entry:
+        %tmp = bitcast %struct.B* %b to i32*              ; <uint*> [#uses=1]
+        %tmp = load i32* %tmp          ; <uint> [#uses=1]
+        %tmp3 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=1]
+        %tmp4 = load i32* %tmp3                ; <uint> [#uses=1]
+        %tmp8 = bitcast %struct.B* %b to i32*             ; <uint*> [#uses=2]
+        %tmp9 = load i32* %tmp8                ; <uint> [#uses=1]
+        %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
+        %tmp1415 = and i32 %tmp4.mask17, 2147483648            ; <uint> [#uses=1]
+        %tmp.masked = and i32 %tmp, 2147483648         ; <uint> [#uses=1]
+        %tmp11 = or i32 %tmp1415, %tmp.masked          ; <uint> [#uses=1]
+        %tmp12 = and i32 %tmp9, 2147483647             ; <uint> [#uses=1]
+        %tmp13 = or i32 %tmp12, %tmp11         ; <uint> [#uses=1]
+        store i32 %tmp13, i32* %tmp8
+        ret void
+}
+
+We emit:
+
+_foo:
+        lwz r2, 0(r3)
+        slwi r4, r2, 1
+        or r4, r4, r2
+        rlwimi r2, r4, 0, 0, 0
+        stw r2, 0(r3)
+        blr
+
+We could collapse a bunch of those ORs and ANDs and generate the following
+equivalent code:
+
+_foo:
+        lwz r2, 0(r3)
+        rlwinm r4, r2, 1, 0, 0
+        or r2, r2, r4
+        stw r2, 0(r3)
+        blr
+
+===-------------------------------------------------------------------------===
+
+We compile:
+
+unsigned test6(unsigned x) { 
+  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
+}
+
+into:
+
+_test6:
+        lis r2, 255
+        rlwinm r3, r3, 16, 0, 31
+        ori r2, r2, 255
+        and r3, r3, r2
+        blr
+
+GCC gets it down to:
+
+_test6:
+        rlwinm r0,r3,16,8,15
+        rlwinm r3,r3,16,24,31
+        or r3,r3,r0
+        blr
+
+
+===-------------------------------------------------------------------------===
+
+Consider a function like this:
+
+float foo(float X) { return X + 1234.4123f; }
+
+The FP constant ends up in the constant pool, so we need to get the LR register.
+ This ends up producing code like this:
+
+_foo:
+.LBB_foo_0:     ; entry
+        mflr r11
+***     stw r11, 8(r1)
+        bl "L00000$pb"
+"L00000$pb":
+        mflr r2
+        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
+        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
+        fadds f1, f1, f0
+***     lwz r11, 8(r1)
+        mtlr r11
+        blr
+
+This is functional, but there is no reason to spill the LR register all the way
+to the stack (the two marked instrs): spilling it to a GPR is quite enough.
+
+Implementing this will require some codegen improvements.  Nate writes:
+
+"So basically what we need to support the "no stack frame save and restore" is a
+generalization of the LR optimization to "callee-save regs".
+
+Currently, we have LR marked as a callee-save reg.  The register allocator sees
+that it's callee save, and spills it directly to the stack.
+
+Ideally, something like this would happen:
+
+LR would be in a separate register class from the GPRs. The class of LR would be
+marked "unspillable".  When the register allocator came across an unspillable
+reg, it would ask "what is the best class to copy this into that I *can* spill"
+If it gets a class back, which it will in this case (the gprs), it grabs a free
+register of that class.  If it is then later necessary to spill that reg, so be
+it.
+
+===-------------------------------------------------------------------------===
+
+We compile this:
+int test(_Bool X) {
+  return X ? 524288 : 0;
+}
+
+to: 
+_test:
+        cmplwi cr0, r3, 0
+        lis r2, 8
+        li r3, 0
+        beq cr0, LBB1_2 ;entry
+LBB1_1: ;entry
+        mr r3, r2
+LBB1_2: ;entry
+        blr 
+
+instead of:
+_test:
+        addic r2,r3,-1
+        subfe r0,r2,r3
+        slwi r3,r0,19
+        blr
+
+This sort of thing occurs a lot due to globalopt.
+
+===-------------------------------------------------------------------------===
+
+We compile:
+
+define i32 @bar(i32 %x) nounwind readnone ssp {
+entry:
+  %0 = icmp eq i32 %x, 0                          ; <i1> [#uses=1]
+  %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
+  ret i32 %neg
+}
+
+to:
+
+_bar:
+	cntlzw r2, r3
+	slwi r2, r2, 26
+	srawi r3, r2, 31
+	blr 
+
+it would be better to produce:
+
+_bar: 
+        addic r3,r3,-1
+        subfe r3,r3,r3
+        blr
+
+===-------------------------------------------------------------------------===
+
+We currently compile 32-bit bswap:
+
+declare i32 @llvm.bswap.i32(i32 %A)
+define i32 @test(i32 %A) {
+        %B = call i32 @llvm.bswap.i32(i32 %A)
+        ret i32 %B
+}
+
+to:
+
+_test:
+        rlwinm r2, r3, 24, 16, 23
+        slwi r4, r3, 24
+        rlwimi r2, r3, 8, 24, 31
+        rlwimi r4, r3, 8, 8, 15
+        rlwimi r4, r2, 0, 16, 31
+        mr r3, r4
+        blr 
+
+it would be more efficient to produce:
+
+_foo:   mr r0,r3
+        rlwinm r3,r3,8,0xffffffff
+        rlwimi r3,r0,24,0,7
+        rlwimi r3,r0,24,16,23
+        blr
+
+===-------------------------------------------------------------------------===
+
+test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
+
+__ZNK4llvm5APInt17countLeadingZerosEv:
+        ld r2, 0(r3)
+        cntlzd r2, r2
+        or r2, r2, r2     <<-- silly.
+        addi r3, r2, -64
+        blr 
+
+The dead or is a 'truncate' from 64- to 32-bits.
+
+===-------------------------------------------------------------------------===
+
+We generate horrible ppc code for this:
+
+#define N  2000000
+double   a[N],c[N];
+void simpleloop() {
+   int j;
+   for (j=0; j<N; j++)
+     c[j] = a[j];
+}
+
+LBB1_1: ;bb
+        lfdx f0, r3, r4
+        addi r5, r5, 1                 ;; Extra IV for the exit value compare.
+        stfdx f0, r2, r4
+        addi r4, r4, 8
+
+        xoris r6, r5, 30               ;; This is due to a large immediate.
+        cmplwi cr0, r6, 33920
+        bne cr0, LBB1_1
+
+//===---------------------------------------------------------------------===//
+
+This:
+        #include <algorithm>
+        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+        { return std::make_pair(a + b, a + b < a); }
+        bool no_overflow(unsigned a, unsigned b)
+        { return !full_add(a, b).second; }
+
+Should compile to:
+
+__Z11no_overflowjj:
+        add r4,r3,r4
+        subfc r3,r3,r4
+        li r3,0
+        adde r3,r3,r3
+        blr
+
+(or better) not:
+
+__Z11no_overflowjj:
+        add r2, r4, r3
+        cmplw cr7, r2, r3
+        mfcr r2
+        rlwinm r2, r2, 29, 31, 31
+        xori r3, r2, 1
+        blr 
+
+//===---------------------------------------------------------------------===//
+
+We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
+example:
+#include <math.h>
+int test(double x, double y) { return islessequal(x, y);}
+int test2(double x, double y) {  return islessgreater(x, y);}
+int test3(double x, double y) {  return !islessequal(x, y);}
+
+Compiles into (all three are similar, but the bits differ):
+
+_test:
+	fcmpu cr7, f1, f2
+	mfcr r2
+	rlwinm r3, r2, 29, 31, 31
+	rlwinm r2, r2, 31, 31, 31
+	or r3, r2, r3
+	blr 
+
+GCC compiles this into:
+
+ _test:
+	fcmpu cr7,f1,f2
+	cror 30,28,30
+	mfcr r3
+	rlwinm r3,r3,31,1
+	blr
+        
+which is more efficient and can use mfocr.  See PR642 for some more context.
+
+//===---------------------------------------------------------------------===//
+
+void foo(float *data, float d) {
+   long i;
+   for (i = 0; i < 8000; i++)
+      data[i] = d;
+}
+void foo2(float *data, float d) {
+   long i;
+   data--;
+   for (i = 0; i < 8000; i++) {
+      data[1] = d;
+      data++;
+   }
+}
+
+These compile to:
+
+_foo:
+	li r2, 0
+LBB1_1:	; bb
+	addi r4, r2, 4
+	stfsx f1, r3, r2
+	cmplwi cr0, r4, 32000
+	mr r2, r4
+	bne cr0, LBB1_1	; bb
+	blr 
+_foo2:
+	li r2, 0
+LBB2_1:	; bb
+	addi r4, r2, 4
+	stfsx f1, r3, r2
+	cmplwi cr0, r4, 32000
+	mr r2, r4
+	bne cr0, LBB2_1	; bb
+	blr 
+
+The 'mr' could be eliminated by folding the add into the cmp better.
+
+//===---------------------------------------------------------------------===//
+Codegen for the following (low-probability) case deteriorated considerably 
+when the correctness fixes for unordered comparisons went in (PR 642, 58871).
+It should be possible to recover the code quality described in the comments.
+
+; RUN: llvm-as < %s | llc -march=ppc32  | grep or | count 3
+; This should produce one 'or' or 'cror' instruction per function.
+
+; RUN: llvm-as < %s | llc -march=ppc32  | grep mfcr | count 3
+; PR2964
+
+define i32 @test(double %x, double %y) nounwind  {
+entry:
+	%tmp3 = fcmp ole double %x, %y		; <i1> [#uses=1]
+	%tmp345 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
+	ret i32 %tmp345
+}
+
+define i32 @test2(double %x, double %y) nounwind  {
+entry:
+	%tmp3 = fcmp one double %x, %y		; <i1> [#uses=1]
+	%tmp345 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
+	ret i32 %tmp345
+}
+
+define i32 @test3(double %x, double %y) nounwind  {
+entry:
+	%tmp3 = fcmp ugt double %x, %y		; <i1> [#uses=1]
+	%tmp34 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
+	ret i32 %tmp34
+}
+//===----------------------------------------------------------------------===//
+; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
+
+; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and 
+; should not be generated except with -enable-finite-only-fp-math or the like).
+; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
+; recognize a more elaborate tree than a simple SETxx.
+
+define double @test_FNEG_sel(double %A, double %B, double %C) {
+        %D = sub double -0.000000e+00, %A               ; <double> [#uses=1]
+        %Cond = fcmp ugt double %D, -0.000000e+00               ; <i1> [#uses=1]
+        %E = select i1 %Cond, double %B, double %C              ; <double> [#uses=1]
+        ret double %E
+}
+
diff --git a/lib/Target/PowerPC/README_ALTIVEC.txt b/lib/Target/PowerPC/README_ALTIVEC.txt
new file mode 100644
index 0000000..1e4c6fb
--- /dev/null
+++ b/lib/Target/PowerPC/README_ALTIVEC.txt
@@ -0,0 +1,211 @@
+//===- README_ALTIVEC.txt - Notes for improving Altivec code gen ----------===//
+
+Implement PPCInstrInfo::isLoadFromStackSlot/isStoreToStackSlot for vector
+registers, to generate better spill code.
+
+//===----------------------------------------------------------------------===//
+
+The first should be a single lvx from the constant pool, the second should be 
+a xor/stvx:
+
+void foo(void) {
+  int x[8] __attribute__((aligned(128))) = { 1, 1, 1, 17, 1, 1, 1, 1 };
+  bar (x);
+}
+
+#include <string.h>
+void foo(void) {
+  int x[8] __attribute__((aligned(128)));
+  memset (x, 0, sizeof (x));
+  bar (x);
+}
+
+//===----------------------------------------------------------------------===//
+
+Altivec: Codegen'ing MUL with vector FMADD should add -0.0, not 0.0:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=8763
+
+When -ffast-math is on, we can use 0.0.
+
+//===----------------------------------------------------------------------===//
+
+  Consider this:
+  v4f32 Vector;
+  v4f32 Vector2 = { Vector.X, Vector.X, Vector.X, Vector.X };
+
+Since we know that "Vector" is 16-byte aligned and we know the element offset 
+of ".X", we should change the load into a lve*x instruction, instead of doing
+a load/store/lve*x sequence.
+
+//===----------------------------------------------------------------------===//
+
+For functions that use altivec AND have calls, we are VRSAVE'ing all call
+clobbered regs.
+
+//===----------------------------------------------------------------------===//
+
+Implement passing vectors by value into calls and receiving them as arguments.
+
+//===----------------------------------------------------------------------===//
+
+GCC apparently tries to codegen { C1, C2, Variable, C3 } as a constant pool load
+of C1/C2/C3, then a load and vperm of Variable.
+
+//===----------------------------------------------------------------------===//
+
+We need a way to teach tblgen that some operands of an intrinsic are required to
+be constants.  The verifier should enforce this constraint.
+
+//===----------------------------------------------------------------------===//
+
+We currently codegen SCALAR_TO_VECTOR as a store of the scalar to a 16-byte
+aligned stack slot, followed by a load/vperm.  We should probably just store it
+to a scalar stack slot, then use lvsl/vperm to load it.  If the value is already
+in memory this is a big win.
+
+//===----------------------------------------------------------------------===//
+
+extract_vector_elt of an arbitrary constant vector can be done with the 
+following instructions:
+
+vTemp = vec_splat(v0,2);    // 2 is the element the src is in.
+vec_ste(&destloc,0,vTemp);
+
+We can do an arbitrary non-constant value by using lvsr/perm/ste.
+
+//===----------------------------------------------------------------------===//
+
+If we want to tie instruction selection into the scheduler, we can do some
+constant formation with different instructions.  For example, we can generate
+"vsplti -1" with "vcmpequw R,R" and 1,1,1,1 with "vsubcuw R,R", and 0,0,0,0 with
+"vsplti 0" or "vxor", each of which use different execution units, thus could
+help scheduling.
+
+This is probably only reasonable for a post-pass scheduler.
+
+//===----------------------------------------------------------------------===//
+
+For this function:
+
+void test(vector float *A, vector float *B) {
+  vector float C = (vector float)vec_cmpeq(*A, *B);
+  if (!vec_any_eq(*A, *B))
+    *B = (vector float){0,0,0,0};
+  *A = C;
+}
+
+we get the following basic block:
+
+	...
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp v4, v3, v2
+        vcmpeqfp. v2, v3, v2
+        bne cr6, LBB1_2 ; cond_next
+
+The vcmpeqfp/vcmpeqfp. instructions currently cannot be merged when the
+vcmpeqfp. result is used by a branch.  This can be improved.
+
+//===----------------------------------------------------------------------===//
+
+The code generated for this is truly awful:
+
+vector float test(float a, float b) {
+ return (vector float){ 0.0, a, 0.0, 0.0}; 
+}
+
+LCPI1_0:                                        ;  float
+        .space  4
+        .text
+        .globl  _test
+        .align  4
+_test:
+        mfspr r2, 256
+        oris r3, r2, 4096
+        mtspr 256, r3
+        lis r3, ha16(LCPI1_0)
+        addi r4, r1, -32
+        stfs f1, -16(r1)
+        addi r5, r1, -16
+        lfs f0, lo16(LCPI1_0)(r3)
+        stfs f0, -32(r1)
+        lvx v2, 0, r4
+        lvx v3, 0, r5
+        vmrghw v3, v3, v2
+        vspltw v2, v2, 0
+        vmrghw v2, v2, v3
+        mtspr 256, r2
+        blr
+
+//===----------------------------------------------------------------------===//
+
+int foo(vector float *x, vector float *y) {
+        if (vec_all_eq(*x,*y)) return 3245; 
+        else return 12;
+}
+
+A predicate compare being used in a select_cc should have the same peephole
+applied to it as a predicate compare used by a br_cc.  There should be no
+mfcr here:
+
+_foo:
+        mfspr r2, 256
+        oris r5, r2, 12288
+        mtspr 256, r5
+        li r5, 12
+        li r6, 3245
+        lvx v2, 0, r4
+        lvx v3, 0, r3
+        vcmpeqfp. v2, v3, v2
+        mfcr r3, 2
+        rlwinm r3, r3, 25, 31, 31
+        cmpwi cr0, r3, 0
+        bne cr0, LBB1_2 ; entry
+LBB1_1: ; entry
+        mr r6, r5
+LBB1_2: ; entry
+        mr r3, r6
+        mtspr 256, r2
+        blr
+
+//===----------------------------------------------------------------------===//
+
+CodeGen/PowerPC/vec_constants.ll has an and operation that should be
+codegen'd to andc.  The issue is that the 'all ones' build vector is
+SelectNodeTo'd a VSPLTISB instruction node before the and/xor is selected
+which prevents the vnot pattern from matching.
+
+
+//===----------------------------------------------------------------------===//
+
+An alternative to the store/store/load approach for illegal insert element 
+lowering would be:
+
+1. store element to any ol' slot
+2. lvx the slot
+3. lvsl 0; splat index; vcmpeq to generate a select mask
+4. lvsl slot + x; vperm to rotate result into correct slot
+5. vsel result together.
+
+//===----------------------------------------------------------------------===//
+
+Should codegen branches on vec_any/vec_all to avoid mfcr.  Two examples:
+
+#include <altivec.h>
+ int f(vector float a, vector float b)
+ {
+  int aa = 0;
+  if (vec_all_ge(a, b))
+    aa |= 0x1;
+  if (vec_any_ge(a,b))
+    aa |= 0x2;
+  return aa;
+}
+
+vector float f(vector float a, vector float b) { 
+  if (vec_any_eq(a, b)) 
+    return a; 
+  else 
+    return b; 
+}
+
diff --git a/lib/Target/PowerPC/TargetInfo/CMakeLists.txt b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..058d599
--- /dev/null
+++ b/lib/Target/PowerPC/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMPowerPCInfo
+  PowerPCTargetInfo.cpp
+  )
+
+add_dependencies(LLVMPowerPCInfo PowerPCCodeGenTable_gen)
diff --git a/lib/Target/PowerPC/TargetInfo/Makefile b/lib/Target/PowerPC/TargetInfo/Makefile
new file mode 100644
index 0000000..a101aa4
--- /dev/null
+++ b/lib/Target/PowerPC/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/PowerPC/TargetInfo/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMPowerPCInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
new file mode 100644
index 0000000..ad607d0
--- /dev/null
+++ b/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- PowerPCTargetInfo.cpp - PowerPC Target Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::ThePPC32Target, llvm::ThePPC64Target;
+
+extern "C" void LLVMInitializePowerPCTargetInfo() { 
+  RegisterTarget<Triple::ppc, /*HasJIT=*/true>
+    X(ThePPC32Target, "ppc32", "PowerPC 32");
+
+  RegisterTarget<Triple::ppc64, /*HasJIT=*/true>
+    Y(ThePPC64Target, "ppc64", "PowerPC 64");
+}
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
new file mode 100644
index 0000000..4fd46a8
--- /dev/null
+++ b/lib/Target/README.txt
@@ -0,0 +1,1821 @@
+Target Independent Opportunities:
+
+//===---------------------------------------------------------------------===//
+
+Dead argument elimination should be enhanced to handle cases when an argument is
+dead to an externally visible function.  Though the argument can't be removed
+from the externally visible function, the caller doesn't need to pass it in.
+For example in this testcase:
+
+  void foo(int X) __attribute__((noinline));
+  void foo(int X) { sideeffect(); }
+  void bar(int A) { foo(A+1); }
+
+We compile bar to:
+
+define void @bar(i32 %A) nounwind ssp {
+  %0 = add nsw i32 %A, 1                          ; <i32> [#uses=1]
+  tail call void @foo(i32 %0) nounwind noinline ssp
+  ret void
+}
+
+The add is dead, we could pass in 'i32 undef' instead.  This occurs for C++
+templates etc, which usually have linkonce_odr/weak_odr linkage, not internal
+linkage.
+
+//===---------------------------------------------------------------------===//
+
+With the recent changes to make the implicit def/use set explicit in
+machineinstrs, we should change the target descriptions for 'call' instructions
+so that the .td files don't list all the call-clobbered registers as implicit
+defs.  Instead, these should be added by the code generator (e.g. on the dag).
+
+This has a number of uses:
+
+1. PPC32/64 and X86 32/64 can avoid having multiple copies of call instructions
+   for their different impdef sets.
+2. Targets with multiple calling convs (e.g. x86) which have different clobber
+   sets don't need copies of call instructions.
+3. 'Interprocedural register allocation' can be done to reduce the clobber sets
+   of calls.
+
+//===---------------------------------------------------------------------===//
+
+Make the PPC branch selector target independent
+
+//===---------------------------------------------------------------------===//
+
+Get the C front-end to expand hypot(x,y) -> llvm.sqrt(x*x+y*y) when errno and
+precision don't matter (ffastmath).  Misc/mandel will like this. :)  This isn't
+safe in general, even on darwin.  See the libm implementation of hypot for
+examples (which special case when x/y are exactly zero to get signed zeros etc
+right).
+
+//===---------------------------------------------------------------------===//
+
+Solve this DAG isel folding deficiency:
+
+int X, Y;
+
+void fn1(void)
+{
+  X = X | (Y << 3);
+}
+
+compiles to
+
+fn1:
+	movl Y, %eax
+	shll $3, %eax
+	orl X, %eax
+	movl %eax, X
+	ret
+
+The problem is the store's chain operand is not the load X but rather
+a TokenFactor of the load X and load Y, which prevents the folding.
+
+There are two ways to fix this:
+
+1. The dag combiner can start using alias analysis to realize that y/x
+   don't alias, making the store to X not dependent on the load from Y.
+2. The generated isel could be made smarter in the case it can't
+   disambiguate the pointers.
+
+Number 1 is the preferred solution.
+
+This has been "fixed" by a TableGen hack. But that is a short term workaround
+which will be removed once the proper fix is made.
+
+//===---------------------------------------------------------------------===//
+
+On targets with expensive 64-bit multiply, we could LSR this:
+
+for (i = ...; ++i) {
+   x = 1ULL << i;
+
+into:
+ long long tmp = 1;
+ for (i = ...; ++i, tmp+=tmp)
+   x = tmp;
+
+This would be a win on ppc32, but not x86 or ppc64.
+
+//===---------------------------------------------------------------------===//
+
+Shrink: (setlt (loadi32 P), 0) -> (setlt (loadi8 Phi), 0)
+
+//===---------------------------------------------------------------------===//
+
+Reassociate should turn things like:
+
+int factorial(int X) {
+ return X*X*X*X*X*X*X*X;
+}
+
+into llvm.powi calls, allowing the code generator to produce balanced
+multiplication trees.
+
+First, the intrinsic needs to be extended to support integers, and second the
+code generator needs to be enhanced to lower these to multiplication trees.
+
+//===---------------------------------------------------------------------===//
+
+Interesting? testcase for add/shift/mul reassoc:
+
+int bar(int x, int y) {
+  return x*x*x+y+x*x*x*x*x*y*y*y*y;
+}
+int foo(int z, int n) {
+  return bar(z, n) + bar(2*z, 2*n);
+}
+
+This is blocked on not handling X*X*X -> powi(X, 3) (see note above).  The issue
+is that we end up getting t = 2*X  s = t*t   and don't turn this into 4*X*X,
+which is the same number of multiplies and is canonical, because the 2*X has
+multiple uses.  Here's a simple example:
+
+define i32 @test15(i32 %X1) {
+  %B = mul i32 %X1, 47   ; X1*47
+  %C = mul i32 %B, %B
+  ret i32 %C
+}
+
+
+//===---------------------------------------------------------------------===//
+
+Reassociate should handle the example in GCC PR16157:
+
+extern int a0, a1, a2, a3, a4; extern int b0, b1, b2, b3, b4; 
+void f () {  /* this can be optimized to four additions... */ 
+        b4 = a4 + a3 + a2 + a1 + a0; 
+        b3 = a3 + a2 + a1 + a0; 
+        b2 = a2 + a1 + a0; 
+        b1 = a1 + a0; 
+} 
+
+This requires reassociating to forms of expressions that are already available,
+something that reassoc doesn't think about yet.
+
+
+//===---------------------------------------------------------------------===//
+
+This function: (derived from GCC PR19988)
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x + -0.1234 * y));
+}
+
+compiles to:
+_foo:
+	movapd	%xmm1, %xmm2
+	mulsd	LCPI1_1(%rip), %xmm1
+	mulsd	LCPI1_0(%rip), %xmm2
+	addsd	%xmm0, %xmm1
+	addsd	%xmm0, %xmm2
+	movapd	%xmm1, %xmm0
+	mulsd	%xmm2, %xmm0
+	ret
+
+Reassociate should be able to turn it into:
+
+double foo(double x, double y) {
+  return ((x + 0.1234 * y) * (x - 0.1234 * y));
+}
+
+Which allows the multiply by constant to be CSE'd, producing:
+
+_foo:
+	mulsd	LCPI1_0(%rip), %xmm1
+	movapd	%xmm1, %xmm2
+	addsd	%xmm0, %xmm2
+	subsd	%xmm1, %xmm0
+	mulsd	%xmm2, %xmm0
+	ret
+
+This doesn't need -ffast-math support at all.  This is particularly bad because
+the llvm-gcc frontend is canonicalizing the latter into the former, but clang
+doesn't have this problem.
+
+//===---------------------------------------------------------------------===//
+
+These two functions should generate the same code on big-endian systems:
+
+int g(int *j,int *l)  {  return memcmp(j,l,4);  }
+int h(int *j, int *l) {  return *j - *l; }
+
+this could be done in SelectionDAGISel.cpp, along with other special cases,
+for 1,2,4,8 bytes.
+
+//===---------------------------------------------------------------------===//
+
+It would be nice to revert this patch:
+http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20060213/031986.html
+
+And teach the dag combiner enough to simplify the code expanded before 
+legalize.  It seems plausible that this knowledge would let it simplify other
+stuff too.
+
+//===---------------------------------------------------------------------===//
+
+For vector types, TargetData.cpp::getTypeInfo() returns alignment that is equal
+to the type size. It works but can be overly conservative as the alignment of
+specific vector types are target dependent.
+
+//===---------------------------------------------------------------------===//
+
+We should produce an unaligned load from code like this:
+
+v4sf example(float *P) {
+  return (v4sf){P[0], P[1], P[2], P[3] };
+}
+
+//===---------------------------------------------------------------------===//
+
+Add support for conditional increments, and other related patterns.  Instead
+of:
+
+	movl 136(%esp), %eax
+	cmpl $0, %eax
+	je LBB16_2	#cond_next
+LBB16_1:	#cond_true
+	incl _foo
+LBB16_2:	#cond_next
+
+emit:
+	movl	_foo, %eax
+	cmpl	$1, %edi
+	sbbl	$-1, %eax
+	movl	%eax, _foo
+
+//===---------------------------------------------------------------------===//
+
+Combine: a = sin(x), b = cos(x) into a,b = sincos(x).
+
+Expand these to calls of sin/cos and stores:
+      double sincos(double x, double *sin, double *cos);
+      float sincosf(float x, float *sin, float *cos);
+      long double sincosl(long double x, long double *sin, long double *cos);
+
+Doing so could allow SROA of the destination pointers.  See also:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17687
+
+This is now easily doable with MRVs.  We could even make an intrinsic for this
+if anyone cared enough about sincos.
+
+//===---------------------------------------------------------------------===//
+
+Turn this into a single byte store with no load (the other 3 bytes are
+unmodified):
+
+define void @test(i32* %P) {
+	%tmp = load i32* %P
+        %tmp14 = or i32 %tmp, 3305111552
+        %tmp15 = and i32 %tmp14, 3321888767
+        store i32 %tmp15, i32* %P
+        ret void
+}
+
+//===---------------------------------------------------------------------===//
+
+quantum_sigma_x in 462.libquantum contains the following loop:
+
+      for(i=0; i<reg->size; i++)
+	{
+	  /* Flip the target bit of each basis state */
+	  reg->node[i].state ^= ((MAX_UNSIGNED) 1 << target);
+	} 
+
+Where MAX_UNSIGNED/state is a 64-bit int.  On a 32-bit platform it would be just
+so cool to turn it into something like:
+
+   long long Res = ((MAX_UNSIGNED) 1 << target);
+   if (target < 32) {
+     for(i=0; i<reg->size; i++)
+       reg->node[i].state ^= Res & 0xFFFFFFFFULL;
+   } else {
+     for(i=0; i<reg->size; i++)
+       reg->node[i].state ^= Res & 0xFFFFFFFF00000000ULL
+   }
+   
+... which would only do one 32-bit XOR per loop iteration instead of two.
+
+It would also be nice to recognize the reg->size doesn't alias reg->node[i], but
+this requires TBAA.
+
+//===---------------------------------------------------------------------===//
+
+This isn't recognized as bswap by instcombine (yes, it really is bswap):
+
+unsigned long reverse(unsigned v) {
+    unsigned t;
+    t = v ^ ((v << 16) | (v >> 16));
+    t &= ~0xff0000;
+    v = (v << 24) | (v >> 8);
+    return v ^ (t >> 8);
+}
+
+//===---------------------------------------------------------------------===//
+
+[LOOP RECOGNITION]
+
+These idioms should be recognized as popcount (see PR1488):
+
+unsigned countbits_slow(unsigned v) {
+  unsigned c;
+  for (c = 0; v; v >>= 1)
+    c += v & 1;
+  return c;
+}
+unsigned countbits_fast(unsigned v){
+  unsigned c;
+  for (c = 0; v; c++)
+    v &= v - 1; // clear the least significant bit set
+  return c;
+}
+
+BITBOARD = unsigned long long
+int PopCnt(register BITBOARD a) {
+  register int c=0;
+  while(a) {
+    c++;
+    a &= a - 1;
+  }
+  return c;
+}
+unsigned int popcount(unsigned int input) {
+  unsigned int count = 0;
+  for (unsigned int i =  0; i < 4 * 8; i++)
+    count += (input >> i) & 1;
+  return count;
+}
+
+This is a form of idiom recognition for loops, the same thing that could be
+useful for recognizing memset/memcpy.
+
+//===---------------------------------------------------------------------===//
+
+These should turn into single 16-bit (unaligned?) loads on little/big endian
+processors.
+
+unsigned short read_16_le(const unsigned char *adr) {
+  return adr[0] | (adr[1] << 8);
+}
+unsigned short read_16_be(const unsigned char *adr) {
+  return (adr[0] << 8) | adr[1];
+}
+
+//===---------------------------------------------------------------------===//
+
+-instcombine should handle this transform:
+   icmp pred (sdiv X / C1 ), C2
+when X, C1, and C2 are unsigned.  Similarly for udiv and signed operands. 
+
+Currently InstCombine avoids this transform but will do it when the signs of
+the operands and the sign of the divide match. See the FIXME in 
+InstructionCombining.cpp in the visitSetCondInst method after the switch case 
+for Instruction::UDiv (around line 4447) for more details.
+
+The SingleSource/Benchmarks/Shootout-C++/hash and hash2 tests have examples of
+this construct. 
+
+//===---------------------------------------------------------------------===//
+
+[LOOP RECOGNITION]
+
+viterbi speeds up *significantly* if the various "history" related copy loops
+are turned into memcpy calls at the source level.  We need a "loops to memcpy"
+pass.
+
+//===---------------------------------------------------------------------===//
+
+[LOOP OPTIMIZATION]
+
+SingleSource/Benchmarks/Misc/dt.c shows several interesting optimization
+opportunities in its double_array_divs_variable function: it needs loop
+interchange, memory promotion (which LICM already does), vectorization and
+variable trip count loop unrolling (since it has a constant trip count). ICC
+apparently produces this very nice code with -ffast-math:
+
+..B1.70:                        # Preds ..B1.70 ..B1.69
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       mulpd     %xmm0, %xmm1                                  #108.2
+       addl      $8, %edx                                      #
+       cmpl      $131072, %edx                                 #108.2
+       jb        ..B1.70       # Prob 99%                      #108.2
+
+It would be better to count down to zero, but this is a lot better than what we
+do.
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+typedef unsigned U32;
+typedef unsigned long long U64;
+int test (U32 *inst, U64 *regs) {
+    U64 effective_addr2;
+    U32 temp = *inst;
+    int r1 = (temp >> 20) & 0xf;
+    int b2 = (temp >> 16) & 0xf;
+    effective_addr2 = temp & 0xfff;
+    if (b2) effective_addr2 += regs[b2];
+    b2 = (temp >> 12) & 0xf;
+    if (b2) effective_addr2 += regs[b2];
+    effective_addr2 &= regs[4];
+     if ((effective_addr2 & 3) == 0)
+        return 1;
+    return 0;
+}
+
+Note that only the low 2 bits of effective_addr2 are used.  On 32-bit systems,
+we don't eliminate the computation of the top half of effective_addr2 because
+we don't have whole-function selection dags.  On x86, this means we use one
+extra register for the function when effective_addr2 is declared as U64 than
+when it is declared U32.
+
+PHI Slicing could be extended to do this.
+
+//===---------------------------------------------------------------------===//
+
+LSR should know what GPR types a target has from TargetData.  This code:
+
+volatile short X, Y; // globals
+
+void foo(int N) {
+  int i;
+  for (i = 0; i < N; i++) { X = i; Y = i*4; }
+}
+
+produces two near identical IV's (after promotion) on PPC/ARM:
+
+LBB1_2:
+	ldr r3, LCPI1_0
+	ldr r3, [r3]
+	strh r2, [r3]
+	ldr r3, LCPI1_1
+	ldr r3, [r3]
+	strh r1, [r3]
+	add r1, r1, #4
+	add r2, r2, #1   <- [0,+,1]
+	sub r0, r0, #1   <- [0,-,1]
+	cmp r0, #0
+	bne LBB1_2
+
+LSR should reuse the "+" IV for the exit test.
+
+//===---------------------------------------------------------------------===//
+
+Tail call elim should be more aggressive, checking to see if the call is
+followed by an uncond branch to an exit block.
+
+; This testcase is due to tail-duplication not wanting to copy the return
+; instruction into the terminating blocks because there was other code
+; optimized out of the function after the taildup happened.
+; RUN: llvm-as < %s | opt -tailcallelim | llvm-dis | not grep call
+
+define i32 @t4(i32 %a) {
+entry:
+	%tmp.1 = and i32 %a, 1		; <i32> [#uses=1]
+	%tmp.2 = icmp ne i32 %tmp.1, 0		; <i1> [#uses=1]
+	br i1 %tmp.2, label %then.0, label %else.0
+
+then.0:		; preds = %entry
+	%tmp.5 = add i32 %a, -1		; <i32> [#uses=1]
+	%tmp.3 = call i32 @t4( i32 %tmp.5 )		; <i32> [#uses=1]
+	br label %return
+
+else.0:		; preds = %entry
+	%tmp.7 = icmp ne i32 %a, 0		; <i1> [#uses=1]
+	br i1 %tmp.7, label %then.1, label %return
+
+then.1:		; preds = %else.0
+	%tmp.11 = add i32 %a, -2		; <i32> [#uses=1]
+	%tmp.9 = call i32 @t4( i32 %tmp.11 )		; <i32> [#uses=1]
+	br label %return
+
+return:		; preds = %then.1, %else.0, %then.0
+	%result.0 = phi i32 [ 0, %else.0 ], [ %tmp.3, %then.0 ],
+                            [ %tmp.9, %then.1 ]
+	ret i32 %result.0
+}
+
+//===---------------------------------------------------------------------===//
+
+Tail recursion elimination should handle:
+
+int pow2m1(int n) {
+ if (n == 0)
+   return 0;
+ return 2 * pow2m1 (n - 1) + 1;
+}
+
+Also, multiplies can be turned into SHL's, so they should be handled as if
+they were associative.  "return foo() << 1" can be tail recursion eliminated.
+
+//===---------------------------------------------------------------------===//
+
+Argument promotion should promote arguments for recursive functions, like 
+this:
+
+; RUN: llvm-as < %s | opt -argpromotion | llvm-dis | grep x.val
+
+define internal i32 @foo(i32* %x) {
+entry:
+	%tmp = load i32* %x		; <i32> [#uses=0]
+	%tmp.foo = call i32 @foo( i32* %x )		; <i32> [#uses=1]
+	ret i32 %tmp.foo
+}
+
+define i32 @bar(i32* %x) {
+entry:
+	%tmp3 = call i32 @foo( i32* %x )		; <i32> [#uses=1]
+	ret i32 %tmp3
+}
+
+//===---------------------------------------------------------------------===//
+
+We should investigate an instruction sinking pass.  Consider this silly
+example in pic mode:
+
+#include <assert.h>
+void foo(int x) {
+  assert(x);
+  //...
+}
+
+we compile this to:
+_foo:
+	subl	$28, %esp
+	call	"L1$pb"
+"L1$pb":
+	popl	%eax
+	cmpl	$0, 32(%esp)
+	je	LBB1_2	# cond_true
+LBB1_1:	# return
+	# ...
+	addl	$28, %esp
+	ret
+LBB1_2:	# cond_true
+...
+
+The PIC base computation (call+popl) is only used on one path through the 
+code, but is currently always computed in the entry block.  It would be 
+better to sink the picbase computation down into the block for the 
+assertion, as it is the only one that uses it.  This happens for a lot of 
+code with early outs.
+
+Another example is loads of arguments, which are usually emitted into the 
+entry block on targets like x86.  If not used in all paths through a 
+function, they should be sunk into the ones that do.
+
+In this case, whole-function-isel would also handle this.
+
+//===---------------------------------------------------------------------===//
+
+Investigate lowering of sparse switch statements into perfect hash tables:
+http://burtleburtle.net/bob/hash/perfect.html
+
+//===---------------------------------------------------------------------===//
+
+We should turn things like "load+fabs+store" and "load+fneg+store" into the
+corresponding integer operations.  On a yonah, this loop:
+
+double a[256];
+void foo() {
+  int i, b;
+  for (b = 0; b < 10000000; b++)
+  for (i = 0; i < 256; i++)
+    a[i] = -a[i];
+}
+
+is twice as slow as this loop:
+
+long long a[256];
+void foo() {
+  int i, b;
+  for (b = 0; b < 10000000; b++)
+  for (i = 0; i < 256; i++)
+    a[i] ^= (1ULL << 63);
+}
+
+and I suspect other processors are similar.  On X86 in particular this is a
+big win because doing this with integers allows the use of read/modify/write
+instructions.
+
+//===---------------------------------------------------------------------===//
+
+DAG Combiner should try to combine small loads into larger loads when 
+profitable.  For example, we compile this C++ example:
+
+struct THotKey { short Key; bool Control; bool Shift; bool Alt; };
+extern THotKey m_HotKey;
+THotKey GetHotKey () { return m_HotKey; }
+
+into (-O3 -fno-exceptions -static -fomit-frame-pointer):
+
+__Z9GetHotKeyv:
+	pushl	%esi
+	movl	8(%esp), %eax
+	movb	_m_HotKey+3, %cl
+	movb	_m_HotKey+4, %dl
+	movb	_m_HotKey+2, %ch
+	movw	_m_HotKey, %si
+	movw	%si, (%eax)
+	movb	%ch, 2(%eax)
+	movb	%cl, 3(%eax)
+	movb	%dl, 4(%eax)
+	popl	%esi
+	ret	$4
+
+GCC produces:
+
+__Z9GetHotKeyv:
+	movl	_m_HotKey, %edx
+	movl	4(%esp), %eax
+	movl	%edx, (%eax)
+	movzwl	_m_HotKey+4, %edx
+	movw	%dx, 4(%eax)
+	ret	$4
+
+The LLVM IR contains the needed alignment info, so we should be able to 
+merge the loads and stores into 4-byte loads:
+
+	%struct.THotKey = type { i16, i8, i8, i8 }
+define void @_Z9GetHotKeyv(%struct.THotKey* sret  %agg.result) nounwind  {
+...
+	%tmp2 = load i16* getelementptr (@m_HotKey, i32 0, i32 0), align 8
+	%tmp5 = load i8* getelementptr (@m_HotKey, i32 0, i32 1), align 2
+	%tmp8 = load i8* getelementptr (@m_HotKey, i32 0, i32 2), align 1
+	%tmp11 = load i8* getelementptr (@m_HotKey, i32 0, i32 3), align 2
+
+Alternatively, we should use a small amount of base-offset alias analysis
+to make it so the scheduler doesn't need to hold all the loads in regs at
+once.
+
+//===---------------------------------------------------------------------===//
+
+We should add an FRINT node to the DAG to model targets that have legal
+implementations of ceil/floor/rint.
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+int test() {
+  long long input[8] = {1,1,1,1,1,1,1,1};
+  foo(input);
+}
+
+We currently compile this into a memcpy from a global array since the 
+initializer is fairly large and not memset'able.  This is good, but the memcpy
+gets lowered to load/stores in the code generator.  This is also ok, except
+that the codegen lowering for memcpy doesn't handle the case when the source
+is a constant global.  This gives us atrocious code like this:
+
+	call	"L1$pb"
+"L1$pb":
+	popl	%eax
+	movl	_C.0.1444-"L1$pb"+32(%eax), %ecx
+	movl	%ecx, 40(%esp)
+	movl	_C.0.1444-"L1$pb"+20(%eax), %ecx
+	movl	%ecx, 28(%esp)
+	movl	_C.0.1444-"L1$pb"+36(%eax), %ecx
+	movl	%ecx, 44(%esp)
+	movl	_C.0.1444-"L1$pb"+44(%eax), %ecx
+	movl	%ecx, 52(%esp)
+	movl	_C.0.1444-"L1$pb"+40(%eax), %ecx
+	movl	%ecx, 48(%esp)
+	movl	_C.0.1444-"L1$pb"+12(%eax), %ecx
+	movl	%ecx, 20(%esp)
+	movl	_C.0.1444-"L1$pb"+4(%eax), %ecx
+...
+
+instead of:
+	movl	$1, 16(%esp)
+	movl	$0, 20(%esp)
+	movl	$1, 24(%esp)
+	movl	$0, 28(%esp)
+	movl	$1, 32(%esp)
+	movl	$0, 36(%esp)
+	...
+
+//===---------------------------------------------------------------------===//
+
+http://llvm.org/PR717:
+
+The following code should compile into "ret int undef". Instead, LLVM
+produces "ret int 0":
+
+int f() {
+  int x = 4;
+  int y;
+  if (x == 3) y = 0;
+  return y;
+}
+
+//===---------------------------------------------------------------------===//
+
+The loop unroller should partially unroll loops (instead of peeling them)
+when code growth isn't too bad and when an unroll count allows simplification
+of some code within the loop.  One trivial example is:
+
+#include <stdio.h>
+int main() {
+    int nRet = 17;
+    int nLoop;
+    for ( nLoop = 0; nLoop < 1000; nLoop++ ) {
+        if ( nLoop & 1 )
+            nRet += 2;
+        else
+            nRet -= 1;
+    }
+    return nRet;
+}
+
+Unrolling by 2 would eliminate the '&1' in both copies, leading to a net
+reduction in code size.  The resultant code would then also be suitable for
+exit value computation.
+
+//===---------------------------------------------------------------------===//
+
+We miss a bunch of rotate opportunities on various targets, including ppc, x86,
+etc.  On X86, we miss a bunch of 'rotate by variable' cases because the rotate
+matching code in dag combine doesn't look through truncates aggressively 
+enough.  Here are some testcases reduces from GCC PR17886:
+
+unsigned long long f(unsigned long long x, int y) {
+  return (x << y) | (x >> 64-y); 
+} 
+unsigned f2(unsigned x, int y){
+  return (x << y) | (x >> 32-y); 
+} 
+unsigned long long f3(unsigned long long x){
+  int y = 9;
+  return (x << y) | (x >> 64-y); 
+} 
+unsigned f4(unsigned x){
+  int y = 10;
+  return (x << y) | (x >> 32-y); 
+}
+unsigned long long f5(unsigned long long x, unsigned long long y) {
+  return (x << 8) | ((y >> 48) & 0xffull);
+}
+unsigned long long f6(unsigned long long x, unsigned long long y, int z) {
+  switch(z) {
+  case 1:
+    return (x << 8) | ((y >> 48) & 0xffull);
+  case 2:
+    return (x << 16) | ((y >> 40) & 0xffffull);
+  case 3:
+    return (x << 24) | ((y >> 32) & 0xffffffull);
+  case 4:
+    return (x << 32) | ((y >> 24) & 0xffffffffull);
+  default:
+    return (x << 40) | ((y >> 16) & 0xffffffffffull);
+  }
+}
+
+On X86-64, we only handle f2/f3/f4 right.  On x86-32, a few of these 
+generate truly horrible code, instead of using shld and friends.  On
+ARM, we end up with calls to L___lshrdi3/L___ashldi3 in f, which is
+badness.  PPC64 misses f, f5 and f6.  CellSPU aborts in isel.
+
+//===---------------------------------------------------------------------===//
+
+We do a number of simplifications in simplify libcalls to strength reduce
+standard library functions, but we don't currently merge them together.  For
+example, it is useful to merge memcpy(a,b,strlen(b)) -> strcpy.  This can only
+be done safely if "b" isn't modified between the strlen and memcpy of course.
+
+//===---------------------------------------------------------------------===//
+
+We compile this program: (from GCC PR11680)
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=4487
+
+Into code that runs the same speed in fast/slow modes, but both modes run 2x
+slower than when compile with GCC (either 4.0 or 4.2):
+
+$ llvm-g++ perf.cpp -O3 -fno-exceptions
+$ time ./a.out fast
+1.821u 0.003s 0:01.82 100.0%	0+0k 0+0io 0pf+0w
+
+$ g++ perf.cpp -O3 -fno-exceptions
+$ time ./a.out fast
+0.821u 0.001s 0:00.82 100.0%	0+0k 0+0io 0pf+0w
+
+It looks like we are making the same inlining decisions, so this may be raw
+codegen badness or something else (haven't investigated).
+
+//===---------------------------------------------------------------------===//
+
+We miss some instcombines for stuff like this:
+void bar (void);
+void foo (unsigned int a) {
+  /* This one is equivalent to a >= (3 << 2).  */
+  if ((a >> 2) >= 3)
+    bar ();
+}
+
+A few other related ones are in GCC PR14753.
+
+//===---------------------------------------------------------------------===//
+
+Divisibility by constant can be simplified (according to GCC PR12849) from
+being a mulhi to being a mul lo (cheaper).  Testcase:
+
+void bar(unsigned n) {
+  if (n % 3 == 0)
+    true();
+}
+
+This is equivalent to the following, where 2863311531 is the multiplicative
+inverse of 3, and 1431655766 is ((2^32)-1)/3+1:
+void bar(unsigned n) {
+  if (n * 2863311531U < 1431655766U)
+    true();
+}
+
+The same transformation can work with an even modulo with the addition of a
+rotate: rotate the result of the multiply to the right by the number of bits
+which need to be zero for the condition to be true, and shrink the compare RHS
+by the same amount.  Unless the target supports rotates, though, that
+transformation probably isn't worthwhile.
+
+The transformation can also easily be made to work with non-zero equality
+comparisons: just transform, for example, "n % 3 == 1" to "(n-1) % 3 == 0".
+
+//===---------------------------------------------------------------------===//
+
+Better mod/ref analysis for scanf would allow us to eliminate the vtable and a
+bunch of other stuff from this example (see PR1604): 
+
+#include <cstdio>
+struct test {
+    int val;
+    virtual ~test() {}
+};
+
+int main() {
+    test t;
+    std::scanf("%d", &t.val);
+    std::printf("%d\n", t.val);
+}
+
+//===---------------------------------------------------------------------===//
+
+These functions perform the same computation, but produce different assembly.
+
+define i8 @select(i8 %x) readnone nounwind {
+  %A = icmp ult i8 %x, 250
+  %B = select i1 %A, i8 0, i8 1
+  ret i8 %B 
+}
+
+define i8 @addshr(i8 %x) readnone nounwind {
+  %A = zext i8 %x to i9
+  %B = add i9 %A, 6       ;; 256 - 250 == 6
+  %C = lshr i9 %B, 8
+  %D = trunc i9 %C to i8
+  ret i8 %D
+}
+
+//===---------------------------------------------------------------------===//
+
+From gcc bug 24696:
+int
+f (unsigned long a, unsigned long b, unsigned long c)
+{
+  return ((a & (c - 1)) != 0) || ((b & (c - 1)) != 0);
+}
+int
+f (unsigned long a, unsigned long b, unsigned long c)
+{
+  return ((a & (c - 1)) != 0) | ((b & (c - 1)) != 0);
+}
+Both should combine to ((a|b) & (c-1)) != 0.  Currently not optimized with
+"clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 20192:
+#define PMD_MASK    (~((1UL << 23) - 1))
+void clear_pmd_range(unsigned long start, unsigned long end)
+{
+   if (!(start & ~PMD_MASK) && !(end & ~PMD_MASK))
+       f();
+}
+The expression should optimize to something like
+"!((start|end)&~PMD_MASK)". Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 3756:
+int
+pn (int n)
+{
+ return (n >= 0 ? 1 : -1);
+}
+Should combine to (n >> 31) | 1.  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts | llc".
+
+//===---------------------------------------------------------------------===//
+
+void a(int variable)
+{
+ if (variable == 4 || variable == 6)
+   bar();
+}
+This should optimize to "if ((variable | 2) == 6)".  Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts | llc".
+
+//===---------------------------------------------------------------------===//
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return
+i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+These should combine to the same thing.  Currently, the first function
+produces better code on X86.
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 15784:
+#define abs(x) x>0?x:-x
+int f(int x, int y)
+{
+ return (abs(x)) >= 0;
+}
+This should optimize to x == INT_MIN. (With -fwrapv.)  Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 14753:
+void
+rotate_cst (unsigned int a)
+{
+ a = (a << 10) | (a >> 22);
+ if (a == 123)
+   bar ();
+}
+void
+minus_cst (unsigned int a)
+{
+ unsigned int tem;
+
+ tem = 20 - a;
+ if (tem == 5)
+   bar ();
+}
+void
+mask_gt (unsigned int a)
+{
+ /* This is equivalent to a > 15.  */
+ if ((a & ~7) > 8)
+   bar ();
+}
+void
+rshift_gt (unsigned int a)
+{
+ /* This is equivalent to a > 23.  */
+ if ((a >> 2) > 5)
+   bar ();
+}
+All should simplify to a single comparison.  All of these are
+currently not optimized with "clang -emit-llvm-bc | opt
+-std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+From GCC Bug 32605:
+int c(int* x) {return (char*)x+2 == (char*)x;}
+Should combine to 0.  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts" (although llc can optimize it).
+
+//===---------------------------------------------------------------------===//
+
+int a(unsigned b) {return ((b << 31) | (b << 30)) >> 31;}
+Should be combined to  "((b >> 1) | b) & 1".  Currently not optimized
+with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(unsigned x, unsigned y) { return x | (y & 1) | (y & 2);}
+Should combine to "x | (y & 3)".  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b, int c) {return (~a & c) | ((c|a) & b);}
+Should fold to "(~a & c) | (a & b)".  Currently not optimized with
+"clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a,int b) {return (~(a|b))|a;}
+Should fold to "a|~b".  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b) {return (a&&b) || (a&&!b);}
+Should fold to "a".  Currently not optimized with "clang -emit-llvm-bc
+| opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b, int c) {return (a&&b) || (!a&&c);}
+Should fold to "a ? b : c", or at least something sane.  Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int a, int b, int c) {return (a&&b) || (a&&c) || (a&&b&&c);}
+Should fold to a && (b || c).  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return x | ((x & 8) ^ 8);}
+Should combine to x | 8.  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return x ^ ((x & 8) ^ 8);}
+Should also combine to x | 8.  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return (x & 8) == 0 ? -1 : -9;}
+Should combine to (x | -9) ^ 8.  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return (x & 8) == 0 ? -9 : -1;}
+Should combine to x | -9.  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+int a(int x) {return ((x | -9) ^ 8) & x;}
+Should combine to x & -9.  Currently not optimized with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(unsigned a) {return a * 0x11111111 >> 28 & 1;}
+Should combine to "a * 0x88888888 >> 31".  Currently not optimized
+with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(char* x) {if ((*x & 32) == 0) return b();}
+There's an unnecessary zext in the generated code with "clang
+-emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+unsigned a(unsigned long long x) {return 40 * (x >> 1);}
+Should combine to "20 * (((unsigned)x) & -2)".  Currently not
+optimized with "clang -emit-llvm-bc | opt -std-compile-opts".
+
+//===---------------------------------------------------------------------===//
+
+This was noticed in the entryblock for grokdeclarator in 403.gcc:
+
+        %tmp = icmp eq i32 %decl_context, 4          
+        %decl_context_addr.0 = select i1 %tmp, i32 3, i32 %decl_context 
+        %tmp1 = icmp eq i32 %decl_context_addr.0, 1 
+        %decl_context_addr.1 = select i1 %tmp1, i32 0, i32 %decl_context_addr.0
+
+tmp1 should be simplified to something like:
+  (!tmp && decl_context == 1)
+
+This allows recursive simplifications, tmp1 is used all over the place in
+the function, e.g. by:
+
+        %tmp23 = icmp eq i32 %decl_context_addr.1, 0            ; <i1> [#uses=1]
+        %tmp24 = xor i1 %tmp1, true             ; <i1> [#uses=1]
+        %or.cond8 = and i1 %tmp23, %tmp24               ; <i1> [#uses=1]
+
+later.
+
+//===---------------------------------------------------------------------===//
+
+[STORE SINKING]
+
+Store sinking: This code:
+
+void f (int n, int *cond, int *res) {
+    int i;
+    *res = 0;
+    for (i = 0; i < n; i++)
+        if (*cond)
+            *res ^= 234; /* (*) */
+}
+
+On this function GVN hoists the fully redundant value of *res, but nothing
+moves the store out.  This gives us this code:
+
+bb:		; preds = %bb2, %entry
+	%.rle = phi i32 [ 0, %entry ], [ %.rle6, %bb2 ]	
+	%i.05 = phi i32 [ 0, %entry ], [ %indvar.next, %bb2 ]
+	%1 = load i32* %cond, align 4
+	%2 = icmp eq i32 %1, 0
+	br i1 %2, label %bb2, label %bb1
+
+bb1:		; preds = %bb
+	%3 = xor i32 %.rle, 234	
+	store i32 %3, i32* %res, align 4
+	br label %bb2
+
+bb2:		; preds = %bb, %bb1
+	%.rle6 = phi i32 [ %3, %bb1 ], [ %.rle, %bb ]	
+	%indvar.next = add i32 %i.05, 1	
+	%exitcond = icmp eq i32 %indvar.next, %n
+	br i1 %exitcond, label %return, label %bb
+
+DSE should sink partially dead stores to get the store out of the loop.
+
+Here's another partial dead case:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=12395
+
+//===---------------------------------------------------------------------===//
+
+Scalar PRE hoists the mul in the common block up to the else:
+
+int test (int a, int b, int c, int g) {
+  int d, e;
+  if (a)
+    d = b * c;
+  else
+    d = b - c;
+  e = b * c + g;
+  return d + e;
+}
+
+It would be better to do the mul once to reduce codesize above the if.
+This is GCC PR38204.
+
+//===---------------------------------------------------------------------===//
+
+[STORE SINKING]
+
+GCC PR37810 is an interesting case where we should sink load/store reload
+into the if block and outside the loop, so we don't reload/store it on the
+non-call path.
+
+for () {
+  *P += 1;
+  if ()
+    call();
+  else
+    ...
+->
+tmp = *P
+for () {
+  tmp += 1;
+  if () {
+    *P = tmp;
+    call();
+    tmp = *P;
+  } else ...
+}
+*P = tmp;
+
+We now hoist the reload after the call (Transforms/GVN/lpre-call-wrap.ll), but
+we don't sink the store.  We need partially dead store sinking.
+
+//===---------------------------------------------------------------------===//
+
+[LOAD PRE CRIT EDGE SPLITTING]
+
+GCC PR37166: Sinking of loads prevents SROA'ing the "g" struct on the stack
+leading to excess stack traffic. This could be handled by GVN with some crazy
+symbolic phi translation.  The code we get looks like (g is on the stack):
+
+bb2:		; preds = %bb1
+..
+	%9 = getelementptr %struct.f* %g, i32 0, i32 0		
+	store i32 %8, i32* %9, align 4
+	br label %bb3
+
+bb3:		; preds = %bb1, %bb2, %bb
+	%c_addr.0 = phi %struct.f* [ %g, %bb2 ], [ %c, %bb ], [ %c, %bb1 ]
+	%b_addr.0 = phi %struct.f* [ %b, %bb2 ], [ %g, %bb ], [ %b, %bb1 ]
+	%10 = getelementptr %struct.f* %c_addr.0, i32 0, i32 0
+	%11 = load i32* %10, align 4
+
+%11 is partially redundant, and in BB2 it should have the value %8.
+
+GCC PR33344 and PR35287 are similar cases.
+
+
+//===---------------------------------------------------------------------===//
+
+[LOAD PRE]
+
+There are many load PRE testcases in testsuite/gcc.dg/tree-ssa/loadpre* in the
+GCC testsuite, ones we don't get yet are (checked through loadpre25):
+
+[CRIT EDGE BREAKING]
+loadpre3.c predcom-4.c
+
+[PRE OF READONLY CALL]
+loadpre5.c
+
+[TURN SELECT INTO BRANCH]
+loadpre14.c loadpre15.c 
+
+actually a conditional increment: loadpre18.c loadpre19.c
+
+
+//===---------------------------------------------------------------------===//
+
+[SCALAR PRE]
+There are many PRE testcases in testsuite/gcc.dg/tree-ssa/ssa-pre-*.c in the
+GCC testsuite.
+
+//===---------------------------------------------------------------------===//
+
+There are some interesting cases in testsuite/gcc.dg/tree-ssa/pred-comm* in the
+GCC testsuite.  For example, we get the first example in predcom-1.c, but 
+miss the second one:
+
+unsigned fib[1000];
+unsigned avg[1000];
+
+__attribute__ ((noinline))
+void count_averages(int n) {
+  int i;
+  for (i = 1; i < n; i++)
+    avg[i] = (((unsigned long) fib[i - 1] + fib[i] + fib[i + 1]) / 3) & 0xffff;
+}
+
+which compiles into two loads instead of one in the loop.
+
+predcom-2.c is the same as predcom-1.c
+
+predcom-3.c is very similar but needs loads feeding each other instead of
+store->load.
+
+
+//===---------------------------------------------------------------------===//
+
+[ALIAS ANALYSIS]
+
+Type based alias analysis:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14705
+
+We should do better analysis of posix_memalign.  At the least it should
+no-capture its pointer argument, at best, we should know that the out-value
+result doesn't point to anything (like malloc).  One example of this is in
+SingleSource/Benchmarks/Misc/dt.c
+
+//===---------------------------------------------------------------------===//
+
+A/B get pinned to the stack because we turn an if/then into a select instead
+of PRE'ing the load/store.  This may be fixable in instcombine:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=37892
+
+struct X { int i; };
+int foo (int x) {
+  struct X a;
+  struct X b;
+  struct X *p;
+  a.i = 1;
+  b.i = 2;
+  if (x)
+    p = &a;
+  else
+    p = &b;
+  return p->i;
+}
+
+//===---------------------------------------------------------------------===//
+
+Interesting missed case because of control flow flattening (should be 2 loads):
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=26629
+With: llvm-gcc t2.c -S -o - -O0 -emit-llvm | llvm-as | 
+             opt -mem2reg -gvn -instcombine | llvm-dis
+we miss it because we need 1) CRIT EDGE 2) MULTIPLE DIFFERENT
+VALS PRODUCED BY ONE BLOCK OVER DIFFERENT PATHS
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19633
+We could eliminate the branch condition here, loading from null is undefined:
+
+struct S { int w, x, y, z; };
+struct T { int r; struct S s; };
+void bar (struct S, int);
+void foo (int a, struct T b)
+{
+  struct S *c = 0;
+  if (a)
+    c = &b.s;
+  bar (*c, a);
+}
+
+//===---------------------------------------------------------------------===//
+
+simplifylibcalls should do several optimizations for strspn/strcspn:
+
+strcspn(x, "") -> strlen(x)
+strcspn("", x) -> 0
+strspn("", x) -> 0
+strspn(x, "") -> strlen(x)
+strspn(x, "a") -> strchr(x, 'a')-x
+
+strcspn(x, "a") -> inlined loop for up to 3 letters (similarly for strspn):
+
+size_t __strcspn_c3 (__const char *__s, int __reject1, int __reject2,
+                     int __reject3) {
+  register size_t __result = 0;
+  while (__s[__result] != '\0' && __s[__result] != __reject1 &&
+         __s[__result] != __reject2 && __s[__result] != __reject3)
+    ++__result;
+  return __result;
+}
+
+This should turn into a switch on the character.  See PR3253 for some notes on
+codegen.
+
+456.hmmer apparently uses strcspn and strspn a lot.  471.omnetpp uses strspn.
+
+//===---------------------------------------------------------------------===//
+
+"gas" uses this idiom:
+  else if (strchr ("+-/*%|&^:[]()~", *intel_parser.op_string))
+..
+  else if (strchr ("<>", *intel_parser.op_string)
+
+Those should be turned into a switch.
+
+//===---------------------------------------------------------------------===//
+
+252.eon contains this interesting code:
+
+        %3072 = getelementptr [100 x i8]* %tempString, i32 0, i32 0
+        %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
+        %strlen = call i32 @strlen(i8* %3072)    ; uses = 1
+        %endptr = getelementptr [100 x i8]* %tempString, i32 0, i32 %strlen
+        call void @llvm.memcpy.i32(i8* %endptr, 
+          i8* getelementptr ([5 x i8]* @"\01LC42", i32 0, i32 0), i32 5, i32 1)
+        %3074 = call i32 @strlen(i8* %endptr) nounwind readonly 
+        
+This is interesting for a couple reasons.  First, in this:
+
+        %3073 = call i8* @strcpy(i8* %3072, i8* %3071) nounwind
+        %strlen = call i32 @strlen(i8* %3072)  
+
+The strlen could be replaced with: %strlen = sub %3072, %3073, because the
+strcpy call returns a pointer to the end of the string.  Based on that, the
+endptr GEP just becomes equal to 3073, which eliminates a strlen call and GEP.
+
+Second, the memcpy+strlen strlen can be replaced with:
+
+        %3074 = call i32 @strlen([5 x i8]* @"\01LC42") nounwind readonly 
+
+Because the destination was just copied into the specified memory buffer.  This,
+in turn, can be constant folded to "4".
+
+In other code, it contains:
+
+        %endptr6978 = bitcast i8* %endptr69 to i32*            
+        store i32 7107374, i32* %endptr6978, align 1
+        %3167 = call i32 @strlen(i8* %endptr69) nounwind readonly    
+
+Which could also be constant folded.  Whatever is producing this should probably
+be fixed to leave this as a memcpy from a string.
+
+Further, eon also has an interesting partially redundant strlen call:
+
+bb8:            ; preds = %_ZN18eonImageCalculatorC1Ev.exit
+        %682 = getelementptr i8** %argv, i32 6          ; <i8**> [#uses=2]
+        %683 = load i8** %682, align 4          ; <i8*> [#uses=4]
+        %684 = load i8* %683, align 1           ; <i8> [#uses=1]
+        %685 = icmp eq i8 %684, 0               ; <i1> [#uses=1]
+        br i1 %685, label %bb10, label %bb9
+
+bb9:            ; preds = %bb8
+        %686 = call i32 @strlen(i8* %683) nounwind readonly          
+        %687 = icmp ugt i32 %686, 254           ; <i1> [#uses=1]
+        br i1 %687, label %bb10, label %bb11
+
+bb10:           ; preds = %bb9, %bb8
+        %688 = call i32 @strlen(i8* %683) nounwind readonly          
+
+This could be eliminated by doing the strlen once in bb8, saving code size and
+improving perf on the bb8->9->10 path.
+
+//===---------------------------------------------------------------------===//
+
+I see an interesting fully redundant call to strlen left in 186.crafty:InputMove
+which looks like:
+       %movetext11 = getelementptr [128 x i8]* %movetext, i32 0, i32 0 
+ 
+
+bb62:           ; preds = %bb55, %bb53
+        %promote.0 = phi i32 [ %169, %bb55 ], [ 0, %bb53 ]             
+        %171 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
+        %172 = add i32 %171, -1         ; <i32> [#uses=1]
+        %173 = getelementptr [128 x i8]* %movetext, i32 0, i32 %172       
+
+...  no stores ...
+       br i1 %or.cond, label %bb65, label %bb72
+
+bb65:           ; preds = %bb62
+        store i8 0, i8* %173, align 1
+        br label %bb72
+
+bb72:           ; preds = %bb65, %bb62
+        %trank.1 = phi i32 [ %176, %bb65 ], [ -1, %bb62 ]            
+        %177 = call i32 @strlen(i8* %movetext11) nounwind readonly align 1
+
+Note that on the bb62->bb72 path, that the %177 strlen call is partially
+redundant with the %171 call.  At worst, we could shove the %177 strlen call
+up into the bb65 block moving it out of the bb62->bb72 path.   However, note
+that bb65 stores to the string, zeroing out the last byte.  This means that on
+that path the value of %177 is actually just %171-1.  A sub is cheaper than a
+strlen!
+
+This pattern repeats several times, basically doing:
+
+  A = strlen(P);
+  P[A-1] = 0;
+  B = strlen(P);
+  where it is "obvious" that B = A-1.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty contains this interesting pattern:
+
+%77 = call i8* @strstr(i8* getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0),
+                       i8* %30)
+%phitmp648 = icmp eq i8* %77, getelementptr ([6 x i8]* @"\01LC5", i32 0, i32 0)
+br i1 %phitmp648, label %bb70, label %bb76
+
+bb70:           ; preds = %OptionMatch.exit91, %bb69
+        %78 = call i32 @strlen(i8* %30) nounwind readonly align 1               ; <i32> [#uses=1]
+
+This is basically:
+  cststr = "abcdef";
+  if (strstr(cststr, P) == cststr) {
+     x = strlen(P);
+     ...
+
+The strstr call would be significantly cheaper written as:
+
+cststr = "abcdef";
+if (memcmp(P, cststr, strlen(P)) == 0)
+  x = strlen(P);
+
+This is memcmp+strlen instead of strstr.  This also makes the strlen fully
+redundant.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty also contains this code:
+
+%1906 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
+%1907 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1906
+%1908 = call i8* @strcpy(i8* %1907, i8* %1905) nounwind align 1
+%1909 = call i32 @strlen(i8* getelementptr ([32 x i8]* @pgn_event, i32 0,i32 0))
+%1910 = getelementptr [32 x i8]* @pgn_event, i32 0, i32 %1909         
+
+The last strlen is computable as 1908-@pgn_event, which means 1910=1908.
+
+//===---------------------------------------------------------------------===//
+
+186.crafty has this interesting pattern with the "out.4543" variable:
+
+call void @llvm.memcpy.i32(
+        i8* getelementptr ([10 x i8]* @out.4543, i32 0, i32 0),
+       i8* getelementptr ([7 x i8]* @"\01LC28700", i32 0, i32 0), i32 7, i32 1) 
+%101 = call@printf(i8* ...   @out.4543, i32 0, i32 0)) nounwind 
+
+It is basically doing:
+
+  memcpy(globalarray, "string");
+  printf(...,  globalarray);
+  
+Anyway, by knowing that printf just reads the memory and forward substituting
+the string directly into the printf, this eliminates reads from globalarray.
+Since this pattern occurs frequently in crafty (due to the "DisplayTime" and
+other similar functions) there are many stores to "out".  Once all the printfs
+stop using "out", all that is left is the memcpy's into it.  This should allow
+globalopt to remove the "stored only" global.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+define inreg i32 @foo(i8* inreg %p) nounwind {
+  %tmp0 = load i8* %p
+  %tmp1 = ashr i8 %tmp0, 5
+  %tmp2 = sext i8 %tmp1 to i32
+  ret i32 %tmp2
+}
+
+could be dagcombine'd to a sign-extending load with a shift.
+For example, on x86 this currently gets this:
+
+	movb	(%eax), %al
+	sarb	$5, %al
+	movsbl	%al, %eax
+
+while it could get this:
+
+	movsbl	(%eax), %eax
+	sarl	$5, %eax
+
+//===---------------------------------------------------------------------===//
+
+GCC PR31029:
+
+int test(int x) { return 1-x == x; }     // --> return false
+int test2(int x) { return 2-x == x; }    // --> return x == 1 ?
+
+Always foldable for odd constants, what is the rule for even?
+
+//===---------------------------------------------------------------------===//
+
+PR 3381: GEP to field of size 0 inside a struct could be turned into GEP
+for next field in struct (which is at same address).
+
+For example: store of float into { {{}}, float } could be turned into a store to
+the float directly.
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+double foo(double a) {    return sin(a); }
+
+This compiles into this on x86-64 Linux:
+foo:
+	subq	$8, %rsp
+	call	sin
+	addq	$8, %rsp
+	ret
+vs:
+
+foo:
+        jmp sin
+
+//===---------------------------------------------------------------------===//
+
+The arg promotion pass should make use of nocapture to make its alias analysis
+stuff much more precise.
+
+//===---------------------------------------------------------------------===//
+
+The following functions should be optimized to use a select instead of a
+branch (from gcc PR40072):
+
+char char_int(int m) {if(m>7) return 0; return m;}
+int int_char(char m) {if(m>7) return 0; return m;}
+
+//===---------------------------------------------------------------------===//
+
+int func(int a, int b) { if (a & 0x80) b |= 0x80; else b &= ~0x80; return b; }
+
+Generates this:
+
+define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %0 = and i32 %a, 128                            ; <i32> [#uses=1]
+  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
+  %2 = or i32 %b, 128                             ; <i32> [#uses=1]
+  %3 = and i32 %b, -129                           ; <i32> [#uses=1]
+  %b_addr.0 = select i1 %1, i32 %3, i32 %2        ; <i32> [#uses=1]
+  ret i32 %b_addr.0
+}
+
+However, it's functionally equivalent to:
+
+         b = (b & ~0x80) | (a & 0x80);
+
+Which generates this:
+
+define i32 @func(i32 %a, i32 %b) nounwind readnone ssp {
+entry:
+  %0 = and i32 %b, -129                           ; <i32> [#uses=1]
+  %1 = and i32 %a, 128                            ; <i32> [#uses=1]
+  %2 = or i32 %0, %1                              ; <i32> [#uses=1]
+  ret i32 %2
+}
+
+This can be generalized for other forms:
+
+     b = (b & ~0x80) | (a & 0x40) << 1;
+
+//===---------------------------------------------------------------------===//
+
+These two functions produce different code. They shouldn't:
+
+#include <stdint.h>
+ 
+uint8_t p1(uint8_t b, uint8_t a) {
+  b = (b & ~0xc0) | (a & 0xc0);
+  return (b);
+}
+ 
+uint8_t p2(uint8_t b, uint8_t a) {
+  b = (b & ~0x40) | (a & 0x40);
+  b = (b & ~0x80) | (a & 0x80);
+  return (b);
+}
+
+define zeroext i8 @p1(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
+entry:
+  %0 = and i8 %b, 63                              ; <i8> [#uses=1]
+  %1 = and i8 %a, -64                             ; <i8> [#uses=1]
+  %2 = or i8 %1, %0                               ; <i8> [#uses=1]
+  ret i8 %2
+}
+
+define zeroext i8 @p2(i8 zeroext %b, i8 zeroext %a) nounwind readnone ssp {
+entry:
+  %0 = and i8 %b, 63                              ; <i8> [#uses=1]
+  %.masked = and i8 %a, 64                        ; <i8> [#uses=1]
+  %1 = and i8 %a, -128                            ; <i8> [#uses=1]
+  %2 = or i8 %1, %0                               ; <i8> [#uses=1]
+  %3 = or i8 %2, %.masked                         ; <i8> [#uses=1]
+  ret i8 %3
+}
+
+//===---------------------------------------------------------------------===//
+
+IPSCCP does not currently propagate argument dependent constants through
+functions where it does not know all of the callers.  This includes functions
+with normal external linkage as well as templates, C99 inline functions etc.
+Specifically, it does nothing to:
+
+define i32 @test(i32 %x, i32 %y, i32 %z) nounwind {
+entry:
+  %0 = add nsw i32 %y, %z                         
+  %1 = mul i32 %0, %x                             
+  %2 = mul i32 %y, %z                             
+  %3 = add nsw i32 %1, %2                         
+  ret i32 %3
+}
+
+define i32 @test2() nounwind {
+entry:
+  %0 = call i32 @test(i32 1, i32 2, i32 4) nounwind
+  ret i32 %0
+}
+
+It would be interesting extend IPSCCP to be able to handle simple cases like
+this, where all of the arguments to a call are constant.  Because IPSCCP runs
+before inlining, trivial templates and inline functions are not yet inlined.
+The results for a function + set of constant arguments should be memoized in a
+map.
+
+//===---------------------------------------------------------------------===//
+
+The libcall constant folding stuff should be moved out of SimplifyLibcalls into
+libanalysis' constantfolding logic.  This would allow IPSCCP to be able to
+handle simple things like this:
+
+static int foo(const char *X) { return strlen(X); }
+int bar() { return foo("abcd"); }
+
+//===---------------------------------------------------------------------===//
+
+InstCombine should use SimplifyDemandedBits to remove the or instruction:
+
+define i1 @test(i8 %x, i8 %y) {
+  %A = or i8 %x, 1
+  %B = icmp ugt i8 %A, 3
+  ret i1 %B
+}
+
+Currently instcombine calls SimplifyDemandedBits with either all bits or just
+the sign bit, if the comparison is obviously a sign test. In this case, we only
+need all but the bottom two bits from %A, and if we gave that mask to SDB it
+would delete the or instruction for us.
+
+//===---------------------------------------------------------------------===//
+
+functionattrs doesn't know much about memcpy/memset.  This function should be
+marked readnone rather than readonly, since it only twiddles local memory, but
+functionattrs doesn't handle memset/memcpy/memmove aggressively:
+
+struct X { int *p; int *q; };
+int foo() {
+ int i = 0, j = 1;
+ struct X x, y;
+ int **p;
+ y.p = &i;
+ x.q = &j;
+ p = __builtin_memcpy (&x, &y, sizeof (int *));
+ return **p;
+}
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+define i1 @a(i32 %x) nounwind readnone {
+entry:
+  %cmp = icmp eq i32 %x, 30
+  %sub = add i32 %x, -30
+  %cmp2 = icmp ugt i32 %sub, 9
+  %or = or i1 %cmp, %cmp2
+  ret i1 %or
+}
+This should be optimized to a single compare.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+void b();
+void a(int x) { if (((1<<x)&8)==0) b(); }
+
+The shift should be optimized out.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine or reassociate transformation:
+int a(int a, int b) { return (a==12)&(b>47)&(b<58); }
+
+The sgt and slt should be combined into a single comparison. Testcase derived
+from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine transformation:
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+  %rem = srem i32 %x, 32
+  %shl = shl i32 1, %rem
+  ret i32 %shl
+}
+
+The srem can be transformed to an and because if x is negative, the shift is
+undefined. Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define i32 @a(i32 %x, i32 %y) nounwind readnone {
+entry:
+  %mul = mul i32 %y, -8
+  %sub = sub i32 %x, %mul
+  ret i32 %sub
+}
+
+Should compile to something like x+y*8, but currently compiles to an
+inefficient result.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+Missed instcombine/dagcombine transformation:
+define void @lshift_lt(i8 zeroext %a) nounwind {
+entry:
+  %conv = zext i8 %a to i32
+  %shl = shl i32 %conv, 3
+  %cmp = icmp ult i32 %shl, 33
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @bar() nounwind
+  ret void
+
+if.end:
+  ret void
+}
+declare void @bar() nounwind
+
+The shift should be eliminated.  Testcase derived from gcc.
+
+//===---------------------------------------------------------------------===//
+
+These compile into different code, one gets recognized as a switch and the
+other doesn't due to phase ordering issues (PR6212):
+
+int test1(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  else if (mainType == 9)
+    subType = 6;
+  else if (mainType == 11)
+    subType = 9;
+  return subType;
+}
+
+int test2(int mainType, int subType) {
+  if (mainType == 7)
+    subType = 4;
+  if (mainType == 9)
+    subType = 6;
+  if (mainType == 11)
+    subType = 9;
+  return subType;
+}
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/Sparc/AsmPrinter/CMakeLists.txt b/lib/Target/Sparc/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..e3ca18e
--- /dev/null
+++ b/lib/Target/Sparc/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+# The AsmPrinter needs private headers from the parent Sparc target directory,
+# both the source tree copy and the tablegen output in the binary tree.
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSparcAsmPrinter
+  SparcAsmPrinter.cpp
+  )
+# SparcAsmPrinter.cpp includes SparcGenAsmWriter.inc, so tablegen must run first.
+add_dependencies(LLVMSparcAsmPrinter SparcCodeGenTable_gen)
\ No newline at end of file
diff --git a/lib/Target/Sparc/AsmPrinter/Makefile b/lib/Target/Sparc/AsmPrinter/Makefile
new file mode 100644
index 0000000..a856828
--- /dev/null
+++ b/lib/Target/Sparc/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Sparc/AsmPrinter/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSparcAsmPrinter
+
+# Hack: we need to include 'main' Sparc target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp
new file mode 100644
index 0000000..9a2ce6b
--- /dev/null
+++ b/lib/Target/Sparc/AsmPrinter/SparcAsmPrinter.cpp
@@ -0,0 +1,204 @@
+//===-- SparcAsmPrinter.cpp - Sparc LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to GAS-format SPARC assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "Sparc.h"
+#include "SparcInstrInfo.h"
+#include "SparcTargetMachine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+namespace {
+  /// SparcAsmPrinter - Converts machine instructions to GAS-syntax SPARC
+  /// assembly.  Most of the per-instruction printing is done by the
+  /// tablegen-generated printInstruction(); the methods here handle the
+  /// individual operand forms and inline-asm operands.
+  class SparcAsmPrinter : public AsmPrinter {
+  public:
+    explicit SparcAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                             MCContext &Ctx, MCStreamer &Streamer,
+                             const MCAsmInfo *T)
+      : AsmPrinter(O, TM, Ctx, Streamer, T) {}
+
+    virtual const char *getPassName() const {
+      return "Sparc Assembly Printer";
+    }
+
+    // Print a single operand; wraps it in %hi()/%lo() for SETHI/OR/ADD forms.
+    void printOperand(const MachineInstr *MI, int opNum);
+    // Print a [base+offset] memory reference (operands opNum and opNum+1).
+    void printMemOperand(const MachineInstr *MI, int opNum,
+                         const char *Modifier = 0);
+    // Print a condition-code immediate as its symbolic name (e.g. "ne").
+    void printCCOperand(const MachineInstr *MI, int opNum);
+
+    // Emit one machine instruction followed by a newline.
+    virtual void EmitInstruction(const MachineInstr *MI) {
+      printInstruction(MI);
+      OutStreamer.AddBlankLine();
+    }
+    void printInstruction(const MachineInstr *MI);  // autogenerated.
+    static const char *getRegisterName(unsigned RegNo);
+
+    // Inline-asm operand hooks; return true on unknown modifier (error).
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode);
+    bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode);
+
+    // Emit the call/sethi/or/add sequence used to materialize the PC
+    // (for PIC GOT addressing).
+    bool printGetPCX(const MachineInstr *MI, unsigned OpNo);
+  };
+} // end of anonymous namespace
+
+#include "SparcGenAsmWriter.inc"
+
+/// printOperand - Print operand opNum of MI to the output stream.  Symbolic
+/// (non-register, non-immediate) operands of SETHI are wrapped in %hi(...),
+/// and those of ORri/ADDri in %lo(...), matching how the compiler splits a
+/// 32-bit address into high/low parts.
+void SparcAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+  const MachineOperand &MO = MI->getOperand (opNum);
+  bool CloseParen = false;
+  if (MI->getOpcode() == SP::SETHIi && !MO.isReg() && !MO.isImm()) {
+    O << "%hi(";
+    CloseParen = true;
+  } else if ((MI->getOpcode() == SP::ORri || MI->getOpcode() == SP::ADDri) &&
+             !MO.isReg() && !MO.isImm()) {
+    O << "%lo(";
+    CloseParen = true;
+  }
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    // Registers print as e.g. "%o0"; the generated names are upper case.
+    O << "%" << LowercaseString(getRegisterName(MO.getReg()));
+    break;
+
+  case MachineOperand::MO_Immediate:
+    O << (int)MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    // Note: returns early, deliberately skipping the CloseParen logic —
+    // basic-block labels never appear inside %hi()/%lo().
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_GlobalAddress:
+    O << *GetGlobalValueSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    // Constant-pool entries use the private label "<prefix>CPI<func>_<idx>".
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << "_"
+      << MO.getIndex();
+    break;
+  default:
+    llvm_unreachable("<unknown operand type>");
+  }
+  if (CloseParen) O << ")";
+}
+
+/// printMemOperand - Print a two-part memory reference: the base at opNum
+/// and the offset (register or immediate) at opNum+1.
+void SparcAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum,
+                                      const char *Modifier) {
+  printOperand(MI, opNum);
+
+  // The "arith" modifier asks for plain two-operand syntax ("a, b") rather
+  // than address syntax ("a+b").
+  if (Modifier && strcmp(Modifier, "arith") == 0) {
+    O << ", ";
+    printOperand(MI, opNum+1);
+    return;
+  }
+
+  const MachineOperand &Offset = MI->getOperand(opNum+1);
+
+  // Suppress a redundant zero offset: "+%g0" or "+0" adds nothing.
+  if (Offset.isReg() && Offset.getReg() == SP::G0)
+    return;
+  if (Offset.isImm() && Offset.getImm() == 0)
+    return;
+
+  O << "+";
+  // Symbolic offsets (globals, constant-pool entries) are emitted through
+  // %lo(...) so the assembler produces the low part of the address.
+  if (Offset.isGlobal() || Offset.isCPI()) {
+    O << "%lo(";
+    printOperand(MI, opNum+1);
+    O << ")";
+  } else {
+    printOperand(MI, opNum+1);
+  }
+}
+
+/// printGetPCX - Emit the instruction sequence that loads the address of
+/// _GLOBAL_OFFSET_TABLE_ into the register operand at opNum, using a local
+/// call to obtain the current PC in %o7:
+///   .LLGETPCHn: call .LLGETPCn
+///               sethi %hi(GOT+(.-.LLGETPCHn)), reg
+///   .LLGETPCn:  or reg, %lo(GOT+(.-.LLGETPCHn)), reg
+///               add reg, %o7, reg
+/// Always returns true.
+bool SparcAsmPrinter::printGetPCX(const MachineInstr *MI, unsigned opNum) {
+  std::string operand = "";
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  default: assert(0 && "Operand is not a register ");
+  case MachineOperand::MO_Register:
+    assert(TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+           "Operand is not a physical register ");
+    operand = "%" + LowercaseString(getRegisterName(MO.getReg()));
+    break;
+  }
+
+  // Labels are made unique per basic block with the block number.
+  unsigned bbNum = MI->getParent()->getNumber();
+
+  O << '\n' << ".LLGETPCH" << bbNum << ":\n";
+  O << "\tcall\t.LLGETPC" << bbNum << '\n' ;
+
+  // The call's delay slot holds the sethi.
+  O << "\t  sethi\t"
+    << "%hi(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << bbNum << ")), "  
+    << operand << '\n' ;
+
+  O << ".LLGETPC" << bbNum << ":\n" ;
+  O << "\tor\t" << operand  
+    << ", %lo(_GLOBAL_OFFSET_TABLE_+(.-.LLGETPCH" << bbNum << ")), "
+    << operand << '\n';
+  O << "\tadd\t" << operand << ", %o7, " << operand << '\n'; 
+  
+  return true;
+}
+
+/// printCCOperand - Print a condition-code immediate operand using its
+/// symbolic SPARC name (e.g. "ne", "ge").
+void SparcAsmPrinter::printCCOperand(const MachineInstr *MI, int opNum) {
+  SPCC::CondCodes Cond =
+    static_cast<SPCC::CondCodes>(MI->getOperand(opNum).getImm());
+  O << SPARCCondCodeToString(Cond);
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+/// Returns true on an unrecognized modifier (which the caller reports as an
+/// error), false on success.
+bool SparcAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant,
+                                      const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0] != '\0') {
+    // Only single-character modifiers are valid.
+    if (ExtraCode[1] != 0)
+      return true;
+    // 'r' (print as a register/ordinary operand) is the sole one supported.
+    if (ExtraCode[0] != 'r')
+      return true;
+  }
+
+  printOperand(MI, OpNo);
+
+  return false;
+}
+
+/// PrintAsmMemoryOperand - Print a memory operand for inline asm.  No
+/// modifiers are supported; returns true (error) if one is present.
+bool SparcAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                            unsigned OpNo,
+                                            unsigned AsmVariant,
+                                            const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0])
+    return true;  // Unknown modifier
+
+  // SPARC memory references are bracketed: [base+offset].
+  O << '[';
+  printMemOperand(MI, OpNo);
+  O << ']';
+
+  return false;
+}
+
+// Force static initialization.  Registers this printer for both the 32-bit
+// (sparc) and 64-bit (sparcv9) targets.
+extern "C" void LLVMInitializeSparcAsmPrinter() { 
+  RegisterAsmPrinter<SparcAsmPrinter> X(TheSparcTarget);
+  RegisterAsmPrinter<SparcAsmPrinter> Y(TheSparcV9Target);
+}
diff --git a/lib/Target/Sparc/CMakeLists.txt b/lib/Target/Sparc/CMakeLists.txt
new file mode 100644
index 0000000..74f320a
--- /dev/null
+++ b/lib/Target/Sparc/CMakeLists.txt
@@ -0,0 +1,25 @@
+set(LLVM_TARGET_DEFINITIONS Sparc.td)
+
+# Generate register/instruction tables and the isel/asm-writer from Sparc.td.
+tablegen(SparcGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(SparcGenRegisterNames.inc -gen-register-enums)
+tablegen(SparcGenRegisterInfo.inc -gen-register-desc)
+tablegen(SparcGenInstrNames.inc -gen-instr-enums)
+tablegen(SparcGenInstrInfo.inc -gen-instr-desc)
+tablegen(SparcGenAsmWriter.inc -gen-asm-writer)
+tablegen(SparcGenDAGISel.inc -gen-dag-isel)
+tablegen(SparcGenSubtarget.inc -gen-subtarget)
+tablegen(SparcGenCallingConv.inc -gen-callingconv)
+
+add_llvm_target(SparcCodeGen
+  DelaySlotFiller.cpp
+  FPMover.cpp
+  SparcInstrInfo.cpp
+  SparcISelDAGToDAG.cpp
+  SparcISelLowering.cpp
+  SparcMCAsmInfo.cpp
+  SparcRegisterInfo.cpp
+  SparcSubtarget.cpp
+  SparcTargetMachine.cpp
+  )
+
+# SparcISelDAGToDAG/SparcISelLowering depend on the SelectionDAG library.
+target_link_libraries (LLVMSparcCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
new file mode 100644
index 0000000..15b26c2
--- /dev/null
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -0,0 +1,76 @@
+//===-- DelaySlotFiller.cpp - SPARC delay slot filler ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a simple local pass that fills delay slots with NOPs.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "delayslotfiller"
+#include "Sparc.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(FilledSlots, "Number of delay slots filled");
+
+namespace {
+  /// Filler - A trivial delay-slot "filler": it puts a NOP after every
+  /// instruction with a delay slot rather than scheduling useful work into it.
+  struct Filler : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    TargetMachine &TM;
+    const TargetInstrInfo *TII;
+
+    static char ID;
+    Filler(TargetMachine &tm) 
+      : MachineFunctionPass(&ID), TM(tm), TII(tm.getInstrInfo()) { }
+
+    virtual const char *getPassName() const {
+      return "SPARC Delay Slot Filler";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    // Process every block; returns true if any delay slot was filled.
+    bool runOnMachineFunction(MachineFunction &F) {
+      bool Changed = false;
+      for (MachineFunction::iterator FI = F.begin(), FE = F.end();
+           FI != FE; ++FI)
+        Changed |= runOnMachineBasicBlock(*FI);
+      return Changed;
+    }
+
+  };
+  char Filler::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Sparc MachineFunctions.  The caller owns the returned pass.
+///
+FunctionPass *llvm::createSparcDelaySlotFillerPass(TargetMachine &tm) {
+  return new Filler(tm);
+}
+
+/// runOnMachineBasicBlock - Fill in delay slots for the given basic block.
+/// Currently, we fill delay slots with NOPs. We assume there is only one
+/// delay slot per delayed instruction.
+///
+bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool MadeChange = false;
+  for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) {
+    if (!MI->getDesc().hasDelaySlot())
+      continue;
+    // Insert a NOP immediately after the delayed instruction.
+    MachineBasicBlock::iterator InsertPt = MI;
+    ++InsertPt;
+    BuildMI(MBB, InsertPt, DebugLoc::getUnknownLoc(), TII->get(SP::NOP));
+    ++FilledSlots;
+    MadeChange = true;
+  }
+  return MadeChange;
+}
diff --git a/lib/Target/Sparc/FPMover.cpp b/lib/Target/Sparc/FPMover.cpp
new file mode 100644
index 0000000..88b0927
--- /dev/null
+++ b/lib/Target/Sparc/FPMover.cpp
@@ -0,0 +1,141 @@
+//===-- FPMover.cpp - Sparc double-precision floating point move fixer ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand FpMOVD/FpABSD/FpNEGD instructions into their single-precision pieces.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "fpmover"
+#include "Sparc.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+STATISTIC(NumFpDs , "Number of instructions translated");
+STATISTIC(NoopFpDs, "Number of noop instructions removed");
+
+namespace {
+  /// FPMover - Expands the double-precision pseudo instructions FpMOVD,
+  /// FpABSD and FpNEGD into pairs of single-precision FMOVS/FABSS/FNEGS
+  /// operations on the even/odd register halves.
+  struct FPMover : public MachineFunctionPass {
+    /// Target machine description which we query for reg. names, data
+    /// layout, etc.
+    ///
+    TargetMachine &TM;
+    
+    static char ID;
+    explicit FPMover(TargetMachine &tm) 
+      : MachineFunctionPass(&ID), TM(tm) { }
+
+    virtual const char *getPassName() const {
+      return "Sparc Double-FP Move Fixer";
+    }
+
+    bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
+    bool runOnMachineFunction(MachineFunction &F);
+  };
+  char FPMover::ID = 0;
+} // end of anonymous namespace
+
+/// createSparcFPMoverPass - Returns a pass that turns FpMOVD
+/// instructions into FMOVS instructions.  The caller owns the returned pass.
+///
+FunctionPass *llvm::createSparcFPMoverPass(TargetMachine &tm) {
+  return new FPMover(tm);
+}
+
+/// getDoubleRegPair - Given a DFP register, return the even and odd
+/// single-precision FP registers that overlap it (e.g. D1 -> F2/F3).
+/// Aborts if DoubleReg is not one of D0-D15.
+static void getDoubleRegPair(unsigned DoubleReg, unsigned &EvenReg,
+                             unsigned &OddReg) {
+  // The three tables below are kept in the same index order, so the index of
+  // DoubleReg in DoubleRegsInOrder selects both halves of the pair.
+  static const unsigned EvenHalvesOfPairs[] = {
+    SP::F0, SP::F2, SP::F4, SP::F6, SP::F8, SP::F10, SP::F12, SP::F14,
+    SP::F16, SP::F18, SP::F20, SP::F22, SP::F24, SP::F26, SP::F28, SP::F30
+  };
+  static const unsigned OddHalvesOfPairs[] = {
+    SP::F1, SP::F3, SP::F5, SP::F7, SP::F9, SP::F11, SP::F13, SP::F15,
+    SP::F17, SP::F19, SP::F21, SP::F23, SP::F25, SP::F27, SP::F29, SP::F31
+  };
+  static const unsigned DoubleRegsInOrder[] = {
+    SP::D0, SP::D1, SP::D2, SP::D3, SP::D4, SP::D5, SP::D6, SP::D7, SP::D8,
+    SP::D9, SP::D10, SP::D11, SP::D12, SP::D13, SP::D14, SP::D15
+  };
+  const unsigned NumPairs =
+    sizeof(DoubleRegsInOrder)/sizeof(DoubleRegsInOrder[0]);
+  for (unsigned Idx = 0; Idx != NumPairs; ++Idx) {
+    if (DoubleRegsInOrder[Idx] != DoubleReg)
+      continue;
+    EvenReg = EvenHalvesOfPairs[Idx];
+    OddReg = OddHalvesOfPairs[Idx];
+    return;
+  }
+  llvm_unreachable("Can't find reg");
+}
+
+/// runOnMachineBasicBlock - Fixup FpMOVD instructions in this MBB: expand
+/// each FpMOVD/FpABSD/FpNEGD pseudo into the single-precision op on the even
+/// register half, plus an FMOVS of the odd half when src != dest.  Returns
+/// true if anything was changed.
+///
+bool FPMover::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  // Advance the iterator before touching MI so erase() stays safe.
+  for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+    MachineInstr *MI = I++;
+    DebugLoc dl = MI->getDebugLoc();
+    if (MI->getOpcode() == SP::FpMOVD || MI->getOpcode() == SP::FpABSD ||
+        MI->getOpcode() == SP::FpNEGD) {
+      Changed = true;
+      unsigned DestDReg = MI->getOperand(0).getReg();
+      unsigned SrcDReg  = MI->getOperand(1).getReg();
+      // A self-move does nothing; drop it entirely instead of expanding.
+      if (DestDReg == SrcDReg && MI->getOpcode() == SP::FpMOVD) {
+        MBB.erase(MI);   // Eliminate the noop copy.
+        ++NoopFpDs;
+        continue;
+      }
+      
+      unsigned EvenSrcReg = 0, OddSrcReg = 0, EvenDestReg = 0, OddDestReg = 0;
+      getDoubleRegPair(DestDReg, EvenDestReg, OddDestReg);
+      getDoubleRegPair(SrcDReg, EvenSrcReg, OddSrcReg);
+
+      // Rewrite the pseudo in place into the single-precision opcode that
+      // operates on the even halves of the pair.
+      const TargetInstrInfo *TII = TM.getInstrInfo();
+      if (MI->getOpcode() == SP::FpMOVD)
+        MI->setDesc(TII->get(SP::FMOVS));
+      else if (MI->getOpcode() == SP::FpNEGD)
+        MI->setDesc(TII->get(SP::FNEGS));
+      else if (MI->getOpcode() == SP::FpABSD)
+        MI->setDesc(TII->get(SP::FABSS));
+      else
+        llvm_unreachable("Unknown opcode!");
+        
+      MI->getOperand(0).setReg(EvenDestReg);
+      MI->getOperand(1).setReg(EvenSrcReg);
+      DEBUG(errs() << "FPMover: the modified instr is: " << *MI);
+      // Insert copy for the other half of the double.
+      if (DestDReg != SrcDReg) {
+        MI = BuildMI(MBB, I, dl, TM.getInstrInfo()->get(SP::FMOVS), OddDestReg)
+          .addReg(OddSrcReg);
+        DEBUG(errs() << "FPMover: the inserted instr is: " << *MI);
+      }
+      ++NumFpDs;
+    }
+  }
+  return Changed;
+}
+
+/// runOnMachineFunction - Expand the FpMOVD-family pseudos in every basic
+/// block of F.  Returns true if any instruction was rewritten.
+bool FPMover::runOnMachineFunction(MachineFunction &F) {
+  // If the target has V9 instructions, the fp-mover pseudos will never be
+  // emitted.  Avoid a scan of the instructions to improve compile time.
+  if (TM.getSubtarget<SparcSubtarget>().isV9())
+    return false;
+
+  bool Modified = false;
+  for (MachineFunction::iterator BB = F.begin(), BE = F.end(); BB != BE; ++BB)
+    Modified |= runOnMachineBasicBlock(*BB);
+  return Modified;
+}
diff --git a/lib/Target/Sparc/Makefile b/lib/Target/Sparc/Makefile
new file mode 100644
index 0000000..e407848
--- /dev/null
+++ b/lib/Target/Sparc/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/Sparc/Makefile ---------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+# Relative path from this directory to the top of the LLVM source tree.
+LEVEL = ../../..
+# Name of the library this directory produces.
+LIBRARYNAME = LLVMSparcCodeGen
+# Target name; used to locate the Sparc.td description for tblgen.
+TARGET = Sparc
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = SparcGenRegisterInfo.h.inc SparcGenRegisterNames.inc \
+                SparcGenRegisterInfo.inc SparcGenInstrNames.inc \
+                SparcGenInstrInfo.inc SparcGenAsmWriter.inc \
+                SparcGenDAGISel.inc SparcGenSubtarget.inc SparcGenCallingConv.inc
+
+# Additional subdirectories that are built for this target.
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/Sparc/README.txt b/lib/Target/Sparc/README.txt
new file mode 100644
index 0000000..cc24abf
--- /dev/null
+++ b/lib/Target/Sparc/README.txt
@@ -0,0 +1,58 @@
+
+To-do
+-----
+
+* Keep the address of the constant pool in a register instead of forming its
+  address all of the time.
+* We can fold small constant offsets into the %hi/%lo references to constant
+  pool addresses as well.
+* When in V9 mode, register allocate %icc[0-3].
+* Add support for isel'ing UMUL_LOHI instead of marking it as Expand.
+* Emit the 'Branch on Integer Register with Prediction' instructions.  It's
+  not clear how to write a pattern for this though:
+
+float %t1(int %a, int* %p) {
+        %C = seteq int %a, 0
+        br bool %C, label %T, label %F
+T:
+        store int 123, int* %p
+        br label %F
+F:
+        ret float undef
+}
+
+codegens to this:
+
+t1:
+        save -96, %o6, %o6
+1)      subcc %i0, 0, %l0
+1)      bne .LBBt1_2    ! F
+        nop
+.LBBt1_1:       ! T
+        or %g0, 123, %l0
+        st %l0, [%i1]
+.LBBt1_2:       ! F
+        restore %g0, %g0, %g0
+        retl
+        nop
+
+1) should be replaced with a brz in V9 mode.
+
+* Same as above, but emit conditional move on register zero (p192) in V9 
+  mode.  Testcase:
+
+int %t1(int %a, int %b) {
+        %C = seteq int %a, 0
+        %D = select bool %C, int %a, int %b
+        ret int %D
+}
+
+* Emit MULX/[SU]DIVX instructions in V9 mode instead of fiddling 
+  with the Y register, if they are faster.
+
+* Codegen bswap(load)/store(bswap) -> load/store ASI
+
+* Implement frame pointer elimination, e.g. eliminate save/restore for 
+  leaf fns.
+* Fill delay slots
+
diff --git a/lib/Target/Sparc/Sparc.h b/lib/Target/Sparc/Sparc.h
new file mode 100644
index 0000000..a37920d
--- /dev/null
+++ b/lib/Target/Sparc/Sparc.h
@@ -0,0 +1,121 @@
+//===-- Sparc.h - Top-level interface for Sparc representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// Sparc back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_SPARC_H
+#define TARGET_SPARC_H
+
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+namespace llvm {
+  class FunctionPass;
+  class SparcTargetMachine;
+  class formatted_raw_ostream;
+
+  // Factory functions for the code-generation passes of the Sparc back-end.
+  FunctionPass *createSparcISelDag(SparcTargetMachine &TM);
+  FunctionPass *createSparcDelaySlotFillerPass(TargetMachine &TM);
+  FunctionPass *createSparcFPMoverPass(TargetMachine &TM);
+
+  // Target descriptors for the Sparc and SparcV9 targets.
+  extern Target TheSparcTarget;
+  extern Target TheSparcV9Target;
+
+} // end namespace llvm;
+
+// Defines symbolic names for Sparc registers.  This defines a mapping from
+// register name to register number.
+//
+#include "SparcGenRegisterNames.inc"
+
+// Defines symbolic names for the Sparc instructions.
+//
+#include "SparcGenInstrNames.inc"
+
+
+namespace llvm {
+  // Enums corresponding to Sparc condition codes, both icc's and fcc's.  These
+  // values must be kept in sync with the ones in the .td file.
+  namespace SPCC {
+    enum CondCodes {
+      //ICC_A   =  8   ,  // Always
+      //ICC_N   =  0   ,  // Never
+      ICC_NE  =  9   ,  // Not Equal
+      ICC_E   =  1   ,  // Equal
+      ICC_G   = 10   ,  // Greater
+      ICC_LE  =  2   ,  // Less or Equal
+      ICC_GE  = 11   ,  // Greater or Equal
+      ICC_L   =  3   ,  // Less
+      ICC_GU  = 12   ,  // Greater Unsigned
+      ICC_LEU =  4   ,  // Less or Equal Unsigned
+      ICC_CC  = 13   ,  // Carry Clear/Greater or Equal Unsigned
+      ICC_CS  =  5   ,  // Carry Set/Less Unsigned
+      ICC_POS = 14   ,  // Positive
+      ICC_NEG =  6   ,  // Negative
+      ICC_VC  = 15   ,  // Overflow Clear
+      ICC_VS  =  7   ,  // Overflow Set
+      
+      // Floating-point condition codes reuse the icc encodings, offset by 16
+      // so the two groups can be distinguished in one enum.
+      //FCC_A   =  8+16,  // Always
+      //FCC_N   =  0+16,  // Never
+      FCC_U   =  7+16,  // Unordered
+      FCC_G   =  6+16,  // Greater
+      FCC_UG  =  5+16,  // Unordered or Greater
+      FCC_L   =  4+16,  // Less
+      FCC_UL  =  3+16,  // Unordered or Less
+      FCC_LG  =  2+16,  // Less or Greater
+      FCC_NE  =  1+16,  // Not Equal
+      FCC_E   =  9+16,  // Equal
+      FCC_UE  = 10+16,  // Unordered or Equal
+      FCC_GE  = 11+16,  // Greater or Equal
+      FCC_UGE = 12+16,  // Unordered or Greater or Equal
+      FCC_LE  = 13+16,  // Less or Equal
+      FCC_ULE = 14+16,  // Unordered or Less or Equal
+      FCC_O   = 15+16   // Ordered
+    };
+  }
+  
+  /// SPARCCondCodeToString - Return the assembler mnemonic suffix (e.g. "ne",
+  /// "uge") for the given icc or fcc condition code.
+  inline static const char *SPARCCondCodeToString(SPCC::CondCodes CC) {
+    switch (CC) {
+    default: llvm_unreachable("Unknown condition code");
+    case SPCC::ICC_NE:  return "ne";
+    case SPCC::ICC_E:   return "e";
+    case SPCC::ICC_G:   return "g";
+    case SPCC::ICC_LE:  return "le";
+    case SPCC::ICC_GE:  return "ge";
+    case SPCC::ICC_L:   return "l";
+    case SPCC::ICC_GU:  return "gu";
+    case SPCC::ICC_LEU: return "leu";
+    case SPCC::ICC_CC:  return "cc";
+    case SPCC::ICC_CS:  return "cs";
+    case SPCC::ICC_POS: return "pos";
+    case SPCC::ICC_NEG: return "neg";
+    case SPCC::ICC_VC:  return "vc";
+    case SPCC::ICC_VS:  return "vs";
+    case SPCC::FCC_U:   return "u";
+    case SPCC::FCC_G:   return "g";
+    case SPCC::FCC_UG:  return "ug";
+    case SPCC::FCC_L:   return "l";
+    case SPCC::FCC_UL:  return "ul";
+    case SPCC::FCC_LG:  return "lg";
+    case SPCC::FCC_NE:  return "ne";
+    case SPCC::FCC_E:   return "e";
+    case SPCC::FCC_UE:  return "ue";
+    case SPCC::FCC_GE:  return "ge";
+    case SPCC::FCC_UGE: return "uge";
+    case SPCC::FCC_LE:  return "le";
+    case SPCC::FCC_ULE: return "ule";
+    case SPCC::FCC_O:   return "o";
+    }       
+  }
+}  // end namespace llvm
+#endif
diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td
new file mode 100644
index 0000000..53ea8f4
--- /dev/null
+++ b/lib/Target/Sparc/Sparc.td
@@ -0,0 +1,76 @@
+//===- Sparc.td - Describe the Sparc Target Machine -------------*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// SPARC Subtarget features.
+//
+ 
+// "+v9": sets the IsV9 subtarget field, enabling SPARC-V9 instructions.
+def FeatureV9
+  : SubtargetFeature<"v9", "IsV9", "true",
+                     "Enable SPARC-V9 instructions">;
+// "+deprecated-v8": sets V8DeprecatedInsts.
+def FeatureV8Deprecated
+  : SubtargetFeature<"deprecated-v8", "V8DeprecatedInsts", "true",
+                     "Enable deprecated V8 instructions in V9 mode">;
+// "+vis": sets IsVIS for the UltraSPARC VIS extensions.
+def FeatureVIS
+  : SubtargetFeature<"vis", "IsVIS", "true",
+                     "Enable UltraSPARC Visual Instruction Set extensions">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SparcRegisterInfo.td"
+include "SparcCallingConv.td"
+include "SparcInstrInfo.td"
+
+def SparcInstrInfo : InstrInfo {
+  // Define how we want to layout our target-specific information field.
+  // No Sparc-specific TSFlags bits are defined yet.
+  let TSFlagsFields = [];
+  let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// SPARC processors supported.
+//===----------------------------------------------------------------------===//
+
+// Proc - Shorthand for a processor with the given feature list and no
+// scheduling itineraries.
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+// V8-class implementations: no optional features.
+def : Proc<"generic",         []>;
+def : Proc<"v8",              []>;
+def : Proc<"supersparc",      []>;
+def : Proc<"sparclite",       []>;
+def : Proc<"f934",            []>;
+def : Proc<"hypersparc",      []>;
+def : Proc<"sparclite86x",    []>;
+def : Proc<"sparclet",        []>;
+def : Proc<"tsc701",          []>;
+// V9-class implementations.
+def : Proc<"v9",              [FeatureV9]>;
+def : Proc<"ultrasparc",      [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3",     [FeatureV9, FeatureV8Deprecated]>;
+def : Proc<"ultrasparc3-vis", [FeatureV9, FeatureV8Deprecated, FeatureVIS]>;
+
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def Sparc : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = SparcInstrInfo;
+}
diff --git a/lib/Target/Sparc/SparcCallingConv.td b/lib/Target/Sparc/SparcCallingConv.td
new file mode 100644
index 0000000..33ecfdf
--- /dev/null
+++ b/lib/Target/Sparc/SparcCallingConv.td
@@ -0,0 +1,32 @@
+//===- SparcCallingConv.td - Calling Conventions Sparc -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the Sparc architectures.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Sparc 32-bit C return-value convention.
+def RetCC_Sparc32 : CallingConv<[
+  // Integers are returned in %i0-%i5, floats in %f0-%f3, doubles in %d0/%d1.
+  CCIfType<[i32], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  CCIfType<[f32], CCAssignToReg<[F0, F1, F2, F3]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1]>>
+]>;
+
+// Sparc 32-bit C Calling convention.
+def CC_Sparc32 : CallingConv<[
+  // All arguments get passed in integer registers if there is space.
+  CCIfType<[i32, f32, f64], CCAssignToReg<[I0, I1, I2, I3, I4, I5]>>,
+  
+  // Alternatively, they are assigned to the stack in 4-byte aligned units.
+  CCAssignToStack<4, 4>
+]>;
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
new file mode 100644
index 0000000..e1b3299
--- /dev/null
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -0,0 +1,230 @@
+//===-- SparcISelDAGToDAG.cpp - A dag to dag inst selector for Sparc ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SPARC target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "SparcTargetMachine.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Instruction Selector Implementation
+//===----------------------------------------------------------------------===//
+
+//===--------------------------------------------------------------------===//
+/// SparcDAGToDAGISel - SPARC specific code to select SPARC machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+class SparcDAGToDAGISel : public SelectionDAGISel {
+  /// Subtarget - Keep a reference to the Sparc Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const SparcSubtarget &Subtarget;
+  /// TM - The target machine; used to reach the instruction info.
+  SparcTargetMachine& TM;
+  /// CurBB - The basic block being selected.  Set by InstructionSelect();
+  /// read by getGlobalBaseReg() to reach the MachineFunction.
+  MachineBasicBlock *CurBB;
+public:
+  explicit SparcDAGToDAGISel(SparcTargetMachine &tm)
+    : SelectionDAGISel(tm),
+      Subtarget(tm.getSubtarget<SparcSubtarget>()),
+      TM(tm) {
+  }
+
+  /// Select - Main selection hook; returns the machine node replacing N,
+  /// or NULL if N is already selected.
+  SDNode *Select(SDNode *N);
+
+  // Complex Pattern Selectors.
+  bool SelectADDRrr(SDNode *Op, SDValue N, SDValue &R1, SDValue &R2);
+  bool SelectADDRri(SDNode *Op, SDValue N, SDValue &Base,
+                    SDValue &Offset);
+
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
+  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                            char ConstraintCode,
+                                            std::vector<SDValue> &OutOps);
+
+  /// InstructionSelect - This callback is invoked by
+  /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+  virtual void InstructionSelect();
+
+  virtual const char *getPassName() const {
+    return "SPARC DAG->DAG Pattern Instruction Selection";
+  }
+
+  // Include the pieces autogenerated from the target description.
+#include "SparcGenDAGISel.inc"
+
+private:
+  /// getGlobalBaseReg - Return a register node for this function's global
+  /// base register.
+  SDNode* getGlobalBaseReg();
+};
+}  // end anonymous namespace
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void SparcDAGToDAGISel::InstructionSelect() {
+  // Remember the block being selected; getGlobalBaseReg() uses it to reach
+  // the enclosing MachineFunction.
+  CurBB = BB;
+  // Select target instructions for the DAG.
+  SelectRoot(*CurDAG);
+  // Clean up any nodes left dangling by selection.
+  CurDAG->RemoveDeadNodes();
+}
+
+/// getGlobalBaseReg - Materialize the function's global base register as a
+/// pointer-typed register node.
+SDNode* SparcDAGToDAGISel::getGlobalBaseReg() {
+  MachineFunction *Fn = CurBB->getParent();
+  unsigned BaseReg = TM.getInstrInfo()->getGlobalBaseReg(Fn);
+  SDValue RegNode = CurDAG->getRegister(BaseReg, TLI.getPointerTy());
+  return RegNode.getNode();
+}
+
+/// SelectADDRri - Match Addr as a register+simm13 (or frameindex+immediate)
+/// address.  Never fails outright except for direct-call targets: the final
+/// fallback is [Addr + 0].
+bool SparcDAGToDAGISel::SelectADDRri(SDNode *Op, SDValue Addr,
+                                     SDValue &Base, SDValue &Offset) {
+  // A bare frame index becomes [fi + 0].
+  if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress)
+    return false;  // direct calls.
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    // [reg + simm13] or [fi + simm13].
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1))) {
+      if (Predicate_simm13(CN)) {
+        if (FrameIndexSDNode *FIN =
+                dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+          // Constant offset from frame ref.
+          Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+        } else {
+          Base = Addr.getOperand(0);
+        }
+        Offset = CurDAG->getTargetConstant(CN->getZExtValue(), MVT::i32);
+        return true;
+      }
+    }
+    // [reg + %lo(sym)]; the %lo may appear on either side of the add.
+    if (Addr.getOperand(0).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(1);
+      Offset = Addr.getOperand(0).getOperand(0);
+      return true;
+    }
+    if (Addr.getOperand(1).getOpcode() == SPISD::Lo) {
+      Base = Addr.getOperand(0);
+      Offset = Addr.getOperand(1).getOperand(0);
+      return true;
+    }
+  }
+  // Fallback: [Addr + 0].
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+/// SelectADDRrr - Match Addr as a register+register address.  Fails for
+/// frame indices, direct-call targets, and adds better served by the
+/// reg+imm form; otherwise falls back to [Addr + %g0].
+bool SparcDAGToDAGISel::SelectADDRrr(SDNode *Op, SDValue Addr,
+                                     SDValue &R1,  SDValue &R2) {
+  unsigned AddrOpc = Addr.getOpcode();
+  if (AddrOpc == ISD::FrameIndex)
+    return false;
+  if (AddrOpc == ISD::TargetExternalSymbol ||
+      AddrOpc == ISD::TargetGlobalAddress)
+    return false;  // direct calls.
+
+  if (AddrOpc == ISD::ADD) {
+    SDValue LHS = Addr.getOperand(0);
+    SDValue RHS = Addr.getOperand(1);
+    // A simm13 constant or a %lo reference is better matched by the reg+imm
+    // pattern (SelectADDRri); decline those here.
+    if (isa<ConstantSDNode>(RHS) && Predicate_simm13(RHS.getNode()))
+      return false;
+    if (LHS.getOpcode() == SPISD::Lo || RHS.getOpcode() == SPISD::Lo)
+      return false;
+    R1 = LHS;
+    R2 = RHS;
+    return true;
+  }
+
+  // Anything else: use the address as-is with %g0 (always zero) as the
+  // second register.
+  R1 = Addr;
+  R2 = CurDAG->getRegister(SP::G0, MVT::i32);
+  return true;
+}
+
+/// Select - Convert the target-independent node N into a Sparc machine node.
+/// Handles the operations that need the Y register set up explicitly
+/// (division and high-multiplication) plus GLOBAL_BASE_REG; everything else
+/// is dispatched to the tblgen-generated matcher (SelectCode).
+///
+/// Fix over the original: the MULHU/MULHS case had an unreachable
+/// 'return NULL;' immediately after the 'return CurDAG->SelectNodeTo(...)';
+/// the dead statement has been removed.
+SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  if (N->isMachineOpcode())
+    return NULL;   // Already selected.
+
+  switch (N->getOpcode()) {
+  default: break;
+  case SPISD::GLOBAL_BASE_REG:
+    return getGlobalBaseReg();
+
+  case ISD::SDIV:
+  case ISD::UDIV: {
+    // FIXME: should use a custom expander to expose the SRA to the dag.
+    SDValue DivLHS = N->getOperand(0);
+    SDValue DivRHS = N->getOperand(1);
+
+    // Set the Y register to the high-part.
+    SDValue TopPart;
+    if (N->getOpcode() == ISD::SDIV) {
+      // Signed divide: Y holds the sign-extension of the dividend (LHS >> 31).
+      TopPart = SDValue(CurDAG->getMachineNode(SP::SRAri, dl, MVT::i32, DivLHS,
+                                   CurDAG->getTargetConstant(31, MVT::i32)), 0);
+    } else {
+      // Unsigned divide: Y holds zero (%g0).
+      TopPart = CurDAG->getRegister(SP::G0, MVT::i32);
+    }
+    TopPart = SDValue(CurDAG->getMachineNode(SP::WRYrr, dl, MVT::Flag, TopPart,
+                                     CurDAG->getRegister(SP::G0, MVT::i32)), 0);
+
+    // FIXME: Handle div by immediate.
+    unsigned Opcode = N->getOpcode() == ISD::SDIV ? SP::SDIVrr : SP::UDIVrr;
+    return CurDAG->SelectNodeTo(N, Opcode, MVT::i32, DivLHS, DivRHS,
+                                TopPart);
+  }
+  case ISD::MULHU:
+  case ISD::MULHS: {
+    // FIXME: Handle mul by immediate.
+    SDValue MulLHS = N->getOperand(0);
+    SDValue MulRHS = N->getOperand(1);
+    unsigned Opcode = N->getOpcode() == ISD::MULHU ? SP::UMULrr : SP::SMULrr;
+    SDNode *Mul = CurDAG->getMachineNode(Opcode, dl, MVT::i32, MVT::Flag,
+                                         MulLHS, MulRHS);
+    // The high part is in the Y register.
+    return CurDAG->SelectNodeTo(N, SP::RDY, MVT::i32, SDValue(Mul, 1));
+  }
+  }
+
+  return SelectCode(N);
+}
+
+
+/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+/// inline asm expressions.  Returns true on failure (unknown constraint);
+/// on success the two address components are appended to OutOps.
+bool
+SparcDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                                char ConstraintCode,
+                                                std::vector<SDValue> &OutOps) {
+  SDValue Op0, Op1;
+  switch (ConstraintCode) {
+  default: return true;
+  case 'm':   // memory
+   // Try reg+reg first; the reg+imm matcher always succeeds as a fallback.
+   if (!SelectADDRrr(Op.getNode(), Op, Op0, Op1))
+     SelectADDRri(Op.getNode(), Op, Op0, Op1);
+   break;
+  }
+
+  OutOps.push_back(Op0);
+  OutOps.push_back(Op1);
+  return false;
+}
+
+/// createSparcISelDag - This pass converts a legalized DAG into a
+/// SPARC-specific DAG, ready for instruction scheduling.
+/// Factory entry point declared in Sparc.h.
+FunctionPass *llvm::createSparcISelDag(SparcTargetMachine &TM) {
+  return new SparcDAGToDAGISel(TM);
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
new file mode 100644
index 0000000..e67002a
--- /dev/null
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -0,0 +1,1090 @@
+//===-- SparcISelLowering.cpp - Sparc DAG Lowering Implementation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcISelLowering.h"
+#include "SparcTargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "SparcGenCallingConv.inc"
+
+/// LowerReturn - Lower a 'ret' by copying the return values into the
+/// registers chosen by RetCC_Sparc32 and gluing them to a SPISD::RET_FLAG.
+SDValue
+SparcTargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represent the assignment of the return value to locations.
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(),
+                 RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_Sparc32);
+
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 
+                             Outs[i].Val, Flag);
+
+    // Guarantee that all emitted copies are stuck together with flags.
+    Flag = Chain.getValue(1);
+  }
+
+  // Glue the last copy (if any) to the return node so the copies are not
+  // scheduled away from it.
+  if (Flag.getNode())
+    return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+  return DAG.getNode(SPISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+/// LowerFormalArguments - V8 uses a very simple ABI, where all values are
+/// passed in either one or two GPRs, including FP values.  TODO: we should
+/// pass FP values in FP registers for fastcc functions.
+SDValue
+SparcTargetLowering::LowerFormalArguments(SDValue Chain,
+                                          CallingConv::ID CallConv, bool isVarArg,
+                                          const SmallVectorImpl<ISD::InputArg>
+                                            &Ins,
+                                          DebugLoc dl, SelectionDAG &DAG,
+                                          SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_Sparc32);
+
+  // The first six words of arguments arrive in %i0-%i5; the rest are on the
+  // stack starting at offset 68 (past the 16-word register save area and the
+  // struct-return slot of the 32-bit SPARC frame -- see LowerCall).
+  static const unsigned ArgRegs[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+  const unsigned *CurArgReg = ArgRegs, *ArgRegEnd = ArgRegs+6;
+  unsigned ArgOffset = 68;
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    SDValue ArgValue;
+    CCValAssign &VA = ArgLocs[i];
+    // FIXME: We ignore the register assignments of AnalyzeFormalArguments
+    // because it doesn't know how to split a double into two i32 registers.
+    EVT ObjectVT = VA.getValVT();
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i1:
+    case MVT::i8:
+    case MVT::i16:
+    case MVT::i32:
+      if (!Ins[i].Used) {                  // Argument is dead.
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        InVals.push_back(DAG.getUNDEF(ObjectVT));
+      } else if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+        unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+        MF.getRegInfo().addLiveIn(*CurArgReg++, VReg);
+        SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+        if (ObjectVT != MVT::i32) {
+          // Sub-word values arrive sign-extended in the full register;
+          // assert that and truncate down to the declared type.
+          unsigned AssertOp = ISD::AssertSext;
+          Arg = DAG.getNode(AssertOp, dl, MVT::i32, Arg,
+                            DAG.getValueType(ObjectVT));
+          Arg = DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, Arg);
+        }
+        InVals.push_back(Arg);
+      } else {
+        int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset,
+                                                            true, false);
+        SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+        SDValue Load;
+        if (ObjectVT == MVT::i32) {
+          Load = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0);
+        } else {
+          ISD::LoadExtType LoadOp = ISD::SEXTLOAD;
+
+          // Sparc is big endian, so add an offset based on the ObjectVT.
+          unsigned Offset = 4-std::max(1U, ObjectVT.getSizeInBits()/8);
+          FIPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, FIPtr,
+                              DAG.getConstant(Offset, MVT::i32));
+          Load = DAG.getExtLoad(LoadOp, dl, MVT::i32, Chain, FIPtr,
+                                NULL, 0, ObjectVT);
+          Load = DAG.getNode(ISD::TRUNCATE, dl, ObjectVT, Load);
+        }
+        InVals.push_back(Load);
+      }
+
+      ArgOffset += 4;
+      break;
+    case MVT::f32:
+      if (!Ins[i].Used) {                  // Argument is dead.
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        InVals.push_back(DAG.getUNDEF(ObjectVT));
+      } else if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+        // FP value is passed in an integer register.
+        unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+        MF.getRegInfo().addLiveIn(*CurArgReg++, VReg);
+        SDValue Arg = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+
+        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Arg);
+        InVals.push_back(Arg);
+      } else {
+        int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset,
+                                                            true, false);
+        SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+        SDValue Load = DAG.getLoad(MVT::f32, dl, Chain, FIPtr, NULL, 0);
+        InVals.push_back(Load);
+      }
+      ArgOffset += 4;
+      break;
+
+    // 64-bit values are split into two 32-bit halves; each half may live in
+    // a GPR or on the stack independently.
+    case MVT::i64:
+    case MVT::f64:
+      if (!Ins[i].Used) {                // Argument is dead.
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        if (CurArgReg < ArgRegEnd) ++CurArgReg;
+        InVals.push_back(DAG.getUNDEF(ObjectVT));
+      } else {
+        SDValue HiVal;
+        if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+          unsigned VRegHi = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+          MF.getRegInfo().addLiveIn(*CurArgReg++, VRegHi);
+          HiVal = DAG.getCopyFromReg(Chain, dl, VRegHi, MVT::i32);
+        } else {
+          int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset,
+                                                              true, false);
+          SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+          HiVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0);
+        }
+
+        SDValue LoVal;
+        if (CurArgReg < ArgRegEnd) {  // Lives in an incoming GPR
+          unsigned VRegLo = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+          MF.getRegInfo().addLiveIn(*CurArgReg++, VRegLo);
+          LoVal = DAG.getCopyFromReg(Chain, dl, VRegLo, MVT::i32);
+        } else {
+          int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset+4,
+                                                              true, false);
+          SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+          LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, NULL, 0);
+        }
+
+        // Compose the two halves together into an i64 unit.
+        SDValue WholeValue =
+          DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, LoVal, HiVal);
+
+        // If we want a double, do a bit convert.
+        if (ObjectVT == MVT::f64)
+          WholeValue = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, WholeValue);
+
+        InVals.push_back(WholeValue);
+      }
+      ArgOffset += 8;
+      break;
+    }
+  }
+
+  // Store remaining ArgRegs to the stack if this is a varargs function.
+  if (isVarArg) {
+    // Remember the vararg offset for the va_start implementation.
+    VarArgsFrameOffset = ArgOffset;
+
+    std::vector<SDValue> OutChains;
+
+    for (; CurArgReg != ArgRegEnd; ++CurArgReg) {
+      unsigned VReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+      MF.getRegInfo().addLiveIn(*CurArgReg, VReg);
+      SDValue Arg = DAG.getCopyFromReg(DAG.getRoot(), dl, VReg, MVT::i32);
+
+      int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, ArgOffset,
+                                                          true, false);
+      SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
+
+      OutChains.push_back(DAG.getStore(DAG.getRoot(), dl, Arg, FIPtr, NULL, 0));
+      ArgOffset += 4;
+    }
+
+    // Merge the register-spill stores into the returned chain.
+    if (!OutChains.empty()) {
+      OutChains.push_back(Chain);
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &OutChains[0], OutChains.size());
+    }
+  }
+
+  return Chain;
+}
+
+/// LowerCall - Lower an outgoing call for the 32-bit Sparc ABI.  Arguments
+/// are assigned manually (the generated CC_Sparc32 path is #if 0'd out until
+/// f64 is known to take two GPRs): the first six 32-bit slots go in the out
+/// registers, everything else is stored to the caller's outgoing-argument
+/// area starting at offset 68 from %sp.  Results are copied out of the
+/// physregs chosen by RetCC_Sparc32.
+SDValue
+SparcTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               bool &isTailCall,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) {
+  // Sparc target does not yet support tail call optimization.
+  isTailCall = false;
+
+#if 0
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getTarget(), ArgLocs);
+  CCInfo.AnalyzeCallOperands(Outs, CC_Sparc32);
+
+  // Get the size of the outgoing arguments stack space requirement.
+  unsigned ArgsSize = CCInfo.getNextStackOffset();
+  // FIXME: We can't use this until f64 is known to take two GPRs.
+#else
+  (void)CC_Sparc32;
+
+  // Count the size of the outgoing arguments: 4 bytes per 32-bit slot,
+  // 8 bytes for i64/f64.
+  unsigned ArgsSize = 0;
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    switch (Outs[i].Val.getValueType().getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unknown value type!");
+      case MVT::i1:
+      case MVT::i8:
+      case MVT::i16:
+      case MVT::i32:
+      case MVT::f32:
+        ArgsSize += 4;
+        break;
+      case MVT::i64:
+      case MVT::f64:
+        ArgsSize += 8;
+        break;
+    }
+  }
+  if (ArgsSize > 4*6)
+    ArgsSize -= 4*6;    // Space for first 6 arguments is prereserved.
+  else
+    ArgsSize = 0;
+#endif
+
+  // Keep stack frames 8-byte aligned.
+  ArgsSize = (ArgsSize+7) & ~7;
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, true));
+
+  // Register/value pairs for arguments passed in registers, and the chains
+  // of the stores for arguments passed in memory.
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+
+#if 0
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = Outs[i].Val;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+
+    // Arguments that can be passed on register must be kept at
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+      continue;
+    }
+
+    assert(VA.isMemLoc());
+
+    // Create a store off the stack pointer for this argument.
+    SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+    // FIXME: VERIFY THAT 68 IS RIGHT.
+    SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset()+68);
+    PtrOff = DAG.getNode(ISD::ADD, MVT::i32, StackPtr, PtrOff);
+    MemOpChains.push_back(DAG.getStore(Chain, Arg, PtrOff, NULL, 0));
+  }
+
+#else
+  // Argument registers, named from the callee's point of view (in regs);
+  // they are remapped to the caller's out registers O0-O5 below.
+  static const unsigned ArgRegs[] = {
+    SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
+  };
+  // First stack argument slot: offset 68 from %sp (past the 64-byte register
+  // save area and the 4-byte struct-return slot).
+  unsigned ArgOffset = 68;
+
+  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+    SDValue Val = Outs[i].Val;
+    EVT ObjectVT = Val.getValueType();
+    // Non-null only when (part of) the argument must go to memory.
+    SDValue ValToStore(0, 0);
+    unsigned ObjSize;
+    switch (ObjectVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unhandled argument type!");
+    case MVT::i32:
+      ObjSize = 4;
+
+      if (RegsToPass.size() >= 6) {
+        ValToStore = Val;
+      } else {
+        RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Val));
+      }
+      break;
+    case MVT::f32:
+      ObjSize = 4;
+      if (RegsToPass.size() >= 6) {
+        ValToStore = Val;
+      } else {
+        // Convert this to a FP value in an int reg.
+        Val = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Val);
+        RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Val));
+      }
+      break;
+    case MVT::f64: {
+      ObjSize = 8;
+      if (RegsToPass.size() >= 6) {
+        ValToStore = Val;    // Whole thing is passed in memory.
+        break;
+      }
+
+      // Break into top and bottom parts by storing to the stack and loading
+      // out the parts as integers.  Top part goes in a reg.
+      SDValue StackPtr = DAG.CreateStackTemporary(MVT::f64, MVT::i32);
+      SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, 
+                                   Val, StackPtr, NULL, 0);
+      // Sparc is big-endian, so the high part comes first.
+      SDValue Hi = DAG.getLoad(MVT::i32, dl, Store, StackPtr, NULL, 0, 0);
+      // Increment the pointer to the other half.
+      StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+                             DAG.getIntPtrConstant(4));
+      // Load the low part.
+      SDValue Lo = DAG.getLoad(MVT::i32, dl, Store, StackPtr, NULL, 0, 0);
+
+      RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Hi));
+
+      // If the high half consumed the last register, the low half is split
+      // out to memory.
+      if (RegsToPass.size() >= 6) {
+        ValToStore = Lo;
+        ArgOffset += 4;
+        ObjSize = 4;
+      } else {
+        RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Lo));
+      }
+      break;
+    }
+    case MVT::i64: {
+      ObjSize = 8;
+      if (RegsToPass.size() >= 6) {
+        ValToStore = Val;    // Whole thing is passed in memory.
+        break;
+      }
+
+      // Split the value into top and bottom part.  Top part goes in a reg.
+      SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Val,
+                                 DAG.getConstant(1, MVT::i32));
+      SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Val,
+                                 DAG.getConstant(0, MVT::i32));
+      RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Hi));
+
+      // If the high half consumed the last register, the low half is split
+      // out to memory.
+      if (RegsToPass.size() >= 6) {
+        ValToStore = Lo;
+        ArgOffset += 4;
+        ObjSize = 4;
+      } else {
+        RegsToPass.push_back(std::make_pair(ArgRegs[RegsToPass.size()], Lo));
+      }
+      break;
+    }
+    }
+
+    if (ValToStore.getNode()) {
+      SDValue StackPtr = DAG.getRegister(SP::O6, MVT::i32);
+      SDValue PtrOff = DAG.getConstant(ArgOffset, MVT::i32);
+      PtrOff = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, PtrOff);
+      MemOpChains.push_back(DAG.getStore(Chain, dl, ValToStore, 
+                                         PtrOff, NULL, 0));
+    }
+    ArgOffset += ObjSize;
+  }
+#endif
+
+  // Emit all stores, make sure they occur before any copies into physregs.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    unsigned Reg = RegsToPass[i].first;
+    // Remap I0->I7 -> O0->O7.  RegsToPass was recorded using the callee's
+    // in-register names; from the caller's side these are the out registers.
+    if (Reg >= SP::I0 && Reg <= SP::I7)
+      Reg = Reg-SP::I0+SP::O0;
+
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  std::vector<EVT> NodeTys;
+  NodeTys.push_back(MVT::Other);   // Returns a chain
+  NodeTys.push_back(MVT::Flag);    // Returns a flag for retval copy to use.
+  SDValue Ops[] = { Chain, Callee, InFlag };
+  // Only pass InFlag as an operand if some copy-to-reg produced one.
+  Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops, InFlag.getNode() ? 3 : 2);
+  InFlag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
+                             DAG.getIntPtrConstant(0, true), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState RVInfo(CallConv, isVarArg, DAG.getTarget(),
+                 RVLocs, *DAG.getContext());
+
+  RVInfo.AnalyzeCallResult(Ins, RetCC_Sparc32);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    unsigned Reg = RVLocs[i].getLocReg();
+
+    // Remap I0->I7 -> O0->O7.
+    if (Reg >= SP::I0 && Reg <= SP::I7)
+      Reg = Reg-SP::I0+SP::O0;
+
+    Chain = DAG.getCopyFromReg(Chain, dl, Reg,
+                               RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+
+
+//===----------------------------------------------------------------------===//
+// TargetLowering Implementation
+//===----------------------------------------------------------------------===//
+
+/// IntCondCCodeToICC - Convert a DAG integer condition code to a SPARC ICC
+/// condition.  Unordered/ordered FP codes are not handled here (and hit the
+/// unreachable); they go through FPCondCCodeToFCC instead.
+static SPCC::CondCodes IntCondCCodeToICC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown integer condition code!");
+  case ISD::SETEQ:  return SPCC::ICC_E;
+  case ISD::SETNE:  return SPCC::ICC_NE;
+  case ISD::SETLT:  return SPCC::ICC_L;
+  case ISD::SETGT:  return SPCC::ICC_G;
+  case ISD::SETLE:  return SPCC::ICC_LE;
+  case ISD::SETGE:  return SPCC::ICC_GE;
+  case ISD::SETULT: return SPCC::ICC_CS;
+  case ISD::SETULE: return SPCC::ICC_LEU;
+  case ISD::SETUGT: return SPCC::ICC_GU;
+  case ISD::SETUGE: return SPCC::ICC_CC;
+  }
+}
+
+/// FPCondCCodeToFCC - Convert a DAG floating point condition code to a SPARC
+/// FCC condition.
+static SPCC::CondCodes FPCondCCodeToFCC(ISD::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Unknown fp condition code!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ: return SPCC::FCC_E;
+  case ISD::SETNE:
+  case ISD::SETUNE: return SPCC::FCC_NE;
+  case ISD::SETLT:
+  case ISD::SETOLT: return SPCC::FCC_L;
+  case ISD::SETGT:
+  case ISD::SETOGT: return SPCC::FCC_G;
+  case ISD::SETLE:
+  case ISD::SETOLE: return SPCC::FCC_LE;
+  case ISD::SETGE:
+  case ISD::SETOGE: return SPCC::FCC_GE;
+  case ISD::SETULT: return SPCC::FCC_UL;
+  case ISD::SETULE: return SPCC::FCC_ULE;
+  case ISD::SETUGT: return SPCC::FCC_UG;
+  case ISD::SETUGE: return SPCC::FCC_UGE;
+  case ISD::SETUO:  return SPCC::FCC_U;
+  case ISD::SETO:   return SPCC::FCC_O;
+  case ISD::SETONE: return SPCC::FCC_LG;
+  case ISD::SETUEQ: return SPCC::FCC_UE;
+  }
+}
+
+/// SparcTargetLowering - Set up the Sparc lowering configuration: register
+/// the legal register classes, and mark which generic SelectionDAG
+/// operations must be expanded, promoted, or custom-lowered for Sparc.
+SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
+  : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, SP::IntRegsRegisterClass);
+  addRegisterClass(MVT::f32, SP::FPRegsRegisterClass);
+  addRegisterClass(MVT::f64, SP::DFPRegsRegisterClass);
+
+  // Turn FP extload into load/fextend
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  // Sparc doesn't have i1 sign extending load
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  // Turn FP truncstore into trunc + store.
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+  // Custom legalize GlobalAddress nodes into LO/HI parts.
+  setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool , MVT::i32, Custom);
+
+  // Sparc doesn't have sext_inreg, replace them with shl/sra
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
+
+  // Sparc has no REM or DIVREM operations.
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+
+  // Custom expand fp<->sint
+  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+
+  // Expand fp<->uint
+  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);
+
+  setOperationAction(ISD::BIT_CONVERT, MVT::f32, Expand);
+  setOperationAction(ISD::BIT_CONVERT, MVT::i32, Expand);
+
+  // Sparc has no select or setcc: expand to SELECT_CC.
+  setOperationAction(ISD::SELECT, MVT::i32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f32, Expand);
+  setOperationAction(ISD::SELECT, MVT::f64, Expand);
+  setOperationAction(ISD::SETCC, MVT::i32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f32, Expand);
+  setOperationAction(ISD::SETCC, MVT::f64, Expand);
+
+  // Sparc doesn't have BRCOND either, it has BR_CC.
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
+
+  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
+
+  // SPARC has no intrinsics for these particular operations.
+  setOperationAction(ISD::MEMBARRIER, MVT::Other, Expand);
+
+  setOperationAction(ISD::FSIN , MVT::f64, Expand);
+  setOperationAction(ISD::FCOS , MVT::f64, Expand);
+  setOperationAction(ISD::FREM , MVT::f64, Expand);
+  setOperationAction(ISD::FSIN , MVT::f32, Expand);
+  setOperationAction(ISD::FCOS , MVT::f32, Expand);
+  setOperationAction(ISD::FREM , MVT::f32, Expand);
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ , MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ , MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW , MVT::f64, Expand);
+  setOperationAction(ISD::FPOW , MVT::f32, Expand);
+
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+
+  // FIXME: Sparc provides these multiplies, but we don't have them yet.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex.
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  // VAARG needs to be lowered to not do unaligned accesses for doubles.
+  setOperationAction(ISD::VAARG             , MVT::Other, Custom);
+
+  // Use the default implementation.
+  setOperationAction(ISD::VACOPY            , MVT::Other, Expand);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  setOperationAction(ISD::STACKSAVE         , MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE      , MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32  , Custom);
+
+  // No debug info support yet.
+  // (This was previously registered twice; the redundant duplicate call
+  // earlier in this constructor has been removed.)
+  setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+
+  setStackPointerRegisterToSaveRestore(SP::O6);
+
+  // V9 has population-count support, so CTPOP is legal there.
+  if (TM.getSubtarget<SparcSubtarget>().isV9())
+    setOperationAction(ISD::CTPOP, MVT::i32, Legal);
+
+  computeRegisterProperties();
+}
+
+/// getTargetNodeName - Return a human-readable name for the given
+/// target-specific SPISD opcode, or null for opcodes this switch doesn't
+/// know about (the generic printer handles those).
+const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default: return 0;
+  case SPISD::CMPICC:     return "SPISD::CMPICC";
+  case SPISD::CMPFCC:     return "SPISD::CMPFCC";
+  case SPISD::BRICC:      return "SPISD::BRICC";
+  case SPISD::BRFCC:      return "SPISD::BRFCC";
+  case SPISD::SELECT_ICC: return "SPISD::SELECT_ICC";
+  case SPISD::SELECT_FCC: return "SPISD::SELECT_FCC";
+  case SPISD::Hi:         return "SPISD::Hi";
+  case SPISD::Lo:         return "SPISD::Lo";
+  case SPISD::FTOI:       return "SPISD::FTOI";
+  case SPISD::ITOF:       return "SPISD::ITOF";
+  case SPISD::CALL:       return "SPISD::CALL";
+  case SPISD::RET_FLAG:   return "SPISD::RET_FLAG";
+  }
+}
+
+/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one in the result of Op, and return
+/// them in KnownZero/KnownOne.  Op is expected to be a target specific node.
+/// Used by the DAG combiner.  For a select, a bit is only known if it is
+/// known the same way in both possible operands.
+void SparcTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                         const APInt &Mask,
+                                                         APInt &KnownZero,
+                                                         APInt &KnownOne,
+                                                         const SelectionDAG &DAG,
+                                                         unsigned Depth) const {
+  APInt KnownZero2, KnownOne2;
+  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
+
+  switch (Op.getOpcode()) {
+  default: break;
+  case SPISD::SELECT_ICC:
+  case SPISD::SELECT_FCC:
+    DAG.ComputeMaskedBits(Op.getOperand(1), Mask, KnownZero, KnownOne,
+                          Depth+1);
+    DAG.ComputeMaskedBits(Op.getOperand(0), Mask, KnownZero2, KnownOne2,
+                          Depth+1);
+    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  }
+}
+
+// Look at LHS/RHS/CC and see if they are a lowered setcc instruction, i.e. a
+// pattern of the form (select_cc 1, 0, cc, (cmp ...)) != 0.  If so, replace
+// LHS/RHS with the operands of the original compare and set SPCC to the
+// select's condition, so the caller can branch/select on the original
+// comparison directly instead of re-comparing the 0/1 result.
+static void LookThroughSetCC(SDValue &LHS, SDValue &RHS,
+                             ISD::CondCode CC, unsigned &SPCC) {
+  if (isa<ConstantSDNode>(RHS) &&
+      cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+      CC == ISD::SETNE &&
+      ((LHS.getOpcode() == SPISD::SELECT_ICC &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPICC) ||
+       (LHS.getOpcode() == SPISD::SELECT_FCC &&
+        LHS.getOperand(3).getOpcode() == SPISD::CMPFCC)) &&
+      isa<ConstantSDNode>(LHS.getOperand(0)) &&
+      isa<ConstantSDNode>(LHS.getOperand(1)) &&
+      cast<ConstantSDNode>(LHS.getOperand(0))->getZExtValue() == 1 &&
+      cast<ConstantSDNode>(LHS.getOperand(1))->getZExtValue() == 0) {
+    SDValue CMPCC = LHS.getOperand(3);
+    SPCC = cast<ConstantSDNode>(LHS.getOperand(2))->getZExtValue();
+    LHS = CMPCC.getOperand(0);
+    RHS = CMPCC.getOperand(1);
+  }
+}
+
+/// LowerGlobalAddress - Lower a GlobalAddress node into SPISD::Hi/Lo parts.
+/// Non-PIC: the address is simply (Hi | Lo).  PIC: Hi/Lo form a GOT-relative
+/// offset which is added to the global base register and loaded through.
+SDValue SparcTargetLowering::LowerGlobalAddress(SDValue Op, 
+                                                SelectionDAG &DAG) {
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  // FIXME there isn't really any debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+  SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, GA);
+  SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, GA);
+
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_) 
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+  
+  SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, dl,
+                                   getPointerTy());
+  SDValue RelAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+  SDValue AbsAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, 
+                                GlobalBase, RelAddr);
+  // Load the actual address of the global out of the GOT slot.
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 
+                     AbsAddr, NULL, 0);
+}
+
+/// LowerConstantPool - Lower a ConstantPool node into SPISD::Hi/Lo parts,
+/// mirroring LowerGlobalAddress: direct (Hi | Lo) for non-PIC, and an
+/// indirect load through the global base register for PIC.
+SDValue SparcTargetLowering::LowerConstantPool(SDValue Op,
+                                               SelectionDAG &DAG) {
+  ConstantPoolSDNode *N = cast<ConstantPoolSDNode>(Op);
+  // FIXME there isn't really any debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  Constant *C = N->getConstVal();
+  SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment());
+  SDValue Hi = DAG.getNode(SPISD::Hi, dl, MVT::i32, CP);
+  SDValue Lo = DAG.getNode(SPISD::Lo, dl, MVT::i32, CP);
+  if (getTargetMachine().getRelocationModel() != Reloc::PIC_) 
+    return DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+
+  SDValue GlobalBase = DAG.getNode(SPISD::GLOBAL_BASE_REG, dl, 
+                                   getPointerTy());
+  SDValue RelAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Lo, Hi);
+  SDValue AbsAddr = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                GlobalBase, RelAddr);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), 
+                     AbsAddr, NULL, 0);
+}
+
+/// LowerFP_TO_SINT - Lower fp-to-signed-int: do the conversion inside an FP
+/// register via SPISD::FTOI, then bitconvert the f32 bit pattern to i32.
+static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Convert the fp value to integer in an FP register.
+  assert(Op.getValueType() == MVT::i32);
+  Op = DAG.getNode(SPISD::FTOI, dl, MVT::f32, Op.getOperand(0));
+  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Op);
+}
+
+/// LowerSINT_TO_FP - Lower signed-int-to-fp: move the i32 bit pattern into
+/// an FP register via bitconvert, then convert in place with SPISD::ITOF.
+static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  assert(Op.getOperand(0).getValueType() == MVT::i32);
+  SDValue Tmp = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Op.getOperand(0));
+  // Convert the int value to FP in an FP register.
+  return DAG.getNode(SPISD::ITOF, dl, Op.getValueType(), Tmp);
+}
+
+/// LowerBR_CC - Lower a BR_CC node into a compare (CMPICC/CMPFCC) producing
+/// a flag, followed by a conditional branch (BRICC/BRFCC) on that flag.
+static SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a br_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  // On a match SPCC is set and the IntCondCCodeToICC/FPCondCCodeToFCC
+  // translation below is skipped.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  // Get the condition flag.
+  SDValue CompareFlag;
+  if (LHS.getValueType() == MVT::i32) {
+    std::vector<EVT> VTs;
+    VTs.push_back(MVT::i32);
+    VTs.push_back(MVT::Flag);
+    SDValue Ops[2] = { LHS, RHS };
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+    Opc = SPISD::BRICC;
+  } else {
+    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Flag, LHS, RHS);
+    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+    Opc = SPISD::BRFCC;
+  }
+  return DAG.getNode(Opc, dl, MVT::Other, Chain, Dest,
+                     DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+/// LowerSELECT_CC - Lower a SELECT_CC node into a compare (CMPICC/CMPFCC)
+/// producing a flag, followed by a SELECT_ICC/SELECT_FCC that picks between
+/// TrueVal and FalseVal based on that flag.
+static SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  SDValue TrueVal = Op.getOperand(2);
+  SDValue FalseVal = Op.getOperand(3);
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Opc, SPCC = ~0U;
+
+  // If this is a select_cc of a "setcc", and if the setcc got lowered into
+  // an CMP[IF]CC/SELECT_[IF]CC pair, find the original compared values.
+  LookThroughSetCC(LHS, RHS, CC, SPCC);
+
+  SDValue CompareFlag;
+  if (LHS.getValueType() == MVT::i32) {
+    std::vector<EVT> VTs;
+    VTs.push_back(LHS.getValueType());   // subcc returns a value
+    VTs.push_back(MVT::Flag);
+    SDValue Ops[2] = { LHS, RHS };
+    CompareFlag = DAG.getNode(SPISD::CMPICC, dl, VTs, Ops, 2).getValue(1);
+    Opc = SPISD::SELECT_ICC;
+    if (SPCC == ~0U) SPCC = IntCondCCodeToICC(CC);
+  } else {
+    CompareFlag = DAG.getNode(SPISD::CMPFCC, dl, MVT::Flag, LHS, RHS);
+    Opc = SPISD::SELECT_FCC;
+    if (SPCC == ~0U) SPCC = FPCondCCodeToFCC(CC);
+  }
+  return DAG.getNode(Opc, dl, TrueVal.getValueType(), TrueVal, FalseVal,
+                     DAG.getConstant(SPCC, MVT::i32), CompareFlag);
+}
+
+/// LowerVASTART - Lower a VASTART node: compute the address of the varargs
+/// area (frame pointer %i6 plus the function's vararg frame offset) and
+/// store it into the va_list memory location.
+static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG,
+                              SparcTargetLowering &TLI) {
+  // vastart just stores the address of the VarArgsFrameIndex slot into the
+  // memory location argument.
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Offset = DAG.getNode(ISD::ADD, dl, MVT::i32,
+                                 DAG.getRegister(SP::I6, MVT::i32),
+                                 DAG.getConstant(TLI.getVarArgsFrameOffset(),
+                                                 MVT::i32));
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, Offset, Op.getOperand(1), SV, 0);
+}
+
+/// LowerVAARG - Lower a VAARG node: load the current va_list pointer, bump
+/// it past this argument, store it back, then load the argument itself.
+/// f64 arguments are loaded as i64 and bitconverted, because the vararg
+/// slot may only be 4-byte aligned and a direct f64 load would be an
+/// unaligned access.
+static SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  EVT VT = Node->getValueType(0);
+  SDValue InChain = Node->getOperand(0);
+  SDValue VAListPtr = Node->getOperand(1);
+  const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  DebugLoc dl = Node->getDebugLoc();
+  SDValue VAList = DAG.getLoad(MVT::i32, dl, InChain, VAListPtr, SV, 0);
+  // Increment the pointer, VAList, to the next vaarg
+  SDValue NextPtr = DAG.getNode(ISD::ADD, dl, MVT::i32, VAList,
+                                  DAG.getConstant(VT.getSizeInBits()/8,
+                                                  MVT::i32));
+  // Store the incremented VAList to the legalized pointer
+  InChain = DAG.getStore(VAList.getValue(1), dl, NextPtr,
+                         VAListPtr, SV, 0);
+  // Load the actual argument out of the pointer VAList, unless this is an
+  // f64 load.
+  if (VT != MVT::f64)
+    return DAG.getLoad(VT, dl, InChain, VAList, NULL, 0);
+
+  // Otherwise, load it as i64, then do a bitconvert.
+  SDValue V = DAG.getLoad(MVT::i64, dl, InChain, VAList, NULL, 0);
+
+  // Bit-Convert the value to f64.  Return both the value and the load's
+  // output chain as merged results.
+  SDValue Ops[2] = {
+    DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f64, V),
+    V.getValue(1)
+  };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+/// LowerDYNAMIC_STACKALLOC - Lower a dynamic stack allocation: decrement the
+/// stack pointer (%o6) by the requested size and return a pointer above the
+/// reserved spill area.
+static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) {
+  SDValue Chain = Op.getOperand(0);  // Legalize the chain.
+  SDValue Size  = Op.getOperand(1);  // Legalize the size.
+  DebugLoc dl = Op.getDebugLoc();
+
+  unsigned SPReg = SP::O6;
+  SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, MVT::i32);
+  SDValue NewSP = DAG.getNode(ISD::SUB, dl, MVT::i32, SP, Size); // Value
+  Chain = DAG.getCopyToReg(SP.getValue(1), dl, SPReg, NewSP);    // Output chain
+
+  // The resultant pointer is actually 16 words from the bottom of the stack,
+  // to provide a register spill area.
+  // NOTE(review): the constant is 96 bytes (24 words), not 16 words (64
+  // bytes) as the comment says -- presumably it also covers the hidden
+  // struct-return slot and the 6 reserved argument slots; confirm against
+  // the SPARC V8 stack frame layout.
+  SDValue NewVal = DAG.getNode(ISD::ADD, dl, MVT::i32, NewSP,
+                                 DAG.getConstant(96, MVT::i32));
+  // Return the adjusted pointer and the updated chain as merged results.
+  SDValue Ops[2] = { NewVal, Chain };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+
+/// LowerOperation - Dispatch a custom-lowered operation (those marked Custom
+/// in the constructor) to its specific lowering routine.
+SDValue SparcTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Should not custom lower this!");
+  // Frame & Return address.  Currently unimplemented
+  case ISD::RETURNADDR: return SDValue();
+  case ISD::FRAMEADDR:  return SDValue();
+  case ISD::GlobalTLSAddress:
+    llvm_unreachable("TLS not implemented for Sparc.");
+  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
+  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
+  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
+  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
+  case ISD::BR_CC:              return LowerBR_CC(Op, DAG);
+  case ISD::SELECT_CC:          return LowerSELECT_CC(Op, DAG);
+  case ISD::VASTART:            return LowerVASTART(Op, DAG, *this);
+  case ISD::VAARG:              return LowerVAARG(Op, DAG);
+  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  }
+}
+
+/// EmitInstrWithCustomInserter - Expand a SELECT_CC pseudo instruction into
+/// the diamond control-flow pattern: a conditional branch over a fallthrough
+/// block, joined by a PHI that merges the true/false values.  EM records
+/// block remappings for the selection-DAG scheduler's machine-CFG bookkeeping.
+MachineBasicBlock *
+SparcTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  unsigned BROpcode;
+  unsigned CC;
+  DebugLoc dl = MI->getDebugLoc();
+  // Figure out the conditional branch opcode to use for this select_cc:
+  // integer-flag selects branch with BCOND, fp-flag selects with FBCOND.
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown SELECT_CC!");
+  case SP::SELECT_CC_Int_ICC:
+  case SP::SELECT_CC_FP_ICC:
+  case SP::SELECT_CC_DFP_ICC:
+    BROpcode = SP::BCOND;
+    break;
+  case SP::SELECT_CC_Int_FCC:
+  case SP::SELECT_CC_FP_FCC:
+  case SP::SELECT_CC_DFP_FCC:
+    BROpcode = SP::FBCOND;
+    break;
+  }
+
+  // Operand 3 of the pseudo carries the SPCC condition code immediate.
+  CC = (SPCC::CondCodes)MI->getOperand(3).getImm();
+
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   [f]bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  BuildMI(BB, dl, TII.get(BROpcode)).addMBB(sinkMBB).addImm(CC);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+  // Update machine-CFG edges by first adding all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  // Also inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 
+         E = BB->succ_end(); I != E; ++I) {
+    EM->insert(std::make_pair(*I, sinkMBB));
+    sinkMBB->addSuccessor(*I);
+  }
+  // Next, remove all successors of the current block, and add the true
+  // and fallthrough blocks as its successors.
+  while (!BB->succ_empty())
+    BB->removeSuccessor(BB->succ_begin());
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  BuildMI(BB, dl, TII.get(SP::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB);
+
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+//                         Sparc Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.  Only 'r' (general register) is handled
+/// specially; everything else falls back to the generic implementation.
+SparcTargetLowering::ConstraintType
+SparcTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:  break;
+    case 'r': return C_RegisterClass;
+    }
+  }
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Map the inline-asm constraint 'r' to the
+/// Sparc integer register class (with no specific register preference,
+/// hence the 0 register number); defer everything else to the base class.
+std::pair<unsigned, const TargetRegisterClass*>
+SparcTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
+                                                  EVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      return std::make_pair(0U, SP::IntRegsRegisterClass);
+    }
+  }
+
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+/// getRegClassForInlineAsmConstraint - Return the explicit pool of registers
+/// usable for the 'r' inline-asm constraint.  NOTE(review): the list omits
+/// the globals G0-G7, the stack/frame pointers O6/I6, and I7/O7's
+/// counterpart I7 -- presumably to avoid reserved and special-purpose
+/// registers; confirm against the Sparc register usage conventions.
+std::vector<unsigned> SparcTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  EVT VT) const {
+  if (Constraint.size() != 1)
+    return std::vector<unsigned>();
+
+  switch (Constraint[0]) {
+  default: break;
+  case 'r':
+    return make_vector<unsigned>(SP::L0, SP::L1, SP::L2, SP::L3,
+                                 SP::L4, SP::L5, SP::L6, SP::L7,
+                                 SP::I0, SP::I1, SP::I2, SP::I3,
+                                 SP::I4, SP::I5,
+                                 SP::O0, SP::O1, SP::O2, SP::O3,
+                                 SP::O4, SP::O5, SP::O7, 0);
+  }
+
+  return std::vector<unsigned>();
+}
+
+/// isOffsetFoldingLegal - Return whether a constant offset can be folded
+/// into a global address reference.  Always false for now.
+bool
+SparcTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+  // The Sparc target isn't yet aware of offsets.
+  return false;
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+/// 2 means 4-byte alignment, the Sparc instruction size.
+unsigned SparcTargetLowering::getFunctionAlignment(const Function *) const {
+  return 2;
+}
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
new file mode 100644
index 0000000..2ee73c1
--- /dev/null
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -0,0 +1,107 @@
+//===-- SparcISelLowering.h - Sparc DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that Sparc uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_ISELLOWERING_H
+#define SPARC_ISELLOWERING_H
+
+#include "llvm/Target/TargetLowering.h"
+#include "Sparc.h"
+
+namespace llvm {
+  namespace SPISD {
+    // Sparc-specific SelectionDAG node opcodes, numbered starting after the
+    // last target-independent opcode.
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+      CMPICC,      // Compare two GPR operands, set icc.
+      CMPFCC,      // Compare two FP operands, set fcc.
+      BRICC,       // Branch to dest on icc condition
+      BRFCC,       // Branch to dest on fcc condition
+      SELECT_ICC,  // Select between two values using the current ICC flags.
+      SELECT_FCC,  // Select between two values using the current FCC flags.
+
+      Hi, Lo,      // Hi/Lo operations, typically on a global address.
+
+      FTOI,        // FP to Int within a FP register.
+      ITOF,        // Int to FP within a FP register.
+
+      CALL,        // A call instruction.
+      RET_FLAG,    // Return with a flag operand.
+      GLOBAL_BASE_REG // Global base reg for PIC
+    };
+  }
+
+  /// SparcTargetLowering - Sparc implementation of the TargetLowering
+  /// interface: custom operation lowering, inline-asm constraint handling,
+  /// and calling-convention lowering.
+  class SparcTargetLowering : public TargetLowering {
+    int VarArgsFrameOffset;   // Frame offset to start of varargs area.
+  public:
+    SparcTargetLowering(TargetMachine &TM);
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+    /// in Mask are known to be either zero or one and return them in the
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                const APInt &Mask,
+                                                APInt &KnownZero,
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                         MachineBasicBlock *MBB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    // Inline-asm constraint support (see SparcISelLowering.cpp).
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+    std::vector<unsigned>
+    getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                      EVT VT) const;
+
+    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+
+    // Calling-convention lowering hooks.
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv,
+                           bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg,
+                bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    // Node-specific lowering helpers -- presumably dispatched from
+    // LowerOperation; confirm in the .cpp.
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+  };
+} // end namespace llvm
+
+#endif    // SPARC_ISELLOWERING_H
diff --git a/lib/Target/Sparc/SparcInstrFormats.td b/lib/Target/Sparc/SparcInstrFormats.td
new file mode 100644
index 0000000..6535259
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrFormats.td
@@ -0,0 +1,114 @@
+//===- SparcInstrFormats.td - Sparc Instruction Formats ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+// InstSP - Common base of all Sparc instructions: holds the 32-bit encoding
+// ('Inst', filled in by the format subclasses below) and the operand lists.
+class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "SP";
+
+  bits<2> op;
+  let Inst{31-30} = op;               // Top two bits are the 'op' field
+  
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #2 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// Format 2 instructions
+// Common to all format-2 forms: op=0, op2 selects the variant, and the low
+// 22 bits hold the immediate.
+class F2<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSP<outs, ins, asmstr, pattern> {
+  bits<3>  op2;
+  bits<22> imm22;
+  let op          = 0;    // op = 0
+  let Inst{24-22} = op2;
+  let Inst{21-0}  = imm22;
+}
+
+// Specific F2 classes: SparcV8 manual, page 44
+//
+// F2_1: format-2 form with a destination register in bits 29-25 (e.g. SETHI).
+class F2_1<bits<3> op2Val, dag outs, dag ins, string asmstr, list<dag> pattern>
+   : F2<outs, ins, asmstr, pattern> {
+  bits<5>  rd;
+
+  let op2         = op2Val;
+
+  let Inst{29-25} = rd;
+}
+
+// F2_2: format-2 branch form; the condition field and annul bit occupy the
+// bits F2_1 uses for rd.
+class F2_2<bits<4> condVal, bits<3> op2Val, dag outs, dag ins, string asmstr, 
+           list<dag> pattern> : F2<outs, ins, asmstr, pattern> {
+  bits<4>   cond;
+  bit       annul = 0;     // currently unused
+
+  let cond        = condVal;
+  let op2         = op2Val;
+
+  let Inst{29}    = annul;
+  let Inst{28-25} = cond;
+}
+
+//===----------------------------------------------------------------------===//
+// Format #3 instruction classes in the Sparc
+//===----------------------------------------------------------------------===//
+
+// F3 - Common fields of format-3 instructions (rd/op3/rs1); the subclasses
+// below supply the final 'op' value and the low 14 bits.
+class F3<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstSP<outs, ins, asmstr, pattern> {
+  bits<5> rd;
+  bits<6> op3;
+  bits<5> rs1;
+  let op{1} = 1;   // Op = 2 or 3
+  let Inst{29-25} = rd;
+  let Inst{24-19} = op3;
+  let Inst{18-14} = rs1;
+}
+
+// Specific F3 classes: SparcV8 manual, page 44
+//
+// F3_1: register-register form (i field = 0); second source is rs2.
+class F3_1<bits<2> opVal, bits<6> op3val, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<8> asi = 0; // asi not currently used
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 0;     // i field = 0
+  let Inst{12-5} = asi;   // address space identifier
+  let Inst{4-0}  = rs2;
+}
+
+// F3_2: register-immediate form (i field = 1); second source is a 13-bit
+// signed immediate.
+class F3_2<bits<2> opVal, bits<6> op3val, dag outs, dag ins, 
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<13> simm13;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13}   = 1;     // i field = 1
+  let Inst{12-0} = simm13;
+}
+
+// floating-point
+// F3_3: FP-operate form; bits 13-5 hold the 9-bit fp sub-opcode (opf).
+class F3_3<bits<2> opVal, bits<6> op3val, bits<9> opfval, dag outs, dag ins,
+           string asmstr, list<dag> pattern> : F3<outs, ins, asmstr, pattern> {
+  bits<5> rs2;
+
+  let op         = opVal;
+  let op3        = op3val;
+
+  let Inst{13-5} = opfval;   // fp opcode
+  let Inst{4-0}  = rs2;
+}
+
+
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
new file mode 100644
index 0000000..8667bca
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -0,0 +1,261 @@
+//===- SparcInstrInfo.cpp - Sparc Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+#include "Sparc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "SparcGenInstrInfo.inc"
+#include "SparcMachineFunctionInfo.h"
+using namespace llvm;
+
+// Construct the instruction info: hand the TableGen-generated opcode table
+// to the base class and set up the register info for this subtarget.
+SparcInstrInfo::SparcInstrInfo(SparcSubtarget &ST)
+  : TargetInstrInfoImpl(SparcInsts, array_lengthof(SparcInsts)),
+    RI(ST, *this), Subtarget(ST) {
+}
+
+// True iff the operand is an immediate with value zero.
+static bool isZeroImm(const MachineOperand &op) {
+  if (!op.isImm())
+    return false;
+  return op.getImm() == 0;
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+///
+bool SparcInstrInfo::isMoveInstr(const MachineInstr &MI,
+                                 unsigned &SrcReg, unsigned &DstReg,
+                                 unsigned &SrcSR, unsigned &DstSR) const {
+  SrcSR = DstSR = 0; // No sub-registers.
+
+  // We look for 3 kinds of patterns here:
+  // or with G0 or 0
+  // add with G0 or 0
+  // fmovs or FpMOVD (pseudo double move).
+  if (MI.getOpcode() == SP::ORrr || MI.getOpcode() == SP::ADDrr) {
+    // OR/ADD with %g0 in either source position just copies the other
+    // source register.
+    if (MI.getOperand(1).getReg() == SP::G0) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(2).getReg();
+      return true;
+    } else if (MI.getOperand(2).getReg() == SP::G0) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(1).getReg();
+      return true;
+    }
+  } else if ((MI.getOpcode() == SP::ORri || MI.getOpcode() == SP::ADDri) &&
+             isZeroImm(MI.getOperand(2)) && MI.getOperand(1).isReg()) {
+    // OR/ADD with immediate zero: reg -> reg copy.
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  } else if (MI.getOpcode() == SP::FMOVS || MI.getOpcode() == SP::FpMOVD ||
+             MI.getOpcode() == SP::FMOVD) {
+    // Single/double FP moves (FpMOVD is the V8 pseudo, see SparcInstrInfo.td).
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    return true;
+  }
+  return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned SparcInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                             int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case SP::LDri:
+  case SP::LDFri:
+  case SP::LDDFri:
+    // Only a [frame-index + 0] address counts as a direct stack-slot load.
+    if (MI->getOperand(1).isFI() && MI->getOperand(2).isImm() &&
+        MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned SparcInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                            int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case SP::STri:
+  case SP::STFri:
+  case SP::STDFri:
+    // Stores put the address first: operands are (frame-index, offset, src).
+    if (MI->getOperand(0).isFI() && MI->getOperand(1).isImm() &&
+        MI->getOperand(1).getImm() == 0) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(2).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// InsertBranch - Insert an unconditional branch to TBB at the end of MBB and
+/// return the number of instructions inserted (always 1 here).  Conditional
+/// branches (non-empty Cond / FBB) are not supported yet.
+unsigned
+SparcInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond)const{
+  // FIXME this should probably take a DebugLoc argument
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+  // Can only insert uncond branches so far.
+  assert(Cond.empty() && !FBB && TBB && "Can only handle uncond branches!");
+  BuildMI(&MBB, dl, get(SP::BA)).addMBB(TBB);
+  return 1;
+}
+
+/// copyRegToReg - Emit a register-to-register copy before iterator I.
+/// Returns false if the copy is not supported (cross-class, or an unknown
+/// register class).
+bool SparcInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  const TargetRegisterClass *DestRC,
+                                  const TargetRegisterClass *SrcRC) const {
+  if (DestRC != SrcRC) {
+    // Not yet supported!
+    return false;
+  }
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  // Integer copies use "or %g0, src, dst"; FP copies use fmovs, and double
+  // copies use fmovd on V9 or the FpMOVD pseudo on V8.
+  if (DestRC == SP::IntRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::ORrr), DestReg).addReg(SP::G0).addReg(SrcReg);
+  else if (DestRC == SP::FPRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::FMOVS), DestReg).addReg(SrcReg);
+  else if (DestRC == SP::DFPRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(Subtarget.isV9() ? SP::FMOVD : SP::FpMOVD),DestReg)
+      .addReg(SrcReg);
+  else
+    // Can't copy this register
+    return false;
+
+  return true;
+}
+
+/// storeRegToStackSlot - Emit a store of SrcReg to stack slot FI before
+/// iterator I, choosing the store opcode by register class.
+void SparcInstrInfo::
+storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned SrcReg, bool isKill, int FI,
+                    const TargetRegisterClass *RC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  // On the order of operands here: think "[FrameIdx + 0] = SrcReg".
+  if (RC == SP::IntRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::STri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg, getKillRegState(isKill));
+  else if (RC == SP::FPRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::STFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill));
+  else if (RC == SP::DFPRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::STDFri)).addFrameIndex(FI).addImm(0)
+      .addReg(SrcReg,  getKillRegState(isKill));
+  else
+    llvm_unreachable("Can't store this register to stack slot");
+}
+
+/// loadRegFromStackSlot - Emit a load of stack slot FI into DestReg before
+/// iterator I, choosing the load opcode by register class.
+void SparcInstrInfo::
+loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                     unsigned DestReg, int FI,
+                     const TargetRegisterClass *RC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (RC == SP::IntRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::LDri), DestReg).addFrameIndex(FI).addImm(0);
+  else if (RC == SP::FPRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::LDFri), DestReg).addFrameIndex(FI).addImm(0);
+  else if (RC == SP::DFPRegsRegisterClass)
+    BuildMI(MBB, I, DL, get(SP::LDDFri), DestReg).addFrameIndex(FI).addImm(0);
+  else
+    llvm_unreachable("Can't load this register from stack slot");
+}
+
+/// foldMemoryOperandImpl - Attempt to fold operand Ops[0] of a copy-like
+/// instruction (OR-with-%g0 or an FP move) into a direct load from / store
+/// to frame index FI.  Returns the replacement instruction, or null if no
+/// folding was performed.
+MachineInstr *SparcInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                    MachineInstr* MI,
+                                          const SmallVectorImpl<unsigned> &Ops,
+                                                    int FI) const {
+  // Only single-operand folds are handled.
+  if (Ops.size() != 1) return NULL;
+
+  unsigned OpNum = Ops[0];
+  bool isFloat = false;
+  MachineInstr *NewMI = NULL;
+  switch (MI->getOpcode()) {
+  case SP::ORrr:
+    // Only "or %g0, rs2, rd" (a plain copy) can be folded.
+    if (MI->getOperand(1).isReg() && MI->getOperand(1).getReg() == SP::G0&&
+        MI->getOperand(0).isReg() && MI->getOperand(2).isReg()) {
+      // NOTE(review): unlike the FMOV path below, this path does not forward
+      // the kill/undef flags from the folded operand -- confirm intentional.
+      if (OpNum == 0)    // COPY -> STORE
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(SP::STri))
+          .addFrameIndex(FI)
+          .addImm(0)
+          .addReg(MI->getOperand(2).getReg());
+      else               // COPY -> LOAD
+        NewMI = BuildMI(MF, MI->getDebugLoc(), get(SP::LDri),
+                        MI->getOperand(0).getReg())
+          .addFrameIndex(FI)
+          .addImm(0);
+    }
+    break;
+  case SP::FMOVS:
+    isFloat = true;
+    // FALLTHROUGH
+  case SP::FMOVD:
+    if (OpNum == 0) { // COPY -> STORE
+      unsigned SrcReg = MI->getOperand(1).getReg();
+      bool isKill = MI->getOperand(1).isKill();
+      bool isUndef = MI->getOperand(1).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(isFloat ? SP::STFri : SP::STDFri))
+        .addFrameIndex(FI)
+        .addImm(0)
+        .addReg(SrcReg, getKillRegState(isKill) | getUndefRegState(isUndef));
+    } else {             // COPY -> LOAD
+      unsigned DstReg = MI->getOperand(0).getReg();
+      bool isDead = MI->getOperand(0).isDead();
+      bool isUndef = MI->getOperand(0).isUndef();
+      NewMI = BuildMI(MF, MI->getDebugLoc(),
+                      get(isFloat ? SP::LDFri : SP::LDDFri))
+        .addReg(DstReg, RegState::Define |
+                getDeadRegState(isDead) | getUndefRegState(isUndef))
+        .addFrameIndex(FI)
+        .addImm(0);
+    }
+    break;
+  }
+
+  // Unhandled opcodes fall through with NewMI == NULL (no fold).
+  return NewMI;
+}
+
+/// getGlobalBaseReg - Return the virtual register holding the PIC base,
+/// creating it (and the GETPCX pseudo that defines it, at the top of the
+/// entry block) on first use.  The result is cached in the function info.
+unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const
+{
+  SparcMachineFunctionInfo *SparcFI = MF->getInfo<SparcMachineFunctionInfo>();
+  unsigned GlobalBaseReg = SparcFI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+
+  GlobalBaseReg = RegInfo.createVirtualRegister(&SP::IntRegsRegClass);
+
+
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+
+  BuildMI(FirstMBB, MBBI, dl, get(SP::GETPCX), GlobalBaseReg);
+  SparcFI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
new file mode 100644
index 0000000..345674b
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -0,0 +1,106 @@
+//===- SparcInstrInfo.h - Sparc Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCINSTRUCTIONINFO_H
+#define SPARCINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "SparcRegisterInfo.h"
+
+namespace llvm {
+
+/// SPII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SPII {
+  // Bit positions of the target-specific instruction flags; set per
+  // instruction by TableGen.
+  enum {
+    Pseudo = (1<<0),
+    Load = (1<<1),
+    Store = (1<<2),
+    DelaySlot = (1<<3)
+  };
+}
+
+/// SparcInstrInfo - Sparc implementation of the TargetInstrInfo interface;
+/// see SparcInstrInfo.cpp for the method definitions.
+class SparcInstrInfo : public TargetInstrInfoImpl {
+  const SparcRegisterInfo RI;
+  const SparcSubtarget& Subtarget;
+public:
+  explicit SparcInstrInfo(SparcSubtarget &ST);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const SparcRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register to register move and return
+  /// the source and dest operands and their sub-register indices by reference.
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg,
+                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+  
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  
+  
+  /// InsertBranch - Insert a branch sequence at the end of MBB; currently
+  /// only unconditional branches are supported (see the .cpp).
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond) const;
+
+  virtual bool copyRegToReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator I,
+                            unsigned DestReg, unsigned SrcReg,
+                            const TargetRegisterClass *DestRC,
+                            const TargetRegisterClass *SrcRC) const;
+  
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MBBI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC) const;
+  
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              int FrameIndex) const;
+
+  // Folding a load instruction into another is not supported; always
+  // returns null.
+  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                              MachineInstr* MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                              MachineInstr* LoadMI) const {
+    return 0;
+  }
+
+  /// getGlobalBaseReg - Return (creating on first use) the virtual register
+  /// holding the PIC base for this function.
+  unsigned getGlobalBaseReg(MachineFunction *MF) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td
new file mode 100644
index 0000000..d88d508
--- /dev/null
+++ b/lib/Target/Sparc/SparcInstrInfo.td
@@ -0,0 +1,784 @@
+//===- SparcInstrInfo.td - Target Description for Sparc Target ------------===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Sparc instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+
+include "SparcInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Feature predicates.
+//===----------------------------------------------------------------------===//
+
+// Subtarget feature predicates; each wraps a SparcSubtarget query and is
+// used below to gate instruction availability.
+//
+// HasV9 - This predicate is true when the target processor supports V9
+// instructions.  Note that the machine may be running in 32-bit mode.
+def HasV9   : Predicate<"Subtarget.isV9()">;
+
+// HasNoV9 - This predicate is true when the target doesn't have V9
+// instructions.  Use of this is just a hack for the isel not having proper
+// costs for V8 instructions that are more expensive than their V9 ones.
+def HasNoV9 : Predicate<"!Subtarget.isV9()">;
+
+// HasVIS - This is true when the target processor has VIS extensions.
+def HasVIS : Predicate<"Subtarget.isVIS()">;
+
+// UseDeprecatedInsts - This predicate is true when the target processor is a
+// V8, or when it is V9 but the V8 deprecated instructions are efficient enough
+// to use when appropriate.  In either of these cases, the instruction selector
+// will pick deprecated instructions.
+def UseDeprecatedInsts : Predicate<"Subtarget.useDeprecatedV8Instructions()">;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def simm11  : PatLeaf<(imm), [{
+  // simm11 predicate - True if the imm fits in a 11-bit sign extended field.
+  return (((int)N->getZExtValue() << (32-11)) >> (32-11)) ==
+         (int)N->getZExtValue();
+}]>;
+
+def simm13  : PatLeaf<(imm), [{
+  // simm13 predicate - True if the imm fits in a 13-bit sign extended field.
+  return (((int)N->getZExtValue() << (32-13)) >> (32-13)) ==
+         (int)N->getZExtValue();
+}]>;
+
+def LO10 : SDNodeXForm<imm, [{
+  // Transformation function: extract the low 10 bits of the immediate.
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() & 1023,
+                                   MVT::i32);
+}]>;
+
+def HI22 : SDNodeXForm<imm, [{
+  // Transformation function: shift the immediate value down into the low bits.
+  return CurDAG->getTargetConstant((unsigned)N->getZExtValue() >> 10, MVT::i32);
+}]>;
+
+def SETHIimm : PatLeaf<(imm), [{
+  // True if the low 10 bits are clear, i.e. the value is exactly
+  // representable by a single SETHI; transformed with HI22.
+  return (((unsigned)N->getZExtValue() >> 10) << 10) ==
+         (unsigned)N->getZExtValue();
+}], HI22>;
+
+// Addressing modes: reg+reg and reg+simm13, matched by the SelectADDRrr /
+// SelectADDRri routines in the instruction selector.
+def ADDRrr : ComplexPattern<i32, 2, "SelectADDRrr", [], []>;
+def ADDRri : ComplexPattern<i32, 2, "SelectADDRri", [frameindex], []>;
+
+// Address operands
+def MEMrr : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops IntRegs, IntRegs);
+}
+def MEMri : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops IntRegs, i32imm);
+}
+
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+def calltarget : Operand<i32>;
+
+// Operand for printing out a condition code.
+let PrintMethod = "printCCOperand" in
+  def CCOp : Operand<i32>;
+
+// Type profiles for the SPISD target-specific DAG nodes (see
+// SparcISelLowering.h for the node semantics).
+def SDTSPcmpfcc : 
+SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisSameAs<0, 1>]>;
+def SDTSPbrcc : 
+SDTypeProfile<0, 2, [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>]>;
+def SDTSPselectcc :
+SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisVT<3, i32>]>;
+def SDTSPFTOI :
+SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisFP<1>]>;
+def SDTSPITOF :
+SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVT<1, f32>]>;
+
+// Compares produce a flag; branches and selects consume it.
+def SPcmpicc : SDNode<"SPISD::CMPICC", SDTIntBinOp, [SDNPOutFlag]>;
+def SPcmpfcc : SDNode<"SPISD::CMPFCC", SDTSPcmpfcc, [SDNPOutFlag]>;
+def SPbricc : SDNode<"SPISD::BRICC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>;
+def SPbrfcc : SDNode<"SPISD::BRFCC", SDTSPbrcc, [SDNPHasChain, SDNPInFlag]>;
+
+def SPhi    : SDNode<"SPISD::Hi", SDTIntUnaryOp>;
+def SPlo    : SDNode<"SPISD::Lo", SDTIntUnaryOp>;
+
+def SPftoi  : SDNode<"SPISD::FTOI", SDTSPFTOI>;
+def SPitof  : SDNode<"SPISD::ITOF", SDTSPITOF>;
+
+def SPselecticc : SDNode<"SPISD::SELECT_ICC", SDTSPselectcc, [SDNPInFlag]>;
+def SPselectfcc : SDNode<"SPISD::SELECT_FCC", SDTSPselectcc, [SDNPInFlag]>;
+
+//  These are target-independent nodes, but have target-specific formats.
+def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_SPCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_SPCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def SDT_SPCall    : SDTypeProfile<0, 1, [SDTCisVT<0, i32>]>;
+def call          : SDNode<"SPISD::CALL", SDT_SPCall,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+def retflag       : SDNode<"SPISD::RET_FLAG", SDTNone,
+                           [SDNPHasChain, SDNPOptInFlag]>;
+
+// Operand printed as the PC-materialization sequence (see printGetPCX).
+def getPCX        : Operand<i32> {
+  let PrintMethod = "printGetPCX";
+}  
+
+//===----------------------------------------------------------------------===//
+// SPARC Flag Conditions
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the CCOp::CondCode enum
+// values.
+class ICC_VAL<int N> : PatLeaf<(i32 N)>;
+// Integer condition codes; the value is the V8 Bicc 'cond' field encoding.
+def ICC_NE  : ICC_VAL< 9>;  // Not Equal
+def ICC_E   : ICC_VAL< 1>;  // Equal
+def ICC_G   : ICC_VAL<10>;  // Greater
+def ICC_LE  : ICC_VAL< 2>;  // Less or Equal
+def ICC_GE  : ICC_VAL<11>;  // Greater or Equal
+def ICC_L   : ICC_VAL< 3>;  // Less
+def ICC_GU  : ICC_VAL<12>;  // Greater Unsigned
+def ICC_LEU : ICC_VAL< 4>;  // Less or Equal Unsigned
+def ICC_CC  : ICC_VAL<13>;  // Carry Clear/Great or Equal Unsigned
+def ICC_CS  : ICC_VAL< 5>;  // Carry Set/Less Unsigned
+def ICC_POS : ICC_VAL<14>;  // Positive
+def ICC_NEG : ICC_VAL< 6>;  // Negative
+def ICC_VC  : ICC_VAL<15>;  // Overflow Clear
+def ICC_VS  : ICC_VAL< 7>;  // Overflow Set
+
+class FCC_VAL<int N> : PatLeaf<(i32 N)>;
+// Floating-point condition codes.  The value is the V8 FBfcc 'cond' field
+// encoding biased by 16, so icc and fcc conditions share one numbering; it
+// must be kept in sync with the CCOp::CondCode enum values.
+//
+// BUGFIX: the original table had FCC_E and FCC_GE both equal to 25 and
+// FCC_UE..FCC_O running 24..29 instead of (10..15)+16 = 26..31, so those
+// conditions would match the wrong PatLeaf.  Corrected per the SparcV8
+// manual's FBfcc encodings (u=7, g=6, ug=5, l=4, ul=3, lg=2, ne=1, e=9,
+// ue=10, ge=11, uge=12, le=13, ule=14, o=15).
+def FCC_U   : FCC_VAL<23>;  // Unordered
+def FCC_G   : FCC_VAL<22>;  // Greater
+def FCC_UG  : FCC_VAL<21>;  // Unordered or Greater
+def FCC_L   : FCC_VAL<20>;  // Less
+def FCC_UL  : FCC_VAL<19>;  // Unordered or Less
+def FCC_LG  : FCC_VAL<18>;  // Less or Greater
+def FCC_NE  : FCC_VAL<17>;  // Not Equal
+def FCC_E   : FCC_VAL<25>;  // Equal
+def FCC_UE  : FCC_VAL<26>;  // Unordered or Equal
+def FCC_GE  : FCC_VAL<27>;  // Greater or Equal
+def FCC_UGE : FCC_VAL<28>;  // Unordered or Greater or Equal
+def FCC_LE  : FCC_VAL<29>;  // Less or Equal
+def FCC_ULE : FCC_VAL<30>;  // Unordered or Less or Equal
+def FCC_O   : FCC_VAL<31>;  // Ordered
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+/// F3_12 multiclass - Define a normal F3_1/F3_2 pattern in one shot.
+/// "rr" is the register-register form (F3_1) and "ri" the
+/// register-immediate form (F3_2, simm13-restricted).
+multiclass F3_12<string OpcStr, bits<6> Op3Val, SDNode OpNode> {
+  def rr  : F3_1<2, Op3Val, 
+                 (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"),
+                 [(set IntRegs:$dst, (OpNode IntRegs:$b, IntRegs:$c))]>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"),
+                 [(set IntRegs:$dst, (OpNode IntRegs:$b, simm13:$c))]>;
+}
+
+/// F3_12np multiclass - Define a normal F3_1/F3_2 pattern in one shot, with no
+/// pattern.
+multiclass F3_12np<string OpcStr, bits<6> Op3Val> {
+  def rr  : F3_1<2, Op3Val, 
+                 (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"), []>;
+  def ri  : F3_2<2, Op3Val,
+                 (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $b, $c, $dst"), []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instructions.  These carry no encoding fields of their own; they are
+// expanded or custom-inserted before final emission.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSP<outs, ins, asmstr, pattern>;
+
+// GETPCX for PIC: emits the call/add sequence that materializes the current
+// PC.  The whole sequence is printed through the custom $getpcseq operand.
+// O7 is both used and clobbered because the sequence goes through the link
+// register.
+let Defs = [O7], Uses = [O7] in {
+  def GETPCX : Pseudo<(outs getPCX:$getpcseq), (ins), "$getpcseq", [] >;
+}
+
+// Call-sequence markers.  They use/def O6 (%sp) since adjusting the call
+// frame modifies the stack pointer.  Note ADJCALLSTACKUP's asm string prints
+// only $amt1; $amt2 (the callee-pop amount) exists only for the
+// callseq_end pattern.
+let Defs = [O6], Uses = [O6] in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                               "!ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "!ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+// FpMOVD/FpNEGD/FpABSD - These are lowered to single-precision ops by the 
+// fpmover pass.  They model double-precision move/neg/abs on V8, where the
+// single-precision FMOVS/FNEGS/FABSS must be applied to each register half.
+let Predicates = [HasNoV9] in {  // Only emit these in V8 mode.
+  def FpMOVD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+                      "!FpMOVD $src, $dst", []>;
+  def FpNEGD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+                      "!FpNEGD $src, $dst",
+                      [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>;
+  def FpABSD : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$src),
+                      "!FpABSD $src, $dst",
+                      [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>;
+}
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.  This has to handle all
+// permutations of selection between i32/f32/f64 on ICC and FCC.
+// Each pseudo takes the true value ($T), the false value ($F), and the
+// condition-code immediate ($Cond) matched by the ICC_*/FCC_* leafs above.
+let usesCustomInserter = 1 in {   // Expanded after instruction selection.
+  def SELECT_CC_Int_ICC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_ICC PSEUDO!",
+            [(set IntRegs:$dst, (SPselecticc IntRegs:$T, IntRegs:$F,
+                                             imm:$Cond))]>;
+  def SELECT_CC_Int_FCC
+   : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_Int_FCC PSEUDO!",
+            [(set IntRegs:$dst, (SPselectfcc IntRegs:$T, IntRegs:$F,
+                                             imm:$Cond))]>;
+  def SELECT_CC_FP_ICC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_ICC PSEUDO!",
+            [(set FPRegs:$dst, (SPselecticc FPRegs:$T, FPRegs:$F,
+                                            imm:$Cond))]>;
+  def SELECT_CC_FP_FCC
+   : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_FP_FCC PSEUDO!",
+            [(set FPRegs:$dst, (SPselectfcc FPRegs:$T, FPRegs:$F,
+                                            imm:$Cond))]>;
+  def SELECT_CC_DFP_ICC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_ICC PSEUDO!",
+            [(set DFPRegs:$dst, (SPselecticc DFPRegs:$T, DFPRegs:$F,
+                                             imm:$Cond))]>;
+  def SELECT_CC_DFP_FCC
+   : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, i32imm:$Cond),
+            "; SELECT_CC_DFP_FCC PSEUDO!",
+            [(set DFPRegs:$dst, (SPselectfcc DFPRegs:$T, DFPRegs:$F,
+                                             imm:$Cond))]>;
+}
+
+
+// Section A.3 - Synthetic Instructions, p. 85
+// special cases of JMPL:
+// retl is encoded as "jmpl %o7+8, %g0" (skip the call and its delay slot).
+let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1 in {
+  let rd = O7.Num, rs1 = G0.Num, simm13 = 8 in
+    def RETL: F3_2<2, 0b111000, (outs), (ins), "retl", [(retflag)]>;
+}
+
+// Section B.1 - Load Integer Instructions, p. 90
+// "rr" forms address with reg+reg (MEMrr/ADDRrr); "ri" forms with reg+simm13
+// (MEMri/ADDRri).  Sign/zero extension is selected by the sext/zext loads.
+def LDSBrr : F3_1<3, 0b001001,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ldsb [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi8 ADDRrr:$addr))]>;
+def LDSBri : F3_2<3, 0b001001,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ldsb [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi8 ADDRri:$addr))]>;
+def LDSHrr : F3_1<3, 0b001010,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ldsh [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi16 ADDRrr:$addr))]>;
+def LDSHri : F3_2<3, 0b001010,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ldsh [$addr], $dst",
+                  [(set IntRegs:$dst, (sextloadi16 ADDRri:$addr))]>;
+def LDUBrr : F3_1<3, 0b000001,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ldub [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi8 ADDRrr:$addr))]>;
+def LDUBri : F3_2<3, 0b000001,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ldub [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi8 ADDRri:$addr))]>;
+def LDUHrr : F3_1<3, 0b000010,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "lduh [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi16 ADDRrr:$addr))]>;
+def LDUHri : F3_2<3, 0b000010,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "lduh [$addr], $dst",
+                  [(set IntRegs:$dst, (zextloadi16 ADDRri:$addr))]>;
+def LDrr   : F3_1<3, 0b000000,
+                  (outs IntRegs:$dst), (ins MEMrr:$addr),
+                  "ld [$addr], $dst",
+                  [(set IntRegs:$dst, (load ADDRrr:$addr))]>;
+def LDri   : F3_2<3, 0b000000,
+                  (outs IntRegs:$dst), (ins MEMri:$addr),
+                  "ld [$addr], $dst",
+                  [(set IntRegs:$dst, (load ADDRri:$addr))]>;
+
+// Section B.2 - Load Floating-point Instructions, p. 92
+def LDFrr  : F3_1<3, 0b100000,
+                  (outs FPRegs:$dst), (ins MEMrr:$addr),
+                  "ld [$addr], $dst",
+                  [(set FPRegs:$dst, (load ADDRrr:$addr))]>;
+def LDFri  : F3_2<3, 0b100000,
+                  (outs FPRegs:$dst), (ins MEMri:$addr),
+                  "ld [$addr], $dst",
+                  [(set FPRegs:$dst, (load ADDRri:$addr))]>;
+def LDDFrr : F3_1<3, 0b100011,
+                  (outs DFPRegs:$dst), (ins MEMrr:$addr),
+                  "ldd [$addr], $dst",
+                  [(set DFPRegs:$dst, (load ADDRrr:$addr))]>;
+def LDDFri : F3_2<3, 0b100011,
+                  (outs DFPRegs:$dst), (ins MEMri:$addr),
+                  "ldd [$addr], $dst",
+                  [(set DFPRegs:$dst, (load ADDRri:$addr))]>;
+
+// Section B.4 - Store Integer Instructions, p. 95
+// Same rr/ri addressing split as the loads above; truncstore handles the
+// sub-word widths.
+def STBrr : F3_1<3, 0b000101,
+                 (outs), (ins MEMrr:$addr, IntRegs:$src),
+                 "stb $src, [$addr]",
+                 [(truncstorei8 IntRegs:$src, ADDRrr:$addr)]>;
+def STBri : F3_2<3, 0b000101,
+                 (outs), (ins MEMri:$addr, IntRegs:$src),
+                 "stb $src, [$addr]",
+                 [(truncstorei8 IntRegs:$src, ADDRri:$addr)]>;
+def STHrr : F3_1<3, 0b000110,
+                 (outs), (ins MEMrr:$addr, IntRegs:$src),
+                 "sth $src, [$addr]",
+                 [(truncstorei16 IntRegs:$src, ADDRrr:$addr)]>;
+def STHri : F3_2<3, 0b000110,
+                 (outs), (ins MEMri:$addr, IntRegs:$src),
+                 "sth $src, [$addr]",
+                 [(truncstorei16 IntRegs:$src, ADDRri:$addr)]>;
+def STrr  : F3_1<3, 0b000100,
+                 (outs), (ins MEMrr:$addr, IntRegs:$src),
+                 "st $src, [$addr]",
+                 [(store IntRegs:$src, ADDRrr:$addr)]>;
+def STri  : F3_2<3, 0b000100,
+                 (outs), (ins MEMri:$addr, IntRegs:$src),
+                 "st $src, [$addr]",
+                 [(store IntRegs:$src, ADDRri:$addr)]>;
+
+// Section B.5 - Store Floating-point Instructions, p. 97
+def STFrr   : F3_1<3, 0b100100,
+                   (outs), (ins MEMrr:$addr, FPRegs:$src),
+                   "st $src, [$addr]",
+                   [(store FPRegs:$src, ADDRrr:$addr)]>;
+def STFri   : F3_2<3, 0b100100,
+                   (outs), (ins MEMri:$addr, FPRegs:$src),
+                   "st $src, [$addr]",
+                   [(store FPRegs:$src, ADDRri:$addr)]>;
+def STDFrr  : F3_1<3, 0b100111,
+                   (outs), (ins MEMrr:$addr, DFPRegs:$src),
+                   "std  $src, [$addr]",
+                   [(store DFPRegs:$src, ADDRrr:$addr)]>;
+def STDFri  : F3_2<3, 0b100111,
+                   (outs), (ins MEMri:$addr, DFPRegs:$src),
+                   "std $src, [$addr]",
+                   [(store DFPRegs:$src, ADDRri:$addr)]>;
+
+// Section B.9 - SETHI Instruction, p. 104
+// Loads a 22-bit immediate into the high 22 bits of $dst, zeroing the rest.
+def SETHIi: F2_1<0b100,
+                 (outs IntRegs:$dst), (ins i32imm:$src),
+                 "sethi $src, $dst",
+                 [(set IntRegs:$dst, SETHIimm:$src)]>;
+
+// Section B.10 - NOP Instruction, p. 105
+// (It's a special case of SETHI: sethi 0, %g0)
+let rd = 0, imm22 = 0 in
+  def NOP : F2_1<0b100, (outs), (ins), "nop", []>;
+
+// Section B.11 - Logical Instructions, p. 106
+defm AND    : F3_12<"and", 0b000001, and>;
+
+// The ri forms of ANDN/ORN/XNOR carry no selection pattern.
+// NOTE(review): presumably because with a constant operand the complement can
+// be folded into the immediate of the plain and/or/xor instead -- confirm.
+def ANDNrr  : F3_1<2, 0b000101,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   "andn $b, $c, $dst",
+                   [(set IntRegs:$dst, (and IntRegs:$b, (not IntRegs:$c)))]>;
+def ANDNri  : F3_2<2, 0b000101,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                   "andn $b, $c, $dst", []>;
+
+defm OR     : F3_12<"or", 0b000010, or>;
+
+def ORNrr   : F3_1<2, 0b000110,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   "orn $b, $c, $dst",
+                   [(set IntRegs:$dst, (or IntRegs:$b, (not IntRegs:$c)))]>;
+def ORNri   : F3_2<2, 0b000110,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                   "orn $b, $c, $dst", []>;
+defm XOR    : F3_12<"xor", 0b000011, xor>;
+
+def XNORrr  : F3_1<2, 0b000111,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                   "xnor $b, $c, $dst",
+                   [(set IntRegs:$dst, (not (xor IntRegs:$b, IntRegs:$c)))]>;
+def XNORri  : F3_2<2, 0b000111,
+                   (outs IntRegs:$dst), (ins IntRegs:$b, i32imm:$c),
+                   "xnor $b, $c, $dst", []>;
+
+// Section B.12 - Shift Instructions, p. 107
+defm SLL : F3_12<"sll", 0b100101, shl>;
+defm SRL : F3_12<"srl", 0b100110, srl>;
+defm SRA : F3_12<"sra", 0b100111, sra>;
+
+// Section B.13 - Add Instructions, p. 108
+defm ADD   : F3_12<"add", 0b000000, add>;
+
+// "LEA" forms of add (patterns to make tblgen happy)
+// Matches a bare reg+simm13 address computation as an add.
+def LEA_ADDri   : F3_2<2, 0b000000,
+                   (outs IntRegs:$dst), (ins MEMri:$addr),
+                   "add ${addr:arith}, $dst",
+                   [(set IntRegs:$dst, ADDRri:$addr)]>;
+
+// addcc/addx model the carry chain: addcc produces the carry (addc),
+// addx consumes it (adde).
+let Defs = [ICC] in                   
+  defm ADDCC  : F3_12<"addcc", 0b010000, addc>;
+
+defm ADDX  : F3_12<"addx", 0b001000, adde>;
+
+// Section B.15 - Subtract Instructions, p. 110
+defm SUB    : F3_12  <"sub"  , 0b000100, sub>;
+defm SUBX   : F3_12  <"subx" , 0b001100, sube>;
+
+// subcc doubles as the integer compare (SPcmpicc) since it sets ICC.
+let Defs = [ICC] in {
+  defm SUBCC  : F3_12  <"subcc", 0b010100, SPcmpicc>;
+
+  def SUBXCCrr: F3_1<2, 0b011100, 
+                (outs IntRegs:$dst), (ins IntRegs:$b, IntRegs:$c),
+                "subxcc $b, $c, $dst", []>;
+}
+
+// Section B.18 - Multiply Instructions, p. 113
+defm UMUL : F3_12np<"umul", 0b001010>;
+defm SMUL : F3_12  <"smul", 0b001011, mul>;
+
+
+// Section B.19 - Divide Instructions, p. 115
+defm UDIV : F3_12np<"udiv", 0b001110>;
+defm SDIV : F3_12np<"sdiv", 0b001111>;
+
+// Section B.20 - SAVE and RESTORE, p. 117
+defm SAVE    : F3_12np<"save"   , 0b111100>;
+defm RESTORE : F3_12np<"restore", 0b111101>;
+
+// Section B.21 - Branch on Integer Condition Codes Instructions, p. 119
+
+// conditional branch class: common flags for all Bicc-format branches.
+class BranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b010, (outs), ins, asmstr, pattern> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+}
+
+// ba - unconditional branch (cond field 0b1000 = "always").
+let isBarrier = 1 in
+  def BA   : BranchSP<0b1000, (ins brtarget:$dst),
+                      "ba $dst",
+                      [(br bb:$dst)]>;
+
+// FIXME: the encoding for the JIT should look at the condition field.
+// The condition is carried as the $cc operand rather than in the cond bits
+// (passed as 0 here), hence the FIXME above.
+let Uses = [ICC] in
+  def BCOND : BranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+                         "b$cc $dst",
+                        [(SPbricc bb:$dst, imm:$cc)]>;
+
+
+// Section B.22 - Branch on Floating-point Condition Codes Instructions, p. 121
+
+// floating-point conditional branch class (FBfcc format, op2 = 0b110).
+class FPBranchSP<bits<4> cc, dag ins, string asmstr, list<dag> pattern>
+ : F2_2<cc, 0b110, (outs), ins, asmstr, pattern> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let hasDelaySlot = 1;
+}
+
+// FIXME: the encoding for the JIT should look at the condition field.
+// As with BCOND, the condition is the $cc operand; cond bits are 0.
+let Uses = [FCC] in
+  def FBCOND  : FPBranchSP<0, (ins brtarget:$dst, CCOp:$cc),
+                              "fb$cc $dst",
+                              [(SPbrfcc bb:$dst, imm:$cc)]>;
+
+
+// Section B.24 - Call and Link Instruction, p. 125
+// This is the only Format 1 instruction
+// CALL has no selection pattern of its own; direct calls are matched by the
+// Pat<(call tglobaladdr/texternalsym)> entries at the bottom of this file.
+// The Defs list clobbers the caller-saved integer and FP registers.
+let Uses = [O0, O1, O2, O3, O4, O5],
+    hasDelaySlot = 1, isCall = 1,
+    Defs = [O0, O1, O2, O3, O4, O5, O7, G1, G2, G3, G4, G5, G6, G7,
+    D0, D1, D2, D3, D4, D5, D6, D7, D8, D9, D10, D11, D12, D13, D14, D15] in { 
+  def CALL : InstSP<(outs), (ins calltarget:$dst),
+                    "call $dst", []> {
+    bits<30> disp;
+    let op = 1;
+    let Inst{29-0} = disp;
+  }
+  
+  // indirect calls: special cases of JMPL, printed with the "call" synthetic.
+  def JMPLrr : F3_1<2, 0b111000,
+                    (outs), (ins MEMrr:$ptr),
+                    "call $ptr",
+                    [(call  ADDRrr:$ptr)]>;
+  def JMPLri : F3_2<2, 0b111000,
+                    (outs), (ins MEMri:$ptr),
+                    "call $ptr",
+                    [(call  ADDRri:$ptr)]>;
+}
+
+// Section B.28 - Read State Register Instructions
+// Reads the %y register (used by the multiply/divide instructions above).
+def RDY : F3_1<2, 0b101000,
+               (outs IntRegs:$dst), (ins),
+               "rd %y, $dst", []>;
+
+// Section B.29 - Write State Register Instructions
+// Writes $b xor $c into %y (wr semantics); no selection pattern.
+def WRYrr : F3_1<2, 0b110000,
+                 (outs), (ins IntRegs:$b, IntRegs:$c),
+                 "wr $b, $c, %y", []>;
+def WRYri : F3_2<2, 0b110000,
+                 (outs), (ins IntRegs:$b, i32imm:$c),
+                 "wr $b, $c, %y", []>;
+
+// Convert Integer to Floating-point Instructions, p. 141
+// Note: the i32 source operand is supplied in an FP register (ins is
+// FPRegs:$src) -- the integer bit pattern must already live there.
+def FITOS : F3_3<2, 0b110100, 0b011000100,
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fitos $src, $dst",
+                 [(set FPRegs:$dst, (SPitof FPRegs:$src))]>;
+def FITOD : F3_3<2, 0b110100, 0b011001000, 
+                 (outs DFPRegs:$dst), (ins FPRegs:$src),
+                 "fitod $src, $dst",
+                 [(set DFPRegs:$dst, (SPitof FPRegs:$src))]>;
+
+// Convert Floating-point to Integer Instructions, p. 142
+// The i32 result likewise lands in an FP register.
+def FSTOI : F3_3<2, 0b110100, 0b011010001,
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fstoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi FPRegs:$src))]>;
+def FDTOI : F3_3<2, 0b110100, 0b011010010,
+                 (outs FPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtoi $src, $dst",
+                 [(set FPRegs:$dst, (SPftoi DFPRegs:$src))]>;
+
+// Convert between Floating-point Formats Instructions, p. 143
+def FSTOD : F3_3<2, 0b110100, 0b011001001, 
+                 (outs DFPRegs:$dst), (ins FPRegs:$src),
+                 "fstod $src, $dst",
+                 [(set DFPRegs:$dst, (fextend FPRegs:$src))]>;
+def FDTOS : F3_3<2, 0b110100, 0b011000110,
+                 (outs FPRegs:$dst), (ins DFPRegs:$src),
+                 "fdtos $src, $dst",
+                 [(set FPRegs:$dst, (fround DFPRegs:$src))]>;
+
+// Floating-point Move Instructions, p. 144
+// Single-precision only on V8; doubles use the FpMOVD/FpNEGD/FpABSD pseudos
+// (or the V9 FMOVD/FNEGD/FABSD further below).
+def FMOVS : F3_3<2, 0b110100, 0b000000001,
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fmovs $src, $dst", []>;
+def FNEGS : F3_3<2, 0b110100, 0b000000101, 
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fnegs $src, $dst",
+                 [(set FPRegs:$dst, (fneg FPRegs:$src))]>;
+def FABSS : F3_3<2, 0b110100, 0b000001001, 
+                 (outs FPRegs:$dst), (ins FPRegs:$src),
+                 "fabss $src, $dst",
+                 [(set FPRegs:$dst, (fabs FPRegs:$src))]>;
+
+
+// Floating-point Square Root Instructions, p.145
+def FSQRTS : F3_3<2, 0b110100, 0b000101001, 
+                  (outs FPRegs:$dst), (ins FPRegs:$src),
+                  "fsqrts $src, $dst",
+                  [(set FPRegs:$dst, (fsqrt FPRegs:$src))]>;
+def FSQRTD : F3_3<2, 0b110100, 0b000101010, 
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                  "fsqrtd $src, $dst",
+                  [(set DFPRegs:$dst, (fsqrt DFPRegs:$src))]>;
+
+
+
+// Floating-point Add and Subtract Instructions, p. 146
+def FADDS  : F3_3<2, 0b110100, 0b001000001,
+                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fadds $src1, $src2, $dst",
+                  [(set FPRegs:$dst, (fadd FPRegs:$src1, FPRegs:$src2))]>;
+def FADDD  : F3_3<2, 0b110100, 0b001000010,
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "faddd $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fadd DFPRegs:$src1, DFPRegs:$src2))]>;
+def FSUBS  : F3_3<2, 0b110100, 0b001000101,
+                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fsubs $src1, $src2, $dst",
+                  [(set FPRegs:$dst, (fsub FPRegs:$src1, FPRegs:$src2))]>;
+def FSUBD  : F3_3<2, 0b110100, 0b001000110,
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "fsubd $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fsub DFPRegs:$src1, DFPRegs:$src2))]>;
+
+// Floating-point Multiply and Divide Instructions, p. 147
+// FSMULD multiplies two singles producing a double, so its pattern matches
+// an fmul of two fextend'ed operands.
+def FMULS  : F3_3<2, 0b110100, 0b001001001,
+                  (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fmuls $src1, $src2, $dst",
+                  [(set FPRegs:$dst, (fmul FPRegs:$src1, FPRegs:$src2))]>;
+def FMULD  : F3_3<2, 0b110100, 0b001001010,
+                  (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                  "fmuld $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fmul DFPRegs:$src1, DFPRegs:$src2))]>;
+def FSMULD : F3_3<2, 0b110100, 0b001101001,
+                  (outs DFPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                  "fsmuld $src1, $src2, $dst",
+                  [(set DFPRegs:$dst, (fmul (fextend FPRegs:$src1),
+                                            (fextend FPRegs:$src2)))]>;
+def FDIVS  : F3_3<2, 0b110100, 0b001001101,
+                 (outs FPRegs:$dst), (ins FPRegs:$src1, FPRegs:$src2),
+                 "fdivs $src1, $src2, $dst",
+                 [(set FPRegs:$dst, (fdiv FPRegs:$src1, FPRegs:$src2))]>;
+def FDIVD  : F3_3<2, 0b110100, 0b001001110,
+                 (outs DFPRegs:$dst), (ins DFPRegs:$src1, DFPRegs:$src2),
+                 "fdivd $src1, $src2, $dst",
+                 [(set DFPRegs:$dst, (fdiv DFPRegs:$src1, DFPRegs:$src2))]>;
+
+// Floating-point Compare Instructions, p. 148
+// Note: the 2nd template arg is different for these guys.
+// Note 2: the result of a FCMP is not available until the 2nd cycle
+// after the instr is retired, but there is no interlock. This behavior
+// is modelled with a forced noop after the instruction (the "\n\tnop"
+// baked into the asm string below).
+let Defs = [FCC] in {
+  def FCMPS  : F3_3<2, 0b110101, 0b001010001,
+                   (outs), (ins FPRegs:$src1, FPRegs:$src2),
+                   "fcmps $src1, $src2\n\tnop",
+                   [(SPcmpfcc FPRegs:$src1, FPRegs:$src2)]>;
+  def FCMPD  : F3_3<2, 0b110101, 0b001010010,
+                   (outs), (ins DFPRegs:$src1, DFPRegs:$src2),
+                   "fcmpd $src1, $src2\n\tnop",
+                   [(SPcmpfcc DFPRegs:$src1, DFPRegs:$src2)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// V9 Instructions
+//===----------------------------------------------------------------------===//
+
+// V9 Conditional Moves.
+let Predicates = [HasV9], isTwoAddress = 1 in {
+  // Move Integer Register on Condition (MOVcc) p. 194 of the V9 manual.
+  // FIXME: Add instruction encodings for the JIT some day.
+  // Operand order: isTwoAddress ties $dst to $T, and the instruction
+  // conditionally overwrites it with $F -- hence the select patterns list
+  // ($F, $T) rather than ($T, $F).
+  def MOVICCrr
+    : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
+             "mov$cc %icc, $F, $dst",
+             [(set IntRegs:$dst,
+                         (SPselecticc IntRegs:$F, IntRegs:$T, imm:$cc))]>;
+  def MOVICCri
+    : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
+             "mov$cc %icc, $F, $dst",
+             [(set IntRegs:$dst,
+                          (SPselecticc simm11:$F, IntRegs:$T, imm:$cc))]>;
+
+  def MOVFCCrr
+    : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, IntRegs:$F, CCOp:$cc),
+             "mov$cc %fcc0, $F, $dst",
+             [(set IntRegs:$dst,
+                         (SPselectfcc IntRegs:$F, IntRegs:$T, imm:$cc))]>;
+  def MOVFCCri
+    : Pseudo<(outs IntRegs:$dst), (ins IntRegs:$T, i32imm:$F, CCOp:$cc),
+             "mov$cc %fcc0, $F, $dst",
+             [(set IntRegs:$dst,
+                          (SPselectfcc simm11:$F, IntRegs:$T, imm:$cc))]>;
+
+  // Move Floating-Point Register on Condition (FMOVcc), same tie/ordering.
+  def FMOVS_ICC
+    : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
+             "fmovs$cc %icc, $F, $dst",
+             [(set FPRegs:$dst,
+                         (SPselecticc FPRegs:$F, FPRegs:$T, imm:$cc))]>;
+  def FMOVD_ICC
+    : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+             "fmovd$cc %icc, $F, $dst",
+             [(set DFPRegs:$dst,
+                         (SPselecticc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>;
+  def FMOVS_FCC
+    : Pseudo<(outs FPRegs:$dst), (ins FPRegs:$T, FPRegs:$F, CCOp:$cc),
+             "fmovs$cc %fcc0, $F, $dst",
+             [(set FPRegs:$dst,
+                         (SPselectfcc FPRegs:$F, FPRegs:$T, imm:$cc))]>;
+  def FMOVD_FCC
+    : Pseudo<(outs DFPRegs:$dst), (ins DFPRegs:$T, DFPRegs:$F, CCOp:$cc),
+             "fmovd$cc %fcc0, $F, $dst",
+             [(set DFPRegs:$dst,
+                         (SPselectfcc DFPRegs:$F, DFPRegs:$T, imm:$cc))]>;
+
+}
+
+// Floating-Point Move Instructions, p. 164 of the V9 manual.
+// Native double-precision move/neg/abs; on V8 these are handled by the
+// FpMOVD/FpNEGD/FpABSD pseudos defined earlier.
+let Predicates = [HasV9] in {
+  def FMOVD : F3_3<2, 0b110100, 0b000000010,
+                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                   "fmovd $src, $dst", []>;
+  def FNEGD : F3_3<2, 0b110100, 0b000000110, 
+                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                   "fnegd $src, $dst",
+                   [(set DFPRegs:$dst, (fneg DFPRegs:$src))]>;
+  def FABSD : F3_3<2, 0b110100, 0b000001010, 
+                   (outs DFPRegs:$dst), (ins DFPRegs:$src),
+                   "fabsd $src, $dst",
+                   [(set DFPRegs:$dst, (fabs DFPRegs:$src))]>;
+}
+
+// POPCrr - This does a ctpop of a 64-bit register.  As such, we have to clear
+// the top 32-bits before using it.  To do this clearing, we use a SLLri X,0.
+// The Pat below performs the zero-extension-by-shift and feeds popc.
+def POPCrr : F3_1<2, 0b101110, 
+                  (outs IntRegs:$dst), (ins IntRegs:$src),
+                  "popc $src, $dst", []>, Requires<[HasV9]>;
+def : Pat<(ctpop IntRegs:$src),
+          (POPCrr (SLLri IntRegs:$src, 0))>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Small immediates: materialized as "or %g0, imm, dst".
+def : Pat<(i32 simm13:$val),
+          (ORri G0, imm:$val)>;
+// Arbitrary immediates: classic sethi (high 22 bits) + or (low 10 bits) pair.
+def : Pat<(i32 imm:$val),
+          (ORri (SETHIi (HI22 imm:$val)), (LO10 imm:$val))>;
+
+// subc: subtract-producing-carry maps onto subcc (which sets ICC).
+def : Pat<(subc IntRegs:$b, IntRegs:$c),
+          (SUBCCrr IntRegs:$b, IntRegs:$c)>;
+def : Pat<(subc IntRegs:$b, simm13:$val),
+          (SUBCCri IntRegs:$b, imm:$val)>;
+
+// Global addresses, constant pool entries
+def : Pat<(SPhi tglobaladdr:$in), (SETHIi tglobaladdr:$in)>;
+def : Pat<(SPlo tglobaladdr:$in), (ORri G0, tglobaladdr:$in)>;
+def : Pat<(SPhi tconstpool:$in), (SETHIi tconstpool:$in)>;
+def : Pat<(SPlo tconstpool:$in), (ORri G0, tconstpool:$in)>;
+
+// Add reg, lo.  This is used when taking the addr of a global/constpool entry.
+def : Pat<(add IntRegs:$r, (SPlo tglobaladdr:$in)),
+          (ADDri IntRegs:$r, tglobaladdr:$in)>;
+def : Pat<(add IntRegs:$r, (SPlo tconstpool:$in)),
+          (ADDri IntRegs:$r, tconstpool:$in)>;
+
+// Calls: direct calls select to CALL (which itself has no pattern).
+def : Pat<(call tglobaladdr:$dst),
+          (CALL tglobaladdr:$dst)>;
+def : Pat<(call texternalsym:$dst),
+          (CALL texternalsym:$dst)>;
+
+// Map integer extload's to zextloads.
+def : Pat<(i32 (extloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi8 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi8 ADDRri:$src)), (LDUBri ADDRri:$src)>;
+def : Pat<(i32 (extloadi16 ADDRrr:$src)), (LDUHrr ADDRrr:$src)>;
+def : Pat<(i32 (extloadi16 ADDRri:$src)), (LDUHri ADDRri:$src)>;
+
+// zextload bool -> zextload byte
+def : Pat<(i32 (zextloadi1 ADDRrr:$src)), (LDUBrr ADDRrr:$src)>;
+def : Pat<(i32 (zextloadi1 ADDRri:$src)), (LDUBri ADDRri:$src)>;
diff --git a/lib/Target/Sparc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/SparcMCAsmInfo.cpp
new file mode 100644
index 0000000..53a9bde
--- /dev/null
+++ b/lib/Target/Sparc/SparcMCAsmInfo.cpp
@@ -0,0 +1,36 @@
+//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SparcMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCAsmInfo.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Target &T, const StringRef &TT) {
+  // Both parameters are currently unused; this constructor only configures
+  // the Sparc/ELF assembler-dialect properties on the MCAsmInfo base class.
+  Data16bitsDirective = "\t.half\t";
+  Data32bitsDirective = "\t.word\t";
+  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  ZeroDirective = "\t.skip\t";
+  CommentString = "!";  // Sparc assembly comments start with '!'.
+  HasLEB128 = true;
+  AbsoluteDebugSectionOffsets = true;
+  SupportsDebugInformation = true;
+  
+  SunStyleELFSectionSwitchSyntax = true;
+  UsesELFSectionDirectiveForBSS = true;
+
+  WeakRefDirective = "\t.weak\t";
+
+  PrivateGlobalPrefix = ".L";
+}
+
+
diff --git a/lib/Target/Sparc/SparcMCAsmInfo.h b/lib/Target/Sparc/SparcMCAsmInfo.h
new file mode 100644
index 0000000..12d6ef4
--- /dev/null
+++ b/lib/Target/Sparc/SparcMCAsmInfo.h
@@ -0,0 +1,28 @@
+//=====-- SparcMCAsmInfo.h - Sparc asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SparcMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETASMINFO_H
+#define SPARCTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+  // Sparc/ELF assembler-dialect properties; all configuration happens in the
+  // constructor (see SparcMCAsmInfo.cpp).  Neither parameter is stored.
+  struct SparcELFMCAsmInfo : public MCAsmInfo {
+    explicit SparcELFMCAsmInfo(const Target &T, const StringRef &TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcMachineFunctionInfo.h b/lib/Target/Sparc/SparcMachineFunctionInfo.h
new file mode 100644
index 0000000..e457235
--- /dev/null
+++ b/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -0,0 +1,32 @@
+//===- SparcMachineFunctionInfo.h - Sparc Machine Function Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares Sparc-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef SPARCMACHINEFUNCTIONINFO_H
+#define SPARCMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+  // Sparc-specific per-function state kept alongside a MachineFunction.
+  class SparcMachineFunctionInfo : public MachineFunctionInfo {
+  private:
+    // Virtual register holding the PIC global base address; 0 means it has
+    // not been assigned yet (callers check via getGlobalBaseReg()).
+    unsigned GlobalBaseReg;
+  public:
+    SparcMachineFunctionInfo() : GlobalBaseReg(0) {}
+    // explicit: a single-argument constructor must not act as an implicit
+    // conversion from MachineFunction.  The MF parameter is currently unused
+    // but required by the MachineFunctionInfo factory convention.
+    explicit SparcMachineFunctionInfo(MachineFunction &MF)
+      : GlobalBaseReg(0) {}
+
+    unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+    void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+  };
+}
+
+#endif
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
new file mode 100644
index 0000000..6f6183e
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -0,0 +1,197 @@
+//===- SparcRegisterInfo.cpp - SPARC Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SPARC implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "SparcRegisterInfo.h"
+#include "SparcSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+using namespace llvm;
+
+SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st,
+                                     const TargetInstrInfo &tii)
+  : SparcGenRegisterInfo(SP::ADJCALLSTACKDOWN, SP::ADJCALLSTACKUP),
+    Subtarget(st), TII(tii) {
+}
+
+const unsigned* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs[] = { 0 };
+  return CalleeSavedRegs;
+}
+
+BitVector SparcRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(SP::G2);
+  Reserved.set(SP::G3);
+  Reserved.set(SP::G4);
+  Reserved.set(SP::O6);
+  Reserved.set(SP::I6);
+  Reserved.set(SP::I7);
+  Reserved.set(SP::G0);
+  Reserved.set(SP::G5);
+  Reserved.set(SP::G6);
+  Reserved.set(SP::G7);
+  return Reserved;
+}
+
+
+const TargetRegisterClass* const*
+SparcRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = { 0 };
+  return CalleeSavedRegClasses;
+}
+
+bool SparcRegisterInfo::hasFP(const MachineFunction &MF) const {
+  return false;
+}
+
+void SparcRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  MachineInstr &MI = *I;
+  DebugLoc dl = MI.getDebugLoc();
+  int Size = MI.getOperand(0).getImm();
+  if (MI.getOpcode() == SP::ADJCALLSTACKDOWN)
+    Size = -Size;
+  if (Size)
+    BuildMI(MBB, I, dl, TII.get(SP::ADDri), SP::O6).addReg(SP::O6).addImm(Size);
+  MBB.erase(I);
+}
+
+unsigned
+SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, int *Value,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+
+  // Addressable stack objects are accessed using neg. offsets from %fp
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex) +
+               MI.getOperand(i+1).getImm();
+
+  // Replace frame index with a frame pointer reference.
+  if (Offset >= -4096 && Offset <= 4095) {
+    // If the offset is small enough to fit in the immediate field, directly
+    // encode it.
+    MI.getOperand(i).ChangeToRegister(SP::I6, false);
+    MI.getOperand(i+1).ChangeToImmediate(Offset);
+  } else {
+    // Otherwise, emit a G1 = SETHI %hi(offset).  FIXME: it would be better to 
+    // scavenge a register here instead of reserving G1 all of the time.
+    unsigned OffHi = (unsigned)Offset >> 10U;
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+    // Emit G1 = G1 + I6
+    BuildMI(*MI.getParent(), II, dl, TII.get(SP::ADDrr), SP::G1).addReg(SP::G1)
+      .addReg(SP::I6);
+    // Insert G1 + %lo(offset) into the user instruction.
+    MI.getOperand(i).ChangeToRegister(SP::G1, false);
+    MI.getOperand(i+1).ChangeToImmediate(Offset & ((1 << 10)-1));
+  }
+  return 0;
+}
+
+void SparcRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {}
+
+void SparcRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+  // Get the number of bytes to allocate from the FrameInfo
+  int NumBytes = (int) MFI->getStackSize();
+
+  // Emit the correct save instruction based on the number of bytes in
+  // the frame. Minimum stack frame size according to V8 ABI is:
+  //   16 words for register window spill
+  //    1 word for address of returned aggregate-value
+  // +  6 words for passing parameters on the stack
+  // ----------
+  //   23 words * 4 bytes per word = 92 bytes
+  NumBytes += 92;
+
+  // Round up to next doubleword boundary -- a double-word boundary
+  // is required by the ABI.
+  NumBytes = (NumBytes + 7) & ~7;
+  NumBytes = -NumBytes;
+  
+  if (NumBytes >= -4096) {
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SAVEri), SP::O6)
+      .addReg(SP::O6).addImm(NumBytes);
+  } else {
+    // Emit this the hard way.  This clobbers G1 which we always know is 
+    // available here.
+    unsigned OffHi = (unsigned)NumBytes >> 10U;
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SETHIi), SP::G1).addImm(OffHi);
+    // Emit G1 = G1 | %lo(NumBytes) to fill in the low bits of the offset.
+    BuildMI(MBB, MBBI, dl, TII.get(SP::ORri), SP::G1)
+      .addReg(SP::G1).addImm(NumBytes & ((1 << 10)-1));
+    BuildMI(MBB, MBBI, dl, TII.get(SP::SAVErr), SP::O6)
+      .addReg(SP::O6).addReg(SP::G1);
+  }
+}
+
+void SparcRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  DebugLoc dl = MBBI->getDebugLoc();
+  assert(MBBI->getOpcode() == SP::RETL &&
+         "Can only put epilog before 'retl' instruction!");
+  BuildMI(MBB, MBBI, dl, TII.get(SP::RESTORErr), SP::G0).addReg(SP::G0)
+    .addReg(SP::G0);
+}
+
+unsigned SparcRegisterInfo::getRARegister() const {
+  return SP::I7;
+}
+
+unsigned SparcRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return SP::I6;
+}
+
+unsigned SparcRegisterInfo::getEHExceptionRegister() const {
+  llvm_unreachable("What is the exception register");
+  return 0;
+}
+
+unsigned SparcRegisterInfo::getEHHandlerRegister() const {
+  llvm_unreachable("What is the exception handler register");
+  return 0;
+}
+
+int SparcRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  return SparcGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+#include "SparcGenRegisterInfo.inc"
+
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
new file mode 100644
index 0000000..8889ea6
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -0,0 +1,68 @@
+//===- SparcRegisterInfo.h - Sparc Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the Sparc implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCREGISTERINFO_H
+#define SPARCREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "SparcGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class SparcSubtarget;
+class TargetInstrInfo;
+class Type;
+
+struct SparcRegisterInfo : public SparcGenRegisterInfo {
+  SparcSubtarget &Subtarget;
+  const TargetInstrInfo &TII;
+  
+  SparcRegisterInfo(SparcSubtarget &st, const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...  
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td
new file mode 100644
index 0000000..2b05c19
--- /dev/null
+++ b/lib/Target/Sparc/SparcRegisterInfo.td
@@ -0,0 +1,166 @@
+//===- SparcRegisterInfo.td - Sparc Register defs ----------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the Sparc register file 
+//===----------------------------------------------------------------------===//
+
+class SparcReg<string n> : Register<n> {
+  field bits<5> Num;
+  let Namespace = "SP";
+}
+
+class SparcCtrlReg<string n>: Register<n> {
+  let Namespace = "SP";
+}
+
+// Registers are identified with 5-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<5> num, string n> : SparcReg<n> {
+  let Num = num;
+}
+// Rf - 32-bit floating-point registers
+class Rf<bits<5> num, string n> : SparcReg<n> {
+  let Num = num;
+}
+// Rd - Slots in the FP register file for 64-bit floating-point values.
+class Rd<bits<5> num, string n, list<Register> subregs> : SparcReg<n> {
+  let Num = num;
+  let SubRegs = subregs;
+}
+
+// Control Registers
+def ICC : SparcCtrlReg<"ICC">;
+def FCC : SparcCtrlReg<"FCC">;
+
+// Integer registers
+def G0 : Ri< 0, "G0">, DwarfRegNum<[0]>;
+def G1 : Ri< 1, "G1">, DwarfRegNum<[1]>;
+def G2 : Ri< 2, "G2">, DwarfRegNum<[2]>; 
+def G3 : Ri< 3, "G3">, DwarfRegNum<[3]>;
+def G4 : Ri< 4, "G4">, DwarfRegNum<[4]>;
+def G5 : Ri< 5, "G5">, DwarfRegNum<[5]>; 
+def G6 : Ri< 6, "G6">, DwarfRegNum<[6]>;
+def G7 : Ri< 7, "G7">, DwarfRegNum<[7]>;
+def O0 : Ri< 8, "O0">, DwarfRegNum<[8]>;
+def O1 : Ri< 9, "O1">, DwarfRegNum<[9]>;
+def O2 : Ri<10, "O2">, DwarfRegNum<[10]>; 
+def O3 : Ri<11, "O3">, DwarfRegNum<[11]>;
+def O4 : Ri<12, "O4">, DwarfRegNum<[12]>;
+def O5 : Ri<13, "O5">, DwarfRegNum<[13]>; 
+def O6 : Ri<14, "SP">, DwarfRegNum<[14]>;
+def O7 : Ri<15, "O7">, DwarfRegNum<[15]>;
+def L0 : Ri<16, "L0">, DwarfRegNum<[16]>;
+def L1 : Ri<17, "L1">, DwarfRegNum<[17]>;
+def L2 : Ri<18, "L2">, DwarfRegNum<[18]>; 
+def L3 : Ri<19, "L3">, DwarfRegNum<[19]>;
+def L4 : Ri<20, "L4">, DwarfRegNum<[20]>;
+def L5 : Ri<21, "L5">, DwarfRegNum<[21]>; 
+def L6 : Ri<22, "L6">, DwarfRegNum<[22]>;
+def L7 : Ri<23, "L7">, DwarfRegNum<[23]>;
+def I0 : Ri<24, "I0">, DwarfRegNum<[24]>;
+def I1 : Ri<25, "I1">, DwarfRegNum<[25]>;
+def I2 : Ri<26, "I2">, DwarfRegNum<[26]>; 
+def I3 : Ri<27, "I3">, DwarfRegNum<[27]>;
+def I4 : Ri<28, "I4">, DwarfRegNum<[28]>;
+def I5 : Ri<29, "I5">, DwarfRegNum<[29]>; 
+def I6 : Ri<30, "FP">, DwarfRegNum<[30]>;
+def I7 : Ri<31, "I7">, DwarfRegNum<[31]>;
+
+// Floating-point registers
+def F0  : Rf< 0,  "F0">, DwarfRegNum<[32]>;
+def F1  : Rf< 1,  "F1">, DwarfRegNum<[33]>;
+def F2  : Rf< 2,  "F2">, DwarfRegNum<[34]>; 
+def F3  : Rf< 3,  "F3">, DwarfRegNum<[35]>;
+def F4  : Rf< 4,  "F4">, DwarfRegNum<[36]>;
+def F5  : Rf< 5,  "F5">, DwarfRegNum<[37]>; 
+def F6  : Rf< 6,  "F6">, DwarfRegNum<[38]>;
+def F7  : Rf< 7,  "F7">, DwarfRegNum<[39]>;
+def F8  : Rf< 8,  "F8">, DwarfRegNum<[40]>; 
+def F9  : Rf< 9,  "F9">, DwarfRegNum<[41]>;
+def F10 : Rf<10, "F10">, DwarfRegNum<[42]>;
+def F11 : Rf<11, "F11">, DwarfRegNum<[43]>; 
+def F12 : Rf<12, "F12">, DwarfRegNum<[44]>;
+def F13 : Rf<13, "F13">, DwarfRegNum<[45]>;
+def F14 : Rf<14, "F14">, DwarfRegNum<[46]>; 
+def F15 : Rf<15, "F15">, DwarfRegNum<[47]>;
+def F16 : Rf<16, "F16">, DwarfRegNum<[48]>;
+def F17 : Rf<17, "F17">, DwarfRegNum<[49]>; 
+def F18 : Rf<18, "F18">, DwarfRegNum<[50]>;
+def F19 : Rf<19, "F19">, DwarfRegNum<[51]>;
+def F20 : Rf<20, "F20">, DwarfRegNum<[52]>; 
+def F21 : Rf<21, "F21">, DwarfRegNum<[53]>;
+def F22 : Rf<22, "F22">, DwarfRegNum<[54]>;
+def F23 : Rf<23, "F23">, DwarfRegNum<[55]>;
+def F24 : Rf<24, "F24">, DwarfRegNum<[56]>;
+def F25 : Rf<25, "F25">, DwarfRegNum<[57]>;
+def F26 : Rf<26, "F26">, DwarfRegNum<[58]>; 
+def F27 : Rf<27, "F27">, DwarfRegNum<[59]>;
+def F28 : Rf<28, "F28">, DwarfRegNum<[60]>;
+def F29 : Rf<29, "F29">, DwarfRegNum<[61]>; 
+def F30 : Rf<30, "F30">, DwarfRegNum<[62]>;
+def F31 : Rf<31, "F31">, DwarfRegNum<[63]>;
+
+// Aliases of the F* registers used to hold 64-bit fp values (doubles)
+def D0  : Rd< 0,  "F0", [F0,   F1]>, DwarfRegNum<[32]>;
+def D1  : Rd< 2,  "F2", [F2,   F3]>, DwarfRegNum<[34]>; 
+def D2  : Rd< 4,  "F4", [F4,   F5]>, DwarfRegNum<[36]>;
+def D3  : Rd< 6,  "F6", [F6,   F7]>, DwarfRegNum<[38]>; 
+def D4  : Rd< 8,  "F8", [F8,   F9]>, DwarfRegNum<[40]>;
+def D5  : Rd<10, "F10", [F10, F11]>, DwarfRegNum<[42]>;
+def D6  : Rd<12, "F12", [F12, F13]>, DwarfRegNum<[44]>;
+def D7  : Rd<14, "F14", [F14, F15]>, DwarfRegNum<[46]>; 
+def D8  : Rd<16, "F16", [F16, F17]>, DwarfRegNum<[48]>;
+def D9  : Rd<18, "F18", [F18, F19]>, DwarfRegNum<[50]>; 
+def D10 : Rd<20, "F20", [F20, F21]>, DwarfRegNum<[52]>;
+def D11 : Rd<22, "F22", [F22, F23]>, DwarfRegNum<[54]>;
+def D12 : Rd<24, "F24", [F24, F25]>, DwarfRegNum<[56]>;
+def D13 : Rd<26, "F26", [F26, F27]>, DwarfRegNum<[58]>; 
+def D14 : Rd<28, "F28", [F28, F29]>, DwarfRegNum<[60]>;
+def D15 : Rd<30, "F30", [F30, F31]>, DwarfRegNum<[62]>;
+
+// Register classes.
+//
+// FIXME: the register order should be defined in terms of the preferred
+// allocation order...
+//
+def IntRegs : RegisterClass<"SP", [i32], 32, [L0, L1, L2, L3, L4, L5, L6, L7,
+                                     I0, I1, I2, I3, I4, I5,
+                                     O0, O1, O2, O3, O4, O5, O7,
+
+   // FIXME: G1 reserved for now for large imm generation by frame code.
+                                     G1,
+                                     // Non-allocatable regs:
+                                     G2, G3, G4, // FIXME: OK for use only in
+                                                 // applications, not libraries.
+                                     O6, // stack ptr
+                                     I6, // frame ptr
+                                     I7, // return address
+                                     G0, // constant zero
+                                     G5, G6, G7 // reserved for kernel
+                                     ]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    IntRegsClass::iterator
+    IntRegsClass::allocation_order_end(const MachineFunction &MF) const {
+      // FIXME: These special regs should be taken out of the regclass!
+      return end()-10  // Don't allocate special registers
+         -1;  // FIXME: G1 reserved for large imm generation by frame code.
+    }
+  }];
+}
+
+def FPRegs : RegisterClass<"SP", [f32], 32, [F0, F1, F2, F3, F4, F5, F6, F7, F8,
+  F9, F10, F11, F12, F13, F14, F15, F16, F17, F18, F19, F20, F21, F22,
+  F23, F24, F25, F26, F27, F28, F29, F30, F31]>;
+
+def DFPRegs : RegisterClass<"SP", [f64], 64, [D0, D1, D2, D3, D4, D5, D6, D7,
+  D8, D9, D10, D11, D12, D13, D14, D15]>;
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
new file mode 100644
index 0000000..ce11af1
--- /dev/null
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -0,0 +1,34 @@
+//===- SparcSubtarget.cpp - SPARC Subtarget Information -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SPARC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcSubtarget.h"
+#include "SparcGenSubtarget.inc"
+using namespace llvm;
+
+SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &FS, 
+                               bool is64Bit) :
+  IsV9(false),
+  V8DeprecatedInsts(false),
+  IsVIS(false),
+  Is64Bit(is64Bit) {
+  
+  // Determine default and user specified characteristics
+  const char *CPU = "v8";
+  if (is64Bit) {
+    CPU = "v9";
+    IsV9 = true;
+  }
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h
new file mode 100644
index 0000000..cec0ab4
--- /dev/null
+++ b/lib/Target/Sparc/SparcSubtarget.h
@@ -0,0 +1,54 @@
+//=====-- SparcSubtarget.h - Define Subtarget for the SPARC ----*- C++ -*-====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SPARC specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARC_SUBTARGET_H
+#define SPARC_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+
+class SparcSubtarget : public TargetSubtarget {
+  bool IsV9;
+  bool V8DeprecatedInsts;
+  bool IsVIS;
+  bool Is64Bit;
+  
+public:
+  SparcSubtarget(const std::string &TT, const std::string &FS, bool is64bit);
+
+  bool isV9() const { return IsV9; }
+  bool isVIS() const { return IsVIS; }
+  bool useDeprecatedV8Instructions() const { return V8DeprecatedInsts; }
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+  
+  bool is64Bit() const { return Is64Bit; }
+  std::string getDataLayout() const {
+    const char *p;
+    if (is64Bit()) {
+      p = "E-p:64:64:64-i64:64:64-f64:64:64-f128:128:128-n32:64";
+    } else {
+      p = "E-p:32:32:32-i64:64:64-f64:64:64-f128:64:64-n32";
+    }
+    return std::string(p);
+  }
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
new file mode 100644
index 0000000..a676623
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -0,0 +1,67 @@
+//===-- SparcTargetMachine.cpp - Define TargetMachine for Sparc -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file implements the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SparcMCAsmInfo.h"
+#include "SparcTargetMachine.h"
+#include "Sparc.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeSparcTarget() {
+  // Register the target.
+  RegisterTargetMachine<SparcV8TargetMachine> X(TheSparcTarget);
+  RegisterTargetMachine<SparcV9TargetMachine> Y(TheSparcV9Target);
+
+  RegisterAsmInfo<SparcELFMCAsmInfo> A(TheSparcTarget);
+  RegisterAsmInfo<SparcELFMCAsmInfo> B(TheSparcV9Target);
+
+}
+
+/// SparcTargetMachine ctor - Create an ILP32 (V8) or 64-bit (V9) architecture
+///
+SparcTargetMachine::SparcTargetMachine(const Target &T, const std::string &TT, 
+                                       const std::string &FS, bool is64bit)
+  : LLVMTargetMachine(T, TT),
+    Subtarget(TT, FS, is64bit),
+    DataLayout(Subtarget.getDataLayout()),
+     TLInfo(*this), InstrInfo(Subtarget),
+    FrameInfo(TargetFrameInfo::StackGrowsDown, 8, 0) {
+}
+
+bool SparcTargetMachine::addInstSelector(PassManagerBase &PM,
+                                         CodeGenOpt::Level OptLevel) {
+  PM.add(createSparcISelDag(*this));
+  return false;
+}
+
+/// addPreEmitPass - This pass may be implemented by targets that want to run
+/// passes immediately before machine code is emitted.  This should return
+/// true if -print-machineinstrs should print out the code after the passes.
+bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM,
+                                        CodeGenOpt::Level OptLevel){
+  PM.add(createSparcFPMoverPass(*this));
+  PM.add(createSparcDelaySlotFillerPass(*this));
+  return true;
+}
+
+SparcV8TargetMachine::SparcV8TargetMachine(const Target &T,
+                                           const std::string &TT, 
+                                           const std::string &FS)
+  : SparcTargetMachine(T, TT, FS, false) {
+}
+
+SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, 
+                                           const std::string &TT, 
+                                           const std::string &FS)
+  : SparcTargetMachine(T, TT, FS, true) {
+}
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
new file mode 100644
index 0000000..5834d08
--- /dev/null
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -0,0 +1,70 @@
+//===-- SparcTargetMachine.h - Define TargetMachine for Sparc ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the Sparc specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SPARCTARGETMACHINE_H
+#define SPARCTARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "SparcInstrInfo.h"
+#include "SparcSubtarget.h"
+#include "SparcISelLowering.h"
+
+namespace llvm {
+
+class SparcTargetMachine : public LLVMTargetMachine {
+  SparcSubtarget Subtarget;
+  const TargetData DataLayout;       // Calculates type size & alignment
+  SparcTargetLowering TLInfo;
+  SparcInstrInfo InstrInfo;
+  TargetFrameInfo FrameInfo;
+public:
+  SparcTargetMachine(const Target &T, const std::string &TT,
+                     const std::string &FS, bool is64bit);
+
+  virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual const SparcSubtarget   *getSubtargetImpl() const{ return &Subtarget; }
+  virtual const SparcRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual SparcTargetLowering* getTargetLowering() const {
+    return const_cast<SparcTargetLowering*>(&TLInfo);
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+};
+
+/// SparcV8TargetMachine - Sparc 32-bit target machine
+///
+class SparcV8TargetMachine : public SparcTargetMachine {
+public:
+  SparcV8TargetMachine(const Target &T, const std::string &TT,
+                       const std::string &FS);
+};
+
+/// SparcV9TargetMachine - Sparc 64-bit target machine
+///
+class SparcV9TargetMachine : public SparcTargetMachine {
+public:
+  SparcV9TargetMachine(const Target &T, const std::string &TT,
+                       const std::string &FS);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/Sparc/TargetInfo/CMakeLists.txt b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..870b56a
--- /dev/null
+++ b/lib/Target/Sparc/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSparcInfo
+  SparcTargetInfo.cpp
+  )
+
+add_dependencies(LLVMSparcInfo SparcCodeGenTable_gen)
diff --git a/lib/Target/Sparc/TargetInfo/Makefile b/lib/Target/Sparc/TargetInfo/Makefile
new file mode 100644
index 0000000..641ed87
--- /dev/null
+++ b/lib/Target/Sparc/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/Sparc/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSparcInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
new file mode 100644
index 0000000..5c06f07
--- /dev/null
+++ b/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp
@@ -0,0 +1,21 @@
+//===-- SparcTargetInfo.cpp - Sparc Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Sparc.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheSparcTarget;
+Target llvm::TheSparcV9Target;
+
+extern "C" void LLVMInitializeSparcTargetInfo() { 
+  RegisterTarget<Triple::sparc> X(TheSparcTarget, "sparc", "Sparc");
+  RegisterTarget<Triple::sparcv9> Y(TheSparcV9Target, "sparcv9", "Sparc V9");
+}
diff --git a/lib/Target/SubtargetFeature.cpp b/lib/Target/SubtargetFeature.cpp
new file mode 100644
index 0000000..2094cc9
--- /dev/null
+++ b/lib/Target/SubtargetFeature.cpp
@@ -0,0 +1,387 @@
+//===- SubtargetFeature.cpp - CPU characteristics Implementation ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SubtargetFeature interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/SubtargetFeature.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                          Static Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// hasFlag - Determine whether a feature string carries an explicit
+/// enable/disable flag, i.e. starts with '+' (enable) or '-' (disable).
+/// Asserts on an empty string.
+static inline bool hasFlag(const std::string &Feature) {
+  assert(!Feature.empty() && "Empty string");
+  // Get first character
+  char Ch = Feature[0];
+  // Check if first character is '+' or '-' flag
+  return Ch == '+' || Ch =='-';
+}
+
+/// StripFlag - Return the feature name with any leading '+'/'-' flag
+/// removed; strings without a flag are returned unchanged.
+static inline std::string StripFlag(const std::string &Feature) {
+  return hasFlag(Feature) ? Feature.substr(1) : Feature;
+}
+
+/// isEnabled - Return true if the feature string requests enabling, i.e.
+/// starts with '+'.  A '-' prefix or a bare name yields false.  Asserts on
+/// an empty string.
+static inline bool isEnabled(const std::string &Feature) {
+  assert(!Feature.empty() && "Empty string");
+  // Get first character
+  char Ch = Feature[0];
+  // Check if first character is '+' for enabled
+  return Ch == '+';
+}
+
+/// PrependFlag - Return the feature string guaranteed to carry a flag: if
+/// one is already present the string is returned unchanged, otherwise '+'
+/// or '-' is prepended according to IsEnabled.
+static inline std::string PrependFlag(const std::string &Feature,
+                                      bool IsEnabled) {
+  assert(!Feature.empty() && "Empty string");
+  if (hasFlag(Feature)) return Feature;
+  return std::string(IsEnabled ? "+" : "-") + Feature;
+}
+
+/// Split - Splits a string of comma separated items in to a vector of
+/// strings.  An empty input produces a single empty element and consecutive
+/// commas produce empty elements; nothing is ever dropped.
+static void Split(std::vector<std::string> &V, const std::string &S) {
+  // Start at beginning of string.
+  size_t Pos = 0;
+  while (true) {
+    // Find the next comma
+    size_t Comma = S.find(',', Pos);
+    // If no comma found then the rest of the string is used
+    if (Comma == std::string::npos) {
+      // Add string to vector
+      V.push_back(S.substr(Pos));
+      break;
+    }
+    // Otherwise add substring to vector
+    V.push_back(S.substr(Pos, Comma - Pos));
+    // Advance to next item
+    Pos = Comma + 1;
+  }
+}
+
+/// Join a vector of strings to a string with a comma separating each
+/// element.  Inverse of Split for inputs without embedded commas.
+static std::string Join(const std::vector<std::string> &V) {
+  // Start with empty string.
+  std::string Result;
+  // If the vector is not empty 
+  if (!V.empty()) {
+    // Start with the CPU feature (slot 0 holds the CPU name)
+    Result = V[0];
+    // For each successive feature
+    for (size_t i = 1; i < V.size(); i++) {
+      // Add a comma
+      Result += ",";
+      // Add the feature
+      Result += V[i];
+    }
+  }
+  // Return the features string 
+  return Result;
+}
+
+/// AddFeature - Append a feature to the list: the name is lowercased and
+/// given a '+'/'-' prefix (per IsEnabled) unless it already carries one.
+/// Empty strings are silently ignored.
+void SubtargetFeatures::AddFeature(const std::string &String,
+                                   bool IsEnabled) {
+  // Don't add empty features
+  if (!String.empty()) {
+    // Convert to lowercase, prepend flag and add to vector
+    Features.push_back(PrependFlag(LowercaseString(String), IsEnabled));
+  }
+}
+
+/// Find KV in array using binary search.  The table must be sorted by Key
+/// in ascending strcmp order (callers assert this in debug builds); returns
+/// NULL when S is not present.
+/// NOTE(review): this helper has external linkage; consider marking it
+/// 'static' to keep it file-local like the other helpers in this file.
+template<typename T> const T *Find(const std::string &S, const T *A, size_t L) {
+  // Make the lower bound element we're looking for
+  T KV;
+  KV.Key = S.c_str();
+  // Determine the end of the array
+  const T *Hi = A + L;
+  // Binary search the array
+  const T *F = std::lower_bound(A, Hi, KV);
+  // lower_bound may return a neighbouring entry; confirm the exact key
+  if (F == Hi || std::string(F->Key) != S) return NULL;
+  // Return the found array item
+  return F;
+}
+
+/// getLongestEntryLength - Return the length of the longest entry in the
+/// table.  Used by Help() to align the description column.
+static size_t getLongestEntryLength(const SubtargetFeatureKV *Table,
+                                    size_t Size) {
+  size_t MaxLen = 0;
+  for (size_t i = 0; i < Size; i++)
+    MaxLen = std::max(MaxLen, std::strlen(Table[i].Key));
+  return MaxLen;
+}
+
+/// Help - Print the table of known CPUs and the table of known features for
+/// this target to stderr, then terminate the process.  Reached when the
+/// user passes -mcpu=help or -mattr=+help on the command line.
+///
+static void Help(const SubtargetFeatureKV *CPUTable, size_t CPUTableSize,
+                 const SubtargetFeatureKV *FeatTable, size_t FeatTableSize) {
+  // Determine the length of the longest CPU and Feature entries so the
+  // description column lines up.  Use size_t to match strlen's return type
+  // and avoid a narrowing conversion.
+  size_t MaxCPULen  = getLongestEntryLength(CPUTable, CPUTableSize);
+  size_t MaxFeatLen = getLongestEntryLength(FeatTable, FeatTableSize);
+
+  // Print the CPU table.
+  errs() << "Available CPUs for this target:\n\n";
+  for (size_t i = 0; i != CPUTableSize; i++)
+    errs() << "  " << CPUTable[i].Key
+           << std::string(MaxCPULen - std::strlen(CPUTable[i].Key), ' ')
+           << " - " << CPUTable[i].Desc << ".\n";
+  errs() << "\n";
+
+  // Print the Feature table.
+  errs() << "Available features for this target:\n\n";
+  for (size_t i = 0; i != FeatTableSize; i++)
+    errs() << "  " << FeatTable[i].Key
+           << std::string(MaxFeatLen - std::strlen(FeatTable[i].Key), ' ')
+           << " - " << FeatTable[i].Desc << ".\n";
+  errs() << "\n";
+
+  errs() << "Use +feature to enable a feature, or -feature to disable it.\n"
+         << "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
+  exit(1);
+}
+
+//===----------------------------------------------------------------------===//
+//                    SubtargetFeatures Implementation
+//===----------------------------------------------------------------------===//
+
+/// SubtargetFeatures - Construct from a comma-separated features string.
+/// After Split, Features[0] holds the CPU name (possibly empty) and the
+/// remaining entries hold +/- feature names, as used by setCPU/getBits.
+SubtargetFeatures::SubtargetFeatures(const std::string &Initial) {
+  // Break up string into separate features
+  Split(Features, Initial);
+}
+
+
+/// getString - Return the features joined back into one comma-separated
+/// string (inverse of the constructor's Split).
+std::string SubtargetFeatures::getString() const {
+  return Join(Features);
+}
+/// setString - Replace the current feature list with the (lowercased,
+/// comma-separated) features in Initial.
+void SubtargetFeatures::setString(const std::string &Initial) {
+  // Throw out old features
+  Features.clear();
+  // Break up string into separate features
+  Split(Features, LowercaseString(Initial));
+}
+
+
+/// setCPU - Set the CPU string.  Replaces previous setting.  Setting to ""
+/// clears CPU.  Assumes Features is non-empty -- slot 0 is reserved for the
+/// CPU name by the constructor's Split.
+void SubtargetFeatures::setCPU(const std::string &String) {
+  Features[0] = LowercaseString(String);
+}
+
+
+/// setCPUIfNone - Set the CPU string only if no CPU is currently set (i.e.
+/// slot 0 is empty); otherwise leave the existing CPU untouched.
+void SubtargetFeatures::setCPUIfNone(const std::string &String) {
+  if (Features[0].empty()) setCPU(String);
+}
+
+/// getCPU - Return the current CPU name (slot 0 of the feature list;
+/// empty when no CPU has been set).
+const std::string & SubtargetFeatures::getCPU() const {
+  return Features[0];
+}
+
+
+/// SetImpliedBits - For each feature that is (transitively) implied by this
+/// feature (per FeatureEntry->Implies), set its bit in Bits, recursing so
+/// features implied by the implied features are set as well.
+static
+void SetImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+                    const SubtargetFeatureKV *FeatureTable,
+                    size_t FeatureTableSize) {
+  for (size_t i = 0; i < FeatureTableSize; ++i) {
+    const SubtargetFeatureKV &FE = FeatureTable[i];
+
+    // Skip the entry itself (same Value) to avoid self-recursion.
+    if (FeatureEntry->Value == FE.Value) continue;
+
+    if (FeatureEntry->Implies & FE.Value) {
+      Bits |= FE.Value;
+      // Recurse to pick up features implied by FE in turn.
+      SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  }
+}
+
+/// ClearImpliedBits - For each feature that (transitively) implies this
+/// feature (FE.Implies contains FeatureEntry->Value), clear its bit in
+/// Bits, recursing upward through the implication chain -- a feature cannot
+/// stay enabled once something it requires has been disabled.
+static
+void ClearImpliedBits(uint32_t &Bits, const SubtargetFeatureKV *FeatureEntry,
+                      const SubtargetFeatureKV *FeatureTable,
+                      size_t FeatureTableSize) {
+  for (size_t i = 0; i < FeatureTableSize; ++i) {
+    const SubtargetFeatureKV &FE = FeatureTable[i];
+
+    // Skip the entry itself (same Value) to avoid self-recursion.
+    if (FeatureEntry->Value == FE.Value) continue;
+
+    if (FE.Implies & FeatureEntry->Value) {
+      Bits &= ~FE.Value;
+      ClearImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  }
+}
+
+/// getBits - Resolve the stored CPU/feature strings against the target's
+/// CPU and feature tables and return the resulting feature bit mask.
+/// Unknown CPU or feature names emit a diagnostic to stderr and are
+/// otherwise ignored; "help" / "+help" prints the tables and exits.
+uint32_t SubtargetFeatures::getBits(const SubtargetFeatureKV *CPUTable,
+                                          size_t CPUTableSize,
+                                    const SubtargetFeatureKV *FeatureTable,
+                                          size_t FeatureTableSize) {
+  assert(CPUTable && "missing CPU table");
+  assert(FeatureTable && "missing features table");
+#ifndef NDEBUG
+  // Both tables must be sorted by Key because Find() binary-searches them.
+  for (size_t i = 1; i < CPUTableSize; i++) {
+    assert(strcmp(CPUTable[i - 1].Key, CPUTable[i].Key) < 0 &&
+           "CPU table is not sorted");
+  }
+  for (size_t i = 1; i < FeatureTableSize; i++) {
+    assert(strcmp(FeatureTable[i - 1].Key, FeatureTable[i].Key) < 0 &&
+          "CPU features table is not sorted");
+  }
+#endif
+  uint32_t Bits = 0;                    // Resulting bits
+
+  // Check if help is needed
+  if (Features[0] == "help")
+    Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+  
+  // Find CPU entry (Features[0] always holds the CPU name, possibly empty)
+  const SubtargetFeatureKV *CPUEntry =
+                            Find(Features[0], CPUTable, CPUTableSize);
+  // If there is a match
+  if (CPUEntry) {
+    // Set base feature bits
+    Bits = CPUEntry->Value;
+
+    // Set the feature implied by this CPU feature, if any.
+    for (size_t i = 0; i < FeatureTableSize; ++i) {
+      const SubtargetFeatureKV &FE = FeatureTable[i];
+      if (CPUEntry->Value & FE.Value)
+        SetImpliedBits(Bits, &FE, FeatureTable, FeatureTableSize);
+    }
+  } else {
+    errs() << "'" << Features[0]
+           << "' is not a recognized processor for this target"
+           << " (ignoring processor)\n";
+  }
+  // Iterate through each feature entry after the CPU name, applying them
+  // in order so a later +x/-x overrides an earlier one.
+  for (size_t i = 1; i < Features.size(); i++) {
+    const std::string &Feature = Features[i];
+    
+    // Check for help
+    if (Feature == "+help")
+      Help(CPUTable, CPUTableSize, FeatureTable, FeatureTableSize);
+    
+    // Find feature in table.
+    const SubtargetFeatureKV *FeatureEntry =
+                       Find(StripFlag(Feature), FeatureTable, FeatureTableSize);
+    // If there is a match
+    if (FeatureEntry) {
+      // Enable/disable feature in bits
+      if (isEnabled(Feature)) {
+        Bits |=  FeatureEntry->Value;
+
+        // For each feature that this implies, set it.
+        SetImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+      } else {
+        Bits &= ~FeatureEntry->Value;
+
+        // For each feature that implies this, clear it.
+        ClearImpliedBits(Bits, FeatureEntry, FeatureTable, FeatureTableSize);
+      }
+    } else {
+      errs() << "'" << Feature
+             << "' is not a recognized feature for this target"
+             << " (ignoring feature)\n";
+    }
+  }
+
+  return Bits;
+}
+
+/// getInfo - Look up Features[0] (the CPU name) in Table and return the
+/// associated info pointer, or NULL (after printing a diagnostic) when the
+/// CPU is not listed.  Table must be sorted by Key for Find()'s binary
+/// search (asserted below in debug builds).
+void *SubtargetFeatures::getInfo(const SubtargetInfoKV *Table,
+                                       size_t TableSize) {
+  assert(Table && "missing table");
+#ifndef NDEBUG
+  for (size_t i = 1; i < TableSize; i++) {
+    assert(strcmp(Table[i - 1].Key, Table[i].Key) < 0 && "Table is not sorted");
+  }
+#endif
+
+  // Find entry
+  const SubtargetInfoKV *Entry = Find(Features[0], Table, TableSize);
+  
+  if (Entry) {
+    return Entry->Value;
+  } else {
+    errs() << "'" << Features[0]
+           << "' is not a recognized processor for this target"
+           << " (ignoring processor)\n";
+    return NULL;
+  }
+}
+
+/// print - Print every entry (the CPU name in slot 0 included) to OS,
+/// separated by two spaces and terminated with a newline.
+void SubtargetFeatures::print(raw_ostream &OS) const {
+  for (size_t i = 0, e = Features.size(); i != e; ++i)
+    OS << Features[i] << "  ";
+  OS << "\n";
+}
+
+/// dump - Print the feature list to the debug stream (debugger/trace aid).
+void SubtargetFeatures::dump() const {
+  print(dbgs());
+}
+
+/// getDefaultSubtargetFeatures - Return a string listing
+/// the features associated with the target triple.  Currently only Apple
+/// PowerPC triples get non-empty defaults; everything else returns "".
+///
+/// FIXME: This is an inelegant way of specifying the features of a
+/// subtarget. It would be better if we could encode this information
+/// into the IR. See <rdar://5972456>.
+///
+std::string SubtargetFeatures::getDefaultSubtargetFeatures(
+                                               const Triple& Triple) {
+  // Note: the parameter shadows the type name 'Triple'; the qualified
+  // 'Triple::Apple' etc. below still resolve to the type because qualified
+  // name lookup ignores non-type names.
+  switch (Triple.getVendor()) {
+  case Triple::Apple:
+    switch (Triple.getArch()) {
+    case Triple::ppc:   // powerpc-apple-*
+      return std::string("altivec");
+    case Triple::ppc64: // powerpc64-apple-*
+      return std::string("64bit,altivec");
+    default:
+      break;
+    }
+    break;
+  default:
+    break;
+  } 
+
+  return std::string("");
+}
diff --git a/lib/Target/SystemZ/AsmPrinter/CMakeLists.txt b/lib/Target/SystemZ/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..c6be83a
--- /dev/null
+++ b/lib/Target/SystemZ/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+# The AsmPrinter needs private headers from the parent SystemZ directory,
+# including tablegen output in the binary dir.
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZAsmPrinter
+  SystemZAsmPrinter.cpp
+  )
+# SystemZAsmPrinter.cpp includes SystemZGenAsmWriter.inc, so tablegen must
+# run before this library builds.
+add_dependencies(LLVMSystemZAsmPrinter SystemZCodeGenTable_gen)
diff --git a/lib/Target/SystemZ/AsmPrinter/Makefile b/lib/Target/SystemZ/AsmPrinter/Makefile
new file mode 100644
index 0000000..9a350df
--- /dev/null
+++ b/lib/Target/SystemZ/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/SystemZ/AsmPrinter/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+# Builds LLVMSystemZAsmPrinter, the SystemZ assembly-printing library.
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZAsmPrinter
+
+# Hack: we need to include 'main' SystemZ target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp b/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp
new file mode 100644
index 0000000..7a9e8dd
--- /dev/null
+++ b/lib/Target/SystemZ/AsmPrinter/SystemZAsmPrinter.cpp
@@ -0,0 +1,222 @@
+//===-- SystemZAsmPrinter.cpp - SystemZ LLVM assembly writer ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the SystemZ assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+namespace {
+  /// SystemZAsmPrinter - SystemZ-specific AsmPrinter.  Emits instructions
+  /// through the tablegen'd printInstruction() routine.
+  class SystemZAsmPrinter : public AsmPrinter {
+  public:
+    SystemZAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                      MCContext &Ctx, MCStreamer &Streamer,
+                      const MCAsmInfo *MAI)
+      : AsmPrinter(O, TM, Ctx, Streamer, MAI) {}
+
+    virtual const char *getPassName() const {
+      return "SystemZ Assembly Printer";
+    }
+
+    // Operand-printing helpers; presumably invoked from the generated
+    // printInstruction() in SystemZGenAsmWriter.inc -- confirm in the .td.
+    void printOperand(const MachineInstr *MI, int OpNum,
+                      const char* Modifier = 0);
+    void printPCRelImmOperand(const MachineInstr *MI, int OpNum);
+    void printRIAddrOperand(const MachineInstr *MI, int OpNum,
+                            const char* Modifier = 0);
+    void printRRIAddrOperand(const MachineInstr *MI, int OpNum,
+                             const char* Modifier = 0);
+    // Print an immediate truncated to a signed 16-bit value.
+    void printS16ImmOperand(const MachineInstr *MI, int OpNum) {
+      O << (int16_t)MI->getOperand(OpNum).getImm();
+    }
+    // Print an immediate truncated to a signed 32-bit value.
+    void printS32ImmOperand(const MachineInstr *MI, int OpNum) {
+      O << (int32_t)MI->getOperand(OpNum).getImm();
+    }
+
+    void printInstruction(const MachineInstr *MI);  // autogenerated.
+    static const char *getRegisterName(unsigned RegNo);
+
+    void EmitInstruction(const MachineInstr *MI);
+
+    // This pass only prints; it modifies no analyses.
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AsmPrinter::getAnalysisUsage(AU);
+      AU.setPreservesAll();
+    }
+  };
+} // end of anonymous namespace
+
+#include "SystemZGenAsmWriter.inc"
+
+/// EmitInstruction - Print one MachineInstr via the tablegen'd printer and
+/// finish the line on the output streamer.
+void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  // Call the autogenerated instruction printer routines.
+  printInstruction(MI);
+  OutStreamer.AddBlankLine();
+}
+
+/// printPCRelImmOperand - Print a PC-relative immediate: a plain immediate,
+/// a basic-block label, or a global/external symbol.  Symbols get @PLT
+/// decoration when compiling PIC (globals only when they may be preempted).
+void SystemZAsmPrinter::printPCRelImmOperand(const MachineInstr *MI, int OpNum){
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    O << *GetGlobalValueSymbol(GV);
+
+    // Assemble calls via PLT for externally visible symbols if PIC.
+    if (TM.getRelocationModel() == Reloc::PIC_ &&
+        !GV->hasHiddenVisibility() && !GV->hasProtectedVisibility() &&
+        !GV->hasLocalLinkage())
+      O << "@PLT";
+
+    printOffset(MO.getOffset());
+    return;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    std::string Name(MAI->getGlobalPrefix());
+    Name += MO.getSymbolName();
+    O << Name;
+
+    if (TM.getRelocationModel() == Reloc::PIC_)
+      O << "@PLT";
+
+    return;
+  }
+  default:
+    // llvm_unreachable instead of assert(0): with asserts disabled an
+    // assert(0) would silently fall through on an unhandled operand kind.
+    llvm_unreachable("Not implemented yet!");
+  }
+}
+
+
+/// printOperand - Print a general machine operand.  The optional Modifier
+/// selects a half of a register pair; it is assumed to look like
+/// "subreg:even" / "subreg:odd" (the +7 below skips "subreg" plus one
+/// separator character) -- TODO(review): confirm against the .td users.
+void SystemZAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
+                                     const char* Modifier) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register: {
+    assert (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) &&
+            "Virtual registers should be already mapped!");
+    unsigned Reg = MO.getReg();
+    if (Modifier && strncmp(Modifier, "subreg", 6) == 0) {
+      if (strncmp(Modifier + 7, "even", 4) == 0)
+        Reg = TRI->getSubReg(Reg, SystemZ::SUBREG_EVEN);
+      else if (strncmp(Modifier + 7, "odd", 3) == 0)
+        Reg = TRI->getSubReg(Reg, SystemZ::SUBREG_ODD);
+      else
+        // llvm_unreachable instead of assert(0) so this stays an error in
+        // release builds too.
+        llvm_unreachable("Invalid subreg modifier");
+    }
+
+    O << '%' << getRegisterName(Reg);
+    return;
+  }
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber() << '_'
+      << MO.getIndex();
+
+    return;
+  case MachineOperand::MO_ConstantPoolIndex:
+    // NOTE(review): this case prints the offset here AND falls through to
+    // the printOffset() at the end of the function, so the offset appears
+    // twice for constant-pool operands -- verify this is intended.
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber() << '_'
+      << MO.getIndex();
+
+    printOffset(MO.getOffset());
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    O << *GetGlobalValueSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ExternalSymbol: {
+    O << *GetExternalSymbolSymbol(MO.getSymbolName());
+    break;
+  }
+  default:
+    llvm_unreachable("Not implemented yet!");
+  }
+
+  // Only the cases above that 'break' (constant pool, global, external
+  // symbol) reach here for target-flag decoration plus an offset.
+  switch (MO.getTargetFlags()) {
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  case SystemZII::MO_NO_FLAG:
+    break;
+  case SystemZII::MO_GOTENT:    O << "@GOTENT";    break;
+  case SystemZII::MO_PLT:       O << "@PLT";       break;
+  }
+
+  printOffset(MO.getOffset());
+}
+
+/// printRIAddrOperand - Print an RI-style address as "disp(base)", omitting
+/// the parenthesised base when no base register is present.  Modifier is
+/// accepted for interface uniformity but unused here.
+void SystemZAsmPrinter::printRIAddrOperand(const MachineInstr *MI, int OpNum,
+                                           const char* Modifier) {
+  const MachineOperand &Base = MI->getOperand(OpNum);
+
+  // Print displacement operand.
+  printOperand(MI, OpNum+1);
+
+  // Print base operand (if any)
+  if (Base.getReg()) {
+    O << '(';
+    printOperand(MI, OpNum);
+    O << ')';
+  }
+}
+
+/// printRRIAddrOperand - Print an RRI-style address as "disp(base,index)".
+/// The parens (and index) are omitted when there is no base register; an
+/// index without a base is rejected by the assert below.
+void SystemZAsmPrinter::printRRIAddrOperand(const MachineInstr *MI, int OpNum,
+                                            const char* Modifier) {
+  const MachineOperand &Base = MI->getOperand(OpNum);
+  const MachineOperand &Index = MI->getOperand(OpNum+2);
+
+  // Print displacement operand.
+  printOperand(MI, OpNum+1);
+
+  // Print base operand (if any)
+  if (Base.getReg()) {
+    O << '(';
+    printOperand(MI, OpNum);
+    if (Index.getReg()) {
+      O << ',';
+      printOperand(MI, OpNum+2);
+    }
+    O << ')';
+  } else
+    assert(!Index.getReg() && "Should allocate base register first!");
+}
+
+// Force static initialization.  Registers this printer with the SystemZ
+// target so the registry can construct it; extern "C" avoids name mangling.
+extern "C" void LLVMInitializeSystemZAsmPrinter() {
+  RegisterAsmPrinter<SystemZAsmPrinter> X(TheSystemZTarget);
+}
diff --git a/lib/Target/SystemZ/CMakeLists.txt b/lib/Target/SystemZ/CMakeLists.txt
new file mode 100644
index 0000000..81e51d8
--- /dev/null
+++ b/lib/Target/SystemZ/CMakeLists.txt
@@ -0,0 +1,23 @@
+set(LLVM_TARGET_DEFINITIONS SystemZ.td)
+
+# Sources generated by tablegen from SystemZ.td.
+tablegen(SystemZGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(SystemZGenRegisterNames.inc -gen-register-enums)
+tablegen(SystemZGenRegisterInfo.inc -gen-register-desc)
+tablegen(SystemZGenInstrNames.inc -gen-instr-enums)
+tablegen(SystemZGenInstrInfo.inc -gen-instr-desc)
+tablegen(SystemZGenAsmWriter.inc -gen-asm-writer)
+tablegen(SystemZGenDAGISel.inc -gen-dag-isel)
+tablegen(SystemZGenCallingConv.inc -gen-callingconv)
+tablegen(SystemZGenSubtarget.inc -gen-subtarget)
+
+# Main SystemZ code generation library.
+add_llvm_target(SystemZCodeGen
+  SystemZISelDAGToDAG.cpp
+  SystemZISelLowering.cpp
+  SystemZInstrInfo.cpp
+  SystemZMCAsmInfo.cpp
+  SystemZRegisterInfo.cpp
+  SystemZSubtarget.cpp
+  SystemZTargetMachine.cpp
+  )
+
+# Instruction selection is built on SelectionDAG.
+target_link_libraries (LLVMSystemZCodeGen LLVMSelectionDAG)
diff --git a/lib/Target/SystemZ/Makefile b/lib/Target/SystemZ/Makefile
new file mode 100644
index 0000000..5b44090
--- /dev/null
+++ b/lib/Target/SystemZ/Makefile
@@ -0,0 +1,23 @@
+##===- lib/Target/SystemZ/Makefile ---------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMSystemZCodeGen
+TARGET = SystemZ
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = SystemZGenRegisterInfo.h.inc SystemZGenRegisterNames.inc \
+                SystemZGenRegisterInfo.inc SystemZGenInstrNames.inc \
+                SystemZGenInstrInfo.inc SystemZGenAsmWriter.inc \
+                SystemZGenDAGISel.inc SystemZGenSubtarget.inc SystemZGenCallingConv.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/SystemZ/SystemZ.h b/lib/Target/SystemZ/SystemZ.h
new file mode 100644
index 0000000..ea5240a
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZ.h
@@ -0,0 +1,61 @@
+//=-- SystemZ.h - Top-level interface for SystemZ representation -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in
+// the LLVM SystemZ backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_H
+#define LLVM_TARGET_SystemZ_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class SystemZTargetMachine;
+  class FunctionPass;
+  class formatted_raw_ostream;
+
+  namespace SystemZCC {
+    // SystemZ specific condition code. These correspond to SYSTEMZ_*_COND in
+    // SystemZInstrInfo.td. They must be kept in synch.
+    enum CondCodes {
+      O   = 0,
+      H   = 1,
+      NLE = 2,
+      L   = 3,
+      NHE = 4,
+      LH  = 5,
+      NE  = 6,
+      E   = 7,
+      NLH = 8,
+      HE  = 9,
+      NL  = 10,
+      LE  = 11,
+      NH  = 12,
+      NO  = 13,
+      // Sentinel for "no/unknown condition"; not a real encoding.
+      INVALID = -1
+    };
+  }
+
+  // Create the SystemZ SelectionDAG instruction-selection pass.
+  FunctionPass *createSystemZISelDag(SystemZTargetMachine &TM,
+                                    CodeGenOpt::Level OptLevel);
+
+  // The single Target object registered for SystemZ.
+  extern Target TheSystemZTarget;
+
+} // end namespace llvm
+
+// Defines symbolic names for SystemZ registers.
+// This defines a mapping from register name to register number.
+#include "SystemZGenRegisterNames.inc"
+
+// Defines symbolic names for the SystemZ instructions.
+#include "SystemZGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
new file mode 100644
index 0000000..4c08c08
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -0,0 +1,61 @@
+//===- SystemZ.td - Describe the SystemZ Target Machine ------*- tblgen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This is the top level entry point for the SystemZ target.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Subtarget Features. 
+//===----------------------------------------------------------------------===//
+// "+z10" in a feature string sets the HasZ10Insts subtarget predicate.
+def FeatureZ10 : SubtargetFeature<"z10", "HasZ10Insts", "true",
+                                  "Support Z10 instructions">;
+
+//===----------------------------------------------------------------------===//
+// SystemZ supported processors.
+//===----------------------------------------------------------------------===//
+// Helper class: a processor with no scheduling itineraries.
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"z9",  []>;
+def : Proc<"z10", [FeatureZ10]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "SystemZRegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Calling Convention Description
+//===----------------------------------------------------------------------===//
+
+include "SystemZCallingConv.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "SystemZInstrInfo.td"
+include "SystemZInstrFP.td"
+
+def SystemZInstrInfo : InstrInfo {} 
+
+//===----------------------------------------------------------------------===//
+// Target Declaration
+//===----------------------------------------------------------------------===//
+
+// Top-level target definition tying the instruction set to the SystemZ
+// target record used by the rest of the backend.
+def SystemZ : Target {
+  let InstructionSet = SystemZInstrInfo;
+}
+
diff --git a/lib/Target/SystemZ/SystemZCallingConv.td b/lib/Target/SystemZ/SystemZCallingConv.td
new file mode 100644
index 0000000..c799a9e
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZCallingConv.td
@@ -0,0 +1,46 @@
+//=- SystemZCallingConv.td - Calling Conventions for SystemZ -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for SystemZ architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SystemZ Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_SystemZ : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // i64 values are returned in R2D first, then R3D-R5D.
+  CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D]>>,
+
+  // f32 / f64 are returned in the even FP registers, F0 first.
+  CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+  CCIfType<[f64], CCAssignToReg<[F0L, F2L, F4L, F6L]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// SystemZ Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_SystemZ : CallingConv<[
+  // Promote i8/i16/i32 arguments to i64.
+  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
+
+  // The first 5 integer arguments of non-varargs functions are passed in
+  // integer registers.
+  CCIfType<[i64], CCAssignToReg<[R2D, R3D, R4D, R5D, R6D]>>,
+
+  // The first 4 floating point arguments of non-varargs functions are passed
+  // in FP registers (even registers only, F0 first).
+  CCIfType<[f32], CCAssignToReg<[F0S, F2S, F4S, F6S]>>,
+  CCIfType<[f64], CCAssignToReg<[F0L, F2L, F4L, F6L]>>,
+
+  // Integer values get stored in stack slots that are 8 bytes in
+  // size and 8-byte aligned.
+  CCIfType<[i64, f32, f64], CCAssignToStack<8, 8>>
+]>;
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
new file mode 100644
index 0000000..f6f632d
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -0,0 +1,826 @@
+//==-- SystemZISelDAGToDAG.cpp - A dag to dag inst selector for SystemZ ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the SystemZ target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZISelLowering.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Subregister indices used to address the even/odd halves of a register
+// pair: the *32 variants select 32-bit halves, the others 64-bit halves.
+// NOTE(review): the numeric values are assumed to match the subregister
+// indices declared in the target description -- confirm against the .td files.
+static const unsigned subreg_even32 = 1;
+static const unsigned subreg_odd32  = 2;
+static const unsigned subreg_even   = 3;
+static const unsigned subreg_odd    = 4;
+
+namespace {
+  /// SystemZRRIAddressMode - This corresponds to rriaddr, but uses SDValue's
+  /// instead of register numbers for the leaves of the matched tree.
+  struct SystemZRRIAddressMode {
+    // Discriminator for the Base field below: either a register value or a
+    // frame index.
+    enum {
+      RegBase,
+      FrameIndexBase
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDValue Reg;
+      int FrameIndex;
+    } Base;
+
+    SDValue IndexReg;   // Optional index register; must stay unset if isRI.
+    int64_t Disp;       // Immediate displacement.
+    bool isRI;          // True for reg+imm modes (no index register allowed).
+
+    SystemZRRIAddressMode(bool RI = false)
+      : BaseType(RegBase), IndexReg(), Disp(0), isRI(RI) {
+    }
+
+    /// dump - Print the current state of the address mode to stderr, for
+    /// debugging the address matcher.
+    void dump() {
+      errs() << "SystemZRRIAddressMode " << this << '\n';
+      if (BaseType == RegBase) {
+        errs() << "Base.Reg ";
+        if (Base.Reg.getNode() != 0)
+          Base.Reg.getNode()->dump();
+        else
+          errs() << "nul";
+        errs() << '\n';
+      } else {
+        errs() << " Base.FrameIndex " << Base.FrameIndex << '\n';
+      }
+      if (!isRI) {
+        errs() << "IndexReg ";
+        if (IndexReg.getNode() != 0) IndexReg.getNode()->dump();
+        else errs() << "nul";
+      }
+      errs() << " Disp " << Disp << '\n';
+    }
+  };
+}
+
+/// SystemZDAGToDAGISel - SystemZ specific code to select SystemZ machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class SystemZDAGToDAGISel : public SelectionDAGISel {
+    SystemZTargetLowering &Lowering;
+    const SystemZSubtarget &Subtarget;
+
+    /// Emit the Base/Disp (and Index) operands for an already-matched
+    /// addressing mode.
+    void getAddressOperandsRI(const SystemZRRIAddressMode &AM,
+                            SDValue &Base, SDValue &Disp);
+    void getAddressOperands(const SystemZRRIAddressMode &AM,
+                            SDValue &Base, SDValue &Disp,
+                            SDValue &Index);
+
+  public:
+    SystemZDAGToDAGISel(SystemZTargetMachine &TM, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(TM, OptLevel),
+        Lowering(*TM.getTargetLowering()),
+        Subtarget(*TM.getSubtargetImpl()) { }
+
+    virtual void InstructionSelect();
+
+    virtual const char *getPassName() const {
+      return "SystemZ DAG->DAG Pattern Instruction Selection";
+    }
+
+    /// getI8Imm - Return a target constant with the specified value, of type
+    /// i8.
+    inline SDValue getI8Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i8);
+    }
+
+    /// getI16Imm - Return a target constant with the specified value, of type
+    /// i16.
+    inline SDValue getI16Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i16);
+    }
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(uint64_t Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    // Include the pieces autogenerated from the target description.
+    #include "SystemZGenDAGISel.inc"
+
+  private:
+    // Addressing-mode selection hooks referenced by the generated matcher.
+    // Each returns true when Addr was matched and the output operands are
+    // valid.
+    bool SelectAddrRI12Only(SDNode *Op, SDValue& Addr,
+                            SDValue &Base, SDValue &Disp);
+    bool SelectAddrRI12(SDNode *Op, SDValue& Addr,
+                        SDValue &Base, SDValue &Disp,
+                        bool is12BitOnly = false);
+    bool SelectAddrRI(SDNode *Op, SDValue& Addr,
+                      SDValue &Base, SDValue &Disp);
+    bool SelectAddrRRI12(SDNode *Op, SDValue Addr,
+                         SDValue &Base, SDValue &Disp, SDValue &Index);
+    bool SelectAddrRRI20(SDNode *Op, SDValue Addr,
+                         SDValue &Base, SDValue &Disp, SDValue &Index);
+    bool SelectLAAddr(SDNode *Op, SDValue Addr,
+                      SDValue &Base, SDValue &Disp, SDValue &Index);
+
+    /// Select - Main per-node selection hook.
+    SDNode *Select(SDNode *Node);
+
+    /// TryFoldLoad - Attempt to fold the load N into the address operands
+    /// of its user P.
+    bool TryFoldLoad(SDNode *P, SDValue N,
+                     SDValue &Base, SDValue &Disp, SDValue &Index);
+
+    // Recursive address-matching helpers; a true return means "no match".
+    bool MatchAddress(SDValue N, SystemZRRIAddressMode &AM,
+                      bool is12Bit, unsigned Depth = 0);
+    bool MatchAddressBase(SDValue N, SystemZRRIAddressMode &AM);
+    bool MatchAddressRI(SDValue N, SystemZRRIAddressMode &AM,
+                        bool is12Bit);
+
+  #ifndef NDEBUG
+    unsigned Indent;   // Debug-only indentation level for selection tracing.
+  #endif
+  };
+}  // end anonymous namespace
+
+/// createSystemZISelDag - This pass converts a legalized DAG into a
+/// SystemZ-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createSystemZISelDag(SystemZTargetMachine &TM,
+                                        CodeGenOpt::Level OptLevel) {
+  // Ownership of the returned pass transfers to the caller (pass manager).
+  return new SystemZDAGToDAGISel(TM, OptLevel);
+}
+
+/// isImmSExt20 - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// sign extension from a 20-bit value. If so, this returns true and the
+/// immediate.
+static bool isImmSExt20(int64_t Val, int64_t &Imm) {
+  // The 20-bit signed range is [-2^19, 2^19 - 1] = [-524288, 524287].
+  if (Val < -524288 || Val > 524287)
+    return false;
+  Imm = Val;
+  return true;
+}
+
+/// isImmZExt12 - This method tests to see if the node is either a 32-bit
+/// or 64-bit immediate, and if the value can be accurately represented as a
+/// zero extension from a 12-bit value. If so, this returns true and the
+/// immediate.
+static bool isImmZExt12(int64_t Val, int64_t &Imm) {
+  // The 12-bit unsigned range is [0, 4095].
+  if (Val < 0 || Val > 0xFFF)
+    return false;
+  Imm = Val;
+  return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done.  This just pattern matches for the
+/// addressing mode.
+bool SystemZDAGToDAGISel::MatchAddress(SDValue N, SystemZRRIAddressMode &AM,
+                                       bool is12Bit, unsigned Depth) {
+  DebugLoc dl = N.getDebugLoc();
+  DEBUG(errs() << "MatchAddress: "; AM.dump());
+  // Limit recursion.
+  if (Depth > 5)
+    return MatchAddressBase(N, AM);
+
+  // FIXME: We can perform better here. If we have something like
+  // (shift (add A, imm), N), we can try to reassociate stuff and fold shift of
+  // imm into addressing mode.
+  switch (N.getOpcode()) {
+  default: break;
+  case ISD::Constant: {
+    // Fold the constant into the displacement, provided the sum still fits
+    // the target range (unsigned 12-bit or signed 20-bit).
+    int64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
+    int64_t Imm = 0;
+    bool Match = (is12Bit ?
+                  isImmZExt12(AM.Disp + Val, Imm) :
+                  isImmSExt20(AM.Disp + Val, Imm));
+    if (Match) {
+      AM.Disp = Imm;
+      return false;
+    }
+    break;
+  }
+
+  case ISD::FrameIndex:
+    // A frame index can serve as the base when the base slot is still free.
+    if (AM.BaseType == SystemZRRIAddressMode::RegBase &&
+        AM.Base.Reg.getNode() == 0) {
+      AM.BaseType = SystemZRRIAddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
+      return false;
+    }
+    break;
+
+  case ISD::SUB: {
+    // Given A-B, if A can be completely folded into the address and
+    // the index field with the index field unused, use -B as the index.
+    // This is a win if a has multiple parts that can be folded into
+    // the address. Also, this saves a mov if the base register has
+    // other uses, since it avoids a two-address sub instruction, however
+    // it costs an additional mov if the index register has other uses.
+
+    // Test if the LHS of the sub can be folded.
+    SystemZRRIAddressMode Backup = AM;
+    if (MatchAddress(N.getNode()->getOperand(0), AM, is12Bit, Depth+1)) {
+      AM = Backup;
+      break;
+    }
+    // Test if the index field is free for use.
+    if (AM.IndexReg.getNode() || AM.isRI) {
+      AM = Backup;
+      break;
+    }
+
+    // If the base is a register with multiple uses, this transformation may
+    // save a mov. Otherwise it's probably better not to do it.
+    if (AM.BaseType == SystemZRRIAddressMode::RegBase &&
+        (!AM.Base.Reg.getNode() || AM.Base.Reg.getNode()->hasOneUse())) {
+      AM = Backup;
+      break;
+    }
+
+    // Ok, the transformation is legal and appears profitable. Go for it.
+    // Materialize -B as (0 - B) and use it as the index register.
+    SDValue RHS = N.getNode()->getOperand(1);
+    SDValue Zero = CurDAG->getConstant(0, N.getValueType());
+    SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
+    AM.IndexReg = Neg;
+
+    // Insert the new nodes into the topological ordering.
+    if (Zero.getNode()->getNodeId() == -1 ||
+        Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+      CurDAG->RepositionNode(N.getNode(), Zero.getNode());
+      Zero.getNode()->setNodeId(N.getNode()->getNodeId());
+    }
+    if (Neg.getNode()->getNodeId() == -1 ||
+        Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
+      CurDAG->RepositionNode(N.getNode(), Neg.getNode());
+      Neg.getNode()->setNodeId(N.getNode()->getNodeId());
+    }
+    return false;
+  }
+
+  case ISD::ADD: {
+    // Try to fold both operands into the mode, in either order; the first
+    // order that fully matches wins.
+    SystemZRRIAddressMode Backup = AM;
+    if (!MatchAddress(N.getNode()->getOperand(0), AM, is12Bit, Depth+1) &&
+        !MatchAddress(N.getNode()->getOperand(1), AM, is12Bit, Depth+1))
+      return false;
+    AM = Backup;
+    if (!MatchAddress(N.getNode()->getOperand(1), AM, is12Bit, Depth+1) &&
+        !MatchAddress(N.getNode()->getOperand(0), AM, is12Bit, Depth+1))
+      return false;
+    AM = Backup;
+
+    // If we couldn't fold both operands into the address at the same time,
+    // see if we can just put each operand into a register and fold at least
+    // the add.
+    if (!AM.isRI &&
+        AM.BaseType == SystemZRRIAddressMode::RegBase &&
+        !AM.Base.Reg.getNode() && !AM.IndexReg.getNode()) {
+      AM.Base.Reg = N.getNode()->getOperand(0);
+      AM.IndexReg = N.getNode()->getOperand(1);
+      return false;
+    }
+    break;
+  }
+
+  case ISD::OR:
+    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      SystemZRRIAddressMode Backup = AM;
+      int64_t Offset = CN->getSExtValue();
+      int64_t Imm = 0;
+      bool MatchOffset = (is12Bit ?
+                          isImmZExt12(AM.Disp + Offset, Imm) :
+                          isImmSExt20(AM.Disp + Offset, Imm));
+      // The resultant disp must fit in 12 or 20-bits.
+      if (MatchOffset &&
+          // LHS should be an addr mode.
+          !MatchAddress(N.getOperand(0), AM, is12Bit, Depth+1) &&
+          // Check to see if the LHS & C is zero.
+          CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
+        AM.Disp = Imm;
+        return false;
+      }
+      AM = Backup;
+    }
+    break;
+  }
+
+  // No special pattern applied: fall back to consuming N whole as a
+  // base or index register.
+  return MatchAddressBase(N, AM);
+}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool SystemZDAGToDAGISel::MatchAddressBase(SDValue N,
+                                           SystemZRRIAddressMode &AM) {
+  // Prefer the base register slot when it is still free.
+  bool BaseFree = (AM.BaseType == SystemZRRIAddressMode::RegBase &&
+                   AM.Base.Reg.getNode() == 0);
+  if (BaseFree) {
+    AM.BaseType = SystemZRRIAddressMode::RegBase;
+    AM.Base.Reg = N;
+    return false;
+  }
+
+  // The base is occupied: fall back to the index register, provided this
+  // addressing mode has one and it is still unset.
+  if (!AM.isRI && AM.IndexReg.getNode() == 0) {
+    AM.IndexReg = N;
+    return false;
+  }
+
+  // Neither slot is available -- the node cannot be matched.
+  return true;
+}
+
+/// getAddressOperandsRI - Materialize the Base and Disp operands for a
+/// matched reg+imm addressing mode.
+void SystemZDAGToDAGISel::getAddressOperandsRI(const SystemZRRIAddressMode &AM,
+                                               SDValue &Base, SDValue &Disp) {
+  // A frame-index base becomes a TargetFrameIndex node; a register base is
+  // passed through unchanged.
+  Base = (AM.BaseType == SystemZRRIAddressMode::RegBase)
+    ? AM.Base.Reg
+    : CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy());
+  Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i64);
+}
+
+/// getAddressOperands - Materialize the Base, Disp and Index operands for a
+/// matched reg+reg+imm addressing mode.
+void SystemZDAGToDAGISel::getAddressOperands(const SystemZRRIAddressMode &AM,
+                                             SDValue &Base, SDValue &Disp,
+                                             SDValue &Index) {
+  Index = AM.IndexReg;
+  getAddressOperandsRI(AM, Base, Disp);
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// an unsigned 12-bit displacement [r+imm].
+bool SystemZDAGToDAGISel::SelectAddrRI12Only(SDNode *Op, SDValue& Addr,
+                                             SDValue &Base, SDValue &Disp) {
+  // Delegate to the general RI12 matcher, forbidding the 20-bit fallback.
+  const bool Only12Bit = true;
+  return SelectAddrRI12(Op, Addr, Base, Disp, Only12Bit);
+}
+
+/// SelectAddrRI12 - Returns true if the address can be represented by a base
+/// register plus an unsigned 12-bit displacement.  Unless is12BitOnly is set,
+/// a 20-bit match is also tried and used to reject addresses that only the
+/// long-displacement form could fold.
+bool SystemZDAGToDAGISel::SelectAddrRI12(SDNode *Op, SDValue& Addr,
+                                         SDValue &Base, SDValue &Disp,
+                                         bool is12BitOnly) {
+  SystemZRRIAddressMode AM20(/*isRI*/true), AM12(/*isRI*/true);
+  bool Done = false;
+
+  if (!Addr.hasOneUse()) {
+    unsigned Opcode = Addr.getOpcode();
+    if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into addressing mode, then we'll allow it even
+      // if N has multiple uses. In general, addressing computation is used as
+      // addresses by all of its uses. But watch out for CopyToReg uses, that
+      // means the address computation is liveout. It will be computed by a LA
+      // so we want to avoid computing the address twice.
+      for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+             UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+        if (UI->getOpcode() == ISD::CopyToReg) {
+          MatchAddressBase(Addr, AM12);
+          Done = true;
+          break;
+        }
+      }
+    }
+  }
+  // MatchAddress returns true when the address could NOT be matched.
+  if (!Done && MatchAddress(Addr, AM12, /* is12Bit */ true))
+    return false;
+
+  // Check, whether we can match stuff using 20-bit displacements
+  // If only the 20-bit form folds a displacement, reject the 12-bit match
+  // so the long-displacement pattern is selected instead.
+  if (!Done && !is12BitOnly &&
+      !MatchAddress(Addr, AM20, /* is12Bit */ false))
+    if (AM12.Disp == 0 && AM20.Disp != 0)
+      return false;
+
+  DEBUG(errs() << "MatchAddress (final): "; AM12.dump());
+
+  EVT VT = Addr.getValueType();
+  if (AM12.BaseType == SystemZRRIAddressMode::RegBase) {
+    // Register 0 stands in for an absent base register.
+    if (!AM12.Base.Reg.getNode())
+      AM12.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  assert(AM12.IndexReg.getNode() == 0 && "Invalid reg-imm address mode!");
+
+  getAddressOperandsRI(AM12, Base, Disp);
+
+  return true;
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// a signed 20-bit displacement [r+imm].
+bool SystemZDAGToDAGISel::SelectAddrRI(SDNode *Op, SDValue& Addr,
+                                       SDValue &Base, SDValue &Disp) {
+  SystemZRRIAddressMode AM(/*isRI*/true);
+  bool Done = false;
+
+  if (!Addr.hasOneUse()) {
+    unsigned Opcode = Addr.getOpcode();
+    if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into addressing mode, then we'll allow it even
+      // if N has multiple uses. In general, addressing computation is used as
+      // addresses by all of its uses. But watch out for CopyToReg uses, that
+      // means the address computation is liveout. It will be computed by a LA
+      // so we want to avoid computing the address twice.
+      for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+             UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+        if (UI->getOpcode() == ISD::CopyToReg) {
+          MatchAddressBase(Addr, AM);
+          Done = true;
+          break;
+        }
+      }
+    }
+  }
+  // MatchAddress returns true when the address could NOT be matched.
+  if (!Done && MatchAddress(Addr, AM, /* is12Bit */ false))
+    return false;
+
+  DEBUG(errs() << "MatchAddress (final): "; AM.dump());
+
+  EVT VT = Addr.getValueType();
+  if (AM.BaseType == SystemZRRIAddressMode::RegBase) {
+    // Register 0 stands in for an absent base register.
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  assert(AM.IndexReg.getNode() == 0 && "Invalid reg-imm address mode!");
+
+  getAddressOperandsRI(AM, Base, Disp);
+
+  return true;
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// index register plus an unsigned 12-bit displacement [base + idx + imm].
+bool SystemZDAGToDAGISel::SelectAddrRRI12(SDNode *Op, SDValue Addr,
+                                SDValue &Base, SDValue &Disp, SDValue &Index) {
+  SystemZRRIAddressMode AM20, AM12;
+  bool Done = false;
+
+  if (!Addr.hasOneUse()) {
+    unsigned Opcode = Addr.getOpcode();
+    if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into addressing mode, then we'll allow it even
+      // if N has multiple uses. In general, addressing computation is used as
+      // addresses by all of its uses. But watch out for CopyToReg uses, that
+      // means the address computation is liveout. It will be computed by a LA
+      // so we want to avoid computing the address twice.
+      for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+             UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+        if (UI->getOpcode() == ISD::CopyToReg) {
+          MatchAddressBase(Addr, AM12);
+          Done = true;
+          break;
+        }
+      }
+    }
+  }
+  // MatchAddress returns true when the address could NOT be matched.
+  if (!Done && MatchAddress(Addr, AM12, /* is12Bit */ true))
+    return false;
+
+  // Check, whether we can match stuff using 20-bit displacements
+  // If only the 20-bit form folds a displacement, reject this 12-bit match
+  // so the long-displacement pattern is selected instead.
+  if (!Done && !MatchAddress(Addr, AM20, /* is12Bit */ false))
+    if (AM12.Disp == 0 && AM20.Disp != 0)
+      return false;
+
+  DEBUG(errs() << "MatchAddress (final): "; AM12.dump());
+
+  EVT VT = Addr.getValueType();
+  if (AM12.BaseType == SystemZRRIAddressMode::RegBase) {
+    // Register 0 stands in for an absent base register.
+    if (!AM12.Base.Reg.getNode())
+      AM12.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  // Likewise, register 0 encodes "no index register".
+  if (!AM12.IndexReg.getNode())
+    AM12.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM12, Base, Disp, Index);
+
+  return true;
+}
+
+/// Returns true if the address can be represented by a base register plus
+/// index register plus a signed 20-bit displacement [base + idx + imm].
+bool SystemZDAGToDAGISel::SelectAddrRRI20(SDNode *Op, SDValue Addr,
+                                SDValue &Base, SDValue &Disp, SDValue &Index) {
+  SystemZRRIAddressMode AM;
+  bool Done = false;
+
+  if (!Addr.hasOneUse()) {
+    unsigned Opcode = Addr.getOpcode();
+    if (Opcode != ISD::Constant && Opcode != ISD::FrameIndex) {
+      // If we are able to fold N into addressing mode, then we'll allow it even
+      // if N has multiple uses. In general, addressing computation is used as
+      // addresses by all of its uses. But watch out for CopyToReg uses, that
+      // means the address computation is liveout. It will be computed by a LA
+      // so we want to avoid computing the address twice.
+      for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+             UE = Addr.getNode()->use_end(); UI != UE; ++UI) {
+        if (UI->getOpcode() == ISD::CopyToReg) {
+          MatchAddressBase(Addr, AM);
+          Done = true;
+          break;
+        }
+      }
+    }
+  }
+  // MatchAddress returns true when the address could NOT be matched.
+  if (!Done && MatchAddress(Addr, AM, /* is12Bit */ false))
+    return false;
+
+  DEBUG(errs() << "MatchAddress (final): "; AM.dump());
+
+  EVT VT = Addr.getValueType();
+  if (AM.BaseType == SystemZRRIAddressMode::RegBase) {
+    // Register 0 stands in for an absent base register.
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  // Likewise, register 0 encodes "no index register".
+  if (!AM.IndexReg.getNode())
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, Base, Disp, Index);
+
+  return true;
+}
+
+/// SelectLAAddr - it calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LA/LAY instruction.
+bool SystemZDAGToDAGISel::SelectLAAddr(SDNode *Op, SDValue Addr,
+                                  SDValue &Base, SDValue &Disp, SDValue &Index) {
+  SystemZRRIAddressMode AM;
+
+  // MatchAddress returns true when the address could NOT be matched.
+  if (MatchAddress(Addr, AM, false))
+    return false;
+
+  EVT VT = Addr.getValueType();
+
+  // Score the matched mode: only emit LA/LAY when it folds more than a
+  // plain register move would.  Explicit braces here -- the original
+  // unbraced nested if/else was dangling-else-prone.
+  unsigned Complexity = 0;
+  if (AM.BaseType == SystemZRRIAddressMode::RegBase) {
+    if (AM.Base.Reg.getNode())
+      Complexity = 1;
+    else
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  } else if (AM.BaseType == SystemZRRIAddressMode::FrameIndexBase) {
+    Complexity = 4;
+  }
+
+  if (AM.IndexReg.getNode())
+    Complexity += 1;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // A displacement on top of a base or index is one more folded operand.
+  if (AM.Disp && (AM.Base.Reg.getNode() || AM.IndexReg.getNode()))
+    Complexity += 1;
+
+  if (Complexity > 2) {
+    getAddressOperands(AM, Base, Disp, Index);
+    return true;
+  }
+
+  return false;
+}
+
+/// TryFoldLoad - Returns true if the load N, which feeds node P, can be
+/// folded into P's address operands.  On success, Base/Disp/Index describe
+/// the load's address in base+index+disp20 form.
+bool SystemZDAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+                                 SDValue &Base, SDValue &Disp, SDValue &Index) {
+  // Only fold plain (non-extending) loads with a single user, and only when
+  // folding is legal and profitable at this point in selection.
+  if (ISD::isNON_EXTLoad(N.getNode()) &&
+      N.hasOneUse() &&
+      IsLegalAndProfitableToFold(N.getNode(), P, P))
+    return SelectAddrRRI20(P, N.getOperand(1), Base, Disp, Index);
+  return false;
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void SystemZDAGToDAGISel::InstructionSelect() {
+  // Codegen the basic block.
+  DEBUG(errs() << "===== Instruction selection begins:\n");
+  DEBUG(Indent = 0);
+  SelectRoot(*CurDAG);
+  DEBUG(errs() << "===== Instruction selection ends:\n");
+
+  // Selection may orphan nodes; sweep them out of the DAG now.
+  CurDAG->RemoveDeadNodes();
+}
+
+/// Select - Per-node selection hook.  SDIVREM and UDIVREM get custom
+/// handling here (they compute into an even/odd register pair); everything
+/// else falls through to the TableGen-generated matcher via SelectCode.
+SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
+  EVT NVT = Node->getValueType(0);
+  DebugLoc dl = Node->getDebugLoc();
+  unsigned Opcode = Node->getOpcode();
+
+  // Dump information about the Node being selected
+  DEBUG(errs().indent(Indent) << "Selecting: ";
+        Node->dump(CurDAG);
+        errs() << "\n");
+  DEBUG(Indent += 2);
+
+  // If we have a custom node, we already have selected!
+  if (Node->isMachineOpcode()) {
+    DEBUG(errs().indent(Indent-2) << "== ";
+          Node->dump(CurDAG);
+          errs() << "\n");
+    DEBUG(Indent -= 2);
+    return NULL; // Already selected.
+  }
+
+  switch (Opcode) {
+  default: break;
+  case ISD::SDIVREM: {
+    unsigned Opc, MOpc;
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    // Pick register/memory opcodes and the result-pair type by width.
+    EVT ResVT;
+    bool is32Bit = false;
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: assert(0 && "Unsupported VT!");
+    case MVT::i32:
+      Opc = SystemZ::SDIVREM32r; MOpc = SystemZ::SDIVREM32m;
+      ResVT = MVT::v2i64;
+      is32Bit = true;
+      break;
+    case MVT::i64:
+      Opc = SystemZ::SDIVREM64r; MOpc = SystemZ::SDIVREM64m;
+      ResVT = MVT::v2i64;
+      break;
+    }
+
+    // If the divisor is a single-use load, fold it into the divide.
+    SDValue Tmp0, Tmp1, Tmp2;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2);
+
+    // Prepare the dividend
+    // For i32, widen the dividend to i64 first (MOVSX64rr32).
+    SDNode *Dividend;
+    if (is32Bit)
+      Dividend = CurDAG->getMachineNode(SystemZ::MOVSX64rr32, dl, MVT::i64, N0);
+    else
+      Dividend = N0.getNode();
+
+    // Insert prepared dividend into suitable 'subreg'
+    // The divide operates on a register pair; the dividend goes in the odd
+    // half (the quotient and remainder are extracted below from the odd and
+    // even halves respectively).
+    SDNode *Tmp = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                         dl, ResVT);
+    Dividend =
+      CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, ResVT,
+                             SDValue(Tmp, 0), SDValue(Dividend, 0),
+                             CurDAG->getTargetConstant(subreg_odd, MVT::i32));
+
+    SDNode *Result;
+    SDValue DivVal = SDValue(Dividend, 0);
+    if (foldedLoad) {
+      SDValue Ops[] = { DivVal, Tmp0, Tmp1, Tmp2, N1.getOperand(0) };
+      Result = CurDAG->getMachineNode(MOpc, dl, ResVT, MVT::Other,
+                                      Ops, array_lengthof(Ops));
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(Result, 1));
+    } else {
+      Result = CurDAG->getMachineNode(Opc, dl, ResVT, SDValue(Dividend, 0), N1);
+    }
+
+    // Copy the division (odd subreg) result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      unsigned SubRegIdx = (is32Bit ? subreg_odd32 : subreg_odd);
+      SDNode *Div = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                           dl, NVT,
+                                           SDValue(Result, 0),
+                                           CurDAG->getTargetConstant(SubRegIdx,
+                                                                     MVT::i32));
+
+      ReplaceUses(SDValue(Node, 0), SDValue(Div, 0));
+      DEBUG(errs().indent(Indent-2) << "=> ";
+            Result->dump(CurDAG);
+            errs() << "\n");
+    }
+
+    // Copy the remainder (even subreg) result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      unsigned SubRegIdx = (is32Bit ? subreg_even32 : subreg_even);
+      SDNode *Rem = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                           dl, NVT,
+                                           SDValue(Result, 0),
+                                           CurDAG->getTargetConstant(SubRegIdx,
+                                                                     MVT::i32));
+
+      ReplaceUses(SDValue(Node, 1), SDValue(Rem, 0));
+      DEBUG(errs().indent(Indent-2) << "=> ";
+            Result->dump(CurDAG);
+            errs() << "\n");
+    }
+
+#ifndef NDEBUG
+    Indent -= 2;
+#endif
+
+    return NULL;
+  }
+  case ISD::UDIVREM: {
+    unsigned Opc, MOpc, ClrOpc;
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+    EVT ResVT;
+
+    // Pick register/memory/clear opcodes and the result-pair type by width.
+    bool is32Bit = false;
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: assert(0 && "Unsupported VT!");
+    case MVT::i32:
+      Opc = SystemZ::UDIVREM32r; MOpc = SystemZ::UDIVREM32m;
+      ClrOpc = SystemZ::MOV64Pr0_even;
+      ResVT = MVT::v2i32;
+      is32Bit = true;
+      break;
+    case MVT::i64:
+      Opc = SystemZ::UDIVREM64r; MOpc = SystemZ::UDIVREM64m;
+      ClrOpc = SystemZ::MOV128r0_even;
+      ResVT = MVT::v2i64;
+      break;
+    }
+
+    // If the divisor is a single-use load, fold it into the divide.
+    SDValue Tmp0, Tmp1, Tmp2;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2);
+
+    // Prepare the dividend
+    SDNode *Dividend = N0.getNode();
+
+    // Insert prepared dividend into suitable 'subreg'
+    // As in the signed case, the dividend lives in the odd half of the pair.
+    SDNode *Tmp = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
+                                         dl, ResVT);
+    {
+      unsigned SubRegIdx = (is32Bit ? subreg_odd32 : subreg_odd);
+      Dividend =
+        CurDAG->getMachineNode(TargetOpcode::INSERT_SUBREG, dl, ResVT,
+                               SDValue(Tmp, 0), SDValue(Dividend, 0),
+                               CurDAG->getTargetConstant(SubRegIdx, MVT::i32));
+    }
+
+    // Zero out even subreg
+    Dividend = CurDAG->getMachineNode(ClrOpc, dl, ResVT, SDValue(Dividend, 0));
+
+    SDValue DivVal = SDValue(Dividend, 0);
+    SDNode *Result;
+    if (foldedLoad) {
+      SDValue Ops[] = { DivVal, Tmp0, Tmp1, Tmp2, N1.getOperand(0) };
+      Result = CurDAG->getMachineNode(MOpc, dl, ResVT, MVT::Other,
+                                      Ops, array_lengthof(Ops));
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(Result, 1));
+    } else {
+      Result = CurDAG->getMachineNode(Opc, dl, ResVT, DivVal, N1);
+    }
+
+    // Copy the division (odd subreg) result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      unsigned SubRegIdx = (is32Bit ? subreg_odd32 : subreg_odd);
+      SDNode *Div = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                           dl, NVT,
+                                           SDValue(Result, 0),
+                                           CurDAG->getTargetConstant(SubRegIdx,
+                                                                     MVT::i32));
+      ReplaceUses(SDValue(Node, 0), SDValue(Div, 0));
+      DEBUG(errs().indent(Indent-2) << "=> ";
+            Result->dump(CurDAG);
+            errs() << "\n");
+    }
+
+    // Copy the remainder (even subreg) result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      unsigned SubRegIdx = (is32Bit ? subreg_even32 : subreg_even);
+      SDNode *Rem = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                           dl, NVT,
+                                           SDValue(Result, 0),
+                                           CurDAG->getTargetConstant(SubRegIdx,
+                                                                     MVT::i32));
+      ReplaceUses(SDValue(Node, 1), SDValue(Rem, 0));
+      DEBUG(errs().indent(Indent-2) << "=> ";
+            Result->dump(CurDAG);
+            errs() << "\n");
+    }
+
+#ifndef NDEBUG
+    Indent -= 2;
+#endif
+
+    return NULL;
+  }
+  }
+
+  // Select the default instruction
+  SDNode *ResNode = SelectCode(Node);
+
+  DEBUG(errs().indent(Indent-2) << "=> ";
+        if (ResNode == NULL || ResNode == Node)
+          Node->dump(CurDAG);
+        else
+          ResNode->dump(CurDAG);
+        errs() << "\n";
+        );
+  DEBUG(Indent -= 2);
+
+  return ResNode;
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
new file mode 100644
index 0000000..f7405a5
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -0,0 +1,858 @@
+//===-- SystemZISelLowering.cpp - SystemZ DAG Lowering Implementation  -----==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "systemz-lower"
+
+#include "SystemZISelLowering.h"
+#include "SystemZ.h"
+#include "SystemZTargetMachine.h"
+#include "SystemZSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/VectorExtras.h"
+using namespace llvm;
+
+// SystemZTargetLowering constructor - register the legal register classes and
+// describe, operation by operation, how the SelectionDAG legalizer should
+// treat each node (Legal / Promote / Expand / Custom) on SystemZ.
+SystemZTargetLowering::SystemZTargetLowering(SystemZTargetMachine &tm) :
+  TargetLowering(tm, new TargetLoweringObjectFileELF()),
+  Subtarget(*tm.getSubtargetImpl()), TM(tm) {
+
+  RegInfo = TM.getRegisterInfo();
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32,  SystemZ::GR32RegisterClass);
+  addRegisterClass(MVT::i64,  SystemZ::GR64RegisterClass);
+  addRegisterClass(MVT::v2i32,SystemZ::GR64PRegisterClass);
+  addRegisterClass(MVT::v2i64,SystemZ::GR128RegisterClass);
+
+  // FP register classes are only available with hardware float support.
+  if (!UseSoftFloat) {
+    addRegisterClass(MVT::f32, SystemZ::FP32RegisterClass);
+    addRegisterClass(MVT::f64, SystemZ::FP64RegisterClass);
+  }
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Set shifts properties
+  setShiftAmountType(MVT::i64);
+
+  // Provide all sorts of operation actions
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::EXTLOAD,  MVT::i1, Promote);
+
+  // No extending FP loads; extension is done as an explicit conversion.
+  setLoadExtAction(ISD::SEXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD,  MVT::f32, Expand);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::f64, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::f64, Expand);
+  setLoadExtAction(ISD::EXTLOAD,  MVT::f64, Expand);
+
+  setStackPointerRegisterToSaveRestore(SystemZ::R15D);
+
+  // TODO: It may be better to default to latency-oriented scheduling, however
+  // LLVM's current latency-oriented scheduler can't handle physreg definitions
+  // such as SystemZ has with PSW, so set this to the register-pressure
+  // scheduler, because it can.
+  setSchedulingPreference(SchedulingForRegPressure);
+
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  // Branches, selects and address nodes are lowered by hand; see the
+  // corresponding Lower* routines below for the Custom entries.
+  setOperationAction(ISD::BR_JT,            MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND,           MVT::Other, Expand);
+  setOperationAction(ISD::BR_CC,            MVT::i32, Custom);
+  setOperationAction(ISD::BR_CC,            MVT::i64, Custom);
+  setOperationAction(ISD::BR_CC,            MVT::f32, Custom);
+  setOperationAction(ISD::BR_CC,            MVT::f64, Custom);
+  setOperationAction(ISD::ConstantPool,     MVT::i32, Custom);
+  setOperationAction(ISD::ConstantPool,     MVT::i64, Custom);
+  setOperationAction(ISD::GlobalAddress,    MVT::i64, Custom);
+  setOperationAction(ISD::JumpTable,        MVT::i64, Custom);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+
+  // Plain divide/remainder are expanded; the instruction selector matches the
+  // combined [SU]DIVREM forms instead (see SystemZISelDAGToDAG).
+  setOperationAction(ISD::SDIV,             MVT::i32, Expand);
+  setOperationAction(ISD::UDIV,             MVT::i32, Expand);
+  setOperationAction(ISD::SDIV,             MVT::i64, Expand);
+  setOperationAction(ISD::UDIV,             MVT::i64, Expand);
+  setOperationAction(ISD::SREM,             MVT::i32, Expand);
+  setOperationAction(ISD::UREM,             MVT::i32, Expand);
+  setOperationAction(ISD::SREM,             MVT::i64, Expand);
+  setOperationAction(ISD::UREM,             MVT::i64, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::CTPOP,            MVT::i32, Expand);
+  setOperationAction(ISD::CTPOP,            MVT::i64, Expand);
+  setOperationAction(ISD::CTTZ,             MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ,             MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ,             MVT::i32, Promote);
+  setOperationAction(ISD::CTLZ,             MVT::i64, Legal);
+
+  // FIXME: Can we lower these 2 efficiently?
+  setOperationAction(ISD::SETCC,            MVT::i32, Expand);
+  setOperationAction(ISD::SETCC,            MVT::i64, Expand);
+  setOperationAction(ISD::SETCC,            MVT::f32, Expand);
+  setOperationAction(ISD::SETCC,            MVT::f64, Expand);
+  setOperationAction(ISD::SELECT,           MVT::i32, Expand);
+  setOperationAction(ISD::SELECT,           MVT::i64, Expand);
+  setOperationAction(ISD::SELECT,           MVT::f32, Expand);
+  setOperationAction(ISD::SELECT,           MVT::f64, Expand);
+  setOperationAction(ISD::SELECT_CC,        MVT::i32, Custom);
+  setOperationAction(ISD::SELECT_CC,        MVT::i64, Custom);
+  setOperationAction(ISD::SELECT_CC,        MVT::f32, Custom);
+  setOperationAction(ISD::SELECT_CC,        MVT::f64, Custom);
+
+  setOperationAction(ISD::MULHS,            MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI,        MVT::i64, Expand);
+
+  // FIXME: Can we support these natively?
+  setOperationAction(ISD::UMUL_LOHI,        MVT::i64, Expand);
+  setOperationAction(ISD::SRL_PARTS,        MVT::i64, Expand);
+  setOperationAction(ISD::SHL_PARTS,        MVT::i64, Expand);
+  setOperationAction(ISD::SRA_PARTS,        MVT::i64, Expand);
+
+  // Lower some FP stuff
+  setOperationAction(ISD::FSIN,             MVT::f32, Expand);
+  setOperationAction(ISD::FSIN,             MVT::f64, Expand);
+  setOperationAction(ISD::FCOS,             MVT::f32, Expand);
+  setOperationAction(ISD::FCOS,             MVT::f64, Expand);
+  setOperationAction(ISD::FREM,             MVT::f32, Expand);
+  setOperationAction(ISD::FREM,             MVT::f64, Expand);
+
+  // We have only 64-bit bitconverts
+  setOperationAction(ISD::BIT_CONVERT,      MVT::f32, Expand);
+  setOperationAction(ISD::BIT_CONVERT,      MVT::i32, Expand);
+
+  // No unsigned<->FP conversion instructions; expand via signed conversions.
+  setOperationAction(ISD::UINT_TO_FP,       MVT::i32, Expand);
+  setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Expand);
+  setOperationAction(ISD::FP_TO_UINT,       MVT::i32, Expand);
+  setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Expand);
+
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+}
+
+/// LowerOperation - Dispatch the nodes marked 'Custom' in the constructor to
+/// their dedicated lowering routines.
+SDValue SystemZTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  const unsigned Opcode = Op.getOpcode();
+
+  if (Opcode == ISD::BR_CC)
+    return LowerBR_CC(Op, DAG);
+  if (Opcode == ISD::SELECT_CC)
+    return LowerSELECT_CC(Op, DAG);
+  if (Opcode == ISD::GlobalAddress)
+    return LowerGlobalAddress(Op, DAG);
+  if (Opcode == ISD::JumpTable)
+    return LowerJumpTable(Op, DAG);
+  if (Opcode == ISD::ConstantPool)
+    return LowerConstantPool(Op, DAG);
+
+  llvm_unreachable("Should not custom lower this!");
+  return SDValue();
+}
+
+/// isFPImmLegal - Return true if the FP immediate can be materialized without
+/// a constant-pool load.  Only +/-0.0 qualifies, via the load-zero (and
+/// load-negative) instructions.
+bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  if (UseSoftFloat || (VT != MVT::f32 && VT != MVT::f64))
+    return false;
+
+  // NOTE: lzer/lner are the 32-bit (short) forms, lzdr/lndr the 64-bit ones.
+  // +0.0f lzer
+  // +0.0  lzdr
+  // -0.0f lzer + lner
+  // -0.0  lzdr + lndr
+  return Imm.isZero() || Imm.isNegZero();
+}
+
+//===----------------------------------------------------------------------===//
+//                       SystemZ Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.  Only 'r' (register class) is handled
+/// here; anything else defers to the generic implementation.
+TargetLowering::ConstraintType
+SystemZTargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1 && Constraint[0] == 'r')
+    return C_RegisterClass;
+
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// getRegForInlineAsmConstraint - Map the single-letter 'r' constraint to a
+/// general-purpose register class sized for the requested value type.
+std::pair<unsigned, const TargetRegisterClass*>
+SystemZTargetLowering::
+getRegForInlineAsmConstraint(const std::string &Constraint,
+                             EVT VT) const {
+  // GCC Constraint Letters
+  if (Constraint.size() == 1 && Constraint[0] == 'r') {
+    if (VT == MVT::i32)
+      return std::make_pair(0U, SystemZ::GR32RegisterClass);
+    if (VT == MVT::i128)
+      return std::make_pair(0U, SystemZ::GR128RegisterClass);
+    // Everything else goes in a 64-bit GPR.
+    return std::make_pair(0U, SystemZ::GR64RegisterClass);
+  }
+
+  // Not one of ours; let the generic code have a try.
+  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "SystemZGenCallingConv.inc"
+
+/// LowerFormalArguments - Route incoming-argument lowering for the supported
+/// calling conventions (C and Fast) through the common CCC path.
+SDValue
+SystemZTargetLowering::LowerFormalArguments(SDValue Chain,
+                                            CallingConv::ID CallConv,
+                                            bool isVarArg,
+                                            const SmallVectorImpl<ISD::InputArg>
+                                              &Ins,
+                                            DebugLoc dl,
+                                            SelectionDAG &DAG,
+                                            SmallVectorImpl<SDValue> &InVals) {
+  if (CallConv != CallingConv::C && CallConv != CallingConv::Fast)
+    llvm_unreachable("Unsupported calling convention");
+
+  return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+}
+
+/// LowerCall - Route call lowering for the supported calling conventions
+/// (C and Fast) through the common CCC path.
+SDValue
+SystemZTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 bool &isTailCall,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 const SmallVectorImpl<ISD::InputArg> &Ins,
+                                 DebugLoc dl, SelectionDAG &DAG,
+                                 SmallVectorImpl<SDValue> &InVals) {
+  // SystemZ target does not yet support tail call optimization.
+  isTailCall = false;
+
+  if (CallConv != CallingConv::C && CallConv != CallingConv::Fast)
+    llvm_unreachable("Unsupported calling convention");
+
+  return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                        Outs, Ins, dl, DAG, InVals);
+}
+
+/// LowerCCCArguments - transform physical registers into virtual registers and
+/// generate load operations for arguments placed on the stack.
+// FIXME: struct return stuff
+// FIXME: varargs
+SDValue
+SystemZTargetLowering::LowerCCCArguments(SDValue Chain,
+                                         CallingConv::ID CallConv,
+                                         bool isVarArg,
+                                         const SmallVectorImpl<ISD::InputArg>
+                                           &Ins,
+                                         DebugLoc dl,
+                                         SelectionDAG &DAG,
+                                         SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CC_SystemZ);
+
+  // Varargs lowering is unimplemented; fail loudly rather than miscompile.
+  if (isVarArg)
+    llvm_report_error("Varargs not supported yet");
+
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    SDValue ArgValue;
+    CCValAssign &VA = ArgLocs[i];
+    EVT LocVT = VA.getLocVT();
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers.  Only i64, f32 and f64 locations are
+      // expected here; integer arguments are promoted to i64 by CC_SystemZ.
+      TargetRegisterClass *RC;
+      switch (LocVT.getSimpleVT().SimpleTy) {
+      default:
+#ifndef NDEBUG
+        errs() << "LowerFormalArguments Unhandled argument type: "
+             << LocVT.getSimpleVT().SimpleTy
+             << "\n";
+#endif
+        llvm_unreachable(0);
+      case MVT::i64:
+        RC = SystemZ::GR64RegisterClass;
+        break;
+      case MVT::f32:
+        RC = SystemZ::FP32RegisterClass;
+        break;
+      case MVT::f64:
+        RC = SystemZ::FP64RegisterClass;
+        break;
+      }
+
+      // Pin the incoming physreg to a fresh vreg that is live-in to the
+      // function, and read the value from it.
+      unsigned VReg = RegInfo.createVirtualRegister(RC);
+      RegInfo.addLiveIn(VA.getLocReg(), VReg);
+      ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+    } else {
+      // Sanity check
+      assert(VA.isMemLoc());
+
+      // Create the nodes corresponding to a load from this parameter slot.
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(LocVT.getSizeInBits()/8,
+                                      VA.getLocMemOffset(), true, false);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      // from this parameter
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      ArgValue = DAG.getLoad(LocVT, dl, Chain, FIN,
+                             PseudoSourceValue::getFixedStack(FI), 0);
+    }
+
+    // If this is an 8/16/32-bit value, it is really passed promoted to 64
+    // bits. Insert an assert[sz]ext to capture this, then truncate to the
+    // right size.
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      ArgValue = DAG.getNode(ISD::AssertSext, dl, LocVT, ArgValue,
+                             DAG.getValueType(VA.getValVT()));
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      ArgValue = DAG.getNode(ISD::AssertZext, dl, LocVT, ArgValue,
+                             DAG.getValueType(VA.getValVT()));
+
+    if (VA.getLocInfo() != CCValAssign::Full)
+      ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+
+    InVals.push_back(ArgValue);
+  }
+
+  return Chain;
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual regs to
+/// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
+/// TODO: sret.
+SDValue
+SystemZTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                                      CallingConv::ID CallConv, bool isVarArg,
+                                      bool isTailCall,
+                                      const SmallVectorImpl<ISD::OutputArg>
+                                        &Outs,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                      DebugLoc dl, SelectionDAG &DAG,
+                                      SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  // Offset to first argument stack slot.
+  // NOTE(review): the 160-byte bias matches the s390x ELF ABI reserved
+  // area at the top of the callee's frame — confirm against the ABI doc.
+  const unsigned FirstArgOffset = 160;
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_SystemZ);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain ,DAG.getConstant(NumBytes,
+                                                      getPointerTy(), true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+
+    SDValue Arg = Outs[i].Val;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: assert(0 && "Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+
+    // Arguments that can be passed on register must be kept at RegsToPass
+    // vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      // Lazily materialize the stack (or frame) pointer for stack stores.
+      if (StackPtr.getNode() == 0)
+        StackPtr =
+          DAG.getCopyFromReg(Chain, dl,
+                             (RegInfo->hasFP(MF) ?
+                              SystemZ::R11D : SystemZ::R15D),
+                             getPointerTy());
+
+      unsigned Offset = FirstArgOffset + VA.getLocMemOffset();
+      SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   StackPtr,
+                                   DAG.getIntPtrConstant(Offset));
+
+      MemOpChains.push_back(DAG.getStore(Chain, dl, Arg, PtrOff,
+                                         PseudoSourceValue::getStack(), Offset));
+    }
+  }
+
+  // Transform all store nodes into one single node because all store nodes are
+  // independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain and
+  // flag operands which copy the outgoing args into registers.  The InFlag is
+  // necessary since all emitted instructions must be stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), getPointerTy());
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain = DAG.getNode(SystemZISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl,
+                         DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+SystemZTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                       CallingConv::ID CallConv, bool isVarArg,
+                                       const SmallVectorImpl<ISD::InputArg>
+                                         &Ins,
+                                       DebugLoc dl, SelectionDAG &DAG,
+                                       SmallVectorImpl<SDValue> &InVals) {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+                 *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_SystemZ);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+
+    // Each copy is glued to the previous one via the flag so the reads stay
+    // adjacent to the call.
+    Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                               VA.getLocVT(), InFlag).getValue(1);
+    SDValue RetValue = Chain.getValue(0);
+    InFlag = Chain.getValue(2);
+
+    // If this is an 8/16/32-bit value, it is really passed promoted to 64
+    // bits. Insert an assert[sz]ext to capture this, then truncate to the
+    // right size.
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      RetValue = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), RetValue,
+                             DAG.getValueType(VA.getValVT()));
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      RetValue = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), RetValue,
+                             DAG.getValueType(VA.getValVT()));
+
+    if (VA.getLocInfo() != CCValAssign::Full)
+      RetValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), RetValue);
+
+    InVals.push_back(RetValue);
+  }
+
+  return Chain;
+}
+
+
+/// LowerReturn - Lower a function return: copy the return values into their
+/// ABI-assigned physical registers (glued together) and emit a RET_FLAG node.
+SDValue
+SystemZTargetLowering::LowerReturn(SDValue Chain,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represent the assignment of the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_SystemZ);
+
+  // If this is the first return lowered for this function, add the regs to the
+  // liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    SDValue ResValue = Outs[i].Val;
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // If this is an 8/16/32-bit value, it really should be passed promoted
+    // to 64 bits.
+    if (VA.getLocInfo() == CCValAssign::SExt)
+      ResValue = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ResValue);
+    else if (VA.getLocInfo() == CCValAssign::ZExt)
+      ResValue = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ResValue);
+    else if (VA.getLocInfo() == CCValAssign::AExt)
+      ResValue = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ResValue);
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ResValue, Flag);
+
+    // Guarantee that all emitted copies are stuck together,
+    // avoiding something bad.
+    Flag = Chain.getValue(1);
+  }
+
+  if (Flag.getNode())
+    return DAG.getNode(SystemZISD::RET_FLAG, dl, MVT::Other, Chain, Flag);
+
+  // Return Void
+  return DAG.getNode(SystemZISD::RET_FLAG, dl, MVT::Other, Chain);
+}
+
+/// EmitCmp - Emit a SystemZ compare of LHS against RHS for the generic
+/// condition CC.  The matching target condition-code operand is returned
+/// through SystemZCC, and the CMP/UCMP node (whose result feeds BRCOND or
+/// SELECT) is returned.  For the relational unsigned codes on integer
+/// operands, a UCMP is emitted via the isUnsigned fallthroughs; on FP
+/// operands the "unordered" flavors map to distinct condition masks instead.
+SDValue SystemZTargetLowering::EmitCmp(SDValue LHS, SDValue RHS,
+                                       ISD::CondCode CC, SDValue &SystemZCC,
+                                       SelectionDAG &DAG) {
+  // FIXME: Emit a test if RHS is zero
+
+  bool isUnsigned = false;
+  SystemZCC::CondCodes TCC;
+  switch (CC) {
+  default:
+    llvm_unreachable("Invalid integer condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    TCC = SystemZCC::E;
+    break;
+  case ISD::SETUEQ:
+    TCC = SystemZCC::NLH;
+    break;
+  case ISD::SETNE:
+  case ISD::SETONE:
+    TCC = SystemZCC::NE;
+    break;
+  case ISD::SETUNE:
+    TCC = SystemZCC::LH;
+    break;
+  case ISD::SETO:
+    TCC = SystemZCC::O;
+    break;
+  case ISD::SETUO:
+    TCC = SystemZCC::NO;
+    break;
+  case ISD::SETULE:
+    if (LHS.getValueType().isFloatingPoint()) {
+      TCC = SystemZCC::NH;
+      break;
+    }
+    isUnsigned = true;   // FALLTHROUGH
+  case ISD::SETLE:
+  case ISD::SETOLE:
+    TCC = SystemZCC::LE;
+    break;
+  case ISD::SETUGE:
+    if (LHS.getValueType().isFloatingPoint()) {
+      TCC = SystemZCC::NL;
+      break;
+    }
+    isUnsigned = true;   // FALLTHROUGH
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    TCC = SystemZCC::HE;
+    break;
+  case ISD::SETUGT:
+    if (LHS.getValueType().isFloatingPoint()) {
+      TCC = SystemZCC::NLE;
+      break;
+    }
+    isUnsigned = true;  // FALLTHROUGH
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    TCC = SystemZCC::H;
+    break;
+  case ISD::SETULT:
+    if (LHS.getValueType().isFloatingPoint()) {
+      TCC = SystemZCC::NHE;
+      break;
+    }
+    isUnsigned = true;  // FALLTHROUGH
+  case ISD::SETLT:
+  case ISD::SETOLT:
+    TCC = SystemZCC::L;
+    break;
+  }
+
+  // Hand the selected condition mask back to the caller as a constant operand.
+  SystemZCC = DAG.getConstant(TCC, MVT::i32);
+
+  DebugLoc dl = LHS.getDebugLoc();
+  return DAG.getNode((isUnsigned ? SystemZISD::UCMP : SystemZISD::CMP),
+                     dl, MVT::i64, LHS, RHS);
+}
+
+
+/// LowerBR_CC - Turn a generic BR_CC into a SystemZ compare followed by a
+/// target BRCOND on the resulting flag.
+SDValue SystemZTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) {
+  // BR_CC operands: chain, condcode, lhs, rhs, destination block.
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CondCode = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue CmpLHS = Op.getOperand(2);
+  SDValue CmpRHS = Op.getOperand(3);
+  SDValue Dest   = Op.getOperand(4);
+
+  // Emit the compare and obtain the target condition-code operand.
+  SDValue SystemZCC;
+  SDValue Flag = EmitCmp(CmpLHS, CmpRHS, CondCode, SystemZCC, DAG);
+
+  return DAG.getNode(SystemZISD::BRCOND, Op.getDebugLoc(), Op.getValueType(),
+                     Chain, Dest, SystemZCC, Flag);
+}
+
+/// LowerSELECT_CC - Turn a generic SELECT_CC into a SystemZ compare feeding a
+/// target SELECT node.
+SDValue SystemZTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) {
+  // SELECT_CC operands: lhs, rhs, true value, false value, condcode.
+  SDValue CmpLHS = Op.getOperand(0);
+  SDValue CmpRHS = Op.getOperand(1);
+  SDValue TrueV  = Op.getOperand(2);
+  SDValue FalseV = Op.getOperand(3);
+  ISD::CondCode CondCode = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Emit the compare and obtain the target condition-code operand.
+  SDValue SystemZCC;
+  SDValue Flag = EmitCmp(CmpLHS, CmpRHS, CondCode, SystemZCC, DAG);
+
+  // The select produces the chosen value plus a flag result.
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+  SDValue Ops[] = { TrueV, FalseV, SystemZCC, Flag };
+
+  return DAG.getNode(SystemZISD::SELECT, dl, VTs, Ops, 4);
+}
+
+/// LowerGlobalAddress - Lower a global address to a PC-relative wrapper,
+/// optionally going through the GOT when the subtarget says an extra load is
+/// required, and re-applying any non-folded constant offset afterwards.
+SDValue SystemZTargetLowering::LowerGlobalAddress(SDValue Op,
+                                                  SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
+
+  bool IsPic = getTargetMachine().getRelocationModel() == Reloc::PIC_;
+  bool ExtraLoadRequired =
+    Subtarget.GVRequiresExtraLoad(GV, getTargetMachine(), false);
+
+  SDValue Result;
+  if (!IsPic && !ExtraLoadRequired) {
+    // Static reference: the offset can be folded straight into the address,
+    // so zero it out to suppress the explicit ADD below.
+    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+    Offset = 0;
+  } else {
+    unsigned char OpFlags = 0;
+    if (ExtraLoadRequired)
+      OpFlags = SystemZII::MO_GOTENT;
+
+    // Keep the offset out of the symbol reference; it is added explicitly
+    // after the (possible) GOT load.
+    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
+  }
+
+  Result = DAG.getNode(SystemZISD::PCRelativeWrapper, dl,
+                       getPointerTy(), Result);
+
+  // For GOT-referenced symbols, the wrapper yields the GOT slot address;
+  // load the actual symbol address from it.
+  if (ExtraLoadRequired)
+    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+                         PseudoSourceValue::getGOT(), 0);
+
+  // If there was a non-zero offset that we didn't fold, create an explicit
+  // addition for it.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+                         DAG.getConstant(Offset, getPointerTy()));
+
+  return Result;
+}
+
+// FIXME: PIC here
+/// LowerJumpTable - Wrap a jump-table index in a PC-relative address node.
+SDValue SystemZTargetLowering::LowerJumpTable(SDValue Op,
+                                              SelectionDAG &DAG) {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), getPointerTy());
+
+  return DAG.getNode(SystemZISD::PCRelativeWrapper, Op.getDebugLoc(),
+                     getPointerTy(), JTI);
+}
+
+
+// FIXME: PIC here
+// FIXME: This is just dirty hack. We need to lower cpool properly
+/// LowerConstantPool - Wrap a constant-pool entry in a PC-relative address
+/// node, preserving its alignment and offset.
+SDValue SystemZTargetLowering::LowerConstantPool(SDValue Op,
+                                                 SelectionDAG &DAG) {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+
+  SDValue CPI = DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+                                          CP->getAlignment(),
+                                          CP->getOffset());
+
+  return DAG.getNode(SystemZISD::PCRelativeWrapper, Op.getDebugLoc(),
+                     getPointerTy(), CPI);
+}
+
+/// getTargetNodeName - Return a printable name for a SystemZ-specific DAG
+/// opcode, or NULL when the opcode is not one of ours.
+const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return NULL;
+  case SystemZISD::RET_FLAG:
+    return "SystemZISD::RET_FLAG";
+  case SystemZISD::CALL:
+    return "SystemZISD::CALL";
+  case SystemZISD::BRCOND:
+    return "SystemZISD::BRCOND";
+  case SystemZISD::CMP:
+    return "SystemZISD::CMP";
+  case SystemZISD::UCMP:
+    return "SystemZISD::UCMP";
+  case SystemZISD::SELECT:
+    return "SystemZISD::SELECT";
+  case SystemZISD::PCRelativeWrapper:
+    return "SystemZISD::PCRelativeWrapper";
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// EmitInstrWithCustomInserter - Expand the Select32/Select64/SelectF32/
+/// SelectF64 pseudo instructions into an explicit diamond of basic blocks:
+/// a conditional branch, a fallthrough block for the false value, and a join
+/// block with a PHI merging the two.  EM records CFG edge changes for sdisel.
+MachineBasicBlock*
+SystemZTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  const SystemZInstrInfo &TII = *TM.getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  assert((MI->getOpcode() == SystemZ::Select32  ||
+          MI->getOpcode() == SystemZ::SelectF32 ||
+          MI->getOpcode() == SystemZ::Select64  ||
+          MI->getOpcode() == SystemZ::SelectF64) &&
+         "Unexpected instr type to insert");
+
+  // To "insert" a SELECT instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator I = BB;
+  ++I;
+
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   jCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *copy1MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  // Operand 3 of the pseudo holds the SystemZ condition code for the branch.
+  SystemZCC::CondCodes CC = (SystemZCC::CondCodes)MI->getOperand(3).getImm();
+  BuildMI(BB, dl, TII.getBrCond(CC)).addMBB(copy1MBB);
+  F->insert(I, copy0MBB);
+  F->insert(I, copy1MBB);
+  // Inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(), 
+         SE = BB->succ_end(); SI != SE; ++SI)
+    EM->insert(std::make_pair(*SI, copy1MBB));
+  // Update machine-CFG edges by transferring all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  copy1MBB->transferSuccessors(BB);
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(copy1MBB);
+
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to copy1MBB
+  BB = copy0MBB;
+
+  // Update machine-CFG edges
+  BB->addSuccessor(copy1MBB);
+
+  //  copy1MBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = copy1MBB;
+  BuildMI(BB, dl, TII.get(SystemZ::PHI),
+          MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(2).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(1).getReg()).addMBB(thisMBB)
+
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  return BB;
+}
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
new file mode 100644
index 0000000..36ff994
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -0,0 +1,146 @@
+//==-- SystemZISelLowering.h - SystemZ DAG Lowering Interface ----*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that SystemZ uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_ISELLOWERING_H
+#define LLVM_TARGET_SystemZ_ISELLOWERING_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+  namespace SystemZISD {
+    // SystemZ-specific SelectionDAG node opcodes.  Numbering starts right
+    // after the last target-independent opcode.
+    enum {
+      FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+      /// Return with a flag operand. Operand 0 is the chain operand.
+      RET_FLAG,
+
+      /// CALL - These operations represent an abstract call
+      /// instruction, which includes a bunch of information.
+      CALL,
+
+      /// PCRelativeWrapper - PC relative address
+      PCRelativeWrapper,
+
+      /// CMP, UCMP - Compare instruction (presumably the signed and unsigned
+      /// variants respectively -- confirm against the lowering code).
+      CMP,
+      UCMP,
+
+      /// BRCOND - Conditional branch. Operand 0 is chain operand, operand 1 is
+      /// the block to branch if condition is true, operand 2 is condition code
+      /// and operand 3 is the flag operand produced by a CMP instruction.
+      BRCOND,
+
+      /// SELECT - Operands 0 and 1 are selection variables, operand 2 is
+      /// condition code and operand 3 is the flag operand.
+      SELECT
+    };
+  }
+
+  class SystemZSubtarget;
+  class SystemZTargetMachine;
+
+  /// SystemZTargetLowering - SystemZ implementation of the TargetLowering
+  /// interface: custom DAG lowering hooks, calling-convention lowering and
+  /// pseudo-instruction expansion.
+  class SystemZTargetLowering : public TargetLowering {
+  public:
+    explicit SystemZTargetLowering(SystemZTargetMachine &TM);
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const {
+      return 1;
+    }
+
+    /// getRegForInlineAsmConstraint - Map an inline-asm register constraint
+    /// to a (register, register class) pair.
+    std::pair<unsigned, const TargetRegisterClass*>
+    getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const;
+    /// getConstraintType - Classify the given inline-asm constraint string.
+    TargetLowering::ConstraintType
+    getConstraintType(const std::string &Constraint) const;
+
+    // Custom lowering routines dispatched from LowerOperation.
+    SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+
+    /// EmitCmp - Build the comparison DAG node for LHS/RHS under condition
+    /// code CC; SystemZCC receives the target condition-code operand.
+    SDValue EmitCmp(SDValue LHS, SDValue RHS,
+                    ISD::CondCode CC, SDValue &SystemZCC,
+                    SelectionDAG &DAG);
+
+
+    /// EmitInstrWithCustomInserter - Expand pseudo instructions marked
+    /// usesCustomInserter (e.g. the Select* pseudos) into control flow; EM
+    /// records old-successor -> new-block edges for the selection DAG.
+    MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                   MachineBasicBlock *BB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+  private:
+    // Helper for LowerCall: lowers the actual outgoing call sequence.
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    // Helper for LowerFormalArguments: lowers incoming argument values.
+    SDValue LowerCCCArguments(SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              DebugLoc dl,
+                              SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals);
+
+    // Helper for LowerCall: copies call results out of their return registers.
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+
+    // TargetLowering calling-convention hooks.
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    const SystemZSubtarget &Subtarget;   // Subtarget for the current machine.
+    const SystemZTargetMachine &TM;
+    const SystemZRegisterInfo *RegInfo;
+  };
+} // namespace llvm
+
+#endif // LLVM_TARGET_SystemZ_ISELLOWERING_H
diff --git a/lib/Target/SystemZ/SystemZInstrBuilder.h b/lib/Target/SystemZ/SystemZInstrBuilder.h
new file mode 100644
index 0000000..b69d2f6
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -0,0 +1,128 @@
+//===- SystemZInstrBuilder.h - Functions to aid building  insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle SystemZ'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Displacement, Index.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZINSTRBUILDER_H
+#define SYSTEMZINSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// SystemZAddressMode - This struct holds a generalized full SystemZ address
+/// mode.  The base register can be a frame index, which will eventually be
+/// replaced with R15 or R11 and Disp being offsetted accordingly.
+struct SystemZAddressMode {
+  // Discriminates how the Base union below is to be interpreted.
+  enum {
+    RegBase,
+    FrameIndexBase
+  } BaseType;
+
+  // Base of the address: a plain register or an abstract frame index.
+  union {
+    unsigned Reg;
+    int FrameIndex;
+  } Base;
+
+  unsigned IndexReg;   // Index register; 0 means no index.
+  int32_t Disp;        // Signed displacement.
+  GlobalValue *GV;     // Global the address refers to, null if none.
+
+  SystemZAddressMode() : BaseType(RegBase), IndexReg(0), Disp(0), GV(0) {
+    Base.Reg = 0;
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to the
+/// current instruction -- that is, a dereference of an address in a register,
+/// with no index or displacement.
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+  // Because memory references are always represented with 3
+  // values, this adds: Reg, [0, NoReg] to the instruction.
+  // (Register 0 in the index slot denotes "no index register".)
+  return MIB.addReg(Reg).addImm(0).addReg(0);
+}
+
+/// addOffset - Append the displacement and an empty (zero) index register,
+/// completing the Base, Displacement, Index operand triple.
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+  return MIB.addImm(Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no index register, but with a
+/// displacement. An example is: 10(%r15).
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+             unsigned Reg, bool isKill, int Offset) {
+  return addOffset(MIB.addReg(Reg, getKillRegState(isKill)), Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg], i.e. base plus index register with a zero displacement.
+static inline const MachineInstrBuilder &
+addRegReg(const MachineInstrBuilder &MIB,
+            unsigned Reg1, bool isKill1, unsigned Reg2, bool isKill2) {
+  // Operand order follows the Base, Displacement, Index convention.
+  return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(0)
+    .addReg(Reg2, getKillRegState(isKill2));
+}
+
+/// addFullAddress - Emit the full Base, Displacement, Index operand triple
+/// described by AM.  The base is either a register or an abstract frame
+/// index that will be resolved later.
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB, const SystemZAddressMode &AM) {
+  if (AM.BaseType == SystemZAddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else if (AM.BaseType == SystemZAddressMode::FrameIndexBase)
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  else
+    assert(0 && "Unknown SystemZAddressMode base type!");
+
+  return MIB.addImm(AM.Disp).addReg(AM.IndexReg);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  const TargetInstrDesc &TID = MI->getDesc();
+  // Derive the memory-operand flags from the instruction description so the
+  // attached MachineMemOperand reflects whether this is a load, store or both.
+  unsigned Flags = 0;
+  if (TID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (TID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  // Describe the access with a fixed-stack pseudo source value so alias
+  // analysis knows it touches the given frame slot.
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                            Flags, Offset,
+                            MFI.getObjectSize(FI),
+                            MFI.getObjectAlignment(FI));
+  return addOffset(MIB.addFrameIndex(FI), Offset)
+            .addMemOperand(MMO);
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
new file mode 100644
index 0000000..336e20e
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -0,0 +1,340 @@
+//===- SystemZInstrFP.td - SystemZ FP Instruction defs --------*- tblgen-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the SystemZ (binary) floating point instructions in 
+// TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// FIXME: multiclassify!
+
+//===----------------------------------------------------------------------===//
+// FP Pattern fragments
+
+// Matches the floating point immediate +0.0 (used to select the load-zero
+// instructions below).
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+// Matches the floating point immediate -0.0.
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+// FP select pseudos: read the condition code in PSW and are expanded by
+// EmitInstrWithCustomInserter into diamond-shaped control flow with a PHI.
+let Uses = [PSW], usesCustomInserter = 1 in {
+  def SelectF32 : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2, i8imm:$cc),
+                        "# SelectF32 PSEUDO",
+                        [(set FP32:$dst,
+                              (SystemZselect FP32:$src1, FP32:$src2, imm:$cc, PSW))]>;
+  def SelectF64 : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2, i8imm:$cc),
+                        "# SelectF64 PSEUDO",
+                        [(set FP64:$dst,
+                              (SystemZselect FP64:$src1, FP64:$src2, imm:$cc, PSW))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// Floating point constant loads.
+// Materialize +0.0 with the load-zero instructions; cheap enough to
+// rematerialize instead of spilling.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def LD_Fp032 : Pseudo<(outs FP32:$dst), (ins),
+                      "lzer\t{$dst}",
+                      [(set FP32:$dst, fpimm0)]>;
+def LD_Fp064 : Pseudo<(outs FP64:$dst), (ins),
+                      "lzdr\t{$dst}",
+                      [(set FP64:$dst, fpimm0)]>;
+}
+
+// Register-to-register FP copies; no pattern, emitted by copyRegToReg.
+let neverHasSideEffects = 1 in {
+def FMOV32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+                      "ler\t{$dst, $src}",
+                      []>;
+def FMOV64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+                      "ldr\t{$dst, $src}",
+                      []>;
+}
+
+// FP loads: the plain forms take a 12-bit unsigned displacement (rriaddr12),
+// the "y" forms a 20-bit signed displacement (rriaddr).
+// NOTE(review): mayHaveSideEffects = 1 on pure loads looks suspicious --
+// confirm whether mayLoad was intended.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def FMOV32rm  : Pseudo<(outs FP32:$dst), (ins rriaddr12:$src),
+                      "le\t{$dst, $src}",
+                      [(set FP32:$dst, (load rriaddr12:$src))]>;
+def FMOV32rmy : Pseudo<(outs FP32:$dst), (ins rriaddr:$src),
+                      "ley\t{$dst, $src}",
+                      [(set FP32:$dst, (load rriaddr:$src))]>;
+def FMOV64rm  : Pseudo<(outs FP64:$dst), (ins rriaddr12:$src),
+                      "ld\t{$dst, $src}",
+                      [(set FP64:$dst, (load rriaddr12:$src))]>;
+def FMOV64rmy : Pseudo<(outs FP64:$dst), (ins rriaddr:$src),
+                      "ldy\t{$dst, $src}",
+                      [(set FP64:$dst, (load rriaddr:$src))]>;
+}
+
+// FP stores; displacement-size variants as for the loads above.
+def FMOV32mr  : Pseudo<(outs), (ins rriaddr12:$dst, FP32:$src),
+                       "ste\t{$src, $dst}",
+                       [(store FP32:$src, rriaddr12:$dst)]>;
+def FMOV32mry : Pseudo<(outs), (ins rriaddr:$dst, FP32:$src),
+                       "stey\t{$src, $dst}",
+                       [(store FP32:$src, rriaddr:$dst)]>;
+def FMOV64mr  : Pseudo<(outs), (ins rriaddr12:$dst, FP64:$src),
+                       "std\t{$src, $dst}",
+                       [(store FP64:$src, rriaddr12:$dst)]>;
+def FMOV64mry : Pseudo<(outs), (ins rriaddr:$dst, FP64:$src),
+                       "stdy\t{$src, $dst}",
+                       [(store FP64:$src, rriaddr:$dst)]>;
+
+// fcopysign: result takes its magnitude from $src1 and its sign from $src2;
+// note the swapped operand order in the cpsdr assembly string.
+def FCOPYSIGN32 : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+                         "cpsdr\t{$dst, $src2, $src1}",
+                         [(set FP32:$dst, (fcopysign FP32:$src1, FP32:$src2))]>;
+def FCOPYSIGN64 : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+                         "cpsdr\t{$dst, $src2, $src1}",
+                         [(set FP64:$dst, (fcopysign FP64:$src1, FP64:$src2))]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+
+// Unary sign operations.  These clobber the condition code, hence
+// Defs = [PSW] and the (implicit PSW) in each pattern.
+let Defs = [PSW] in {
+def FNEG32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+                       "lcebr\t{$dst, $src}",
+                       [(set FP32:$dst, (fneg FP32:$src)),
+                        (implicit PSW)]>;
+def FNEG64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+                       "lcdbr\t{$dst, $src}",
+                       [(set FP64:$dst, (fneg FP64:$src)),
+                        (implicit PSW)]>;
+
+def FABS32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+                       "lpebr\t{$dst, $src}",
+                       [(set FP32:$dst, (fabs FP32:$src)),
+                        (implicit PSW)]>;
+def FABS64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+                       "lpdbr\t{$dst, $src}",
+                       [(set FP64:$dst, (fabs FP64:$src)),
+                        (implicit PSW)]>;
+
+// Negative-absolute: -|x|, a single instruction on SystemZ.
+def FNABS32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+                       "lnebr\t{$dst, $src}",
+                       [(set FP32:$dst, (fneg(fabs FP32:$src))),
+                        (implicit PSW)]>;
+def FNABS64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+                       "lndbr\t{$dst, $src}",
+                       [(set FP64:$dst, (fneg(fabs FP64:$src))),
+                        (implicit PSW)]>;
+}
+
+// Two-address FP arithmetic: $dst is tied to $src1, matching the destructive
+// register forms of the hardware instructions.
+let isTwoAddress = 1 in {
+// Add/sub set the condition code; the pattern records this via (implicit PSW).
+let Defs = [PSW] in {
+let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
+def FADD32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+                       "aebr\t{$dst, $src2}",
+                       [(set FP32:$dst, (fadd FP32:$src1, FP32:$src2)),
+                        (implicit PSW)]>;
+def FADD64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+                       "adbr\t{$dst, $src2}",
+                       [(set FP64:$dst, (fadd FP64:$src1, FP64:$src2)),
+                        (implicit PSW)]>;
+}
+
+// Register-memory forms folding a load of the second operand.
+def FADD32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+                       "aeb\t{$dst, $src2}",
+                       [(set FP32:$dst, (fadd FP32:$src1, (load rriaddr12:$src2))),
+                        (implicit PSW)]>;
+def FADD64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+                       "adb\t{$dst, $src2}",
+                       [(set FP64:$dst, (fadd FP64:$src1, (load rriaddr12:$src2))),
+                        (implicit PSW)]>;
+
+def FSUB32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+                       "sebr\t{$dst, $src2}",
+                       [(set FP32:$dst, (fsub FP32:$src1, FP32:$src2)),
+                        (implicit PSW)]>;
+def FSUB64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+                       "sdbr\t{$dst, $src2}",
+                       [(set FP64:$dst, (fsub FP64:$src1, FP64:$src2)),
+                        (implicit PSW)]>;
+
+def FSUB32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+                       "seb\t{$dst, $src2}",
+                       [(set FP32:$dst, (fsub FP32:$src1, (load rriaddr12:$src2))),
+                        (implicit PSW)]>;
+def FSUB64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+                       "sdb\t{$dst, $src2}",
+                       [(set FP64:$dst, (fsub FP64:$src1, (load rriaddr12:$src2))),
+                        (implicit PSW)]>;
+} // Defs = [PSW]
+
+// Multiply/divide are outside the Defs = [PSW] block above, i.e. they are
+// modelled as leaving the condition code alone -- confirm against the ISA.
+let isCommutable = 1 in { // X = MUL Y, Z  == X = MUL Z, Y
+def FMUL32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+                       "meebr\t{$dst, $src2}",
+                       [(set FP32:$dst, (fmul FP32:$src1, FP32:$src2))]>;
+def FMUL64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+                       "mdbr\t{$dst, $src2}",
+                       [(set FP64:$dst, (fmul FP64:$src1, FP64:$src2))]>;
+}
+
+def FMUL32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+                       "meeb\t{$dst, $src2}",
+                       [(set FP32:$dst, (fmul FP32:$src1, (load rriaddr12:$src2)))]>;
+def FMUL64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+                       "mdb\t{$dst, $src2}",
+                       [(set FP64:$dst, (fmul FP64:$src1, (load rriaddr12:$src2)))]>;
+
+// Multiply-and-add: dst = src2 * src3 + src1 (src1 is the tied accumulator).
+def FMADD32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2, FP32:$src3),
+                       "maebr\t{$dst, $src3, $src2}",
+                       [(set FP32:$dst, (fadd (fmul FP32:$src2, FP32:$src3),
+                                              FP32:$src1))]>;
+def FMADD32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2, FP32:$src3),
+                       "maeb\t{$dst, $src3, $src2}",
+                       [(set FP32:$dst, (fadd (fmul (load rriaddr12:$src2),
+                                                     FP32:$src3),
+                                              FP32:$src1))]>;
+
+def FMADD64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2, FP64:$src3),
+                       "madbr\t{$dst, $src3, $src2}",
+                       [(set FP64:$dst, (fadd (fmul FP64:$src2, FP64:$src3),
+                                              FP64:$src1))]>;
+def FMADD64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2, FP64:$src3),
+                       "madb\t{$dst, $src3, $src2}",
+                       [(set FP64:$dst, (fadd (fmul (load rriaddr12:$src2),
+                                                     FP64:$src3),
+                                              FP64:$src1))]>;
+
+// Multiply-and-subtract: dst = src2 * src3 - src1.
+def FMSUB32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2, FP32:$src3),
+                       "msebr\t{$dst, $src3, $src2}",
+                       [(set FP32:$dst, (fsub (fmul FP32:$src2, FP32:$src3),
+                                              FP32:$src1))]>;
+def FMSUB32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2, FP32:$src3),
+                       "mseb\t{$dst, $src3, $src2}",
+                       [(set FP32:$dst, (fsub (fmul (load rriaddr12:$src2),
+                                                     FP32:$src3),
+                                              FP32:$src1))]>;
+
+def FMSUB64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2, FP64:$src3),
+                       "msdbr\t{$dst, $src3, $src2}",
+                       [(set FP64:$dst, (fsub (fmul FP64:$src2, FP64:$src3),
+                                              FP64:$src1))]>;
+def FMSUB64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2, FP64:$src3),
+                       "msdb\t{$dst, $src3, $src2}",
+                       [(set FP64:$dst, (fsub (fmul (load rriaddr12:$src2),
+                                                     FP64:$src3),
+                                              FP64:$src1))]>;
+
+def FDIV32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src1, FP32:$src2),
+                       "debr\t{$dst, $src2}",
+                       [(set FP32:$dst, (fdiv FP32:$src1, FP32:$src2))]>;
+def FDIV64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src1, FP64:$src2),
+                       "ddbr\t{$dst, $src2}",
+                       [(set FP64:$dst, (fdiv FP64:$src1, FP64:$src2))]>;
+
+def FDIV32rm : Pseudo<(outs FP32:$dst), (ins FP32:$src1, rriaddr12:$src2),
+                       "deb\t{$dst, $src2}",
+                       [(set FP32:$dst, (fdiv FP32:$src1, (load rriaddr12:$src2)))]>;
+def FDIV64rm : Pseudo<(outs FP64:$dst), (ins FP64:$src1, rriaddr12:$src2),
+                       "ddb\t{$dst, $src2}",
+                       [(set FP64:$dst, (fdiv FP64:$src1, (load rriaddr12:$src2)))]>;
+
+} // isTwoAddress = 1
+
+// Square root; three-address, so it sits outside the isTwoAddress block.
+def FSQRT32rr : Pseudo<(outs FP32:$dst), (ins FP32:$src),
+                       "sqebr\t{$dst, $src}",
+                       [(set FP32:$dst, (fsqrt FP32:$src))]>;
+def FSQRT64rr : Pseudo<(outs FP64:$dst), (ins FP64:$src),
+                       "sqdbr\t{$dst, $src}",
+                       [(set FP64:$dst, (fsqrt FP64:$src))]>;
+
+def FSQRT32rm : Pseudo<(outs FP32:$dst), (ins rriaddr12:$src),
+                       "sqeb\t{$dst, $src}",
+                       [(set FP32:$dst, (fsqrt (load rriaddr12:$src)))]>;
+def FSQRT64rm : Pseudo<(outs FP64:$dst), (ins rriaddr12:$src),
+                       "sqdb\t{$dst, $src}",
+                       [(set FP64:$dst, (fsqrt (load rriaddr12:$src)))]>;
+
+// f64 -> f32 truncation.
+def FROUND64r32 : Pseudo<(outs FP32:$dst), (ins FP64:$src),
+                         "ledbr\t{$dst, $src}",
+                         [(set FP32:$dst, (fround FP64:$src))]>;
+
+// f32 -> f64 extension, from register or memory.
+def FEXT32r64   : Pseudo<(outs FP64:$dst), (ins FP32:$src),
+                         "ldebr\t{$dst, $src}",
+                         [(set FP64:$dst, (fextend FP32:$src))]>;
+def FEXT32m64   : Pseudo<(outs FP64:$dst), (ins rriaddr12:$src),
+                         "ldeb\t{$dst, $src}",
+                         [(set FP64:$dst, (fextend (load rriaddr12:$src)))]>;
+
+// Integer <-> FP conversions; all clobber the condition code.
+let Defs = [PSW] in {
+def FCONVFP32   : Pseudo<(outs FP32:$dst), (ins GR32:$src),
+                         "cefbr\t{$dst, $src}",
+                         [(set FP32:$dst, (sint_to_fp GR32:$src)),
+                          (implicit PSW)]>;
+def FCONVFP32r64: Pseudo<(outs FP32:$dst), (ins GR64:$src),
+                         "cegbr\t{$dst, $src}",
+                         [(set FP32:$dst, (sint_to_fp GR64:$src)),
+                          (implicit PSW)]>;
+
+def FCONVFP64r32: Pseudo<(outs FP64:$dst), (ins GR32:$src),
+                         "cdfbr\t{$dst, $src}",
+                         [(set FP64:$dst, (sint_to_fp GR32:$src)),
+                          (implicit PSW)]>;
+def FCONVFP64   : Pseudo<(outs FP64:$dst), (ins GR64:$src),
+                         "cdgbr\t{$dst, $src}",
+                         [(set FP64:$dst, (sint_to_fp GR64:$src)),
+                          (implicit PSW)]>;
+
+// FP -> integer: the hard-coded "5" is presumably the rounding-mode mask
+// (round toward zero), matching fp_to_sint's truncation semantics -- confirm
+// against the ISA reference.
+def FCONVGR32   : Pseudo<(outs GR32:$dst), (ins FP32:$src),
+                         "cfebr\t{$dst, 5, $src}",
+                         [(set GR32:$dst, (fp_to_sint FP32:$src)),
+                          (implicit PSW)]>;
+def FCONVGR32r64: Pseudo<(outs GR32:$dst), (ins FP64:$src),
+                         "cfdbr\t{$dst, 5, $src}",
+                         [(set GR32:$dst, (fp_to_sint FP64:$src)),
+                          (implicit PSW)]>;
+
+def FCONVGR64r32: Pseudo<(outs GR64:$dst), (ins FP32:$src),
+                         "cgebr\t{$dst, 5, $src}",
+                         [(set GR64:$dst, (fp_to_sint FP32:$src)),
+                          (implicit PSW)]>;
+def FCONVGR64   : Pseudo<(outs GR64:$dst), (ins FP64:$src),
+                         "cgdbr\t{$dst, 5, $src}",
+                         [(set GR64:$dst, (fp_to_sint FP64:$src)),
+                          (implicit PSW)]>;
+} // Defs = [PSW]
+
+// Bit-pattern moves between 64-bit GPRs and FPRs (no value conversion).
+def FBCONVG64   : Pseudo<(outs GR64:$dst), (ins FP64:$src),
+                         "lgdr\t{$dst, $src}",
+                         [(set GR64:$dst, (bitconvert FP64:$src))]>;
+def FBCONVF64   : Pseudo<(outs FP64:$dst), (ins GR64:$src),
+                         "ldgr\t{$dst, $src}",
+                         [(set FP64:$dst, (bitconvert GR64:$src))]>;
+
+//===----------------------------------------------------------------------===//
+// Compare instructions (set the condition code in PSW, produce no result)
+
+// Floating-point comparisons
+// Floating-point compares: no register result, only the condition code in PSW.
+let Defs = [PSW] in {
+def FCMP32rr : Pseudo<(outs), (ins FP32:$src1, FP32:$src2),
+                      "cebr\t$src1, $src2",
+                      [(SystemZcmp FP32:$src1, FP32:$src2), (implicit PSW)]>;
+def FCMP64rr : Pseudo<(outs), (ins FP64:$src1, FP64:$src2),
+                      "cdbr\t$src1, $src2",
+                      [(SystemZcmp FP64:$src1, FP64:$src2), (implicit PSW)]>;
+
+// Register-memory compare forms folding a load of the second operand.
+def FCMP32rm : Pseudo<(outs), (ins FP32:$src1, rriaddr12:$src2),
+                      "ceb\t$src1, $src2",
+                      [(SystemZcmp FP32:$src1, (load rriaddr12:$src2)),
+                       (implicit PSW)]>;
+def FCMP64rm : Pseudo<(outs), (ins FP64:$src1, rriaddr12:$src2),
+                      "cdb\t$src1, $src2",
+                      [(SystemZcmp FP64:$src1, (load rriaddr12:$src2)),
+                       (implicit PSW)]>;
+} // Defs = [PSW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Floating point constant -0.0: materialized by loading +0.0 and negating it,
+// since there is no direct load-negative-zero instruction defined above.
+def : Pat<(f32 fpimmneg0), (FNEG32rr (LD_Fp032))>;
+def : Pat<(f64 fpimmneg0), (FNEG64rr (LD_Fp064))>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
new file mode 100644
index 0000000..b4a8993
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -0,0 +1,133 @@
+//===- SystemZInstrFormats.td - SystemZ Instruction Formats ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<5> val> {
+  bits<5> Value = val;   // 5-bit tag identifying the encoding format.
+}
+
+// One Format value per z/Architecture instruction format; Pseudo (0) is for
+// instructions with no machine encoding.
+def Pseudo   : Format<0>;
+def EForm    : Format<1>;
+def IForm    : Format<2>;
+def RIForm   : Format<3>;
+def RIEForm  : Format<4>;
+def RILForm  : Format<5>;
+def RISForm  : Format<6>;
+def RRForm   : Format<7>;
+def RREForm  : Format<8>;
+def RRFForm  : Format<9>;
+def RRRForm  : Format<10>;
+def RRSForm  : Format<11>;
+def RSForm   : Format<12>;
+def RSIForm  : Format<13>;
+def RSILForm : Format<14>;
+def RSYForm  : Format<15>;
+def RXForm   : Format<16>;
+def RXEForm  : Format<17>;
+def RXFForm  : Format<18>;
+def RXYForm  : Format<19>;
+def SForm    : Format<20>;
+def SIForm   : Format<21>;
+def SILForm  : Format<22>;
+def SIYForm  : Format<23>;
+def SSForm   : Format<24>;
+def SSEForm  : Format<25>;
+def SSFForm  : Format<26>;
+
+// Base class for all SystemZ instructions: carries the (up to 16-bit) opcode,
+// the encoding format tag and the operand lists.
+class InstSystemZ<bits<16> op, Format f, dag outs, dag ins> : Instruction {
+  let Namespace = "SystemZ";
+
+  bits<16> Opcode = op;
+
+  Format Form = f;
+  bits<5> FormBits = Form.Value;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+}
+
+// Instruction with an 8-bit opcode, stored in the low bits of the 16-bit
+// Opcode field with the remainder zeroed.
+class I8<bits<8> op, Format f, dag outs, dag ins, string asmstr, 
+         list<dag> pattern> 
+  : InstSystemZ<0, f, outs, ins> {
+  let Opcode{0-7} = op;
+  let Opcode{8-15} = 0;
+
+  let Pattern = pattern;
+  let AsmString = asmstr;
+}
+
+// Instruction with a 12-bit opcode, stored in the low bits of the 16-bit
+// Opcode field with the remainder zeroed.
+class I12<bits<12> op, Format f, dag outs, dag ins, string asmstr, 
+         list<dag> pattern> 
+  : InstSystemZ<0, f, outs, ins> {
+  let Opcode{0-11} = op;
+  let Opcode{12-15} = 0;
+
+  let Pattern = pattern;
+  let AsmString = asmstr;
+}
+
+// Instruction with a full 16-bit opcode.
+class I16<bits<16> op, Format f, dag outs, dag ins, string asmstr,
+         list<dag> pattern>
+  : InstSystemZ<op, f, outs, ins> {
+  let Pattern = pattern;
+  let AsmString = asmstr;
+}
+
+// Per-format convenience classes pairing each opcode width with its Format
+// tag.  The AddedComplexity = 1 on the short-displacement forms (RX, RS, SI)
+// makes isel prefer them over their long-displacement "Y" counterparts.
+class RRI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I8<op, RRForm, outs, ins, asmstr, pattern>;
+
+class RII<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I12<op, RIForm, outs, ins, asmstr, pattern>;
+
+class RILI<bits<12> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I12<op, RILForm, outs, ins, asmstr, pattern>;
+
+class RREI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I16<op, RREForm, outs, ins, asmstr, pattern>;
+
+class RXI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I8<op, RXForm, outs, ins, asmstr, pattern> {
+  let AddedComplexity = 1;
+}
+
+class RXYI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I16<op, RXYForm, outs, ins, asmstr, pattern>;
+
+class RSI<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I8<op, RSForm, outs, ins, asmstr, pattern> {
+  let AddedComplexity = 1;
+}
+
+class RSYI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I16<op, RSYForm, outs, ins, asmstr, pattern>;
+
+class SII<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I8<op, SIForm, outs, ins, asmstr, pattern> {
+  let AddedComplexity = 1;
+}
+
+class SIYI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I16<op, SIYForm, outs, ins, asmstr, pattern>;
+
+class SILI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : I16<op, SILForm, outs, ins, asmstr, pattern>;
+
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+// Pseudo instruction: opcode 0 and the Pseudo format tag, i.e. no real
+// machine encoding.
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstSystemZ<0, Pseudo, outs, ins> {
+
+  let Pattern = pattern;
+  let AsmString = asmstr;
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
new file mode 100644
index 0000000..5fa7e8c
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -0,0 +1,636 @@
+//===- SystemZInstrInfo.cpp - SystemZ Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "SystemZGenInstrInfo.inc"
+#include "llvm/Function.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+SystemZInstrInfo::SystemZInstrInfo(SystemZTargetMachine &tm)
+  : TargetInstrInfoImpl(SystemZInsts, array_lengthof(SystemZInsts)),
+    RI(tm, *this), TM(tm) {
+  // Fill the spill offsets map
+  // Byte offsets of each callee-saved GPR's slot in the register save
+  // area: R2 at 0x10, each following register 8 bytes higher (presumably
+  // the s390x ELF ABI layout — TODO confirm against the ABI supplement).
+  static const unsigned SpillOffsTab[][2] = {
+    { SystemZ::R2D,  0x10 },
+    { SystemZ::R3D,  0x18 },
+    { SystemZ::R4D,  0x20 },
+    { SystemZ::R5D,  0x28 },
+    { SystemZ::R6D,  0x30 },
+    { SystemZ::R7D,  0x38 },
+    { SystemZ::R8D,  0x40 },
+    { SystemZ::R9D,  0x48 },
+    { SystemZ::R10D, 0x50 },
+    { SystemZ::R11D, 0x58 },
+    { SystemZ::R12D, 0x60 },
+    { SystemZ::R13D, 0x68 },
+    { SystemZ::R14D, 0x70 },
+    { SystemZ::R15D, 0x78 }
+  };
+
+  RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+
+  // Register number -> save-area offset; registers not in the table map to 0.
+  for (unsigned i = 0, e = array_lengthof(SpillOffsTab); i != e; ++i)
+    RegSpillOffsets[SpillOffsTab[i][0]] = SpillOffsTab[i][1];
+}
+
+/// isGVStub - Return true if the GV requires an extra load to get the
+/// real address.
+// NOTE(review): not referenced anywhere in this file as of this revision;
+// consider removing or using it where global addresses are materialized.
+static inline bool isGVStub(GlobalValue *GV, SystemZTargetMachine &TM) {
+  return TM.getSubtarget<SystemZSubtarget>().GVRequiresExtraLoad(GV, TM, false);
+}
+
+/// storeRegToStackSlot - Emit a store of SrcReg into stack slot FrameIdx,
+/// picking the store opcode from the register class; aborts on a class
+/// this target cannot spill.
+void SystemZInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator MI,
+                                    unsigned SrcReg, bool isKill, int FrameIdx,
+                                    const TargetRegisterClass *RC) const {
+  // Use the debug location of the instruction we insert before, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  unsigned Opc = 0;
+  if (RC == &SystemZ::GR32RegClass ||
+      RC == &SystemZ::ADDR32RegClass)
+    Opc = SystemZ::MOV32mr;
+  else if (RC == &SystemZ::GR64RegClass ||
+           RC == &SystemZ::ADDR64RegClass) {
+    Opc = SystemZ::MOV64mr;
+  } else if (RC == &SystemZ::FP32RegClass) {
+    Opc = SystemZ::FMOV32mr;
+  } else if (RC == &SystemZ::FP64RegClass) {
+    Opc = SystemZ::FMOV64mr;
+  } else if (RC == &SystemZ::GR64PRegClass) {
+    Opc = SystemZ::MOV64Pmr;
+  } else if (RC == &SystemZ::GR128RegClass) {
+    Opc = SystemZ::MOV128mr;
+  } else
+    llvm_unreachable("Unsupported regclass to store");
+
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc)), FrameIdx)
+    .addReg(SrcReg, getKillRegState(isKill));
+}
+
+/// loadRegFromStackSlot - Emit a load of DestReg from stack slot FrameIdx,
+/// picking the load opcode from the register class; mirrors
+/// storeRegToStackSlot above.
+void SystemZInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                           unsigned DestReg, int FrameIdx,
+                                           const TargetRegisterClass *RC) const{
+  // Use the debug location of the instruction we insert before, if any.
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  unsigned Opc = 0;
+  if (RC == &SystemZ::GR32RegClass ||
+      RC == &SystemZ::ADDR32RegClass)
+    Opc = SystemZ::MOV32rm;
+  else if (RC == &SystemZ::GR64RegClass ||
+           RC == &SystemZ::ADDR64RegClass) {
+    Opc = SystemZ::MOV64rm;
+  } else if (RC == &SystemZ::FP32RegClass) {
+    Opc = SystemZ::FMOV32rm;
+  } else if (RC == &SystemZ::FP64RegClass) {
+    Opc = SystemZ::FMOV64rm;
+  } else if (RC == &SystemZ::GR64PRegClass) {
+    Opc = SystemZ::MOV64Prm;
+  } else if (RC == &SystemZ::GR128RegClass) {
+    Opc = SystemZ::MOV128rm;
+  } else
+    llvm_unreachable("Unsupported regclass to load");
+
+  addFrameReference(BuildMI(MBB, MI, DL, get(Opc), DestReg), FrameIdx);
+}
+
+/// copyRegToReg - Emit a register-to-register copy, returning false if the
+/// class combination cannot be copied.
+bool SystemZInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I,
+                                    unsigned DestReg, unsigned SrcReg,
+                                    const TargetRegisterClass *DestRC,
+                                    const TargetRegisterClass *SrcRC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  // Determine if DstRC and SrcRC have a common superclass.
+  const TargetRegisterClass *CommonRC = DestRC;
+  if (DestRC == SrcRC)
+    /* Same regclass for source and dest */;
+  else if (CommonRC->hasSuperClass(SrcRC))
+    CommonRC = SrcRC;
+  else if (!CommonRC->hasSubClass(SrcRC))
+    CommonRC = 0;
+
+  if (CommonRC) {
+    if (CommonRC == &SystemZ::GR64RegClass ||
+        CommonRC == &SystemZ::ADDR64RegClass) {
+      BuildMI(MBB, I, DL, get(SystemZ::MOV64rr), DestReg).addReg(SrcReg);
+    } else if (CommonRC == &SystemZ::GR32RegClass ||
+               CommonRC == &SystemZ::ADDR32RegClass) {
+      BuildMI(MBB, I, DL, get(SystemZ::MOV32rr), DestReg).addReg(SrcReg);
+    } else if (CommonRC == &SystemZ::GR64PRegClass) {
+      BuildMI(MBB, I, DL, get(SystemZ::MOV64rrP), DestReg).addReg(SrcReg);
+    } else if (CommonRC == &SystemZ::GR128RegClass) {
+      BuildMI(MBB, I, DL, get(SystemZ::MOV128rr), DestReg).addReg(SrcReg);
+    } else if (CommonRC == &SystemZ::FP32RegClass) {
+      BuildMI(MBB, I, DL, get(SystemZ::FMOV32rr), DestReg).addReg(SrcReg);
+    } else if (CommonRC == &SystemZ::FP64RegClass) {
+      BuildMI(MBB, I, DL, get(SystemZ::FMOV64rr), DestReg).addReg(SrcReg);
+    } else {
+      return false;
+    }
+
+    return true;
+  }
+
+  // No common class: still allow copies between the GR and ADDR classes of
+  // the same width, which use the same underlying registers.
+  if ((SrcRC == &SystemZ::GR64RegClass &&
+       DestRC == &SystemZ::ADDR64RegClass) ||
+      (DestRC == &SystemZ::GR64RegClass &&
+       SrcRC == &SystemZ::ADDR64RegClass)) {
+    BuildMI(MBB, I, DL, get(SystemZ::MOV64rr), DestReg).addReg(SrcReg);
+    return true;
+  } else if ((SrcRC == &SystemZ::GR32RegClass &&
+              DestRC == &SystemZ::ADDR32RegClass) ||
+             (DestRC == &SystemZ::GR32RegClass &&
+              SrcRC == &SystemZ::ADDR32RegClass)) {
+    BuildMI(MBB, I, DL, get(SystemZ::MOV32rr), DestReg).addReg(SrcReg);
+    return true;
+  }
+
+  return false;
+}
+
+/// isMoveInstr - If MI is one of this target's register-register move
+/// opcodes, report its source/dest registers and sub-register indices and
+/// return true.
+bool
+SystemZInstrInfo::isMoveInstr(const MachineInstr& MI,
+                              unsigned &SrcReg, unsigned &DstReg,
+                              unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case SystemZ::MOV32rr:
+  case SystemZ::MOV64rr:
+  case SystemZ::MOV64rrP:
+  case SystemZ::MOV128rr:
+  case SystemZ::FMOV32rr:
+  case SystemZ::FMOV64rr:
+    // Operand 0 is the destination, operand 1 the source for all of these.
+    assert(MI.getNumOperands() >= 2 &&
+           MI.getOperand(0).isReg() &&
+           MI.getOperand(1).isReg() &&
+           "invalid register-register move instruction");
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    SrcSubIdx = MI.getOperand(1).getSubReg();
+    DstSubIdx = MI.getOperand(0).getSubReg();
+    return true;
+  }
+}
+
+/// isLoadFromStackSlot - If MI is a direct load from a stack slot, set
+/// FrameIndex and return the loaded register; otherwise return 0.
+unsigned SystemZInstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                               int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case SystemZ::MOV32rm:
+  case SystemZ::MOV32rmy:
+  case SystemZ::MOV64rm:
+  case SystemZ::MOVSX32rm8:
+  case SystemZ::MOVSX32rm16y:
+  case SystemZ::MOVSX64rm8:
+  case SystemZ::MOVSX64rm16:
+  case SystemZ::MOVSX64rm32:
+  case SystemZ::MOVZX32rm8:
+  case SystemZ::MOVZX32rm16:
+  case SystemZ::MOVZX64rm8:
+  case SystemZ::MOVZX64rm16:
+  case SystemZ::MOVZX64rm32:
+  case SystemZ::FMOV32rm:
+  case SystemZ::FMOV32rmy:
+  case SystemZ::FMOV64rm:
+  case SystemZ::FMOV64rmy:
+  case SystemZ::MOV64Prm:
+  case SystemZ::MOV64Prmy:
+  case SystemZ::MOV128rm:
+    // Match a plain frame access: operand 1 is the frame index base,
+    // operand 2 a zero displacement, operand 3 no index register.
+    if (MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() && MI->getOperand(3).isReg() &&
+        MI->getOperand(2).getImm() == 0 && MI->getOperand(3).getReg() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// isStoreToStackSlot - If MI is a direct store to a stack slot, set
+/// FrameIndex and return the stored register; otherwise return 0.
+unsigned SystemZInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default: break;
+  case SystemZ::MOV32mr:
+  case SystemZ::MOV32mry:
+  case SystemZ::MOV64mr:
+  case SystemZ::MOV32m8r:
+  case SystemZ::MOV32m8ry:
+  case SystemZ::MOV32m16r:
+  case SystemZ::MOV32m16ry:
+  case SystemZ::MOV64m8r:
+  case SystemZ::MOV64m8ry:
+  case SystemZ::MOV64m16r:
+  case SystemZ::MOV64m16ry:
+  case SystemZ::MOV64m32r:
+  case SystemZ::MOV64m32ry:
+  case SystemZ::FMOV32mr:
+  case SystemZ::FMOV32mry:
+  case SystemZ::FMOV64mr:
+  case SystemZ::FMOV64mry:
+  case SystemZ::MOV64Pmr:
+  case SystemZ::MOV64Pmry:
+  case SystemZ::MOV128mr:
+    // Match a plain frame access: operand 0 is the frame index base,
+    // operand 1 a zero displacement, operand 2 no index register; the
+    // stored value is operand 3.
+    if (MI->getOperand(0).isFI() &&
+        MI->getOperand(1).isImm() && MI->getOperand(2).isReg() &&
+        MI->getOperand(1).getImm() == 0 && MI->getOperand(2).getReg() == 0) {
+      FrameIndex = MI->getOperand(0).getIndex();
+      return MI->getOperand(3).getReg();
+    }
+    break;
+  }
+  return 0;
+}
+
+/// spillCalleeSavedRegisters - Spill the callee-saved GPRs with a single
+/// STORE (or STORE MULTIPLE) into the register save area, and spill
+/// callee-saved FPRs into their individual frame slots.
+bool
+SystemZInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  unsigned CalleeFrameSize = 0;
+
+  // Scan the callee-saved and find the bounds of register spill area.
+  // StartOffset starts at -1U as a "no GPR seen yet" sentinel.
+  unsigned LowReg = 0, HighReg = 0, StartOffset = -1U, EndOffset = 0;
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    const TargetRegisterClass *RegClass = CSI[i].getRegClass();
+    if (RegClass != &SystemZ::FP64RegClass) {
+      unsigned Offset = RegSpillOffsets[Reg];
+      CalleeFrameSize += 8;
+      if (StartOffset > Offset) {
+        LowReg = Reg; StartOffset = Offset;
+      }
+      if (EndOffset < Offset) {
+        HighReg = Reg; EndOffset = RegSpillOffsets[Reg];
+      }
+    }
+  }
+
+  // Save information for epilogue inserter.
+  MFI->setCalleeSavedFrameSize(CalleeFrameSize);
+  MFI->setLowReg(LowReg); MFI->setHighReg(HighReg);
+
+  // Save GPRs
+  if (StartOffset) {
+    // Build a store instruction. Use STORE MULTIPLE instruction if there are many
+    // registers to store, otherwise - just STORE.
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MI, DL, get((LowReg == HighReg ?
+                                SystemZ::MOV64mr : SystemZ::MOV64mrm)));
+
+    // Add store operands.
+    // Base R15 (SP) + StartOffset; single-register form also needs an
+    // explicit zero index register.
+    MIB.addReg(SystemZ::R15D).addImm(StartOffset);
+    if (LowReg == HighReg)
+      MIB.addReg(0);
+    MIB.addReg(LowReg, RegState::Kill);
+    if (LowReg != HighReg)
+      MIB.addReg(HighReg, RegState::Kill);
+
+    // Do a second scan adding regs as being killed by instruction
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+      // Add the callee-saved register as live-in. It's killed at the spill.
+      MBB.addLiveIn(Reg);
+      if (Reg != LowReg && Reg != HighReg)
+        MIB.addReg(Reg, RegState::ImplicitKill);
+    }
+  }
+
+  // Save FPRs
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    const TargetRegisterClass *RegClass = CSI[i].getRegClass();
+    if (RegClass == &SystemZ::FP64RegClass) {
+      MBB.addLiveIn(Reg);
+      storeRegToStackSlot(MBB, MI, Reg, true, CSI[i].getFrameIdx(), RegClass);
+    }
+  }
+
+  return true;
+}
+
+/// restoreCalleeSavedRegisters - Reload callee-saved FPRs from their frame
+/// slots, then reload the callee-saved GPRs with a single LOAD (or LOAD
+/// MULTIPLE) from the register save area.
+bool
+SystemZInstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+
+  MachineFunction &MF = *MBB.getParent();
+  const TargetRegisterInfo *RegInfo= MF.getTarget().getRegisterInfo();
+  SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+
+  // Restore FP registers
+  for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+    unsigned Reg = CSI[i].getReg();
+    const TargetRegisterClass *RegClass = CSI[i].getRegClass();
+    if (RegClass == &SystemZ::FP64RegClass)
+      loadRegFromStackSlot(MBB, MI, Reg, CSI[i].getFrameIdx(), RegClass);
+  }
+
+  // Restore GP registers
+  // LowReg/HighReg and the save-area bounds were recorded by
+  // spillCalleeSavedRegisters in the prologue.
+  unsigned LowReg = MFI->getLowReg(), HighReg = MFI->getHighReg();
+  unsigned StartOffset = RegSpillOffsets[LowReg];
+
+  if (StartOffset) {
+    // Build a load instruction. Use LOAD MULTIPLE instruction if there are many
+    // registers to load, otherwise - just LOAD.
+    MachineInstrBuilder MIB =
+      BuildMI(MBB, MI, DL, get((LowReg == HighReg ?
+                                SystemZ::MOV64rm : SystemZ::MOV64rmm)));
+    // Add load operands.
+    MIB.addReg(LowReg, RegState::Define);
+    if (LowReg != HighReg)
+      MIB.addReg(HighReg, RegState::Define);
+
+    // Base register is R11 when a frame pointer is in use, R15 (SP) otherwise.
+    MIB.addReg((RegInfo->hasFP(MF) ? SystemZ::R11D : SystemZ::R15D));
+    MIB.addImm(StartOffset);
+    if (LowReg == HighReg)
+      MIB.addReg(0);
+
+    // Do a second scan adding regs as being defined by instruction
+    for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+      unsigned Reg = CSI[i].getReg();
+      if (Reg != LowReg && Reg != HighReg)
+        MIB.addReg(Reg, RegState::ImplicitDefine);
+    }
+  }
+
+  return true;
+}
+
+/// ReverseBranchCondition - Invert the single condition-code operand in
+/// place; returns false to indicate the condition was reversed successfully.
+bool SystemZInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid Xbranch condition!");
+
+  SystemZCC::CondCodes CC = static_cast<SystemZCC::CondCodes>(Cond[0].getImm());
+  Cond[0].setImm(getOppositeCondition(CC));
+  return false;
+}
+
+/// isUnpredicatedTerminator - Return true if MI is a terminator that is not
+/// predicated (used by AnalyzeBranch to find the terminator run).
+bool SystemZInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isTerminator()) return false;
+
+  // Conditional branch is a special case.
+  if (TID.isBranch() && !TID.isBarrier())
+    return true;
+  if (!TID.isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+/// AnalyzeBranch - Analyze the terminators of MBB, filling in TBB/FBB/Cond
+/// per the TargetInstrInfo contract; returns true if the block's control
+/// flow cannot be understood.  With AllowModify set, dead branches may be
+/// deleted along the way.
+bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock *&TBB,
+                                     MachineBasicBlock *&FBB,
+                                     SmallVectorImpl<MachineOperand> &Cond,
+                                     bool AllowModify) const {
+  // Start from the bottom of the block and work up, examining the
+  // terminator instructions.
+  MachineBasicBlock::iterator I = MBB.end();
+  while (I != MBB.begin()) {
+    --I;
+    // Working from the bottom, when we see a non-terminator
+    // instruction, we're done.
+    if (!isUnpredicatedTerminator(I))
+      break;
+
+    // A terminator that isn't a branch can't easily be handled
+    // by this analysis.
+    if (!I->getDesc().isBranch())
+      return true;
+
+    // Handle unconditional branches.
+    if (I->getOpcode() == SystemZ::JMP) {
+      if (!AllowModify) {
+        TBB = I->getOperand(0).getMBB();
+        continue;
+      }
+
+      // If the block has any instructions after a JMP, delete them.
+      while (llvm::next(I) != MBB.end())
+        llvm::next(I)->eraseFromParent();
+      Cond.clear();
+      FBB = 0;
+
+      // Delete the JMP if it's equivalent to a fall-through.
+      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
+        TBB = 0;
+        I->eraseFromParent();
+        I = MBB.end();
+        continue;
+      }
+
+      // TBB is used to indicate the unconditional destination.
+      TBB = I->getOperand(0).getMBB();
+      continue;
+    }
+
+    // Handle conditional branches.
+    SystemZCC::CondCodes BranchCode = getCondFromBranchOpc(I->getOpcode());
+    if (BranchCode == SystemZCC::INVALID)
+      return true;  // Can't handle indirect branch.
+
+    // Working from the bottom, handle the first conditional branch.
+    if (Cond.empty()) {
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(BranchCode));
+      continue;
+    }
+
+    // Handle subsequent conditional branches. Only handle the case where all
+    // conditional branches branch to the same destination.
+    assert(Cond.size() == 1);
+    assert(TBB);
+
+    // Only handle the case where all conditional branches branch to
+    // the same destination.
+    if (TBB != I->getOperand(0).getMBB())
+      return true;
+
+    SystemZCC::CondCodes OldBranchCode = (SystemZCC::CondCodes)Cond[0].getImm();
+    // If the conditions are the same, we can leave them alone.
+    if (OldBranchCode == BranchCode)
+      continue;
+
+    return true;
+  }
+
+  return false;
+}
+
+/// RemoveBranch - Delete the branch instructions at the end of MBB and
+/// return how many were removed.
+unsigned SystemZInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    // Stop at the first instruction that is neither an unconditional JMP
+    // nor a recognized conditional branch.
+    if (I->getOpcode() != SystemZ::JMP &&
+        getCondFromBranchOpc(I->getOpcode()) == SystemZCC::INVALID)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+
+  return Count;
+}
+
+/// InsertBranch - Insert an unconditional, conditional, or two-way branch
+/// sequence at the end of MBB; returns the number of instructions inserted.
+unsigned
+SystemZInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                               MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond) const {
+  // FIXME: this should probably have a DebugLoc operand
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 1 || Cond.size() == 0) &&
+         "SystemZ branch conditions have one component!");
+
+  if (Cond.empty()) {
+    // Unconditional branch?
+    assert(!FBB && "Unconditional branch with multiple successors!");
+    BuildMI(&MBB, dl, get(SystemZ::JMP)).addMBB(TBB);
+    return 1;
+  }
+
+  // Conditional branch.
+  unsigned Count = 0;
+  SystemZCC::CondCodes CC = (SystemZCC::CondCodes)Cond[0].getImm();
+  BuildMI(&MBB, dl, getBrCond(CC)).addMBB(TBB);
+  ++Count;
+
+  if (FBB) {
+    // Two-way Conditional branch. Insert the second branch.
+    BuildMI(&MBB, dl, get(SystemZ::JMP)).addMBB(FBB);
+    ++Count;
+  }
+  return Count;
+}
+
+/// getBrCond - Return the descriptor of the conditional-branch opcode
+/// corresponding to condition code CC.  Inverse of getCondFromBranchOpc.
+const TargetInstrDesc&
+SystemZInstrInfo::getBrCond(SystemZCC::CondCodes CC) const {
+  switch (CC) {
+  default:
+   llvm_unreachable("Unknown condition code!");
+  case SystemZCC::O:   return get(SystemZ::JO);
+  case SystemZCC::H:   return get(SystemZ::JH);
+  case SystemZCC::NLE: return get(SystemZ::JNLE);
+  case SystemZCC::L:   return get(SystemZ::JL);
+  case SystemZCC::NHE: return get(SystemZ::JNHE);
+  case SystemZCC::LH:  return get(SystemZ::JLH);
+  case SystemZCC::NE:  return get(SystemZ::JNE);
+  case SystemZCC::E:   return get(SystemZ::JE);
+  case SystemZCC::NLH: return get(SystemZ::JNLH);
+  case SystemZCC::HE:  return get(SystemZ::JHE);
+  case SystemZCC::NL:  return get(SystemZ::JNL);
+  case SystemZCC::LE:  return get(SystemZ::JLE);
+  case SystemZCC::NH:  return get(SystemZ::JNH);
+  case SystemZCC::NO:  return get(SystemZ::JNO);
+  }
+}
+
+/// getCondFromBranchOpc - Map a conditional-branch opcode back to its
+/// condition code; SystemZCC::INVALID for any other opcode.  Inverse of
+/// getBrCond.
+SystemZCC::CondCodes
+SystemZInstrInfo::getCondFromBranchOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:            return SystemZCC::INVALID;
+  case SystemZ::JO:   return SystemZCC::O;
+  case SystemZ::JH:   return SystemZCC::H;
+  case SystemZ::JNLE: return SystemZCC::NLE;
+  case SystemZ::JL:   return SystemZCC::L;
+  case SystemZ::JNHE: return SystemZCC::NHE;
+  case SystemZ::JLH:  return SystemZCC::LH;
+  case SystemZ::JNE:  return SystemZCC::NE;
+  case SystemZ::JE:   return SystemZCC::E;
+  case SystemZ::JNLH: return SystemZCC::NLH;
+  case SystemZ::JHE:  return SystemZCC::HE;
+  case SystemZ::JNL:  return SystemZCC::NL;
+  case SystemZ::JLE:  return SystemZCC::LE;
+  case SystemZ::JNH:  return SystemZCC::NH;
+  case SystemZ::JNO:  return SystemZCC::NO;
+  }
+}
+
+/// getOppositeCondition - Return the logical negation of condition code CC
+/// (each code maps to its N-prefixed counterpart and vice versa).
+SystemZCC::CondCodes
+SystemZInstrInfo::getOppositeCondition(SystemZCC::CondCodes CC) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Invalid condition!");
+  case SystemZCC::O:   return SystemZCC::NO;
+  case SystemZCC::H:   return SystemZCC::NH;
+  case SystemZCC::NLE: return SystemZCC::LE;
+  case SystemZCC::L:   return SystemZCC::NL;
+  case SystemZCC::NHE: return SystemZCC::HE;
+  case SystemZCC::LH:  return SystemZCC::NLH;
+  case SystemZCC::NE:  return SystemZCC::E;
+  case SystemZCC::E:   return SystemZCC::NE;
+  case SystemZCC::NLH: return SystemZCC::LH;
+  case SystemZCC::HE:  return SystemZCC::NHE;
+  case SystemZCC::NL:  return SystemZCC::L;
+  case SystemZCC::LE:  return SystemZCC::NLE;
+  case SystemZCC::NH:  return SystemZCC::H;
+  case SystemZCC::NO:  return SystemZCC::O;
+  }
+}
+
+/// getLongDispOpc - Return the long-displacement (…y suffixed) equivalent
+/// of a short-displacement memory opcode; aborts if no such variant exists.
+const TargetInstrDesc&
+SystemZInstrInfo::getLongDispOpc(unsigned Opc) const {
+  switch (Opc) {
+  default:
+    llvm_unreachable("Don't have long disp version of this instruction");
+  case SystemZ::MOV32mr:   return get(SystemZ::MOV32mry);
+  case SystemZ::MOV32rm:   return get(SystemZ::MOV32rmy);
+  case SystemZ::MOVSX32rm16: return get(SystemZ::MOVSX32rm16y);
+  case SystemZ::MOV32m8r:  return get(SystemZ::MOV32m8ry);
+  case SystemZ::MOV32m16r: return get(SystemZ::MOV32m16ry);
+  case SystemZ::MOV64m8r:  return get(SystemZ::MOV64m8ry);
+  case SystemZ::MOV64m16r: return get(SystemZ::MOV64m16ry);
+  case SystemZ::MOV64m32r: return get(SystemZ::MOV64m32ry);
+  case SystemZ::MOV8mi:    return get(SystemZ::MOV8miy);
+  case SystemZ::MUL32rm:   return get(SystemZ::MUL32rmy);
+  case SystemZ::CMP32rm:   return get(SystemZ::CMP32rmy);
+  case SystemZ::UCMP32rm:  return get(SystemZ::UCMP32rmy);
+  case SystemZ::FMOV32mr:  return get(SystemZ::FMOV32mry);
+  case SystemZ::FMOV64mr:  return get(SystemZ::FMOV64mry);
+  case SystemZ::FMOV32rm:  return get(SystemZ::FMOV32rmy);
+  case SystemZ::FMOV64rm:  return get(SystemZ::FMOV64rmy);
+  case SystemZ::MOV64Pmr:  return get(SystemZ::MOV64Pmry);
+  case SystemZ::MOV64Prm:  return get(SystemZ::MOV64Prmy);
+  }
+}
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
new file mode 100644
index 0000000..ef3b39e
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -0,0 +1,118 @@
+//===- SystemZInstrInfo.h - SystemZ Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SYSTEMZINSTRINFO_H
+#define LLVM_TARGET_SYSTEMZINSTRINFO_H
+
+#include "SystemZ.h"
+#include "SystemZRegisterInfo.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+namespace llvm {
+
+class SystemZTargetMachine;
+
+/// SystemZII - This namespace holds all of the target specific flags that
+/// instruction info tracks.
+///
+namespace SystemZII {
+  // NOTE: plain enumeration (values 0, 1, 2), not a bitmask — a symbol
+  // operand carries at most one of these flags.
+  enum {
+    //===------------------------------------------------------------------===//
+    // SystemZ Specific MachineOperand flags.
+
+    MO_NO_FLAG = 0,
+
+    /// MO_GOTENT - On a symbol operand this indicates that the immediate is
+    /// the offset to the location of the symbol name from the base of the GOT.
+    ///
+    ///    SYMBOL_LABEL @GOTENT
+    MO_GOTENT = 1,
+
+    /// MO_PLT - On a symbol operand this indicates that the immediate is
+    /// offset to the PLT entry of symbol name from the current code location.
+    ///
+    ///    SYMBOL_LABEL @PLT
+    MO_PLT = 2
+  };
+}
+
+class SystemZInstrInfo : public TargetInstrInfoImpl {
+  const SystemZRegisterInfo RI;
+  SystemZTargetMachine &TM;
+  // Maps each callee-saved GPR to its byte offset in the register save
+  // area; filled by the constructor, 0 for unmapped registers.
+  IndexedMap<unsigned> RegSpillOffsets;
+public:
+  explicit SystemZInstrInfo(SystemZTargetMachine &TM);
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const SystemZRegisterInfo &getRegisterInfo() const { return RI; }
+
+  bool copyRegToReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                    unsigned DestReg, unsigned SrcReg,
+                    const TargetRegisterClass *DestRC,
+                    const TargetRegisterClass *SrcRC) const;
+
+  bool isMoveInstr(const MachineInstr& MI,
+                   unsigned &SrcReg, unsigned &DstReg,
+                   unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned SrcReg, bool isKill,
+                                   int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned DestReg, int FrameIdx,
+                                    const TargetRegisterClass *RC) const;
+
+  virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+  virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                           MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI) const;
+
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB,
+                             MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                                MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond) const;
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  // Condition-code helpers (see SystemZInstrInfo.cpp for the mappings).
+  SystemZCC::CondCodes getOppositeCondition(SystemZCC::CondCodes CC) const;
+  SystemZCC::CondCodes getCondFromBranchOpc(unsigned Opc) const;
+  const TargetInstrDesc& getBrCond(SystemZCC::CondCodes CC) const;
+  const TargetInstrDesc& getLongDispOpc(unsigned Opc) const;
+
+  /// getMemoryInstr - Pick the short- or long-displacement variant of Opc
+  /// based on Offset: short forms cover only displacements in [0, 4096).
+  const TargetInstrDesc& getMemoryInstr(unsigned Opc, int64_t Offset = 0) const {
+    if (Offset < 0 || Offset >= 4096)
+      return getLongDispOpc(Opc);
+    else
+      return get(Opc);
+  }
+};
+
+}
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td
new file mode 100644
index 0000000..1891bba
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -0,0 +1,1155 @@
+//===- SystemZInstrInfo.td - SystemZ Instruction defs ------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the SystemZ instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// SystemZ Instruction Predicate Definitions.
+// IsZ10 guards instructions only available on z10-class subtargets
+// (used below via Requires<[IsZ10]> on MVHHI/MVHI/MVGHI).
+def IsZ10 : Predicate<"Subtarget.isZ10()">;
+
+include "SystemZInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Type Constraints.
+//===----------------------------------------------------------------------===//
+// Shorthand constraints pinning operand OpNum to a specific integer VT.
+class SDTCisI8<int OpNum> : SDTCisVT<OpNum, i8>;
+class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
+class SDTCisI32<int OpNum> : SDTCisVT<OpNum, i32>;
+class SDTCisI64<int OpNum> : SDTCisVT<OpNum, i64>;
+
+//===----------------------------------------------------------------------===//
+// Type Profiles.
+//===----------------------------------------------------------------------===//
+// Call: no results, variable number of operands; operand 0 is the
+// pointer-typed callee address.
+def SDT_SystemZCall         : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
+def SDT_SystemZCallSeqStart : SDCallSeqStart<[SDTCisI64<0>]>;
+def SDT_SystemZCallSeqEnd   : SDCallSeqEnd<[SDTCisI64<0>, SDTCisI64<1>]>;
+// Compare: two same-typed operands, no value result (condition is consumed
+// via the PSW operand of the branch/select patterns below).
+def SDT_CmpTest             : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+// Conditional branch: target block, i8 condition code, i64 flag operand.
+def SDT_BrCond              : SDTypeProfile<0, 3,
+                                           [SDTCisVT<0, OtherVT>,
+                                            SDTCisI8<1>, SDTCisVT<2, i64>]>;
+// Select: result and both value operands share one type, then an i8
+// condition code and an i64 flag operand.
+def SDT_SelectCC            : SDTypeProfile<1, 4,
+                                           [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                            SDTCisI8<3>, SDTCisVT<4, i64>]>;
+// Address computation: pointer-typed result with an operand of the same type.
+def SDT_Address             : SDTypeProfile<1, 1,
+                                            [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+//===----------------------------------------------------------------------===//
+// SystemZ Specific Node Definitions.
+//===----------------------------------------------------------------------===//
+// Function return; optionally glued to the copies that set up return values.
+def SystemZretflag : SDNode<"SystemZISD::RET_FLAG", SDTNone,
+                     [SDNPHasChain, SDNPOptInFlag]>;
+def SystemZcall    : SDNode<"SystemZISD::CALL", SDT_SystemZCall,
+                     [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+// Wrappers for the generic CALLSEQ_START/END nodes bracketing each call.
+def SystemZcallseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_SystemZCallSeqStart,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def SystemZcallseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_SystemZCallSeqEnd,
+                        [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+// Signed and unsigned compares; no value result (see SDT_CmpTest).
+def SystemZcmp     : SDNode<"SystemZISD::CMP", SDT_CmpTest>;
+def SystemZucmp    : SDNode<"SystemZISD::UCMP", SDT_CmpTest>;
+def SystemZbrcond  : SDNode<"SystemZISD::BRCOND", SDT_BrCond,
+                            [SDNPHasChain]>;
+def SystemZselect  : SDNode<"SystemZISD::SELECT", SDT_SelectCC>;
+// PC-relative address formation (matched by the larl-based LA64rm below).
+def SystemZpcrelwrapper : SDNode<"SystemZISD::PCRelativeWrapper", SDT_Address, []>;
+
+
+include "SystemZOperands.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction list..
+
+// Pseudos marking the call-frame setup/teardown region around calls
+// (selected from the CALLSEQ_START/END wrappers above).
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt),
+                              "#ADJCALLSTACKDOWN",
+                              [(SystemZcallseq_start timm:$amt)]>;
+def ADJCALLSTACKUP   : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2),
+                              "#ADJCALLSTACKUP",
+                              [(SystemZcallseq_end timm:$amt1, timm:$amt2)]>;
+
+// Conditional-select pseudos: read the condition in PSW and are expanded by
+// the target's custom inserter hook (usesCustomInserter) after selection.
+let Uses = [PSW], usesCustomInserter = 1 in {
+  def Select32 : Pseudo<(outs GR32:$dst), (ins GR32:$src1, GR32:$src2, i8imm:$cc),
+                        "# Select32 PSEUDO",
+                        [(set GR32:$dst,
+                              (SystemZselect GR32:$src1, GR32:$src2, imm:$cc, PSW))]>;
+  def Select64 : Pseudo<(outs GR64:$dst), (ins GR64:$src1, GR64:$src2, i8imm:$cc),
+                        "# Select64 PSEUDO",
+                        [(set GR64:$dst,
+                              (SystemZselect GR64:$src1, GR64:$src2, imm:$cc, PSW))]>;
+}
+
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// FIXME: Provide proper encoding!
+// Return: branch through the return-address register %r14.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def RET : Pseudo<(outs), (ins), "br\t%r14", [(SystemZretflag)]>;
+}
+
+let isBranch = 1, isTerminator = 1 in {
+  let isBarrier = 1 in {
+    // Unconditional direct branch to a basic block.
+    def JMP  : Pseudo<(outs), (ins brtarget:$dst), "j\t{$dst}", [(br bb:$dst)]>;
+
+    // Unconditional indirect branch through a 64-bit register.
+    let isIndirectBranch = 1 in
+      def JMPr   : Pseudo<(outs), (ins GR64:$dst), "br\t{$dst}", [(brind GR64:$dst)]>;
+  }
+
+  // Conditional branches: each tests the condition state in PSW left by a
+  // preceding compare. The SYSTEMZ_COND_* immediates encode the condition
+  // mask (presumably defined alongside the operands in SystemZOperands.td).
+  let Uses = [PSW] in {
+    def JO  : Pseudo<(outs), (ins brtarget:$dst),
+                     "jo\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_O, PSW)]>;
+    def JH  : Pseudo<(outs), (ins brtarget:$dst),
+                     "jh\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_H, PSW)]>;
+    def JNLE: Pseudo<(outs), (ins brtarget:$dst),
+                     "jnle\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NLE, PSW)]>;
+    def JL  : Pseudo<(outs), (ins brtarget:$dst),
+                     "jl\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_L, PSW)]>;
+    def JNHE: Pseudo<(outs), (ins brtarget:$dst),
+                     "jnhe\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NHE, PSW)]>;
+    def JLH : Pseudo<(outs), (ins brtarget:$dst),
+                     "jlh\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_LH, PSW)]>;
+    def JNE : Pseudo<(outs), (ins brtarget:$dst),
+                     "jne\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NE, PSW)]>;
+    def JE  : Pseudo<(outs), (ins brtarget:$dst),
+                     "je\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_E, PSW)]>;
+    def JNLH: Pseudo<(outs), (ins brtarget:$dst),
+                     "jnlh\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NLH, PSW)]>;
+    def JHE : Pseudo<(outs), (ins brtarget:$dst),
+                     "jhe\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_HE, PSW)]>;
+    def JNL : Pseudo<(outs), (ins brtarget:$dst),
+                     "jnl\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NL, PSW)]>;
+    def JLE : Pseudo<(outs), (ins brtarget:$dst),
+                     "jle\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_LE, PSW)]>;
+    def JNH : Pseudo<(outs), (ins brtarget:$dst),
+                     "jnh\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NH, PSW)]>;
+    def JNO : Pseudo<(outs), (ins brtarget:$dst),
+                     "jno\t$dst",
+                     [(SystemZbrcond bb:$dst, SYSTEMZ_COND_NO, PSW)]>;
+  } // Uses = [PSW]
+} // isBranch = 1
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. Uses for argument
+  // registers are added manually.
+  // CALLi calls a pc-relative immediate target (brasl); CALLr calls through
+  // an address register (basr). Both place the return address in %r14.
+  let Defs = [R0D, R1D, R2D, R3D, R4D, R5D, R14D,
+              F0L, F1L, F2L, F3L, F4L, F5L, F6L, F7L] in {
+    def CALLi     : Pseudo<(outs), (ins imm_pcrel:$dst, variable_ops),
+                           "brasl\t%r14, $dst", [(SystemZcall imm:$dst)]>;
+    def CALLr     : Pseudo<(outs), (ins ADDR64:$dst, variable_ops),
+                           "basr\t%r14, $dst", [(SystemZcall ADDR64:$dst)]>;
+  }
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions.
+//
+
+let isReMaterializable = 1 in
+// FIXME: Provide imm12 variant
+// FIXME: Address should be halfword aligned...
+// Load-address: materialize an effective address (lay) into a GPR.
+def LA64r  : RXI<0x47,
+                 (outs GR64:$dst), (ins laaddr:$src),
+                 "lay\t{$dst, $src}",
+                 [(set GR64:$dst, laaddr:$src)]>;
+// Load a pc-relative global address (larl), matched from PCRelativeWrapper.
+def LA64rm : RXYI<0x71E3,
+                  (outs GR64:$dst), (ins i64imm:$src),
+                  "larl\t{$dst, $src}",
+                  [(set GR64:$dst,
+                        (SystemZpcrelwrapper tglobaladdr:$src))]>;
+
+// Placeholder no-op pseudo.
+let neverHasSideEffects = 1 in
+def NOP : Pseudo<(outs), (ins), "# no-op", []>;
+
+//===----------------------------------------------------------------------===//
+// Move Instructions
+
+// Register-to-register copies. The 128-bit and 64-bit-pair pseudos expand to
+// two copies of the even/odd subregisters of the pair.
+let neverHasSideEffects = 1 in {
+def MOV32rr : RRI<0x18,
+                  (outs GR32:$dst), (ins GR32:$src),
+                  "lr\t{$dst, $src}",
+                  []>;
+def MOV64rr : RREI<0xB904,
+                   (outs GR64:$dst), (ins GR64:$src),
+                   "lgr\t{$dst, $src}",
+                   []>;
+def MOV128rr : Pseudo<(outs GR128:$dst), (ins GR128:$src),
+                     "# MOV128 PSEUDO!\n"
+                     "\tlgr\t${dst:subreg_odd}, ${src:subreg_odd}\n"
+                     "\tlgr\t${dst:subreg_even}, ${src:subreg_even}",
+                     []>;
+def MOV64rrP : Pseudo<(outs GR64P:$dst), (ins GR64P:$src),
+                     "# MOV64P PSEUDO!\n"
+                     "\tlr\t${dst:subreg_odd}, ${src:subreg_odd}\n"
+                     "\tlr\t${dst:subreg_even}, ${src:subreg_even}",
+                     []>;
+}
+
+// Register extensions: sign-extend (lgfr) / zero-extend (llgfr) 32 -> 64 bit.
+def MOVSX64rr32 : RREI<0xB914,
+                       (outs GR64:$dst), (ins GR32:$src),
+                       "lgfr\t{$dst, $src}",
+                       [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVZX64rr32 : RREI<0xB916,
+                       (outs GR64:$dst), (ins GR32:$src),
+                       "llgfr\t{$dst, $src}",
+                       [(set GR64:$dst, (zext GR32:$src))]>;
+
+// Immediate loads. lhi/lghi take a sign-extended 16-bit immediate; the
+// llil*/llih* family inserts a 16-bit immediate into one halfword of the
+// 64-bit register (ll/lh/hl/hh name the halfword, low to high). The i64ll16
+// etc. predicates — presumably defined in SystemZOperands.td — match
+// constants whose set bits lie entirely in that halfword.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV32ri16 : RII<0x8A7,
+                    (outs GR32:$dst), (ins s16imm:$src),
+                    "lhi\t{$dst, $src}",
+                    [(set GR32:$dst, immSExt16:$src)]>;
+def MOV64ri16 : RII<0x9A7,
+                    (outs GR64:$dst), (ins s16imm64:$src),
+                    "lghi\t{$dst, $src}",
+                    [(set GR64:$dst, immSExt16:$src)]>;
+
+def MOV64rill16 : RII<0xFA5,
+                      (outs GR64:$dst), (ins i64imm:$src),
+                      "llill\t{$dst, $src}",
+                      [(set GR64:$dst, i64ll16:$src)]>;
+def MOV64rilh16 : RII<0xEA5,
+                      (outs GR64:$dst), (ins i64imm:$src),
+                      "llilh\t{$dst, $src}",
+                      [(set GR64:$dst, i64lh16:$src)]>;
+def MOV64rihl16 : RII<0xDA5,
+                      (outs GR64:$dst), (ins i64imm:$src),
+                      "llihl\t{$dst, $src}",
+                      [(set GR64:$dst, i64hl16:$src)]>;
+def MOV64rihh16 : RII<0xCA5,
+                      (outs GR64:$dst), (ins i64imm:$src),
+                      "llihh\t{$dst, $src}",
+                      [(set GR64:$dst, i64hh16:$src)]>;
+
+// 32-bit immediate loads: sign-extended (lgfi) or into the low/high word
+// (llilf/llihf).
+def MOV64ri32 : RILI<0x1C0,
+                     (outs GR64:$dst), (ins s32imm64:$src),
+                     "lgfi\t{$dst, $src}",
+                     [(set GR64:$dst, immSExt32:$src)]>;
+def MOV64rilo32 : RILI<0xFC0,
+                       (outs GR64:$dst), (ins i64imm:$src),
+                       "llilf\t{$dst, $src}",
+                       [(set GR64:$dst, i64lo32:$src)]>;
+def MOV64rihi32 : RILI<0xEC0, (outs GR64:$dst), (ins i64imm:$src),
+                       "llihf\t{$dst, $src}",
+                       [(set GR64:$dst, i64hi32:$src)]>;
+}
+
+// Plain loads. The 12-bit-displacement form (l) and long-displacement forms
+// (ly/lg) differ only in addressing mode; the pair/128-bit pseudos expand to
+// two loads at offset 0 and 4 (or 8).
+// NOTE(review): mayHaveSideEffects = 1 on ordinary loads looks like a
+// conservative hack — it pessimizes scheduling; confirm it is intentional.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def MOV32rm  : RXI<0x58,
+                   (outs GR32:$dst), (ins rriaddr12:$src),
+                   "l\t{$dst, $src}",
+                   [(set GR32:$dst, (load rriaddr12:$src))]>;
+def MOV32rmy : RXYI<0x58E3,
+                    (outs GR32:$dst), (ins rriaddr:$src),
+                    "ly\t{$dst, $src}",
+                    [(set GR32:$dst, (load rriaddr:$src))]>;
+def MOV64rm  : RXYI<0x04E3,
+                    (outs GR64:$dst), (ins rriaddr:$src),
+                    "lg\t{$dst, $src}",
+                    [(set GR64:$dst, (load rriaddr:$src))]>;
+def MOV64Prm : Pseudo<(outs GR64P:$dst), (ins rriaddr12:$src),
+                      "# MOV64P PSEUDO!\n"
+                      "\tl\t${dst:subreg_odd},  $src\n"
+                      "\tl\t${dst:subreg_even}, 4+$src",
+                      [(set GR64P:$dst, (load rriaddr12:$src))]>;
+def MOV64Prmy : Pseudo<(outs GR64P:$dst), (ins rriaddr:$src),
+                       "# MOV64P PSEUDO!\n"
+                       "\tly\t${dst:subreg_odd},  $src\n"
+                       "\tly\t${dst:subreg_even}, 4+$src",
+                       [(set GR64P:$dst, (load rriaddr:$src))]>;
+def MOV128rm : Pseudo<(outs GR128:$dst), (ins rriaddr:$src),
+                      "# MOV128 PSEUDO!\n"
+                      "\tlg\t${dst:subreg_odd},  $src\n"
+                      "\tlg\t${dst:subreg_even}, 8+$src",
+                      [(set GR128:$dst, (load rriaddr:$src))]>;
+}
+
+// Plain stores (st/sty/stg) plus pair/128-bit pseudos that expand into two
+// stores of the even/odd subregisters.
+def MOV32mr  : RXI<0x50,
+                   (outs), (ins rriaddr12:$dst, GR32:$src),
+                   "st\t{$src, $dst}",
+                   [(store GR32:$src, rriaddr12:$dst)]>;
+def MOV32mry : RXYI<0x50E3,
+                    (outs), (ins rriaddr:$dst, GR32:$src),
+                    "sty\t{$src, $dst}",
+                    [(store GR32:$src, rriaddr:$dst)]>;
+def MOV64mr  : RXYI<0x24E3,
+                    (outs), (ins rriaddr:$dst, GR64:$src),
+                    "stg\t{$src, $dst}",
+                    [(store GR64:$src, rriaddr:$dst)]>;
+def MOV64Pmr : Pseudo<(outs), (ins rriaddr12:$dst, GR64P:$src),
+                      "# MOV64P PSEUDO!\n"
+                      "\tst\t${src:subreg_odd}, $dst\n"
+                      "\tst\t${src:subreg_even}, 4+$dst",
+                      [(store GR64P:$src, rriaddr12:$dst)]>;
+def MOV64Pmry : Pseudo<(outs), (ins rriaddr:$dst, GR64P:$src),
+                       "# MOV64P PSEUDO!\n"
+                       "\tsty\t${src:subreg_odd}, $dst\n"
+                       "\tsty\t${src:subreg_even}, 4+$dst",
+                       [(store GR64P:$src, rriaddr:$dst)]>;
+def MOV128mr : Pseudo<(outs), (ins rriaddr:$dst, GR128:$src),
+                      "# MOV128 PSEUDO!\n"
+                      "\tstg\t${src:subreg_odd}, $dst\n"
+                      "\tstg\t${src:subreg_even}, 8+$dst",
+                      [(store GR128:$src, rriaddr:$dst)]>;
+
+// Store-immediate byte (mvi / long-displacement mviy).
+def MOV8mi    : SII<0x92,
+                    (outs), (ins riaddr12:$dst, i32i8imm:$src),
+                    "mvi\t{$dst, $src}",
+                    [(truncstorei8 (i32 i32immSExt8:$src), riaddr12:$dst)]>;
+def MOV8miy   : SIYI<0x52EB,
+                     (outs), (ins riaddr:$dst, i32i8imm:$src),
+                     "mviy\t{$dst, $src}",
+                     [(truncstorei8 (i32 i32immSExt8:$src), riaddr:$dst)]>;
+
+// z10-only store-immediate forms for halfword/word/doubleword.
+// AddedComplexity makes these patterns preferred over the generic
+// materialize-then-store sequences when the subtarget allows them.
+let AddedComplexity = 2 in {
+def MOV16mi   : SILI<0xE544,
+                     (outs), (ins riaddr12:$dst, s16imm:$src),
+                     "mvhhi\t{$dst, $src}",
+                     [(truncstorei16 (i32 i32immSExt16:$src), riaddr12:$dst)]>,
+                     Requires<[IsZ10]>;
+def MOV32mi16 : SILI<0xE54C,
+                     (outs), (ins riaddr12:$dst, s32imm:$src),
+                     "mvhi\t{$dst, $src}",
+                     [(store (i32 immSExt16:$src), riaddr12:$dst)]>,
+                     Requires<[IsZ10]>;
+def MOV64mi16 : SILI<0xE548,
+                     (outs), (ins riaddr12:$dst, s32imm64:$src),
+                     "mvghi\t{$dst, $src}",
+                     [(store (i64 immSExt16:$src), riaddr12:$dst)]>,
+                     Requires<[IsZ10]>;
+}
+
+// sexts
+// In-register sign extensions from i8/i16 (lbr/lgbr/lhr/lghr), matched
+// from sext_inreg.
+def MOVSX32rr8  : RREI<0xB926,
+                       (outs GR32:$dst), (ins GR32:$src),
+                       "lbr\t{$dst, $src}",
+                       [(set GR32:$dst, (sext_inreg GR32:$src, i8))]>;
+def MOVSX64rr8  : RREI<0xB906,
+                       (outs GR64:$dst), (ins GR64:$src),
+                       "lgbr\t{$dst, $src}",
+                       [(set GR64:$dst, (sext_inreg GR64:$src, i8))]>;
+def MOVSX32rr16 : RREI<0xB927,
+                       (outs GR32:$dst), (ins GR32:$src),
+                       "lhr\t{$dst, $src}",
+                       [(set GR32:$dst, (sext_inreg GR32:$src, i16))]>;
+def MOVSX64rr16 : RREI<0xB907,
+                       (outs GR64:$dst), (ins GR64:$src),
+                       "lghr\t{$dst, $src}",
+                       [(set GR64:$dst, (sext_inreg GR64:$src, i16))]>;
+
+// extloads
+// Sign-extending loads into 32- and 64-bit registers (lb/lh/lhy, lgb/lgh/lgf).
+def MOVSX32rm8   : RXYI<0x76E3,
+                        (outs GR32:$dst), (ins rriaddr:$src),
+                        "lb\t{$dst, $src}",
+                        [(set GR32:$dst, (sextloadi32i8 rriaddr:$src))]>;
+def MOVSX32rm16  : RXI<0x48,
+                       (outs GR32:$dst), (ins rriaddr12:$src),
+                       "lh\t{$dst, $src}",
+                       [(set GR32:$dst, (sextloadi32i16 rriaddr12:$src))]>;
+def MOVSX32rm16y : RXYI<0x78E3,
+                        (outs GR32:$dst), (ins rriaddr:$src),
+                        "lhy\t{$dst, $src}",
+                        [(set GR32:$dst, (sextloadi32i16 rriaddr:$src))]>;
+def MOVSX64rm8   : RXYI<0x77E3,
+                        (outs GR64:$dst), (ins rriaddr:$src),
+                        "lgb\t{$dst, $src}",
+                        [(set GR64:$dst, (sextloadi64i8 rriaddr:$src))]>;
+def MOVSX64rm16  : RXYI<0x15E3,
+                        (outs GR64:$dst), (ins rriaddr:$src),
+                        "lgh\t{$dst, $src}",
+                        [(set GR64:$dst, (sextloadi64i16 rriaddr:$src))]>;
+def MOVSX64rm32  : RXYI<0x14E3,
+                        (outs GR64:$dst), (ins rriaddr:$src),
+                        "lgf\t{$dst, $src}",
+                        [(set GR64:$dst, (sextloadi64i32 rriaddr:$src))]>;
+
+// Zero-extending loads (llc/llh, llgc/llgh/llgf).
+def MOVZX32rm8  : RXYI<0x94E3,
+                       (outs GR32:$dst), (ins rriaddr:$src),
+                       "llc\t{$dst, $src}",
+                       [(set GR32:$dst, (zextloadi32i8 rriaddr:$src))]>;
+def MOVZX32rm16 : RXYI<0x95E3,
+                       (outs GR32:$dst), (ins rriaddr:$src),
+                       "llh\t{$dst, $src}",
+                       [(set GR32:$dst, (zextloadi32i16 rriaddr:$src))]>;
+def MOVZX64rm8  : RXYI<0x90E3,
+                       (outs GR64:$dst), (ins rriaddr:$src),
+                       "llgc\t{$dst, $src}",
+                       [(set GR64:$dst, (zextloadi64i8 rriaddr:$src))]>;
+def MOVZX64rm16 : RXYI<0x91E3,
+                       (outs GR64:$dst), (ins rriaddr:$src),
+                       "llgh\t{$dst, $src}",
+                       [(set GR64:$dst, (zextloadi64i16 rriaddr:$src))]>;
+def MOVZX64rm32 : RXYI<0x16E3,
+                       (outs GR64:$dst), (ins rriaddr:$src),
+                       "llgf\t{$dst, $src}",
+                       [(set GR64:$dst, (zextloadi64i32 rriaddr:$src))]>;
+
+// truncstores
+// Truncating stores. The 64-bit-source variants reuse the same opcodes and
+// mnemonics as the 32-bit ones (stc/sth/st and their *y forms) since only
+// the low-order bits are stored.
+def MOV32m8r   : RXI<0x42,
+                     (outs), (ins rriaddr12:$dst, GR32:$src),
+                     "stc\t{$src, $dst}",
+                     [(truncstorei8 GR32:$src, rriaddr12:$dst)]>;
+
+def MOV32m8ry  : RXYI<0x72E3,
+                      (outs), (ins rriaddr:$dst, GR32:$src),
+                      "stcy\t{$src, $dst}",
+                      [(truncstorei8 GR32:$src, rriaddr:$dst)]>;
+
+def MOV32m16r  : RXI<0x40,
+                     (outs), (ins rriaddr12:$dst, GR32:$src),
+                     "sth\t{$src, $dst}",
+                     [(truncstorei16 GR32:$src, rriaddr12:$dst)]>;
+
+def MOV32m16ry : RXYI<0x70E3,
+                      (outs), (ins rriaddr:$dst, GR32:$src),
+                      "sthy\t{$src, $dst}",
+                      [(truncstorei16 GR32:$src, rriaddr:$dst)]>;
+
+def MOV64m8r   : RXI<0x42,
+                     (outs), (ins rriaddr12:$dst, GR64:$src),
+                     "stc\t{$src, $dst}",
+                     [(truncstorei8 GR64:$src, rriaddr12:$dst)]>;
+
+def MOV64m8ry  : RXYI<0x72E3,
+                      (outs), (ins rriaddr:$dst, GR64:$src),
+                      "stcy\t{$src, $dst}",
+                      [(truncstorei8 GR64:$src, rriaddr:$dst)]>;
+
+def MOV64m16r  : RXI<0x40,
+                     (outs), (ins rriaddr12:$dst, GR64:$src),
+                     "sth\t{$src, $dst}",
+                     [(truncstorei16 GR64:$src, rriaddr12:$dst)]>;
+
+def MOV64m16ry : RXYI<0x70E3,
+                      (outs), (ins rriaddr:$dst, GR64:$src),
+                      "sthy\t{$src, $dst}",
+                      [(truncstorei16 GR64:$src, rriaddr:$dst)]>;
+
+def MOV64m32r  : RXI<0x50,
+                     (outs), (ins rriaddr12:$dst, GR64:$src),
+                     "st\t{$src, $dst}",
+                     [(truncstorei32 GR64:$src, rriaddr12:$dst)]>;
+
+def MOV64m32ry : RXYI<0x50E3,
+                      (outs), (ins rriaddr:$dst, GR64:$src),
+                      "sty\t{$src, $dst}",
+                      [(truncstorei32 GR64:$src, rriaddr:$dst)]>;
+
+// multiple regs moves
+// FIXME: should we use multiple arg nodes?
+// Store-multiple / load-multiple over a $from..$to register range; no
+// selection patterns — these are emitted directly (e.g. for CSR spills).
+def MOV32mrm  : RSYI<0x90EB,
+                     (outs), (ins riaddr:$dst, GR32:$from, GR32:$to),
+                     "stmy\t{$from, $to, $dst}",
+                     []>;
+def MOV64mrm  : RSYI<0x24EB,
+                     (outs), (ins riaddr:$dst, GR64:$from, GR64:$to),
+                     "stmg\t{$from, $to, $dst}",
+                     []>;
+def MOV32rmm  : RSYI<0x90EB,
+                     (outs GR32:$from, GR32:$to), (ins riaddr:$dst),
+                     "lmy\t{$from, $to, $dst}",
+                     []>;
+def MOV64rmm  : RSYI<0x04EB,
+                     (outs GR64:$from, GR64:$to), (ins riaddr:$dst),
+                     "lmg\t{$from, $to, $dst}",
+                     []>;
+
+// Zero the even subregister of a register pair in place; isTwoAddress ties
+// $dst to $src so the odd half is preserved.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isTwoAddress = 1 in {
+def MOV64Pr0_even : Pseudo<(outs GR64P:$dst), (ins GR64P:$src),
+                           "lhi\t${dst:subreg_even}, 0",
+                           []>;
+def MOV128r0_even : Pseudo<(outs GR128:$dst), (ins GR128:$src),
+                           "lghi\t${dst:subreg_even}, 0",
+                           []>;
+}
+
+// Byte swaps
+// Register byte reversal (lrvr/lrvgr) and byte-reversed loads/stores
+// (lrv/lrvg, strv/strvg). The 16-bit memory variants are disabled below.
+def BSWAP32rr : RREI<0xB91F,
+                     (outs GR32:$dst), (ins GR32:$src),
+                     "lrvr\t{$dst, $src}",
+                     [(set GR32:$dst, (bswap GR32:$src))]>;
+def BSWAP64rr : RREI<0xB90F,
+                     (outs GR64:$dst), (ins GR64:$src),
+                     "lrvgr\t{$dst, $src}",
+                     [(set GR64:$dst, (bswap GR64:$src))]>;
+
+// FIXME: this is invalid pattern for big-endian
+//def BSWAP16rm : RXYI<0x1FE3, (outs GR32:$dst), (ins rriaddr:$src),
+//                     "lrvh\t{$dst, $src}",
+//                     [(set GR32:$dst, (bswap (extloadi32i16 rriaddr:$src)))]>;
+def BSWAP32rm : RXYI<0x1EE3, (outs GR32:$dst), (ins rriaddr:$src),
+                     "lrv\t{$dst, $src}",
+                     [(set GR32:$dst, (bswap (load rriaddr:$src)))]>;
+def BSWAP64rm : RXYI<0x0FE3, (outs GR64:$dst), (ins rriaddr:$src),
+                     "lrvg\t{$dst, $src}",
+                     [(set GR64:$dst, (bswap (load rriaddr:$src)))]>;
+
+//def BSWAP16mr : RXYI<0xE33F, (outs), (ins rriaddr:$dst, GR32:$src),
+//                     "strvh\t{$src, $dst}",
+//                     [(truncstorei16 (bswap GR32:$src), rriaddr:$dst)]>;
+def BSWAP32mr : RXYI<0xE33E, (outs), (ins rriaddr:$dst, GR32:$src),
+                     "strv\t{$src, $dst}",
+                     [(store (bswap GR32:$src), rriaddr:$dst)]>;
+def BSWAP64mr : RXYI<0xE32F, (outs), (ins rriaddr:$dst, GR64:$src),
+                     "strvg\t{$src, $dst}",
+                     [(store (bswap GR64:$src), rriaddr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// Arithmetic Instructions
+
+// Two's-complement negation (lcr/lcgr), including the fused
+// negate-of-sign-extend form (lcgfr). All set the condition code in PSW.
+let Defs = [PSW] in {
+def NEG32rr : RRI<0x13,
+                  (outs GR32:$dst), (ins GR32:$src),
+                  "lcr\t{$dst, $src}",
+                  [(set GR32:$dst, (ineg GR32:$src)),
+                   (implicit PSW)]>;
+def NEG64rr : RREI<0xB903, (outs GR64:$dst), (ins GR64:$src),
+                   "lcgr\t{$dst, $src}",
+                   [(set GR64:$dst, (ineg GR64:$src)),
+                    (implicit PSW)]>;
+def NEG64rr32 : RREI<0xB913, (outs GR64:$dst), (ins GR32:$src),
+                     "lcgfr\t{$dst, $src}",
+                     [(set GR64:$dst, (ineg (sext GR32:$src))),
+                      (implicit PSW)]>;
+}
+
+// Two-address arithmetic: $dst is tied to $src1, matching the destructive
+// SystemZ encodings. NOTE: this `let` scope (and the Defs = [PSW] scope
+// below) extends past this section and is closed later in the file.
+let isTwoAddress = 1 in {
+
+let Defs = [PSW] in {
+
+// Register-register adds (ar/agr).
+let isCommutable = 1 in { // X = ADD Y, Z  == X = ADD Z, Y
+def ADD32rr : RRI<0x1A, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "ar\t{$dst, $src2}",
+                  [(set GR32:$dst, (add GR32:$src1, GR32:$src2)),
+                   (implicit PSW)]>;
+def ADD64rr : RREI<0xB908, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "agr\t{$dst, $src2}",
+                   [(set GR64:$dst, (add GR64:$src1, GR64:$src2)),
+                    (implicit PSW)]>;
+}
+
+// Register-memory adds (a/ay/ag).
+def ADD32rm   : RXI<0x5A, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+                    "a\t{$dst, $src2}",
+                    [(set GR32:$dst, (add GR32:$src1, (load rriaddr12:$src2))),
+                     (implicit PSW)]>;
+def ADD32rmy  : RXYI<0xE35A, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+                     "ay\t{$dst, $src2}",
+                     [(set GR32:$dst, (add GR32:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+def ADD64rm   : RXYI<0xE308, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+                     "ag\t{$dst, $src2}",
+                     [(set GR64:$dst, (add GR64:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+
+
+// Register-immediate adds: 16-bit (ahi/aghi) and 32-bit (afi/agfi) forms.
+def ADD32ri16 : RII<0xA7A,
+                    (outs GR32:$dst), (ins GR32:$src1, s16imm:$src2),
+                    "ahi\t{$dst, $src2}",
+                    [(set GR32:$dst, (add GR32:$src1, immSExt16:$src2)),
+                     (implicit PSW)]>;
+def ADD32ri   : RILI<0xC29,
+                     (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+                     "afi\t{$dst, $src2}",
+                     [(set GR32:$dst, (add GR32:$src1, imm:$src2)),
+                      (implicit PSW)]>;
+def ADD64ri16 : RILI<0xA7B,
+                     (outs GR64:$dst), (ins GR64:$src1, s16imm64:$src2),
+                     "aghi\t{$dst, $src2}",
+                     [(set GR64:$dst, (add GR64:$src1, immSExt16:$src2)),
+                      (implicit PSW)]>;
+def ADD64ri32 : RILI<0xC28,
+                     (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+                     "agfi\t{$dst, $src2}",
+                     [(set GR64:$dst, (add GR64:$src1, immSExt32:$src2)),
+                      (implicit PSW)]>;
+
+// Add-logical (carry-producing, matched from addc): alr/algr and the
+// alfi/algfi immediate forms.
+let isCommutable = 1 in { // X = ADC Y, Z  == X = ADC Z, Y
+def ADC32rr : RRI<0x1E, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "alr\t{$dst, $src2}",
+                  [(set GR32:$dst, (addc GR32:$src1, GR32:$src2))]>;
+def ADC64rr : RREI<0xB90A, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "algr\t{$dst, $src2}",
+                   [(set GR64:$dst, (addc GR64:$src1, GR64:$src2))]>;
+}
+
+def ADC32ri   : RILI<0xC2B,
+                     (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+                     "alfi\t{$dst, $src2}",
+                     [(set GR32:$dst, (addc GR32:$src1, imm:$src2))]>;
+def ADC64ri32 : RILI<0xC2A,
+                     (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+                     "algfi\t{$dst, $src2}",
+                     [(set GR64:$dst, (addc GR64:$src1, immSExt32:$src2))]>;
+
+// Add-with-carry (matched from adde): alcr/alcgr consume the carry in PSW.
+let Uses = [PSW] in {
+def ADDE32rr : RREI<0xB998, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                    "alcr\t{$dst, $src2}",
+                    [(set GR32:$dst, (adde GR32:$src1, GR32:$src2)),
+                     (implicit PSW)]>;
+def ADDE64rr : RREI<0xB988, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                    "alcgr\t{$dst, $src2}",
+                    [(set GR64:$dst, (adde GR64:$src1, GR64:$src2)),
+                     (implicit PSW)]>;
+}
+
+// Bitwise AND: register (nr/ngr), memory (n/ny/ng), and immediate-mask
+// forms. The n{i,ih}{ll,lh,hl,hh}/nilf/nihf immediates each mask one
+// halfword/word; the 32- and 64-bit variants of nill/nilh share opcodes
+// because they are the same instruction applied to different reg classes.
+let isCommutable = 1 in { // X = AND Y, Z  == X = AND Z, Y
+def AND32rr : RRI<0x14,
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "nr\t{$dst, $src2}",
+                  [(set GR32:$dst, (and GR32:$src1, GR32:$src2))]>;
+def AND64rr : RREI<0xB980,
+                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "ngr\t{$dst, $src2}",
+                   [(set GR64:$dst, (and GR64:$src1, GR64:$src2))]>;
+}
+
+def AND32rm   : RXI<0x54, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+                    "n\t{$dst, $src2}",
+                    [(set GR32:$dst, (and GR32:$src1, (load rriaddr12:$src2))),
+                     (implicit PSW)]>;
+def AND32rmy  : RXYI<0xE354, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+                     "ny\t{$dst, $src2}",
+                     [(set GR32:$dst, (and GR32:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+def AND64rm   : RXYI<0xE360, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+                     "ng\t{$dst, $src2}",
+                     [(set GR64:$dst, (and GR64:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+
+def AND32rill16 : RII<0xA57,
+                      (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                      "nill\t{$dst, $src2}",
+                      [(set GR32:$dst, (and GR32:$src1, i32ll16c:$src2))]>;
+def AND64rill16 : RII<0xA57,
+                      (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                      "nill\t{$dst, $src2}",
+                      [(set GR64:$dst, (and GR64:$src1, i64ll16c:$src2))]>;
+
+def AND32rilh16 : RII<0xA56,
+                      (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                      "nilh\t{$dst, $src2}",
+                      [(set GR32:$dst, (and GR32:$src1, i32lh16c:$src2))]>;
+def AND64rilh16 : RII<0xA56,
+                      (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                      "nilh\t{$dst, $src2}",
+                      [(set GR64:$dst, (and GR64:$src1, i64lh16c:$src2))]>;
+
+def AND64rihl16 : RII<0xA55,
+                      (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                      "nihl\t{$dst, $src2}",
+                      [(set GR64:$dst, (and GR64:$src1, i64hl16c:$src2))]>;
+def AND64rihh16 : RII<0xA54,
+                      (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                      "nihh\t{$dst, $src2}",
+                      [(set GR64:$dst, (and GR64:$src1, i64hh16c:$src2))]>;
+
+def AND32ri     : RILI<0xC0B,
+                       (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                       "nilf\t{$dst, $src2}",
+                       [(set GR32:$dst, (and GR32:$src1, imm:$src2))]>;
+def AND64rilo32 : RILI<0xC0B,
+                       (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                       "nilf\t{$dst, $src2}",
+                       [(set GR64:$dst, (and GR64:$src1, i64lo32c:$src2))]>;
+def AND64rihi32 : RILI<0xC0A,
+                       (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                       "nihf\t{$dst, $src2}",
+                       [(set GR64:$dst, (and GR64:$src1, i64hi32c:$src2))]>;
+
+// Bitwise OR.  Register-register forms are commutable.  Memory forms set PSW
+// (condition code).  The OILL/OILH/OIHL/OIHH immediate forms set only the
+// selected 16-bit field, so their pat-leaves require all other bits zero;
+// OILF/OIHF OR a full 32-bit immediate into the low/high word.
+let isCommutable = 1 in { // X = OR Y, Z  == X = OR Z, Y
+def OR32rr : RRI<0x16,
+                 (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                 "or\t{$dst, $src2}",
+                 [(set GR32:$dst, (or GR32:$src1, GR32:$src2))]>;
+def OR64rr : RREI<0xB981,
+                  (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                  "ogr\t{$dst, $src2}",
+                  [(set GR64:$dst, (or GR64:$src1, GR64:$src2))]>;
+}
+
+def OR32rm   : RXI<0x56, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+                   "o\t{$dst, $src2}",
+                   [(set GR32:$dst, (or GR32:$src1, (load rriaddr12:$src2))),
+                    (implicit PSW)]>;
+def OR32rmy  : RXYI<0xE356, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+                    "oy\t{$dst, $src2}",
+                    [(set GR32:$dst, (or GR32:$src1, (load rriaddr:$src2))),
+                     (implicit PSW)]>;
+def OR64rm   : RXYI<0xE381, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+                    "og\t{$dst, $src2}",
+                    [(set GR64:$dst, (or GR64:$src1, (load rriaddr:$src2))),
+                     (implicit PSW)]>;
+
+ // FIXME: Provide proper encoding!
+def OR32ri16  : RII<0xA5B,
+                    (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                    "oill\t{$dst, $src2}",
+                    [(set GR32:$dst, (or GR32:$src1, i32ll16:$src2))]>;
+def OR32ri16h : RII<0xA5A,
+                    (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                    "oilh\t{$dst, $src2}",
+                    [(set GR32:$dst, (or GR32:$src1, i32lh16:$src2))]>;
+def OR32ri : RILI<0xC0D,
+                  (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                  "oilf\t{$dst, $src2}",
+                  [(set GR32:$dst, (or GR32:$src1, imm:$src2))]>;
+
+def OR64rill16 : RII<0xA5B,
+                     (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                     "oill\t{$dst, $src2}",
+                     [(set GR64:$dst, (or GR64:$src1, i64ll16:$src2))]>;
+def OR64rilh16 : RII<0xA5A,
+                     (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                     "oilh\t{$dst, $src2}",
+                     [(set GR64:$dst, (or GR64:$src1, i64lh16:$src2))]>;
+def OR64rihl16 : RII<0xA59,
+                     (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                     "oihl\t{$dst, $src2}",
+                     [(set GR64:$dst, (or GR64:$src1, i64hl16:$src2))]>;
+def OR64rihh16 : RII<0xA58,
+                     (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                     "oihh\t{$dst, $src2}",
+                     [(set GR64:$dst, (or GR64:$src1, i64hh16:$src2))]>;
+
+def OR64rilo32 : RILI<0xC0D,
+                      (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                      "oilf\t{$dst, $src2}",
+                      [(set GR64:$dst, (or GR64:$src1, i64lo32:$src2))]>;
+def OR64rihi32 : RILI<0xC0C,
+                      (outs GR64:$dst), (ins GR64:$src1, i64imm:$src2),
+                      "oihf\t{$dst, $src2}",
+                      [(set GR64:$dst, (or GR64:$src1, i64hi32:$src2))]>;
+
+// Subtraction.  SR/SGR are the signed register forms; S/SY/SG read the
+// subtrahend from memory and set PSW.  SLR/SLGR (subtract logical) are
+// matched for subc, i.e. subtraction that produces a borrow for the
+// following SLB/SLBG (see SUBE below).
+def SUB32rr : RRI<0x1B,
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "sr\t{$dst, $src2}",
+                  [(set GR32:$dst, (sub GR32:$src1, GR32:$src2))]>;
+def SUB64rr : RREI<0xB909,
+                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "sgr\t{$dst, $src2}",
+                   [(set GR64:$dst, (sub GR64:$src1, GR64:$src2))]>;
+
+def SUB32rm   : RXI<0x5B, (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+                    "s\t{$dst, $src2}",
+                    [(set GR32:$dst, (sub GR32:$src1, (load rriaddr12:$src2))),
+                     (implicit PSW)]>;
+def SUB32rmy  : RXYI<0xE35B, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+                     "sy\t{$dst, $src2}",
+                     [(set GR32:$dst, (sub GR32:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+def SUB64rm   : RXYI<0xE309, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+                     "sg\t{$dst, $src2}",
+                     [(set GR64:$dst, (sub GR64:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+ 
+def SBC32rr : RRI<0x1F,
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "slr\t{$dst, $src2}",
+                  [(set GR32:$dst, (subc GR32:$src1, GR32:$src2))]>;
+def SBC64rr : RREI<0xB90B,
+                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "slgr\t{$dst, $src2}",
+                   [(set GR64:$dst, (subc GR64:$src1, GR64:$src2))]>;
+
+// SLFI/SLGFI - subtract logical immediate (opcodes C2,5 / C2,4).  The
+// 32-bit mnemonic is "slfi"; the original text "sllfi" is not a valid
+// z/Architecture mnemonic and would be rejected by the assembler.
+def SBC32ri   : RILI<0xC25,
+                     (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+                     "slfi\t{$dst, $src2}",
+                     [(set GR32:$dst, (subc GR32:$src1, imm:$src2))]>;
+def SBC64ri32 : RILI<0xC24,
+                     (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+                     "slgfi\t{$dst, $src2}",
+                     [(set GR64:$dst, (subc GR64:$src1, immSExt32:$src2))]>;
+
+// SLB/SLBG - subtract with borrow.  These consume the borrow recorded in
+// PSW by a preceding SLR/SLGR/SLFI (hence Uses = [PSW]) and set it again.
+let Uses = [PSW] in {
+def SUBE32rr : RREI<0xB999, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                    "slbr\t{$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, GR32:$src2)),
+                     (implicit PSW)]>;
+def SUBE64rr : RREI<0xB989, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                    "slbgr\t{$dst, $src2}",
+                    [(set GR64:$dst, (sube GR64:$src1, GR64:$src2)),
+                     (implicit PSW)]>;
+}
+
+// Bitwise XOR.  Register forms are commutable; memory forms set PSW.
+// XILF takes a full 32-bit immediate.
+let isCommutable = 1 in { // X = XOR Y, Z  == X = XOR Z, Y
+def XOR32rr : RRI<0x17,
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "xr\t{$dst, $src2}",
+                  [(set GR32:$dst, (xor GR32:$src1, GR32:$src2))]>;
+def XOR64rr : RREI<0xB982,
+                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "xgr\t{$dst, $src2}",
+                   [(set GR64:$dst, (xor GR64:$src1, GR64:$src2))]>;
+}
+
+def XOR32rm   : RXI<0x57,(outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+                    "x\t{$dst, $src2}",
+                    [(set GR32:$dst, (xor GR32:$src1, (load rriaddr12:$src2))),
+                     (implicit PSW)]>;
+def XOR32rmy  : RXYI<0xE357, (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+                     "xy\t{$dst, $src2}",
+                     [(set GR32:$dst, (xor GR32:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+def XOR64rm   : RXYI<0xE382, (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+                     "xg\t{$dst, $src2}",
+                     [(set GR64:$dst, (xor GR64:$src1, (load rriaddr:$src2))),
+                      (implicit PSW)]>;
+
+def XOR32ri : RILI<0xC07,
+                   (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                   "xilf\t{$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, imm:$src2))]>;
+
+} // Defs = [PSW]
+
+// Multiplication.  MSR/MSGR return only the low half of the product.
+// MR/MLR/MLGR produce a double-width product into an even/odd register
+// pair (GR64P/GR128); they have no ISel pattern here and are selected
+// explicitly by the mulhs/mulhu peepholes at the end of this file.
+let isCommutable = 1 in { // X = MUL Y, Z == X = MUL Z, Y
+def MUL32rr : RREI<0xB252,
+                   (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                   "msr\t{$dst, $src2}",
+                   [(set GR32:$dst, (mul GR32:$src1, GR32:$src2))]>;
+def MUL64rr : RREI<0xB90C,
+                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+                   "msgr\t{$dst, $src2}",
+                   [(set GR64:$dst, (mul GR64:$src1, GR64:$src2))]>;
+}
+
+def MUL64rrP   : RRI<0x1C,
+                     (outs GR64P:$dst), (ins GR64P:$src1, GR32:$src2),
+                     "mr\t{$dst, $src2}",
+                     []>;
+def UMUL64rrP  : RREI<0xB996,
+                      (outs GR64P:$dst), (ins GR64P:$src1, GR32:$src2),
+                      "mlr\t{$dst, $src2}",
+                      []>;
+def UMUL128rrP : RREI<0xB986,
+                      (outs GR128:$dst), (ins GR128:$src1, GR64:$src2),
+                      "mlgr\t{$dst, $src2}",
+                      []>;
+
+def MUL32ri16   : RII<0xA7C,
+                      (outs GR32:$dst), (ins GR32:$src1, s16imm:$src2),
+                      "mhi\t{$dst, $src2}",
+                      [(set GR32:$dst, (mul GR32:$src1, i32immSExt16:$src2))]>;
+def MUL64ri16   : RII<0xA7D,
+                      (outs GR64:$dst), (ins GR64:$src1, s16imm64:$src2),
+                      "mghi\t{$dst, $src2}",
+                      [(set GR64:$dst, (mul GR64:$src1, immSExt16:$src2))]>;
+
+// MSFI/MSGFI are z10-only; AddedComplexity prefers them over the 16-bit
+// forms when both could match.
+let AddedComplexity = 2 in {
+def MUL32ri     : RILI<0xC21,
+                       (outs GR32:$dst), (ins GR32:$src1, s32imm:$src2),
+                       "msfi\t{$dst, $src2}",
+                       [(set GR32:$dst, (mul GR32:$src1, imm:$src2))]>,
+                       Requires<[IsZ10]>;
+def MUL64ri32   : RILI<0xC20,
+                       (outs GR64:$dst), (ins GR64:$src1, s32imm64:$src2),
+                       "msgfi\t{$dst, $src2}",
+                       [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2))]>,
+                       Requires<[IsZ10]>;
+}
+
+def MUL32rm : RXI<0x71,
+                  (outs GR32:$dst), (ins GR32:$src1, rriaddr12:$src2),
+                  "ms\t{$dst, $src2}",
+                  [(set GR32:$dst, (mul GR32:$src1, (load rriaddr12:$src2)))]>;
+def MUL32rmy : RXYI<0xE351,
+                    (outs GR32:$dst), (ins GR32:$src1, rriaddr:$src2),
+                    "msy\t{$dst, $src2}",
+                    [(set GR32:$dst, (mul GR32:$src1, (load rriaddr:$src2)))]>;
+def MUL64rm  : RXYI<0xE30C,
+                    (outs GR64:$dst), (ins GR64:$src1, rriaddr:$src2),
+                    "msg\t{$dst, $src2}",
+                    [(set GR64:$dst, (mul GR64:$src1, (load rriaddr:$src2)))]>;
+
+def MULSX64rr32 : RREI<0xB91C,
+                       (outs GR64:$dst), (ins GR64:$src1, GR32:$src2),
+                       "msgfr\t{$dst, $src2}",
+                       [(set GR64:$dst, (mul GR64:$src1, (sext GR32:$src2)))]>;
+
+// Combined divide/remainder.  The dividend lives in a register pair and the
+// instruction writes quotient and remainder back into that pair; patterns
+// are empty because selection is done by custom code, not ISel patterns.
+def SDIVREM32r : RREI<0xB91D,
+                      (outs GR128:$dst), (ins GR128:$src1, GR32:$src2),
+                      "dsgfr\t{$dst, $src2}",
+                      []>;
+def SDIVREM64r : RREI<0xB90D,
+                      (outs GR128:$dst), (ins GR128:$src1, GR64:$src2),
+                      "dsgr\t{$dst, $src2}",
+                      []>;
+
+def UDIVREM32r : RREI<0xB997,
+                      (outs GR64P:$dst), (ins GR64P:$src1, GR32:$src2),
+                      "dlr\t{$dst, $src2}",
+                      []>;
+def UDIVREM64r : RREI<0xB987,
+                      (outs GR128:$dst), (ins GR128:$src1, GR64:$src2),
+                      "dlgr\t{$dst, $src2}",
+                      []>;
+// Memory-operand divide forms; mayLoad is needed because there is no
+// pattern from which the scheduler could infer the load.
+let mayLoad = 1 in {
+def SDIVREM32m : RXYI<0xE31D,
+                      (outs GR128:$dst), (ins GR128:$src1, rriaddr:$src2),
+                      "dsgf\t{$dst, $src2}",
+                      []>;
+def SDIVREM64m : RXYI<0xE30D,
+                      (outs GR128:$dst), (ins GR128:$src1, rriaddr:$src2),
+                      "dsg\t{$dst, $src2}",
+                      []>;
+
+def UDIVREM32m : RXYI<0xE397, (outs GR64P:$dst), (ins GR64P:$src1, rriaddr:$src2),
+                      "dl\t{$dst, $src2}",
+                      []>;
+def UDIVREM64m : RXYI<0xE387, (outs GR128:$dst), (ins GR128:$src1, rriaddr:$src2),
+                      "dlg\t{$dst, $src2}",
+                      []>;
+} // mayLoad
+} // isTwoAddress = 1
+
+//===----------------------------------------------------------------------===//
+// Shifts
+
+// Shifts and rotates.  The 32-bit RS forms (SRL/SLL/SRA) shift in place,
+// hence isTwoAddress; the 64-bit RSY forms (SRLG/SLLG/SRAG) and RLL/RLLG
+// take a separate source register and are genuinely three-address.
+// Only the arithmetic shifts set the condition code (Defs = [PSW]).
+let isTwoAddress = 1 in
+def SRL32rri : RSI<0x88,
+                   (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+                   "srl\t{$src, $amt}",
+                   [(set GR32:$dst, (srl GR32:$src, riaddr32:$amt))]>;
+def SRL64rri : RSYI<0xEB0C,
+                    (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+                    "srlg\t{$dst, $src, $amt}",
+                    [(set GR64:$dst, (srl GR64:$src, riaddr:$amt))]>;
+
+let isTwoAddress = 1 in
+def SHL32rri : RSI<0x89,
+                   (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+                   "sll\t{$src, $amt}",
+                   [(set GR32:$dst, (shl GR32:$src, riaddr32:$amt))]>;
+def SHL64rri : RSYI<0xEB0D,
+                    (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+                    "sllg\t{$dst, $src, $amt}",
+                    [(set GR64:$dst, (shl GR64:$src, riaddr:$amt))]>;
+
+let Defs = [PSW] in {
+let isTwoAddress = 1 in
+def SRA32rri : RSI<0x8A,
+                   (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+                   "sra\t{$src, $amt}",
+                   [(set GR32:$dst, (sra GR32:$src, riaddr32:$amt)),
+                    (implicit PSW)]>;
+
+def SRA64rri : RSYI<0xEB0A,
+                    (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+                    "srag\t{$dst, $src, $amt}",
+                    [(set GR64:$dst, (sra GR64:$src, riaddr:$amt)),
+                     (implicit PSW)]>;
+} // Defs = [PSW]
+
+def ROTL32rri : RSYI<0xEB1D,
+                     (outs GR32:$dst), (ins GR32:$src, riaddr32:$amt),
+                     "rll\t{$dst, $src, $amt}",
+                     [(set GR32:$dst, (rotl GR32:$src, riaddr32:$amt))]>;
+def ROTL64rri : RSYI<0xEB1C,
+                     (outs GR64:$dst), (ins GR64:$src, riaddr:$amt),
+                     "rllg\t{$dst, $src, $amt}",
+                     [(set GR64:$dst, (rotl GR64:$src, riaddr:$amt))]>;
+
+//===----------------------------------------------------------------------===//
+// Test instructions (like AND but do not produce any result)
+
+// Integer comparisons
+let Defs = [PSW] in {
+// Compares.  These produce no register result; they only set the condition
+// code in PSW.  SystemZcmp is the signed compare node, SystemZucmp the
+// unsigned (logical) one; the C*/CL* mnemonics match that split.
+def CMP32rr : RRI<0x19,
+                  (outs), (ins GR32:$src1, GR32:$src2),
+                  "cr\t$src1, $src2",
+                  [(SystemZcmp GR32:$src1, GR32:$src2),
+                   (implicit PSW)]>;
+def CMP64rr : RREI<0xB920,
+                   (outs), (ins GR64:$src1, GR64:$src2),
+                   "cgr\t$src1, $src2",
+                   [(SystemZcmp GR64:$src1, GR64:$src2),
+                    (implicit PSW)]>;
+
+def CMP32ri   : RILI<0xC2D,
+                     (outs), (ins GR32:$src1, s32imm:$src2),
+                     "cfi\t$src1, $src2",
+                     [(SystemZcmp GR32:$src1, imm:$src2),
+                      (implicit PSW)]>;
+def CMP64ri32 : RILI<0xC2C,
+                     (outs), (ins GR64:$src1, s32imm64:$src2),
+                     "cgfi\t$src1, $src2",
+                     [(SystemZcmp GR64:$src1, i64immSExt32:$src2),
+                      (implicit PSW)]>;
+
+def CMP32rm : RXI<0x59,
+                  (outs), (ins GR32:$src1, rriaddr12:$src2),
+                  "c\t$src1, $src2",
+                  [(SystemZcmp GR32:$src1, (load rriaddr12:$src2)),
+                   (implicit PSW)]>;
+def CMP32rmy : RXYI<0xE359,
+                    (outs), (ins GR32:$src1, rriaddr:$src2),
+                    "cy\t$src1, $src2",
+                    [(SystemZcmp GR32:$src1, (load rriaddr:$src2)),
+                     (implicit PSW)]>;
+def CMP64rm  : RXYI<0xE320,
+                    (outs), (ins GR64:$src1, rriaddr:$src2),
+                    "cg\t$src1, $src2",
+                    [(SystemZcmp GR64:$src1, (load rriaddr:$src2)),
+                     (implicit PSW)]>;
+
+def UCMP32rr : RRI<0x15,
+                   (outs), (ins GR32:$src1, GR32:$src2),
+                   "clr\t$src1, $src2",
+                   [(SystemZucmp GR32:$src1, GR32:$src2),
+                    (implicit PSW)]>;
+def UCMP64rr : RREI<0xB921,
+                    (outs), (ins GR64:$src1, GR64:$src2),
+                    "clgr\t$src1, $src2",
+                    [(SystemZucmp GR64:$src1, GR64:$src2),
+                     (implicit PSW)]>;
+
+def UCMP32ri   : RILI<0xC2F,
+                      (outs), (ins GR32:$src1, i32imm:$src2),
+                      "clfi\t$src1, $src2",
+                      [(SystemZucmp GR32:$src1, imm:$src2),
+                       (implicit PSW)]>;
+def UCMP64ri32 : RILI<0xC2E,
+                      (outs), (ins GR64:$src1, i64i32imm:$src2),
+                      "clgfi\t$src1, $src2",
+                      [(SystemZucmp GR64:$src1, i64immZExt32:$src2),
+                       (implicit PSW)]>;
+
+// Unsigned compares against memory operands.
+def UCMP32rm  : RXI<0x55,
+                    (outs), (ins GR32:$src1, rriaddr12:$src2),
+                    "cl\t$src1, $src2",
+                    [(SystemZucmp GR32:$src1, (load rriaddr12:$src2)),
+                     (implicit PSW)]>;
+def UCMP32rmy : RXYI<0xE355,
+                     (outs), (ins GR32:$src1, rriaddr:$src2),
+                     "cly\t$src1, $src2",
+                     [(SystemZucmp GR32:$src1, (load rriaddr:$src2)),
+                      (implicit PSW)]>;
+// CLG (COMPARE LOGICAL 64-bit) is opcode 0xE321, not 0xE351 -- 0xE351 is
+// MSY and is already used by MUL32rmy above; the duplicate encoding was a
+// typo.
+def UCMP64rm  : RXYI<0xE321,
+                     (outs), (ins GR64:$src1, rriaddr:$src2),
+                     "clg\t$src1, $src2",
+                     [(SystemZucmp GR64:$src1, (load rriaddr:$src2)),
+                      (implicit PSW)]>;
+
+// CGFR is a *signed* compare of a 64-bit register against a sign-extended
+// 32-bit register, so it must be matched with SystemZcmp; the original
+// pattern used SystemZucmp, which would mispredicate signed branches.
+// CLGFR is the unsigned counterpart and correctly uses SystemZucmp.
+def CMPSX64rr32  : RREI<0xB930,
+                        (outs), (ins GR64:$src1, GR32:$src2),
+                        "cgfr\t$src1, $src2",
+                        [(SystemZcmp GR64:$src1, (sext GR32:$src2)),
+                         (implicit PSW)]>;
+def UCMPZX64rr32 : RREI<0xB931,
+                        (outs), (ins GR64:$src1, GR32:$src2),
+                        "clgfr\t$src1, $src2",
+                        [(SystemZucmp GR64:$src1, (zext GR32:$src2)),
+                         (implicit PSW)]>;
+
+// CGF is the memory form of the signed 64x32 compare: match it with
+// SystemZcmp (the original pattern used SystemZucmp, inconsistent with the
+// sign-extending load it pairs with).  CLGF stays on SystemZucmp.
+def CMPSX64rm32   : RXYI<0xE330,
+                         (outs), (ins GR64:$src1, rriaddr:$src2),
+                         "cgf\t$src1, $src2",
+                         [(SystemZcmp GR64:$src1, (sextloadi64i32 rriaddr:$src2)),
+                          (implicit PSW)]>;
+def UCMPZX64rm32  : RXYI<0xE331,
+                         (outs), (ins GR64:$src1, rriaddr:$src2),
+                         "clgf\t$src1, $src2",
+                         [(SystemZucmp GR64:$src1, (zextloadi64i32 rriaddr:$src2)),
+                          (implicit PSW)]>;
+
+// FIXME: Add other crazy ucmp forms
+
+} // Defs = [PSW]
+
+//===----------------------------------------------------------------------===//
+// Other crazy stuff
+// FLOGR - find leftmost one.  Writes into an even/odd register pair
+// (GR128); the even register receives the bit position, which is what the
+// ctlz peephole at the end of this file extracts via subreg_even.
+let Defs = [PSW] in {
+def FLOGR64 : RREI<0xB983,
+                   (outs GR128:$dst), (ins GR64:$src),
+                   "flogr\t{$dst, $src}",
+                   []>;
+} // Defs = [PSW]
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns.
+//===----------------------------------------------------------------------===//
+
+// ConstPools, JumpTables: materialize pc-relative wrapped addresses with LA.
+def : Pat<(SystemZpcrelwrapper tjumptable:$src), (LA64rm tjumptable:$src)>;
+def : Pat<(SystemZpcrelwrapper tconstpool:$src), (LA64rm tconstpool:$src)>;
+
+// anyext: the upper 32 bits are undefined, so just place the 32-bit value
+// into the low subregister of an undefined 64-bit register.
+def : Pat<(i64 (anyext GR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_32bit)>;
+
+// calls to known symbols go through the immediate-call form.
+def : Pat<(SystemZcall (i64 tglobaladdr:$dst)), (CALLi tglobaladdr:$dst)>;
+def : Pat<(SystemZcall (i64 texternalsym:$dst)), (CALLi texternalsym:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// Peepholes.
+//===----------------------------------------------------------------------===//
+
+// FIXME: use add/sub tricks with 32768/-32768
+
+// Arbitrary immediate support: 32-bit immediates are built in a 64-bit
+// register and the low subregister is extracted.
+def : Pat<(i32 imm:$src),
+          (EXTRACT_SUBREG (MOV64ri32 (i64 imm:$src)), subreg_32bit)>;
+
+// Implement in terms of LLIHF/OILF: load the high 32 bits, then OR in the
+// low 32 bits.
+def : Pat<(i64 imm:$imm),
+          (OR64rilo32 (MOV64rihi32 (HI32 imm:$imm)), (LO32 imm:$imm))>;
+
+// trunc patterns: truncation is just a subregister extraction.
+def : Pat<(i32 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, subreg_32bit)>;
+
+// sext_inreg patterns: sign-extend the low word back over the full register.
+def : Pat<(sext_inreg GR64:$src, i32),
+          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, subreg_32bit))>;
+
+// extload patterns: anyext loads may be implemented with the zero-extending
+// loads.
+def : Pat<(extloadi32i8  rriaddr:$src), (MOVZX32rm8  rriaddr:$src)>;
+def : Pat<(extloadi32i16 rriaddr:$src), (MOVZX32rm16 rriaddr:$src)>;
+def : Pat<(extloadi64i8  rriaddr:$src), (MOVZX64rm8  rriaddr:$src)>;
+def : Pat<(extloadi64i16 rriaddr:$src), (MOVZX64rm16 rriaddr:$src)>;
+def : Pat<(extloadi64i32 rriaddr:$src), (MOVZX64rm32 rriaddr:$src)>;
+
+// muls: the high half of a widening multiply lives in the even register of
+// the result pair; the multiplicand is placed in the odd register first.
+def : Pat<(mulhs GR32:$src1, GR32:$src2),
+          (EXTRACT_SUBREG (MUL64rrP (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                                                   GR32:$src1, subreg_odd32),
+                                    GR32:$src2),
+                          subreg_even32)>;
+
+def : Pat<(mulhu GR32:$src1, GR32:$src2),
+          (EXTRACT_SUBREG (UMUL64rrP (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                                                    GR32:$src1, subreg_odd32),
+                                     GR32:$src2),
+                          subreg_even32)>;
+def : Pat<(mulhu GR64:$src1, GR64:$src2),
+          (EXTRACT_SUBREG (UMUL128rrP (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                                                     GR64:$src1, subreg_odd),
+                                      GR64:$src2),
+                          subreg_even)>;
+
+// ctlz via FLOGR: the bit position of the leftmost one ends up in the even
+// register of the FLOGR result pair.
+def : Pat<(ctlz GR64:$src),
+          (EXTRACT_SUBREG (FLOGR64 GR64:$src), subreg_even)>;
diff --git a/lib/Target/SystemZ/SystemZMCAsmInfo.cpp b/lib/Target/SystemZ/SystemZMCAsmInfo.cpp
new file mode 100644
index 0000000..1a09206
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMCAsmInfo.cpp
@@ -0,0 +1,27 @@
+//===-- SystemZMCAsmInfo.cpp - SystemZ asm properties ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the SystemZMCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+#include "llvm/MC/MCSectionELF.h"
+using namespace llvm;
+
+// Configure SystemZ assembly syntax: ".L"-prefixed private labels,
+// GNU-style "\t.weak\t" directives, and "." as the current-PC symbol.
+// Both the Target and the target-triple string are currently unused.
+// NOTE(review): StringRef is cheap to copy and is normally passed by value;
+// kept by const reference here to match the header declaration.
+SystemZMCAsmInfo::SystemZMCAsmInfo(const Target &T, const StringRef &TT) {
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  PCSymbol = ".";
+}
+
+// Return the ELF section used to mark the stack non-executable: an empty
+// ".note.GNU-stack" metadata section (SHT_PROGBITS, no flags).
+MCSection *SystemZMCAsmInfo::getNonexecutableStackSection(MCContext &Ctx) const{
+  return MCSectionELF::Create(".note.GNU-stack", MCSectionELF::SHT_PROGBITS,
+                              0, SectionKind::getMetadata(), false, Ctx);
+}
diff --git a/lib/Target/SystemZ/SystemZMCAsmInfo.h b/lib/Target/SystemZ/SystemZMCAsmInfo.h
new file mode 100644
index 0000000..00cb99b
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMCAsmInfo.h
@@ -0,0 +1,30 @@
+//====-- SystemZMCAsmInfo.h - SystemZ asm properties -----------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the SystemZMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZTARGETASMINFO_H
+#define SystemZTARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+
+  /// SystemZMCAsmInfo - Assembly dialect properties for the SystemZ target
+  /// (label prefixes, weak-reference directive, PC symbol); see the .cpp
+  /// for the values set.  MCContext/MCSection come in via MCAsmInfo.h.
+  struct SystemZMCAsmInfo : public MCAsmInfo {
+    explicit SystemZMCAsmInfo(const Target &T, const StringRef &TT);
+    // Returns the ".note.GNU-stack" section marking the stack non-executable.
+    virtual MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
+  };
+  
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
new file mode 100644
index 0000000..e47d419
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -0,0 +1,50 @@
+//=- SystemZMachineFunctionInfo.h - SystemZ machine function info -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares SystemZ-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SYSTEMZMACHINEFUNCTIONINFO_H
+#define SYSTEMZMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// SystemZMachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private SystemZ target-specific information for each MachineFunction.
+class SystemZMachineFunctionInfo : public MachineFunctionInfo {
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// LowReg - Low register of range of callee-saved registers to store.
+  unsigned LowReg;
+
+  /// HighReg - High register of range of callee-saved registers to store.
+  unsigned HighReg;
+public:
+  // Zero-initialize every member: LowReg and HighReg were previously left
+  // uninitialized, so getLowReg()/getHighReg() could return garbage if the
+  // setters had not run yet.
+  SystemZMachineFunctionInfo()
+    : CalleeSavedFrameSize(0), LowReg(0), HighReg(0) {}
+
+  SystemZMachineFunctionInfo(MachineFunction &MF)
+    : CalleeSavedFrameSize(0), LowReg(0), HighReg(0) {}
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  unsigned getLowReg() const { return LowReg; }
+  void setLowReg(unsigned Reg) { LowReg = Reg; }
+
+  unsigned getHighReg() const { return HighReg; }
+  void setHighReg(unsigned Reg) { HighReg = Reg; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td
new file mode 100644
index 0000000..156cace
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZOperands.td
@@ -0,0 +1,306 @@
+//=====- SystemZOperands.td - SystemZ Operands defs ---------*- tblgen-*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various SystemZ instruction operands.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff.
+//===----------------------------------------------------------------------===//
+
+// SystemZ specific condition code. These correspond to CondCode in
+// SystemZ.h. They must be kept in synch.
+// SystemZ specific condition code. These correspond to CondCode in
+// SystemZ.h. They must be kept in synch.
+def SYSTEMZ_COND_O   : PatLeaf<(i8 0)>;
+def SYSTEMZ_COND_H   : PatLeaf<(i8 1)>;
+def SYSTEMZ_COND_NLE : PatLeaf<(i8 2)>;
+def SYSTEMZ_COND_L   : PatLeaf<(i8 3)>;
+def SYSTEMZ_COND_NHE : PatLeaf<(i8 4)>;
+def SYSTEMZ_COND_LH  : PatLeaf<(i8 5)>;
+def SYSTEMZ_COND_NE  : PatLeaf<(i8 6)>;
+def SYSTEMZ_COND_E   : PatLeaf<(i8 7)>;
+def SYSTEMZ_COND_NLH : PatLeaf<(i8 8)>;
+def SYSTEMZ_COND_HE  : PatLeaf<(i8 9)>;
+def SYSTEMZ_COND_NL  : PatLeaf<(i8 10)>;
+def SYSTEMZ_COND_LE  : PatLeaf<(i8 11)>;
+def SYSTEMZ_COND_NH  : PatLeaf<(i8 12)>;
+def SYSTEMZ_COND_NO  : PatLeaf<(i8 13)>;
+
+// Immediate-slicing transforms.  Each one extracts a 8/16/32-bit field of
+// the matched immediate so it can be fed to the halfword/word immediate
+// instruction forms (LL = bits 0-15, LH = 16-31, HL = 32-47, HH = 48-63).
+def LO8 : SDNodeXForm<imm, [{
+  // Transformation function: return low 8 bits.
+  return getI8Imm(N->getZExtValue() & 0x00000000000000FFULL);
+}]>;
+
+def LL16 : SDNodeXForm<imm, [{
+  // Transformation function: return low 16 bits.
+  return getI16Imm(N->getZExtValue() & 0x000000000000FFFFULL);
+}]>;
+
+def LH16 : SDNodeXForm<imm, [{
+  // Transformation function: return bits 16-31.
+  return getI16Imm((N->getZExtValue() & 0x00000000FFFF0000ULL) >> 16);
+}]>;
+
+def HL16 : SDNodeXForm<imm, [{
+  // Transformation function: return bits 32-47.
+  return getI16Imm((N->getZExtValue() & 0x0000FFFF00000000ULL) >> 32);
+}]>;
+
+def HH16 : SDNodeXForm<imm, [{
+  // Transformation function: return bits 48-63.
+  return getI16Imm((N->getZExtValue() & 0xFFFF000000000000ULL) >> 48);
+}]>;
+
+def LO32 : SDNodeXForm<imm, [{
+  // Transformation function: return low 32 bits.
+  return getI32Imm(N->getZExtValue() & 0x00000000FFFFFFFFULL);
+}]>;
+
+def HI32 : SDNodeXForm<imm, [{
+  // Transformation function: return bits 32-63.
+  return getI32Imm(N->getZExtValue() >> 32);
+}]>;
+
+// 32-bit halfword masks.  The plain forms require all bits *outside* the
+// named field to be zero (OR-style masks); the "c" (complement) forms
+// require all bits outside the field to be one (AND-style masks).
+def i32ll16 : PatLeaf<(i32 imm), [{
+  // i32ll16 predicate - true if the 32-bit immediate has only rightmost 16
+  // bits set.
+  return ((N->getZExtValue() & 0x000000000000FFFFULL) == N->getZExtValue());
+}], LL16>;
+
+def i32lh16 : PatLeaf<(i32 imm), [{
+  // i32lh16 predicate - true if the 32-bit immediate has only bits 16-31 set.
+  return ((N->getZExtValue() & 0x00000000FFFF0000ULL) == N->getZExtValue());
+}], LH16>;
+
+def i32ll16c : PatLeaf<(i32 imm), [{
+  // i32ll16c predicate - true if the 32-bit immediate has all bits 16-31 set.
+  return ((N->getZExtValue() | 0x00000000FFFF0000ULL) == N->getZExtValue());
+}], LL16>;
+
+def i32lh16c : PatLeaf<(i32 imm), [{
+  // i32lh16c predicate - true if the 32-bit immediate has all rightmost 16
+  //  bits set.
+  return ((N->getZExtValue() | 0x000000000000FFFFULL) == N->getZExtValue());
+}], LH16>;
+
+// 64-bit halfword masks.  Plain forms: all bits outside the named 16-bit
+// field are zero (for the OILL/OILH/OIHL/OIHH patterns).  "c" forms: all
+// bits outside the field are one (for the NILL/NILH/NIHL/NIHH patterns).
+def i64ll16 : PatLeaf<(i64 imm), [{  
+  // i64ll16 predicate - true if the 64-bit immediate has only rightmost 16
+  // bits set.
+  return ((N->getZExtValue() & 0x000000000000FFFFULL) == N->getZExtValue());
+}], LL16>;
+
+def i64lh16 : PatLeaf<(i64 imm), [{  
+  // i64lh16 predicate - true if the 64-bit immediate has only bits 16-31 set.
+  return ((N->getZExtValue() & 0x00000000FFFF0000ULL) == N->getZExtValue());
+}], LH16>;
+
+def i64hl16 : PatLeaf<(i64 imm), [{  
+  // i64hl16 predicate - true if the 64-bit immediate has only bits 32-47 set.
+  return ((N->getZExtValue() & 0x0000FFFF00000000ULL) == N->getZExtValue());
+}], HL16>;
+
+def i64hh16 : PatLeaf<(i64 imm), [{  
+  // i64hh16 predicate - true if the 64-bit immediate has only bits 48-63 set.
+  return ((N->getZExtValue() & 0xFFFF000000000000ULL) == N->getZExtValue());
+}], HH16>;
+
+def i64ll16c : PatLeaf<(i64 imm), [{  
+  // i64ll16c predicate - true if all bits of the 64-bit immediate except
+  // (possibly) the rightmost 16 are set.
+  return ((N->getZExtValue() | 0xFFFFFFFFFFFF0000ULL) == N->getZExtValue());
+}], LL16>;
+
+def i64lh16c : PatLeaf<(i64 imm), [{  
+  // i64lh16c predicate - true if all bits of the 64-bit immediate except
+  // (possibly) bits 16-31 are set.
+  return ((N->getZExtValue() | 0xFFFFFFFF0000FFFFULL) == N->getZExtValue());
+}], LH16>;
+
+def i64hl16c : PatLeaf<(i64 imm), [{  
+  // i64hl16c predicate - true if all bits of the 64-bit immediate except
+  // (possibly) bits 32-47 are set.
+  return ((N->getZExtValue() | 0xFFFF0000FFFFFFFFULL) == N->getZExtValue());
+}], HL16>;
+
+def i64hh16c : PatLeaf<(i64 imm), [{  
+  // i64hh16c predicate - true if all bits of the 64-bit immediate except
+  // (possibly) bits 48-63 are set.
+  return ((N->getZExtValue() | 0x0000FFFFFFFFFFFFULL) == N->getZExtValue());
+}], HH16>;
+
+def immSExt16 : PatLeaf<(imm), [{
+  // immSExt16 predicate - true if the immediate fits in a 16-bit sign extended
+  // field.
+  if (N->getValueType(0) == MVT::i64) {
+    uint64_t val = N->getZExtValue();
+    return ((int64_t)val == (int16_t)val);
+  } else if (N->getValueType(0) == MVT::i32) {
+    uint32_t val = N->getZExtValue();
+    return ((int32_t)val == (int16_t)val);
+  }
+
+  return false;
+}], LL16>;
+
+def immSExt32 : PatLeaf<(i64 imm), [{
+  // immSExt32 predicate - true if the immediate fits in a 32-bit sign extended
+  // field.
+  uint64_t val = N->getZExtValue();
+  return ((int64_t)val == (int32_t)val);
+}], LO32>;
+
+// 64-bit word masks, analogous to the halfword masks above: plain forms
+// require the other word to be all zeros, "c" forms require it all ones.
+def i64lo32 : PatLeaf<(i64 imm), [{
+  // i64lo32 predicate - true if the 64-bit immediate has only rightmost 32
+  // bits set.
+  return ((N->getZExtValue() & 0x00000000FFFFFFFFULL) == N->getZExtValue());
+}], LO32>;
+
+def i64hi32 : PatLeaf<(i64 imm), [{
+  // i64hi32 predicate - true if the 64-bit immediate has only bits 32-63 set.
+  return ((N->getZExtValue() & 0xFFFFFFFF00000000ULL) == N->getZExtValue());
+}], HI32>;
+
+def i64lo32c : PatLeaf<(i64 imm), [{
+  // i64lo32c predicate - true if all bits of the 64-bit immediate except
+  // (possibly) the rightmost 32 are set.
+  return ((N->getZExtValue() | 0xFFFFFFFF00000000ULL) == N->getZExtValue());
+}], LO32>;
+
+def i64hi32c : PatLeaf<(i64 imm), [{
+  // i64hi32c predicate - true if all bits of the 64-bit immediate except
+  // (possibly) bits 32-63 are set.
+  return ((N->getZExtValue() | 0x00000000FFFFFFFFULL) == N->getZExtValue());
+}], HI32>;
+
+def i32immSExt8  : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in a 8-bit
+  // sign extended field.
+  return (int32_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}], LO8>;
+
+def i32immSExt16 : PatLeaf<(i32 imm), [{
+  // i32immSExt16 predicate - True if the 32-bit immediate fits in a 16-bit
+  // sign extended field.  (Overlaps the i32 case of immSExt16 above.)
+  return (int32_t)N->getZExtValue() == (int16_t)N->getZExtValue();
+}], LL16>;
+
+def i64immSExt32 : PatLeaf<(i64 imm), [{
+  // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // sign extended field.  (Same test as immSExt32 above.)
+  return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue();
+}], LO32>;
+
+def i64immZExt32 : PatLeaf<(i64 imm), [{
+  // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // zero extended field, i.e. the upper 32 bits are zero.
+  return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue();
+}], LO32>;
+
+// Extending loads: fragments that pin the result type of the generic
+// any/sign/zero-extending load nodes to i32 or i64.
+
+// Any-extending loads.
+def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8  node:$ptr))>;
+def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8  node:$ptr))>;
+def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+// Sign-extending loads.
+def sextloadi32i8   : PatFrag<(ops node:$ptr), (i32 (sextloadi8  node:$ptr))>;
+def sextloadi32i16  : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+def sextloadi64i8   : PatFrag<(ops node:$ptr), (i64 (sextloadi8  node:$ptr))>;
+def sextloadi64i16  : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32  : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+// Zero-extending loads.
+def zextloadi32i8   : PatFrag<(ops node:$ptr), (i32 (zextloadi8  node:$ptr))>;
+def zextloadi32i16  : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+def zextloadi64i8   : PatFrag<(ops node:$ptr), (i64 (zextloadi8  node:$ptr))>;
+def zextloadi64i16  : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32  : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+// Some more descriptive operand definitions.
+// 32-bits but only 8 bits are significant.
+def i32i8imm  : Operand<i32>;
+// 32-bits but only 16 bits are significant.
+def i32i16imm : Operand<i32>;
+// 64-bits but only 32 bits are significant.
+def i64i32imm : Operand<i64>;
+// Branch targets have OtherVT type.
+def brtarget : Operand<OtherVT>;
+
+// Unsigned i12 (printed via the asm printer's printU12ImmOperand).
+def u12imm : Operand<i32> {
+  let PrintMethod = "printU12ImmOperand";
+}
+def u12imm64 : Operand<i64> {
+  let PrintMethod = "printU12ImmOperand";
+}
+
+// Signed i16
+def s16imm : Operand<i32> {
+  let PrintMethod = "printS16ImmOperand";
+}
+def s16imm64 : Operand<i64> {
+  let PrintMethod = "printS16ImmOperand";
+}
+// Signed i20
+def s20imm : Operand<i32> {
+  let PrintMethod = "printS20ImmOperand";
+}
+def s20imm64 : Operand<i64> {
+  let PrintMethod = "printS20ImmOperand";
+}
+// Signed i32
+def s32imm : Operand<i32> {
+  let PrintMethod = "printS32ImmOperand";
+}
+def s32imm64 : Operand<i64> {
+  let PrintMethod = "printS32ImmOperand";
+}
+
+// PC-relative immediate (branch/call displacements).
+def imm_pcrel : Operand<i64> {
+  let PrintMethod = "printPCRelImmOperand";
+}
+
+//===----------------------------------------------------------------------===//
+// SystemZ Address Operand Definitions.
+//===----------------------------------------------------------------------===//
+
+// Address operands
+
+// riaddr := reg + imm
+// NOTE(review): despite the "32" suffix this operand is i64 and its
+// displacement uses the i32 u12imm - confirm the suffix refers to the
+// displacement width convention, not the address width.
+def riaddr32 : Operand<i64>,
+               ComplexPattern<i64, 2, "SelectAddrRI12Only", []> {
+  let PrintMethod = "printRIAddrOperand";
+  let MIOperandInfo = (ops ADDR64:$base, u12imm:$disp);
+}
+
+// 64-bit base + unsigned 12-bit displacement.
+def riaddr12 : Operand<i64>,
+               ComplexPattern<i64, 2, "SelectAddrRI12", []> {
+  let PrintMethod = "printRIAddrOperand";
+  let MIOperandInfo = (ops ADDR64:$base, u12imm64:$disp);
+}
+
+// 64-bit base + signed 20-bit displacement.
+def riaddr : Operand<i64>,
+             ComplexPattern<i64, 2, "SelectAddrRI", []> {
+  let PrintMethod = "printRIAddrOperand";
+  let MIOperandInfo = (ops ADDR64:$base, s20imm64:$disp);
+}
+
+//===----------------------------------------------------------------------===//
+
+// rriaddr := reg + reg + imm (base + index + displacement)
+def rriaddr12 : Operand<i64>,
+                ComplexPattern<i64, 3, "SelectAddrRRI12", [], []> {
+  let PrintMethod = "printRRIAddrOperand";
+  let MIOperandInfo = (ops ADDR64:$base, u12imm64:$disp, ADDR64:$index);
+}
+def rriaddr : Operand<i64>,
+              ComplexPattern<i64, 3, "SelectAddrRRI20", [], []> {
+  let PrintMethod = "printRRIAddrOperand";
+  let MIOperandInfo = (ops ADDR64:$base, s20imm64:$disp, ADDR64:$index);
+}
+// Load-address form: may be selected for plain add/sub/or/frameindex nodes.
+def laaddr : Operand<i64>,
+             ComplexPattern<i64, 3, "SelectLAAddr", [add, sub, or, frameindex], []> {
+  let PrintMethod = "printRRIAddrOperand";
+  let MIOperandInfo = (ops ADDR64:$base, s20imm64:$disp, ADDR64:$index);
+}
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
new file mode 100644
index 0000000..fe50c90
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -0,0 +1,345 @@
+//===- SystemZRegisterInfo.cpp - SystemZ Register Information ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "SystemZInstrInfo.h"
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+using namespace llvm;
+
+// SystemZRegisterInfo constructor - registers the call-frame setup/destroy
+// pseudo opcodes with the generated superclass and stashes references to the
+// target machine and instruction info for later use.
+SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm,
+                                         const SystemZInstrInfo &tii)
+  : SystemZGenRegisterInfo(SystemZ::ADJCALLSTACKUP, SystemZ::ADJCALLSTACKDOWN),
+    TM(tm), TII(tii) {
+}
+
+// getCalleeSavedRegs - Return a null-terminated list of callee-saved
+// registers: r6-r15 and the 64-bit FP registers f8-f15.  Presumably this
+// matches the s390x ELF ABI - verify against the ABI supplement.
+const unsigned*
+SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const unsigned CalleeSavedRegs[] = {
+    SystemZ::R6D,  SystemZ::R7D,  SystemZ::R8D,  SystemZ::R9D,
+    SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D,
+    SystemZ::R14D, SystemZ::R15D,
+    SystemZ::F8L,  SystemZ::F9L,  SystemZ::F10L, SystemZ::F11L,
+    SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L,
+    0
+  };
+
+  return CalleeSavedRegs;
+}
+
+// getCalleeSavedRegClasses - Register classes for the callee-saved list
+// above, in matching order: ten GR64 entries (r6-r15) followed by eight FP64
+// entries (f8-f15), null-terminated.
+const TargetRegisterClass* const*
+SystemZRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+    &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+    &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+    &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+    &SystemZ::GR64RegClass, &SystemZ::GR64RegClass,
+    &SystemZ::FP64RegClass, &SystemZ::FP64RegClass,
+    &SystemZ::FP64RegClass, &SystemZ::FP64RegClass,
+    &SystemZ::FP64RegClass, &SystemZ::FP64RegClass,
+    &SystemZ::FP64RegClass, &SystemZ::FP64RegClass, 0
+  };
+  return CalleeSavedRegClasses;
+}
+
+// getReservedRegs - Mark registers the allocator must not touch: r14
+// (return address) and r15 (stack pointer) always, plus r11 when it serves
+// as the frame pointer.
+BitVector SystemZRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  if (hasFP(MF))
+    Reserved.set(SystemZ::R11D);
+  Reserved.set(SystemZ::R14D);
+  Reserved.set(SystemZ::R15D);
+  return Reserved;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated
+/// frame pointer register.  This is true if the function has variable sized
+/// allocas or if frame pointer elimination is disabled.
+bool SystemZRegisterInfo::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return NoFramePointerElim || MFI->hasVarSizedObjects();
+}
+
+// eliminateCallFramePseudoInstr - Simply delete the ADJCALLSTACKUP/DOWN
+// pseudos; the call frame is reserved (see hasReservedCallFrame), so no SP
+// adjustment code is needed here.
+void SystemZRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  MBB.erase(I);
+}
+
+// getFrameIndexOffset - Compute the offset of frame index FI from the base
+// register used for frame accesses (see eliminateFrameIndex).  Accounts for
+// the local-area offset and, for fixed (incoming) objects, for the
+// callee-saved area that belongs to the caller's frame.
+int SystemZRegisterInfo::getFrameIndexOffset(const MachineFunction &MF,
+                                             int FI) const {
+  const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const SystemZMachineFunctionInfo *SystemZMFI =
+    MF.getInfo<SystemZMachineFunctionInfo>();
+  int Offset = MFI->getObjectOffset(FI) + MFI->getOffsetAdjustment();
+  uint64_t StackSize = MFI->getStackSize();
+
+  // Fixed objects are really located in the "previous" frame.
+  if (FI < 0)
+    StackSize -= SystemZMFI->getCalleeSavedFrameSize();
+
+  Offset += StackSize - TFI.getOffsetOfLocalArea();
+
+  // Skip the register save area if we generated the stack frame.
+  // (Same condition under which emitPrologue actually adjusts the SP.)
+  if (StackSize || MFI->hasCalls())
+    Offset -= TFI.getOffsetOfLocalArea();
+
+  return Offset;
+}
+
+// eliminateFrameIndex - Rewrite an abstract frame-index operand into a
+// concrete base register (r11 with a frame pointer, else r15) plus a
+// displacement, switching the opcode to a long-displacement form when the
+// offset does not fit the 12-bit unsigned field.
+unsigned
+SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                         int SPAdj, int *Value,
+                                         RegScavenger *RS) const {
+  // Call frames are reserved and the ADJCALLSTACK pseudos are just erased,
+  // so no SP adjustment should ever be pending here.
+  assert(SPAdj == 0 && "Unexpected SPAdj value");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+  // Locate the frame-index operand within the instruction.
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+
+  unsigned BasePtr = (hasFP(MF) ? SystemZ::R11D : SystemZ::R15D);
+
+  // This must be part of a rri or ri operand memory reference.  Replace the
+  // FrameIndex with base register with BasePtr.  Add an offset to the
+  // displacement field.
+  MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+  // Offset is either a 12-bit unsigned or a 20-bit signed integer.
+  // FIXME: handle "too long" displacements.
+  int Offset = getFrameIndexOffset(MF, FrameIndex) + MI.getOperand(i+1).getImm();
+
+  // Check whether displacement is too long to fit into 12 bit zext field.
+  MI.setDesc(TII.getMemoryInstr(MI.getOpcode(), Offset));
+
+  MI.getOperand(i+1).ChangeToImmediate(Offset);
+  return 0;
+}
+
+// processFunctionBeforeCalleeSavedScan - Force r14 / r15 into the "used"
+// set when the function will need them (calls, locals, dynamic allocas, or
+// use of the high FP registers), so the callee-saved scan spills them.
+void
+SystemZRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                       RegScavenger *RS) const {
+  // Determine whether R15/R14 will ever be clobbered inside the function. And
+  // if yes - mark it as 'callee' saved.
+  MachineFrameInfo *FFI = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // Check whether high FPRs are ever used, if yes - we need to save R15 as
+  // well.  (Both the 64-bit L and 32-bit S views of f8-f15 count as a use.)
+  static const unsigned HighFPRs[] = {
+    SystemZ::F8L,  SystemZ::F9L,  SystemZ::F10L, SystemZ::F11L,
+    SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L,
+    SystemZ::F8S,  SystemZ::F9S,  SystemZ::F10S, SystemZ::F11S,
+    SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S,
+  };
+
+  bool HighFPRsUsed = false;
+  for (unsigned i = 0, e = array_lengthof(HighFPRs); i != e; ++i)
+    HighFPRsUsed |= MRI.isPhysRegUsed(HighFPRs[i]);
+
+  // r14 holds the return address, clobbered by any call.
+  if (FFI->hasCalls())
+    /* FIXME: function is varargs */
+    /* FIXME: function grabs RA */
+    /* FIXME: function calls eh_return */
+    MRI.setPhysRegUsed(SystemZ::R14D);
+
+  if (HighFPRsUsed ||
+      FFI->hasCalls() ||
+      FFI->getObjectIndexEnd() != 0 || // Contains automatic variables
+      FFI->hasVarSizedObjects() // Function calls dynamic alloca's
+      /* FIXME: function is varargs */)
+    MRI.setPhysRegUsed(SystemZ::R15D);
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer (r15) by a constant value, splitting the amount into chunks
+/// that fit the chosen add-immediate form.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                  int64_t NumBytes, const TargetInstrInfo &TII) {
+  unsigned Opc; uint64_t Chunk;
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+
+  // Pick the 16-bit immediate add when the whole offset fits, else the
+  // 32-bit form.  NOTE(review): the `>=` means an offset of exactly 2^15-1
+  // takes the 32-bit form - confirm whether that boundary is intentional.
+  if (Offset >= (1LL << 15) - 1) {
+    Opc = SystemZ::ADD64ri32;
+    Chunk = (1LL << 31) - 1;
+  } else {
+    Opc = SystemZ::ADD64ri16;
+    Chunk = (1LL << 15) - 1;
+  }
+
+  DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+                 DebugLoc::getUnknownLoc());
+
+  // Emit one add per chunk until the full offset is applied.
+  while (Offset) {
+    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    MachineInstr *MI =
+      BuildMI(MBB, MBBI, DL, TII.get(Opc), SystemZ::R15D)
+      .addReg(SystemZ::R15D).addImm((isSub ? -(int64_t)ThisVal : ThisVal));
+    // The PSW implicit def is dead.
+    MI->getOperand(3).setIsDead();
+    Offset -= ThisVal;
+  }
+}
+
+// emitPrologue - Insert frame-setup code at the top of the entry block:
+// skip past the callee-saved register stores, drop the stack pointer by the
+// frame size, and (if needed) establish r11 as the frame pointer.
+void SystemZRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  SystemZMachineFunctionInfo *SystemZMFI =
+    MF.getInfo<SystemZMachineFunctionInfo>();
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  DebugLoc DL = (MBBI != MBB.end() ? MBBI->getDebugLoc() :
+                 DebugLoc::getUnknownLoc());
+
+  // Get the number of bytes to allocate from the FrameInfo.
+  // Note that area for callee-saved stuff is already allocated, thus we need to
+  // 'undo' the stack movement.
+  uint64_t StackSize = MFI->getStackSize();
+  StackSize -= SystemZMFI->getCalleeSavedFrameSize();
+
+  uint64_t NumBytes = StackSize - TFI.getOffsetOfLocalArea();
+
+  // Skip the callee-saved push instructions (single and multiple stores).
+  while (MBBI != MBB.end() &&
+         (MBBI->getOpcode() == SystemZ::MOV64mr ||
+          MBBI->getOpcode() == SystemZ::MOV64mrm))
+    ++MBBI;
+
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+
+  // adjust stack pointer: R15 -= numbytes
+  if (StackSize || MFI->hasCalls()) {
+    assert(MF.getRegInfo().isPhysRegUsed(SystemZ::R15D) &&
+           "Invalid stack frame calculation!");
+    emitSPUpdate(MBB, MBBI, -(int64_t)NumBytes, TII);
+  }
+
+  if (hasFP(MF)) {
+    // Update R11 with the new base value...
+    BuildMI(MBB, MBBI, DL, TII.get(SystemZ::MOV64rr), SystemZ::R11D)
+      .addReg(SystemZ::R15D);
+
+    // Mark the FramePtr as live-in in every block except the entry.
+    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
+         I != E; ++I)
+      I->addLiveIn(SystemZ::R11D);
+
+  }
+}
+
+// emitEpilogue - Patch the callee-saved restore instruction at the end of a
+// returning block so that its displacement accounts for the (now known)
+// stack size, emitting an extra SP adjustment if the offset overflows the
+// 20-bit signed displacement field.
+void SystemZRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  SystemZMachineFunctionInfo *SystemZMFI =
+    MF.getInfo<SystemZMachineFunctionInfo>();
+  unsigned RetOpcode = MBBI->getOpcode();
+
+  switch (RetOpcode) {
+  case SystemZ::RET: break;  // These are ok
+  default:
+    assert(0 && "Can only insert epilog into returning blocks");
+  }
+
+  // Get the number of bytes to allocate from the FrameInfo
+  // Note that area for callee-saved stuff is already allocated, thus we need to
+  // 'undo' the stack movement.
+  uint64_t StackSize =
+    MFI->getStackSize() - SystemZMFI->getCalleeSavedFrameSize();
+  uint64_t NumBytes = StackSize - TFI.getOffsetOfLocalArea();
+
+  // Walk MBBI back past the terminators to the last non-terminator
+  // instruction (expected to be the callee-saved restore).
+  while (MBBI != MBB.begin()) {
+    MachineBasicBlock::iterator PI = prior(MBBI);
+    --MBBI;
+    if (!PI->getDesc().isTerminator())
+      break;
+  }
+
+  // During callee-saved restores emission stack frame was not yet finalized
+  // (and thus - the stack size was unknown). Tune the offset with the full
+  // stack size at hand.
+  if (StackSize || MFI->hasCalls()) {
+    assert((MBBI->getOpcode() == SystemZ::MOV64rmm ||
+            MBBI->getOpcode() == SystemZ::MOV64rm) &&
+           "Expected to see callee-save register restore code");
+    assert(MF.getRegInfo().isPhysRegUsed(SystemZ::R15D) &&
+           "Invalid stack frame calculation!");
+
+    // Find the restore's immediate (displacement) operand.
+    unsigned i = 0;
+    MachineInstr &MI = *MBBI;
+    while (!MI.getOperand(i).isImm()) {
+      ++i;
+      assert(i < MI.getNumOperands() && "Unexpected restore code!");
+    }
+
+    uint64_t Offset = NumBytes + MI.getOperand(i).getImm();
+    // If Offset does not fit into 20-bit signed displacement field we need to
+    // emit some additional code...
+    if (Offset > 524287) {
+      // Fold the displacement into load instruction as much as possible.
+      NumBytes = Offset - 524287;
+      Offset = 524287;
+      emitSPUpdate(MBB, MBBI, NumBytes, TII);
+    }
+
+    MI.getOperand(i).ChangeToImmediate(Offset);
+  }
+}
+
+// The debug-info and exception-handling queries below are unimplemented
+// stubs: each asserts in debug builds and returns a dummy value otherwise.
+
+unsigned SystemZRegisterInfo::getRARegister() const {
+  assert(0 && "What is the return address register");
+  return 0;
+}
+
+unsigned
+SystemZRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  assert(0 && "What is the frame register");
+  return 0;
+}
+
+unsigned SystemZRegisterInfo::getEHExceptionRegister() const {
+  assert(0 && "What is the exception register");
+  return 0;
+}
+
+unsigned SystemZRegisterInfo::getEHHandlerRegister() const {
+  assert(0 && "What is the exception handler register");
+  return 0;
+}
+
+int SystemZRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  assert(0 && "What is the dwarf register number");
+  return -1;
+}
+
+#include "SystemZGenRegisterInfo.inc"
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
new file mode 100644
index 0000000..fabd4e8
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -0,0 +1,82 @@
+//===- SystemZRegisterInfo.h - SystemZ Register Information Impl -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SystemZ implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SystemZREGISTERINFO_H
+#define SystemZREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "SystemZGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+namespace SystemZ {
+  /// SubregIndex - The index of various sized subregister classes. Note that
+  /// these indices must be kept in sync with the class indices in the
+  /// SystemZRegisterInfo.td file.
+  /// NOTE(review): SUBREG_32BIT and SUBREG_EVEN share the value 1, while the
+  /// .td file also defines even/odd 64-bit halves as indices 3/4 - confirm
+  /// these enumerators cover all SubRegSet indices actually used.
+  enum SubregIndex {
+    SUBREG_32BIT = 1, SUBREG_EVEN = 1, SUBREG_ODD = 2
+  };
+}
+
+class SystemZSubtarget;
+class SystemZInstrInfo;
+class Type;
+
+/// SystemZRegisterInfo - SystemZ implementation of TargetRegisterInfo.
+/// Handles callee-saved register lists, reserved registers, frame-index
+/// elimination, and prologue/epilogue emission.
+struct SystemZRegisterInfo : public SystemZGenRegisterInfo {
+  SystemZTargetMachine &TM;    // Owning target machine.
+  const SystemZInstrInfo &TII; // Instruction info used when building MIs.
+
+  SystemZRegisterInfo(SystemZTargetMachine &tm, const SystemZInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  // The call frame is always reserved, so ADJCALLSTACK pseudos are erased
+  // without emitting any SP adjustment (see the .cpp implementation).
+  bool hasReservedCallFrame(MachineFunction &MF) const { return true; }
+  bool hasFP(const MachineFunction &MF) const;
+
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.  (Currently unimplemented stubs.)
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+
+  // Exception handling queries.  (Currently unimplemented stubs.)
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td
new file mode 100644
index 0000000..8795847
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -0,0 +1,490 @@
+//===- SystemZRegisterInfo.td - The SystemZ Register File -----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+// Base class for all SystemZ registers; places them in the SystemZ namespace.
+class SystemZReg<string n> : Register<n> {
+  let Namespace = "SystemZ";
+}
+
+// Base class for SystemZ registers that contain subregisters.
+class SystemZRegWithSubregs<string n, list<Register> subregs>
+  : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "SystemZ";
+}
+
+// We identify all our registers with a 4-bit ID, for consistency's sake.
+
+// GPR32 - Lower 32 bits of one of the 16 64-bit general-purpose registers
+class GPR32<bits<4> num, string n> : SystemZReg<n> {
+  field bits<4> Num = num;
+}
+
+// GPR64 - One of the 16 64-bit general-purpose registers
+class GPR64<bits<4> num, string n, list<Register> subregs,
+            list<Register> aliases = []>
+ : SystemZRegWithSubregs<n, subregs> {
+  field bits<4> Num = num;
+  let Aliases = aliases;
+}
+
+// GPR128 - 8 even-odd register pairs
+class GPR128<bits<4> num, string n, list<Register> subregs,
+             list<Register> aliases = []>
+ : SystemZRegWithSubregs<n, subregs> {
+  field bits<4> Num = num;
+  let Aliases = aliases;
+}
+
+// FPRS - Lower 32 bits of one of the 16 64-bit floating-point registers
+class FPRS<bits<4> num, string n> : SystemZReg<n> {
+  field bits<4> Num = num;
+}
+
+// FPRL - One of the 16 64-bit floating-point registers
+class FPRL<bits<4> num, string n, list<Register> subregs>
+ : SystemZRegWithSubregs<n, subregs> {
+  field bits<4> Num = num;
+}
+
+// General-purpose registers
+// RnW - 32-bit (low-word) views of r0-r15.
+def R0W  : GPR32< 0,  "r0">, DwarfRegNum<[0]>;
+def R1W  : GPR32< 1,  "r1">, DwarfRegNum<[1]>;
+def R2W  : GPR32< 2,  "r2">, DwarfRegNum<[2]>;
+def R3W  : GPR32< 3,  "r3">, DwarfRegNum<[3]>;
+def R4W  : GPR32< 4,  "r4">, DwarfRegNum<[4]>;
+def R5W  : GPR32< 5,  "r5">, DwarfRegNum<[5]>;
+def R6W  : GPR32< 6,  "r6">, DwarfRegNum<[6]>;
+def R7W  : GPR32< 7,  "r7">, DwarfRegNum<[7]>;
+def R8W  : GPR32< 8,  "r8">, DwarfRegNum<[8]>;
+def R9W  : GPR32< 9,  "r9">, DwarfRegNum<[9]>;
+def R10W : GPR32<10, "r10">, DwarfRegNum<[10]>;
+def R11W : GPR32<11, "r11">, DwarfRegNum<[11]>;
+def R12W : GPR32<12, "r12">, DwarfRegNum<[12]>;
+def R13W : GPR32<13, "r13">, DwarfRegNum<[13]>;
+def R14W : GPR32<14, "r14">, DwarfRegNum<[14]>;
+def R15W : GPR32<15, "r15">, DwarfRegNum<[15]>;
+
+// RnD - full 64-bit registers, each containing the matching W register.
+def R0D  : GPR64< 0,  "r0", [R0W]>,  DwarfRegNum<[0]>;
+def R1D  : GPR64< 1,  "r1", [R1W]>,  DwarfRegNum<[1]>;
+def R2D  : GPR64< 2,  "r2", [R2W]>,  DwarfRegNum<[2]>;
+def R3D  : GPR64< 3,  "r3", [R3W]>,  DwarfRegNum<[3]>;
+def R4D  : GPR64< 4,  "r4", [R4W]>,  DwarfRegNum<[4]>;
+def R5D  : GPR64< 5,  "r5", [R5W]>,  DwarfRegNum<[5]>;
+def R6D  : GPR64< 6,  "r6", [R6W]>,  DwarfRegNum<[6]>;
+def R7D  : GPR64< 7,  "r7", [R7W]>,  DwarfRegNum<[7]>;
+def R8D  : GPR64< 8,  "r8", [R8W]>,  DwarfRegNum<[8]>;
+def R9D  : GPR64< 9,  "r9", [R9W]>,  DwarfRegNum<[9]>;
+def R10D : GPR64<10, "r10", [R10W]>, DwarfRegNum<[10]>;
+def R11D : GPR64<11, "r11", [R11W]>, DwarfRegNum<[11]>;
+def R12D : GPR64<12, "r12", [R12W]>, DwarfRegNum<[12]>;
+def R13D : GPR64<13, "r13", [R13W]>, DwarfRegNum<[13]>;
+def R14D : GPR64<14, "r14", [R14W]>, DwarfRegNum<[14]>;
+def R15D : GPR64<15, "r15", [R15W]>, DwarfRegNum<[15]>;
+
+// Register pairs
+// RnP - even/odd 32-bit pairs, aliasing the corresponding 64-bit registers.
+def R0P  : GPR64< 0,  "r0", [R0W,  R1W],  [R0D,  R1D]>,  DwarfRegNum<[0]>;
+def R2P  : GPR64< 2,  "r2", [R2W,  R3W],  [R2D,  R3D]>,  DwarfRegNum<[2]>;
+def R4P  : GPR64< 4,  "r4", [R4W,  R5W],  [R4D,  R5D]>,  DwarfRegNum<[4]>;
+def R6P  : GPR64< 6,  "r6", [R6W,  R7W],  [R6D,  R7D]>,  DwarfRegNum<[6]>;
+def R8P  : GPR64< 8,  "r8", [R8W,  R9W],  [R8D,  R9D]>,  DwarfRegNum<[8]>;
+def R10P : GPR64<10, "r10", [R10W, R11W], [R10D, R11D]>, DwarfRegNum<[10]>;
+def R12P : GPR64<12, "r12", [R12W, R13W], [R12D, R13D]>, DwarfRegNum<[12]>;
+def R14P : GPR64<14, "r14", [R14W, R15W], [R14D, R15D]>, DwarfRegNum<[14]>;
+
+// RnQ - even/odd 64-bit pairs forming 128-bit registers.
+def R0Q  : GPR128< 0,  "r0", [R0D,  R1D],  [R0P]>,  DwarfRegNum<[0]>;
+def R2Q  : GPR128< 2,  "r2", [R2D,  R3D],  [R2P]>,  DwarfRegNum<[2]>;
+def R4Q  : GPR128< 4,  "r4", [R4D,  R5D],  [R4P]>,  DwarfRegNum<[4]>;
+def R6Q  : GPR128< 6,  "r6", [R6D,  R7D],  [R6P]>,  DwarfRegNum<[6]>;
+def R8Q  : GPR128< 8,  "r8", [R8D,  R9D],  [R8P]>,  DwarfRegNum<[8]>;
+def R10Q : GPR128<10, "r10", [R10D, R11D], [R10P]>, DwarfRegNum<[10]>;
+def R12Q : GPR128<12, "r12", [R12D, R13D], [R12P]>, DwarfRegNum<[12]>;
+def R14Q : GPR128<14, "r14", [R14D, R15D], [R14P]>, DwarfRegNum<[14]>;
+
+// Floating-point registers
+// FnS - 32-bit (single) views of f0-f15.
+def F0S  : FPRS< 0,  "f0">, DwarfRegNum<[16]>;
+def F1S  : FPRS< 1,  "f1">, DwarfRegNum<[17]>;
+def F2S  : FPRS< 2,  "f2">, DwarfRegNum<[18]>;
+def F3S  : FPRS< 3,  "f3">, DwarfRegNum<[19]>;
+def F4S  : FPRS< 4,  "f4">, DwarfRegNum<[20]>;
+def F5S  : FPRS< 5,  "f5">, DwarfRegNum<[21]>;
+def F6S  : FPRS< 6,  "f6">, DwarfRegNum<[22]>;
+def F7S  : FPRS< 7,  "f7">, DwarfRegNum<[23]>;
+def F8S  : FPRS< 8,  "f8">, DwarfRegNum<[24]>;
+def F9S  : FPRS< 9,  "f9">, DwarfRegNum<[25]>;
+def F10S : FPRS<10, "f10">, DwarfRegNum<[26]>;
+def F11S : FPRS<11, "f11">, DwarfRegNum<[27]>;
+def F12S : FPRS<12, "f12">, DwarfRegNum<[28]>;
+def F13S : FPRS<13, "f13">, DwarfRegNum<[29]>;
+def F14S : FPRS<14, "f14">, DwarfRegNum<[30]>;
+def F15S : FPRS<15, "f15">, DwarfRegNum<[31]>;
+
+// FnL - 64-bit (long) registers, each containing the matching S register.
+def F0L  : FPRL< 0,  "f0", [F0S]>,  DwarfRegNum<[16]>;
+def F1L  : FPRL< 1,  "f1", [F1S]>,  DwarfRegNum<[17]>;
+def F2L  : FPRL< 2,  "f2", [F2S]>,  DwarfRegNum<[18]>;
+def F3L  : FPRL< 3,  "f3", [F3S]>,  DwarfRegNum<[19]>;
+def F4L  : FPRL< 4,  "f4", [F4S]>,  DwarfRegNum<[20]>;
+def F5L  : FPRL< 5,  "f5", [F5S]>,  DwarfRegNum<[21]>;
+def F6L  : FPRL< 6,  "f6", [F6S]>,  DwarfRegNum<[22]>;
+def F7L  : FPRL< 7,  "f7", [F7S]>,  DwarfRegNum<[23]>;
+def F8L  : FPRL< 8,  "f8", [F8S]>,  DwarfRegNum<[24]>;
+def F9L  : FPRL< 9,  "f9", [F9S]>,  DwarfRegNum<[25]>;
+def F10L : FPRL<10, "f10", [F10S]>, DwarfRegNum<[26]>;
+def F11L : FPRL<11, "f11", [F11S]>, DwarfRegNum<[27]>;
+def F12L : FPRL<12, "f12", [F12S]>, DwarfRegNum<[28]>;
+def F13L : FPRL<13, "f13", [F13S]>, DwarfRegNum<[29]>;
+def F14L : FPRL<14, "f14", [F14S]>, DwarfRegNum<[30]>;
+def F15L : FPRL<15, "f15", [F15S]>, DwarfRegNum<[31]>;
+
+// Status register (program status word; condition code lives here).
+def PSW : SystemZReg<"psw">;
+
+// Subregister index constants used in patterns.  NOTE(review): subreg_32bit
+// and subreg_even32 share index 1, and the header's SystemZ::SubregIndex
+// enum only defines values 1 and 2 - confirm both stay in sync with the
+// SubRegSet indices below.
+def subreg_32bit  : PatLeaf<(i32 1)>;
+def subreg_even32 : PatLeaf<(i32 1)>;
+def subreg_odd32  : PatLeaf<(i32 2)>;
+def subreg_even   : PatLeaf<(i32 3)>;
+def subreg_odd    : PatLeaf<(i32 4)>;
+
+// Index 1: 64-bit GPR -> low 32-bit W register.
+def : SubRegSet<1, [R0D, R1D,  R2D,  R3D,  R4D,  R5D,  R6D,  R7D,
+                    R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+                   [R0W, R1W,  R2W,  R3W,  R4W,  R5W,  R6W,  R7W,
+                    R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+// Indices 3/4: 128-bit Q register -> even/odd 64-bit halves.
+def : SubRegSet<3, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+                   [R0D, R2D, R4D, R6D, R8D, R10D, R12D, R14D]>;
+
+def : SubRegSet<4, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+                   [R1D, R3D, R5D, R7D, R9D, R11D, R13D, R15D]>;
+
+// Indices 1/2: 64-bit pair P register -> even/odd 32-bit halves.
+def : SubRegSet<1, [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P],
+                   [R0W, R2W, R4W, R6W, R8W, R10W, R12W, R14W]>;
+
+def : SubRegSet<2, [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P],
+                   [R1W, R3W, R5W, R7W, R9W, R11W, R13W, R15W]>;
+
+// Indices 1/2: 128-bit Q register -> even/odd 32-bit quarters.
+def : SubRegSet<1, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+                   [R0W, R2W, R4W, R6W, R8W, R10W, R12W, R14W]>;
+
+def : SubRegSet<2, [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q],
+                   [R1W, R3W, R5W, R7W, R9W, R11W, R13W, R15W]>;
+
+/// Register classes
+// GR32 - all 32-bit GPR views.  The custom allocation order excludes R15W
+// (stack pointer) entirely and drops R11W when it is the frame pointer.
+def GR32 : RegisterClass<"SystemZ", [i32], 32,
+   // Volatile registers
+  [R0W, R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
+   // Frame pointer, sometimes allocable
+   R11W,
+   // Volatile, but not allocable
+   R14W, R15W]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // Preferred order when R11W is available for allocation.
+    static const unsigned SystemZ_REG32[] = {
+      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
+      SystemZ::R5W,  SystemZ::R0W,  SystemZ::R12W, SystemZ::R11W,
+      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
+      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
+    };
+    // Order used when R11W is reserved as the frame pointer ("nofp" here
+    // means "frame pointer not available for allocation").
+    static const unsigned SystemZ_REG32_nofp[] = {
+      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
+      SystemZ::R5W,  SystemZ::R0W,  SystemZ::R12W, /* No R11W */
+      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
+      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
+    };
+    GR32Class::iterator
+    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG32_nofp;
+      else
+        return SystemZ_REG32;
+    }
+    GR32Class::iterator
+    GR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG32_nofp + (sizeof(SystemZ_REG32_nofp) / sizeof(unsigned));
+      else
+        return SystemZ_REG32 + (sizeof(SystemZ_REG32) / sizeof(unsigned));
+    }
+  }];
+}
+
+/// Registers used to generate address. Everything except R0.
+/// Registers used to generate address. Everything except R0
+/// (R0 as a base/index presumably means "no register" in SystemZ address
+/// encodings - verify against the ISA).  Like GR32, the allocation order
+/// excludes R15W and drops R11W when it is the frame pointer.
+def ADDR32 : RegisterClass<"SystemZ", [i32], 32,
+   // Volatile registers
+  [R1W, R2W, R3W, R4W, R5W, R6W, R7W, R8W, R9W, R10W, R12W, R13W,
+   // Frame pointer, sometimes allocable
+   R11W,
+   // Volatile, but not allocable
+   R14W, R15W]>
+{
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned SystemZ_ADDR32[] = {
+      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
+      SystemZ::R5W,  /* No R0W */   SystemZ::R12W, SystemZ::R11W,
+      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
+      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
+    };
+    static const unsigned SystemZ_ADDR32_nofp[] = {
+      SystemZ::R1W,  SystemZ::R2W,  SystemZ::R3W,  SystemZ::R4W,
+      SystemZ::R5W,  /* No R0W */   SystemZ::R12W, /* No R11W */
+      SystemZ::R10W, SystemZ::R9W,  SystemZ::R8W,  SystemZ::R7W,
+      SystemZ::R6W,  SystemZ::R14W, SystemZ::R13W
+    };
+    ADDR32Class::iterator
+    ADDR32Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_ADDR32_nofp;
+      else
+        return SystemZ_ADDR32;
+    }
+    ADDR32Class::iterator
+    ADDR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_ADDR32_nofp + (sizeof(SystemZ_ADDR32_nofp) / sizeof(unsigned));
+      else
+        return SystemZ_ADDR32 + (sizeof(SystemZ_ADDR32) / sizeof(unsigned));
+    }
+  }];
+}
+
+def GR64 : RegisterClass<"SystemZ", [i64], 64,
+   // Volatile registers
+  [R0D, R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
+   // Frame pointer, sometimes allocable
+   R11D,
+   // Volatile, but not allocable
+   R14D, R15D]>
+{
+  let SubRegClassList = [GR32];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned SystemZ_REG64[] = {
+      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
+      SystemZ::R5D,  SystemZ::R0D,  SystemZ::R12D, SystemZ::R11D,
+      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
+      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
+    };
+    static const unsigned SystemZ_REG64_nofp[] = {
+      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
+      SystemZ::R5D,  SystemZ::R0D,  SystemZ::R12D, /* No R11D */
+      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
+      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
+    };
+    GR64Class::iterator
+    GR64Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG64_nofp;
+      else
+        return SystemZ_REG64;
+    }
+    GR64Class::iterator
+    GR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG64_nofp + (sizeof(SystemZ_REG64_nofp) / sizeof(unsigned));
+      else
+        return SystemZ_REG64 + (sizeof(SystemZ_REG64) / sizeof(unsigned));
+    }
+  }];
+}
+
+def ADDR64 : RegisterClass<"SystemZ", [i64], 64,
+   // Volatile registers
+  [R1D, R2D, R3D, R4D, R5D, R6D, R7D, R8D, R9D, R10D, R12D, R13D,
+   // Frame pointer, sometimes allocable
+   R11D,
+   // Volatile, but not allocable
+   R14D, R15D]>
+{
+  let SubRegClassList = [ADDR32];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned SystemZ_ADDR64[] = {
+      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
+      SystemZ::R5D,  /* No R0D */   SystemZ::R12D, SystemZ::R11D,
+      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
+      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
+    };
+    static const unsigned SystemZ_ADDR64_nofp[] = {
+      SystemZ::R1D,  SystemZ::R2D,  SystemZ::R3D,  SystemZ::R4D,
+      SystemZ::R5D,  /* No R0D */   SystemZ::R12D, /* No R11D */
+      SystemZ::R10D, SystemZ::R9D,  SystemZ::R8D,  SystemZ::R7D,
+      SystemZ::R6D,  SystemZ::R14D, SystemZ::R13D
+    };
+    ADDR64Class::iterator
+    ADDR64Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_ADDR64_nofp;
+      else
+        return SystemZ_ADDR64;
+    }
+    ADDR64Class::iterator
+    ADDR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_ADDR64_nofp + (sizeof(SystemZ_ADDR64_nofp) / sizeof(unsigned));
+      else
+        return SystemZ_ADDR64 + (sizeof(SystemZ_ADDR64) / sizeof(unsigned));
+    }
+  }];
+}
+
+// Even-odd register pairs
+def GR64P : RegisterClass<"SystemZ", [v2i32], 64,
+  [R0P, R2P, R4P, R6P, R8P, R10P, R12P, R14P]>
+{
+  let SubRegClassList = [GR32, GR32];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned SystemZ_REG64P[] = {
+      SystemZ::R0P,  SystemZ::R2P,  SystemZ::R4P, SystemZ::R10P,
+      SystemZ::R8P,  SystemZ::R6P };
+    static const unsigned SystemZ_REG64P_nofp[] = {
+      SystemZ::R0P,  SystemZ::R2P,  SystemZ::R4P, /* NO R10P */
+      SystemZ::R8P,  SystemZ::R6P };
+    GR64PClass::iterator
+    GR64PClass::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG64P_nofp;
+      else
+        return SystemZ_REG64P;
+    }
+    GR64PClass::iterator
+    GR64PClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG64P_nofp + (sizeof(SystemZ_REG64P_nofp) / sizeof(unsigned));
+      else
+        return SystemZ_REG64P + (sizeof(SystemZ_REG64P) / sizeof(unsigned));
+    }
+  }];
+}
+
+def GR128 : RegisterClass<"SystemZ", [v2i64], 128,
+  [R0Q, R2Q, R4Q, R6Q, R8Q, R10Q, R12Q, R14Q]>
+{
+  let SubRegClassList = [GR32, GR32, GR64, GR64];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned SystemZ_REG128[] = {
+      SystemZ::R0Q,  SystemZ::R2Q,  SystemZ::R4Q,  SystemZ::R10Q,
+      SystemZ::R8Q,  SystemZ::R6Q };
+    static const unsigned SystemZ_REG128_nofp[] = {
+      SystemZ::R0Q,  SystemZ::R2Q,  SystemZ::R4Q, /* NO R10Q */
+      SystemZ::R8Q,  SystemZ::R6Q };
+    GR128Class::iterator
+    GR128Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG128_nofp;
+      else
+        return SystemZ_REG128;
+    }
+    GR128Class::iterator
+    GR128Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return SystemZ_REG128_nofp + (sizeof(SystemZ_REG128_nofp) / sizeof(unsigned));
+      else
+        return SystemZ_REG128 + (sizeof(SystemZ_REG128) / sizeof(unsigned));
+    }
+  }];
+}
+
+def FP32 : RegisterClass<"SystemZ", [f32], 32,
+ [F0S, F1S,  F2S,  F3S,  F4S,  F5S,  F6S,  F7S,
+  F8S, F9S, F10S, F11S, F12S, F13S, F14S, F15S]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned SystemZ_REGFP32[] = {
+      SystemZ::F0S,  SystemZ::F2S,  SystemZ::F4S,  SystemZ::F6S,
+      SystemZ::F1S,  SystemZ::F3S,  SystemZ::F5S,  SystemZ::F7S,
+      SystemZ::F8S,  SystemZ::F9S,  SystemZ::F10S, SystemZ::F11S,
+      SystemZ::F12S, SystemZ::F13S, SystemZ::F14S, SystemZ::F15S };
+    FP32Class::iterator
+    FP32Class::allocation_order_begin(const MachineFunction &MF) const {
+      return SystemZ_REGFP32;
+    }
+    FP32Class::iterator
+    FP32Class::allocation_order_end(const MachineFunction &MF) const {
+      return SystemZ_REGFP32 + (sizeof(SystemZ_REGFP32) / sizeof(unsigned));
+    }
+  }];
+}
+
+def FP64 : RegisterClass<"SystemZ", [f64], 64,
+ [F0L, F1L,  F2L,  F3L,  F4L,  F5L,  F6L,  F7L, 
+  F8L, F9L, F10L, F11L, F12L, F13L, F14L, F15L]> {
+  let SubRegClassList = [FP32];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned SystemZ_REGFP64[] = {
+      SystemZ::F0L,  SystemZ::F2L,  SystemZ::F4L,  SystemZ::F6L,
+      SystemZ::F1L,  SystemZ::F3L,  SystemZ::F5L,  SystemZ::F7L,
+      SystemZ::F8L,  SystemZ::F9L,  SystemZ::F10L, SystemZ::F11L,
+      SystemZ::F12L, SystemZ::F13L, SystemZ::F14L, SystemZ::F15L };
+    FP64Class::iterator
+    FP64Class::allocation_order_begin(const MachineFunction &MF) const {
+      return SystemZ_REGFP64;
+    }
+    FP64Class::iterator
+    FP64Class::allocation_order_end(const MachineFunction &MF) const {
+      return SystemZ_REGFP64 + (sizeof(SystemZ_REGFP64) / sizeof(unsigned));
+    }
+  }];
+}
+
+// Status flags registers.
+def CCR : RegisterClass<"SystemZ", [i64], 64, [PSW]> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
new file mode 100644
index 0000000..a8b5e1f
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -0,0 +1,47 @@
+//===- SystemZSubtarget.cpp - SystemZ Subtarget Information -------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SystemZ specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZSubtarget.h"
+#include "SystemZ.h"
+#include "SystemZGenSubtarget.inc"
+#include "llvm/GlobalValue.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+SystemZSubtarget::SystemZSubtarget(const std::string &TT, 
+                                   const std::string &FS):
+  HasZ10Insts(false) {
+  std::string CPU = "z9";
+
+  // Parse features string.
+  ParseSubtargetFeatures(FS, CPU);
+}
+
+/// True if accessing the GV requires an extra load.
+bool SystemZSubtarget::GVRequiresExtraLoad(const GlobalValue* GV,
+                                           const TargetMachine& TM,
+                                           bool isDirectCall) const {
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // An extra load is needed for all externally visible globals.
+    if (isDirectCall)
+      return false;
+
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return false;
+
+    return true;
+  }
+
+  return false;
+}
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
new file mode 100644
index 0000000..405d6e9
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -0,0 +1,45 @@
+//==-- SystemZSubtarget.h - Define Subtarget for the SystemZ ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_SystemZ_SUBTARGET_H
+#define LLVM_TARGET_SystemZ_SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+
+#include <string>
+
+namespace llvm {
+class GlobalValue;
+class TargetMachine;
+
+class SystemZSubtarget : public TargetSubtarget {
+  bool HasZ10Insts;
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  SystemZSubtarget(const std::string &TT, const std::string &FS);
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+
+  bool isZ10() const { return HasZ10Insts; }
+
+  bool GVRequiresExtraLoad(const GlobalValue* GV, const TargetMachine& TM,
+                           bool isDirectCall) const;
+};
+} // End llvm namespace
+
+#endif  // LLVM_TARGET_SystemZ_SUBTARGET_H
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
new file mode 100644
index 0000000..dfa26a1
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -0,0 +1,44 @@
+//===-- SystemZTargetMachine.cpp - Define TargetMachine for SystemZ -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMCAsmInfo.h"
+#include "SystemZTargetMachine.h"
+#include "SystemZ.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+extern "C" void LLVMInitializeSystemZTarget() {
+  // Register the target.
+  RegisterTargetMachine<SystemZTargetMachine> X(TheSystemZTarget);
+  RegisterAsmInfo<SystemZMCAsmInfo> Y(TheSystemZTarget);
+}
+
+/// SystemZTargetMachine ctor - Create an LP64 architecture model
+///
+SystemZTargetMachine::SystemZTargetMachine(const Target &T,
+                                           const std::string &TT,
+                                           const std::string &FS)
+  : LLVMTargetMachine(T, TT),
+    Subtarget(TT, FS),
+    DataLayout("E-p:64:64:64-i8:8:16-i16:16:16-i32:32:32-i64:64:64-f32:32:32"
+               "-f64:64:64-f128:128:128-a0:16:16-n32:64"),
+    InstrInfo(*this), TLInfo(*this),
+    FrameInfo(TargetFrameInfo::StackGrowsDown, 8, -160) {
+
+  if (getRelocationModel() == Reloc::Default)
+    setRelocationModel(Reloc::Static);
+}
+
+bool SystemZTargetMachine::addInstSelector(PassManagerBase &PM,
+                                          CodeGenOpt::Level OptLevel) {
+  // Install an instruction selector.
+  PM.add(createSystemZISelDag(*this, OptLevel));
+  return false;
+}
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.h b/lib/Target/SystemZ/SystemZTargetMachine.h
new file mode 100644
index 0000000..551aeb5
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -0,0 +1,61 @@
+//==- SystemZTargetMachine.h - Define TargetMachine for SystemZ ---*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SystemZ specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_TARGET_SYSTEMZ_TARGETMACHINE_H
+#define LLVM_TARGET_SYSTEMZ_TARGETMACHINE_H
+
+#include "SystemZInstrInfo.h"
+#include "SystemZISelLowering.h"
+#include "SystemZRegisterInfo.h"
+#include "SystemZSubtarget.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+/// SystemZTargetMachine
+///
+class SystemZTargetMachine : public LLVMTargetMachine {
+  SystemZSubtarget        Subtarget;
+  const TargetData        DataLayout;       // Calculates type size & alignment
+  SystemZInstrInfo        InstrInfo;
+  SystemZTargetLowering   TLInfo;
+
+  // SystemZ has no target-specific call frame layout, so a plain
+  // TargetFrameInfo is used instead of a SystemZ-specific subclass.
+  TargetFrameInfo       FrameInfo;
+public:
+  SystemZTargetMachine(const Target &T, const std::string &TT,
+                       const std::string &FS);
+
+  virtual const TargetFrameInfo *getFrameInfo() const { return &FrameInfo; }
+  virtual const SystemZInstrInfo *getInstrInfo() const  { return &InstrInfo; }
+  virtual const TargetData *getTargetData() const     { return &DataLayout;}
+  virtual const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
+
+  virtual const SystemZRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+
+  virtual SystemZTargetLowering *getTargetLowering() const {
+    return const_cast<SystemZTargetLowering*>(&TLInfo);
+  }
+
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+}; // SystemZTargetMachine.
+
+} // end namespace llvm
+
+#endif // LLVM_TARGET_SYSTEMZ_TARGETMACHINE_H
diff --git a/lib/Target/SystemZ/TargetInfo/CMakeLists.txt b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..743d8d3
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMSystemZInfo
+  SystemZTargetInfo.cpp
+  )
+
+add_dependencies(LLVMSystemZInfo SystemZCodeGenTable_gen)
diff --git a/lib/Target/SystemZ/TargetInfo/Makefile b/lib/Target/SystemZ/TargetInfo/Makefile
new file mode 100644
index 0000000..0be80eb
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/SystemZ/TargetInfo/Makefile --------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMSystemZInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
new file mode 100644
index 0000000..8272b11
--- /dev/null
+++ b/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- SystemZTargetInfo.cpp - SystemZ Target Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZ.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheSystemZTarget;
+
+extern "C" void LLVMInitializeSystemZTargetInfo() {
+  RegisterTarget<Triple::systemz> X(TheSystemZTarget, "systemz", "SystemZ");
+}
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
new file mode 100644
index 0000000..f5c969a
--- /dev/null
+++ b/lib/Target/Target.cpp
@@ -0,0 +1,95 @@
+//===-- Target.cpp --------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the C bindings for libLLVMTarget.a, which implements
+// target information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Target.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/LLVMContext.h"
+#include <cstring>
+
+using namespace llvm;
+
+LLVMTargetDataRef LLVMCreateTargetData(const char *StringRep) {
+  return wrap(new TargetData(StringRep));
+}
+
+void LLVMAddTargetData(LLVMTargetDataRef TD, LLVMPassManagerRef PM) {
+  unwrap(PM)->add(new TargetData(*unwrap(TD)));
+}
+
+char *LLVMCopyStringRepOfTargetData(LLVMTargetDataRef TD) {
+  std::string StringRep = unwrap(TD)->getStringRepresentation();
+  return strdup(StringRep.c_str());
+}
+
+LLVMByteOrdering LLVMByteOrder(LLVMTargetDataRef TD) {
+  return unwrap(TD)->isLittleEndian() ? LLVMLittleEndian : LLVMBigEndian;
+}
+
+unsigned LLVMPointerSize(LLVMTargetDataRef TD) {
+  return unwrap(TD)->getPointerSize();
+}
+
+LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) {
+  return wrap(unwrap(TD)->getIntPtrType(getGlobalContext()));
+}
+
+unsigned long long LLVMSizeOfTypeInBits(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeSizeInBits(unwrap(Ty));
+}
+
+unsigned long long LLVMStoreSizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeStoreSize(unwrap(Ty));
+}
+
+unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getTypeAllocSize(unwrap(Ty));
+}
+
+unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getCallFrameTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
+  return unwrap(TD)->getPrefTypeAlignment(unwrap(Ty));
+}
+
+unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
+                                        LLVMValueRef GlobalVar) {
+  return unwrap(TD)->getPreferredAlignment(unwrap<GlobalVariable>(GlobalVar));
+}
+
+unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                             unsigned long long Offset) {
+  const StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementContainingOffset(Offset);
+}
+
+unsigned long long LLVMOffsetOfElement(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
+                                       unsigned Element) {
+  const StructType *STy = unwrap<StructType>(StructTy);
+  return unwrap(TD)->getStructLayout(STy)->getElementOffset(Element);
+}
+
+void LLVMInvalidateStructLayout(LLVMTargetDataRef TD, LLVMTypeRef StructTy) {
+  unwrap(TD)->InvalidateStructLayoutInfo(unwrap<StructType>(StructTy));
+}
+
+void LLVMDisposeTargetData(LLVMTargetDataRef TD) {
+  delete unwrap(TD);
+}
diff --git a/lib/Target/TargetAsmLexer.cpp b/lib/Target/TargetAsmLexer.cpp
new file mode 100644
index 0000000..d4893ff
--- /dev/null
+++ b/lib/Target/TargetAsmLexer.cpp
@@ -0,0 +1,14 @@
+//===-- llvm/Target/TargetAsmLexer.cpp - Target Assembly Lexer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmLexer.h"
+using namespace llvm;
+
+TargetAsmLexer::TargetAsmLexer(const Target &T) : TheTarget(T), Lexer(NULL) {}
+TargetAsmLexer::~TargetAsmLexer() {}
diff --git a/lib/Target/TargetData.cpp b/lib/Target/TargetData.cpp
new file mode 100644
index 0000000..295b30f
--- /dev/null
+++ b/lib/Target/TargetData.cpp
@@ -0,0 +1,641 @@
+//===-- TargetData.cpp - Data size & alignment routines --------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target properties related to datatype size/offset/alignment
+// information.
+//
+// This structure should be created once, filled in if the defaults are not
+// correct and then passed around by const&.  None of the member functions
+// require modification to the object.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetData.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Mutex.h"
+#include "llvm/ADT/DenseMap.h"
+#include <algorithm>
+#include <cstdlib>
+using namespace llvm;
+
+// Handle the Pass registration stuff necessary to use TargetData's.
+
+// Register TargetData as an immutable pass with the pass registry.
+static RegisterPass<TargetData> X("targetdata", "Target Data Layout", false, 
+                                  true);
+char TargetData::ID = 0;
+
+//===----------------------------------------------------------------------===//
+// Support for StructLayout
+//===----------------------------------------------------------------------===//
+
+StructLayout::StructLayout(const StructType *ST, const TargetData &TD) {
+  StructAlignment = 0;
+  StructSize = 0;
+  NumElements = ST->getNumElements();
+
+  // Loop over each of the elements, placing them in memory.
+  for (unsigned i = 0, e = NumElements; i != e; ++i) {
+    const Type *Ty = ST->getElementType(i);
+    unsigned TyAlign = ST->isPacked() ? 1 : TD.getABITypeAlignment(Ty);
+
+    // Add padding if necessary to align the data element properly.
+    if ((StructSize & (TyAlign-1)) != 0)
+      StructSize = TargetData::RoundUpAlignment(StructSize, TyAlign);
+
+    // Keep track of maximum alignment constraint.
+    StructAlignment = std::max(TyAlign, StructAlignment);
+
+    MemberOffsets[i] = StructSize;
+    StructSize += TD.getTypeAllocSize(Ty); // Consume space for this data item
+  }
+
+  // Empty structures have alignment of 1 byte.
+  if (StructAlignment == 0) StructAlignment = 1;
+
+  // Add padding to the end of the struct so that it could be put in an array
+  // and all array elements would be aligned correctly.
+  if ((StructSize & (StructAlignment-1)) != 0)
+    StructSize = TargetData::RoundUpAlignment(StructSize, StructAlignment);
+}
+
+
+/// getElementContainingOffset - Given a valid offset into the structure,
+/// return the structure index that contains it.
+unsigned StructLayout::getElementContainingOffset(uint64_t Offset) const {
+  const uint64_t *SI =
+    std::upper_bound(&MemberOffsets[0], &MemberOffsets[NumElements], Offset);
+  assert(SI != &MemberOffsets[0] && "Offset not in structure type!");
+  --SI;
+  assert(*SI <= Offset && "upper_bound didn't work");
+  assert((SI == &MemberOffsets[0] || *(SI-1) <= Offset) &&
+         (SI+1 == &MemberOffsets[NumElements] || *(SI+1) > Offset) &&
+         "Upper bound didn't work!");
+  
+  // Multiple fields can have the same offset if any of them are zero sized.
+  // For example, in { i32, [0 x i32], i32 }, searching for offset 4 will stop
+  // at the i32 element, because it is the last element at that offset.  This is
+  // the right one to return, because anything after it will have a higher
+  // offset, implying that this element is non-empty.
+  return SI-&MemberOffsets[0];
+}
+
+//===----------------------------------------------------------------------===//
+// TargetAlignElem, TargetAlign support
+//===----------------------------------------------------------------------===//
+
+TargetAlignElem
+TargetAlignElem::get(AlignTypeEnum align_type, unsigned char abi_align,
+                     unsigned char pref_align, uint32_t bit_width) {
+  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+  TargetAlignElem retval;
+  retval.AlignType = align_type;
+  retval.ABIAlign = abi_align;
+  retval.PrefAlign = pref_align;
+  retval.TypeBitWidth = bit_width;
+  return retval;
+}
+
+bool
+TargetAlignElem::operator==(const TargetAlignElem &rhs) const {
+  return (AlignType == rhs.AlignType
+          && ABIAlign == rhs.ABIAlign
+          && PrefAlign == rhs.PrefAlign
+          && TypeBitWidth == rhs.TypeBitWidth);
+}
+
+const TargetAlignElem TargetData::InvalidAlignmentElem =
+                TargetAlignElem::get((AlignTypeEnum) -1, 0, 0, 0);
+
+//===----------------------------------------------------------------------===//
+//                       TargetData Class Implementation
+//===----------------------------------------------------------------------===//
+
+/// getInt - Get an integer ignoring errors.
+static unsigned getInt(StringRef R) {
+  unsigned Result = 0;
+  R.getAsInteger(10, Result);
+  return Result;
+}
+
+void TargetData::init(StringRef Desc) {
+  LayoutMap = 0;
+  LittleEndian = false;
+  PointerMemSize = 8;
+  PointerABIAlign = 8;
+  PointerPrefAlign = PointerABIAlign;
+
+  // Default alignments
+  setAlignment(INTEGER_ALIGN,   1,  1, 1);   // i1
+  setAlignment(INTEGER_ALIGN,   1,  1, 8);   // i8
+  setAlignment(INTEGER_ALIGN,   2,  2, 16);  // i16
+  setAlignment(INTEGER_ALIGN,   4,  4, 32);  // i32
+  setAlignment(INTEGER_ALIGN,   4,  8, 64);  // i64
+  setAlignment(FLOAT_ALIGN,     4,  4, 32);  // float
+  setAlignment(FLOAT_ALIGN,     8,  8, 64);  // double
+  setAlignment(VECTOR_ALIGN,    8,  8, 64);  // v2i32, v1i64, ...
+  setAlignment(VECTOR_ALIGN,   16, 16, 128); // v16i8, v8i16, v4i32, ...
+  setAlignment(AGGREGATE_ALIGN, 0,  8,  0);  // struct
+
+  while (!Desc.empty()) {
+    std::pair<StringRef, StringRef> Split = Desc.split('-');
+    StringRef Token = Split.first;
+    Desc = Split.second;
+    
+    if (Token.empty())
+      continue;
+    
+    Split = Token.split(':');
+    StringRef Specifier = Split.first;
+    Token = Split.second;
+    
+    assert(!Specifier.empty() && "Can't be empty here");
+    
+    switch (Specifier[0]) {
+    case 'E':
+      LittleEndian = false;
+      break;
+    case 'e':
+      LittleEndian = true;
+      break;
+    case 'p':
+      Split = Token.split(':');
+      PointerMemSize = getInt(Split.first) / 8;
+      Split = Split.second.split(':');
+      PointerABIAlign = getInt(Split.first) / 8;
+      Split = Split.second.split(':');
+      PointerPrefAlign = getInt(Split.first) / 8;
+      if (PointerPrefAlign == 0)
+        PointerPrefAlign = PointerABIAlign;
+      break;
+    case 'i':
+    case 'v':
+    case 'f':
+    case 'a':
+    case 's': {
+      AlignTypeEnum AlignType;
+      switch (Specifier[0]) {
+      default:
+      case 'i': AlignType = INTEGER_ALIGN; break;
+      case 'v': AlignType = VECTOR_ALIGN; break;
+      case 'f': AlignType = FLOAT_ALIGN; break;
+      case 'a': AlignType = AGGREGATE_ALIGN; break;
+      case 's': AlignType = STACK_ALIGN; break;
+      }
+      unsigned Size = getInt(Specifier.substr(1));
+      Split = Token.split(':');
+      unsigned char ABIAlign = getInt(Split.first) / 8;
+      
+      Split = Split.second.split(':');
+      unsigned char PrefAlign = getInt(Split.first) / 8;
+      if (PrefAlign == 0)
+        PrefAlign = ABIAlign;
+      setAlignment(AlignType, ABIAlign, PrefAlign, Size);
+      break;
+    }
+    case 'n':  // Native integer types.
+      Specifier = Specifier.substr(1);
+      do {
+        if (unsigned Width = getInt(Specifier))
+          LegalIntWidths.push_back(Width);
+        Split = Token.split(':');
+        Specifier = Split.first;
+        Token = Split.second;
+      } while (!Specifier.empty() || !Token.empty());
+      break;
+        
+    default:
+      break;
+    }
+  }
+}
+
+/// Default ctor.
+///
+/// @note This has to exist, because this is a pass, but it should never be
+/// used.
+TargetData::TargetData() : ImmutablePass(&ID) {
+  llvm_report_error("Bad TargetData ctor used.  "
+                    "Tool did not specify a TargetData to use?");
+}
+
+TargetData::TargetData(const Module *M) 
+  : ImmutablePass(&ID) {
+  init(M->getDataLayout());
+}
+
+void
+TargetData::setAlignment(AlignTypeEnum align_type, unsigned char abi_align,
+                         unsigned char pref_align, uint32_t bit_width) {
+  assert(abi_align <= pref_align && "Preferred alignment worse than ABI!");
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+    if (Alignments[i].AlignType == align_type &&
+        Alignments[i].TypeBitWidth == bit_width) {
+      // Update the abi, preferred alignments.
+      Alignments[i].ABIAlign = abi_align;
+      Alignments[i].PrefAlign = pref_align;
+      return;
+    }
+  }
+  
+  Alignments.push_back(TargetAlignElem::get(align_type, abi_align,
+                                            pref_align, bit_width));
+}
+
/// getAlignmentInfo - Return the alignment (either ABI if ABIInfo = true or 
/// preferred if ABIInfo = false) the target wants for the specified datatype.
/// Ty is only consulted on the vector fallback path below; integer lookups
/// may pass a null Ty.
unsigned TargetData::getAlignmentInfo(AlignTypeEnum AlignType, 
                                      uint32_t BitWidth, bool ABIInfo,
                                      const Type *Ty) const {
  // Check to see if we have an exact match and remember the best match we see.
  int BestMatchIdx = -1;
  int LargestInt = -1;
  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
    // Exact (kind, width) match wins immediately.
    if (Alignments[i].AlignType == AlignType &&
        Alignments[i].TypeBitWidth == BitWidth)
      return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
    
    // The best match so far depends on what we're looking for.
    if (AlignType == VECTOR_ALIGN && Alignments[i].AlignType == VECTOR_ALIGN) {
      // If this is a specification for a smaller vector type, we will fall back
      // to it.  This happens because <128 x double> can be implemented in terms
      // of 64 <2 x double>.
      if (Alignments[i].TypeBitWidth < BitWidth) {
        // Verify that we pick the biggest of the fallbacks.
        if (BestMatchIdx == -1 ||
            Alignments[BestMatchIdx].TypeBitWidth < Alignments[i].TypeBitWidth)
          BestMatchIdx = i;
      }
    } else if (AlignType == INTEGER_ALIGN && 
               Alignments[i].AlignType == INTEGER_ALIGN) {
      // The "best match" for integers is the smallest size that is larger than
      // the BitWidth requested.
      if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 || 
           Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
        BestMatchIdx = i;
      // However, if there isn't one that's larger, then we must use the
      // largest one we have (see below)
      if (LargestInt == -1 || 
          Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
        LargestInt = i;
    }
  }

  // Okay, we didn't find an exact solution.  Fall back here depending on what
  // is being looked for.
  if (BestMatchIdx == -1) {
    // If we didn't find an integer alignment, fall back on most conservative.
    if (AlignType == INTEGER_ALIGN) {
      BestMatchIdx = LargestInt;
    } else {
      assert(AlignType == VECTOR_ALIGN && "Unknown alignment type!");

      // If we didn't find a vector size that is smaller or equal to this type,
      // then we will end up scalarizing this to its element type.  Just return
      // the alignment of the element.
      return getAlignment(cast<VectorType>(Ty)->getElementType(), ABIInfo);
    }
  }

  // Since we got a "best match" index, just return it.
  return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
                 : Alignments[BestMatchIdx].PrefAlign;
}
+
namespace {

/// StructLayoutMap - Cache of StructLayout objects keyed by StructType.
/// Layouts are allocated with malloc + placement new (see
/// TargetData::getStructLayout), so they must be destroyed explicitly and
/// free()'d, never deleted.  For abstract key types the map registers itself
/// as an AbstractTypeUser so stale entries can be evicted when the type is
/// refined or becomes concrete.
class StructLayoutMap : public AbstractTypeUser {
  typedef DenseMap<const StructType*, StructLayout*> LayoutInfoTy;
  LayoutInfoTy LayoutInfo;

  /// RemoveEntry - Destroy and deallocate one cached layout, unregistering
  /// this map from the key type's abstract-type-user list when needed.
  void RemoveEntry(LayoutInfoTy::iterator I, bool WasAbstract) {
    I->second->~StructLayout();  // explicit dtor: storage came from malloc
    free(I->second);
    if (WasAbstract)
      I->first->removeAbstractTypeUser(this);
    LayoutInfo.erase(I);
  }
  
  
  /// refineAbstractType - The callback method invoked when an abstract type is
  /// resolved to another type.  An object must override this method to update
  /// its internal state to reference NewType instead of OldType.
  ///
  /// Rather than re-keying the entry under the new type, we simply evict it;
  /// the layout will be recomputed on the next getStructLayout() call.
  virtual void refineAbstractType(const DerivedType *OldTy,
                                  const Type *) {
    LayoutInfoTy::iterator I = LayoutInfo.find(cast<const StructType>(OldTy));
    assert(I != LayoutInfo.end() && "Using type but not in map?");
    RemoveEntry(I, true);
  }

  /// typeBecameConcrete - The other case which AbstractTypeUsers must be aware
  /// of is when a type makes the transition from being abstract (where it has
  /// clients on its AbstractTypeUsers list) to concrete (where it does not).
  /// This method notifies ATU's when this occurs for a type.
  ///
  virtual void typeBecameConcrete(const DerivedType *AbsTy) {
    // The entry is evicted here too: it was registered under the abstract
    // type, and we must drop our user registration on it.
    LayoutInfoTy::iterator I = LayoutInfo.find(cast<const StructType>(AbsTy));
    assert(I != LayoutInfo.end() && "Using type but not in map?");
    RemoveEntry(I, true);
  }

public:
  virtual ~StructLayoutMap() {
    // Remove any layouts.  Mirrors RemoveEntry but without erasing from the
    // map, since the whole map is going away.
    for (LayoutInfoTy::iterator
           I = LayoutInfo.begin(), E = LayoutInfo.end(); I != E; ++I) {
      const Type *Key = I->first;
      StructLayout *Value = I->second;

      if (Key->isAbstract())
        Key->removeAbstractTypeUser(this);

      Value->~StructLayout();
      free(Value);
    }
  }

  /// InvalidateEntry - Drop the cached layout for Ty, if any.
  void InvalidateEntry(const StructType *Ty) {
    LayoutInfoTy::iterator I = LayoutInfo.find(Ty);
    if (I == LayoutInfo.end()) return;
    RemoveEntry(I, Ty->isAbstract());
  }

  /// operator[] - Default-inserting lookup; callers fill in a null slot.
  StructLayout *&operator[](const StructType *STy) {
    return LayoutInfo[STy];
  }

  // for debugging...
  virtual void dump() const {}
};

} // end anonymous namespace
+
+TargetData::~TargetData() {
+  delete static_cast<StructLayoutMap*>(LayoutMap);
+}
+
/// getStructLayout - Return the cached layout for Ty, computing and caching
/// it on first request.
const StructLayout *TargetData::getStructLayout(const StructType *Ty) const {
  // Create the cache lazily on first use.
  if (!LayoutMap)
    LayoutMap = new StructLayoutMap();
  
  StructLayoutMap *STM = static_cast<StructLayoutMap*>(LayoutMap);
  StructLayout *&SL = (*STM)[Ty];
  if (SL) return SL;  // Cache hit.

  // Otherwise, create the struct layout.  Because it is variable length, we 
  // malloc it, then use placement new.
  // NOTE(review): the size math assumes StructLayout ends in a one-element
  // uint64_t array that is over-allocated to hold NumElts offsets — confirm
  // against the StructLayout declaration.
  int NumElts = Ty->getNumElements();
  StructLayout *L =
    (StructLayout *)malloc(sizeof(StructLayout)+(NumElts-1) * sizeof(uint64_t));
  
  // Set SL before calling StructLayout's ctor.  The ctor could cause other
  // entries to be added to TheMap, invalidating our reference.
  SL = L;
  
  new (L) StructLayout(Ty, *this);

  // Abstract key types must notify the map (via AbstractTypeUser) when they
  // are refined, so the stale layout can be evicted.
  if (Ty->isAbstract())
    Ty->addAbstractTypeUser(STM);

  return L;
}
+
+/// InvalidateStructLayoutInfo - TargetData speculatively caches StructLayout
+/// objects.  If a TargetData object is alive when types are being refined and
+/// removed, this method must be called whenever a StructType is removed to
+/// avoid a dangling pointer in this cache.
+void TargetData::InvalidateStructLayoutInfo(const StructType *Ty) const {
+  if (!LayoutMap) return;  // No cache.
+  
+  static_cast<StructLayoutMap*>(LayoutMap)->InvalidateEntry(Ty);
+}
+
+std::string TargetData::getStringRepresentation() const {
+  std::string Result;
+  raw_string_ostream OS(Result);
+  
+  OS << (LittleEndian ? "e" : "E")
+     << "-p:" << PointerMemSize*8 << ':' << PointerABIAlign*8
+     << ':' << PointerPrefAlign*8;
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
+    const TargetAlignElem &AI = Alignments[i];
+    OS << '-' << (char)AI.AlignType << AI.TypeBitWidth << ':'
+       << AI.ABIAlign*8 << ':' << AI.PrefAlign*8;
+  }
+  
+  if (!LegalIntWidths.empty()) {
+    OS << "-n" << (unsigned)LegalIntWidths[0];
+    
+    for (unsigned i = 1, e = LegalIntWidths.size(); i != e; ++i)
+      OS << ':' << (unsigned)LegalIntWidths[i];
+  }
+  return OS.str();
+}
+
+
/// getTypeSizeInBits - Return the number of bits this type occupies.
uint64_t TargetData::getTypeSizeInBits(const Type *Ty) const {
  assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
  switch (Ty->getTypeID()) {
  case Type::LabelTyID:
  case Type::PointerTyID:
    // Labels are code addresses, so they are pointer-sized.
    return getPointerSizeInBits();
  case Type::ArrayTyID: {
    // Note: elements are spaced by their *alloc* size, which includes
    // per-element padding.
    const ArrayType *ATy = cast<ArrayType>(Ty);
    return getTypeAllocSizeInBits(ATy->getElementType())*ATy->getNumElements();
  }
  case Type::StructTyID:
    // Get the layout annotation... which is lazily created on demand.
    return getStructLayout(cast<StructType>(Ty))->getSizeInBits();
  case Type::IntegerTyID:
    return cast<IntegerType>(Ty)->getBitWidth();
  case Type::VoidTyID:
    // Void is treated as a one-byte type here.
    return 8;
  case Type::FloatTyID:
    return 32;
  case Type::DoubleTyID:
    return 64;
  case Type::PPC_FP128TyID:
  case Type::FP128TyID:
    return 128;
  // In memory objects this is always aligned to a higher boundary, but
  // only 80 bits contain information.
  case Type::X86_FP80TyID:
    return 80;
  case Type::VectorTyID:
    return cast<VectorType>(Ty)->getBitWidth();
  default:
    llvm_unreachable("TargetData::getTypeSizeInBits(): Unsupported type");
    break;
  }
  return 0;
}
+
/*!
  \param abi_or_pref Flag that determines which alignment is returned. true
  returns the ABI alignment, false returns the preferred alignment.
  \param Ty The underlying type for which alignment is determined.

  Get the ABI (\a abi_or_pref == true) or preferred alignment (\a abi_or_pref
  == false) for the requested type \a Ty.  The result is in bytes.
 */
unsigned char TargetData::getAlignment(const Type *Ty, bool abi_or_pref) const {
  int AlignType = -1;

  assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!");
  switch (Ty->getTypeID()) {
  // Early escape for the non-numeric types.
  case Type::LabelTyID:
  case Type::PointerTyID:
    return (abi_or_pref
            ? getPointerABIAlignment()
            : getPointerPrefAlignment());
  case Type::ArrayTyID:
    // Arrays align the same as their element type.
    return getAlignment(cast<ArrayType>(Ty)->getElementType(), abi_or_pref);

  case Type::StructTyID: {
    // Packed structure types always have an ABI alignment of one.
    if (cast<StructType>(Ty)->isPacked() && abi_or_pref)
      return 1;

    // Get the layout annotation... which is lazily created on demand.
    // An explicit aggregate ("a") alignment spec wins if it is stricter
    // than the alignment derived from the members.
    const StructLayout *Layout = getStructLayout(cast<StructType>(Ty));
    unsigned Align = getAlignmentInfo(AGGREGATE_ALIGN, 0, abi_or_pref, Ty);
    return std::max(Align, (unsigned)Layout->getAlignment());
  }
  case Type::IntegerTyID:
  case Type::VoidTyID:
    AlignType = INTEGER_ALIGN;
    break;
  case Type::FloatTyID:
  case Type::DoubleTyID:
  // PPC_FP128TyID and FP128TyID have different data contents, but the
  // same size and alignment, so they look the same here.
  case Type::PPC_FP128TyID:
  case Type::FP128TyID:
  case Type::X86_FP80TyID:
    AlignType = FLOAT_ALIGN;
    break;
  case Type::VectorTyID:
    AlignType = VECTOR_ALIGN;
    break;
  default:
    llvm_unreachable("Bad type for getAlignment!!!");
    break;
  }

  // Numeric types: look up the (kind, width) pair in the alignment table.
  return getAlignmentInfo((AlignTypeEnum)AlignType, getTypeSizeInBits(Ty),
                          abi_or_pref, Ty);
}
+
+unsigned char TargetData::getABITypeAlignment(const Type *Ty) const {
+  return getAlignment(Ty, true);
+}
+
+/// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for
+/// an integer type of the specified bitwidth.
+unsigned char TargetData::getABIIntegerTypeAlignment(unsigned BitWidth) const {
+  return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, 0);
+}
+
+
+unsigned char TargetData::getCallFrameTypeAlignment(const Type *Ty) const {
+  for (unsigned i = 0, e = Alignments.size(); i != e; ++i)
+    if (Alignments[i].AlignType == STACK_ALIGN)
+      return Alignments[i].ABIAlign;
+
+  return getABITypeAlignment(Ty);
+}
+
+unsigned char TargetData::getPrefTypeAlignment(const Type *Ty) const {
+  return getAlignment(Ty, false);
+}
+
+unsigned char TargetData::getPreferredTypeAlignmentShift(const Type *Ty) const {
+  unsigned Align = (unsigned) getPrefTypeAlignment(Ty);
+  assert(!(Align & (Align-1)) && "Alignment is not a power of two!");
+  return Log2_32(Align);
+}
+
/// getIntPtrType - Return an unsigned integer type that is the same size as
/// the target's pointer type (note: the *target* pointer size from this
/// TargetData, not the host's).
const IntegerType *TargetData::getIntPtrType(LLVMContext &C) const {
  return IntegerType::get(C, getPointerSizeInBits());
}
+
+
/// getIndexedOffset - Return the byte offset from the start of an object of
/// type *ptrTy produced by applying the given GEP-style index list.
uint64_t TargetData::getIndexedOffset(const Type *ptrTy, Value* const* Indices,
                                      unsigned NumIndices) const {
  const Type *Ty = ptrTy;
  assert(isa<PointerType>(Ty) && "Illegal argument for getIndexedOffset()");
  uint64_t Result = 0;

  // Walk the indices in parallel with the types they index into.
  generic_gep_type_iterator<Value* const*>
    TI = gep_type_begin(ptrTy, Indices, Indices+NumIndices);
  for (unsigned CurIDX = 0; CurIDX != NumIndices; ++CurIDX, ++TI) {
    if (const StructType *STy = dyn_cast<StructType>(*TI)) {
      // Struct indices must be constant i32 field numbers.
      assert(Indices[CurIDX]->getType() ==
             Type::getInt32Ty(ptrTy->getContext()) &&
             "Illegal struct idx");
      unsigned FieldNo = cast<ConstantInt>(Indices[CurIDX])->getZExtValue();

      // Get structure layout information...
      const StructLayout *Layout = getStructLayout(STy);

      // Add in the offset, as calculated by the structure layout info...
      Result += Layout->getElementOffset(FieldNo);

      // Update Ty to refer to current element
      Ty = STy->getElementType(FieldNo);
    } else {
      // Update Ty to refer to current element
      Ty = cast<SequentialType>(Ty)->getElementType();

      // Get the array index and the size of each array element.
      // Array/pointer indices are sign-extended, so a negative index yields
      // a negative byte offset (Result wraps modulo 2^64, as GEP requires).
      int64_t arrayIdx = cast<ConstantInt>(Indices[CurIDX])->getSExtValue();
      Result += arrayIdx * (int64_t)getTypeAllocSize(Ty);
    }
  }

  return Result;
}
+
+/// getPreferredAlignment - Return the preferred alignment of the specified
+/// global.  This includes an explicitly requested alignment (if the global
+/// has one).
+unsigned TargetData::getPreferredAlignment(const GlobalVariable *GV) const {
+  const Type *ElemType = GV->getType()->getElementType();
+  unsigned Alignment = getPrefTypeAlignment(ElemType);
+  if (GV->getAlignment() > Alignment)
+    Alignment = GV->getAlignment();
+
+  if (GV->hasInitializer()) {
+    if (Alignment < 16) {
+      // If the global is not external, see if it is large.  If so, give it a
+      // larger alignment.
+      if (getTypeSizeInBits(ElemType) > 128)
+        Alignment = 16;    // 16-byte alignment.
+    }
+  }
+  return Alignment;
+}
+
+/// getPreferredAlignmentLog - Return the preferred alignment of the
+/// specified global, returned in log form.  This includes an explicitly
+/// requested alignment (if the global has one).
+unsigned TargetData::getPreferredAlignmentLog(const GlobalVariable *GV) const {
+  return Log2_32(getPreferredAlignment(GV));
+}
diff --git a/lib/Target/TargetELFWriterInfo.cpp b/lib/Target/TargetELFWriterInfo.cpp
new file mode 100644
index 0000000..3631b35
--- /dev/null
+++ b/lib/Target/TargetELFWriterInfo.cpp
@@ -0,0 +1,26 @@
+//===-- lib/Target/TargetELFWriterInfo.cpp - ELF Writer Info --0-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetELFWriterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Function.h"
+#include "llvm/Target/TargetELFWriterInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+TargetELFWriterInfo::TargetELFWriterInfo(TargetMachine &tm) : TM(tm) {
+  is64Bit = TM.getTargetData()->getPointerSizeInBits() == 64;
+  isLittleEndian = TM.getTargetData()->isLittleEndian();
+}
+
+TargetELFWriterInfo::~TargetELFWriterInfo() {}
+
diff --git a/lib/Target/TargetFrameInfo.cpp b/lib/Target/TargetFrameInfo.cpp
new file mode 100644
index 0000000..873d60a
--- /dev/null
+++ b/lib/Target/TargetFrameInfo.cpp
@@ -0,0 +1,19 @@
+//===-- TargetFrameInfo.cpp - Implement machine frame interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the layout of a stack frame on the target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include <cstdlib>
+using namespace llvm;
+
// Out-of-line empty destructor — presumably serves as the class's vtable
// anchor; confirm against the declaration in TargetFrameInfo.h.
TargetFrameInfo::~TargetFrameInfo() {
}
diff --git a/lib/Target/TargetInstrInfo.cpp b/lib/Target/TargetInstrInfo.cpp
new file mode 100644
index 0000000..094a57e
--- /dev/null
+++ b/lib/Target/TargetInstrInfo.cpp
@@ -0,0 +1,95 @@
+//===-- TargetInstrInfo.cpp - Target Instruction Information --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  TargetOperandInfo
+//===----------------------------------------------------------------------===//
+
+/// getRegClass - Get the register class for the operand, handling resolution
+/// of "symbolic" pointer register classes etc.  If this is not a register
+/// operand, this returns null.
+const TargetRegisterClass *
+TargetOperandInfo::getRegClass(const TargetRegisterInfo *TRI) const {
+  if (isLookupPtrRegClass())
+    return TRI->getPointerRegClass(RegClass);
+  return TRI->getRegClass(RegClass);
+}
+
+//===----------------------------------------------------------------------===//
+//  TargetInstrInfo
+//===----------------------------------------------------------------------===//
+
/// TargetInstrInfo ctor - wrap the target's opcode descriptor table.
/// Desc presumably points at a static, target-owned array of numOpcodes
/// entries — confirm in the target's generated instr-info.
TargetInstrInfo::TargetInstrInfo(const TargetInstrDesc* Desc,
                                 unsigned numOpcodes)
  : Descriptors(Desc), NumOpcodes(numOpcodes) {
}
+
// Out-of-line empty destructor — presumably serves as the class's vtable
// anchor; confirm against the declaration in TargetInstrInfo.h.
TargetInstrInfo::~TargetInstrInfo() {
}
+
/// insertNoop - Insert a noop into the instruction stream at the specified
/// point.  Targets that need noop insertion must override this; the generic
/// implementation aborts via llvm_unreachable.
void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB, 
                                 MachineBasicBlock::iterator MI) const {
  llvm_unreachable("Target didn't implement insertNoop!");
}
+
+
+bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isTerminator()) return false;
+  
+  // Conditional branch is a special case.
+  if (TID.isBranch() && !TID.isBarrier())
+    return true;
+  if (!TID.isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorChar or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorChar or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overloaded in the target code to do that.
+unsigned TargetInstrInfo::getInlineAsmLength(const char *Str,
+                                             const MCAsmInfo &MAI) const {
+  
+  
+  // Count the number of instructions in the asm.
+  bool atInsnStart = true;
+  unsigned Length = 0;
+  for (; *Str; ++Str) {
+    if (*Str == '\n' || *Str == MAI.getSeparatorChar())
+      atInsnStart = true;
+    if (atInsnStart && !isspace(*Str)) {
+      Length += MAI.getMaxInstLength();
+      atInsnStart = false;
+    }
+    if (atInsnStart && strncmp(Str, MAI.getCommentString(),
+                               strlen(MAI.getCommentString())) == 0)
+      atInsnStart = false;
+  }
+  
+  return Length;
+}
diff --git a/lib/Target/TargetIntrinsicInfo.cpp b/lib/Target/TargetIntrinsicInfo.cpp
new file mode 100644
index 0000000..e049a1d
--- /dev/null
+++ b/lib/Target/TargetIntrinsicInfo.cpp
@@ -0,0 +1,30 @@
+//===-- TargetIntrinsicInfo.cpp - Target Instruction Information ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetIntrinsicInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/StringMap.h"
+using namespace llvm;
+
// Trivial default constructor; the class carries no state of its own here.
TargetIntrinsicInfo::TargetIntrinsicInfo() {
}
+
// Out-of-line empty destructor — presumably serves as the class's vtable
// anchor; confirm against the declaration in TargetIntrinsicInfo.h.
TargetIntrinsicInfo::~TargetIntrinsicInfo() {
}
+
+unsigned TargetIntrinsicInfo::getIntrinsicID(Function *F) const {
+  const ValueName *ValName = F->getValueName();
+  if (!ValName)
+    return 0;
+  return lookupName(ValName->getKeyData(), ValName->getKeyLength());
+}
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
new file mode 100644
index 0000000..a231ebc
--- /dev/null
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -0,0 +1,1120 @@
+//===-- llvm/Target/TargetLoweringObjectFile.cpp - Object File Info -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                              Generic Code
+//===----------------------------------------------------------------------===//
+
// All section pointers start out null; format-specific subclasses
// (ELF/MachO/COFF) presumably fill them in during their initialization —
// confirm against TargetLoweringObjectFile.h.
TargetLoweringObjectFile::TargetLoweringObjectFile() : Ctx(0) {
  // Core code/data sections.
  TextSection = 0;
  DataSection = 0;
  BSSSection = 0;
  ReadOnlySection = 0;
  StaticCtorSection = 0;
  StaticDtorSection = 0;
  // Exception-handling sections.
  LSDASection = 0;
  EHFrameSection = 0;

  // DWARF debug-info sections.
  DwarfAbbrevSection = 0;
  DwarfInfoSection = 0;
  DwarfLineSection = 0;
  DwarfFrameSection = 0;
  DwarfPubNamesSection = 0;
  DwarfPubTypesSection = 0;
  DwarfDebugInlineSection = 0;
  DwarfStrSection = 0;
  DwarfLocSection = 0;
  DwarfARangesSection = 0;
  DwarfRangesSection = 0;
  DwarfMacroInfoSection = 0;
}
+
// Out-of-line empty destructor — presumably serves as the class's vtable
// anchor; confirm against the declaration in TargetLoweringObjectFile.h.
TargetLoweringObjectFile::~TargetLoweringObjectFile() {
}
+
+static bool isSuitableForBSS(const GlobalVariable *GV) {
+  Constant *C = GV->getInitializer();
+
+  // Must have zero initializer.
+  if (!C->isNullValue())
+    return false;
+
+  // Leave constant zeros in readonly constant sections, so they can be shared.
+  if (GV->isConstant())
+    return false;
+
+  // If the global has an explicit section specified, don't put it in BSS.
+  if (!GV->getSection().empty())
+    return false;
+
+  // If -nozero-initialized-in-bss is specified, don't ever use BSS.
+  if (NoZerosInBSS)
+    return false;
+
+  // Otherwise, put it in BSS!
+  return true;
+}
+
/// IsNullTerminatedString - Return true if the specified constant (which is
/// known to have a type that is an array of 1/2/4 byte elements) ends with a
/// nul value and contains no other nuls in it.
static bool IsNullTerminatedString(const Constant *C) {
  const ArrayType *ATy = cast<ArrayType>(C->getType());

  // First check: is we have constant array of i8 terminated with zero
  if (const ConstantArray *CVA = dyn_cast<ConstantArray>(C)) {
    if (ATy->getNumElements() == 0) return false;

    ConstantInt *Null =
      dyn_cast<ConstantInt>(CVA->getOperand(ATy->getNumElements()-1));
    if (Null == 0 || Null->getZExtValue() != 0)
      return false; // Not null terminated.

    // Verify that the null doesn't occur anywhere else in the string.
    // Note the pointer comparison against Null works because LLVM constants
    // are uniqued: any other zero element is the same ConstantInt object.
    for (unsigned i = 0, e = ATy->getNumElements()-1; i != e; ++i)
      // Reject constantexpr elements etc.
      if (!isa<ConstantInt>(CVA->getOperand(i)) ||
          CVA->getOperand(i) == Null)
        return false;
    return true;
  }

  // Another possibility: [1 x i8] zeroinitializer — a one-element all-zero
  // array is exactly the empty string "".
  if (isa<ConstantAggregateZero>(C))
    return ATy->getNumElements() == 1;

  return false;
}
+
+/// getKindForGlobal - This is a top-level target-independent classifier for
+/// a global variable.  Given an global variable and information from TM, it
+/// classifies the global in a variety of ways that make various target
+/// implementations simpler.  The target implementation is free to ignore this
+/// extra info of course.
+SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
+                                                       const TargetMachine &TM){
+  assert(!GV->isDeclaration() && !GV->hasAvailableExternallyLinkage() &&
+         "Can only be used for global definitions");
+
+  Reloc::Model ReloModel = TM.getRelocationModel();
+
+  // Early exit - functions should be always in text sections.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (GVar == 0)
+    return SectionKind::getText();
+
+  // Handle thread-local data first.
+  if (GVar->isThreadLocal()) {
+    if (isSuitableForBSS(GVar))
+      return SectionKind::getThreadBSS();
+    return SectionKind::getThreadData();
+  }
+
+  // Variables with common linkage always get classified as common.
+  if (GVar->hasCommonLinkage())
+    return SectionKind::getCommon();
+
+  // Variable can be easily put to BSS section.
+  if (isSuitableForBSS(GVar)) {
+    if (GVar->hasLocalLinkage())
+      return SectionKind::getBSSLocal();
+    else if (GVar->hasExternalLinkage())
+      return SectionKind::getBSSExtern();
+    return SectionKind::getBSS();
+  }
+
+  Constant *C = GVar->getInitializer();
+
+  // If the global is marked constant, we can put it into a mergable section,
+  // a mergable string section, or general .data if it contains relocations.
+  if (GVar->isConstant()) {
+    // If the initializer for the global contains something that requires a
+    // relocation, then we may have to drop this into a wriable data section
+    // even though it is marked const.
+    switch (C->getRelocationInfo()) {
+    default: assert(0 && "unknown relocation info kind");
+    case Constant::NoRelocation:
+      // If initializer is a null-terminated string, put it in a "cstring"
+      // section of the right width.
+      if (const ArrayType *ATy = dyn_cast<ArrayType>(C->getType())) {
+        if (const IntegerType *ITy =
+              dyn_cast<IntegerType>(ATy->getElementType())) {
+          if ((ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16 ||
+               ITy->getBitWidth() == 32) &&
+              IsNullTerminatedString(C)) {
+            if (ITy->getBitWidth() == 8)
+              return SectionKind::getMergeable1ByteCString();
+            if (ITy->getBitWidth() == 16)
+              return SectionKind::getMergeable2ByteCString();
+
+            assert(ITy->getBitWidth() == 32 && "Unknown width");
+            return SectionKind::getMergeable4ByteCString();
+          }
+        }
+      }
+
+      // Otherwise, just drop it into a mergable constant section.  If we have
+      // a section for this size, use it, otherwise use the arbitrary sized
+      // mergable section.
+      switch (TM.getTargetData()->getTypeAllocSize(C->getType())) {
+      case 4:  return SectionKind::getMergeableConst4();
+      case 8:  return SectionKind::getMergeableConst8();
+      case 16: return SectionKind::getMergeableConst16();
+      default: return SectionKind::getMergeableConst();
+      }
+
+    case Constant::LocalRelocation:
+      // In static relocation model, the linker will resolve all addresses, so
+      // the relocation entries will actually be constants by the time the app
+      // starts up.  However, we can't put this into a mergable section, because
+      // the linker doesn't take relocations into consideration when it tries to
+      // merge entries in the section.
+      if (ReloModel == Reloc::Static)
+        return SectionKind::getReadOnly();
+
+      // Otherwise, the dynamic linker needs to fix it up, put it in the
+      // writable data.rel.local section.
+      return SectionKind::getReadOnlyWithRelLocal();
+
+    case Constant::GlobalRelocations:
+      // In static relocation model, the linker will resolve all addresses, so
+      // the relocation entries will actually be constants by the time the app
+      // starts up.  However, we can't put this into a mergable section, because
+      // the linker doesn't take relocations into consideration when it tries to
+      // merge entries in the section.
+      if (ReloModel == Reloc::Static)
+        return SectionKind::getReadOnly();
+
+      // Otherwise, the dynamic linker needs to fix it up, put it in the
+      // writable data.rel section.
+      return SectionKind::getReadOnlyWithRel();
+    }
+  }
+
+  // Okay, this isn't a constant.  If the initializer for the global is going
+  // to require a runtime relocation by the dynamic linker, put it into a more
+  // specific section to improve startup time of the app.  This coalesces these
+  // globals together onto fewer pages, improving the locality of the dynamic
+  // linker.
+  if (ReloModel == Reloc::Static)
+    return SectionKind::getDataNoRel();
+
+  switch (C->getRelocationInfo()) {
+  default: assert(0 && "unknown relocation info kind");
+  case Constant::NoRelocation:
+    return SectionKind::getDataNoRel();
+  case Constant::LocalRelocation:
+    return SectionKind::getDataRelLocal();
+  case Constant::GlobalRelocations:
+    return SectionKind::getDataRel();
+  }
+}
+
+/// SectionForGlobal - This method computes the appropriate section to emit
+/// the specified global variable or function definition.  This should not
+/// be passed external (or available externally) globals.
+const MCSection *TargetLoweringObjectFile::
+SectionForGlobal(const GlobalValue *GV, SectionKind Kind, Mangler *Mang,
+                 const TargetMachine &TM) const {
+  // Select section name.
+  if (GV->hasSection())
+    return getExplicitSectionGlobal(GV, Kind, Mang, TM);
+
+
+  // Use default section depending on the 'type' of global
+  return SelectSectionForGlobal(GV, Kind, Mang, TM);
+}
+
+
+// Default (target-independent) section selection: map the SectionKind onto
+// one of the generic sections set up by Initialize().
+const MCSection *
+TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
+                                                 SectionKind Kind,
+                                                 Mangler *Mang,
+                                                 const TargetMachine &TM) const{
+  assert(!Kind.isThreadLocal() && "Doesn't support TLS");
+
+  // Functions always go in the text section.
+  if (Kind.isText())
+    return getTextSection();
+
+  // Zero-initialized data goes to .bss when the target provides one.
+  if (Kind.isBSS())
+    if (BSSSection != 0)
+      return BSSSection;
+
+  // Constants go to the read-only section when the target provides one.
+  if (Kind.isReadOnly())
+    if (ReadOnlySection != 0)
+      return ReadOnlySection;
+
+  // Everything else lands in the generic data section.
+  return getDataSection();
+}
+
+/// getSectionForConstant - Pick a section for a mergeable constant of the
+/// given kind.  The generic implementation only distinguishes read-only
+/// constants from everything else.
+const MCSection *
+TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const {
+  if (!Kind.isReadOnly() || ReadOnlySection == 0)
+    return DataSection;
+  return ReadOnlySection;
+}
+
+/// getSymbolForDwarfGlobalReference - Produce the MCExpr used to refer to
+/// the given global from DWARF exception-handling information.  Sets the two
+/// by-reference flags: IsIndirect (returned symbol is a stub holding the
+/// global's address) and IsPCRel (reference is already pc-relative).  The
+/// generic implementation is a plain direct reference: neither is set.
+const MCExpr *TargetLoweringObjectFile::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                                 MachineModuleInfo *MMI,
+                                 bool &IsIndirect, bool &IsPCRel) const {
+  IsPCRel    = false;
+  IsIndirect = false;
+
+  // FIXME: Use GetGlobalValueSymbol.
+  SmallString<128> MangledName;
+  Mang->getNameWithPrefix(MangledName, GV, false);
+  return MCSymbolRefExpr::Create(MangledName.str(), getContext());
+}
+
+
+//===----------------------------------------------------------------------===//
+//                                  ELF
+//===----------------------------------------------------------------------===//
+typedef StringMap<const MCSectionELF*> ELFUniqueMapTy;
+
+TargetLoweringObjectFileELF::~TargetLoweringObjectFileELF() {
+  // Tear down the lazily-created section uniquing map, if it was ever made
+  // (deleting a null pointer is a no-op).
+  ELFUniqueMapTy *Map = (ELFUniqueMapTy*)UniquingMap;
+  delete Map;
+}
+
+/// getELFSection - Return the uniqued MCSectionELF with the given name,
+/// creating and caching it on first use.
+const MCSection *TargetLoweringObjectFileELF::
+getELFSection(StringRef Section, unsigned Type, unsigned Flags,
+              SectionKind Kind, bool IsExplicit) const {
+  // Lazily construct the uniquing map.
+  if (UniquingMap == 0)
+    UniquingMap = new ELFUniqueMapTy();
+  ELFUniqueMapTy &Sections = *(ELFUniqueMapTy*)UniquingMap;
+
+  // A single map lookup both finds an existing entry and reserves the slot
+  // for a newly created one.
+  const MCSectionELF *&Slot = Sections[Section];
+  if (Slot == 0)
+    Slot = MCSectionELF::Create(Section, Type, Flags, Kind, IsExplicit,
+                                getContext());
+  return Slot;
+}
+
+/// Initialize - Create all of the standard ELF sections up front so the
+/// section-selection methods can hand them out by pointer.
+void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
+                                             const TargetMachine &TM) {
+  // On re-initialization, drop sections uniqued against a previous context.
+  if (UniquingMap != 0)
+    ((ELFUniqueMapTy*)UniquingMap)->clear();
+  TargetLoweringObjectFile::Initialize(Ctx, TM);
+
+  // Standard text/data/rodata sections.
+  BSSSection =
+    getELFSection(".bss", MCSectionELF::SHT_NOBITS,
+                  MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+                  SectionKind::getBSS());
+
+  TextSection =
+    getELFSection(".text", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_EXECINSTR | MCSectionELF::SHF_ALLOC,
+                  SectionKind::getText());
+
+  DataSection =
+    getELFSection(".data", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_WRITE | MCSectionELF::SHF_ALLOC,
+                  SectionKind::getDataRel());
+
+  ReadOnlySection =
+    getELFSection(".rodata", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC,
+                  SectionKind::getReadOnly());
+
+  // Thread-local storage sections.
+  TLSDataSection =
+    getELFSection(".tdata", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_TLS |
+                  MCSectionELF::SHF_WRITE, SectionKind::getThreadData());
+
+  TLSBSSSection =
+    getELFSection(".tbss", MCSectionELF::SHT_NOBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_TLS |
+                  MCSectionELF::SHF_WRITE, SectionKind::getThreadBSS());
+
+  // Writable data sections, split by the relocation needs of their
+  // initializers (helps the dynamic linker group relocated pages).
+  DataRelSection =
+    getELFSection(".data.rel", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+                  SectionKind::getDataRel());
+
+  DataRelLocalSection =
+    getELFSection(".data.rel.local", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+                  SectionKind::getDataRelLocal());
+
+  DataRelROSection =
+    getELFSection(".data.rel.ro", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+                  SectionKind::getReadOnlyWithRel());
+
+  DataRelROLocalSection =
+    getELFSection(".data.rel.ro.local", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+                  SectionKind::getReadOnlyWithRelLocal());
+
+  // Fixed-width mergeable constant pools.
+  MergeableConst4Section =
+    getELFSection(".rodata.cst4", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE,
+                  SectionKind::getMergeableConst4());
+
+  MergeableConst8Section =
+    getELFSection(".rodata.cst8", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE,
+                  SectionKind::getMergeableConst8());
+
+  MergeableConst16Section =
+    getELFSection(".rodata.cst16", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE,
+                  SectionKind::getMergeableConst16());
+
+  // Static constructor/destructor lists.
+  StaticCtorSection =
+    getELFSection(".ctors", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+                  SectionKind::getDataRel());
+
+  StaticDtorSection =
+    getELFSection(".dtors", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+                  SectionKind::getDataRel());
+
+  // Exception Handling Sections.
+
+  // FIXME: We're emitting LSDA info into a readonly section on ELF, even though
+  // it contains relocatable pointers.  In PIC mode, this is probably a big
+  // runtime hit for C++ apps.  Either the contents of the LSDA need to be
+  // adjusted or this should be a data section.
+  LSDASection =
+    getELFSection(".gcc_except_table", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC, SectionKind::getReadOnly());
+  EHFrameSection =
+    getELFSection(".eh_frame", MCSectionELF::SHT_PROGBITS,
+                  MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE,
+                  SectionKind::getDataRel());
+
+  // Debug Info Sections.  These are metadata: no SHF_ALLOC, not loaded.
+  DwarfAbbrevSection =
+    getELFSection(".debug_abbrev", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfInfoSection =
+    getELFSection(".debug_info", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfLineSection =
+    getELFSection(".debug_line", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfFrameSection =
+    getELFSection(".debug_frame", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfPubNamesSection =
+    getELFSection(".debug_pubnames", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfPubTypesSection =
+    getELFSection(".debug_pubtypes", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfStrSection =
+    getELFSection(".debug_str", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfLocSection =
+    getELFSection(".debug_loc", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfARangesSection =
+    getELFSection(".debug_aranges", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfRangesSection =
+    getELFSection(".debug_ranges", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+  DwarfMacroInfoSection =
+    getELFSection(".debug_macinfo", MCSectionELF::SHT_PROGBITS, 0,
+                  SectionKind::getMetadata());
+}
+
+
+/// getELFKindForNamedSection - Infer a SectionKind from a handful of
+/// well-known ELF section names; otherwise keep the kind K the compiler
+/// already computed for the global.
+static SectionKind
+getELFKindForNamedSection(StringRef Name, SectionKind K) {
+  // Only dotted section names carry meaning here.
+  if (Name.empty() || Name[0] != '.') return K;
+
+  // Zero-fill sections: .bss/.sbss plus their per-global linkonce variants.
+  if (Name == ".bss"  || Name.startswith(".bss.")  ||
+      Name == ".sbss" || Name.startswith(".sbss.") ||
+      Name.startswith(".gnu.linkonce.b.")  ||
+      Name.startswith(".llvm.linkonce.b.") ||
+      Name.startswith(".gnu.linkonce.sb.") ||
+      Name.startswith(".llvm.linkonce.sb."))
+    return SectionKind::getBSS();
+
+  // Initialized thread-local data.
+  if (Name == ".tdata" || Name.startswith(".tdata.") ||
+      Name.startswith(".gnu.linkonce.td.") ||
+      Name.startswith(".llvm.linkonce.td."))
+    return SectionKind::getThreadData();
+
+  // Zero-filled thread-local data.
+  if (Name == ".tbss" || Name.startswith(".tbss.") ||
+      Name.startswith(".gnu.linkonce.tb.") ||
+      Name.startswith(".llvm.linkonce.tb."))
+    return SectionKind::getThreadBSS();
+
+  return K;
+}
+
+
+/// getELFSectionType - Compute the ELF sh_type for a section with the given
+/// name and kind.
+static unsigned getELFSectionType(StringRef Name, SectionKind K) {
+  // The init/fini array sections get their dedicated types.
+  if (Name == ".init_array")
+    return MCSectionELF::SHT_INIT_ARRAY;
+  if (Name == ".fini_array")
+    return MCSectionELF::SHT_FINI_ARRAY;
+  if (Name == ".preinit_array")
+    return MCSectionELF::SHT_PREINIT_ARRAY;
+
+  // Zero-fill sections occupy no space in the object file.
+  if (K.isBSS() || K.isThreadBSS())
+    return MCSectionELF::SHT_NOBITS;
+
+  // Everything else carries its bytes in the file.
+  return MCSectionELF::SHT_PROGBITS;
+}
+
+
+/// getELFSectionFlags - Derive the ELF sh_flags bits implied by a
+/// SectionKind.
+static unsigned getELFSectionFlags(SectionKind K) {
+  unsigned Flags = 0;
+
+  // Everything except pure metadata is mapped at runtime.
+  if (!K.isMetadata())
+    Flags |= MCSectionELF::SHF_ALLOC;
+  if (K.isText())
+    Flags |= MCSectionELF::SHF_EXECINSTR;
+  if (K.isWriteable())
+    Flags |= MCSectionELF::SHF_WRITE;
+  if (K.isThreadLocal())
+    Flags |= MCSectionELF::SHF_TLS;
+
+  // K.isMergeableConst() is left out to honour PR4650
+  if (K.isMergeableCString() || K.isMergeableConst4() ||
+      K.isMergeableConst8() || K.isMergeableConst16())
+    Flags |= MCSectionELF::SHF_MERGE;
+  if (K.isMergeableCString())
+    Flags |= MCSectionELF::SHF_STRINGS;
+
+  return Flags;
+}
+
+
+/// getExplicitSectionGlobal - Build the MCSectionELF for a global that
+/// carries an explicit "section" attribute, inferring kind/type/flags from
+/// the section name.
+const MCSection *TargetLoweringObjectFileELF::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                         Mangler *Mang, const TargetMachine &TM) const {
+  StringRef Name = GV->getSection();
+
+  // Well-known names (e.g. ".bss", ".tdata") override the computed kind.
+  Kind = getELFKindForNamedSection(Name, Kind);
+
+  unsigned Type  = getELFSectionType(Name, Kind);
+  unsigned Flags = getELFSectionFlags(Kind);
+  return getELFSection(Name, Type, Flags, Kind, /*IsExplicit=*/true);
+}
+
+/// getSectionPrefixForUniqueGlobal - Section-name prefix used when a
+/// linkonce/weak global is emitted into its own uniquely named section.
+/// The checks are ordered; the first matching kind wins.
+static const char *getSectionPrefixForUniqueGlobal(SectionKind Kind) {
+  if (Kind.isText())
+    return ".gnu.linkonce.t.";
+  if (Kind.isReadOnly())
+    return ".gnu.linkonce.r.";
+  if (Kind.isThreadData())
+    return ".gnu.linkonce.td.";
+  if (Kind.isThreadBSS())
+    return ".gnu.linkonce.tb.";
+  if (Kind.isDataNoRel())
+    return ".gnu.linkonce.d.";
+  if (Kind.isDataRelLocal())
+    return ".gnu.linkonce.d.rel.local.";
+  if (Kind.isDataRel())
+    return ".gnu.linkonce.d.rel.";
+  if (Kind.isReadOnlyWithRelLocal())
+    return ".gnu.linkonce.d.rel.ro.local.";
+
+  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+  return ".gnu.linkonce.d.rel.ro.";
+}
+
+/// SelectSectionForGlobal - Choose the ELF output section for a global that
+/// has no explicit section attribute.
+const MCSection *TargetLoweringObjectFileELF::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+
+  // If this global is linkonce/weak and the target handles this by emitting it
+  // into a 'uniqued' section name, create and return the section now.
+  if (GV->isWeakForLinker() && !Kind.isCommon() && !Kind.isBSS()) {
+    const char *Prefix = getSectionPrefixForUniqueGlobal(Kind);
+    SmallString<128> Name;
+    Name.append(Prefix, Prefix+strlen(Prefix));
+    Mang->getNameWithPrefix(Name, GV, false);
+    return getELFSection(Name.str(), getELFSectionType(Name.str(), Kind),
+                         getELFSectionFlags(Kind), Kind);
+  }
+
+  if (Kind.isText()) return TextSection;
+
+  // Mergeable C strings go into .rodata.strN.<align> so the linker can
+  // tail-merge them across objects.
+  if (Kind.isMergeable1ByteCString() ||
+      Kind.isMergeable2ByteCString() ||
+      Kind.isMergeable4ByteCString()) {
+
+    // We also need alignment here.
+    // FIXME: this is getting the alignment of the character, not the
+    // alignment of the global!
+    unsigned Align =
+      TM.getTargetData()->getPreferredAlignment(cast<GlobalVariable>(GV));
+
+    const char *SizeSpec = ".rodata.str1.";
+    if (Kind.isMergeable2ByteCString())
+      SizeSpec = ".rodata.str2.";
+    else if (Kind.isMergeable4ByteCString())
+      SizeSpec = ".rodata.str4.";
+    else
+      assert(Kind.isMergeable1ByteCString() && "unknown string width");
+
+
+    std::string Name = SizeSpec + utostr(Align);
+    return getELFSection(Name, MCSectionELF::SHT_PROGBITS,
+                         MCSectionELF::SHF_ALLOC |
+                         MCSectionELF::SHF_MERGE |
+                         MCSectionELF::SHF_STRINGS,
+                         Kind);
+  }
+
+  // Fixed-width mergeable constants use the .rodata.cstN sections when
+  // Initialize() created them.
+  if (Kind.isMergeableConst()) {
+    if (Kind.isMergeableConst4() && MergeableConst4Section)
+      return MergeableConst4Section;
+    if (Kind.isMergeableConst8() && MergeableConst8Section)
+      return MergeableConst8Section;
+    if (Kind.isMergeableConst16() && MergeableConst16Section)
+      return MergeableConst16Section;
+    return ReadOnlySection;  // .const
+  }
+
+  if (Kind.isReadOnly())             return ReadOnlySection;
+
+  if (Kind.isThreadData())           return TLSDataSection;
+  if (Kind.isThreadBSS())            return TLSBSSSection;
+
+  // Note: we claim that common symbols are put in BSSSection, but they are
+  // really emitted with the magic .comm directive, which creates a symbol table
+  // entry but not a section.
+  if (Kind.isBSS() || Kind.isCommon()) return BSSSection;
+
+  // Writable data: pick the section matching the initializer's relocation
+  // needs, as classified by the SectionKind.
+  if (Kind.isDataNoRel())            return DataSection;
+  if (Kind.isDataRelLocal())         return DataRelLocalSection;
+  if (Kind.isDataRel())              return DataRelSection;
+  if (Kind.isReadOnlyWithRelLocal()) return DataRelROLocalSection;
+
+  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+  return DataRelROSection;
+}
+
+/// getSectionForConstant - Given a mergeable constant with the specified
+/// size and relocation information, return the ELF section it should be
+/// placed in.
+const MCSection *TargetLoweringObjectFileELF::
+getSectionForConstant(SectionKind Kind) const {
+  // Prefer a size-specific mergeable section when the target set one up.
+  if (Kind.isMergeableConst4() && MergeableConst4Section)
+    return MergeableConst4Section;
+  if (Kind.isMergeableConst8() && MergeableConst8Section)
+    return MergeableConst8Section;
+  if (Kind.isMergeableConst16() && MergeableConst16Section)
+    return MergeableConst16Section;
+
+  // Plain read-only constants go to .rodata.
+  if (Kind.isReadOnly())
+    return ReadOnlySection;
+
+  // Constants needing relocation go to the matching .data.rel.ro flavour.
+  if (Kind.isReadOnlyWithRelLocal())
+    return DataRelROLocalSection;
+  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+  return DataRelROSection;
+}
+
+//===----------------------------------------------------------------------===//
+//                                 MachO
+//===----------------------------------------------------------------------===//
+
+typedef StringMap<const MCSectionMachO*> MachOUniqueMapTy;
+
+TargetLoweringObjectFileMachO::~TargetLoweringObjectFileMachO() {
+  // Release the lazily-allocated section uniquing map, if present.
+  MachOUniqueMapTy *Map = (MachOUniqueMapTy*)UniquingMap;
+  delete Map;
+}
+
+
+/// getMachOSection - Return the uniqued MCSectionMachO for the given
+/// segment/section pair, creating it on first use.  The returned section may
+/// not have the same flags as the requested ones; if so, that mismatch
+/// should be diagnosed by the client as an error.
+const MCSectionMachO *TargetLoweringObjectFileMachO::
+getMachOSection(StringRef Segment, StringRef Section,
+                unsigned TypeAndAttributes,
+                unsigned Reserved2, SectionKind Kind) const {
+  // Lazily create the uniquing map.
+  if (UniquingMap == 0)
+    UniquingMap = new MachOUniqueMapTy();
+  MachOUniqueMapTy &Sections = *(MachOUniqueMapTy*)UniquingMap;
+
+  // Sections are uniqued by their "segment,section" pair.
+  SmallString<64> Key;
+  Key += Segment;
+  Key.push_back(',');
+  Key += Section;
+
+  // One lookup both finds an existing entry and reserves the slot for a
+  // newly created one.
+  const MCSectionMachO *&Slot = Sections[Key.str()];
+  if (Slot == 0)
+    Slot = MCSectionMachO::Create(Segment, Section, TypeAndAttributes,
+                                  Reserved2, Kind, getContext());
+  return Slot;
+}
+
+
+/// Initialize - Create the standard Darwin (Mach-O) sections up front.
+void TargetLoweringObjectFileMachO::Initialize(MCContext &Ctx,
+                                               const TargetMachine &TM) {
+  // On re-initialization, drop sections uniqued against a previous context.
+  if (UniquingMap != 0)
+    ((MachOUniqueMapTy*)UniquingMap)->clear();
+  TargetLoweringObjectFile::Initialize(Ctx, TM);
+
+  TextSection // .text
+    = getMachOSection("__TEXT", "__text",
+                      MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                      SectionKind::getText());
+  DataSection // .data
+    = getMachOSection("__DATA", "__data", 0, SectionKind::getDataRel());
+
+  // Mergeable string and literal sections.
+  CStringSection // .cstring
+    = getMachOSection("__TEXT", "__cstring", MCSectionMachO::S_CSTRING_LITERALS,
+                      SectionKind::getMergeable1ByteCString());
+  UStringSection
+    = getMachOSection("__TEXT","__ustring", 0,
+                      SectionKind::getMergeable2ByteCString());
+  FourByteConstantSection // .literal4
+    = getMachOSection("__TEXT", "__literal4", MCSectionMachO::S_4BYTE_LITERALS,
+                      SectionKind::getMergeableConst4());
+  EightByteConstantSection // .literal8
+    = getMachOSection("__TEXT", "__literal8", MCSectionMachO::S_8BYTE_LITERALS,
+                      SectionKind::getMergeableConst8());
+
+  // ld_classic doesn't support .literal16 in 32-bit mode, and ld64 falls back
+  // to using it in -static mode.
+  // NOTE(review): elsewhere in LLVM, TargetData::getPointerSize() is in
+  // bytes (4/8), so "== 32" appears never to be true and .literal16 would
+  // never be created -- confirm whether getPointerSizeInBits() (or == 4)
+  // was intended.
+  SixteenByteConstantSection = 0;
+  if (TM.getRelocationModel() != Reloc::Static &&
+      TM.getTargetData()->getPointerSize() == 32)
+    SixteenByteConstantSection =   // .literal16
+      getMachOSection("__TEXT", "__literal16",MCSectionMachO::S_16BYTE_LITERALS,
+                      SectionKind::getMergeableConst16());
+
+  ReadOnlySection  // .const
+    = getMachOSection("__TEXT", "__const", 0, SectionKind::getReadOnly());
+
+  // Coalesced sections used for weak/linkonce definitions.
+  TextCoalSection
+    = getMachOSection("__TEXT", "__textcoal_nt",
+                      MCSectionMachO::S_COALESCED |
+                      MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                      SectionKind::getText());
+  ConstTextCoalSection
+    = getMachOSection("__TEXT", "__const_coal", MCSectionMachO::S_COALESCED,
+                      SectionKind::getText());
+  // NOTE(review): __DATA,__const_coal is registered with a text SectionKind;
+  // presumably a data/read-only kind was intended -- confirm.
+  ConstDataCoalSection
+    = getMachOSection("__DATA","__const_coal", MCSectionMachO::S_COALESCED,
+                      SectionKind::getText());
+  ConstDataSection  // .const_data
+    = getMachOSection("__DATA", "__const", 0,
+                      SectionKind::getReadOnlyWithRel());
+  DataCoalSection
+    = getMachOSection("__DATA","__datacoal_nt", MCSectionMachO::S_COALESCED,
+                      SectionKind::getDataRel());
+  // Zero-fill sections for external vs. local zero-initialized globals.
+  DataCommonSection
+    = getMachOSection("__DATA","__common", MCSectionMachO::S_ZEROFILL,
+                      SectionKind::getBSS());
+  DataBSSSection
+    = getMachOSection("__DATA","__bss", MCSectionMachO::S_ZEROFILL,
+                    SectionKind::getBSS());
+
+  // Stub pointer sections used by the dynamic linker.
+  LazySymbolPointerSection
+    = getMachOSection("__DATA", "__la_symbol_ptr",
+                      MCSectionMachO::S_LAZY_SYMBOL_POINTERS,
+                      SectionKind::getMetadata());
+  NonLazySymbolPointerSection
+    = getMachOSection("__DATA", "__nl_symbol_ptr",
+                      MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+                      SectionKind::getMetadata());
+
+  // Static ctors/dtors: plain __TEXT sections for static links, the
+  // mod-init/term function-pointer sections otherwise.
+  if (TM.getRelocationModel() == Reloc::Static) {
+    StaticCtorSection
+      = getMachOSection("__TEXT", "__constructor", 0,SectionKind::getDataRel());
+    StaticDtorSection
+      = getMachOSection("__TEXT", "__destructor", 0, SectionKind::getDataRel());
+  } else {
+    StaticCtorSection
+      = getMachOSection("__DATA", "__mod_init_func",
+                        MCSectionMachO::S_MOD_INIT_FUNC_POINTERS,
+                        SectionKind::getDataRel());
+    StaticDtorSection
+      = getMachOSection("__DATA", "__mod_term_func",
+                        MCSectionMachO::S_MOD_TERM_FUNC_POINTERS,
+                        SectionKind::getDataRel());
+  }
+
+  // Exception Handling.
+  LSDASection = getMachOSection("__DATA", "__gcc_except_tab", 0,
+                                SectionKind::getDataRel());
+  EHFrameSection =
+    getMachOSection("__TEXT", "__eh_frame",
+                    MCSectionMachO::S_COALESCED |
+                    MCSectionMachO::S_ATTR_NO_TOC |
+                    MCSectionMachO::S_ATTR_STRIP_STATIC_SYMS |
+                    MCSectionMachO::S_ATTR_LIVE_SUPPORT,
+                    SectionKind::getReadOnly());
+
+  // Debug Information.
+  DwarfAbbrevSection =
+    getMachOSection("__DWARF", "__debug_abbrev", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfInfoSection =
+    getMachOSection("__DWARF", "__debug_info", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfLineSection =
+    getMachOSection("__DWARF", "__debug_line", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfFrameSection =
+    getMachOSection("__DWARF", "__debug_frame", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfPubNamesSection =
+    getMachOSection("__DWARF", "__debug_pubnames", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfPubTypesSection =
+    getMachOSection("__DWARF", "__debug_pubtypes", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfStrSection =
+    getMachOSection("__DWARF", "__debug_str", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfLocSection =
+    getMachOSection("__DWARF", "__debug_loc", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfARangesSection =
+    getMachOSection("__DWARF", "__debug_aranges", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfRangesSection =
+    getMachOSection("__DWARF", "__debug_ranges", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfMacroInfoSection =
+    getMachOSection("__DWARF", "__debug_macinfo", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+  DwarfDebugInlineSection =
+    getMachOSection("__DWARF", "__debug_inlined", MCSectionMachO::S_ATTR_DEBUG,
+                    SectionKind::getMetadata());
+}
+
+/// getExplicitSectionGlobal - Handle a global with an explicit Darwin
+/// section specifier of the form "segment,section[,attrs[,stubsize]]".
+const MCSection *TargetLoweringObjectFileMachO::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                         Mangler *Mang, const TargetMachine &TM) const {
+  // Parse the section specifier and create it if valid.
+  StringRef Segment, Section;
+  unsigned TAA, StubSize;
+  std::string ErrorCode =
+    MCSectionMachO::ParseSectionSpecifier(GV->getSection(), Segment, Section,
+                                          TAA, StubSize);
+  if (!ErrorCode.empty()) {
+    // If invalid, report the error with llvm_report_error.
+    llvm_report_error("Global variable '" + GV->getNameStr() +
+                      "' has an invalid section specifier '" + GV->getSection()+
+                      "': " + ErrorCode + ".");
+    // Fall back to dropping it into the data section.
+    // NOTE(review): llvm_report_error appears to be NORETURN, which would
+    // make this return unreachable -- confirm before relying on the
+    // fallback.
+    return DataSection;
+  }
+
+  // Get the section.
+  const MCSectionMachO *S =
+    getMachOSection(Segment, Section, TAA, StubSize, Kind);
+
+  // Okay, now that we got the section, verify that the TAA & StubSize agree.
+  // If the user declared multiple globals with different section flags, we need
+  // to reject it here.
+  if (S->getTypeAndAttributes() != TAA || S->getStubSize() != StubSize) {
+    // If invalid, report the error with llvm_report_error.
+    llvm_report_error("Global variable '" + GV->getNameStr() +
+                      "' section type or attributes does not match previous"
+                      " section specifier");
+  }
+
+  return S;
+}
+
+/// SelectSectionForGlobal - Choose the Darwin output section for a global
+/// with no explicit section attribute.
+const MCSection *TargetLoweringObjectFileMachO::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+  assert(!Kind.isThreadLocal() && "Darwin doesn't support TLS");
+
+  // Weak functions coalesce; everything else goes straight to __text.
+  if (Kind.isText())
+    return GV->isWeakForLinker() ? TextCoalSection : TextSection;
+
+  // If this is weak/linkonce, put this in a coalescable section, either in text
+  // or data depending on if it is writable.
+  if (GV->isWeakForLinker()) {
+    if (Kind.isReadOnly())
+      return ConstTextCoalSection;
+    return DataCoalSection;
+  }
+
+  // FIXME: Alignment check should be handled by section classifier.
+  // Mergeable strings only fit the __cstring/__ustring model when alignment
+  // is small; over-aligned strings fall through to the generic sections.
+  if (Kind.isMergeable1ByteCString() ||
+      Kind.isMergeable2ByteCString()) {
+    if (TM.getTargetData()->getPreferredAlignment(
+                                              cast<GlobalVariable>(GV)) < 32) {
+      if (Kind.isMergeable1ByteCString())
+        return CStringSection;
+      assert(Kind.isMergeable2ByteCString());
+      return UStringSection;
+    }
+  }
+
+  // Fixed-width literal pools.
+  if (Kind.isMergeableConst()) {
+    if (Kind.isMergeableConst4())
+      return FourByteConstantSection;
+    if (Kind.isMergeableConst8())
+      return EightByteConstantSection;
+    if (Kind.isMergeableConst16() && SixteenByteConstantSection)
+      return SixteenByteConstantSection;
+  }
+
+  // Otherwise, if it is readonly, but not something we can specially optimize,
+  // just drop it in .const.
+  if (Kind.isReadOnly())
+    return ReadOnlySection;
+
+  // If this is marked const, put it into a const section.  But if the dynamic
+  // linker needs to write to it, put it in the data segment.
+  if (Kind.isReadOnlyWithRel())
+    return ConstDataSection;
+
+  // Put zero initialized globals with strong external linkage in the
+  // DATA, __common section with the .zerofill directive.
+  if (Kind.isBSSExtern())
+    return DataCommonSection;
+
+  // Put zero initialized globals with local linkage in __DATA,__bss directive
+  // with the .zerofill directive (aka .lcomm).
+  if (Kind.isBSSLocal())
+    return DataBSSSection;
+
+  // Otherwise, just drop the variable in the normal data section.
+  return DataSection;
+}
+
+/// getSectionForConstant - Pick the Darwin section for a constant of the
+/// given kind.
+const MCSection *
+TargetLoweringObjectFileMachO::getSectionForConstant(SectionKind Kind) const {
+  // Constants the dynamic linker must relocate can't live in the text
+  // segment; they go to __DATA,__const instead.
+  if (Kind.isDataRel() || Kind.isReadOnlyWithRel())
+    return ConstDataSection;
+
+  // Use the fixed-width literal sections when available.
+  if (Kind.isMergeableConst4())
+    return FourByteConstantSection;
+  if (Kind.isMergeableConst8())
+    return EightByteConstantSection;
+  if (Kind.isMergeableConst16() && SixteenByteConstantSection)
+    return SixteenByteConstantSection;
+
+  return ReadOnlySection;  // .const
+}
+
+/// shouldEmitUsedDirectiveFor - This hook allows targets to selectively decide
+/// not to emit the UsedDirective for some symbols in llvm.used.
+// FIXME: REMOVE this (rdar://7071300)
+bool TargetLoweringObjectFileMachO::
+shouldEmitUsedDirectiveFor(const GlobalValue *GV, Mangler *Mang) const {
+  /// On Darwin, internally linked data beginning with "L" or "l" does not have
+  /// the directive emitted (this occurs in ObjC metadata).
+  if (!GV) return false;
+
+  // Check whether the mangled name has the "Private" or "LinkerPrivate" prefix.
+  if (GV->hasLocalLinkage() && !isa<Function>(GV)) {
+    // FIXME: ObjC metadata is currently emitted as internal symbols that have
+    // \1L and \0l prefixes on them.  Fix them to be Private/LinkerPrivate and
+    // this horrible hack can go away.
+    SmallString<64> Name;
+    Mang->getNameWithPrefix(Name, GV, false);
+    // Guard against an empty mangled name before peeking at the first
+    // character: indexing an empty SmallString is out of bounds.
+    if (!Name.empty() && (Name[0] == 'L' || Name[0] == 'l'))
+      return false;
+  }
+
+  return true;
+}
+
+/// getSymbolForDwarfGlobalReference - Darwin refers to EH globals through a
+/// $non_lazy_ptr stub, so the returned symbol is indirect (but not
+/// pc-relative).
+const MCExpr *TargetLoweringObjectFileMachO::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                                 MachineModuleInfo *MMI,
+                                 bool &IsIndirect, bool &IsPCRel) const {
+  IsPCRel    = false;
+  IsIndirect = true;
+
+  // Build "<mangled name>$non_lazy_ptr" and reference that symbol.
+  SmallString<128> StubName;
+  Mang->getNameWithPrefix(StubName, GV, true);
+  StubName += "$non_lazy_ptr";
+  return MCSymbolRefExpr::Create(StubName.str(), getContext());
+}
+
+
+//===----------------------------------------------------------------------===//
+//                                  COFF
+//===----------------------------------------------------------------------===//
+
+typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy;
+
+TargetLoweringObjectFileCOFF::~TargetLoweringObjectFileCOFF() {
+  // Free the lazily-created section uniquing map, if any.
+  COFFUniqueMapTy *Map = (COFFUniqueMapTy*)UniquingMap;
+  delete Map;
+}
+
+
+/// getCOFFSection - Return the uniqued MCSectionCOFF with the given name,
+/// creating and caching it on first use.
+const MCSection *TargetLoweringObjectFileCOFF::
+getCOFFSection(StringRef Name, bool isDirective, SectionKind Kind) const {
+  // Create the map if it doesn't already exist.  This must allocate a
+  // COFFUniqueMapTy: the previous code allocated a MachOUniqueMapTy here,
+  // which the destructor then deleted through the wrong type.
+  if (UniquingMap == 0)
+    UniquingMap = new COFFUniqueMapTy();
+  COFFUniqueMapTy &Map = *(COFFUniqueMapTy*)UniquingMap;
+
+  // Do the lookup, if we have a hit, return it.
+  const MCSectionCOFF *&Entry = Map[Name];
+  if (Entry) return Entry;
+
+  return Entry = MCSectionCOFF::Create(Name, isDirective, Kind, getContext());
+}
+
+/// Initialize - Create the standard COFF sections up front.
+void TargetLoweringObjectFileCOFF::Initialize(MCContext &Ctx,
+                                              const TargetMachine &TM) {
+  // On re-initialization, drop sections uniqued against a previous context.
+  if (UniquingMap != 0)
+    ((COFFUniqueMapTy*)UniquingMap)->clear();
+  TargetLoweringObjectFile::Initialize(Ctx, TM);
+  // "\t.text"/"\t.data" are emitted as raw directives (isDirective=true).
+  TextSection = getCOFFSection("\t.text", true, SectionKind::getText());
+  DataSection = getCOFFSection("\t.data", true, SectionKind::getDataRel());
+  StaticCtorSection =
+    getCOFFSection(".ctors", false, SectionKind::getDataRel());
+  StaticDtorSection =
+    getCOFFSection(".dtors", false, SectionKind::getDataRel());
+
+  // FIXME: We're emitting LSDA info into a readonly section on COFF, even
+  // though it contains relocatable pointers.  In PIC mode, this is probably a
+  // big runtime hit for C++ apps.  Either the contents of the LSDA need to be
+  // adjusted or this should be a data section.
+  LSDASection =
+    getCOFFSection(".gcc_except_table", false, SectionKind::getReadOnly());
+  EHFrameSection =
+    getCOFFSection(".eh_frame", false, SectionKind::getDataRel());
+
+  // Debug info.
+  // FIXME: Don't use 'directive' mode here.
+  DwarfAbbrevSection =
+    getCOFFSection("\t.section\t.debug_abbrev,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfInfoSection =
+    getCOFFSection("\t.section\t.debug_info,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfLineSection =
+    getCOFFSection("\t.section\t.debug_line,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfFrameSection =
+    getCOFFSection("\t.section\t.debug_frame,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfPubNamesSection =
+    getCOFFSection("\t.section\t.debug_pubnames,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfPubTypesSection =
+    getCOFFSection("\t.section\t.debug_pubtypes,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfStrSection =
+    getCOFFSection("\t.section\t.debug_str,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfLocSection =
+    getCOFFSection("\t.section\t.debug_loc,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfARangesSection =
+    getCOFFSection("\t.section\t.debug_aranges,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfRangesSection =
+    getCOFFSection("\t.section\t.debug_ranges,\"dr\"",
+                   true, SectionKind::getMetadata());
+  DwarfMacroInfoSection =
+    getCOFFSection("\t.section\t.debug_macinfo,\"dr\"",
+                   true, SectionKind::getMetadata());
+}
+
+/// getExplicitSectionGlobal - On COFF an explicit section attribute maps
+/// straight onto a (non-directive) section of that name.
+const MCSection *TargetLoweringObjectFileCOFF::
+getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind,
+                         Mangler *Mang, const TargetMachine &TM) const {
+  StringRef Name = GV->getSection();
+  return getCOFFSection(Name, /*isDirective=*/false, Kind);
+}
+
+/// getCOFFSectionPrefixForUniqueGlobal - Section-name prefix used when a
+/// linkonce global is emitted into its own uniquely named COFF section.
+static const char *getCOFFSectionPrefixForUniqueGlobal(SectionKind Kind) {
+  if (Kind.isText())
+    return ".text$linkonce";
+  // Writable data, otherwise read-only data.
+  return Kind.isWriteable() ? ".data$linkonce" : ".rdata$linkonce";
+}
+
+
+/// SelectSectionForGlobal - Choose the COFF output section for a global with
+/// no explicit section attribute.
+const MCSection *TargetLoweringObjectFileCOFF::
+SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
+                       Mangler *Mang, const TargetMachine &TM) const {
+  assert(!Kind.isThreadLocal() && "Doesn't support TLS");
+
+  // Linkonce/weak globals each get a uniquely named section so the linker
+  // can coalesce duplicate definitions.
+  if (GV->isWeakForLinker()) {
+    const char *Prefix = getCOFFSectionPrefixForUniqueGlobal(Kind);
+    SmallString<128> UniqueName;
+    UniqueName.append(Prefix, Prefix+strlen(Prefix));
+    Mang->getNameWithPrefix(UniqueName, GV, false);
+    return getCOFFSection(UniqueName.str(), false, Kind);
+  }
+
+  // Otherwise only text vs. data is distinguished here.
+  if (Kind.isText())
+    return getTextSection();
+  return getDataSection();
+}
+
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
new file mode 100644
index 0000000..88871e3
--- /dev/null
+++ b/lib/Target/TargetMachine.cpp
@@ -0,0 +1,265 @@
+//===-- TargetMachine.cpp - General Target Information ---------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// Command-line options that tend to be useful on more than one back-end.
+//
+
+// Backing storage for the codegen flags.  Each variable below is the
+// cl::location target of one of the command-line options defined later in
+// this file, so the rest of codegen can read these globals directly without
+// depending on the CommandLine machinery.
+namespace llvm {
+  bool LessPreciseFPMADOption;
+  bool PrintMachineCode;
+  bool NoFramePointerElim;
+  bool NoExcessFPPrecision;
+  bool UnsafeFPMath;
+  bool FiniteOnlyFPMathOption;
+  bool HonorSignDependentRoundingFPMathOption;
+  bool UseSoftFloat;
+  FloatABI::ABIType FloatABIType;
+  bool NoImplicitFloat;
+  bool NoZerosInBSS;
+  bool DwarfExceptionHandling;
+  bool SjLjExceptionHandling;
+  bool JITEmitDebugInfo;
+  bool JITEmitDebugInfoToDisk;
+  bool UnwindTablesMandatory;
+  Reloc::Model RelocationModel;
+  CodeModel::Model CMModel;
+  bool GuaranteedTailCallOpt;
+  unsigned StackAlignment;
+  bool RealignStack;
+  bool DisableJumpTables;
+  bool StrongPHIElim;
+  // Not exposed as a cl::opt; set via TargetMachine::setAsmVerbosityDefault().
+  bool AsmVerbosityDefault(false);
+}
+
+// The command-line option definitions.  Each cl::opt<T, true> writes through
+// cl::location(...) into the matching global declared above.
+static cl::opt<bool, true>
+PrintCode("print-machineinstrs",
+  cl::desc("Print generated machine code"),
+  cl::location(PrintMachineCode), cl::init(false));
+static cl::opt<bool, true>
+DisableFPElim("disable-fp-elim",
+  cl::desc("Disable frame pointer elimination optimization"),
+  cl::location(NoFramePointerElim),
+  cl::init(false));
+static cl::opt<bool, true>
+DisableExcessPrecision("disable-excess-fp-precision",
+  cl::desc("Disable optimizations that may increase FP precision"),
+  cl::location(NoExcessFPPrecision),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableFPMAD("enable-fp-mad",
+  cl::desc("Enable less precise MAD instructions to be generated"),
+  cl::location(LessPreciseFPMADOption),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableUnsafeFPMath("enable-unsafe-fp-math",
+  cl::desc("Enable optimizations that may decrease FP precision"),
+  cl::location(UnsafeFPMath),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableFiniteOnlyFPMath("enable-finite-only-fp-math",
+  cl::desc("Enable optimizations that assumes non- NaNs / +-Infs"),
+  cl::location(FiniteOnlyFPMathOption),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableHonorSignDependentRoundingFPMath("enable-sign-dependent-rounding-fp-math",
+  cl::Hidden,
+  cl::desc("Force codegen to assume rounding mode can change dynamically"),
+  cl::location(HonorSignDependentRoundingFPMathOption),
+  cl::init(false));
+static cl::opt<bool, true>
+GenerateSoftFloatCalls("soft-float",
+  cl::desc("Generate software floating point library calls"),
+  cl::location(UseSoftFloat),
+  cl::init(false));
+static cl::opt<llvm::FloatABI::ABIType, true>
+FloatABIForCalls("float-abi",
+  cl::desc("Choose float ABI type"),
+  cl::location(FloatABIType),
+  cl::init(FloatABI::Default),
+  cl::values(
+    clEnumValN(FloatABI::Default, "default",
+               "Target default float ABI type"),
+    clEnumValN(FloatABI::Soft, "soft",
+               "Soft float ABI (implied by -soft-float)"),
+    clEnumValN(FloatABI::Hard, "hard",
+               "Hard float ABI (uses FP registers)"),
+    clEnumValEnd));
+static cl::opt<bool, true>
+DontPlaceZerosInBSS("nozero-initialized-in-bss",
+  cl::desc("Don't place zero-initialized symbols into bss section"),
+  cl::location(NoZerosInBSS),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableDwarfExceptionHandling("enable-eh",
+  cl::desc("Emit DWARF exception handling (default if target supports)"),
+  cl::location(DwarfExceptionHandling),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableSjLjExceptionHandling("enable-sjlj-eh",
+  cl::desc("Emit SJLJ exception handling (default if target supports)"),
+  cl::location(SjLjExceptionHandling),
+  cl::init(false));
+// In debug builds, make this default to true.
+#ifdef NDEBUG
+#define EMIT_DEBUG false
+#else
+#define EMIT_DEBUG true
+#endif
+static cl::opt<bool, true>
+EmitJitDebugInfo("jit-emit-debug",
+  cl::desc("Emit debug information to debugger"),
+  cl::location(JITEmitDebugInfo),
+  cl::init(EMIT_DEBUG))
;
+#undef EMIT_DEBUG
+static cl::opt<bool, true>
+EmitJitDebugInfoToDisk("jit-emit-debug-to-disk",
+  cl::Hidden,
+  cl::desc("Emit debug info objfiles to disk"),
+  cl::location(JITEmitDebugInfoToDisk),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableUnwindTables("unwind-tables",
+  cl::desc("Generate unwinding tables for all functions"),
+  cl::location(UnwindTablesMandatory),
+  cl::init(false));
+
+static cl::opt<llvm::Reloc::Model, true>
+DefRelocationModel("relocation-model",
+  cl::desc("Choose relocation model"),
+  cl::location(RelocationModel),
+  cl::init(Reloc::Default),
+  cl::values(
+    clEnumValN(Reloc::Default, "default",
+               "Target default relocation model"),
+    clEnumValN(Reloc::Static, "static",
+               "Non-relocatable code"),
+    clEnumValN(Reloc::PIC_, "pic",
+               "Fully relocatable, position independent code"),
+    clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
+               "Relocatable external references, non-relocatable code"),
+    clEnumValEnd));
+static cl::opt<llvm::CodeModel::Model, true>
+DefCodeModel("code-model",
+  cl::desc("Choose code model"),
+  cl::location(CMModel),
+  cl::init(CodeModel::Default),
+  cl::values(
+    clEnumValN(CodeModel::Default, "default",
+               "Target default code model"),
+    clEnumValN(CodeModel::Small, "small",
+               "Small code model"),
+    clEnumValN(CodeModel::Kernel, "kernel",
+               "Kernel code model"),
+    clEnumValN(CodeModel::Medium, "medium",
+               "Medium code model"),
+    clEnumValN(CodeModel::Large, "large",
+               "Large code model"),
+    clEnumValEnd));
+static cl::opt<bool, true>
+EnableGuaranteedTailCallOpt("tailcallopt",
+  cl::desc("Turn fastcc calls into tail calls by (potentially) changing ABI."),
+  cl::location(GuaranteedTailCallOpt),
+  cl::init(false));
+static cl::opt<unsigned, true>
+OverrideStackAlignment("stack-alignment",
+  cl::desc("Override default stack alignment"),
+  cl::location(StackAlignment),
+  cl::init(0));
+static cl::opt<bool, true>
+EnableRealignStack("realign-stack",
+  cl::desc("Realign stack if needed"),
+  cl::location(RealignStack),
+  cl::init(true));
+static cl::opt<bool, true>
+DisableSwitchTables(cl::Hidden, "disable-jump-tables", 
+  cl::desc("Do not generate jump tables."),
+  cl::location(DisableJumpTables),
+  cl::init(false));
+static cl::opt<bool, true>
+EnableStrongPHIElim(cl::Hidden, "strong-phi-elim",
+  cl::desc("Use strong PHI elimination."),
+  cl::location(StrongPHIElim),
+  cl::init(false));
+
+//---------------------------------------------------------------------------
+// TargetMachine Class
+//
+
+/// TargetMachine - Construct a machine for the given Target.  AsmInfo starts
+/// out null; concrete subclasses are responsible for creating it.
+TargetMachine::TargetMachine(const Target &T) 
+  : TheTarget(T), AsmInfo(0) {
+  // Typically it will be subtargets that will adjust FloatABIType from Default
+  // to Soft or Hard.
+  if (UseSoftFloat)
+    FloatABIType = FloatABI::Soft;
+}
+
+/// ~TargetMachine - The TargetMachine owns its MCAsmInfo (if one was set).
+TargetMachine::~TargetMachine() {
+  delete AsmInfo;
+}
+
+/// getRelocationModel - Returns the code generation relocation model. The
+/// choices are static, PIC, and dynamic-no-pic, and target default.
+Reloc::Model TargetMachine::getRelocationModel() {
+  return RelocationModel;
+}
+
+/// setRelocationModel - Sets the code generation relocation model.
+void TargetMachine::setRelocationModel(Reloc::Model Model) {
+  RelocationModel = Model;
+}
+
+/// getCodeModel - Returns the code model. The choices are small, kernel,
+/// medium, large, and target default.
+CodeModel::Model TargetMachine::getCodeModel() {
+  return CMModel;
+}
+
+/// setCodeModel - Sets the code model.
+void TargetMachine::setCodeModel(CodeModel::Model Model) {
+  CMModel = Model;
+}
+
+/// getAsmVerbosityDefault - Returns the default verbosity setting used when
+/// emitting assembly.  Backed by a file-level global, not a per-instance
+/// member.
+bool TargetMachine::getAsmVerbosityDefault() {
+  return AsmVerbosityDefault;
+}
+
+/// setAsmVerbosityDefault - Sets the default asm verbosity (process-wide).
+void TargetMachine::setAsmVerbosityDefault(bool V) {
+  AsmVerbosityDefault = V;
+}
+
+// Query helpers for the FP-math flags.  Note that -enable-unsafe-fp-math
+// implies both LessPreciseFPMAD() and FiniteOnlyFPMath(), and suppresses
+// HonorSignDependentRoundingFPMath().
+namespace llvm {
+  /// LessPreciseFPMAD - This flag returns true when the -enable-fp-mad option
+  /// is specified on the command line.  When this flag is off(default), the
+  /// code generator is not allowed to generate mad (multiply add) if the
+  /// result is "less precise" than doing those operations individually.
+  bool LessPreciseFPMAD() { return UnsafeFPMath || LessPreciseFPMADOption; }
+
+  /// FiniteOnlyFPMath - This returns true when the -enable-finite-only-fp-math
+  /// option is specified on the command line. If this returns false (default),
+  /// the code generator is not allowed to assume that FP arithmetic arguments
+  /// and results are never NaNs or +-Infs.
+  bool FiniteOnlyFPMath() { return UnsafeFPMath || FiniteOnlyFPMathOption; }
+  
+  /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
+  /// that the rounding mode of the FPU can change from its default.
+  bool HonorSignDependentRoundingFPMath() {
+    return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
+  }
+}
diff --git a/lib/Target/TargetRegisterInfo.cpp b/lib/Target/TargetRegisterInfo.cpp
new file mode 100644
index 0000000..52983ff
--- /dev/null
+++ b/lib/Target/TargetRegisterInfo.cpp
@@ -0,0 +1,145 @@
+//===- TargetRegisterInfo.cpp - Target Register Information Implementation ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetRegisterInfo interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/ADT/BitVector.h"
+
+using namespace llvm;
+
+/// TargetRegisterInfo ctor - Records the register descriptor table, the
+/// register-class range, the call-frame pseudo opcodes, and the precomputed
+/// subreg/superreg/alias lookup tables.
+/// NOTE(review): the three table pointers look like tablegen-emitted hash
+/// tables — confirm against the generated *RegisterInfo.inc files.
+TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterDesc *D, unsigned NR,
+                             regclass_iterator RCB, regclass_iterator RCE,
+                             int CFSO, int CFDO,
+                             const unsigned* subregs, const unsigned subregsize,
+                         const unsigned* superregs, const unsigned superregsize,
+                         const unsigned* aliases, const unsigned aliasessize)
+  : SubregHash(subregs), SubregHashSize(subregsize),
+    SuperregHash(superregs), SuperregHashSize(superregsize),
+    AliasesHash(aliases), AliasesHashSize(aliasessize),
+    Desc(D), NumRegs(NR), RegClassBegin(RCB), RegClassEnd(RCE) {
+  // Physical register numbers must all fit below the virtual-register range.
+  assert(NumRegs < FirstVirtualRegister &&
+         "Target has too many physical registers!");
+
+  CallFrameSetupOpcode   = CFSO;
+  CallFrameDestroyOpcode = CFDO;
+}
+
+TargetRegisterInfo::~TargetRegisterInfo() {}
+
+/// getPhysicalRegisterRegClass - Returns the Register Class of a physical
+/// register of the given type. If type is MVT::Other, then just return any
+/// register class the register belongs to.
+const TargetRegisterClass *
+TargetRegisterInfo::getPhysicalRegisterRegClass(unsigned reg, EVT VT) const {
+  assert(isPhysicalRegister(reg) && "reg must be a physical register");
+
+  // Pick the most super register class of the right type that contains
+  // this physreg.  A class is only a candidate if it has the requested type
+  // (or any type was requested) and actually contains the register.
+  const TargetRegisterClass* BestRC = 0;
+  for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){
+    const TargetRegisterClass* RC = *I;
+    if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
+        (!BestRC || BestRC->hasSuperClass(RC)))
+      BestRC = RC;
+  }
+
+  assert(BestRC && "Couldn't find the register class");
+  return BestRC;
+}
+
+/// getAllocatableSetForRC - Set the bits that represent allocatable
+/// registers for the specific register class.  Bits are only ever set, never
+/// cleared, so repeated calls accumulate into R.
+static void getAllocatableSetForRC(const MachineFunction &MF,
+                                   const TargetRegisterClass *RC, BitVector &R){  
+  for (TargetRegisterClass::iterator I = RC->allocation_order_begin(MF),
+         E = RC->allocation_order_end(MF); I != E; ++I)
+    R.set(*I);
+}
+
+/// getAllocatableSet - Returns a bitset (indexed by register number) of the
+/// registers allocatable in MF.  If RC is non-null, only that class's
+/// allocation order is considered; otherwise the union over all classes.
+BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
+                                          const TargetRegisterClass *RC) const {
+  BitVector Allocatable(NumRegs);
+  if (RC) {
+    getAllocatableSetForRC(MF, RC, Allocatable);
+    return Allocatable;
+  }
+
+  for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
+         E = regclass_end(); I != E; ++I)
+    getAllocatableSetForRC(MF, *I, Allocatable);
+  return Allocatable;
+}
+
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index. This is the default implementation
+/// which is overridden for some targets.
+int TargetRegisterInfo::getFrameIndexOffset(const MachineFunction &MF,
+                                            int FI) const {
+  const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Object offset, rebased past the whole frame and the local-area origin,
+  // plus any target-specific adjustment recorded in the frame info.
+  return MFI->getObjectOffset(FI) + MFI->getStackSize() -
+    TFI.getOffsetOfLocalArea() + MFI->getOffsetAdjustment();
+}
+
+/// getInitialFrameState - Returns a list of machine moves that are assumed
+/// on entry to a function.  The default adds nothing; targets override this
+/// (presumably to describe the initial call-frame state — verify per target).
+void
+TargetRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const{
+  // Default is to do nothing.
+}
+
+/// getCommonSubClass - Return the largest common subclass of register classes
+/// A and B, preferring (on ties in super/sub relation) the class with more
+/// registers, then the smaller spill size.  Returns null if none exists.
+const TargetRegisterClass *
+llvm::getCommonSubClass(const TargetRegisterClass *A,
+                        const TargetRegisterClass *B) {
+  // First take care of the trivial cases
+  if (A == B)
+    return A;
+  if (!A || !B)
+    return 0;
+
+  // If B is a subclass of A, it will be handled in the loop below
+  if (B->hasSubClass(A))
+    return A;
+
+  const TargetRegisterClass *Best = 0;
+  // sc_iterator is null-terminated: iterate until *I yields a null class.
+  for (TargetRegisterClass::sc_iterator I = A->subclasses_begin();
+       const TargetRegisterClass *X = *I; ++I) {
+    if (X == B)
+      return B;                 // B is a subclass of A
+
+    // X must be a common subclass of A and B
+    if (!B->hasSubClass(X))
+      continue;
+
+    // A superclass is definitely better.
+    if (!Best || Best->hasSuperClass(X)) {
+      Best = X;
+      continue;
+    }
+
+    // A subclass is definitely worse
+    if (Best->hasSubClass(X))
+      continue;
+
+    // Best and X have no super/sub class relation - pick the larger class, or
+    // the smaller spill size.
+    int nb = std::distance(Best->begin(), Best->end());
+    int ni = std::distance(X->begin(), X->end());
+    if (ni>nb || (ni==nb && X->getSize() < Best->getSize()))
+      Best = X;
+  }
+  return Best;
+}
diff --git a/lib/Target/TargetSubtarget.cpp b/lib/Target/TargetSubtarget.cpp
new file mode 100644
index 0000000..edb76f9
--- /dev/null
+++ b/lib/Target/TargetSubtarget.cpp
@@ -0,0 +1,33 @@
+//===-- TargetSubtarget.cpp - General Target Information -------------------==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the general parts of a Subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+//---------------------------------------------------------------------------
+// TargetSubtarget Class
+//
+TargetSubtarget::TargetSubtarget() {}
+
+TargetSubtarget::~TargetSubtarget() {}
+
+/// enablePostRAScheduler - Default implementation: post-RA scheduling is off.
+/// Reports no anti-dependence breaking and an empty critical-path class list;
+/// subtargets override this to opt in.
+bool TargetSubtarget::enablePostRAScheduler(
+          CodeGenOpt::Level OptLevel,
+          AntiDepBreakMode& Mode,
+          RegClassVector& CriticalPathRCs) const {
+  Mode = ANTIDEP_NONE;
+  CriticalPathRCs.clear();
+  return false;
+}
+
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
new file mode 100644
index 0000000..40dbdd7
--- /dev/null
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86AsmParser
+  X86AsmLexer.cpp
+  X86AsmParser.cpp
+  )
+add_dependencies(LLVMX86AsmParser X86CodeGenTable_gen)
diff --git a/lib/Target/X86/AsmParser/Makefile b/lib/Target/X86/AsmParser/Makefile
new file mode 100644
index 0000000..25fb0a2
--- /dev/null
+++ b/lib/Target/X86/AsmParser/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/AsmParser/Makefile -------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86AsmParser
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
new file mode 100644
index 0000000..a58f58e
--- /dev/null
+++ b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
@@ -0,0 +1,147 @@
+//===-- X86AsmLexer.cpp - Tokenize X86 assembly to AsmTokens --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Target/TargetAsmLexer.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "X86.h"
+
+using namespace llvm;
+
+namespace {
+  
+// X86AsmLexer - Wraps the generic MCAsmLexer and rewrites its token stream
+// into x86-specific tokens (notably Register tokens), dispatching on the
+// assembler dialect recorded in the MCAsmInfo.
+class X86AsmLexer : public TargetAsmLexer {
+  const MCAsmInfo &AsmInfo;
+  
+  // One-token lookahead buffer: lexTentative() peeks a token; lexDefinite()
+  // consumes the peeked token if present, otherwise lexes a fresh one.
+  bool tentativeIsValid;
+  AsmToken tentativeToken;
+  
+  const AsmToken &lexTentative() {
+    tentativeToken = getLexer()->Lex();
+    tentativeIsValid = true;
+    return tentativeToken;
+  }
+  
+  const AsmToken &lexDefinite() {
+    if(tentativeIsValid) {
+      tentativeIsValid = false;
+      return tentativeToken;
+    }
+    else {
+      return getLexer()->Lex();
+    }
+  }
+  
+  AsmToken LexTokenATT();
+  AsmToken LexTokenIntel();
+protected:
+  // LexToken - Produce the next token, selecting the dialect-specific lexer
+  // (0 = AT&T, 1 = Intel); errors out if no underlying lexer is installed.
+  AsmToken LexToken() {
+    if (!Lexer) {
+      SetError(SMLoc(), "No MCAsmLexer installed");
+      return AsmToken(AsmToken::Error, "", 0);
+    }
+    
+    switch (AsmInfo.getAssemblerDialect()) {
+    default:
+      SetError(SMLoc(), "Unhandled dialect");
+      return AsmToken(AsmToken::Error, "", 0);
+    case 0:
+      return LexTokenATT();
+    case 1:
+      return LexTokenIntel();
+    }
+  }
+public:
+  X86AsmLexer(const Target &T, const MCAsmInfo &MAI)
+    : TargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) {
+  }
+};
+
+}
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// LexTokenATT - AT&T-dialect token rewriting: when a '%' is immediately
+/// followed by a known register identifier, fuse the two tokens into one
+/// Register token carrying the register number.
+AsmToken X86AsmLexer::LexTokenATT() {
+  const AsmToken lexedToken = lexDefinite();
+  
+  switch (lexedToken.getKind()) {
+  default:
+    return AsmToken(lexedToken);
+  case AsmToken::Error:
+    // Propagate the underlying lexer's diagnostic.
+    SetError(Lexer->getErrLoc(), Lexer->getErr());
+    return AsmToken(lexedToken);
+  case AsmToken::Percent:
+  {
+    const AsmToken &nextToken = lexTentative();
+    if (nextToken.getKind() == AsmToken::Identifier) {
+      unsigned regID = MatchRegisterName(nextToken.getString());
+      
+      if (regID) {
+        // Commit the peeked identifier token.
+        lexDefinite();
+        
+        // Build a StringRef covering "%<reg>".  NOTE(review): this assumes
+        // the '%' and identifier tokens are adjacent in the source buffer —
+        // confirm against the MCAsmLexer tokenization.
+        StringRef regStr(lexedToken.getString().data(),
+                         lexedToken.getString().size() + 
+                         nextToken.getString().size());
+        
+        return AsmToken(AsmToken::Register, 
+                        regStr, 
+                        static_cast<int64_t>(regID));
+      }
+      else {
+        // Unknown identifier: return the bare '%'; the peeked token stays
+        // buffered for the next call.
+        return AsmToken(lexedToken);
+      }
+    }
+    else {
+      return AsmToken(lexedToken);
+    }
+  }    
+  }
+}
+
+/// LexTokenIntel - Intel-dialect token rewriting: identifiers are matched
+/// against register names case-insensitively (by lower-casing first) and
+/// converted to Register tokens when they match.
+AsmToken X86AsmLexer::LexTokenIntel() {
+  const AsmToken &lexedToken = lexDefinite();
+  
+  switch(lexedToken.getKind()) {
+  default:
+    return AsmToken(lexedToken);
+  case AsmToken::Error:
+    // Propagate the underlying lexer's diagnostic.
+    SetError(Lexer->getErrLoc(), Lexer->getErr());
+    return AsmToken(lexedToken);
+  case AsmToken::Identifier:
+  {
+    std::string upperCase = lexedToken.getString().str();
+    std::string lowerCase = LowercaseString(upperCase);
+    StringRef lowerRef(lowerCase);
+    
+    unsigned regID = MatchRegisterName(lowerRef);
+    
+    if (regID) {
+      // Keep the original (possibly mixed-case) spelling as the token text.
+      return AsmToken(AsmToken::Register,
+                      lexedToken.getString(),
+                      static_cast<int64_t>(regID));
+    }
+    else {
+      return AsmToken(lexedToken);
+    }
+  }
+  }
+}
+
+// LLVMInitializeX86AsmLexer - Register this lexer with the TargetRegistry for
+// both the 32- and 64-bit x86 targets.  'extern "C"' so the registry can find
+// it by name.
+extern "C" void LLVMInitializeX86AsmLexer() {
+  RegisterAsmLexer<X86AsmLexer> X(TheX86_32Target);
+  RegisterAsmLexer<X86AsmLexer> Y(TheX86_64Target);
+}
+
+#define REGISTERS_ONLY
+#include "X86GenAsmMatcher.inc"
+#undef REGISTERS_ONLY
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
new file mode 100644
index 0000000..acf497a
--- /dev/null
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -0,0 +1,584 @@
+//===-- X86AsmParser.cpp - Parse X86 assembly to MCInst instructions ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetAsmParser.h"
+#include "X86.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Target/TargetAsmParser.h"
+using namespace llvm;
+
+namespace {
+struct X86Operand;
+
+// X86ATTAsmParser - Parses AT&T-syntax x86 assembly into MCInsts.  Operand
+// parsing is hand-written below; instruction matching (MatchInstruction) and
+// register-name matching are tablegen-generated.  Warning/Error forward to
+// the wrapped MCAsmParser.
+class X86ATTAsmParser : public TargetAsmParser {
+  MCAsmParser &Parser;
+
+private:
+  MCAsmParser &getParser() const { return Parser; }
+
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+
+  X86Operand *ParseOperand();
+  X86Operand *ParseMemOperand();
+
+  bool ParseDirectiveWord(unsigned Size, SMLoc L);
+
+  /// @name Auto-generated Match Functions
+  /// {  
+
+  bool MatchInstruction(const SmallVectorImpl<MCParsedAsmOperand*> &Operands,
+                        MCInst &Inst);
+
+  /// }
+
+public:
+  X86ATTAsmParser(const Target &T, MCAsmParser &_Parser)
+    : TargetAsmParser(T), Parser(_Parser) {}
+
+  virtual bool ParseInstruction(const StringRef &Name, SMLoc NameLoc,
+                                SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+
+  virtual bool ParseDirective(AsmToken DirectiveID);
+};
+  
+} // end anonymous namespace
+
+/// @name Auto-generated Match Functions
+/// {  
+
+static unsigned MatchRegisterName(StringRef Name);
+
+/// }
+
+namespace {
+
+/// X86Operand - Instances of this class represent a parsed X86 machine
+/// instruction.
+struct X86Operand : public MCParsedAsmOperand {
+  // Discriminator for the union below; exactly one union member is valid,
+  // selected by Kind.
+  enum KindTy {
+    Token,
+    Register,
+    Immediate,
+    Memory
+  } Kind;
+
+  SMLoc StartLoc, EndLoc;
+  
+  union {
+    struct {
+      const char *Data;
+      unsigned Length;
+    } Tok;
+
+    struct {
+      unsigned RegNo;
+    } Reg;
+
+    struct {
+      const MCExpr *Val;
+    } Imm;
+
+    struct {
+      unsigned SegReg;
+      const MCExpr *Disp;
+      unsigned BaseReg;
+      unsigned IndexReg;
+      unsigned Scale;
+    } Mem;
+  };
+
+  X86Operand(KindTy K, SMLoc Start, SMLoc End)
+    : Kind(K), StartLoc(Start), EndLoc(End) {}
+  
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const { return EndLoc; }
+
+  StringRef getToken() const {
+    assert(Kind == Token && "Invalid access!");
+    return StringRef(Tok.Data, Tok.Length);
+  }
+
+  unsigned getReg() const {
+    assert(Kind == Register && "Invalid access!");
+    return Reg.RegNo;
+  }
+
+  const MCExpr *getImm() const {
+    assert(Kind == Immediate && "Invalid access!");
+    return Imm.Val;
+  }
+
+  const MCExpr *getMemDisp() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Disp;
+  }
+  unsigned getMemSegReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.SegReg;
+  }
+  unsigned getMemBaseReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.BaseReg;
+  }
+  unsigned getMemIndexReg() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.IndexReg;
+  }
+  unsigned getMemScale() const {
+    assert(Kind == Memory && "Invalid access!");
+    return Mem.Scale;
+  }
+
+  bool isToken() const {return Kind == Token; }
+
+  bool isImm() const { return Kind == Immediate; }
+  
+  bool isImmSExt8() const { 
+    // Accept immediates which fit in 8 bits when sign extended, and
+    // non-absolute immediates.  (A non-constant expression is accepted
+    // optimistically here, since its value is not known yet.)
+    if (!isImm())
+      return false;
+
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+      int64_t Value = CE->getValue();
+      return Value == (int64_t) (int8_t) Value;
+    }
+
+    return true;
+  }
+  
+  bool isMem() const { return Kind == Memory; }
+
+  // isAbsMem - A memory operand that is nothing but a displacement.
+  bool isAbsMem() const {
+    return Kind == Memory && !getMemSegReg() && !getMemBaseReg() &&
+      !getMemIndexReg() && getMemScale() == 1;
+  }
+
+  // isNoSegMem - A memory operand with no segment-register override.
+  bool isNoSegMem() const {
+    return Kind == Memory && !getMemSegReg();
+  }
+
+  bool isReg() const { return Kind == Register; }
+
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
+
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateExpr(getImm()));
+  }
+
+  void addImmSExt8Operands(MCInst &Inst, unsigned N) const {
+    // FIXME: Support user customization of the render method.
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateExpr(getImm()));
+  }
+
+  // addMemOperands - Emit the full 5-operand memory form in the order
+  // base, scale, index, displacement, segment.
+  void addMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 5) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+    Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+    Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+    Inst.addOperand(MCOperand::CreateReg(getMemSegReg()));
+  }
+
+  void addAbsMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 1) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+  }
+
+  void addNoSegMemOperands(MCInst &Inst, unsigned N) const {
+    assert((N == 4) && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getMemBaseReg()));
+    Inst.addOperand(MCOperand::CreateImm(getMemScale()));
+    Inst.addOperand(MCOperand::CreateReg(getMemIndexReg()));
+    Inst.addOperand(MCOperand::CreateExpr(getMemDisp()));
+  }
+
+  // CreateToken - Make a Token operand; Str must outlive the operand (the
+  // bytes are referenced, not copied).
+  static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
+    X86Operand *Res = new X86Operand(Token, Loc, Loc);
+    Res->Tok.Data = Str.data();
+    Res->Tok.Length = Str.size();
+    return Res;
+  }
+
+  static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc) {
+    X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
+    Res->Reg.RegNo = RegNo;
+    return Res;
+  }
+
+  static X86Operand *CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc){
+    X86Operand *Res = new X86Operand(Immediate, StartLoc, EndLoc);
+    Res->Imm.Val = Val;
+    return Res;
+  }
+
+  /// Create an absolute memory operand.
+  static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc,
+                               SMLoc EndLoc) {
+    X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = 0;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = 0;
+    Res->Mem.IndexReg = 0;
+    Res->Mem.Scale    = 1;
+    return Res;
+  }
+
+  /// Create a generalized memory operand.
+  static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp,
+                               unsigned BaseReg, unsigned IndexReg,
+                               unsigned Scale, SMLoc StartLoc, SMLoc EndLoc) {
+    // We should never just have a displacement, that should be parsed as an
+    // absolute memory operand.
+    assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+
+    // The scale should always be one of {1,2,4,8}.
+    assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
+           "Invalid scale!");
+    X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
+    Res->Mem.SegReg   = SegReg;
+    Res->Mem.Disp     = Disp;
+    Res->Mem.BaseReg  = BaseReg;
+    Res->Mem.IndexReg = IndexReg;
+    Res->Mem.Scale    = Scale;
+    return Res;
+  }
+};
+
+} // end anonymous namespace.
+
+
+// ParseRegister - Parse a '%'-prefixed register reference.  On success
+// returns false with RegNo/StartLoc/EndLoc filled in and the tokens
+// consumed; on failure emits a diagnostic and returns true (MC parser
+// convention: true == error).
+bool X86ATTAsmParser::ParseRegister(unsigned &RegNo,
+                                    SMLoc &StartLoc, SMLoc &EndLoc) {
+  RegNo = 0;
+  const AsmToken &TokPercent = Parser.getTok();
+  assert(TokPercent.is(AsmToken::Percent) && "Invalid token kind!");
+  StartLoc = TokPercent.getLoc();
+  Parser.Lex(); // Eat percent token.
+
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.isNot(AsmToken::Identifier))
+    return Error(Tok.getLoc(), "invalid register name");
+
+  // FIXME: Validate register for the current architecture; we have to do
+  // validation later, so maybe there is no need for this here.
+  RegNo = MatchRegisterName(Tok.getString());
+  
+  // Parse %st(1) and "%st" as "%st(0)"
+  if (RegNo == 0 && Tok.getString() == "st") {
+    RegNo = X86::ST0;
+    EndLoc = Tok.getLoc();
+    Parser.Lex(); // Eat 'st'
+    
+    // Check to see if we have '(4)' after %st.
+    if (getLexer().isNot(AsmToken::LParen))
+      return false;
+    // Lex the paren.
+    getParser().Lex();
+
+    // Map the stack index onto the corresponding STi register.
+    const AsmToken &IntTok = Parser.getTok();
+    if (IntTok.isNot(AsmToken::Integer))
+      return Error(IntTok.getLoc(), "expected stack index");
+    switch (IntTok.getIntVal()) {
+    case 0: RegNo = X86::ST0; break;
+    case 1: RegNo = X86::ST1; break;
+    case 2: RegNo = X86::ST2; break;
+    case 3: RegNo = X86::ST3; break;
+    case 4: RegNo = X86::ST4; break;
+    case 5: RegNo = X86::ST5; break;
+    case 6: RegNo = X86::ST6; break;
+    case 7: RegNo = X86::ST7; break;
+    default: return Error(IntTok.getLoc(), "invalid stack index");
+    }
+    
+    if (getParser().Lex().isNot(AsmToken::RParen))
+      return Error(Parser.getTok().getLoc(), "expected ')'");
+    
+    EndLoc = Tok.getLoc();
+    Parser.Lex(); // Eat ')'
+    return false;
+  }
+  
+  if (RegNo == 0)
+    return Error(Tok.getLoc(), "invalid register name");
+
+  EndLoc = Tok.getLoc();
+  Parser.Lex(); // Eat identifier token.
+  return false;
+}
+
+X86Operand *X86ATTAsmParser::ParseOperand() {
+  switch (getLexer().getKind()) {
+  default:
+    return ParseMemOperand();
+  case AsmToken::Percent: {
+    // FIXME: if a segment register, this could either be just the seg reg, or
+    // the start of a memory operand.
+    unsigned RegNo;
+    SMLoc Start, End;
+    if (ParseRegister(RegNo, Start, End)) return 0;
+    return X86Operand::CreateReg(RegNo, Start, End);
+  }
+  case AsmToken::Dollar: {
+    // $42 -> immediate.
+    SMLoc Start = Parser.getTok().getLoc(), End;
+    Parser.Lex();
+    const MCExpr *Val;
+    if (getParser().ParseExpression(Val, End))
+      return 0;
+    return X86Operand::CreateImm(Val, Start, End);
+  }
+  }
+}
+
/// ParseMemOperand: segment: disp(basereg, indexreg, scale)
///
/// Parses an AT&T memory operand and returns it as an X86Operand, or null
/// (after emitting a diagnostic) on error.  A bare expression with no
/// parenthesized register part is returned as an immediate-style memory
/// operand via the two-argument CreateMem form.
X86Operand *X86ATTAsmParser::ParseMemOperand() {
  SMLoc MemStart = Parser.getTok().getLoc();
  
  // FIXME: If SegReg ':'  (e.g. %gs:), eat and remember.
  unsigned SegReg = 0;
  
  // We have to disambiguate a parenthesized expression "(4+5)" from the start
  // of a memory operand with a missing displacement "(%ebx)" or "(,%eax)".  The
  // only way to do this without lookahead is to eat the '(' and see what is
  // after it.
  const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
  if (getLexer().isNot(AsmToken::LParen)) {
    // Leading displacement expression, e.g. the "4" in "4(%eax)".
    SMLoc ExprEnd;
    if (getParser().ParseExpression(Disp, ExprEnd)) return 0;
    
    // After parsing the base expression we could either have a parenthesized
    // memory address or not.  If not, return now.  If so, eat the (.
    if (getLexer().isNot(AsmToken::LParen)) {
      // Unless we have a segment register, treat this as an immediate.
      if (SegReg == 0)
        return X86Operand::CreateMem(Disp, MemStart, ExprEnd);
      return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
    }
    
    // Eat the '('.
    Parser.Lex();
  } else {
    // Okay, we have a '('.  We don't know if this is an expression or not,
    // so we have to eat the ( to see beyond it.
    SMLoc LParenLoc = Parser.getTok().getLoc();
    Parser.Lex(); // Eat the '('.
    
    if (getLexer().is(AsmToken::Percent) || getLexer().is(AsmToken::Comma)) {
      // Nothing to do here, fall into the code below with the '(' part of the
      // memory operand consumed.
    } else {
      SMLoc ExprEnd;
      
      // It must be an parenthesized expression, parse it now.
      if (getParser().ParseParenExpression(Disp, ExprEnd))
        return 0;
      
      // After parsing the base expression we could either have a parenthesized
      // memory address or not.  If not, return now.  If so, eat the (.
      if (getLexer().isNot(AsmToken::LParen)) {
        // Unless we have a segment register, treat this as an immediate.
        // NOTE(review): this branch uses LParenLoc as the operand start while
        // the segment-register form below uses MemStart -- presumably these
        // should agree; confirm which start location is intended.
        if (SegReg == 0)
          return X86Operand::CreateMem(Disp, LParenLoc, ExprEnd);
        return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, MemStart, ExprEnd);
      }
      
      // Eat the '('.
      Parser.Lex();
    }
  }
  
  // If we reached here, then we just ate the ( of the memory operand.  Process
  // the rest of the memory operand.
  unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
  
  if (getLexer().is(AsmToken::Percent)) {
    SMLoc L;
    if (ParseRegister(BaseReg, L, L)) return 0;
  }
  
  if (getLexer().is(AsmToken::Comma)) {
    Parser.Lex(); // Eat the comma.

    // Following the comma we should have either an index register, or a scale
    // value. We don't support the later form, but we want to parse it
    // correctly.
    //
    // Note that even though it would be completely consistent to support
    // syntax like "1(%eax,,1)", the assembler doesn't.
    if (getLexer().is(AsmToken::Percent)) {
      SMLoc L;
      if (ParseRegister(IndexReg, L, L)) return 0;
    
      if (getLexer().isNot(AsmToken::RParen)) {
        // Parse the scale amount:
        //  ::= ',' [scale-expression]
        if (getLexer().isNot(AsmToken::Comma)) {
          Error(Parser.getTok().getLoc(),
                "expected comma in scale expression");
          return 0;
        }
        Parser.Lex(); // Eat the comma.

        if (getLexer().isNot(AsmToken::RParen)) {
          SMLoc Loc = Parser.getTok().getLoc();

          int64_t ScaleVal;
          if (getParser().ParseAbsoluteExpression(ScaleVal))
            return 0;
          
          // Validate the scale amount.
          if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
            Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
            return 0;
          }
          Scale = (unsigned)ScaleVal;
        }
      }
    } else if (getLexer().isNot(AsmToken::RParen)) {
      // Otherwise we have the unsupported form of a scale amount without an
      // index.  Parse (and discard) the expression so the diagnostic points
      // at the offending value.
      SMLoc Loc = Parser.getTok().getLoc();

      int64_t Value;
      if (getParser().ParseAbsoluteExpression(Value))
        return 0;
      
      Error(Loc, "cannot have scale factor without index register");
      return 0;
    }
  }
  
  // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
  if (getLexer().isNot(AsmToken::RParen)) {
    Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
    return 0;
  }
  SMLoc MemEnd = Parser.getTok().getLoc();
  Parser.Lex(); // Eat the ')'.
  
  return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
                               MemStart, MemEnd);
}
+
/// ParseInstruction - Parse one instruction: the mnemonic (pushed as a token
/// operand, possibly rewritten to a canonical alias) followed by an optional
/// '*' modifier and a comma-separated operand list.  Returns true on error.
bool X86ATTAsmParser::
ParseInstruction(const StringRef &Name, SMLoc NameLoc,
                 SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
  // FIXME: Hack to recognize "sal..." for now. We need a way to represent
  // alternative syntaxes in the .td file, without requiring instruction
  // duplication.
  if (Name.startswith("sal")) {
    // NOTE(review): Tmp is a function-local std::string; if CreateToken keeps
    // only a StringRef into it, the token dangles once this function returns.
    // Confirm that CreateToken copies the string, or that Operands are fully
    // consumed before Tmp dies.
    std::string Tmp = "shl" + Name.substr(3).str();
    Operands.push_back(X86Operand::CreateToken(Tmp, NameLoc));
  } else {
    // FIXME: This is a hack.  We eventually want to add a general pattern
    // mechanism to be used in the table gen file for these assembly names that
    // use the same opcodes.  Also we should only allow the "alternate names"
    // for rep and repne with the instructions they can only appear with.
    StringRef PatchedName = Name;
    if (Name == "repe" || Name == "repz")
      PatchedName = "rep";
    else if (Name == "repnz")
      PatchedName = "repne";
    Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
  }

  if (getLexer().isNot(AsmToken::EndOfStatement)) {

    // Parse '*' modifier (indirect call/jump).
    if (getLexer().is(AsmToken::Star)) {
      SMLoc Loc = Parser.getTok().getLoc();
      Operands.push_back(X86Operand::CreateToken("*", Loc));
      Parser.Lex(); // Eat the star.
    }

    // Read the first operand.
    if (X86Operand *Op = ParseOperand())
      Operands.push_back(Op);
    else
      return true;
    
    // Remaining operands are comma-separated.
    while (getLexer().is(AsmToken::Comma)) {
      Parser.Lex();  // Eat the comma.

      // Parse and remember the operand.
      if (X86Operand *Op = ParseOperand())
        Operands.push_back(Op);
      else
        return true;
    }
  }

  return false;
}
+
+bool X86ATTAsmParser::ParseDirective(AsmToken DirectiveID) {
+  StringRef IDVal = DirectiveID.getIdentifier();
+  if (IDVal == ".word")
+    return ParseDirectiveWord(2, DirectiveID.getLoc());
+  return true;
+}
+
+/// ParseDirectiveWord
+///  ::= .word [ expression (, expression)* ]
+bool X86ATTAsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    for (;;) {
+      const MCExpr *Value;
+      if (getParser().ParseExpression(Value))
+        return true;
+
+      getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/);
+
+      if (getLexer().is(AsmToken::EndOfStatement))
+        break;
+      
+      // FIXME: Improve diagnostic.
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
+      Parser.Lex();
+    }
+  }
+
+  Parser.Lex();
+  return false;
+}
+
+extern "C" void LLVMInitializeX86AsmLexer();
+
+// Force static initialization.
+extern "C" void LLVMInitializeX86AsmParser() {
+  RegisterAsmParser<X86ATTAsmParser> X(TheX86_32Target);
+  RegisterAsmParser<X86ATTAsmParser> Y(TheX86_64Target);
+  LLVMInitializeX86AsmLexer();
+}
+
+#include "X86GenAsmMatcher.inc"
diff --git a/lib/Target/X86/AsmPrinter/CMakeLists.txt b/lib/Target/X86/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..b70a587
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,9 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86AsmPrinter
+  X86ATTInstPrinter.cpp
+  X86AsmPrinter.cpp
+  X86IntelInstPrinter.cpp
+  X86MCInstLower.cpp
+  )
+add_dependencies(LLVMX86AsmPrinter X86CodeGenTable_gen)
diff --git a/lib/Target/X86/AsmPrinter/Makefile b/lib/Target/X86/AsmPrinter/Makefile
new file mode 100644
index 0000000..2368761
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Target/X86/AsmPrinter/Makefile ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86AsmPrinter
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
new file mode 100644
index 0000000..38ccbf9
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.cpp
@@ -0,0 +1,116 @@
+//===-- X86ATTInstPrinter.cpp - AT&T assembly instruction printing --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as AT&T-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86ATTInstPrinter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include "X86GenInstrNames.inc"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#include "X86GenAsmWriter.inc"
+#undef MachineInstr
+
+void X86ATTInstPrinter::printInst(const MCInst *MI) { printInstruction(MI); }
+
+void X86ATTInstPrinter::printSSECC(const MCInst *MI, unsigned Op) {
+  switch (MI->getOperand(Op).getImm()) {
+  default: llvm_unreachable("Invalid ssecc argument!");
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value (e.g. for jumps and calls).  These
+/// print slightly differently than normal immediates.  For example, a $ is not
+/// emitted.
+void X86ATTInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    // Print this as a signed 32-bit value.
+    O << (int)Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    O << *Op.getExpr();
+  }
+}
+
+void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo) {
+  
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    O << '%' << getRegisterName(Op.getReg());
+  } else if (Op.isImm()) {
+    O << '$' << Op.getImm();
+    
+    if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256))
+      *CommentStream << format("imm = 0x%X\n", Op.getImm());
+    
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << '$' << *Op.getExpr();
+  }
+}
+
/// printLeaMemReference - Print the disp(base,index,scale) part of a memory
/// reference.  Op is the first of the memory operands, laid out as:
/// Op+0 = base register, Op+1 = scale, Op+2 = index register, Op+3 = disp.
void X86ATTInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op) {
  const MCOperand &BaseReg  = MI->getOperand(Op);
  const MCOperand &IndexReg = MI->getOperand(Op+2);
  const MCOperand &DispSpec = MI->getOperand(Op+3);
  
  if (DispSpec.isImm()) {
    // Omit a zero displacement unless there is nothing else to print.
    int64_t DispVal = DispSpec.getImm();
    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg()))
      O << DispVal;
  } else {
    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
    O << *DispSpec.getExpr();
  }
  
  // Print the "(base,index,scale)" part only if some register is present.
  if (IndexReg.getReg() || BaseReg.getReg()) {
    O << '(';
    if (BaseReg.getReg())
      printOperand(MI, Op);
    
    if (IndexReg.getReg()) {
      O << ',';
      printOperand(MI, Op+2);
      // A scale of 1 is the default and is omitted.
      unsigned ScaleVal = MI->getOperand(Op+1).getImm();
      if (ScaleVal != 1)
        O << ',' << ScaleVal;
    }
    O << ')';
  }
}
+
+void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op) {
+  // If this has a segment register, print it.
+  if (MI->getOperand(Op+4).getReg()) {
+    printOperand(MI, Op+4);
+    O << ':';
+  }
+  printLeaMemReference(MI, Op);
+}
diff --git a/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
new file mode 100644
index 0000000..3180618
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86ATTInstPrinter.h
@@ -0,0 +1,85 @@
+//===-- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to AT&T style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ATT_INST_PRINTER_H
+#define X86_ATT_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+
+namespace llvm {
+  class MCOperand;
+  
// X86ATTInstPrinter - Prints an X86 MCInst in AT&T assembly syntax to the
// raw_ostream supplied at construction time.
class X86ATTInstPrinter : public MCInstPrinter {
public:
  X86ATTInstPrinter(raw_ostream &O, const MCAsmInfo &MAI)
    : MCInstPrinter(O, MAI) {}

  
  // Print MI in AT&T syntax to the stream given at construction.
  virtual void printInst(const MCInst *MI);
  
  // Autogenerated by tblgen.
  void printInstruction(const MCInst *MI);
  static const char *getRegisterName(unsigned RegNo);


  void printOperand(const MCInst *MI, unsigned OpNo);
  void printMemReference(const MCInst *MI, unsigned Op);
  void printLeaMemReference(const MCInst *MI, unsigned Op);
  void printSSECC(const MCInst *MI, unsigned Op);
  void print_pcrel_imm(const MCInst *MI, unsigned OpNo);
  
  // The printXXXmem methods below are named for the memory operand classes
  // used by the tblgen'd printer.  All sizes print identically in AT&T
  // syntax, so each simply forwards to printMemReference; the lea forms
  // forward to printLeaMemReference, which skips segment-override printing.
  void printopaquemem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  
  void printi8mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printi16mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printi32mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printi64mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printi128mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printf32mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printf64mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printf80mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printf128mem(const MCInst *MI, unsigned OpNo) {
    printMemReference(MI, OpNo);
  }
  void printlea32mem(const MCInst *MI, unsigned OpNo) {
    printLeaMemReference(MI, OpNo);
  }
  void printlea64mem(const MCInst *MI, unsigned OpNo) {
    printLeaMemReference(MI, OpNo);
  }
  void printlea64_32mem(const MCInst *MI, unsigned OpNo) {
    printLeaMemReference(MI, OpNo);
  }
};
+  
+}
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
new file mode 100644
index 0000000..304306d
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.cpp
@@ -0,0 +1,635 @@
+//===-- X86AsmPrinter.cpp - Convert X86 LLVM code to AT&T assembly --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to X86 machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86AsmPrinter.h"
+#include "X86ATTInstPrinter.h"
+#include "X86IntelInstPrinter.h"
+#include "X86MCInstLower.h"
+#include "X86.h"
+#include "X86COFF.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Type.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// Primitive Helper Functions.
+//===----------------------------------------------------------------------===//
+
+void X86AsmPrinter::PrintPICBaseSymbol() const {
+  const TargetLowering *TLI = TM.getTargetLowering();
+  O << *static_cast<const X86TargetLowering*>(TLI)->getPICBaseSymbol(MF,
+                                                                    OutContext);
+}
+
/// runOnMachineFunction - Emit the function body.
///
/// Records COFF calling-convention info, emits Cygwin/MinGW symbol-decoration
/// directives where needed, then delegates to the common AsmPrinter code for
/// the header and body.  Always returns false (the function is not modified).
bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
  SetupMachineFunction(MF);
  
  // COFF and Cygwin specific mangling stuff.  This should be moved out to the
  // mangler or handled some other way?
  if (Subtarget->isTargetCOFF()) {
    X86COFFMachineModuleInfo &COFFMMI = 
      MMI->getObjFileInfo<X86COFFMachineModuleInfo>();

    // Populate function information map.  Don't want to populate
    // non-stdcall or non-fastcall functions' information right now.
    const Function *F = MF.getFunction();
    CallingConv::ID CC = F->getCallingConv();
    if (CC == CallingConv::X86_StdCall || CC == CallingConv::X86_FastCall)
      COFFMMI.AddFunctionInfo(F, *MF.getInfo<X86MachineFunctionInfo>());
  }
  if (Subtarget->isTargetCygMing()) {
    // Decorate the function symbol (e.g. stdcall '@N' suffixes) and emit the
    // COFF .def/.scl/.type/.endef directives describing the symbol.
    const Function *F = MF.getFunction();
    X86COFFMachineModuleInfo &COFFMMI = 
      MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
    COFFMMI.DecorateCygMingName(CurrentFnSym, OutContext,F,*TM.getTargetData());
    
    O << "\t.def\t " << *CurrentFnSym;
    O << ";\t.scl\t" <<
    (F->hasInternalLinkage() ? COFF::C_STAT : COFF::C_EXT)
    << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
    << ";\t.endef\n";
  }
  
  // Have common code print out the function header with linkage info etc.
  EmitFunctionHeader();
  
  // Emit the rest of the function body.
  EmitFunctionBody();

  // We didn't modify anything.
  return false;
}
+
+/// printSymbolOperand - Print a raw symbol reference operand.  This handles
+/// jump tables, constant pools, global address and external symbols, all of
+/// which print to a label with various suffixes for relocation types etc.
+void X86AsmPrinter::printSymbolOperand(const MachineOperand &MO) {
+  switch (MO.getType()) {
+  default: llvm_unreachable("unknown symbol type!");
+  case MachineOperand::MO_JumpTableIndex:
+    O << *GetJTISymbol(MO.getIndex());
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << *GetCPISymbol(MO.getIndex());
+    printOffset(MO.getOffset());
+    break;
+  case MachineOperand::MO_GlobalAddress: {
+    const GlobalValue *GV = MO.getGlobal();
+    
+    MCSymbol *GVSym;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB)
+      GVSym = GetSymbolWithGlobalValueBase(GV, "$stub");
+    else if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+             MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+             MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+      GVSym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+    else
+      GVSym = GetGlobalValueSymbol(GV);
+
+    if (Subtarget->isTargetCygMing()) {
+      X86COFFMachineModuleInfo &COFFMMI =
+        MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+      COFFMMI.DecorateCygMingName(GVSym, OutContext, GV, *TM.getTargetData());
+    }
+    
+    // Handle dllimport linkage.
+    if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
+      GVSym = OutContext.GetOrCreateSymbol(Twine("__imp_") + GVSym->getName());
+    
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE) {
+      MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      
+      MCSymbol *&StubSym = 
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
+      if (StubSym == 0)
+        StubSym = GetGlobalValueSymbol(GV);
+      
+    } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
+      MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
+      MCSymbol *&StubSym =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(Sym);
+      if (StubSym == 0)
+        StubSym = GetGlobalValueSymbol(GV);
+    } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+      MCSymbol *Sym = GetSymbolWithGlobalValueBase(GV, "$stub");
+      MCSymbol *&StubSym =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+      if (StubSym == 0)
+        StubSym = GetGlobalValueSymbol(GV);
+    }
+    
+    // If the name begins with a dollar-sign, enclose it in parens.  We do this
+    // to avoid having it look like an integer immediate to the assembler.
+    if (GVSym->getName()[0] != '$')
+      O << *GVSym;
+    else
+      O << '(' << *GVSym << ')';
+    printOffset(MO.getOffset());
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    const MCSymbol *SymToPrint;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
+      SmallString<128> TempNameStr;
+      TempNameStr += StringRef(MO.getSymbolName());
+      TempNameStr += StringRef("$stub");
+      
+      MCSymbol *Sym = GetExternalSymbolSymbol(TempNameStr.str());
+      MCSymbol *&StubSym =
+        MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
+      if (StubSym == 0) {
+        TempNameStr.erase(TempNameStr.end()-5, TempNameStr.end());
+        StubSym = OutContext.GetOrCreateSymbol(TempNameStr.str());
+      }
+      SymToPrint = StubSym;
+    } else {
+      SymToPrint = GetExternalSymbolSymbol(MO.getSymbolName());
+    }
+    
+    // If the name begins with a dollar-sign, enclose it in parens.  We do this
+    // to avoid having it look like an integer immediate to the assembler.
+    if (SymToPrint->getName()[0] != '$') 
+      O << *SymToPrint;
+    else
+      O << '(' << *SymToPrint << '(';
+    break;
+  }
+  }
+  
+  switch (MO.getTargetFlags()) {
+  default:
+    llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+    break;
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+  case X86II::MO_DARWIN_STUB:
+    // These affect the name of the symbol, not any suffix.
+    break;
+  case X86II::MO_GOT_ABSOLUTE_ADDRESS:
+    O << " + [.-";
+    PrintPICBaseSymbol();
+    O << ']';
+    break;      
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+    O << '-';
+    PrintPICBaseSymbol();
+    break;
+  case X86II::MO_TLSGD:     O << "@TLSGD";     break;
+  case X86II::MO_GOTTPOFF:  O << "@GOTTPOFF";  break;
+  case X86II::MO_INDNTPOFF: O << "@INDNTPOFF"; break;
+  case X86II::MO_TPOFF:     O << "@TPOFF";     break;
+  case X86II::MO_NTPOFF:    O << "@NTPOFF";    break;
+  case X86II::MO_GOTPCREL:  O << "@GOTPCREL";  break;
+  case X86II::MO_GOT:       O << "@GOT";       break;
+  case X86II::MO_GOTOFF:    O << "@GOTOFF";    break;
+  case X86II::MO_PLT:       O << "@PLT";       break;
+  }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.  These print slightly differently, for
+/// example, a $ is not emitted.
+void X86AsmPrinter::print_pcrel_imm(const MachineInstr *MI, unsigned OpNo) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+  switch (MO.getType()) {
+  default: llvm_unreachable("Unknown pcrel immediate operand");
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    return;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    return;
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_ExternalSymbol:
+    printSymbolOperand(MO);
+    return;
+  }
+}
+
+
/// printOperand - Print a MachineOperand in AT&T syntax: '%reg' for
/// registers, '$imm' for immediates, '$symbol' for symbolic operands.  A
/// Modifier of the form "subregNN" (NN in {16,32,64}; anything else selects
/// 8 bits) prints the NN-bit sub/super register instead of the operand's own
/// register.
void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
                                 const char *Modifier) {
  const MachineOperand &MO = MI->getOperand(OpNo);
  switch (MO.getType()) {
  default: llvm_unreachable("unknown operand type!");
  case MachineOperand::MO_Register: {
    O << '%';
    unsigned Reg = MO.getReg();
    if (Modifier && strncmp(Modifier, "subreg", strlen("subreg")) == 0) {
      // Modifier+6 points at the size suffix following "subreg".
      EVT VT = (strcmp(Modifier+6,"64") == 0) ?
        MVT::i64 : ((strcmp(Modifier+6, "32") == 0) ? MVT::i32 :
                    ((strcmp(Modifier+6,"16") == 0) ? MVT::i16 : MVT::i8));
      Reg = getX86SubSuperRegister(Reg, VT);
    }
    O << X86ATTInstPrinter::getRegisterName(Reg);
    return;
  }

  case MachineOperand::MO_Immediate:
    O << '$' << MO.getImm();
    return;

  case MachineOperand::MO_JumpTableIndex:
  case MachineOperand::MO_ConstantPoolIndex:
  case MachineOperand::MO_GlobalAddress: 
  case MachineOperand::MO_ExternalSymbol: {
    // Symbolic operands get the '$' sigil plus any relocation suffix.
    O << '$';
    printSymbolOperand(MO);
    break;
  }
  }
}
+
+void X86AsmPrinter::printSSECC(const MachineInstr *MI, unsigned Op) {
+  unsigned char value = MI->getOperand(Op).getImm();
+  assert(value <= 7 && "Invalid ssecc argument!");
+  switch (value) {
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  }
+}
+
/// printLeaMemReference - Print the disp(base,index,scale) part of a memory
/// reference (no segment override).  Op is the first of the memory operands:
/// Op+0 = base register, Op+1 = scale, Op+2 = index register, Op+3 = disp.
/// A Modifier of "no-rip" suppresses printing of a %rip base register.
void X86AsmPrinter::printLeaMemReference(const MachineInstr *MI, unsigned Op,
                                         const char *Modifier) {
  const MachineOperand &BaseReg  = MI->getOperand(Op);
  const MachineOperand &IndexReg = MI->getOperand(Op+2);
  const MachineOperand &DispSpec = MI->getOperand(Op+3);

  // If we really don't want to print out (rip), don't.
  bool HasBaseReg = BaseReg.getReg() != 0;
  if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
      BaseReg.getReg() == X86::RIP)
    HasBaseReg = false;
  
  // HasParenPart - True if we will print out the () part of the mem ref.
  bool HasParenPart = IndexReg.getReg() || HasBaseReg;
  
  if (DispSpec.isImm()) {
    // Omit a zero displacement unless there is nothing else to print.
    int DispVal = DispSpec.getImm();
    if (DispVal || !HasParenPart)
      O << DispVal;
  } else {
    assert(DispSpec.isGlobal() || DispSpec.isCPI() ||
           DispSpec.isJTI() || DispSpec.isSymbol());
    printSymbolOperand(MI->getOperand(Op+3));
  }

  if (HasParenPart) {
    assert(IndexReg.getReg() != X86::ESP &&
           "X86 doesn't allow scaling by ESP");

    O << '(';
    if (HasBaseReg)
      printOperand(MI, Op, Modifier);

    if (IndexReg.getReg()) {
      O << ',';
      printOperand(MI, Op+2, Modifier);
      // A scale of 1 is the default and is omitted.
      unsigned ScaleVal = MI->getOperand(Op+1).getImm();
      if (ScaleVal != 1)
        O << ',' << ScaleVal;
    }
    O << ')';
  }
}
+
+void X86AsmPrinter::printMemReference(const MachineInstr *MI, unsigned Op,
+                                      const char *Modifier) {
+  assert(isMem(MI, Op) && "Invalid memory reference!");
+  const MachineOperand &Segment = MI->getOperand(Op+4);
+  if (Segment.getReg()) {
+    printOperand(MI, Op+4, Modifier);
+    O << ':';
+  }
+  printLeaMemReference(MI, Op, Modifier);
+}
+
/// printPICLabel - Print the PIC base label.  The label is emitted twice:
/// once as the operand text on the current line, then again on a new line
/// followed by ':' to define the label itself -- presumably the tblgen'd asm
/// string places a "call" before this operand so the pair forms the usual
/// "call L; L:" PIC-base sequence; confirm against the .td asm strings.
void X86AsmPrinter::printPICLabel(const MachineInstr *MI, unsigned Op) {
  PrintPICBaseSymbol();
  O << '\n';
  PrintPICBaseSymbol();
  O << ':';
}
+
+bool X86AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default: return true;  // Unknown mode.
+  case 'b': // Print QImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8);
+    break;
+  case 'h': // Print QImode high register
+    Reg = getX86SubSuperRegister(Reg, MVT::i8, true);
+    break;
+  case 'w': // Print HImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i16);
+    break;
+  case 'k': // Print SImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i32);
+    break;
+  case 'q': // Print DImode register
+    Reg = getX86SubSuperRegister(Reg, MVT::i64);
+    break;
+  }
+
+  O << '%' << X86ATTInstPrinter::getRegisterName(Reg);
+  return false;
+}
+
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
/// Handles the single-letter operand modifiers 'a', 'c', 'A', 'b', 'h', 'w',
/// 'k', 'q', 'P', and 'n'; unhandled modifiers return true so generic code
/// can report an error.  With no modifier, falls through to printOperand.
bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                                    unsigned AsmVariant,
                                    const char *ExtraCode) {
  // Does this asm operand have a single letter operand modifier?
  if (ExtraCode && ExtraCode[0]) {
    if (ExtraCode[1] != 0) return true; // Unknown modifier.

    const MachineOperand &MO = MI->getOperand(OpNo);
    
    switch (ExtraCode[0]) {
    default: return true;  // Unknown modifier.
    case 'a': // This is an address.  Currently only 'i' and 'r' are expected.
      if (MO.isImm()) {
        O << MO.getImm();
        return false;
      } 
      if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol()) {
        printSymbolOperand(MO);
        return false;
      }
      if (MO.isReg()) {
        // A register address prints as an indirect memory reference "(%reg)".
        O << '(';
        printOperand(MI, OpNo);
        O << ')';
        return false;
      }
      return true;

    case 'c': // Don't print "$" before a global var name or constant.
      if (MO.isImm())
        O << MO.getImm();
      else if (MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isSymbol())
        printSymbolOperand(MO);
      else
        printOperand(MI, OpNo);
      return false;

    case 'A': // Print '*' before a register (it must be a register)
      if (MO.isReg()) {
        O << '*';
        printOperand(MI, OpNo);
        return false;
      }
      return true;

    case 'b': // Print QImode register
    case 'h': // Print QImode high register
    case 'w': // Print HImode register
    case 'k': // Print SImode register
    case 'q': // Print DImode register
      if (MO.isReg())
        return printAsmMRegister(MO, ExtraCode[0]);
      printOperand(MI, OpNo);
      return false;

    case 'P': // This is the operand of a call, treat specially.
      print_pcrel_imm(MI, OpNo);
      return false;

    case 'n':  // Negate the immediate or print a '-' before the operand.
      // Note: this is a temporary solution. It should be handled target
      // independently as part of the 'MC' work.
      if (MO.isImm()) {
        O << -MO.getImm();
        return false;
      }
      O << '-';
      // Deliberately FALLS THROUGH to the default printOperand call below.
    }
  }

  printOperand(MI, OpNo);
  return false;
}
+
+/// PrintAsmMemoryOperand - Print a memory operand for an inline asm
+/// expression.  Returns false on success, true for an unknown modifier.
+bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
+                                          unsigned OpNo, unsigned AsmVariant,
+                                          const char *ExtraCode) {
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default: return true;  // Unknown modifier.
+    case 'b': // Print QImode register
+    case 'h': // Print QImode high register
+    case 'w': // Print HImode register
+    case 'k': // Print SImode register
+    case 'q': // Print DImode register
+      // These only apply to registers, ignore on mem.
+      break;
+    case 'P': // Don't print @PLT, but do print as memory.
+      printMemReference(MI, OpNo, "no-rip");
+      return false;
+    }
+  }
+  printMemReference(MI, OpNo);
+  return false;
+}
+
+
+/// EmitEndOfAsmFile - Emit target-specific data at the end of the module:
+/// Darwin indirect-symbol stub sections and the subsections-via-symbols
+/// marker, plus COFF .def records and CygMing dllexport .drectve entries.
+void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetDarwin()) {
+    // All darwin targets use mach-o.
+    TargetLoweringObjectFileMachO &TLOFMacho = 
+      static_cast<TargetLoweringObjectFileMachO &>(getObjFileLowering());
+    
+    MachineModuleInfoMachO &MMIMacho =
+      MMI->getObjFileInfo<MachineModuleInfoMachO>();
+    
+    // Output stubs for dynamically-linked functions.
+    MachineModuleInfoMachO::SymbolListTy Stubs;
+
+    Stubs = MMIMacho.GetFnStubList();
+    if (!Stubs.empty()) {
+      // Function stubs live in a self-modifying-code __jump_table section
+      // with 5-byte entries (reserved2 == 5 below).
+      const MCSection *TheSection = 
+        TLOFMacho.getMachOSection("__IMPORT", "__jump_table",
+                                  MCSectionMachO::S_SYMBOL_STUBS |
+                                  MCSectionMachO::S_ATTR_SELF_MODIFYING_CODE |
+                                  MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS,
+                                  5, SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$stub:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        //   .indirect_symbol _foo
+        OutStreamer.EmitSymbolAttribute(Stubs[i].second, MCSA_IndirectSymbol);
+        // hlt; hlt; hlt; hlt; hlt     hlt = 0xf4 = -12.
+        const char HltInsts[] = { -12, -12, -12, -12, -12 };
+        OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/);
+      }
+      
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Output stubs for external and common global variables.
+    Stubs = MMIMacho.GetGVStubList();
+    if (!Stubs.empty()) {
+      const MCSection *TheSection = 
+        TLOFMacho.getMachOSection("__IMPORT", "__pointers",
+                                  MCSectionMachO::S_NON_LAZY_SYMBOL_POINTERS,
+                                  SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$non_lazy_ptr:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        // .indirect_symbol _foo
+        OutStreamer.EmitSymbolAttribute(Stubs[i].second, MCSA_IndirectSymbol);
+        // .long 0
+        OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/);
+      }
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Hidden non-lazy pointers are emitted directly into the data section,
+    // initialized with the symbol's address rather than left for dyld.
+    Stubs = MMIMacho.GetHiddenGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+      EmitAlignment(2);
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        // L_foo$non_lazy_ptr:
+        OutStreamer.EmitLabel(Stubs[i].first);
+        // .long _foo
+        OutStreamer.EmitValue(MCSymbolRefExpr::Create(Stubs[i].second,
+                                                      OutContext),
+                              4/*size*/, 0/*addrspace*/);
+      }
+      Stubs.clear();
+      OutStreamer.AddBlankLine();
+    }
+
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+  }
+
+  if (Subtarget->isTargetCOFF()) {
+    X86COFFMachineModuleInfo &COFFMMI =
+      MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+
+    // Emit type information for external functions
+    for (X86COFFMachineModuleInfo::stub_iterator I = COFFMMI.stub_begin(),
+           E = COFFMMI.stub_end(); I != E; ++I) {
+      O << "\t.def\t " << I->getKeyData()
+        << ";\t.scl\t" << COFF::C_EXT
+        << ";\t.type\t" << (COFF::DT_FCN << COFF::N_BTSHFT)
+        << ";\t.endef\n";
+    }
+
+    if (Subtarget->isTargetCygMing()) {
+      // Necessary for dllexport support
+      std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
+
+      TargetLoweringObjectFileCOFF &TLOFCOFF =
+        static_cast<TargetLoweringObjectFileCOFF&>(getObjFileLowering());
+
+      // Collect dllexported functions (with CygMing name decoration) ...
+      for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
+        if (I->hasDLLExportLinkage()) {
+          MCSymbol *Sym = GetGlobalValueSymbol(I);
+          COFFMMI.DecorateCygMingName(Sym, OutContext, I, *TM.getTargetData());
+          DLLExportedFns.push_back(Sym);
+        }
+
+      // ... and dllexported global variables.
+      for (Module::const_global_iterator I = M.global_begin(),
+             E = M.global_end(); I != E; ++I)
+        if (I->hasDLLExportLinkage())
+          DLLExportedGlobals.push_back(GetGlobalValueSymbol(I));
+
+      // Output linker support code for dllexported globals on windows.
+      if (!DLLExportedGlobals.empty() || !DLLExportedFns.empty()) {
+        OutStreamer.SwitchSection(TLOFCOFF.getCOFFSection(".section .drectve",
+                                                          true,
+                                                   SectionKind::getMetadata()));
+        for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i)
+          O << "\t.ascii \" -export:" << *DLLExportedGlobals[i] << ",data\"\n";
+
+        for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i)
+          O << "\t.ascii \" -export:" << *DLLExportedFns[i] << "\"\n";
+      }
+    }
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Target Registry Stuff
+//===----------------------------------------------------------------------===//
+
+/// createX86MCInstPrinter - Factory registered with the target registry.
+/// Syntax variant 0 selects the AT&T printer, variant 1 the Intel printer;
+/// any other variant is unsupported and yields null.
+static MCInstPrinter *createX86MCInstPrinter(const Target &T,
+                                             unsigned SyntaxVariant,
+                                             const MCAsmInfo &MAI,
+                                             raw_ostream &O) {
+  if (SyntaxVariant == 0)
+    return new X86ATTInstPrinter(O, MAI);
+  if (SyntaxVariant == 1)
+    return new X86IntelInstPrinter(O, MAI);
+  return 0;
+}
+
+// Force static initialization.  Registers the asm printer for both x86
+// targets and the MCInst printer factory for both syntax variants.
+extern "C" void LLVMInitializeX86AsmPrinter() { 
+  RegisterAsmPrinter<X86AsmPrinter> X(TheX86_32Target);
+  RegisterAsmPrinter<X86AsmPrinter> Y(TheX86_64Target);
+  
+  TargetRegistry::RegisterMCInstPrinter(TheX86_32Target,createX86MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheX86_64Target,createX86MCInstPrinter);
+}
diff --git a/lib/Target/X86/AsmPrinter/X86AsmPrinter.h b/lib/Target/X86/AsmPrinter/X86AsmPrinter.h
new file mode 100644
index 0000000..1d32a5f
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86AsmPrinter.h
@@ -0,0 +1,135 @@
+//===-- X86AsmPrinter.h - Convert X86 LLVM code to assembly -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AT&T assembly code printer class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ASMPRINTER_H
+#define X86ASMPRINTER_H
+
+#include "../X86.h"
+#include "../X86MachineFunctionInfo.h"
+#include "../X86TargetMachine.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class MachineJumpTableInfo;
+class MCContext;
+class MCInst;
+class MCStreamer;
+class MCSymbol;
+
+/// X86AsmPrinter - MachineFunction-level pass that converts X86 machine
+/// code into assembly, using the MC streamer/context infrastructure.
+class VISIBILITY_HIDDEN X86AsmPrinter : public AsmPrinter {
+  const X86Subtarget *Subtarget;
+ public:
+  explicit X86AsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                         MCContext &Ctx, MCStreamer &Streamer,
+                         const MCAsmInfo *T)
+    : AsmPrinter(O, TM, Ctx, Streamer, T) {
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+  }
+
+  virtual const char *getPassName() const {
+    return "X86 AT&T-Style Assembly Printer";
+  }
+  
+  const X86Subtarget &getSubtarget() const { return *Subtarget; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+    AU.addRequired<MachineModuleInfo>();
+    AU.addRequired<DwarfWriter>();
+    AsmPrinter::getAnalysisUsage(AU);
+  }
+
+  
+  virtual void EmitEndOfAsmFile(Module &M);
+  
+  virtual void EmitInstruction(const MachineInstr *MI);
+  
+  // Prints a symbol operand (global/CPI/JTI/external) with any target-flag
+  // suffix it requires.
+  void printSymbolOperand(const MachineOperand &MO);
+  
+  
+
+  // These methods are used by the tablegen'erated instruction printer.
+  void printOperand(const MachineInstr *MI, unsigned OpNo,
+                    const char *Modifier = 0);
+  void print_pcrel_imm(const MachineInstr *MI, unsigned OpNo);
+
+  void printopaquemem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+
+  // Typed memory-operand printers; in AT&T syntax they all print identically,
+  // so each simply forwards to printMemReference.
+  void printi8mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi16mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi64mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printi128mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf32mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf64mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf80mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  void printf128mem(const MachineInstr *MI, unsigned OpNo) {
+    printMemReference(MI, OpNo);
+  }
+  // LEA operands have no segment register, hence the separate printer.
+  void printlea32mem(const MachineInstr *MI, unsigned OpNo) {
+    printLeaMemReference(MI, OpNo);
+  }
+  void printlea64mem(const MachineInstr *MI, unsigned OpNo) {
+    printLeaMemReference(MI, OpNo);
+  }
+  void printlea64_32mem(const MachineInstr *MI, unsigned OpNo) {
+    printLeaMemReference(MI, OpNo, "subreg64");
+  }
+
+  // Inline-asm support (see X86AsmPrinter.cpp for the modifier letters).
+  bool printAsmMRegister(const MachineOperand &MO, char Mode);
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode);
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode);
+
+  void printMachineInstruction(const MachineInstr *MI);
+  void printSSECC(const MachineInstr *MI, unsigned Op);
+  void printMemReference(const MachineInstr *MI, unsigned Op,
+                         const char *Modifier=NULL);
+  void printLeaMemReference(const MachineInstr *MI, unsigned Op,
+                            const char *Modifier=NULL);
+
+  void printPICLabel(const MachineInstr *MI, unsigned Op);
+
+  void PrintPICBaseSymbol() const;
+  
+  bool runOnMachineFunction(MachineFunction &F);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp
new file mode 100644
index 0000000..4274d0a
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.cpp
@@ -0,0 +1,130 @@
+//===-- X86IntelInstPrinter.cpp - Intel assembly instruction printing -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes code for rendering MCInst instances as Intel-style
+// assembly.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "X86IntelInstPrinter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "X86GenInstrNames.inc"
+using namespace llvm;
+
+// Include the auto-generated portion of the assembly writer.
+#define MachineInstr MCInst
+#include "X86GenAsmWriter1.inc"
+#undef MachineInstr
+
+void X86IntelInstPrinter::printInst(const MCInst *MI) { printInstruction(MI); }
+
+/// printSSECC - Print the SSE comparison mnemonic suffix encoded in the
+/// 3-bit immediate operand at index Op (eq/lt/le/unord/neq/nlt/nle/ord).
+void X86IntelInstPrinter::printSSECC(const MCInst *MI, unsigned Op) {
+  switch (MI->getOperand(Op).getImm()) {
+  default: llvm_unreachable("Invalid ssecc argument!");
+  case 0: O << "eq"; break;
+  case 1: O << "lt"; break;
+  case 2: O << "le"; break;
+  case 3: O << "unord"; break;
+  case 4: O << "neq"; break;
+  case 5: O << "nlt"; break;
+  case 6: O << "nle"; break;
+  case 7: O << "ord"; break;
+  }
+}
+
+/// print_pcrel_imm - This is used to print an immediate value that ends up
+/// being encoded as a pc-relative value.  The operand is either a plain
+/// immediate or a symbolic MCExpr (e.g. a branch/call target).
+void X86IntelInstPrinter::print_pcrel_imm(const MCInst *MI, unsigned OpNo) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isImm())
+    O << Op.getImm();
+  else {
+    assert(Op.isExpr() && "unknown pcrel immediate operand");
+    O << *Op.getExpr();
+  }
+}
+
+/// PrintRegName - Intel syntax conventionally prints register names in
+/// upper case, so upcase the tblgen'erated (lower-case) name on output.
+static void PrintRegName(raw_ostream &O, StringRef RegName) {
+  for (unsigned i = 0, e = RegName.size(); i != e; ++i)
+    O << (char)toupper(RegName[i]);
+}
+
+/// printOperand - Print a register (upper-cased), immediate, or symbolic
+/// expression operand.  Modifiers are not supported by the Intel printer.
+void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                     const char *Modifier) {
+  assert(Modifier == 0 && "Modifiers should not be used");
+  
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    PrintRegName(O, getRegisterName(Op.getReg()));
+  } else if (Op.isImm()) {
+    O << Op.getImm();
+  } else {
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << *Op.getExpr();
+  }
+}
+
+/// printLeaMemReference - Print an Intel-syntax address expression such as
+/// "[BASE + SCALE*INDEX + DISP]" from the 4 operands starting at Op
+/// (base reg, scale imm, index reg, displacement).  Segment-less, as used
+/// by LEA.
+void X86IntelInstPrinter::printLeaMemReference(const MCInst *MI, unsigned Op) {
+  const MCOperand &BaseReg  = MI->getOperand(Op);
+  unsigned ScaleVal         = MI->getOperand(Op+1).getImm();
+  const MCOperand &IndexReg = MI->getOperand(Op+2);
+  const MCOperand &DispSpec = MI->getOperand(Op+3);
+  
+  O << '[';
+  
+  // NeedPlus tracks whether a '+'/'-' separator is required before the next
+  // printed component.
+  bool NeedPlus = false;
+  if (BaseReg.getReg()) {
+    printOperand(MI, Op);
+    NeedPlus = true;
+  }
+  
+  if (IndexReg.getReg()) {
+    if (NeedPlus) O << " + ";
+    if (ScaleVal != 1)
+      O << ScaleVal << '*';
+    printOperand(MI, Op+2);
+    NeedPlus = true;
+  }
+  
+ 
+  if (!DispSpec.isImm()) {
+    if (NeedPlus) O << " + ";
+    assert(DispSpec.isExpr() && "non-immediate displacement for LEA?");
+    O << *DispSpec.getExpr();
+  } else {
+    // Print the displacement if nonzero, or if it is the only component
+    // (no base and no index); negative values flip the separator to " - ".
+    int64_t DispVal = DispSpec.getImm();
+    if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+      if (NeedPlus) {
+        if (DispVal > 0)
+          O << " + ";
+        else {
+          O << " - ";
+          DispVal = -DispVal;
+        }
+      }
+      O << DispVal;
+    }
+  }
+  
+  O << ']';
+}
+
+/// printMemReference - Print a full memory reference: the optional segment
+/// override (operand Op+4) followed by the address expression at Op.
+void X86IntelInstPrinter::printMemReference(const MCInst *MI, unsigned Op) {
+  // If this has a segment register, print it.
+  if (MI->getOperand(Op+4).getReg()) {
+    printOperand(MI, Op+4);
+    O << ':';
+  }
+  printLeaMemReference(MI, Op);
+}
diff --git a/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h
new file mode 100644
index 0000000..1976177
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86IntelInstPrinter.h
@@ -0,0 +1,99 @@
+//===-- X86IntelInstPrinter.h - Convert X86 MCInst to assembly syntax -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints an X86 MCInst to intel style .s file syntax.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_INTEL_INST_PRINTER_H
+#define X86_INTEL_INST_PRINTER_H
+
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+  class MCOperand;
+  
+/// X86IntelInstPrinter - Prints an X86 MCInst in Intel .s-file syntax;
+/// counterpart of X86ATTInstPrinter (syntax variant 1 in the registry).
+class X86IntelInstPrinter : public MCInstPrinter {
+public:
+  X86IntelInstPrinter(raw_ostream &O, const MCAsmInfo &MAI)
+    : MCInstPrinter(O, MAI) {}
+  
+  virtual void printInst(const MCInst *MI);
+  
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI);
+  static const char *getRegisterName(unsigned RegNo);
+
+
+  void printOperand(const MCInst *MI, unsigned OpNo,
+                    const char *Modifier = 0);
+  void printMemReference(const MCInst *MI, unsigned Op);
+  void printLeaMemReference(const MCInst *MI, unsigned Op);
+  void printSSECC(const MCInst *MI, unsigned Op);
+  void print_pcrel_imm(const MCInst *MI, unsigned OpNo);
+  
+  // Typed memory-operand printers: Intel syntax prefixes the address with an
+  // operand-size keyword (BYTE/WORD/DWORD/... PTR) before the reference.
+  void printopaquemem(const MCInst *MI, unsigned OpNo) {
+    O << "OPAQUE PTR ";
+    printMemReference(MI, OpNo);
+  }
+  
+  void printi8mem(const MCInst *MI, unsigned OpNo) {
+    O << "BYTE PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi16mem(const MCInst *MI, unsigned OpNo) {
+    O << "WORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi32mem(const MCInst *MI, unsigned OpNo) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi64mem(const MCInst *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printi128mem(const MCInst *MI, unsigned OpNo) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf32mem(const MCInst *MI, unsigned OpNo) {
+    O << "DWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf64mem(const MCInst *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf80mem(const MCInst *MI, unsigned OpNo) {
+    O << "XWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  void printf128mem(const MCInst *MI, unsigned OpNo) {
+    O << "XMMWORD PTR ";
+    printMemReference(MI, OpNo);
+  }
+  // LEA operands carry no segment register, hence the Lea variants.
+  void printlea32mem(const MCInst *MI, unsigned OpNo) {
+    O << "DWORD PTR ";
+    printLeaMemReference(MI, OpNo);
+  }
+  void printlea64mem(const MCInst *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printLeaMemReference(MI, OpNo);
+  }
+  void printlea64_32mem(const MCInst *MI, unsigned OpNo) {
+    O << "QWORD PTR ";
+    printLeaMemReference(MI, OpNo);
+  }
+};
+  
+}
+
+#endif
diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
new file mode 100644
index 0000000..fa8d13d
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.cpp
@@ -0,0 +1,427 @@
+//===-- X86MCInstLower.cpp - Convert X86 MachineInstr to an MCInst --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains code to lower X86 MachineInstrs to their corresponding
+// MCInst records.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCInstLower.h"
+#include "X86AsmPrinter.h"
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MCAsmInfo.h"
+#include "X86MCTargetExpr.h"
+#include "llvm/Analysis/DebugInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Type.h"
+using namespace llvm;
+
+
+// Return the X86 subtarget of the asm printer this lowering works for.
+const X86Subtarget &X86MCInstLower::getSubtarget() const {
+  return AsmPrinter.getSubtarget();
+}
+
+// Return the MachO-flavored machine module info (stub lists etc.); only
+// valid on Darwin targets, as the assert enforces.
+MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
+  assert(getSubtarget().isTargetDarwin() &&"Can only get MachO info on darwin");
+  return AsmPrinter.MMI->getObjFileInfo<MachineModuleInfoMachO>(); 
+}
+
+
+// GetPICBaseSymbol - Return the symbol for the current function's PIC base,
+// obtained from the X86 target lowering for the asm printer's function.
+MCSymbol *X86MCInstLower::GetPICBaseSymbol() const {
+  const TargetLowering *TLI = AsmPrinter.TM.getTargetLowering();
+  return static_cast<const X86TargetLowering*>(TLI)->
+    getPICBaseSymbol(AsmPrinter.MF, Ctx);
+}
+
+/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
+/// operand to an MCSymbol.  Applies CygMing name decoration, dllimport
+/// prefixing, and Darwin $non_lazy_ptr/$stub suffixing (registering the
+/// stub in the corresponding MachO stub list as a side effect).
+MCSymbol *X86MCInstLower::
+GetSymbolFromOperand(const MachineOperand &MO) const {
+  assert((MO.isGlobal() || MO.isSymbol()) && "Isn't a symbol reference");
+
+  SmallString<128> Name;
+  
+  if (MO.isGlobal()) {
+    // Darwin stub/non-lazy references are private even if the global isn't:
+    // the mangler must emit them with a private prefix.
+    bool isImplicitlyPrivate = false;
+    if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE ||
+        MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE)
+      isImplicitlyPrivate = true;
+    
+    const GlobalValue *GV = MO.getGlobal();
+    Mang->getNameWithPrefix(Name, GV, isImplicitlyPrivate);
+  
+    if (getSubtarget().isTargetCygMing()) {
+      X86COFFMachineModuleInfo &COFFMMI = 
+        AsmPrinter.MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
+      COFFMMI.DecorateCygMingName(Name, GV, *AsmPrinter.TM.getTargetData());
+    }
+  } else {
+    assert(MO.isSymbol());
+    Name += AsmPrinter.MAI->getGlobalPrefix();
+    Name += MO.getSymbolName();
+  }
+
+  // If the target flags on the operand changes the name of the symbol, do that
+  // before we return the symbol.
+  switch (MO.getTargetFlags()) {
+  default: break;
+  case X86II::MO_DLLIMPORT: {
+    // Handle dllimport linkage.
+    const char *Prefix = "__imp_";
+    Name.insert(Name.begin(), Prefix, Prefix+strlen(Prefix));
+    break;
+  }
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
+    Name += "$non_lazy_ptr";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+
+    // Record the stub -> target mapping so EmitEndOfAsmFile can emit the
+    // __pointers entry for it.
+    MCSymbol *&StubSym = getMachOMMI().getGVStubEntry(Sym);
+    if (StubSym == 0) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym = AsmPrinter.GetGlobalValueSymbol(MO.getGlobal());
+    }
+    return Sym;
+  }
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
+    Name += "$non_lazy_ptr";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+    MCSymbol *&StubSym = getMachOMMI().getHiddenGVStubEntry(Sym);
+    if (StubSym == 0) {
+      assert(MO.isGlobal() && "Extern symbol not handled yet");
+      StubSym = AsmPrinter.GetGlobalValueSymbol(MO.getGlobal());
+    }
+    return Sym;
+  }
+  case X86II::MO_DARWIN_STUB: {
+    Name += "$stub";
+    MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name.str());
+    MCSymbol *&StubSym = getMachOMMI().getFnStubEntry(Sym);
+    if (StubSym)
+      return Sym;
+    
+    if (MO.isGlobal()) {
+      StubSym = AsmPrinter.GetGlobalValueSymbol(MO.getGlobal());
+    } else {
+      // External symbol: strip the "$stub" suffix back off to form the
+      // stub's target symbol.
+      Name.erase(Name.end()-5, Name.end());
+      StubSym = Ctx.GetOrCreateSymbol(Name.str());
+    }
+    return Sym;
+  }
+  }
+
+  return Ctx.GetOrCreateSymbol(Name.str());
+}
+
+/// LowerSymbolOperand - Build the MCExpr for a symbolic operand: the symbol
+/// itself, optionally wrapped in a TLS/GOT/PLT variant, made PIC-base
+/// relative, and/or offset by the operand's constant offset.
+MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                             MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  const MCExpr *Expr = 0;
+  X86MCTargetExpr::VariantKind RefKind = X86MCTargetExpr::Invalid;
+  
+  switch (MO.getTargetFlags()) {
+  default: llvm_unreachable("Unknown target flag on GV operand");
+  case X86II::MO_NO_FLAG:    // No flag.
+  // These affect the name of the symbol, not any suffix.
+  case X86II::MO_DARWIN_NONLAZY:
+  case X86II::MO_DLLIMPORT:
+  case X86II::MO_DARWIN_STUB:
+    break;
+      
+  case X86II::MO_TLSGD:     RefKind = X86MCTargetExpr::TLSGD; break;
+  case X86II::MO_GOTTPOFF:  RefKind = X86MCTargetExpr::GOTTPOFF; break;
+  case X86II::MO_INDNTPOFF: RefKind = X86MCTargetExpr::INDNTPOFF; break;
+  case X86II::MO_TPOFF:     RefKind = X86MCTargetExpr::TPOFF; break;
+  case X86II::MO_NTPOFF:    RefKind = X86MCTargetExpr::NTPOFF; break;
+  case X86II::MO_GOTPCREL:  RefKind = X86MCTargetExpr::GOTPCREL; break;
+  case X86II::MO_GOT:       RefKind = X86MCTargetExpr::GOT; break;
+  case X86II::MO_GOTOFF:    RefKind = X86MCTargetExpr::GOTOFF; break;
+  case X86II::MO_PLT:       RefKind = X86MCTargetExpr::PLT; break;
+  case X86II::MO_PIC_BASE_OFFSET:
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE:
+    Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+    // Subtract the pic base.
+    Expr = MCBinaryExpr::CreateSub(Expr, 
+                               MCSymbolRefExpr::Create(GetPICBaseSymbol(), Ctx),
+                                   Ctx);
+    break;
+  }
+  
+  // If no PIC-relative expression was built above, wrap the symbol either
+  // plainly or in the selected target-specific variant.
+  if (Expr == 0) {
+    if (RefKind == X86MCTargetExpr::Invalid)
+      Expr = MCSymbolRefExpr::Create(Sym, Ctx);
+    else
+      Expr = X86MCTargetExpr::Create(Sym, RefKind, Ctx);
+  }
+  
+  // Fold in any constant offset (jump tables never carry one).
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(Expr,
+                                   MCConstantExpr::Create(MO.getOffset(), Ctx),
+                                   Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+
+
+// lower_subreg32 - Rewrite the register operand at OpNo (if any) to its
+// 32-bit sub/super-register.
+static void lower_subreg32(MCInst *MI, unsigned OpNo) {
+  // Convert registers in the addr mode according to subreg32.
+  unsigned Reg = MI->getOperand(OpNo).getReg();
+  if (Reg != 0)
+    MI->getOperand(OpNo).setReg(getX86SubSuperRegister(Reg, MVT::i32));
+}
+
+// lower_lea64_32mem - Rewrite the register components of the 4-operand
+// address mode starting at OpNo up to their 64-bit super-registers.
+static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
+  // Convert registers in the addr mode according to subreg64.
+  for (unsigned i = 0; i != 4; ++i) {
+    if (!MI->getOperand(OpNo+i).isReg()) continue;
+    
+    unsigned Reg = MI->getOperand(OpNo+i).getReg();
+    if (Reg == 0) continue;  // No register in this address slot.
+    
+    MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
+  }
+}
+
+/// LowerSubReg32_Op0 - Things like MOVZX16rr8 -> MOVZX32rr8: switch the
+/// opcode and widen the destination (operand 0) to its 32-bit register.
+static void LowerSubReg32_Op0(MCInst &OutMI, unsigned NewOpc) {
+  OutMI.setOpcode(NewOpc);
+  lower_subreg32(&OutMI, 0);
+}
+/// LowerUnaryToTwoAddr - R = setb   -> R = sbb R, R: switch the opcode and
+/// duplicate operand 0 so the result register also serves as both sources.
+static void LowerUnaryToTwoAddr(MCInst &OutMI, unsigned NewOpc) {
+  OutMI.setOpcode(NewOpc);
+  OutMI.addOperand(OutMI.getOperand(0));
+  OutMI.addOperand(OutMI.getOperand(0));
+}
+
+
+/// Lower - Convert a MachineInstr to an MCInst: translate each operand,
+/// then rewrite the pseudo/asm-modifier opcodes into their real encodings.
+void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(MI->getOpcode());
+  
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    const MachineOperand &MO = MI->getOperand(i);
+    
+    MCOperand MCOp;
+    switch (MO.getType()) {
+    default:
+      MI->dump();
+      llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_Register:
+      // Ignore all implicit register operands.
+      if (MO.isImplicit()) continue;
+      MCOp = MCOperand::CreateReg(MO.getReg());
+      break;
+    case MachineOperand::MO_Immediate:
+      MCOp = MCOperand::CreateImm(MO.getImm());
+      break;
+    case MachineOperand::MO_MachineBasicBlock:
+      MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
+                       MO.getMBB()->getSymbol(Ctx), Ctx));
+      break;
+    case MachineOperand::MO_GlobalAddress:
+      MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+      break;
+    case MachineOperand::MO_ExternalSymbol:
+      MCOp = LowerSymbolOperand(MO, GetSymbolFromOperand(MO));
+      break;
+    case MachineOperand::MO_JumpTableIndex:
+      MCOp = LowerSymbolOperand(MO, AsmPrinter.GetJTISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_ConstantPoolIndex:
+      MCOp = LowerSymbolOperand(MO, AsmPrinter.GetCPISymbol(MO.getIndex()));
+      break;
+    case MachineOperand::MO_BlockAddress:
+      MCOp = LowerSymbolOperand(MO,
+                        AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()));
+      break;
+    }
+    
+    OutMI.addOperand(MCOp);
+  }
+  
+  // Handle a few special cases to eliminate operand modifiers.
+  switch (OutMI.getOpcode()) {
+  case X86::LEA64_32r: // Handle 'subreg rewriting' for the lea64_32mem operand.
+    lower_lea64_32mem(&OutMI, 1);
+    break;
+  // Narrow zero/sign-extensions are emitted as their 32-bit forms.
+  case X86::MOVZX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
+  case X86::MOVZX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
+  case X86::MOVSX16rr8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rr8); break;
+  case X86::MOVSX16rm8:   LowerSubReg32_Op0(OutMI, X86::MOVSX32rm8); break;
+  case X86::MOVZX64rr32:  LowerSubReg32_Op0(OutMI, X86::MOV32rr); break;
+  case X86::MOVZX64rm32:  LowerSubReg32_Op0(OutMI, X86::MOV32rm); break;
+  case X86::MOV64ri64i32: LowerSubReg32_Op0(OutMI, X86::MOV32ri); break;
+  case X86::MOVZX64rr8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rr8); break;
+  case X86::MOVZX64rm8:   LowerSubReg32_Op0(OutMI, X86::MOVZX32rm8); break;
+  case X86::MOVZX64rr16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rr16); break;
+  case X86::MOVZX64rm16:  LowerSubReg32_Op0(OutMI, X86::MOVZX32rm16); break;
+  // Pseudos that expand to a same-register two-address instruction.
+  case X86::SETB_C8r:     LowerUnaryToTwoAddr(OutMI, X86::SBB8rr); break;
+  case X86::SETB_C16r:    LowerUnaryToTwoAddr(OutMI, X86::SBB16rr); break;
+  case X86::SETB_C32r:    LowerUnaryToTwoAddr(OutMI, X86::SBB32rr); break;
+  case X86::SETB_C64r:    LowerUnaryToTwoAddr(OutMI, X86::SBB64rr); break;
+  case X86::MOV8r0:       LowerUnaryToTwoAddr(OutMI, X86::XOR8rr); break;
+  case X86::MOV32r0:      LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); break;
+  case X86::MMX_V_SET0:   LowerUnaryToTwoAddr(OutMI, X86::MMX_PXORrr); break;
+  case X86::MMX_V_SETALLONES:
+    LowerUnaryToTwoAddr(OutMI, X86::MMX_PCMPEQDrr); break;
+  case X86::FsFLD0SS:     LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
+  case X86::FsFLD0SD:     LowerUnaryToTwoAddr(OutMI, X86::PXORrr); break;
+  case X86::V_SET0:       LowerUnaryToTwoAddr(OutMI, X86::XORPSrr); break;
+  case X86::V_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::PCMPEQDrr); break;
+
+  case X86::MOV16r0:
+    LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV16r0 -> MOV32r0
+    LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+    break;
+  case X86::MOV64r0:
+    LowerSubReg32_Op0(OutMI, X86::MOV32r0);   // MOV64r0 -> MOV32r0
+    LowerUnaryToTwoAddr(OutMI, X86::XOR32rr); // MOV32r0 -> XOR32rr
+    break;
+  }
+}
+
+
+
+/// EmitInstruction - Emit one MachineInstr.  Most instructions are lowered
+/// generically to an MCInst and handed to OutStreamer; a few pseudo
+/// instructions (DBG_VALUE, MOVPC32r, and the MO_GOT_ABSOLUTE_ADDRESS form
+/// of ADD32ri) need custom expansion and return early from their case.
+void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  X86MCInstLower MCInstLowering(OutContext, Mang, *this);
+  switch (MI->getOpcode()) {
+  case TargetOpcode::DBG_VALUE: {
+    // FIXME: if this is implemented for another target before it goes
+    // away completely, the common part should be moved into AsmPrinter.
+    // DBG_VALUE is emitted only as an assembly comment, so there is
+    // nothing to do unless verbose assembly was requested.
+    if (!VerboseAsm)
+      return;
+    O << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+    unsigned NOps = MI->getNumOperands();
+    // The last operand is the metadata node describing the variable.
+    // cast away const; DIetc do not take const operands for some reason.
+    DIVariable V((MDNode*)(MI->getOperand(NOps-1).getMetadata()));
+    O << V.getName();
+    O << " <- ";
+    if (NOps==3) {
+      // Register or immediate value. Register 0 means undef.
+      assert(MI->getOperand(0).getType()==MachineOperand::MO_Register ||
+             MI->getOperand(0).getType()==MachineOperand::MO_Immediate ||
+             MI->getOperand(0).getType()==MachineOperand::MO_FPImmediate);
+      if (MI->getOperand(0).getType()==MachineOperand::MO_Register &&
+          MI->getOperand(0).getReg()==0) {
+        // Suppress offset in this case, it is not meaningful.
+        O << "undef";
+        OutStreamer.AddBlankLine();
+        return;
+      } else if (MI->getOperand(0).getType()==MachineOperand::MO_FPImmediate) {
+        // This is more naturally done in printOperand, but since the only use
+        // of such an operand is in this comment and that is temporary (and it's
+        // ugly), we prefer to keep this localized.
+        // The include of Type.h may be removable when this code is.
+        if (MI->getOperand(0).getFPImm()->getType()->isFloatTy() ||
+            MI->getOperand(0).getFPImm()->getType()->isDoubleTy())
+          MI->getOperand(0).print(O, &TM)
+        else {
+          // There is no good way to print long double.  Convert a copy to
+          // double.  Ah well, it's only a comment.
+          bool ignored;
+          APFloat APF = APFloat(MI->getOperand(0).getFPImm()->getValueAPF());
+          APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+                      &ignored);
+          O << "(long double) " << APF.convertToDouble();
+        }
+      } else
+        printOperand(MI, 0);
+    } else {
+      // Frame address.  Currently handles register +- offset only.
+      assert(MI->getOperand(0).getType()==MachineOperand::MO_Register);
+      assert(MI->getOperand(3).getType()==MachineOperand::MO_Immediate);
+      O << '['; printOperand(MI, 0); O << '+'; printOperand(MI, 3); O << ']';
+    }
+    // Append the offset operand (second-to-last operand).
+    O << "+";
+    printOperand(MI, NOps-2);
+    OutStreamer.AddBlankLine();
+    return;
+  }
+  case X86::MOVPC32r: {
+    MCInst TmpInst;
+    // This is a pseudo op for a two instruction sequence with a label, which
+    // looks like:
+    //     call "L1$pb"
+    // "L1$pb":
+    //     popl %esi
+    
+    // Emit the call.
+    MCSymbol *PICBase = MCInstLowering.GetPICBaseSymbol();
+    TmpInst.setOpcode(X86::CALLpcrel32);
+    // FIXME: We would like an efficient form for this, so we don't have to do a
+    // lot of extra uniquing.
+    TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(PICBase,
+                                                                 OutContext)));
+    OutStreamer.EmitInstruction(TmpInst);
+    
+    // Emit the label.
+    OutStreamer.EmitLabel(PICBase);
+    
+    // popl $reg
+    // Note: TmpInst is reused here; it still holds exactly one operand (the
+    // PICBase expression), which is overwritten with the destination register.
+    TmpInst.setOpcode(X86::POP32r);
+    TmpInst.getOperand(0) = MCOperand::CreateReg(MI->getOperand(0).getReg());
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+      
+  case X86::ADD32ri: {
+    // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.  Any other ADD32ri
+    // falls through to the generic lowering below.
+    if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+      break;
+    
+    // Okay, we have something like:
+    //  EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+    
+    // For this, we want to print something like:
+    //   MYGLOBAL + (. - PICBASE)
+    // However, we can't generate a ".", so just emit a new label here and refer
+    // to it.  We know that this operand flag occurs at most once per function.
+    const char *Prefix = MAI->getPrivateGlobalPrefix();
+    MCSymbol *DotSym = OutContext.GetOrCreateSymbol(Twine(Prefix)+"picbaseref"+
+                                                    Twine(getFunctionNumber()));
+    OutStreamer.EmitLabel(DotSym);
+    
+    // Now that we have emitted the label, lower the complex operand expression.
+    MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+    
+    // Build the expression OpSym + (DotSym - PICBase).
+    const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext);
+    const MCExpr *PICBase =
+      MCSymbolRefExpr::Create(MCInstLowering.GetPICBaseSymbol(), OutContext);
+    DotExpr = MCBinaryExpr::CreateSub(DotExpr, PICBase, OutContext);
+    
+    DotExpr = MCBinaryExpr::CreateAdd(MCSymbolRefExpr::Create(OpSym,OutContext), 
+                                      DotExpr, OutContext);
+    
+    MCInst TmpInst;
+    TmpInst.setOpcode(X86::ADD32ri);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
+    TmpInst.addOperand(MCOperand::CreateExpr(DotExpr));
+    OutStreamer.EmitInstruction(TmpInst);
+    return;
+  }
+  }
+  
+  // Generic path: lower the MachineInstr to an MCInst and emit it.
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  
+  OutStreamer.EmitInstruction(TmpInst);
+}
+
diff --git a/lib/Target/X86/AsmPrinter/X86MCInstLower.h b/lib/Target/X86/AsmPrinter/X86MCInstLower.h
new file mode 100644
index 0000000..ebd23f6
--- /dev/null
+++ b/lib/Target/X86/AsmPrinter/X86MCInstLower.h
@@ -0,0 +1,51 @@
+//===-- X86MCInstLower.h - Lower MachineInstr to MCInst -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_MCINSTLOWER_H
+#define X86_MCINSTLOWER_H
+
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+  class MCContext;
+  class MCInst;
+  class MCOperand;
+  class MCSymbol;
+  class MachineInstr;
+  class MachineModuleInfoMachO;
+  class MachineOperand;
+  class Mangler;
+  class X86AsmPrinter;
+  class X86Subtarget;
+  
+/// X86MCInstLower - This class is used to lower an MachineInstr into an MCInst.
+class VISIBILITY_HIDDEN X86MCInstLower {
+  MCContext &Ctx;             // Context used to create MCSymbols/MCExprs.
+  Mangler *Mang;              // Name mangler for symbol lookup.
+  X86AsmPrinter &AsmPrinter;  // Owning printer; provides per-function state.
+
+  const X86Subtarget &getSubtarget() const;
+public:
+  X86MCInstLower(MCContext &ctx, Mangler *mang, X86AsmPrinter &asmprinter)
+    : Ctx(ctx), Mang(mang), AsmPrinter(asmprinter) {}
+  
+  /// Lower - Translate MI into OutMI.
+  void Lower(const MachineInstr *MI, MCInst &OutMI) const;
+
+  /// GetPICBaseSymbol - Return the symbol used as the PIC base for the
+  /// current function.
+  MCSymbol *GetPICBaseSymbol() const;
+  
+  /// GetSymbolFromOperand - Return the MCSymbol a machine operand refers to.
+  MCSymbol *GetSymbolFromOperand(const MachineOperand &MO) const;
+  /// LowerSymbolOperand - Build the MCOperand (usually an expression)
+  /// referencing Sym, honoring the operand's target flags.
+  MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const;
+  
+private:
+  MachineModuleInfoMachO &getMachOMMI() const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
new file mode 100644
index 0000000..61f26a7
--- /dev/null
+++ b/lib/Target/X86/CMakeLists.txt
@@ -0,0 +1,44 @@
+set(LLVM_TARGET_DEFINITIONS X86.td)
+
+# Run TableGen over X86.td to produce the generated register, instruction,
+# assembler, disassembler, ISel, calling-convention, and subtarget tables.
+tablegen(X86GenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(X86GenRegisterNames.inc -gen-register-enums)
+tablegen(X86GenRegisterInfo.inc -gen-register-desc)
+tablegen(X86GenDisassemblerTables.inc -gen-disassembler)
+tablegen(X86GenInstrNames.inc -gen-instr-enums)
+tablegen(X86GenInstrInfo.inc -gen-instr-desc)
+tablegen(X86GenAsmWriter.inc -gen-asm-writer)
+tablegen(X86GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(X86GenAsmMatcher.inc -gen-asm-matcher)
+tablegen(X86GenDAGISel.inc -gen-dag-isel)
+tablegen(X86GenFastISel.inc -gen-fast-isel)
+tablegen(X86GenCallingConv.inc -gen-callingconv)
+tablegen(X86GenSubtarget.inc -gen-subtarget)
+
+# Sources common to all hosts.
+set(sources
+  X86CodeEmitter.cpp
+  X86COFFMachineModuleInfo.cpp
+  X86ELFWriterInfo.cpp
+  X86FloatingPoint.cpp
+  X86FloatingPointRegKill.cpp
+  X86ISelDAGToDAG.cpp
+  X86ISelLowering.cpp
+  X86InstrInfo.cpp
+  X86JITInfo.cpp
+  X86MCAsmInfo.cpp
+  X86MCCodeEmitter.cpp 
+  X86MCTargetExpr.cpp
+  X86RegisterInfo.cpp
+  X86Subtarget.cpp
+  X86TargetMachine.cpp
+  X86TargetObjectFile.cpp
+  X86FastISel.cpp
+  )
+
+# On 64-bit MSVC builds the JIT compilation callback is written in MASM
+# assembly, so the assembler language must be enabled.
+if( CMAKE_CL_64 )
+  enable_language(ASM_MASM)
+  set(sources ${sources} X86CompilationCallback_Win64.asm)
+endif()
+
+add_llvm_target(X86CodeGen ${sources})
+
+# The code generator uses SelectionDAG facilities directly.
+target_link_libraries (LLVMX86CodeGen LLVMSelectionDAG)
diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt
new file mode 100644
index 0000000..2a83a9c
--- /dev/null
+++ b/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -0,0 +1,7 @@
+# The disassembler needs private headers and generated tables from the main
+# X86 target directory (both the source and binary trees).
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86Disassembler
+  X86Disassembler.cpp
+  X86DisassemblerDecoder.c
+  )
+# The decoder includes X86GenDisassemblerTables.inc, so the TableGen step of
+# the main target must run first.
+add_dependencies(LLVMX86Disassembler X86CodeGenTable_gen)
diff --git a/lib/Target/X86/Disassembler/Makefile b/lib/Target/X86/Disassembler/Makefile
new file mode 100644
index 0000000..b289647
--- /dev/null
+++ b/lib/Target/X86/Disassembler/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/X86/Disassembler/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+# LEVEL points back at the top of the LLVM source tree.
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86Disassembler
+
+# Hack: we need to include 'main' x86 target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
new file mode 100644
index 0000000..a316860
--- /dev/null
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -0,0 +1,476 @@
+//===- X86Disassembler.cpp - Disassembler for x86 and x86_64 ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains code to translate the data produced by the decoder into
+//  MCInsts.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86Disassembler.h"
+#include "X86DisassemblerDecoder.h"
+
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/Support/MemoryObject.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "X86GenRegisterNames.inc"
+
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+namespace llvm {  
+  
+// Fill-ins to make the compiler happy.  These constants are never actually
+//   assigned; they are just filler to make an automatically-generated switch
+//   statement work.
+namespace X86 {
+  enum {
+    BX_SI = 500,
+    BX_DI = 501,
+    BP_SI = 502,
+    BP_DI = 503,
+    sib   = 504,
+    sib64 = 505
+  };
+}
+
+// Target descriptors used below (LLVMInitializeX86Disassembler) to register
+// the disassembler factory functions.
+extern Target TheX86_32Target, TheX86_64Target;
+
+}
+
+static void translateInstruction(MCInst &target,
+                                 InternalInstruction &source);
+
+// Constructor simply records the architecture mode; decoding state is
+// per-call (see getInstruction).
+X86GenericDisassembler::X86GenericDisassembler(DisassemblerMode mode) :
+    MCDisassembler(),
+    fMode(mode) {
+}
+
+// Nothing to clean up; the disassembler holds no resources.
+X86GenericDisassembler::~X86GenericDisassembler() {
+}
+
+/// regionReader - a callback function that wraps the readByte method from
+///   MemoryObject.
+///
+/// @param arg      - The generic callback parameter.  In this case, this should
+///                   be a pointer to a MemoryObject.
+/// @param byte     - A pointer to the byte to be read.
+/// @param address  - The address to be read.
+/// @return         - The return value of MemoryObject::readByte, forwarded
+///                   unchanged.
+static int regionReader(void* arg, uint8_t* byte, uint64_t address) {
+  MemoryObject* region = static_cast<MemoryObject*>(arg);
+  return region->readByte(address, byte);
+}
+
+/// logger - a callback function that wraps the operator<< method from
+///   raw_ostream.
+///
+/// @param arg      - The generic callback parameter.  This should be a pointer
+///                   to a raw_ostream; it may be NULL, in which case the
+///                   message is discarded.
+/// @param log      - A string to be logged.  logger() adds a newline.
+static void logger(void* arg, const char* log) {
+  if (!arg)
+    return;
+  
+  raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
+  vStream << log << "\n";
+}  
+  
+  
+//
+// Public interface for the disassembler
+//
+
+/// getInstruction - See MCDisassembler.  Decodes a single instruction
+///   beginning at \p address in \p region.  decodeInstruction returns nonzero
+///   on failure; in that case \p size is set to the number of bytes the
+///   reader consumed and false is returned.  On success \p size is the
+///   instruction length, the result is translated into \p instr, and true is
+///   returned.
+bool X86GenericDisassembler::getInstruction(MCInst &instr,
+                                            uint64_t &size,
+                                            const MemoryObject &region,
+                                            uint64_t address,
+                                            raw_ostream &vStream) const {
+  InternalInstruction internalInstr;
+  
+  int ret = decodeInstruction(&internalInstr,
+                              regionReader,
+                              (void*)&region,
+                              logger,
+                              (void*)&vStream,
+                              address,
+                              fMode);
+
+  if(ret) {
+    size = internalInstr.readerCursor - address;
+    return false;
+  }
+  else {
+    size = internalInstr.length;
+    translateInstruction(instr, internalInstr);
+    return true;
+  }
+}
+
+//
+// Private code that translates from struct InternalInstructions to MCInsts.
+//
+
+/// translateRegister - Translates an internal register to the appropriate LLVM
+///   register, and appends it as an operand to an MCInst.
+///
+/// @param mcInst     - The MCInst to append to.
+/// @param reg        - The Reg to append.
+static void translateRegister(MCInst &mcInst, Reg reg) {
+// ALL_REGS expands (via ENTRY) to a comma-terminated list "X86::AX, ...";
+// the table maps the decoder's Reg enumeration to LLVM register numbers,
+// and the trailing 0 completes the initializer after the final comma.
+#define ENTRY(x) X86::x,
+  uint8_t llvmRegnums[] = {
+    ALL_REGS
+    0
+  };
+#undef ENTRY
+
+  uint8_t llvmRegnum = llvmRegnums[reg];
+  mcInst.addOperand(MCOperand::CreateReg(llvmRegnum));
+}
+
+/// translateImmediate  - Appends an immediate operand to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param immediate    - The immediate value to append.
+static void translateImmediate(MCInst &mcInst, uint64_t immediate) {
+  mcInst.addOperand(MCOperand::CreateImm(immediate));
+}
+
+/// translateRMRegister - Translates a register stored in the R/M field of the
+///   ModR/M byte to its LLVM equivalent and appends it to an MCInst.
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The internal instruction to extract the R/M field
+///                       from.
+static void translateRMRegister(MCInst &mcInst,
+                                InternalInstruction &insn) {
+  assert(insn.eaBase != EA_BASE_sib && insn.eaBase != EA_BASE_sib64 && 
+         "A R/M register operand may not have a SIB byte");
+  
+  switch (insn.eaBase) {
+  case EA_BASE_NONE:
+    llvm_unreachable("EA_BASE_NONE for ModR/M base");
+    break;
+// Any EA_BASE_* value other than NONE/sib is a memory base and is invalid
+// here; the ENTRY expansion makes every such case fall into the
+// unreachable below.
+#define ENTRY(x) case EA_BASE_##x:
+  ALL_EA_BASES
+#undef ENTRY
+    llvm_unreachable("A R/M register operand may not have a base; "
+                     "the operand must be a register.");
+    break;
+// EA_REG_* values map one-to-one onto LLVM registers.
+#define ENTRY(x)                                                        \
+  case EA_REG_##x:                                                    \
+    mcInst.addOperand(MCOperand::CreateReg(X86::x)); break;
+  ALL_REGS
+#undef ENTRY
+  default:
+    llvm_unreachable("Unexpected EA base register");
+  }
+}
+
+/// translateRMMemory - Translates a memory operand stored in the Mod and R/M
+///   fields of an internal instruction (and possibly its SIB byte) to a memory
+///   operand in LLVM's format, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+/// @param sr           - Whether or not to emit the segment register.  The
+///                       LEA instruction does not expect a segment-register
+///                       operand.
+static void translateRMMemory(MCInst &mcInst,
+                              InternalInstruction &insn,
+                              bool sr) {
+  // Addresses in an MCInst are represented as five operands:
+  //   1. basereg       (register)  The R/M base, or (if there is a SIB) the 
+  //                                SIB base
+  //   2. scaleamount   (immediate) 1, or (if there is a SIB) the specified 
+  //                                scale amount
+  //   3. indexreg      (register)  x86_registerNONE, or (if there is a SIB)
+  //                                the index (which is multiplied by the 
+  //                                scale amount)
+  //   4. displacement  (immediate) 0, or the displacement if there is one
+  //   5. segmentreg    (register)  x86_registerNONE for now, but could be set
+  //                                if we have segment overrides
+  
+  MCOperand baseReg;
+  MCOperand scaleAmount;
+  MCOperand indexReg;
+  MCOperand displacement;
+  MCOperand segmentReg;
+  
+  if (insn.eaBase == EA_BASE_sib || insn.eaBase == EA_BASE_sib64) {
+    // A SIB byte is present; take base, index, and scale from it.  Register 0
+    // is used when the SIB encodes "no base" or "no index".
+    if (insn.sibBase != SIB_BASE_NONE) {
+      switch (insn.sibBase) {
+      default:
+        llvm_unreachable("Unexpected sibBase");
+#define ENTRY(x)                                          \
+      case SIB_BASE_##x:                                  \
+        baseReg = MCOperand::CreateReg(X86::x); break;
+      ALL_SIB_BASES
+#undef ENTRY
+      }
+    } else {
+      baseReg = MCOperand::CreateReg(0);
+    }
+    
+    if (insn.sibIndex != SIB_INDEX_NONE) {
+      switch (insn.sibIndex) {
+      default:
+        llvm_unreachable("Unexpected sibIndex");
+#define ENTRY(x)                                          \
+      case SIB_INDEX_##x:                                 \
+        indexReg = MCOperand::CreateReg(X86::x); break;
+      EA_BASES_32BIT
+      EA_BASES_64BIT
+#undef ENTRY
+      }
+    } else {
+      indexReg = MCOperand::CreateReg(0);
+    }
+    
+    scaleAmount = MCOperand::CreateImm(insn.sibScale);
+  } else {
+    // No SIB byte: the base comes straight from the R/M field, and the
+    // 16-bit BX/BP + SI/DI pairs expand to a base and an index.
+    switch (insn.eaBase) {
+    case EA_BASE_NONE:
+      assert(insn.eaDisplacement != EA_DISP_NONE && 
+             "EA_BASE_NONE and EA_DISP_NONE for ModR/M base");
+      
+      if (insn.mode == MODE_64BIT)
+        baseReg = MCOperand::CreateReg(X86::RIP); // Section 2.2.1.6
+      else
+        baseReg = MCOperand::CreateReg(0);
+      
+      indexReg = MCOperand::CreateReg(0);
+      break;
+    case EA_BASE_BX_SI:
+      baseReg = MCOperand::CreateReg(X86::BX);
+      indexReg = MCOperand::CreateReg(X86::SI);
+      break;
+    case EA_BASE_BX_DI:
+      baseReg = MCOperand::CreateReg(X86::BX);
+      indexReg = MCOperand::CreateReg(X86::DI);
+      break;
+    case EA_BASE_BP_SI:
+      baseReg = MCOperand::CreateReg(X86::BP);
+      indexReg = MCOperand::CreateReg(X86::SI);
+      break;
+    case EA_BASE_BP_DI:
+      baseReg = MCOperand::CreateReg(X86::BP);
+      indexReg = MCOperand::CreateReg(X86::DI);
+      break;
+    default:
+      indexReg = MCOperand::CreateReg(0);
+      switch (insn.eaBase) {
+      default:
+        llvm_unreachable("Unexpected eaBase");
+        break;
+        // Here, we will use the fill-ins defined above.  However,
+        //   BX_SI, BX_DI, BP_SI, and BP_DI are all handled above and
+        //   sib and sib64 were handled in the top-level if, so they're only
+        //   placeholders to keep the compiler happy.
+#define ENTRY(x)                                        \
+      case EA_BASE_##x:                                 \
+        baseReg = MCOperand::CreateReg(X86::x); break; 
+      ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) case EA_REG_##x:
+      ALL_REGS
+#undef ENTRY
+        llvm_unreachable("A R/M memory operand may not be a register; "
+                         "the base field must be a base.");
+            break;
+      }
+    }
+    
+    scaleAmount = MCOperand::CreateImm(1);
+  }
+  
+  displacement = MCOperand::CreateImm(insn.displacement);
+  
+  // Maps the decoder's SEG_OVERRIDE_* enumeration to LLVM segment registers;
+  // index 0 (no override) yields register 0.
+  static const uint8_t segmentRegnums[SEG_OVERRIDE_max] = {
+    0,        // SEG_OVERRIDE_NONE
+    X86::CS,
+    X86::SS,
+    X86::DS,
+    X86::ES,
+    X86::FS,
+    X86::GS
+  };
+  
+  segmentReg = MCOperand::CreateReg(segmentRegnums[insn.segmentOverride]);
+  
+  mcInst.addOperand(baseReg);
+  mcInst.addOperand(scaleAmount);
+  mcInst.addOperand(indexReg);
+  mcInst.addOperand(displacement);
+  
+  if (sr)
+    mcInst.addOperand(segmentReg);
+}
+
+/// translateRM - Translates an operand stored in the R/M (and possibly SIB)
+///   byte of an instruction to LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The instruction to extract Mod, R/M, and SIB fields
+///                       from.
+static void translateRM(MCInst &mcInst,
+                        OperandSpecifier &operand,
+                        InternalInstruction &insn) {
+  // Dispatch on the operand's type: register types go through
+  // translateRMRegister, memory types through translateRMMemory.
+  switch (operand.type) {
+  default:
+    llvm_unreachable("Unexpected type for a R/M operand");
+  case TYPE_R8:
+  case TYPE_R16:
+  case TYPE_R32:
+  case TYPE_R64:
+  case TYPE_Rv:
+  case TYPE_MM:
+  case TYPE_MM32:
+  case TYPE_MM64:
+  case TYPE_XMM:
+  case TYPE_XMM32:
+  case TYPE_XMM64:
+  case TYPE_XMM128:
+  case TYPE_DEBUGREG:
+  case TYPE_CR32:
+  case TYPE_CR64:
+    translateRMRegister(mcInst, insn);
+    break;
+  case TYPE_M:
+  case TYPE_M8:
+  case TYPE_M16:
+  case TYPE_M32:
+  case TYPE_M64:
+  case TYPE_M128:
+  case TYPE_M512:
+  case TYPE_Mv:
+  case TYPE_M32FP:
+  case TYPE_M64FP:
+  case TYPE_M80FP:
+  case TYPE_M16INT:
+  case TYPE_M32INT:
+  case TYPE_M64INT:
+  case TYPE_M1616:
+  case TYPE_M1632:
+  case TYPE_M1664:
+    translateRMMemory(mcInst, insn, true);
+    break;
+  case TYPE_LEA:
+    // LEA takes no segment-register operand (sr == false).
+    translateRMMemory(mcInst, insn, false);
+    break;
+  }
+}
+  
+  
+/// translateFPRegister - Translates a stack position on the FPU stack to its
+///   LLVM form, and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param stackPos     - The stack position to translate (0-7).
+static void translateFPRegister(MCInst &mcInst,
+                                uint8_t stackPos) {
+  assert(stackPos < 8 && "Invalid FP stack position");
+  
+  // NOTE(review): assumes the X86::ST0..ST7 register numbers are consecutive
+  // so that ST0 + stackPos names the right register -- confirm against the
+  // generated register enums.
+  mcInst.addOperand(MCOperand::CreateReg(X86::ST0 + stackPos));
+}
+
+/// translateOperand - Translates an operand stored in an internal instruction 
+///   to LLVM's format and appends it to an MCInst.
+///
+/// @param mcInst       - The MCInst to append to.
+/// @param operand      - The operand, as stored in the descriptor table.
+/// @param insn         - The internal instruction.
+static void translateOperand(MCInst &mcInst,
+                             OperandSpecifier &operand,
+                             InternalInstruction &insn) {
+  // Dispatch on how the operand was encoded, not on what it means; the
+  // meaning (operand.type) is consulted by the helpers where needed.
+  switch (operand.encoding) {
+  default:
+    llvm_unreachable("Unhandled operand encoding during translation");
+  case ENCODING_REG:
+    translateRegister(mcInst, insn.reg);
+    break;
+  case ENCODING_RM:
+    translateRM(mcInst, operand, insn);
+    break;
+  case ENCODING_CB:
+  case ENCODING_CW:
+  case ENCODING_CD:
+  case ENCODING_CP:
+  case ENCODING_CO:
+  case ENCODING_CT:
+    llvm_unreachable("Translation of code offsets isn't supported.");
+  case ENCODING_IB:
+  case ENCODING_IW:
+  case ENCODING_ID:
+  case ENCODING_IO:
+  case ENCODING_Iv:
+  case ENCODING_Ia:
+    // Immediates are consumed in order; numImmediatesTranslated tracks how
+    // many have been used so far (reset in translateInstruction).
+    translateImmediate(mcInst, 
+                       insn.immediates[insn.numImmediatesTranslated++]);
+    break;
+  case ENCODING_RB:
+  case ENCODING_RW:
+  case ENCODING_RD:
+  case ENCODING_RO:
+    translateRegister(mcInst, insn.opcodeRegister);
+    break;
+  case ENCODING_I:
+    translateFPRegister(mcInst, insn.opcodeModifier);
+    break;
+  case ENCODING_Rv:
+    translateRegister(mcInst, insn.opcodeRegister);
+    break;
+  case ENCODING_DUP:
+    // Duplicate of a previous operand; re-translate the operand it refers to.
+    translateOperand(mcInst,
+                     insn.spec->operands[operand.type - TYPE_DUP0],
+                     insn);
+    break;
+  }
+}
+  
+  
+/// translateInstruction - Translates an internal instruction and all its
+///   operands to an MCInst.
+///
+/// @param mcInst       - The MCInst to populate with the instruction's data.
+/// @param insn         - The internal instruction.
+static void translateInstruction(MCInst &mcInst,
+                                 InternalInstruction &insn) {  
+  assert(insn.spec);
+  
+  mcInst.setOpcode(insn.instructionID);
+  
+  int index;
+  
+  // Reset the immediate cursor used by ENCODING_I* operands (see
+  // translateOperand).
+  insn.numImmediatesTranslated = 0;
+  
+  // Translate every operand slot that is actually used by this instruction.
+  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+    if (insn.spec->operands[index].encoding != ENCODING_NONE)                
+      translateOperand(mcInst, insn.spec->operands[index], insn);
+  }
+}
+
+/// createX86_32Disassembler - Factory registered with the TargetRegistry;
+///   the Target argument is unused.
+static const MCDisassembler *createX86_32Disassembler(const Target &T) {
+  return new X86Disassembler::X86_32Disassembler;
+}
+
+/// createX86_64Disassembler - Factory registered with the TargetRegistry;
+///   the Target argument is unused.
+static const MCDisassembler *createX86_64Disassembler(const Target &T) {
+  return new X86Disassembler::X86_64Disassembler;
+}
+
+/// LLVMInitializeX86Disassembler - Entry point called by the target
+///   initialization machinery; registers the 32- and 64-bit disassembler
+///   factories with the TargetRegistry.
+extern "C" void LLVMInitializeX86Disassembler() { 
+  // Register the disassembler.
+  TargetRegistry::RegisterMCDisassembler(TheX86_32Target, 
+                                         createX86_32Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheX86_64Target,
+                                         createX86_64Disassembler);
+}
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
new file mode 100644
index 0000000..0e6e0b0
--- /dev/null
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -0,0 +1,150 @@
+//===- X86Disassembler.h - Disassembler for x86 and x86_64 ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The X86 disassembler is a table-driven disassembler for the 16-, 32-, and
+// 64-bit X86 instruction sets.  The main decode sequence for an assembly
+// instruction in this disassembler is:
+//
+// 1. Read the prefix bytes and determine the attributes of the instruction.
+//    These attributes, recorded in enum attributeBits
+//    (X86DisassemblerDecoderCommon.h), form a bitmask.  The table CONTEXTS_SYM
+//    provides a mapping from bitmasks to contexts, which are represented by
+//    enum InstructionContext (ibid.).
+//
+// 2. Read the opcode, and determine what kind of opcode it is.  The
+//    disassembler distinguishes four kinds of opcodes, which are enumerated in
+//    OpcodeType (X86DisassemblerDecoderCommon.h): one-byte (0xnn), two-byte
+//    (0x0f 0xnn), three-byte-38 (0x0f 0x38 0xnn), or three-byte-3a 
+//    (0x0f 0x3a 0xnn).  Mandatory prefixes are treated as part of the context.
+//
+// 3. Depending on the opcode type, look in one of four ClassDecision structures
+//    (X86DisassemblerDecoderCommon.h).  Use the opcode class to determine which
+//    OpcodeDecision (ibid.) to look the opcode in.  Look up the opcode, to get
+//    a ModRMDecision (ibid.).
+//
+// 4. Some instructions, such as escape opcodes or extended opcodes, or even
+//    instructions that have ModRM*Reg / ModRM*Mem forms in LLVM, need the
+//    ModR/M byte to complete decode.  The ModRMDecision's type is an entry from
+//    ModRMDecisionType (X86DisassemblerDecoderCommon.h) that indicates if the
+//    ModR/M byte is required and how to interpret it.
+//
+// 5. After resolving the ModRMDecision, the disassembler has a unique ID
+//    of type InstrUID (X86DisassemblerDecoderCommon.h).  Looking this ID up in
+//    INSTRUCTIONS_SYM yields the name of the instruction and the encodings and
+//    meanings of its operands.
+//
+// 6. For each operand, its encoding is an entry from OperandEncoding
+//    (X86DisassemblerDecoderCommon.h) and its type is an entry from
+//    OperandType (ibid.).  The encoding indicates how to read it from the
+//    instruction; the type indicates how to interpret the value once it has
+//    been read.  For example, a register operand could be stored in the R/M
+//    field of the ModR/M byte, the REG field of the ModR/M byte, or added to
+//    the main opcode.  This is orthogonal to its meaning (a GPR or an XMM
+//    register, for instance).  Given this information, the operands can be
+//    extracted and interpreted.
+//
+// 7. As the last step, the disassembler translates the instruction information
+//    and operands into a format understandable by the client - in this case, an
+//    MCInst for use by the MC infrastructure.
+//
+// The disassembler is broken broadly into two parts: the table emitter that
+// emits the instruction decode tables discussed above during compilation, and
+// the disassembler itself.  The table emitter is documented in more detail in
+// utils/TableGen/X86DisassemblerEmitter.h.
+//
+// X86Disassembler.h contains the public interface for the disassembler,
+//   adhering to the MCDisassembler interface.
+// X86Disassembler.cpp contains the code responsible for step 7, and for
+//   invoking the decoder to execute steps 1-6.
+// X86DisassemblerDecoderCommon.h contains the definitions needed by both the
+//   table emitter and the disassembler.
+// X86DisassemblerDecoder.h contains the public interface of the decoder,
+//   factored out into C for possible use by other projects.
+// X86DisassemblerDecoder.c contains the source code of the decoder, which is
+//   responsible for steps 1-6.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86DISASSEMBLER_H
+#define X86DISASSEMBLER_H
+
+#define INSTRUCTION_SPECIFIER_FIELDS  \
+  const char*             name;
+
+#define INSTRUCTION_IDS               \
+  InstrUID*  instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+
+#include "llvm/MC/MCDisassembler.h"
+
+struct InternalInstruction;
+
+namespace llvm {
+  
+class MCInst;
+class MemoryObject;
+class raw_ostream;
+  
+namespace X86Disassembler {
+
+/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
+///   All that each platform class should have to do is subclass the
+///   constructor and provide a different disassemblerMode value.
+class X86GenericDisassembler : public MCDisassembler {
+protected:
+  /// Constructor     - Initializes the disassembler.
+  ///
+  /// @param mode     - The X86 architecture mode to decode for.
+  X86GenericDisassembler(DisassemblerMode mode);
+public:
+  ~X86GenericDisassembler();
+
+  /// getInstruction - See MCDisassembler.
+  bool getInstruction(MCInst &instr,
+                      uint64_t &size,
+                      const MemoryObject &region,
+                      uint64_t address,
+                      raw_ostream &vStream) const;
+private:
+  DisassemblerMode              fMode;  // Architecture mode to decode for.
+};
+
+/// X86_16Disassembler - 16-bit X86 disassembler.
+class X86_16Disassembler : public X86GenericDisassembler {
+public:
+  X86_16Disassembler() :
+    X86GenericDisassembler(MODE_16BIT) {
+  }
+};  
+
+/// X86_32Disassembler - 32-bit X86 disassembler.
+class X86_32Disassembler : public X86GenericDisassembler {
+public:
+  X86_32Disassembler() :
+    X86GenericDisassembler(MODE_32BIT) {
+  }
+};
+
+/// X86_64Disassembler - 64-bit X86 disassembler.
+class X86_64Disassembler : public X86GenericDisassembler {
+public:
+  X86_64Disassembler() :
+    X86GenericDisassembler(MODE_64BIT) {
+  }
+};
+
+} // namespace X86Disassembler
+  
+} // namespace llvm
+  
+#endif
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
new file mode 100644
index 0000000..a0a04ba
--- /dev/null
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -0,0 +1,1365 @@
+/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the implementation of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#include <assert.h>   /* for assert()     */
+#include <stdarg.h>   /* for va_*()       */
+#include <stdio.h>    /* for vsnprintf()  */
+#include <stdlib.h>   /* for exit()       */
+#include <string.h>   /* for memset()     */
+
+#include "X86DisassemblerDecoder.h"
+
+#include "X86GenDisassemblerTables.inc"
+
+#define TRUE  1
+#define FALSE 0
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((noreturn))
+#else
+#define NORETURN
+#endif
+
+/*
+ * unreachable - Reports a fatal internal decoder inconsistency and terminates
+ *   the process.  Wrapped in do { } while (0) WITHOUT a trailing semicolon so
+ *   the expansion behaves as a single statement (e.g. in if/else branches);
+ *   the caller supplies the semicolon.
+ */
+#define unreachable(s)                                      \
+  do {                                                      \
+    fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, s);  \
+    exit(-1);                                               \
+  } while (0)
+
+/*
+ * contextForAttrs - Client for the instruction context table.  Takes a set of
+ *   attributes and returns the appropriate decode context.
+ *
+ * @param attrMask  - Attributes, from the enumeration attributeBits.
+ * @return          - The InstructionContext to use when looking up an
+ *                    instruction with these attributes.
+ */
+static InstructionContext contextForAttrs(uint8_t attrMask) {
+  /* CONTEXTS_SYM comes from the TableGen'd X86GenDisassemblerTables.inc. */
+  return CONTEXTS_SYM[attrMask];
+}
+
+/*
+ * modRMRequired - Reads the appropriate instruction table to determine whether
+ *   the ModR/M byte is required to decode a particular instruction.
+ *
+ * @param type        - The opcode type (i.e., how many bytes it has).
+ * @param insnContext - The context for the instruction, as returned by
+ *                      contextForAttrs.
+ * @param opcode      - The last byte of the instruction's opcode, not counting
+ *                      ModR/M extensions and escapes.
+ * @return            - TRUE if the ModR/M byte is required, FALSE otherwise.
+ */
+static int modRMRequired(OpcodeType type,
+                                InstructionContext insnContext,
+                                uint8_t opcode) {
+  const struct ContextDecision* decision = 0;
+  
+  /* An unknown opcode type must be caught here; otherwise decision would stay
+     NULL and be dereferenced below.  (Previously the unreachable() call sat
+     after an unconditional return and was dead code.) */
+  switch (type) {
+  default:
+    unreachable("Unknown opcode type");
+  case ONEBYTE:
+    decision = &ONEBYTE_SYM;
+    break;
+  case TWOBYTE:
+    decision = &TWOBYTE_SYM;
+    break;
+  case THREEBYTE_38:
+    decision = &THREEBYTE38_SYM;
+    break;
+  case THREEBYTE_3A:
+    decision = &THREEBYTE3A_SYM;
+    break;
+  }
+  
+  return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
+    modrm_type != MODRM_ONEENTRY;
+}
+
+/*
+ * decode - Reads the appropriate instruction table to obtain the unique ID of
+ *   an instruction.
+ *
+ * @param type        - See modRMRequired().
+ * @param insnContext - See modRMRequired().
+ * @param opcode      - See modRMRequired().
+ * @param modRM       - The ModR/M byte if required, or any value if not.
+ * @return            - The UID of the instruction, as found in the tables.
+ */
+static InstrUID decode(OpcodeType type,
+                               InstructionContext insnContext,
+                               uint8_t opcode,
+                               uint8_t modRM) {
+  struct ModRMDecision* dec;
+  
+  switch (type) {
+  default:
+    unreachable("Unknown opcode type");
+  case ONEBYTE:
+    dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case TWOBYTE:
+    dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_38:
+    dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  case THREEBYTE_3A:
+    dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+    break;
+  }
+  
+  /* The table may store one entry, a mod-split pair, or one entry per ModR/M
+     value, depending on how finely the instruction is distinguished. */
+  switch (dec->modrm_type) {
+  default:
+    unreachable("Corrupt table!  Unknown modrm_type");
+  case MODRM_ONEENTRY:
+    return dec->instructionIDs[0];
+  case MODRM_SPLITRM:
+    if (modFromModRM(modRM) == 0x3)
+      return dec->instructionIDs[1];
+    else
+      return dec->instructionIDs[0];
+  case MODRM_FULL:
+    return dec->instructionIDs[modRM];
+  }
+  
+  /* Not reached; pacifies compilers that cannot see that unreachable() exits. */
+  return 0;
+}
+
+/*
+ * specifierForUID - Given a UID, returns the name and operand specification for
+ *   that instruction.
+ *
+ * @param uid - The unique ID for the instruction.  This should be returned by
+ *              decode(); specifierForUID will not check bounds.
+ * @return    - A pointer to the specification for that instruction.
+ */
+static struct InstructionSpecifier* specifierForUID(InstrUID uid) {
+  /* Direct index into the TableGen'd specifier array; no bounds checking. */
+  return &INSTRUCTIONS_SYM[uid];
+}
+
+/*
+ * consumeByte - Uses the reader function provided by the user to consume one
+ *   byte from the instruction's memory and advance the cursor.
+ *
+ * @param insn  - The instruction with the reader function to use.  The cursor
+ *                for this instruction is advanced on success.
+ * @param byte  - A pointer to a pre-allocated memory buffer to be populated
+ *                with the data read.
+ * @return      - 0 if the read was successful; nonzero otherwise.
+ */
+static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
+  int status = insn->reader(insn->readerArg, byte, insn->readerCursor);
+  
+  if (status)
+    return status;
+  
+  insn->readerCursor++;
+  return 0;
+}
+
+/*
+ * lookAtByte - Like consumeByte, but does not advance the cursor.
+ *
+ * @param insn  - See consumeByte().
+ * @param byte  - See consumeByte().
+ * @return      - See consumeByte().
+ */
+static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
+  /* readerCursor is deliberately left untouched. */
+  return insn->reader(insn->readerArg, byte, insn->readerCursor);
+}
+
+/*
+ * unconsumeByte - Rewinds the instruction's reader cursor by one byte, undoing
+ *   the most recent consumeByte().
+ *
+ * @param insn  - The instruction whose cursor is to be rewound.
+ */
+static void unconsumeByte(struct InternalInstruction* insn) {
+  insn->readerCursor--;
+}
+
+/*
+ * CONSUME_FUNC - Generates a reader for a fixed-size little-endian value (the
+ *   first byte read becomes the least-significant byte).  The cursor is only
+ *   advanced after all bytes have been read successfully, so a failed read
+ *   leaves the cursor where it was.
+ */
+#define CONSUME_FUNC(name, type)                                  \
+  static int name(struct InternalInstruction* insn, type* ptr) {  \
+    type combined = 0;                                            \
+    unsigned offset;                                              \
+    for (offset = 0; offset < sizeof(type); ++offset) {           \
+      uint8_t byte;                                               \
+      int ret = insn->reader(insn->readerArg,                     \
+                             &byte,                               \
+                             insn->readerCursor + offset);        \
+      if (ret)                                                    \
+        return ret;                                               \
+      combined = combined | ((type)byte << ((type)offset * 8));   \
+    }                                                             \
+    *ptr = combined;                                              \
+    insn->readerCursor += sizeof(type);                           \
+    return 0;                                                     \
+  }
+
+/*
+ * consume* - Use the reader function provided by the user to consume data
+ *   values of various sizes from the instruction's memory and advance the
+ *   cursor appropriately.  These readers perform endian conversion.
+ *
+ * @param insn    - See consumeByte().
+ * @param ptr     - A pointer to a pre-allocated memory of appropriate size to
+ *                  be populated with the data read.
+ * @return        - See consumeByte().
+ */
+CONSUME_FUNC(consumeInt8, int8_t)
+CONSUME_FUNC(consumeInt16, int16_t)
+CONSUME_FUNC(consumeInt32, int32_t)
+CONSUME_FUNC(consumeUInt16, uint16_t)
+CONSUME_FUNC(consumeUInt32, uint32_t)
+CONSUME_FUNC(consumeUInt64, uint64_t)
+
+/*
+ * dbgprintf - Uses the logging function provided by the user to log a single
+ *   message, typically without a carriage-return.  Does nothing if no logging
+ *   function was installed.
+ *
+ * @param insn    - The instruction containing the logging function.
+ * @param format  - See printf().
+ * @param ...     - See printf().
+ */
+static void dbgprintf(struct InternalInstruction* insn,
+                      const char* format,
+                      ...) {  
+  va_list args;
+  char message[256];
+  
+  if (insn->dlog == 0)
+    return;
+  
+  va_start(args, format);
+  (void)vsnprintf(message, sizeof(message), format, args);
+  va_end(args);
+  
+  insn->dlog(insn->dlogArg, message);
+}
+
+/*
+ * setPrefixPresent - Marks that a particular prefix is present at a particular
+ *   location.
+ *
+ * @param insn      - The instruction to be marked as having the prefix.
+ * @param prefix    - The prefix that is present.
+ * @param location  - The location where the prefix is located (in the address
+ *                    space of the instruction's reader).
+ */
+static void setPrefixPresent(struct InternalInstruction* insn,
+                                    uint8_t prefix,
+                                    uint64_t location)
+{
+  /* Both arrays are indexed directly by the prefix byte's value. */
+  insn->prefixPresent[prefix] = 1;
+  insn->prefixLocations[prefix] = location;
+}
+
+/*
+ * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
+ *   present at a given location.
+ *
+ * @param insn      - The instruction to be queried.
+ * @param prefix    - The prefix.
+ * @param location  - The location to query.
+ * @return          - Whether the prefix is at that location.
+ */
+static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
+                               uint8_t prefix,
+                               uint64_t location)
+{
+  return (insn->prefixPresent[prefix] == 1 &&
+          insn->prefixLocations[prefix] == location) ? TRUE : FALSE;
+}
+
+/*
+ * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
+ *   instruction as having them.  Also sets the instruction's default operand,
+ *   address, and other relevant data sizes to report operands correctly.
+ *
+ * @param insn  - The instruction whose prefixes are to be read.
+ * @return      - 0 if the instruction could be read until the end of the prefix
+ *                bytes, and no prefixes conflicted; nonzero otherwise.
+ */
+static int readPrefixes(struct InternalInstruction* insn) {
+  BOOL isPrefix = TRUE;
+  BOOL prefixGroups[4] = { FALSE };
+  uint64_t prefixLocation;
+  uint8_t byte;
+  
+  BOOL hasAdSize = FALSE;
+  BOOL hasOpSize = FALSE;
+  
+  dbgprintf(insn, "readPrefixes()");
+    
+  while (isPrefix) {
+    prefixLocation = insn->readerCursor;
+    
+    if (consumeByte(insn, &byte))
+      return -1;
+    
+    /* Prefixes fall into four groups; a repeated prefix from the same group
+       is legal but suspicious, so it is logged rather than rejected. */
+    switch (byte) {
+    case 0xf0:  /* LOCK */
+    case 0xf2:  /* REPNE/REPNZ */
+    case 0xf3:  /* REP or REPE/REPZ */
+      if (prefixGroups[0])
+        dbgprintf(insn, "Redundant Group 1 prefix");
+      prefixGroups[0] = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x2e:  /* CS segment override -OR- Branch not taken */
+    case 0x36:  /* SS segment override -OR- Branch taken */
+    case 0x3e:  /* DS segment override */
+    case 0x26:  /* ES segment override */
+    case 0x64:  /* FS segment override */
+    case 0x65:  /* GS segment override */
+      switch (byte) {
+      case 0x2e:
+        insn->segmentOverride = SEG_OVERRIDE_CS;
+        break;
+      case 0x36:
+        insn->segmentOverride = SEG_OVERRIDE_SS;
+        break;
+      case 0x3e:
+        insn->segmentOverride = SEG_OVERRIDE_DS;
+        break;
+      case 0x26:
+        insn->segmentOverride = SEG_OVERRIDE_ES;
+        break;
+      case 0x64:
+        insn->segmentOverride = SEG_OVERRIDE_FS;
+        break;
+      case 0x65:
+        insn->segmentOverride = SEG_OVERRIDE_GS;
+        break;
+      default:
+        unreachable("Unhandled override");
+      }
+      if (prefixGroups[1])
+        dbgprintf(insn, "Redundant Group 2 prefix");
+      prefixGroups[1] = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x66:  /* Operand-size override */
+      if (prefixGroups[2])
+        dbgprintf(insn, "Redundant Group 3 prefix");
+      prefixGroups[2] = TRUE;
+      hasOpSize = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    case 0x67:  /* Address-size override */
+      if (prefixGroups[3])
+        dbgprintf(insn, "Redundant Group 4 prefix");
+      prefixGroups[3] = TRUE;
+      hasAdSize = TRUE;
+      setPrefixPresent(insn, byte, prefixLocation);
+      break;
+    default:    /* Not a prefix byte */
+      isPrefix = FALSE;
+      break;
+    }
+    
+    if (isPrefix)
+      dbgprintf(insn, "Found prefix 0x%hhx", byte);
+  }
+  
+  /* At this point byte holds the first non-prefix byte, which has already been
+     consumed; in 64-bit mode it may be a REX prefix. */
+  if (insn->mode == MODE_64BIT) {
+    if ((byte & 0xf0) == 0x40) {
+      uint8_t opcodeByte;
+      
+      if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
+        dbgprintf(insn, "Redundant REX prefix");
+        return -1;
+      }
+      
+      insn->rexPrefix = byte;
+      insn->necessaryPrefixLocation = insn->readerCursor - 2;
+      
+      dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+    } else {                
+      unconsumeByte(insn);
+      insn->necessaryPrefixLocation = insn->readerCursor - 1;
+    }
+  } else {
+    unconsumeByte(insn);
+  }
+  
+  /* Operand-size (0x66) toggles register/immediate sizes; address-size (0x67)
+     toggles address/displacement sizes. */
+  if (insn->mode == MODE_16BIT) {
+    insn->registerSize       = (hasOpSize ? 4 : 2);
+    insn->addressSize        = (hasAdSize ? 4 : 2);
+    insn->displacementSize   = (hasAdSize ? 4 : 2);
+    insn->immediateSize      = (hasOpSize ? 4 : 2);
+  } else if (insn->mode == MODE_32BIT) {
+    insn->registerSize       = (hasOpSize ? 2 : 4);
+    insn->addressSize        = (hasAdSize ? 2 : 4);
+    insn->displacementSize   = (hasAdSize ? 2 : 4);
+    /* Immediate size is governed by the operand-size prefix, as in every
+       other mode; this previously (and wrongly) tested hasAdSize. */
+    insn->immediateSize      = (hasOpSize ? 2 : 4);
+  } else if (insn->mode == MODE_64BIT) {
+    if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+      insn->registerSize       = 8;
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = 4;
+      insn->immediateSize      = 4;
+    } else if (insn->rexPrefix) {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    } else {
+      insn->registerSize       = (hasOpSize ? 2 : 4);
+      insn->addressSize        = (hasAdSize ? 4 : 8);
+      insn->displacementSize   = (hasOpSize ? 2 : 4);
+      insn->immediateSize      = (hasOpSize ? 2 : 4);
+    }
+  }
+  
+  return 0;
+}
+
+/*
+ * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
+ *   extended or escape opcodes).
+ *
+ * @param insn  - The instruction whose opcode is to be read.
+ * @return      - 0 if the opcode could be read successfully; nonzero otherwise.
+ */
+static int readOpcode(struct InternalInstruction* insn) {  
+  /* Determine the length of the primary opcode */
+  
+  uint8_t current;
+  
+  dbgprintf(insn, "readOpcode()");
+  
+  insn->opcodeType = ONEBYTE;
+  if (consumeByte(insn, &current))
+    return -1;
+  
+  /* 0x0f escapes to the two-byte map; 0x0f 0x38 and 0x0f 0x3a escape further
+     to the three-byte maps.  The escape bytes themselves are recorded in
+     twoByteEscape/threeByteEscape. */
+  if (current == 0x0f) {
+    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
+    
+    insn->twoByteEscape = current;
+    
+    if (consumeByte(insn, &current))
+      return -1;
+    
+    if (current == 0x38) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+      
+      insn->threeByteEscape = current;
+      
+      if (consumeByte(insn, &current))
+        return -1;
+      
+      insn->opcodeType = THREEBYTE_38;
+    } else if (current == 0x3a) {
+      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
+      
+      insn->threeByteEscape = current;
+      
+      if (consumeByte(insn, &current))
+        return -1;
+      
+      insn->opcodeType = THREEBYTE_3A;
+    } else {
+      dbgprintf(insn, "Didn't find a three-byte escape prefix");
+      
+      insn->opcodeType = TWOBYTE;
+    }
+  }
+  
+  /*
+   * At this point we have consumed the full opcode.
+   * Anything we consume from here on must be unconsumed.
+   */
+  
+  insn->opcode = current;
+  
+  return 0;
+}
+
+static int readModRM(struct InternalInstruction* insn);
+
+/*
+ * getIDWithAttrMask - Determines the ID of an instruction, consuming
+ *   the ModR/M byte as appropriate for extended and escape opcodes,
+ *   and using a supplied attribute mask.
+ *
+ * @param instructionID - A pointer whose target is filled in with the ID of the
+ *                        instruction.
+ * @param insn          - The instruction whose ID is to be determined.
+ * @param attrMask      - The attribute mask to search.
+ * @return              - 0 if the ModR/M could be read when needed or was not
+ *                        needed; nonzero otherwise.
+ */
+static int getIDWithAttrMask(uint16_t* instructionID,
+                             struct InternalInstruction* insn,
+                             uint8_t attrMask) {
+  BOOL hasModRMExtension;
+  
+  uint8_t instructionClass;
+
+  instructionClass = contextForAttrs(attrMask);
+  
+  hasModRMExtension = modRMRequired(insn->opcodeType,
+                                    instructionClass,
+                                    insn->opcode);
+  
+  if (hasModRMExtension) {
+    /* A ModR/M byte is needed to disambiguate; propagate a read failure
+       instead of decoding with a stale/uninitialized insn->modRM. */
+    if (readModRM(insn))
+      return -1;
+    
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            insn->modRM);
+  } else {
+    *instructionID = decode(insn->opcodeType,
+                            instructionClass,
+                            insn->opcode,
+                            0);
+  }
+      
+  return 0;
+}
+
+/*
+ * is16BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 16-bit whereas the other is not.
+ *
+ * NOTE(review): the identifier misspells "Equivalent"; it is kept as-is to
+ * match existing callers.
+ *
+ * @param orig  - The instruction that is not 16-bit
+ * @param equiv - The instruction that is 16-bit
+ */
+static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
+  off_t i;
+  
+  for(i = 0;; i++) {
+    if(orig[i] == '\0' && equiv[i] == '\0')
+      return TRUE;
+    if(orig[i] == '\0' || equiv[i] == '\0')
+      return FALSE;
+    if(orig[i] != equiv[i]) {
+      /* Mismatches are tolerated only where a size suffix differs:
+         Q/L vs W, and the digits of 64/32 vs 16. */
+      if((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+        continue;
+      if((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+        continue;
+      if((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+        continue;
+      return FALSE;
+    }
+  }
+}
+
+/*
+ * is64BitEquivalent - Determines whether two instruction names refer to
+ * equivalent instructions but one is 64-bit whereas the other is not.
+ *
+ * @param orig  - The instruction that is not 64-bit
+ * @param equiv - The instruction that is 64-bit
+ */
+static BOOL is64BitEquivalent(const char* orig, const char* equiv) {
+  off_t pos = 0;
+  
+  while (1) {
+    char a = orig[pos];
+    char b = equiv[pos];
+    
+    if (a == '\0' && b == '\0')
+      return TRUE;
+    if (a == '\0' || b == '\0')
+      return FALSE;
+    if (a != b) {
+      /* The only tolerated differences are size suffixes:
+         W/L vs Q, and the digits of 16/32 vs 64. */
+      if (!(((a == 'W' || a == 'L') && b == 'Q') ||
+            ((a == '1' || a == '3') && b == '6') ||
+            ((a == '6' || a == '2') && b == '4')))
+        return FALSE;
+    }
+    pos++;
+  }
+}
+
+
+/*
+ * getID - Determines the ID of an instruction, consuming the ModR/M byte as 
+ *   appropriate for extended and escape opcodes.  Determines the attributes and 
+ *   context for the instruction before doing so.
+ *
+ * @param insn  - The instruction whose ID is to be determined.
+ * @return      - 0 if the ModR/M could be read when needed or was not needed;
+ *                nonzero otherwise.
+ */
+static int getID(struct InternalInstruction* insn) {  
+  uint8_t attrMask;
+  uint16_t instructionID;
+  
+  dbgprintf(insn, "getID()");
+    
+  attrMask = ATTR_NONE;
+  
+  if (insn->mode == MODE_64BIT)
+    attrMask |= ATTR_64BIT;
+  
+  if (insn->rexPrefix & 0x08)
+    attrMask |= ATTR_REXW;
+  
+  /* At most one of OPSIZE/XS/XD is chosen, checked in this priority order,
+     and only when the prefix sits at the "necessary" location recorded by
+     readPrefixes(). */
+  if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+    attrMask |= ATTR_OPSIZE;
+  else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
+    attrMask |= ATTR_XS;
+  else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+    attrMask |= ATTR_XD;
+  
+  if(getIDWithAttrMask(&instructionID, insn, attrMask))
+    return -1;
+  
+  /* The following clauses compensate for limitations of the tables. */
+  
+  if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) {
+    /*
+     * Although for SSE instructions it is usually necessary to treat REX.W+F2
+     * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is
+     * an occasional instruction where F2 is incidental and REX.W is the more
+     * significant.  If the decoded instruction is 32-bit and adding REX.W
+     * instead of F2 changes a 32 to a 64, we adopt the new encoding.
+     */
+    
+    struct InstructionSpecifier* spec;
+    uint16_t instructionIDWithREXw;
+    struct InstructionSpecifier* specWithREXw;
+    
+    spec = specifierForUID(instructionID);
+    
+    /* Re-decode with XD dropped (leaving REX.W) and compare the names. */
+    if (getIDWithAttrMask(&instructionIDWithREXw,
+                          insn,
+                          attrMask & (~ATTR_XD))) {
+      /*
+       * Decoding with REX.w would yield nothing; give up and return original
+       * decode.
+       */
+      
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+    
+    specWithREXw = specifierForUID(instructionIDWithREXw);
+    
+    if (is64BitEquivalent(spec->name, specWithREXw->name)) {
+      insn->instructionID = instructionIDWithREXw;
+      insn->spec = specWithREXw;
+    } else {
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+    }
+    return 0;
+  }
+  
+  if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
+    /*
+     * The instruction tables make no distinction between instructions that
+     * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
+     * particular spot (i.e., many MMX operations).  In general we're
+     * conservative, but in the specific case where OpSize is present but not
+     * in the right place we check if there's a 16-bit operation.
+     */
+    
+    struct InstructionSpecifier* spec;
+    uint16_t instructionIDWithOpsize;
+    struct InstructionSpecifier* specWithOpsize;
+    
+    spec = specifierForUID(instructionID);
+    
+    /* Re-decode with OPSIZE forced on and compare the names. */
+    if (getIDWithAttrMask(&instructionIDWithOpsize,
+                          insn,
+                          attrMask | ATTR_OPSIZE)) {
+      /* 
+       * ModRM required with OpSize but not present; give up and return version
+       * without OpSize set
+       */
+      
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+      return 0;
+    }
+    
+    specWithOpsize = specifierForUID(instructionIDWithOpsize);
+    
+    if (is16BitEquvalent(spec->name, specWithOpsize->name)) {
+      insn->instructionID = instructionIDWithOpsize;
+      insn->spec = specWithOpsize;
+    } else {
+      insn->instructionID = instructionID;
+      insn->spec = spec;
+    }
+    return 0;
+  }
+  
+  insn->instructionID = instructionID;
+  insn->spec = specifierForUID(insn->instructionID);
+  
+  return 0;
+}
+
+/*
+ * readSIB - Consumes the SIB byte to determine addressing information for an
+ *   instruction.
+ *
+ * @param insn  - The instruction whose SIB byte is to be read.
+ * @return      - 0 if the SIB byte was successfully read; nonzero otherwise.
+ */
+static int readSIB(struct InternalInstruction* insn) {
+  SIBIndex sibIndexBase = 0;
+  SIBBase sibBaseBase = 0;
+  uint8_t index, base;
+  
+  dbgprintf(insn, "readSIB()");
+  
+  /* Idempotent: a second call is a no-op. */
+  if (insn->consumedSIB)
+    return 0;
+  
+  insn->consumedSIB = TRUE;
+  
+  switch (insn->addressSize) {
+  case 2:
+    dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
+    return -1;
+    break;
+  case 4:
+    sibIndexBase = SIB_INDEX_EAX;
+    sibBaseBase = SIB_BASE_EAX;
+    break;
+  case 8:
+    sibIndexBase = SIB_INDEX_RAX;
+    sibBaseBase = SIB_BASE_RAX;
+    break;
+  }
+
+  if (consumeByte(insn, &insn->sib))
+    return -1;
+  
+  /* REX.X extends the 3-bit index field to 4 bits. */
+  index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+  
+  switch (index) {
+  case 0x4:
+    /* Index 0b100 encodes "no index register". */
+    insn->sibIndex = SIB_INDEX_NONE;
+    break;
+  default:
+    /* NOTE(review): the cast is to EABase although sibIndex holds
+       SIBIndex-range values - confirm the enums are laid out compatibly. */
+    insn->sibIndex = (EABase)(sibIndexBase + index);
+    if (insn->sibIndex == SIB_INDEX_sib ||
+        insn->sibIndex == SIB_INDEX_sib64)
+      insn->sibIndex = SIB_INDEX_NONE;
+    break;
+  }
+  
+  /* The 2-bit scale field encodes factors 1, 2, 4, 8. */
+  switch (scaleFromSIB(insn->sib)) {
+  case 0:
+    insn->sibScale = 1;
+    break;
+  case 1:
+    insn->sibScale = 2;
+    break;
+  case 2:
+    insn->sibScale = 4;
+    break;
+  case 3:
+    insn->sibScale = 8;
+    break;
+  }
+  
+  /* REX.B extends the 3-bit base field to 4 bits. */
+  base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+  
+  switch (base) {
+  case 0x5:
+    /* Base 0b101 is special: its meaning depends on the ModR/M mod field. */
+    switch (modFromModRM(insn->modRM)) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = SIB_BASE_NONE;
+      break;
+    case 0x1:
+      insn->eaDisplacement = EA_DISP_8;
+      insn->sibBase = (insn->addressSize == 4 ? 
+                       SIB_BASE_EBP : SIB_BASE_RBP);
+      break;
+    case 0x2:
+      insn->eaDisplacement = EA_DISP_32;
+      insn->sibBase = (insn->addressSize == 4 ? 
+                       SIB_BASE_EBP : SIB_BASE_RBP);
+      break;
+    case 0x3:
+      unreachable("Cannot have Mod = 0b11 and a SIB byte");
+    }
+    break;
+  default:
+    insn->sibBase = (EABase)(sibBaseBase + base);
+    break;
+  }
+  
+  return 0;
+}
+
+/*
+ * readDisplacement - Consumes the displacement of an instruction.
+ *
+ * @param insn  - The instruction whose displacement is to be read.
+ * @return      - 0 if the displacement byte was successfully read; nonzero 
+ *                otherwise.
+ */
+static int readDisplacement(struct InternalInstruction* insn) {  
+  int8_t d8;
+  int16_t d16;
+  int32_t d32;
+  
+  dbgprintf(insn, "readDisplacement()");
+  
+  /* Idempotent: a second call is a no-op. */
+  if (insn->consumedDisplacement)
+    return 0;
+  
+  insn->consumedDisplacement = TRUE;
+  
+  /* Displacements are sign-extended into insn->displacement. */
+  switch (insn->eaDisplacement) {
+  case EA_DISP_NONE:
+    insn->consumedDisplacement = FALSE;
+    break;
+  case EA_DISP_8:
+    if (consumeInt8(insn, &d8))
+      return -1;
+    insn->displacement = d8;
+    break;
+  case EA_DISP_16:
+    if (consumeInt16(insn, &d16))
+      return -1;
+    insn->displacement = d16;
+    break;
+  case EA_DISP_32:
+    if (consumeInt32(insn, &d32))
+      return -1;
+    insn->displacement = d32;
+    break;
+  }
+  
+  /* NOTE(review): this overrides the FALSE set in the EA_DISP_NONE case
+     above, so consumedDisplacement is always TRUE on return - confirm
+     whether that is intended. */
+  insn->consumedDisplacement = TRUE;
+  return 0;
+}
+
+/*
+ * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
+ *   displacement) for an instruction and interprets it.
+ *
+ * @param insn  - The instruction whose addressing information is to be read.
+ * @return      - 0 if the information was successfully read; nonzero otherwise.
+ */
+static int readModRM(struct InternalInstruction* insn) {  
+  uint8_t mod, rm, reg;
+  
+  dbgprintf(insn, "readModRM()");
+  
+  /* Idempotent: a second call is a no-op. */
+  if (insn->consumedModRM)
+    return 0;
+  
+  /* Previously the return value was discarded, so a failed read would leave
+     modRM stale/uninitialized yet decoding would continue. */
+  if (consumeByte(insn, &insn->modRM))
+    return -1;
+  insn->consumedModRM = TRUE;
+  
+  mod     = modFromModRM(insn->modRM);
+  rm      = rmFromModRM(insn->modRM);
+  reg     = regFromModRM(insn->modRM);
+  
+  /*
+   * This goes by insn->registerSize to pick the correct register, which messes
+   * up if we're using (say) XMM or 8-bit register operands.  That gets fixed in
+   * fixupReg().
+   */
+  switch (insn->registerSize) {
+  case 2:
+    insn->regBase = MODRM_REG_AX;
+    insn->eaRegBase = EA_REG_AX;
+    break;
+  case 4:
+    insn->regBase = MODRM_REG_EAX;
+    insn->eaRegBase = EA_REG_EAX;
+    break;
+  case 8:
+    insn->regBase = MODRM_REG_RAX;
+    insn->eaRegBase = EA_REG_RAX;
+    break;
+  }
+  
+  /* REX.R and REX.B extend the reg and rm fields to 4 bits. */
+  reg |= rFromREX(insn->rexPrefix) << 3;
+  rm  |= bFromREX(insn->rexPrefix) << 3;
+  
+  insn->reg = (Reg)(insn->regBase + reg);
+  
+  switch (insn->addressSize) {
+  case 2:
+    insn->eaBaseBase = EA_BASE_BX_SI;
+     
+    switch (mod) {
+    case 0x0:
+      if (rm == 0x6) {
+        /* mod=00, rm=110 is a bare 16-bit displacement. */
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_16;
+        if (readDisplacement(insn))
+          return -1;
+      } else {
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        insn->eaDisplacement = EA_DISP_NONE;
+      }
+      break;
+    case 0x1:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_8;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x2:
+      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+      insn->eaDisplacement = EA_DISP_16;
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    case 0x3:
+      /* Register form; no memory operand. */
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      if (readDisplacement(insn))
+        return -1;
+      break;
+    }
+    break;
+  case 4:
+  case 8:
+    insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+    
+    switch (mod) {
+    case 0x0:
+      insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
+      switch (rm) {
+      case 0x4:
+      case 0xc:   /* in case REXW.b is set */
+        insn->eaBase = (insn->addressSize == 4 ? 
+                        EA_BASE_sib : EA_BASE_sib64);
+        /* Propagate SIB read failures, which were previously ignored. */
+        if (readSIB(insn) || readDisplacement(insn))
+          return -1;
+        break;
+      case 0x5:
+        /* mod=00, rm=101 is a bare 32-bit displacement. */
+        insn->eaBase = EA_BASE_NONE;
+        insn->eaDisplacement = EA_DISP_32;
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        break;
+      }
+      break;
+    case 0x1:
+    case 0x2:
+      insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+      switch (rm) {
+      case 0x4:
+      case 0xc:   /* in case REXW.b is set */
+        insn->eaBase = EA_BASE_sib;
+        /* Propagate SIB read failures, which were previously ignored. */
+        if (readSIB(insn) || readDisplacement(insn))
+          return -1;
+        break;
+      default:
+        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
+        if (readDisplacement(insn))
+          return -1;
+        break;
+      }
+      break;
+    case 0x3:
+      /* Register form; no memory operand. */
+      insn->eaDisplacement = EA_DISP_NONE;
+      insn->eaBase = (EABase)(insn->eaRegBase + rm);
+      break;
+    }
+    break;
+  } /* switch (insn->addressSize) */
+  
+  return 0;
+}
+
+/*
+ * GENERIC_FIXUP_FUNC - Generates a function that maps an (operand type, index)
+ *   pair onto the register enum family named by the given prefix, setting
+ *   *valid to 0 when the index is out of range for that register class.
+ */
+#define GENERIC_FIXUP_FUNC(name, base, prefix)            \
+  static uint8_t name(struct InternalInstruction *insn,   \
+                      OperandType type,                   \
+                      uint8_t index,                      \
+                      uint8_t *valid) {                   \
+    *valid = 1;                                           \
+    switch (type) {                                       \
+    default:                                              \
+      unreachable("Unhandled register type");             \
+    case TYPE_Rv:                                         \
+      return base + index;                                \
+    case TYPE_R8:                                         \
+      if(insn->rexPrefix &&                               \
+         index >= 4 && index <= 7) {                      \
+        return prefix##_SPL + (index - 4);                \
+      } else {                                            \
+        return prefix##_AL + index;                       \
+      }                                                   \
+    case TYPE_R16:                                        \
+      return prefix##_AX + index;                         \
+    case TYPE_R32:                                        \
+      return prefix##_EAX + index;                        \
+    case TYPE_R64:                                        \
+      return prefix##_RAX + index;                        \
+    case TYPE_XMM128:                                     \
+    case TYPE_XMM64:                                      \
+    case TYPE_XMM32:                                      \
+    case TYPE_XMM:                                        \
+      return prefix##_XMM0 + index;                       \
+    case TYPE_MM64:                                       \
+    case TYPE_MM32:                                       \
+    case TYPE_MM:                                         \
+      if(index > 7)                                       \
+        *valid = 0;                                       \
+      return prefix##_MM0 + index;                        \
+    case TYPE_SEGMENTREG:                                 \
+      if(index > 5)                                       \
+        *valid = 0;                                       \
+      return prefix##_ES + index;                         \
+    case TYPE_DEBUGREG:                                   \
+      if(index > 7)                                       \
+        *valid = 0;                                       \
+      return prefix##_DR0 + index;                        \
+    case TYPE_CR32:                                       \
+      if(index > 7)                                       \
+        *valid = 0;                                       \
+      return prefix##_ECR0 + index;                       \
+    case TYPE_CR64:                                       \
+      if(index > 8)                                       \
+        *valid = 0;                                       \
+      return prefix##_RCR0 + index;                       \
+    }                                                     \
+  }
+
+/*
+ * fixup*Value - Consults an operand type to determine the meaning of the
+ *   reg or R/M field.  If the operand is an XMM operand, for example, the
+ *   field refers to XMM0 rather than AX, which readModRM() would otherwise
+ *   report.
+ *
+ * @param insn  - The instruction containing the operand.
+ * @param type  - The operand type.
+ * @param index - The existing value of the field as reported by readModRM().
+ * @param valid - The address of a uint8_t.  The target is set to 1 if the
+ *                field is valid for the register class; 0 if not.
+ */
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase,    MODRM_REG)
+GENERIC_FIXUP_FUNC(fixupRMValue,  insn->eaRegBase,  EA_REG)
+
+/*
+ * fixupReg - Consults an operand specifier to determine which of the
+ *   fixup*Value functions to use in correcting readModRM()'s interpretation.
+ *
+ * @param insn  - See fixup*Value().
+ * @param op    - The operand specifier.
+ * @return      - 0 if fixup was successful; -1 if the register returned was
+ *                invalid for its class.
+ */
+static int fixupReg(struct InternalInstruction *insn, 
+                    struct OperandSpecifier *op) {
+  uint8_t valid;
+
+  dbgprintf(insn, "fixupReg()");
+
+  if ((OperandEncoding)op->encoding == ENCODING_REG) {
+    /* The reg field always names a register, so translate unconditionally. */
+    insn->reg = (Reg)fixupRegValue(insn,
+                                   (OperandType)op->type,
+                                   insn->reg - insn->regBase,
+                                   &valid);
+    return valid ? 0 : -1;
+  }
+
+  if ((OperandEncoding)op->encoding != ENCODING_RM)
+    unreachable("Expected a REG or R/M encoding in fixupReg");
+
+  /* The R/M field names a register only when it is not a memory base. */
+  if (insn->eaBase >= insn->eaRegBase) {
+    insn->eaBase = (EABase)fixupRMValue(insn,
+                                        (OperandType)op->type,
+                                        insn->eaBase - insn->eaRegBase,
+                                        &valid);
+    if (!valid)
+      return -1;
+  }
+
+  return 0;
+}
+
+/*
+ * readOpcodeModifier - Reads an operand from the opcode field of an 
+ *   instruction.  Handles AddRegFrm instructions.
+ *
+ * @param insn    - The instruction whose opcode field is to be read.
+ *                  Whether the modifier comes from the opcode byte or from
+ *                  the ModR/M byte (escape opcodes) is selected by the
+ *                  specifier's modifierType field; there is no inModRM
+ *                  parameter.
+ */
+static void readOpcodeModifier(struct InternalInstruction* insn) {
+  dbgprintf(insn, "readOpcodeModifier()");
+
+  /* Idempotent: only the first call consumes the modifier. */
+  if (insn->consumedOpcodeModifier)
+    return;
+
+  insn->consumedOpcodeModifier = TRUE;
+
+  if (insn->spec->modifierType == MODIFIER_OPCODE) {
+    /* The modifier is encoded in the opcode byte itself. */
+    insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
+  } else if (insn->spec->modifierType == MODIFIER_MODRM) {
+    /* The modifier is encoded in the ModR/M byte (escape opcodes). */
+    insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
+  } else if (insn->spec->modifierType == MODIFIER_NONE) {
+    unreachable("No modifier but an operand expects one.");
+  } else {
+    unreachable("Unknown modifier type.");
+  }
+}
+
+/*
+ * readOpcodeRegister - Reads an operand from the opcode field of an 
+ *   instruction and interprets it appropriately given the operand width.
+ *   Handles AddRegFrm instructions.
+ *
+ * @param insn  - See readOpcodeModifier().
+ * @param size  - The width (in bytes) of the register being specified.
+ *                1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+ *                RAX.
+ */
+static void readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
+  dbgprintf(insn, "readOpcodeRegister()");
+
+  readOpcodeModifier(insn);
+  
+  /* size == 0 means "use the instruction's operand-size attribute". */
+  if (size == 0)
+    size = insn->registerSize;
+  
+  switch (size) {
+  case 1:
+    /* REX.B supplies the high bit of the register number. */
+    insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) 
+                                                  | insn->opcodeModifier));
+    /* With any REX prefix, 8-bit encodings 4-7 mean SPL-DIL, not AH-BH. */
+    if(insn->rexPrefix && 
+       insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+       insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+      insn->opcodeRegister = (Reg)(MODRM_REG_SPL
+                                   + (insn->opcodeRegister - MODRM_REG_AL - 4));
+    }
+      
+    break;
+  case 2:
+    insn->opcodeRegister = (Reg)(MODRM_REG_AX
+                                 + ((bFromREX(insn->rexPrefix) << 3) 
+                                    | insn->opcodeModifier));
+    break;
+  case 4:
+    /* The stray second '+' (harmless unary plus) that was here is removed
+       for consistency with the other cases. */
+    insn->opcodeRegister = (Reg)(MODRM_REG_EAX
+                                 + ((bFromREX(insn->rexPrefix) << 3) 
+                                    | insn->opcodeModifier));
+    break;
+  case 8:
+    insn->opcodeRegister = (Reg)(MODRM_REG_RAX 
+                                 + ((bFromREX(insn->rexPrefix) << 3) 
+                                    | insn->opcodeModifier));
+    break;
+  }
+}
+
+/*
+ * readImmediate - Consumes an immediate operand from an instruction, given the
+ *   desired operand size.
+ *
+ * @param insn  - The instruction whose operand is to be read.
+ * @param size  - The width (in bytes) of the operand.
+ * @return      - 0 if the immediate was successfully consumed; nonzero
+ *                otherwise.
+ */
+static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
+  dbgprintf(insn, "readImmediate()");
+  
+  if (insn->numImmediatesConsumed == 2)
+    unreachable("Already consumed two immediates");
+  
+  /* size == 0 means "use the precomputed immediate size"; an explicit size
+     is recorded so later stages can see what was consumed. */
+  if (size == 0)
+    size = insn->immediateSize;
+  else
+    insn->immediateSize = size;
+  
+  switch (size) {
+  case 1: {
+    uint8_t imm8;
+    if (consumeByte(insn, &imm8))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm8;
+    break;
+  }
+  case 2: {
+    uint16_t imm16;
+    if (consumeUInt16(insn, &imm16))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm16;
+    break;
+  }
+  case 4: {
+    uint32_t imm32;
+    if (consumeUInt32(insn, &imm32))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm32;
+    break;
+  }
+  case 8: {
+    uint64_t imm64;
+    if (consumeUInt64(insn, &imm64))
+      return -1;
+    insn->immediates[insn->numImmediatesConsumed] = imm64;
+    break;
+  }
+  }
+  
+  insn->numImmediatesConsumed++;
+  
+  return 0;
+}
+
+/*
+ * readOperands - Consults the specifier for an instruction and consumes all
+ *   operands for that instruction, interpreting them as it goes.
+ *
+ * @param insn  - The instruction whose operands are to be read and interpreted.
+ * @return      - 0 if all operands could be read; nonzero otherwise.
+ */
+static int readOperands(struct InternalInstruction* insn) {
+  int index;
+  
+  dbgprintf(insn, "readOperands()");
+  
+  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
+    switch (insn->spec->operands[index].encoding) {
+    case ENCODING_NONE:
+      break;
+    case ENCODING_REG:
+    case ENCODING_RM:
+      if (readModRM(insn))
+        return -1;
+      if (fixupReg(insn, &insn->spec->operands[index]))
+        return -1;
+      break;
+    case ENCODING_CB:
+    case ENCODING_CW:
+    case ENCODING_CD:
+    case ENCODING_CP:
+    case ENCODING_CO:
+    case ENCODING_CT:
+      /* Fixed typo in the log message ("hande" -> "handle"). */
+      dbgprintf(insn, "We currently don't handle code-offset encodings");
+      return -1;
+    case ENCODING_IB:
+      if (readImmediate(insn, 1))
+        return -1;
+      break;
+    case ENCODING_IW:
+      if (readImmediate(insn, 2))
+        return -1;
+      break;
+    case ENCODING_ID:
+      if (readImmediate(insn, 4))
+        return -1;
+      break;
+    case ENCODING_IO:
+      if (readImmediate(insn, 8))
+        return -1;
+      break;
+    case ENCODING_Iv:
+      /* Bug fix: the result of readImmediate() was previously discarded
+         here and for ENCODING_Ia, so a failed read went unreported. */
+      if (readImmediate(insn, insn->immediateSize))
+        return -1;
+      break;
+    case ENCODING_Ia:
+      if (readImmediate(insn, insn->addressSize))
+        return -1;
+      break;
+    case ENCODING_RB:
+      readOpcodeRegister(insn, 1);
+      break;
+    case ENCODING_RW:
+      readOpcodeRegister(insn, 2);
+      break;
+    case ENCODING_RD:
+      readOpcodeRegister(insn, 4);
+      break;
+    case ENCODING_RO:
+      readOpcodeRegister(insn, 8);
+      break;
+    case ENCODING_Rv:
+      readOpcodeRegister(insn, 0);
+      break;
+    case ENCODING_I:
+      readOpcodeModifier(insn);
+      break;
+    case ENCODING_DUP:
+      /* Duplicates another operand; no additional bytes are consumed. */
+      break;
+    default:
+      dbgprintf(insn, "Encountered an operand with an unknown encoding.");
+      return -1;
+    }
+  }
+  
+  return 0;
+}
+
+/*
+ * decodeInstruction - Reads and interprets a full instruction provided by the
+ *   user.
+ *
+ * @param insn      - A pointer to the instruction to be populated.  Must be 
+ *                    pre-allocated.
+ * @param reader    - The function to be used to read the instruction's bytes.
+ * @param readerArg - A generic argument to be passed to the reader to store
+ *                    any internal state.
+ * @param logger    - If non-NULL, the function to be used to write log messages
+ *                    and warnings.
+ * @param loggerArg - A generic argument to be passed to the logger to store
+ *                    any internal state.
+ * @param startLoc  - The address (in the reader's address space) of the first
+ *                    byte in the instruction.
+ * @param mode      - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
+ *                    decode the instruction in.
+ * @return          - 0 if the instruction's memory could be read; nonzero if
+ *                    not.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+                      byteReader_t reader,
+                      void* readerArg,
+                      dlog_t logger,
+                      void* loggerArg,
+                      uint64_t startLoc,
+                      DisassemblerMode mode) {
+  memset(insn, 0, sizeof(struct InternalInstruction));
+    
+  insn->reader = reader;
+  insn->readerArg = readerArg;
+  insn->dlog = logger;
+  insn->dlogArg = loggerArg;
+  insn->startLocation = startLoc;
+  insn->readerCursor = startLoc;
+  insn->mode = mode;
+  insn->numImmediatesConsumed = 0;
+  
+  /* Each stage returns nonzero on failure; an instructionID of 0 means the
+     opcode did not resolve to a known instruction. */
+  if (readPrefixes(insn)       ||
+      readOpcode(insn)         ||
+      getID(insn)              ||
+      insn->instructionID == 0 ||
+      readOperands(insn))
+    return -1;
+  
+  insn->length = insn->readerCursor - insn->startLocation;
+  
+  /* Cast explicitly: uint64_t and size_t are not required to match %llx/%llu
+     on every host, and a mismatched printf argument is undefined behavior. */
+  dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %llu",
+            (unsigned long long)startLoc,
+            (unsigned long long)insn->readerCursor,
+            (unsigned long long)insn->length);
+    
+  if (insn->length > 15)
+    dbgprintf(insn, "Instruction exceeds 15-byte limit");
+  
+  return 0;
+}
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
new file mode 100644
index 0000000..c03c07a
--- /dev/null
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -0,0 +1,515 @@
+/*===- X86DisassemblerDecoder.h - Disassembler decoder -------------*- C -*-==*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains the public interface of the instruction decoder.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+#ifndef X86DISASSEMBLERDECODER_H
+#define X86DISASSEMBLERDECODER_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+  
+#define INSTRUCTION_SPECIFIER_FIELDS  \
+  const char*             name;
+
+#define INSTRUCTION_IDS     \
+  InstrUID*  instructionIDs;
+
+#include "X86DisassemblerDecoderCommon.h"
+  
+#undef INSTRUCTION_SPECIFIER_FIELDS
+#undef INSTRUCTION_IDS
+  
+/*
+ * Accessor functions for various fields of an Intel instruction
+ */
+/* ModR/M byte: mod (bits 7-6), reg (bits 5-3), r/m (bits 2-0).
+   Macro parameters are parenthesized so expression arguments expand
+   correctly (e.g. modFromModRM(a | b)). */
+#define modFromModRM(modRM)  (((modRM) & 0xc0) >> 6)
+#define regFromModRM(modRM)  (((modRM) & 0x38) >> 3)
+#define rmFromModRM(modRM)   ((modRM) & 0x7)
+/* SIB byte: scale (bits 7-6), index (bits 5-3), base (bits 2-0). */
+#define scaleFromSIB(sib)    (((sib) & 0xc0) >> 6)
+#define indexFromSIB(sib)    (((sib) & 0x38) >> 3)
+#define baseFromSIB(sib)     ((sib) & 0x7)
+/* REX prefix: W (bit 3), R (bit 2), X (bit 1), B (bit 0). */
+#define wFromREX(rex)        (((rex) & 0x8) >> 3)
+#define rFromREX(rex)        (((rex) & 0x4) >> 2)
+#define xFromREX(rex)        (((rex) & 0x2) >> 1)
+#define bFromREX(rex)        ((rex) & 0x1)
+
+/*
+ * These enums represent Intel registers for use by the decoder.
+ */
+
+#define REGS_8BIT     \
+  ENTRY(AL)           \
+  ENTRY(CL)           \
+  ENTRY(DL)           \
+  ENTRY(BL)           \
+  ENTRY(AH)           \
+  ENTRY(CH)           \
+  ENTRY(DH)           \
+  ENTRY(BH)           \
+  ENTRY(R8B)          \
+  ENTRY(R9B)          \
+  ENTRY(R10B)         \
+  ENTRY(R11B)         \
+  ENTRY(R12B)         \
+  ENTRY(R13B)         \
+  ENTRY(R14B)         \
+  ENTRY(R15B)         \
+  ENTRY(SPL)          \
+  ENTRY(BPL)          \
+  ENTRY(SIL)          \
+  ENTRY(DIL)
+
+#define EA_BASES_16BIT  \
+  ENTRY(BX_SI)          \
+  ENTRY(BX_DI)          \
+  ENTRY(BP_SI)          \
+  ENTRY(BP_DI)          \
+  ENTRY(SI)             \
+  ENTRY(DI)             \
+  ENTRY(BP)             \
+  ENTRY(BX)             \
+  ENTRY(R8W)            \
+  ENTRY(R9W)            \
+  ENTRY(R10W)           \
+  ENTRY(R11W)           \
+  ENTRY(R12W)           \
+  ENTRY(R13W)           \
+  ENTRY(R14W)           \
+  ENTRY(R15W)
+
+#define REGS_16BIT    \
+  ENTRY(AX)           \
+  ENTRY(CX)           \
+  ENTRY(DX)           \
+  ENTRY(BX)           \
+  ENTRY(SP)           \
+  ENTRY(BP)           \
+  ENTRY(SI)           \
+  ENTRY(DI)           \
+  ENTRY(R8W)          \
+  ENTRY(R9W)          \
+  ENTRY(R10W)         \
+  ENTRY(R11W)         \
+  ENTRY(R12W)         \
+  ENTRY(R13W)         \
+  ENTRY(R14W)         \
+  ENTRY(R15W)
+
+#define EA_BASES_32BIT  \
+  ENTRY(EAX)            \
+  ENTRY(ECX)            \
+  ENTRY(EDX)            \
+  ENTRY(EBX)            \
+  ENTRY(sib)            \
+  ENTRY(EBP)            \
+  ENTRY(ESI)            \
+  ENTRY(EDI)            \
+  ENTRY(R8D)            \
+  ENTRY(R9D)            \
+  ENTRY(R10D)           \
+  ENTRY(R11D)           \
+  ENTRY(R12D)           \
+  ENTRY(R13D)           \
+  ENTRY(R14D)           \
+  ENTRY(R15D)
+
+#define REGS_32BIT  \
+  ENTRY(EAX)        \
+  ENTRY(ECX)        \
+  ENTRY(EDX)        \
+  ENTRY(EBX)        \
+  ENTRY(ESP)        \
+  ENTRY(EBP)        \
+  ENTRY(ESI)        \
+  ENTRY(EDI)        \
+  ENTRY(R8D)        \
+  ENTRY(R9D)        \
+  ENTRY(R10D)       \
+  ENTRY(R11D)       \
+  ENTRY(R12D)       \
+  ENTRY(R13D)       \
+  ENTRY(R14D)       \
+  ENTRY(R15D)
+
+#define EA_BASES_64BIT  \
+  ENTRY(RAX)            \
+  ENTRY(RCX)            \
+  ENTRY(RDX)            \
+  ENTRY(RBX)            \
+  ENTRY(sib64)          \
+  ENTRY(RBP)            \
+  ENTRY(RSI)            \
+  ENTRY(RDI)            \
+  ENTRY(R8)             \
+  ENTRY(R9)             \
+  ENTRY(R10)            \
+  ENTRY(R11)            \
+  ENTRY(R12)            \
+  ENTRY(R13)            \
+  ENTRY(R14)            \
+  ENTRY(R15)
+
+#define REGS_64BIT  \
+  ENTRY(RAX)        \
+  ENTRY(RCX)        \
+  ENTRY(RDX)        \
+  ENTRY(RBX)        \
+  ENTRY(RSP)        \
+  ENTRY(RBP)        \
+  ENTRY(RSI)        \
+  ENTRY(RDI)        \
+  ENTRY(R8)         \
+  ENTRY(R9)         \
+  ENTRY(R10)        \
+  ENTRY(R11)        \
+  ENTRY(R12)        \
+  ENTRY(R13)        \
+  ENTRY(R14)        \
+  ENTRY(R15)
+
+#define REGS_MMX  \
+  ENTRY(MM0)      \
+  ENTRY(MM1)      \
+  ENTRY(MM2)      \
+  ENTRY(MM3)      \
+  ENTRY(MM4)      \
+  ENTRY(MM5)      \
+  ENTRY(MM6)      \
+  ENTRY(MM7)
+
+#define REGS_XMM  \
+  ENTRY(XMM0)     \
+  ENTRY(XMM1)     \
+  ENTRY(XMM2)     \
+  ENTRY(XMM3)     \
+  ENTRY(XMM4)     \
+  ENTRY(XMM5)     \
+  ENTRY(XMM6)     \
+  ENTRY(XMM7)     \
+  ENTRY(XMM8)     \
+  ENTRY(XMM9)     \
+  ENTRY(XMM10)    \
+  ENTRY(XMM11)    \
+  ENTRY(XMM12)    \
+  ENTRY(XMM13)    \
+  ENTRY(XMM14)    \
+  ENTRY(XMM15)
+  
+#define REGS_SEGMENT \
+  ENTRY(ES)          \
+  ENTRY(CS)          \
+  ENTRY(SS)          \
+  ENTRY(DS)          \
+  ENTRY(FS)          \
+  ENTRY(GS)
+  
+#define REGS_DEBUG  \
+  ENTRY(DR0)        \
+  ENTRY(DR1)        \
+  ENTRY(DR2)        \
+  ENTRY(DR3)        \
+  ENTRY(DR4)        \
+  ENTRY(DR5)        \
+  ENTRY(DR6)        \
+  ENTRY(DR7)
+
+#define REGS_CONTROL_32BIT  \
+  ENTRY(ECR0)               \
+  ENTRY(ECR1)               \
+  ENTRY(ECR2)               \
+  ENTRY(ECR3)               \
+  ENTRY(ECR4)               \
+  ENTRY(ECR5)               \
+  ENTRY(ECR6)               \
+  ENTRY(ECR7)
+
+#define REGS_CONTROL_64BIT  \
+  ENTRY(RCR0)               \
+  ENTRY(RCR1)               \
+  ENTRY(RCR2)               \
+  ENTRY(RCR3)               \
+  ENTRY(RCR4)               \
+  ENTRY(RCR5)               \
+  ENTRY(RCR6)               \
+  ENTRY(RCR7)               \
+  ENTRY(RCR8)
+  
+#define ALL_EA_BASES  \
+  EA_BASES_16BIT      \
+  EA_BASES_32BIT      \
+  EA_BASES_64BIT
+  
+#define ALL_SIB_BASES \
+  REGS_32BIT          \
+  REGS_64BIT
+
+#define ALL_REGS      \
+  REGS_8BIT           \
+  REGS_16BIT          \
+  REGS_32BIT          \
+  REGS_64BIT          \
+  REGS_MMX            \
+  REGS_XMM            \
+  REGS_SEGMENT        \
+  REGS_DEBUG          \
+  REGS_CONTROL_32BIT  \
+  REGS_CONTROL_64BIT  \
+  ENTRY(RIP)
+
+/*
+ * EABase - All possible values of the base field for effective-address 
+ *   computations, a.k.a. the Mod and R/M fields of the ModR/M byte.  We
+ *   distinguish between bases (EA_BASE_*) and registers that just happen to be
+ *   referred to when Mod == 0b11 (EA_REG_*).
+ */
+typedef enum {
+  EA_BASE_NONE,
+#define ENTRY(x) EA_BASE_##x,
+  ALL_EA_BASES
+#undef ENTRY
+#define ENTRY(x) EA_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  EA_max               /* sentinel; one past the last valid value */
+} EABase;
+  
+/* 
+ * SIBIndex - All possible values of the SIB index field.
+ *   Borrows entries from ALL_EA_BASES with the special case that
+ *   sib is synonymous with NONE.
+ */
+typedef enum {
+  SIB_INDEX_NONE,
+#define ENTRY(x) SIB_INDEX_##x,
+  ALL_EA_BASES
+#undef ENTRY
+  SIB_INDEX_max        /* sentinel; one past the last valid value */
+} SIBIndex;
+  
+/*
+ * SIBBase - All possible values of the SIB base field.
+ */
+typedef enum {
+  SIB_BASE_NONE,
+#define ENTRY(x) SIB_BASE_##x,
+  ALL_SIB_BASES
+#undef ENTRY
+  SIB_BASE_max         /* sentinel; one past the last valid value */
+} SIBBase;
+
+/*
+ * EADisplacement - Possible displacement types for effective-address
+ *   computations.
+ */
+typedef enum {
+  EA_DISP_NONE,
+  EA_DISP_8,
+  EA_DISP_16,
+  EA_DISP_32
+} EADisplacement;
+
+/*
+ * Reg - All possible values of the reg field in the ModR/M byte.
+ */
+typedef enum {
+#define ENTRY(x) MODRM_REG_##x,
+  ALL_REGS
+#undef ENTRY
+  MODRM_REG_max        /* sentinel; one past the last valid value */
+} Reg;
+  
+/*
+ * SegmentOverride - All possible segment overrides.
+ */
+typedef enum {
+  SEG_OVERRIDE_NONE,
+  SEG_OVERRIDE_CS,
+  SEG_OVERRIDE_SS,
+  SEG_OVERRIDE_DS,
+  SEG_OVERRIDE_ES,
+  SEG_OVERRIDE_FS,
+  SEG_OVERRIDE_GS,
+  SEG_OVERRIDE_max     /* sentinel; one past the last valid value */
+} SegmentOverride;
+
+/* Boolean flag type for the C decoder; fields of this type are assigned
+   TRUE or left zeroed by the initial memset. */
+typedef uint8_t BOOL;
+
+/*
+ * byteReader_t - Type for the byte reader that the consumer must provide to
+ *   the decoder.  Reads a single byte from the instruction's address space.
+ * @param arg     - A baton that the consumer can associate with any internal
+ *                  state that it needs.
+ * @param byte    - A pointer to a single byte in memory that should be set to
+ *                  contain the value at address.
+ * @param address - The address in the instruction's address space that should
+ *                  be read from.
+ * @return        - -1 if the byte cannot be read for any reason; 0 otherwise.
+ */
+typedef int (*byteReader_t)(void* arg, uint8_t* byte, uint64_t address);
+
+/*
+ * dlog_t - Type for the logging function that the consumer can provide to
+ *   get debugging output from the decoder.
+ * @param arg     - A baton that the consumer can associate with any internal
+ *                  state that it needs.
+ * @param log     - A string that contains the message.  Will be reused after
+ *                  the logger returns.
+ */
+typedef void (*dlog_t)(void* arg, const char *log);
+
+/*
+ * The x86 internal instruction, which is produced by the decoder.
+ */
+struct InternalInstruction {
+  /* Reader interface (C) */
+  byteReader_t reader;
+  /* Opaque value passed to the reader */
+  void* readerArg;
+  /* The address of the next byte to read via the reader */
+  uint64_t readerCursor;
+
+  /* Logger interface (C) */
+  dlog_t dlog;
+  /* Opaque value passed to the logger */
+  void* dlogArg;
+
+  /* General instruction information */
+  
+  /* The mode to disassemble for (64-bit, protected, real) */
+  DisassemblerMode mode;
+  /* The start of the instruction, usable with the reader */
+  uint64_t startLocation;
+  /* The length of the instruction, in bytes; computed by decodeInstruction()
+     as readerCursor - startLocation after a successful decode */
+  size_t length;
+  
+  /* Prefix state */
+  
+  /* 1 if the prefix byte corresponding to the entry is present; 0 if not
+     (indexed by the prefix byte's value) */
+  uint8_t prefixPresent[0x100];
+  /* contains the location (for use with the reader) of the prefix byte */
+  uint64_t prefixLocations[0x100];
+  /* The value of the REX prefix, if present */
+  uint8_t rexPrefix;
+  /* The location of the REX prefix */
+  uint64_t rexLocation;
+  /* The location where a mandatory prefix would have to be (i.e., right before
+     the opcode, or right before the REX prefix if one is present) */
+  uint64_t necessaryPrefixLocation;
+  /* The segment override type */
+  SegmentOverride segmentOverride;
+  
+  /* Sizes of various critical pieces of data, in bytes */
+  uint8_t registerSize;
+  uint8_t addressSize;
+  uint8_t displacementSize;
+  uint8_t immediateSize;
+  
+  /* opcode state */
+  
+  /* The value of the two-byte escape prefix (usually 0x0f) */
+  uint8_t twoByteEscape;
+  /* The value of the three-byte escape prefix (usually 0x38 or 0x3a) */
+  uint8_t threeByteEscape;
+  /* The last byte of the opcode, not counting any ModR/M extension */
+  uint8_t opcode;
+  /* The ModR/M byte of the instruction, if it is an opcode extension */
+  uint8_t modRMExtension;
+  
+  /* decode state */
+  
+  /* The type of opcode, used for indexing into the array of decode tables */
+  OpcodeType opcodeType;
+  /* The instruction ID, extracted from the decode table */
+  uint16_t instructionID;
+  /* The specifier for the instruction, from the instruction info table */
+  struct InstructionSpecifier* spec;
+  
+  /* state for additional bytes, consumed during operand decode.  Pattern:
+     consumed___ indicates that the byte was already consumed and does not
+     need to be consumed again */
+  
+  /* The ModR/M byte, which contains most register operands and some portion of
+     all memory operands */
+  BOOL                          consumedModRM;
+  uint8_t                       modRM;
+  
+  /* The SIB byte, used for more complex 32- or 64-bit memory operands */
+  BOOL                          consumedSIB;
+  uint8_t                       sib;
+
+  /* The displacement, used for memory operands */
+  BOOL                          consumedDisplacement;
+  int32_t                       displacement;
+  
+  /* Immediates.  There can be two in some cases */
+  uint8_t                       numImmediatesConsumed;
+  uint8_t                       numImmediatesTranslated;
+  uint64_t                      immediates[2];
+  
+  /* A register or immediate operand encoded into the opcode */
+  BOOL                          consumedOpcodeModifier;
+  uint8_t                       opcodeModifier;
+  Reg                           opcodeRegister;
+  
+  /* Portions of the ModR/M byte */
+  
+  /* These fields determine the allowable values for the ModR/M fields, which
+     depend on operand and address widths */
+  EABase                        eaBaseBase;
+  EABase                        eaRegBase;
+  Reg                           regBase;
+
+  /* The Mod and R/M fields can encode a base for an effective address, or a
+     register.  These are separated into two fields here */
+  EABase                        eaBase;
+  EADisplacement                eaDisplacement;
+  /* The reg field always encodes a register */
+  Reg                           reg;
+  
+  /* SIB state */
+  SIBIndex                      sibIndex;
+  uint8_t                       sibScale;
+  SIBBase                       sibBase;
+};
+
+/* decodeInstruction - Decode one instruction and store the decoding results in
+ *   a buffer provided by the consumer.
+ * @param insn      - The buffer to store the instruction in.  Allocated by the
+ *                    consumer.
+ * @param reader    - The byteReader_t for the bytes to be read.
+ * @param readerArg - An argument to pass to the reader for storing context
+ *                    specific to the consumer.  May be NULL.
+ * @param logger    - The dlog_t to be used in printing status messages from the
+ *                    disassembler.  May be NULL.
+ * @param loggerArg - An argument to pass to the logger for storing context
+ *                    specific to the logger.  May be NULL.
+ * @param startLoc  - The address (in the reader's address space) of the first
+ *                    byte in the instruction.
+ * @param mode      - The mode (16-bit, 32-bit, 64-bit) to decode in.
+ * @return          - Nonzero if there was an error during decode, 0 otherwise.
+ */
+int decodeInstruction(struct InternalInstruction* insn,
+                      byteReader_t reader,
+                      void* readerArg,
+                      dlog_t logger,
+                      void* loggerArg,
+                      uint64_t startLoc,
+                      DisassemblerMode mode);
+
+#ifdef __cplusplus 
+}
+#endif
+  
+#endif
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
new file mode 100644
index 0000000..c213f89
--- /dev/null
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -0,0 +1,355 @@
+/*===- X86DisassemblerDecoderCommon.h - Disassembler decoder -------*- C -*-==*
+ *
+ *                     The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===----------------------------------------------------------------------===*
+ *
+ * This file is part of the X86 Disassembler.
+ * It contains common definitions used by both the disassembler and the table
+ *  generator.
+ * Documentation for the disassembler can be found in X86Disassembler.h.
+ *
+ *===----------------------------------------------------------------------===*/
+
+/*
+ * This header file provides those definitions that need to be shared between
+ * the decoder and the table generator in a C-friendly manner.
+ */
+
+#ifndef X86DISASSEMBLERDECODERCOMMON_H
+#define X86DISASSEMBLERDECODERCOMMON_H
+
+#include "llvm/System/DataTypes.h"
+
+#define INSTRUCTIONS_SYM  x86DisassemblerInstrSpecifiers
+#define CONTEXTS_SYM      x86DisassemblerContexts
+#define ONEBYTE_SYM       x86DisassemblerOneByteOpcodes
+#define TWOBYTE_SYM       x86DisassemblerTwoByteOpcodes
+#define THREEBYTE38_SYM   x86DisassemblerThreeByte38Opcodes
+#define THREEBYTE3A_SYM   x86DisassemblerThreeByte3AOpcodes
+
+#define INSTRUCTIONS_STR  "x86DisassemblerInstrSpecifiers"
+#define CONTEXTS_STR      "x86DisassemblerContexts"
+#define ONEBYTE_STR       "x86DisassemblerOneByteOpcodes"
+#define TWOBYTE_STR       "x86DisassemblerTwoByteOpcodes"
+#define THREEBYTE38_STR   "x86DisassemblerThreeByte38Opcodes"
+#define THREEBYTE3A_STR   "x86DisassemblerThreeByte3AOpcodes"
+
+/*
+ * Attributes of an instruction that must be known before the opcode can be
+ * processed correctly.  Most of these indicate the presence of particular
+ * prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
+ */
+#define ATTRIBUTE_BITS          \
+  ENUM_ENTRY(ATTR_NONE,   0x00) \
+  ENUM_ENTRY(ATTR_64BIT,  0x01) \
+  ENUM_ENTRY(ATTR_XS,     0x02) \
+  ENUM_ENTRY(ATTR_XD,     0x04) \
+  ENUM_ENTRY(ATTR_REXW,   0x08) \
+  ENUM_ENTRY(ATTR_OPSIZE, 0x10)
+
+#define ENUM_ENTRY(n, v) n = v,
+enum attributeBits {
+  ATTRIBUTE_BITS
+  ATTR_max
+};
+#undef ENUM_ENTRY
+
+/*
+ * Combinations of the above attributes that are relevant to instruction
+ * decode.  Although other combinations are possible, they can be reduced to
+ * these without affecting the ultimately decoded instruction.
+ */
+
+/*           Class name           Rank  Rationale for rank assignment         */
+#define INSTRUCTION_CONTEXTS                                                   \
+  ENUM_ENTRY(IC,                    0,  "says nothing about the instruction")  \
+  ENUM_ENTRY(IC_64BIT,              1,  "says the instruction applies in "     \
+                                        "64-bit mode but no more")             \
+  ENUM_ENTRY(IC_OPSIZE,             3,  "requires an OPSIZE prefix, so "       \
+                                        "operands change width")               \
+  ENUM_ENTRY(IC_XD,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_XS,                 2,  "may say something about the opcode "  \
+                                        "but not the operands")                \
+  ENUM_ENTRY(IC_64BIT_REXW,         4,  "requires a REX.W prefix, so operands "\
+                                        "change width; overrides IC_OPSIZE")   \
+  ENUM_ENTRY(IC_64BIT_OPSIZE,       3,  "Just as meaningful as IC_OPSIZE")     \
+  ENUM_ENTRY(IC_64BIT_XD,           5,  "XD instructions are SSE; REX.W is "   \
+                                        "secondary")                           \
+  ENUM_ENTRY(IC_64BIT_XS,           5,  "Just as meaningful as IC_64BIT_XD")   \
+  ENUM_ENTRY(IC_64BIT_REXW_XS,      6,  "OPSIZE could mean a different "       \
+                                        "opcode")                              \
+  ENUM_ENTRY(IC_64BIT_REXW_XD,      6,  "Just as meaningful as "               \
+                                        "IC_64BIT_REXW_XS")                    \
+  ENUM_ENTRY(IC_64BIT_REXW_OPSIZE,  7,  "The Dynamic Duo!  Prefer over all "   \
+                                        "else because this changes most "      \
+                                        "operands' meaning")
+
+#define ENUM_ENTRY(n, r, d) n,    
+typedef enum {
+  INSTRUCTION_CONTEXTS
+  IC_max
+} InstructionContext;
+#undef ENUM_ENTRY
+
+/*
+ * Opcode types, which determine which decode table to use, both in the Intel
+ * manual and also for the decoder.
+ */
+typedef enum {
+  ONEBYTE       = 0,
+  TWOBYTE       = 1,
+  THREEBYTE_38  = 2,
+  THREEBYTE_3A  = 3
+} OpcodeType;
+
+/*
+ * The following structs are used for the hierarchical decode table.  After
+ * determining the instruction's class (i.e., which IC_* constant applies to
+ * it), the decoder reads the opcode.  Some instructions require specific
+ * values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+ *
+ * If a ModR/M byte is not required, "required" is left unset, and the values
+ * for each instructionID are identical.
+ */
+ 
+typedef uint16_t InstrUID;
+
+/*
+ * ModRMDecisionType - describes the type of ModR/M decision, allowing the 
+ * consumer to determine the number of entries in it.
+ *
+ * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+ *                  instruction is the same.
+ * MODRM_SPLITRM  - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+ *                  corresponds to one instruction; otherwise, it corresponds to
+ *                  a different instruction.
+ * MODRM_FULL     - Potentially, each value of the ModR/M byte could correspond
+ *                  to a different instruction.
+ */
+
+#define MODRMTYPES            \
+  ENUM_ENTRY(MODRM_ONEENTRY)  \
+  ENUM_ENTRY(MODRM_SPLITRM)   \
+  ENUM_ENTRY(MODRM_FULL)
+
+#define ENUM_ENTRY(n) n,    
+typedef enum {
+  MODRMTYPES
+  MODRM_max
+} ModRMDecisionType;
+#undef ENUM_ENTRY
+
+/*
+ * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which 
+ *  instruction each possible value of the ModR/M byte corresponds to.  Once
+ *  this information is known, we have narrowed down to a single instruction.
+ */
+struct ModRMDecision {
+  uint8_t     modrm_type;
+  
+  /* The macro below must be defined wherever this file is included. */
+  INSTRUCTION_IDS
+};
+
+/*
+ * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at
+ *   given a particular opcode.
+ */
+struct OpcodeDecision {
+  struct ModRMDecision modRMDecisions[256];
+};
+
+/*
+ * ContextDecision - Specifies which opcode->instruction tables to look at given
+ *   a particular context (set of attributes).  Since there are many possible
+ *   contexts, the decoder first uses CONTEXTS_SYM to determine which context
+ *   applies given a specific set of attributes.  Hence there are only IC_max
+ *   entries in this table, rather than 2^(ATTR_max).
+ */
+struct ContextDecision {
+  struct OpcodeDecision opcodeDecisions[IC_max];
+};
+
+/* 
+ * Physical encodings of instruction operands.
+ */
+
+#define ENCODINGS                                                              \
+  ENUM_ENTRY(ENCODING_NONE,   "")                                              \
+  ENUM_ENTRY(ENCODING_REG,    "Register operand in ModR/M byte.")              \
+  ENUM_ENTRY(ENCODING_RM,     "R/M operand in ModR/M byte.")                   \
+  ENUM_ENTRY(ENCODING_CB,     "1-byte code offset (possible new CS value)")    \
+  ENUM_ENTRY(ENCODING_CW,     "2-byte")                                        \
+  ENUM_ENTRY(ENCODING_CD,     "4-byte")                                        \
+  ENUM_ENTRY(ENCODING_CP,     "6-byte")                                        \
+  ENUM_ENTRY(ENCODING_CO,     "8-byte")                                        \
+  ENUM_ENTRY(ENCODING_CT,     "10-byte")                                       \
+  ENUM_ENTRY(ENCODING_IB,     "1-byte immediate")                              \
+  ENUM_ENTRY(ENCODING_IW,     "2-byte")                                        \
+  ENUM_ENTRY(ENCODING_ID,     "4-byte")                                        \
+  ENUM_ENTRY(ENCODING_IO,     "8-byte")                                        \
+  ENUM_ENTRY(ENCODING_RB,     "(AL..DIL, R8L..R15L) Register code added to "   \
+                              "the opcode byte")                               \
+  ENUM_ENTRY(ENCODING_RW,     "(AX..DI, R8W..R15W)")                           \
+  ENUM_ENTRY(ENCODING_RD,     "(EAX..EDI, R8D..R15D)")                         \
+  ENUM_ENTRY(ENCODING_RO,     "(RAX..RDI, R8..R15)")                           \
+  ENUM_ENTRY(ENCODING_I,      "Position on floating-point stack added to the " \
+                              "opcode byte")                                   \
+                                                                               \
+  ENUM_ENTRY(ENCODING_Iv,     "Immediate of operand size")                     \
+  ENUM_ENTRY(ENCODING_Ia,     "Immediate of address size")                     \
+  ENUM_ENTRY(ENCODING_Rv,     "Register code of operand size added to the "    \
+                              "opcode byte")                                   \
+  ENUM_ENTRY(ENCODING_DUP,    "Duplicate of another operand; ID is encoded "   \
+                              "in type")
+
+#define ENUM_ENTRY(n, d) n,    
+  typedef enum {
+    ENCODINGS
+    ENCODING_max
+  } OperandEncoding;
+#undef ENUM_ENTRY
+
+/* 
+ * Semantic interpretations of instruction operands.
+ */
+
+#define TYPES                                                                  \
+  ENUM_ENTRY(TYPE_NONE,       "")                                              \
+  ENUM_ENTRY(TYPE_REL8,       "1-byte immediate address")                      \
+  ENUM_ENTRY(TYPE_REL16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_REL32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_REL64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_PTR1616,    "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_PTR1632,    "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_PTR1664,    "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_R8,         "1-byte register operand")                       \
+  ENUM_ENTRY(TYPE_R16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_R32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_R64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM8,       "1-byte immediate operand")                      \
+  ENUM_ENTRY(TYPE_IMM16,      "2-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM32,      "4-byte")                                        \
+  ENUM_ENTRY(TYPE_IMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_RM8,        "1-byte register or memory operand")             \
+  ENUM_ENTRY(TYPE_RM16,       "2-byte")                                        \
+  ENUM_ENTRY(TYPE_RM32,       "4-byte")                                        \
+  ENUM_ENTRY(TYPE_RM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_M,          "Memory operand")                                \
+  ENUM_ENTRY(TYPE_M8,         "1-byte")                                        \
+  ENUM_ENTRY(TYPE_M16,        "2-byte")                                        \
+  ENUM_ENTRY(TYPE_M32,        "4-byte")                                        \
+  ENUM_ENTRY(TYPE_M64,        "8-byte")                                        \
+  ENUM_ENTRY(TYPE_LEA,        "Effective address")                             \
+  ENUM_ENTRY(TYPE_M128,       "16-byte (SSE/SSE2)")                            \
+  ENUM_ENTRY(TYPE_M1616,      "2+2-byte segment+offset address")               \
+  ENUM_ENTRY(TYPE_M1632,      "2+4-byte")                                      \
+  ENUM_ENTRY(TYPE_M1664,      "2+8-byte")                                      \
+  ENUM_ENTRY(TYPE_M16_32,     "2+4-byte two-part memory operand (LIDT, LGDT)") \
+  ENUM_ENTRY(TYPE_M16_16,     "2+2-byte (BOUND)")                              \
+  ENUM_ENTRY(TYPE_M32_32,     "4+4-byte (BOUND)")                              \
+  ENUM_ENTRY(TYPE_M16_64,     "2+8-byte (LIDT, LGDT)")                         \
+  ENUM_ENTRY(TYPE_MOFFS8,     "1-byte memory offset (relative to segment "     \
+                              "base)")                                         \
+  ENUM_ENTRY(TYPE_MOFFS16,    "2-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS32,    "4-byte")                                        \
+  ENUM_ENTRY(TYPE_MOFFS64,    "8-byte")                                        \
+  ENUM_ENTRY(TYPE_SREG,       "Byte with single bit set: 0 = ES, 1 = CS, "     \
+                              "2 = SS, 3 = DS, 4 = FS, 5 = GS")                \
+  ENUM_ENTRY(TYPE_M32FP,      "32-bit IEE754 memory floating-point operand")   \
+  ENUM_ENTRY(TYPE_M64FP,      "64-bit")                                        \
+  ENUM_ENTRY(TYPE_M80FP,      "80-bit extended")                               \
+  ENUM_ENTRY(TYPE_M16INT,     "2-byte memory integer operand for use in "      \
+                              "floating-point instructions")                   \
+  ENUM_ENTRY(TYPE_M32INT,     "4-byte")                                        \
+  ENUM_ENTRY(TYPE_M64INT,     "8-byte")                                        \
+  ENUM_ENTRY(TYPE_ST,         "Position on the floating-point stack")          \
+  ENUM_ENTRY(TYPE_MM,         "MMX register operand")                          \
+  ENUM_ENTRY(TYPE_MM32,       "4-byte MMX register or memory operand")         \
+  ENUM_ENTRY(TYPE_MM64,       "8-byte")                                        \
+  ENUM_ENTRY(TYPE_XMM,        "XMM register operand")                          \
+  ENUM_ENTRY(TYPE_XMM32,      "4-byte XMM register or memory operand")         \
+  ENUM_ENTRY(TYPE_XMM64,      "8-byte")                                        \
+  ENUM_ENTRY(TYPE_XMM128,     "16-byte")                                       \
+  ENUM_ENTRY(TYPE_XMM0,       "Implicit use of XMM0")                          \
+  ENUM_ENTRY(TYPE_SEGMENTREG, "Segment register operand")                      \
+  ENUM_ENTRY(TYPE_DEBUGREG,   "Debug register operand")                        \
+  ENUM_ENTRY(TYPE_CR32,       "4-byte control register operand")               \
+  ENUM_ENTRY(TYPE_CR64,       "8-byte")                                        \
+                                                                               \
+  ENUM_ENTRY(TYPE_Mv,         "Memory operand of operand size")                \
+  ENUM_ENTRY(TYPE_Rv,         "Register operand of operand size")              \
+  ENUM_ENTRY(TYPE_IMMv,       "Immediate operand of operand size")             \
+  ENUM_ENTRY(TYPE_RELv,       "Immediate address of operand size")             \
+  ENUM_ENTRY(TYPE_DUP0,       "Duplicate of operand 0")                        \
+  ENUM_ENTRY(TYPE_DUP1,       "operand 1")                                     \
+  ENUM_ENTRY(TYPE_DUP2,       "operand 2")                                     \
+  ENUM_ENTRY(TYPE_DUP3,       "operand 3")                                     \
+  ENUM_ENTRY(TYPE_DUP4,       "operand 4")                                     \
+  ENUM_ENTRY(TYPE_M512,       "512-bit FPU/MMX/XMM/MXCSR state")
+
+#define ENUM_ENTRY(n, d) n,    
+typedef enum {
+  TYPES
+  TYPE_max
+} OperandType;
+#undef ENUM_ENTRY
+
+/* 
+ * OperandSpecifier - The specification for how to extract and interpret one
+ *   operand.
+ */
+struct OperandSpecifier {
+  OperandEncoding  encoding;
+  OperandType      type;
+};
+
+/*
+ * Indicates where the opcode modifier (if any) is to be found.  Extended
+ * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
+ */
+
+#define MODIFIER_TYPES        \
+  ENUM_ENTRY(MODIFIER_NONE)   \
+  ENUM_ENTRY(MODIFIER_OPCODE) \
+  ENUM_ENTRY(MODIFIER_MODRM)
+
+#define ENUM_ENTRY(n) n,
+typedef enum {
+  MODIFIER_TYPES
+  MODIFIER_max
+} ModifierType;
+#undef ENUM_ENTRY
+
+#define X86_MAX_OPERANDS 5
+
+/*
+ * The specification for how to extract and interpret a full instruction and
+ * its operands.
+ */
+struct InstructionSpecifier {
+  ModifierType modifierType;
+  uint8_t modifierBase;
+  struct OperandSpecifier operands[X86_MAX_OPERANDS];
+  
+  /* The macro below must be defined wherever this file is included. */
+  INSTRUCTION_SPECIFIER_FIELDS
+};
+
+/*
+ * Decoding mode for the Intel disassembler.  16-bit, 32-bit, and 64-bit mode
+ * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+ * respectively.
+ */
+typedef enum {
+  MODE_16BIT,
+  MODE_32BIT,
+  MODE_64BIT
+} DisassemblerMode;
+
+#endif
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
new file mode 100644
index 0000000..f4ff894
--- /dev/null
+++ b/lib/Target/X86/Makefile
@@ -0,0 +1,25 @@
+##===- lib/Target/X86/Makefile -----------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMX86CodeGen
+TARGET = X86
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = X86GenRegisterInfo.h.inc X86GenRegisterNames.inc \
+                X86GenRegisterInfo.inc X86GenInstrNames.inc \
+                X86GenInstrInfo.inc X86GenAsmWriter.inc X86GenAsmMatcher.inc \
+                X86GenAsmWriter1.inc X86GenDAGISel.inc  \
+                X86GenDisassemblerTables.inc X86GenFastISel.inc \
+                X86GenCallingConv.inc X86GenSubtarget.inc \
+		X86GenEDInfo.inc
+
+DIRS = AsmPrinter AsmParser Disassembler TargetInfo
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/README-FPStack.txt b/lib/Target/X86/README-FPStack.txt
new file mode 100644
index 0000000..be28e8b
--- /dev/null
+++ b/lib/Target/X86/README-FPStack.txt
@@ -0,0 +1,85 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: FP stack related stuff
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+Some targets (e.g. athlons) prefer freep to fstp ST(0):
+http://gcc.gnu.org/ml/gcc-patches/2004-04/msg00659.html
+
+//===---------------------------------------------------------------------===//
+
+This should use fiadd on chips where it is profitable:
+double foo(double P, int *I) { return P+*I; }
+
+We have fiadd patterns now but the following have the same cost and
+complexity. We need a way to specify that the latter is more profitable.
+
+def FpADD32m  : FpI<(ops RFP:$dst, RFP:$src1, f32mem:$src2), OneArgFPRW,
+                    [(set RFP:$dst, (fadd RFP:$src1,
+                                     (extloadf64f32 addr:$src2)))]>;
+                // ST(0) = ST(0) + [mem32]
+
+def FpIADD32m : FpI<(ops RFP:$dst, RFP:$src1, i32mem:$src2), OneArgFPRW,
+                    [(set RFP:$dst, (fadd RFP:$src1,
+                                     (X86fild addr:$src2, i32)))]>;
+                // ST(0) = ST(0) + [mem32int]
+
+//===---------------------------------------------------------------------===//
+
+The FP stackifier needs to be global.  Also, it should handle simple
+permutations to reduce the number of shuffle instructions, e.g. turning:
+
+fld P	->		fld Q
+fld Q			fld P
+fxch
+
+or:
+
+fxch	->		fucomi
+fucomi			jl X
+jg X
+
+Ideas:
+http://gcc.gnu.org/ml/gcc-patches/2004-11/msg02410.html
+
+
+//===---------------------------------------------------------------------===//
+
+Add a target specific hook to DAG combiner to handle SINT_TO_FP and
+FP_TO_SINT when the source operand is already in memory.
+
+//===---------------------------------------------------------------------===//
+
+Open code rint,floor,ceil,trunc:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02006.html
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg02011.html
+
+Opencode the sincos[f] libcall.
+
+//===---------------------------------------------------------------------===//
+
+None of the FPStack instructions are handled in
+X86RegisterInfo::foldMemoryOperand, which prevents the spiller from
+folding spill code into the instructions.
+
+//===---------------------------------------------------------------------===//
+
+Currently the x86 codegen isn't very good at mixing SSE and FPStack
+code:
+
+unsigned int foo(double x) { return x; }
+
+foo:
+	subl $20, %esp
+	movsd 24(%esp), %xmm0
+	movsd %xmm0, 8(%esp)
+	fldl 8(%esp)
+	fisttpll (%esp)
+	movl (%esp), %eax
+	addl $20, %esp
+	ret
+
+This just requires being smarter when custom expanding fptoui.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-MMX.txt b/lib/Target/X86/README-MMX.txt
new file mode 100644
index 0000000..a6c8616
--- /dev/null
+++ b/lib/Target/X86/README-MMX.txt
@@ -0,0 +1,71 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: MMX-specific stuff.
+//===---------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------===//
+
+This:
+
+#include <mmintrin.h>
+
+__v2si qux(int A) {
+  return (__v2si){ 0, A };
+}
+
+is compiled into:
+
+_qux:
+        subl $28, %esp
+        movl 32(%esp), %eax
+        movd %eax, %mm0
+        movq %mm0, (%esp)
+        movl (%esp), %eax
+        movl %eax, 20(%esp)
+        movq %mm0, 8(%esp)
+        movl 12(%esp), %eax
+        movl %eax, 16(%esp)
+        movq 16(%esp), %mm0
+        addl $28, %esp
+        ret
+
+Yuck!
+
+GCC gives us:
+
+_qux:
+        subl    $12, %esp
+        movl    16(%esp), %eax
+        movl    20(%esp), %edx
+        movl    $0, (%eax)
+        movl    %edx, 4(%eax)
+        addl    $12, %esp
+        ret     $4
+
+//===---------------------------------------------------------------------===//
+
+We generate crappy code for this:
+
+__m64 t() {
+  return _mm_cvtsi32_si64(1);
+}
+
+_t:
+	subl	$12, %esp
+	movl	$1, %eax
+	movd	%eax, %mm0
+	movq	%mm0, (%esp)
+	movl	(%esp), %eax
+	movl	4(%esp), %edx
+	addl	$12, %esp
+	ret
+
+The extra stack traffic is covered in the previous entry. But the other reason
+is we are not smart about materializing constants in MMX registers. With -m64
+
+	movl	$1, %eax
+	movd	%eax, %mm0
+	movd	%mm0, %rax
+	ret
+
+We should be using a constantpool load instead:
+	movq	LC0(%rip), %rax
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000..19eb05e
--- /dev/null
+++ b/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,989 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend: SSE-specific stuff.
+//===---------------------------------------------------------------------===//
+
+- Consider eliminating the unaligned SSE load intrinsics, replacing them with
+  unaligned LLVM load instructions.
+
+//===---------------------------------------------------------------------===//
+
+Expand libm rounding functions inline:  Significant speedups possible.
+http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html
+
+//===---------------------------------------------------------------------===//
+
+When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and
+other fast SSE modes.
+
+//===---------------------------------------------------------------------===//
+
+Think about doing i64 math in SSE regs on x86-32.
+
+//===---------------------------------------------------------------------===//
+
+This testcase should have no SSE instructions in it, and only one load from
+a constant pool:
+
+double %test3(bool %B) {
+        %C = select bool %B, double 123.412, double 523.01123123
+        ret double %C
+}
+
+Currently, the select is being lowered, which prevents the dag combiner from
+turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'
+
+The pattern isel got this one right.
+
+//===---------------------------------------------------------------------===//
+
+SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
+like this:
+
+  X += y
+
+and the register allocator decides to spill X, it is cheaper to emit this as:
+
+Y += [xslot]
+store Y -> [xslot]
+
+than as:
+
+tmp = [xslot]
+tmp += y
+store tmp -> [xslot]
+
+..and this uses one fewer register (so this should be done at load folding
+time, not at spiller time).  *Note* however that this can only be done
+if Y is dead.  Here's a testcase:
+
+@.str_3 = external global [15 x i8]
+declare void @printf(i32, ...)
+define void @main() {
+build_tree.exit:
+	br label %no_exit.i7
+
+no_exit.i7:		; preds = %no_exit.i7, %build_tree.exit
+	%tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ],
+                                   [ %tmp.34.i18, %no_exit.i7 ]
+	%tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ],
+                                    [ %tmp.28.i16, %no_exit.i7 ]
+	%tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
+	%tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
+	br i1 false, label %Compute_Tree.exit23, label %no_exit.i7
+
+Compute_Tree.exit23:		; preds = %no_exit.i7
+	tail call void (i32, ...)* @printf( i32 0 )
+	store double %tmp.34.i18, double* null
+	ret void
+}
+
+We currently emit:
+
+.BBmain_1:
+        xorpd %XMM1, %XMM1
+        addsd %XMM0, %XMM1
+***     movsd %XMM2, QWORD PTR [%ESP + 8]
+***     addsd %XMM2, %XMM1
+***     movsd QWORD PTR [%ESP + 8], %XMM2
+        jmp .BBmain_1   # no_exit.i7
+
+This is a bugpoint-reduced testcase, which is why the testcase doesn't make
+much sense (e.g. it's an infinite loop). :)
+
+//===---------------------------------------------------------------------===//
+
+SSE should implement 'select_cc' using 'emulated conditional moves' that use
+pcmp/pand/pandn/por to do a selection instead of a conditional branch:
+
+double %X(double %Y, double %Z, double %A, double %B) {
+        %C = setlt double %A, %B
+        %z = add double %Z, 0.0    ;; select operand is not a load
+        %D = select bool %C, double %Y, double %z
+        ret double %D
+}
+
+We currently emit:
+
+_X:
+        subl $12, %esp
+        xorpd %xmm0, %xmm0
+        addsd 24(%esp), %xmm0
+        movsd 32(%esp), %xmm1
+        movsd 16(%esp), %xmm2
+        ucomisd 40(%esp), %xmm1
+        jb LBB_X_2
+LBB_X_1:
+        movsd %xmm0, %xmm2
+LBB_X_2:
+        movsd %xmm2, (%esp)
+        fldl (%esp)
+        addl $12, %esp
+        ret
+
+//===---------------------------------------------------------------------===//
+
+It's not clear whether we should use pxor or xorps / xorpd to clear XMM
+registers. The choice may depend on subtarget information. We should do some
+more experiments on different x86 machines.
+
+//===---------------------------------------------------------------------===//
+
+Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
+feasible.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+  if (copysign(1.0, x) == copysign(1.0, y))
+into:
+  if (x^y & mask)
+when using SSE.
+
+//===---------------------------------------------------------------------===//
+
+Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
+of a v4sf value.
+
+//===---------------------------------------------------------------------===//
+
+Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}.
+Perhaps use pxor / xorp* to clear a XMM register first?
+
+//===---------------------------------------------------------------------===//
+
+How to decide when to use the "floating point version" of logical ops? Here are
+some code fragments:
+
+	movaps LCPI5_5, %xmm2
+	divps %xmm1, %xmm2
+	mulps %xmm2, %xmm3
+	mulps 8656(%ecx), %xmm3
+	addps 8672(%ecx), %xmm3
+	andps LCPI5_6, %xmm2
+	andps LCPI5_1, %xmm3
+	por %xmm2, %xmm3
+	movdqa %xmm3, (%edi)
+
+	movaps LCPI5_5, %xmm1
+	divps %xmm0, %xmm1
+	mulps %xmm1, %xmm3
+	mulps 8656(%ecx), %xmm3
+	addps 8672(%ecx), %xmm3
+	andps LCPI5_6, %xmm1
+	andps LCPI5_1, %xmm3
+	orps %xmm1, %xmm3
+	movaps %xmm3, 112(%esp)
+	movaps %xmm3, (%ebx)
+
+Due to some minor source change, the latter case ended up using orps and movaps
+instead of por and movdqa. Does it matter?
+
+//===---------------------------------------------------------------------===//
+
+X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128. Is it possible
+to choose between movaps, movapd, and movdqa based on types of source and
+destination?
+
+How about andps, andpd, and pand? Do we really care about the type of the packed
+elements? If not, why not always use the "ps" variants which are likely to be
+shorter.
+
+//===---------------------------------------------------------------------===//
+
+External test Nurbs exposed some problems. Look for
+__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
+emits:
+
+        movaps    (%edx), %xmm2                                 #59.21
+        movaps    (%edx), %xmm5                                 #60.21
+        movaps    (%edx), %xmm4                                 #61.21
+        movaps    (%edx), %xmm3                                 #62.21
+        movl      40(%ecx), %ebp                                #69.49
+        shufps    $0, %xmm2, %xmm5                              #60.21
+        movl      100(%esp), %ebx                               #69.20
+        movl      (%ebx), %edi                                  #69.20
+        imull     %ebp, %edi                                    #69.49
+        addl      (%eax), %edi                                  #70.33
+        shufps    $85, %xmm2, %xmm4                             #61.21
+        shufps    $170, %xmm2, %xmm3                            #62.21
+        shufps    $255, %xmm2, %xmm2                            #63.21
+        lea       (%ebp,%ebp,2), %ebx                           #69.49
+        negl      %ebx                                          #69.49
+        lea       -3(%edi,%ebx), %ebx                           #70.33
+        shll      $4, %ebx                                      #68.37
+        addl      32(%ecx), %ebx                                #68.37
+        testb     $15, %bl                                      #91.13
+        jne       L_B1.24       # Prob 5%                       #91.13
+
+This is the llvm code after instruction scheduling:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%reg1078 = MOV32ri -3
+	%reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
+	%reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+	%reg1080 = IMUL32rr %reg1079, %reg1037
+	%reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+	%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
+	%reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+	%reg1082 = SHL32ri %reg1038, 4
+	%reg1039 = ADD32rr %reg1036, %reg1082
+	%reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+	%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
+	%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
+	%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
+	%reg1033 = SHUFPSrr %reg1083, %reg1083, 85
+	%reg1040 = MOV32rr %reg1039
+	%reg1084 = AND32ri8 %reg1039, 15
+	CMP32ri8 %reg1084, 0
+	JE mbb<cond_next204,0xa914d30>
+
+Still ok. After register allocation:
+
+cond_next140 (0xa910740, LLVM BB @0xa90beb0):
+	%EAX = MOV32ri -3
+	%EDX = MOV32rm <fi#3>, 1, %NOREG, 0
+	ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
+	%EDX = MOV32rm <fi#7>, 1, %NOREG, 0
+	%EDX = MOV32rm %EDX, 1, %NOREG, 40
+	IMUL32rr %EAX<def&use>, %EDX
+	%ESI = MOV32rm <fi#5>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 0
+	MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
+	%EAX = LEA32r %ESI, 1, %EAX, -3
+	%ESI = MOV32rm <fi#7>, 1, %NOREG, 0
+	%ESI = MOV32rm %ESI, 1, %NOREG, 32
+	%EDI = MOV32rr %EAX
+	SHL32ri %EDI<def&use>, 4
+	ADD32rr %EDI<def&use>, %ESI
+	%XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
+	%XMM1 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM1<def&use>, %XMM1, 170
+	%XMM2 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM2<def&use>, %XMM2, 0
+	%XMM3 = MOVAPSrr %XMM0
+	SHUFPSrr %XMM3<def&use>, %XMM3, 255
+	SHUFPSrr %XMM0<def&use>, %XMM0, 85
+	%EBX = MOV32rr %EDI
+	AND32ri8 %EBX<def&use>, 15
+	CMP32ri8 %EBX, 0
+	JE mbb<cond_next204,0xa914d30>
+
+This looks really bad. The problem is shufps is a destructive opcode. Since it
+appears as operand two in more than one shufps ops. It resulted in a number of
+copies. Note icc also suffers from the same problem. Either the instruction
+selector should select pshufd or the register allocator can make the two-address
+to three-address transformation.
+
+It also exposes some other problems. See MOV32ri -3 and the spills.
+
+//===---------------------------------------------------------------------===//
+
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500
+
+LLVM is producing bad code.
+
+LBB_main_4:	# cond_true44
+	addps %xmm1, %xmm2
+	subps %xmm3, %xmm2
+	movaps (%ecx), %xmm4
+	movaps %xmm2, %xmm1
+	addps %xmm4, %xmm1
+	addl $16, %ecx
+	incl %edx
+	cmpl $262144, %edx
+	movaps %xmm3, %xmm2
+	movaps %xmm4, %xmm3
+	jne LBB_main_4	# cond_true44
+
+There are two problems. 1) No need for two loop induction variables. We can
+compare against 262144 * 16. 2) Known register coalescer issue. We should
+be able to eliminate one of the movaps:
+
+	addps %xmm2, %xmm1    <=== Commute!
+	subps %xmm3, %xmm1
+	movaps (%ecx), %xmm4
+	movaps %xmm1, %xmm1   <=== Eliminate!
+	addps %xmm4, %xmm1
+	addl $16, %ecx
+	incl %edx
+	cmpl $262144, %edx
+	movaps %xmm3, %xmm2
+	movaps %xmm4, %xmm3
+	jne LBB_main_4	# cond_true44
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+
+__m128 test(float a) {
+  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
+}
+
+This compiles into:
+
+movss 4(%esp), %xmm1
+mulss %xmm1, %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Because mulss doesn't modify the top 3 elements, the top elements of 
+xmm1 are already zero'd.  We could compile this to:
+
+movss 4(%esp), %xmm0
+mulss %xmm0, %xmm0
+ret
+
+//===---------------------------------------------------------------------===//
+
+Here's a sick and twisted idea.  Consider code like this:
+
+__m128 test(__m128 a) {
+  float b = *(float*)&a;
+  ...
+  return _mm_set_ps(0.0, 0.0, 0.0, b);
+}
+
+This might compile to this code:
+
+movaps c(%esp), %xmm1
+xorps %xmm0, %xmm0
+movss %xmm1, %xmm0
+ret
+
+Now consider if the ... code caused xmm1 to get spilled.  This might produce
+this code:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+xorps %xmm0, %xmm0
+movaps c2(%esp), %xmm1
+movss %xmm1, %xmm0
+ret
+
+However, since the reload is only used by these instructions, we could 
+"fold" it into the uses, producing something like this:
+
+movaps c(%esp), %xmm1
+movaps %xmm1, c2(%esp)
+...
+
+movss c2(%esp), %xmm0
+ret
+
+... saving two instructions.
+
+The basic idea is that a reload from a spill slot can, if only one 4-byte
+chunk is used, bring in 3 zeros and the one element instead of 4 elements.
+This can be used to simplify a variety of shuffle operations, where the
+elements are fixed zeros.
+
+//===---------------------------------------------------------------------===//
+
+__m128d test1( __m128d A, __m128d B) {
+  return _mm_shuffle_pd(A, B, 0x3);
+}
+
+compiles to
+
+shufpd $3, %xmm1, %xmm0
+
+Perhaps it's better to use unpckhpd instead?
+
+unpckhpd %xmm1, %xmm0
+
+Don't know if unpckhpd is faster. But it is shorter.
+
+//===---------------------------------------------------------------------===//
+
+This code generates ugly code, probably due to costs being off or something:
+
+define void @test(float* %P, <4 x float>* %P2 ) {
+        %xFloat0.688 = load float* %P
+        %tmp = load <4 x float>* %P2
+        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
+        store <4 x float> %inFloat3.713, <4 x float>* %P2
+        ret void
+}
+
+Generates:
+
+_test:
+	movl	8(%esp), %eax
+	movaps	(%eax), %xmm0
+	pxor	%xmm1, %xmm1
+	movaps	%xmm0, %xmm2
+	shufps	$50, %xmm1, %xmm2
+	shufps	$132, %xmm2, %xmm0
+	movaps	%xmm0, (%eax)
+	ret
+
+Would it be better to generate:
+
+_test:
+        movl 8(%esp), %ecx
+        movaps (%ecx), %xmm0
+	xor %eax, %eax
+        pinsrw $6, %eax, %xmm0
+        pinsrw $7, %eax, %xmm0
+        movaps %xmm0, (%ecx)
+        ret
+
+?
+
+//===---------------------------------------------------------------------===//
+
+Some useful information in the Apple Altivec / SSE Migration Guide:
+
+http://developer.apple.com/documentation/Performance/Conceptual/
+Accelerate_sse_migration/index.html
+
+e.g. SSE select using and, andnot, or. Various SSE compare translations.
+
+//===---------------------------------------------------------------------===//
+
+Add hooks to commute some CMPP operations.
+
+//===---------------------------------------------------------------------===//
+
+Apply the same transformation that merged four float into a single 128-bit load
+to loads from constant pool.
+
+//===---------------------------------------------------------------------===//
+
+Floating point max / min are commutable when -enable-unsafe-fp-path is
+specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
+nodes which are selected to max / min instructions that are marked commutable.
+
+//===---------------------------------------------------------------------===//
+
+We should materialize vector constants like "all ones" and "signbit" with 
+code like:
+
+     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
+
+and:
+     cmpeqps xmm1, xmm1   ; xmm1 = all-ones
+     psrlq   xmm1, 31     ; xmm1 = all 100000000000...
+
+instead of using a load from the constant pool.  The latter is important for
+ABS/NEG/copysign etc.
+
+//===---------------------------------------------------------------------===//
+
+These functions:
+
+#include <xmmintrin.h>
+__m128i a;
+void x(unsigned short n) {
+  a = _mm_slli_epi32 (a, n);
+}
+void y(unsigned n) {
+  a = _mm_slli_epi32 (a, n);
+}
+
+compile to ( -O3 -static -fomit-frame-pointer):
+_x:
+        movzwl  4(%esp), %eax
+        movd    %eax, %xmm0
+        movaps  _a, %xmm1
+        pslld   %xmm0, %xmm1
+        movaps  %xmm1, _a
+        ret
+_y:
+        movd    4(%esp), %xmm0
+        movaps  _a, %xmm1
+        pslld   %xmm0, %xmm1
+        movaps  %xmm1, _a
+        ret
+
+"y" looks good, but "x" does silly movzwl stuff around into a GPR.  It seems
+like movd would be sufficient in both cases as the value is already zero 
+extended in the 32-bit stack slot IIRC.  For signed short, it should also be
+safe, as a really-signed value would be undefined for pslld.
+
+
+//===---------------------------------------------------------------------===//
+
+#include <math.h>
+int t1(double d) { return signbit(d); }
+
+This currently compiles to:
+	subl	$12, %esp
+	movsd	16(%esp), %xmm0
+	movsd	%xmm0, (%esp)
+	movl	4(%esp), %eax
+	shrl	$31, %eax
+	addl	$12, %esp
+	ret
+
+We should use movmskp{s|d} instead.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
+(aligned) vector load.  This functionality has a couple of problems.
+
+1. The code to infer alignment from loads of globals is in the X86 backend,
+   not the dag combiner.  This is because dagcombine2 needs to be able to see
+   through the X86ISD::Wrapper node, which DAGCombine can't really do.
+2. The code for turning 4 x load into a single vector load is target 
+   independent and should be moved to the dag combiner.
+3. The code for turning 4 x load into a vector load can only handle a direct 
+   load from a global or a direct load from the stack.  It should be generalized
+   to handle any load from P, P+4, P+8, P+12, where P can be anything.
+4. The alignment inference code cannot handle loads from globals in non-static
+   mode because it doesn't look through the extra dyld stub load.  If you try
+   vec_align.ll without -relocation-model=static, you'll see what I mean.
+
+//===---------------------------------------------------------------------===//
+
+We should lower store(fneg(load p), q) into an integer load+xor+store, which
+eliminates a constant pool load.  For example, consider:
+
+define i64 @ccosf(float %z.0, float %z.1) nounwind readonly  {
+entry:
+ %tmp6 = sub float -0.000000e+00, %z.1		; <float> [#uses=1]
+ %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
+ ret i64 %tmp20
+}
+
+This currently compiles to:
+
+LCPI1_0:					#  <4 x float>
+	.long	2147483648	# float -0
+	.long	2147483648	# float -0
+	.long	2147483648	# float -0
+	.long	2147483648	# float -0
+_ccosf:
+	subl	$12, %esp
+	movss	16(%esp), %xmm0
+	movss	%xmm0, 4(%esp)
+	movss	20(%esp), %xmm0
+	xorps	LCPI1_0, %xmm0
+	movss	%xmm0, (%esp)
+	call	L_ccoshf$stub
+	addl	$12, %esp
+	ret
+
+Note the load into xmm0, then xor (to negate), then store.  In PIC mode,
+this code computes the pic base and does two loads to do the constant pool 
+load, so the improvement is much bigger.
+
+The tricky part about this xform is that the argument load/store isn't exposed
+until post-legalize, and at that point, the fneg has been custom expanded into 
+an X86 fxor.  This means that we need to handle this case in the x86 backend
+instead of in target independent code.
+
+//===---------------------------------------------------------------------===//
+
+Non-SSE4 insert into 16 x i8 is atrociously bad.
+
+//===---------------------------------------------------------------------===//
+
+<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
+is memory.
+
+//===---------------------------------------------------------------------===//
+
+SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext
+sitting between the truncate and the extract.
+
+//===---------------------------------------------------------------------===//
+
+INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
+any number of 0.0 simultaneously.  Currently we only use it for simple
+insertions.
+
+See comments in LowerINSERT_VECTOR_ELT_SSE4.
+
+//===---------------------------------------------------------------------===//
+
+On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
+Custom.  All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
+legal, it'll just take a few extra patterns written in the .td file.
+
+Note: this is not a code quality issue; the custom lowered code happens to be
+right, but we shouldn't have to custom lower anything.  This is probably related
+to <2 x i64> ops being so bad.
+
+//===---------------------------------------------------------------------===//
+
+'select' on vectors and scalars could be a whole lot better.  We currently 
+lower them to conditional branches.  On x86-64 for example, we compile this:
+
+double test(double a, double b, double c, double d) { return a<b ? c : d; }
+
+to:
+
+_test:
+	ucomisd	%xmm0, %xmm1
+	ja	LBB1_2	# entry
+LBB1_1:	# entry
+	movapd	%xmm3, %xmm2
+LBB1_2:	# entry
+	movapd	%xmm2, %xmm0
+	ret
+
+instead of:
+
+_test:
+	cmpltsd	%xmm1, %xmm0
+	andpd	%xmm0, %xmm2
+	andnpd	%xmm3, %xmm0
+	orpd	%xmm2, %xmm0
+	ret
+
+For unpredictable branches, the latter is much more efficient.  This should
+just be a matter of having scalar sse map to SELECT_CC and custom expanding
+or iseling it.
+
+//===---------------------------------------------------------------------===//
+
+LLVM currently generates stack realignment code when it is not actually
+needed. The problem is that we need to know about stack alignment too early,
+before RA runs.
+
+At that point we don't know, whether there will be vector spill, or not.
+Stack realignment logic is overly conservative here, but otherwise we can
+produce unaligned loads/stores.
+
+Fixing this will require some huge RA changes.
+
+Testcase:
+#include <emmintrin.h>
+
+typedef short vSInt16 __attribute__ ((__vector_size__ (16)));
+
+static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873,
+- 22725, - 12873};;
+
+vSInt16 madd(vSInt16 b)
+{
+    return _mm_madd_epi16(a, b);
+}
+
+Generated code (x86-32, linux):
+madd:
+        pushl   %ebp
+        movl    %esp, %ebp
+        andl    $-16, %esp
+        movaps  .LCPI1_0, %xmm1
+        pmaddwd %xmm1, %xmm0
+        movl    %ebp, %esp
+        popl    %ebp
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+#include <emmintrin.h> 
+__m128 foo2 (float x) {
+ return _mm_set_ps (0, 0, x, 0);
+}
+
+In x86-32 mode, we generate this spiffy code:
+
+_foo2:
+	movss	4(%esp), %xmm0
+	pshufd	$81, %xmm0, %xmm0
+	ret
+
+in x86-64 mode, we generate this code, which could be better:
+
+_foo2:
+	xorps	%xmm1, %xmm1
+	movss	%xmm0, %xmm1
+	pshufd	$81, %xmm1, %xmm0
+	ret
+
+In sse4 mode, we could use insertps to make both better.
+
+Here's another testcase that could use insertps [mem]:
+
+#include <xmmintrin.h>
+extern float x2, x3;
+__m128 foo1 (float x1, float x4) {
+ return _mm_set_ps (x2, x1, x3, x4);
+}
+
+gcc mainline compiles it to:
+
+foo1:
+       insertps        $0x10, x2(%rip), %xmm0
+       insertps        $0x10, x3(%rip), %xmm1
+       movaps  %xmm1, %xmm2
+       movlhps %xmm0, %xmm2
+       movaps  %xmm2, %xmm0
+       ret
+
+//===---------------------------------------------------------------------===//
+
+We compile vector multiply-by-constant into poor code:
+
+define <4 x i32> @f(<4 x i32> %i) nounwind  {
+	%A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
+	ret <4 x i32> %A
+}
+
+On targets without SSE4.1, this compiles into:
+
+LCPI1_0:					##  <4 x i32>
+	.long	10
+	.long	10
+	.long	10
+	.long	10
+	.text
+	.align	4,0x90
+	.globl	_f
+_f:
+	pshufd	$3, %xmm0, %xmm1
+	movd	%xmm1, %eax
+	imull	LCPI1_0+12, %eax
+	movd	%eax, %xmm1
+	pshufd	$1, %xmm0, %xmm2
+	movd	%xmm2, %eax
+	imull	LCPI1_0+4, %eax
+	movd	%eax, %xmm2
+	punpckldq	%xmm1, %xmm2
+	movd	%xmm0, %eax
+	imull	LCPI1_0, %eax
+	movd	%eax, %xmm1
+	movhlps	%xmm0, %xmm0
+	movd	%xmm0, %eax
+	imull	LCPI1_0+8, %eax
+	movd	%eax, %xmm0
+	punpckldq	%xmm0, %xmm1
+	movaps	%xmm1, %xmm0
+	punpckldq	%xmm2, %xmm0
+	ret
+
+It would be better to synthesize integer vector multiplication by constants
+using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
+simple cases such as multiplication by powers of two would be better as
+vector shifts than as multiplications.
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+__m128i
+foo2 (char x)
+{
+  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
+}
+
+into:
+	movl	$1, %eax
+	xorps	%xmm0, %xmm0
+	pinsrw	$2, %eax, %xmm0
+	movzbl	4(%esp), %eax
+	pinsrw	$3, %eax, %xmm0
+	movl	$256, %eax
+	pinsrw	$7, %eax, %xmm0
+	ret
+
+
+gcc-4.2:
+	subl	$12, %esp
+	movzbl	16(%esp), %eax
+	movdqa	LC0, %xmm0
+	pinsrw	$3, %eax, %xmm0
+	addl	$12, %esp
+	ret
+	.const
+	.align 4
+LC0:
+	.word	0
+	.word	0
+	.word	1
+	.word	0
+	.word	0
+	.word	0
+	.word	0
+	.word	256
+
+With SSE4, it should be
+      movdqa  .LC0(%rip), %xmm0
+      pinsrb  $6, %edi, %xmm0
+
+//===---------------------------------------------------------------------===//
+
+We should transform a shuffle of two vectors of constants into a single vector
+of constants. Also, insertelement of a constant into a vector of constants
+should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.
+
+We compiled it to something horrible:
+
+	.align	4
+LCPI1_1:					##  float
+	.long	1065353216	## float 1
+	.const
+
+	.align	4
+LCPI1_0:					##  <4 x float>
+	.space	4
+	.long	1065353216	## float 1
+	.space	4
+	.long	1065353216	## float 1
+	.text
+	.align	4,0x90
+	.globl	_t
+_t:
+	xorps	%xmm0, %xmm0
+	movhps	LCPI1_0, %xmm0
+	movss	LCPI1_1, %xmm1
+	movaps	%xmm0, %xmm2
+	shufps	$2, %xmm1, %xmm2
+	shufps	$132, %xmm2, %xmm0
+	movaps	%xmm0, 0
+
+//===---------------------------------------------------------------------===//
+rdar://5907648
+
+This function:
+
+float foo(unsigned char x) {
+  return x;
+}
+
+compiles to (x86-32):
+
+define float @foo(i8 zeroext  %x) nounwind  {
+	%tmp12 = uitofp i8 %x to float		; <float> [#uses=1]
+	ret float %tmp12
+}
+
+compiles to:
+
+_foo:
+	subl	$4, %esp
+	movzbl	8(%esp), %eax
+	cvtsi2ss	%eax, %xmm0
+	movss	%xmm0, (%esp)
+	flds	(%esp)
+	addl	$4, %esp
+	ret
+
+We should be able to use:
+  cvtsi2ss 8(%esp), %xmm0
+since we know the stack slot is already zext'd.
+
+//===---------------------------------------------------------------------===//
+
+Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
+when code size is critical. movlps is slower than movsd on core2 but it's one
+byte shorter.
+
+//===---------------------------------------------------------------------===//
+
+We should use a dynamic programming based approach to tell when using FPStack
+operations is cheaper than SSE.  SciMark montecarlo contains code like this
+for example:
+
+double MonteCarlo_num_flops(int Num_samples) {
+    return ((double) Num_samples)* 4.0;
+}
+
+In fpstack mode, this compiles into:
+
+LCPI1_0:					
+	.long	1082130432	## float 4.000000e+00
+_MonteCarlo_num_flops:
+	subl	$4, %esp
+	movl	8(%esp), %eax
+	movl	%eax, (%esp)
+	fildl	(%esp)
+	fmuls	LCPI1_0
+	addl	$4, %esp
+	ret
+        
+in SSE mode, it compiles into significantly slower code:
+
+_MonteCarlo_num_flops:
+	subl	$12, %esp
+	cvtsi2sd	16(%esp), %xmm0
+	mulsd	LCPI1_0, %xmm0
+	movsd	%xmm0, (%esp)
+	fldl	(%esp)
+	addl	$12, %esp
+	ret
+
+There are also other cases in scimark where using fpstack is better, it is
+cheaper to do fld1 than load from a constant pool for example, so
+"load, add 1.0, store" is better done in the fp stack, etc.
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to
+"cmpsd".  For example, this code:
+
+double d1(double x) { return x == x ? x : x + x; }
+
+Compiles into:
+
+_d1:
+	ucomisd	%xmm0, %xmm0
+	jnp	LBB1_2
+	addsd	%xmm0, %xmm0
+	ret
+LBB1_2:
+	ret
+
+Also, the 'ret's should be shared.  This is PR6032.
+
+//===---------------------------------------------------------------------===//
+
+These should compile into the same code (PR6214): Perhaps instcombine should
+canonicalize the former into the later?
+
+define float @foo(float %x) nounwind {
+  %t = bitcast float %x to i32
+  %s = and i32 %t, 2147483647
+  %d = bitcast i32 %s to float
+  ret float %d
+}
+
+declare float @fabsf(float %n)
+define float @bar(float %x) nounwind {
+  %d = call float @fabsf(float %x)
+  ret float %d
+}
+
+//===---------------------------------------------------------------------===//
+
+This IR (from PR6194):
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+
+%0 = type { double, double }
+%struct.float3 = type { float, float, float }
+
+define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
+entry:
+  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
+  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
+  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
+  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
+  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
+  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
+  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
+  store float %tmp12, float* %tmp5
+  ret void
+}
+
+Compiles to:
+
+_test:                                  ## @test
+	movd	%xmm0, %rax
+	shrq	$32, %rax
+	movl	%eax, 4(%rdi)
+	ret
+
+This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
+doing a shuffle from v[1] to v[0] then a float store.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README-UNIMPLEMENTED.txt b/lib/Target/X86/README-UNIMPLEMENTED.txt
new file mode 100644
index 0000000..c26c75a
--- /dev/null
+++ b/lib/Target/X86/README-UNIMPLEMENTED.txt
@@ -0,0 +1,14 @@
+//===---------------------------------------------------------------------===//
+// Testcases that crash the X86 backend because they aren't implemented
+//===---------------------------------------------------------------------===//
+
+These are cases we know the X86 backend doesn't handle.  Patches are welcome
+and appreciated, because no one has signed up to implement these yet.
+Implementing these would allow elimination of the corresponding intrinsics,
+which would be great.
+
+1) vector shifts
+2) vector comparisons
+3) vector fp<->int conversions: PR2683, PR2684, PR2685, PR2686, PR2688
+4) bitcasts from vectors to scalars: PR2804
+5) llvm.atomic.cmp.swap.i128.p0i128: PR3462
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
new file mode 100644
index 0000000..e8f7c5d
--- /dev/null
+++ b/lib/Target/X86/README-X86-64.txt
@@ -0,0 +1,300 @@
+//===- README_X86_64.txt - Notes for X86-64 code gen ----------------------===//
+
+Implement different PIC models? Right now we only support Mac OS X with small
+PIC code model.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+extern void xx(void);
+void bar(void) {
+  xx();
+}
+
+gcc compiles to:
+
+.globl _bar
+_bar:
+	jmp	_xx
+
+We need to do the tailcall optimization as well.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 Optimization Manual 8.2 has some nice information about optimizing integer
+multiplication by a constant. How much of it applies to Intel's X86-64
+implementation? There are definite trade-offs to consider: latency vs. register
+pressure vs. code size.
+
+//===---------------------------------------------------------------------===//
+
+Are we better off using branches instead of cmove to implement FP to
+unsigned i64?
+
+_conv:
+	ucomiss	LC0(%rip), %xmm0
+	cvttss2siq	%xmm0, %rdx
+	jb	L3
+	subss	LC0(%rip), %xmm0
+	movabsq	$-9223372036854775808, %rax
+	cvttss2siq	%xmm0, %rdx
+	xorq	%rax, %rdx
+L3:
+	movq	%rdx, %rax
+	ret
+
+instead of
+
+_conv:
+	movss LCPI1_0(%rip), %xmm1
+	cvttss2siq %xmm0, %rcx
+	movaps %xmm0, %xmm2
+	subss %xmm1, %xmm2
+	cvttss2siq %xmm2, %rax
+	movabsq $-9223372036854775808, %rdx
+	xorq %rdx, %rax
+	ucomiss %xmm1, %xmm0
+	cmovb %rcx, %rax
+	ret
+
+Seems like the jb branch has high likelihood of being taken. It would have
+saved a few instructions.
+
+//===---------------------------------------------------------------------===//
+
+Poor codegen:
+
+int X[2];
+int b;
+void test(void) {
+  memset(X, b, 2*sizeof(X[0]));
+}
+
+llc:
+	movq _b@GOTPCREL(%rip), %rax
+	movzbq (%rax), %rax
+	movq %rax, %rcx
+	shlq $8, %rcx
+	orq %rax, %rcx
+	movq %rcx, %rax
+	shlq $16, %rax
+	orq %rcx, %rax
+	movq %rax, %rcx
+	shlq $32, %rcx
+	movq _X@GOTPCREL(%rip), %rdx
+	orq %rax, %rcx
+	movq %rcx, (%rdx)
+	ret
+
+gcc:
+	movq	_b@GOTPCREL(%rip), %rax
+	movabsq	$72340172838076673, %rdx
+	movzbq	(%rax), %rax
+	imulq	%rdx, %rax
+	movq	_X@GOTPCREL(%rip), %rdx
+	movq	%rax, (%rdx)
+	ret
+
+//===---------------------------------------------------------------------===//
+
+Vararg function prologue can be further optimized. Currently all XMM registers
+are stored into register save area. Most of them can be eliminated since the
+upper bound of the number of XMM registers used are passed in %al. gcc produces
+something like the following:
+
+	movzbl	%al, %edx
+	leaq	0(,%rdx,4), %rax
+	leaq	4+L2(%rip), %rdx
+	leaq	239(%rsp), %rax
+       	jmp	*%rdx
+	movaps	%xmm7, -15(%rax)
+	movaps	%xmm6, -31(%rax)
+	movaps	%xmm5, -47(%rax)
+	movaps	%xmm4, -63(%rax)
+	movaps	%xmm3, -79(%rax)
+	movaps	%xmm2, -95(%rax)
+	movaps	%xmm1, -111(%rax)
+	movaps	%xmm0, -127(%rax)
+L2:
+
+It jumps over the movaps that do not need to be stored. Hard to see this being
+significant as it added 5 instructions (including an indirect branch) to avoid
+executing 0 to 8 stores in the function prologue.
+
+Perhaps we can optimize for the common case where no XMM registers are used for
+parameter passing, i.e. if %al == 0, jump over all stores. Or in the case of a
+leaf function where we can determine that no XMM input parameter is needed,
+avoid emitting the stores at all.
+
+//===---------------------------------------------------------------------===//
+
+AMD64 has a complex calling convention for aggregate passing by value:
+
+1. If the size of an object is larger than two eightbytes, or in C++, is a non- 
+   POD structure or union type, or contains unaligned fields, it has class 
+   MEMORY.
+2. Both eightbytes get initialized to class NO_CLASS. 
+3. Each field of an object is classified recursively so that always two fields
+   are considered. The resulting class is calculated according to the classes
+   of the fields in the eightbyte: 
+   (a) If both classes are equal, this is the resulting class. 
+   (b) If one of the classes is NO_CLASS, the resulting class is the other 
+       class. 
+   (c) If one of the classes is MEMORY, the result is the MEMORY class. 
+   (d) If one of the classes is INTEGER, the result is the INTEGER. 
+   (e) If one of the classes is X87, X87UP, COMPLEX_X87 class, MEMORY is used as
+      class. 
+   (f) Otherwise class SSE is used. 
+4. Then a post merger cleanup is done: 
+   (a) If one of the classes is MEMORY, the whole argument is passed in memory. 
+   (b) If SSEUP is not preceded by SSE, it is converted to SSE.
+
+Currently llvm frontend does not handle this correctly.
+
+Problem 1:
+    typedef struct { int i; double d; } QuadWordS;
+It is currently passed in two i64 integer registers. However, gcc compiled
+callee expects the second element 'd' to be passed in XMM0.
+
+Problem 2:
+    typedef struct { int32_t i; float j; double d; } QuadWordS;
+The size of the first two fields == i64 so they will be combined and passed in
+a integer register RDI. The third field is still passed in XMM0.
+
+Problem 3:
+    typedef struct { int64_t i; int8_t j; int64_t d; } S;
+    void test(S s)
+The size of this aggregate is greater than two i64 so it should be passed in 
+memory. Currently llvm breaks this down and passes it in three integer
+registers.
+
+Problem 4:
+Taking problem 3 one step ahead where a function expects an aggregate value
+in memory followed by more parameter(s) passed in register(s).
+    void test(S s, int b)
+
+LLVM IR does not allow parameter passing by aggregates, therefore it must break
+the aggregates value (in problem 3 and 4) into a number of scalar values:
+    void %test(long %s.i, byte %s.j, long %s.d);
+
+However, if the backend were to lower this code literally it would pass the 3
+values in integer registers. To force it be passed in memory, the frontend
+should change the function signature to:
+    void %test(long %undef1, long %undef2, long %undef3, long %undef4, 
+               long %undef5, long %undef6,
+               long %s.i, byte %s.j, long %s.d);
+And the callee would look something like this:
+    call void %test( undef, undef, undef, undef, undef, undef,
+                     %tmp.s.i, %tmp.s.j, %tmp.s.d );
+The first 6 undef parameters would exhaust the 6 integer registers used for
+parameter passing. The following three integer values would then be forced into
+memory.
+
+For problem 4, the parameter 'd' would be moved to the front of the parameter
+list so it will be passed in register:
+    void %test(int %d,
+               long %undef1, long %undef2, long %undef3, long %undef4, 
+               long %undef5, long %undef6,
+               long %s.i, byte %s.j, long %s.d);
+
+//===---------------------------------------------------------------------===//
+
+Right now the asm printer assumes GlobalAddress are accessed via RIP relative
+addressing. Therefore, it is not possible to generate this:
+        movabsq $__ZTV10polynomialIdE+16, %rax
+
+That is ok for now since we currently only support small model. So the above
+is selected as
+        leaq __ZTV10polynomialIdE+16(%rip), %rax
+
+This is probably slightly slower but is much shorter than movabsq. However, if
+we were to support medium or larger code models, we need to use the movabs
+instruction. We should probably introduce something like AbsoluteAddress to
+distinguish it from GlobalAddress so the asm printer and JIT code emitter can
+do the right thing.
+
+//===---------------------------------------------------------------------===//
+
+It's not possible to reference AH, BH, CH, and DH registers in an instruction
+requiring REX prefix. However, divb and mulb both produce results in AH. If isel
+emits a CopyFromReg which gets turned into a movb, that movb could be allocated
+to r8b - r15b.
+
+To get around this, isel emits a CopyFromReg from AX and then right shift it
+down by 8 and truncate it. It's not pretty but it works. We need some register
+allocation magic to make the hack go away (e.g. putting additional constraints
+on the result of the movb).
+
+//===---------------------------------------------------------------------===//
+
+The x86-64 ABI for hidden-argument struct returns requires that the
+incoming value of %rdi be copied into %rax by the callee upon return.
+
+The idea is that it saves callers from having to remember this value,
+which would often require a callee-saved register. Callees usually
+need to keep this value live for most of their body anyway, so it
+doesn't add a significant burden on them.
+
+We currently implement this in codegen, however this is suboptimal
+because it means that it would be quite awkward to implement the
+optimization for callers.
+
+A better implementation would be to relax the LLVM IR rules for sret
+arguments to allow a function with an sret argument to have a non-void
+return type, and to have the front-end to set up the sret argument value
+as the return value of the function. The front-end could more easily
+emit uses of the returned struct value to be in terms of the function's
+lowered return value, and it would free non-C frontends from a
+complication only required by a C-based ABI.
+
+//===---------------------------------------------------------------------===//
+
+We get a redundant zero extension for code like this:
+
+int mask[1000];
+int foo(unsigned x) {
+ if (x < 10)
+   x = x * 45;
+ else
+   x = x * 78;
+ return mask[x];
+}
+
+_foo:
+LBB1_0:	## entry
+	cmpl	$9, %edi
+	jbe	LBB1_3	## bb
+LBB1_1:	## bb1
+	imull	$78, %edi, %eax
+LBB1_2:	## bb2
+	movl	%eax, %eax                    <----
+	movq	_mask@GOTPCREL(%rip), %rcx
+	movl	(%rcx,%rax,4), %eax
+	ret
+LBB1_3:	## bb
+	imull	$45, %edi, %eax
+	jmp	LBB1_2	## bb2
+  
+Before regalloc, we have:
+
+        %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
+        JMP mbb<bb2,0x203afb0>
+    Successors according to CFG: 0x203afb0 (#3)
+
+bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
+    Predecessors according to CFG: 0x203aec0 (#0)
+        %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
+    Successors according to CFG: 0x203afb0 (#3)
+
+bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
+    Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
+        %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
+                            %reg1026, mbb<bb1,0x203af60>
+        %reg1029<def> = MOVZX64rr32 %reg1027
+
+so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
+be able to recognize the zero extend.  This could also presumably be implemented
+if we have whole-function selectiondags.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
new file mode 100644
index 0000000..aa7bb3d
--- /dev/null
+++ b/lib/Target/X86/README.txt
@@ -0,0 +1,1870 @@
+//===---------------------------------------------------------------------===//
+// Random ideas for the X86 backend.
+//===---------------------------------------------------------------------===//
+
+We should add support for the "movbe" instruction, which does a byte-swapping
+copy (3-addr bswap + memory support?)  This is available on Atom processors.
+
+//===---------------------------------------------------------------------===//
+
+CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
+backend knows how to three-addressify this shift, but it appears the register
+allocator isn't even asking it to do so in this case.  We should investigate
+why this isn't happening, it could have significant impact on other important
+cases for X86 as well.
+
+//===---------------------------------------------------------------------===//
+
+This should be one DIV/IDIV instruction, not a libcall:
+
+unsigned test(unsigned long long X, unsigned Y) {
+        return X/Y;
+}
+
+This can be done trivially with a custom legalizer.  What about overflow 
+though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
+
+//===---------------------------------------------------------------------===//
+
+Improvements to the multiply -> shift/add algorithm:
+http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
+
+//===---------------------------------------------------------------------===//
+
+Improve code like this (occurs fairly frequently, e.g. in LLVM):
+long long foo(int x) { return 1LL << x; }
+
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
+http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
+
+Another useful one would be  ~0ULL >> X and ~0ULL << X.
+
+One better solution for 1LL << x is:
+        xorl    %eax, %eax
+        xorl    %edx, %edx
+        testb   $32, %cl
+        sete    %al
+        setne   %dl
+        sall    %cl, %eax
+        sall    %cl, %edx
+
+But that requires good 8-bit subreg support.
+
+Also, this might be better.  It's an extra shift, but it's one instruction
+shorter, and doesn't stress 8-bit subreg support.
+(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
+but without the unnecessary and.)
+        movl %ecx, %eax
+        shrl $5, %eax
+        movl %eax, %edx
+        xorl $1, %edx
+        sall %cl, %eax
+        sall %cl, %edx
+
+64-bit shifts (in general) expand to really bad code.  Instead of using
+cmovs, we should expand to a conditional branch like GCC produces.
+
+//===---------------------------------------------------------------------===//
+
+Compile this:
+_Bool f(_Bool a) { return a!=1; }
+
+into:
+        movzbl  %dil, %eax
+        xorl    $1, %eax
+        ret
+
+(Although note that this isn't a legal way to express the code that llvm-gcc
+currently generates for that function.)
+
+//===---------------------------------------------------------------------===//
+
+Some isel ideas:
+
+1. Dynamic programming based approach when compile time is not an
+   issue.
+2. Code duplication (addressing mode) during isel.
+3. Other ideas from "Register-Sensitive Selection, Duplication, and
+   Sequencing of Instructions".
+4. Scheduling for reduced register pressure.  E.g. "Minimum Register 
+   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs" 
+   and other related papers.
+   http://citeseer.ist.psu.edu/govindarajan01minimum.html
+
+//===---------------------------------------------------------------------===//
+
+Should we promote i16 to i32 to avoid partial register update stalls?
+
+//===---------------------------------------------------------------------===//
+
+Leave any_extend as pseudo instruction and hint to register
+allocator. Delay codegen until post register allocation.
+Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
+the coalescer how to deal with it though.
+
+//===---------------------------------------------------------------------===//
+
+It appears icc uses push for parameter passing. Need to investigate.
+
+//===---------------------------------------------------------------------===//
+
+Only use inc/neg/not instructions on processors where they are faster than
+add/sub/xor.  They are slower on the P4 due to only updating some processor
+flags.
+
+//===---------------------------------------------------------------------===//
+
+The instruction selector sometimes misses folding a load into a compare.  The
+pattern is written as (cmp reg, (load p)).  Because the compare isn't 
+commutative, it is not matched with the load on both sides.  The dag combiner
+should be made smart enough to canonicalize the load into the RHS of a compare
+when it can invert the result of the compare for free.
+
+//===---------------------------------------------------------------------===//
+
+In many cases, LLVM generates code like this:
+
+_test:
+        movl 8(%esp), %eax
+        cmpl %eax, 4(%esp)
+        setl %al
+        movzbl %al, %eax
+        ret
+
+on some processors (which ones?), it is more efficient to do this:
+
+_test:
+        movl 8(%esp), %ebx
+        xor  %eax, %eax
+        cmpl %ebx, 4(%esp)
+        setl %al
+        ret
+
+Doing this correctly is tricky though, as the xor clobbers the flags.
+
+//===---------------------------------------------------------------------===//
+
+We should generate bts/btr/etc instructions on targets where they are cheap or
+when codesize is important.  e.g., for:
+
+void setbit(int *target, int bit) {
+    *target |= (1 << bit);
+}
+void clearbit(int *target, int bit) {
+    *target &= ~(1 << bit);
+}
+
+//===---------------------------------------------------------------------===//
+
+Instead of the following for memset char*, 1, 10:
+
+	movl $16843009, 4(%edx)
+	movl $16843009, (%edx)
+	movw $257, 8(%edx)
+
+It might be better to generate
+
+	movl $16843009, %eax
+	movl %eax, 4(%edx)
+	movl %eax, (%edx)
+	movw %ax, 8(%edx)
+	
+when we can spare a register. It reduces code size.
+
+//===---------------------------------------------------------------------===//
+
+Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
+get this:
+
+define i32 @test1(i32 %X) {
+    %Y = sdiv i32 %X, 8
+    ret i32 %Y
+}
+
+_test1:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        sarl $31, %ecx
+        shrl $29, %ecx
+        addl %ecx, %eax
+        sarl $3, %eax
+        ret
+
+GCC knows several different ways to codegen it, one of which is this:
+
+_test1:
+        movl    4(%esp), %eax
+        cmpl    $-1, %eax
+        leal    7(%eax), %ecx
+        cmovle  %ecx, %eax
+        sarl    $3, %eax
+        ret
+
+which is probably slower, but it's interesting at least :)
+
+//===---------------------------------------------------------------------===//
+
+We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl
+We should leave these as libcalls for everything over a much lower threshold,
+since libc is hand tuned for medium and large mem ops (avoiding RFO for large
+stores, TLB preheating, etc)
+
+//===---------------------------------------------------------------------===//
+
+Optimize this into something reasonable:
+ x * copysign(1.0, y) * copysign(1.0, z)
+
+//===---------------------------------------------------------------------===//
+
+Optimize copysign(x, *y) to use an integer load from y.
+
+//===---------------------------------------------------------------------===//
+
+The following tests perform worse with LSR:
+
+lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
+
+//===---------------------------------------------------------------------===//
+
+Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
+FR64 to VR128.
+
+//===---------------------------------------------------------------------===//
+
+Adding to the list of cmp / test poor codegen issues:
+
+int test(__m128 *A, __m128 *B) {
+  if (_mm_comige_ss(*A, *B))
+    return 3;
+  else
+    return 4;
+}
+
+_test:
+	movl 8(%esp), %eax
+	movaps (%eax), %xmm0
+	movl 4(%esp), %eax
+	movaps (%eax), %xmm1
+	comiss %xmm0, %xmm1
+	setae %al
+	movzbl %al, %ecx
+	movl $3, %eax
+	movl $4, %edx
+	cmpl $0, %ecx
+	cmove %edx, %eax
+	ret
+
+Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
+are a number of issues. 1) We are introducing a setcc between the result of the
+intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
+so an any_extend (which becomes a zero extend) is added.
+
+We probably need some kind of target DAG combine hook to fix this.
+
+//===---------------------------------------------------------------------===//
+
+We generate significantly worse code for this than GCC:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
+http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
+
+There is also one case we do worse on PPC.
+
+//===---------------------------------------------------------------------===//
+
+For this:
+
+int test(int a)
+{
+  return a * 3;
+}
+
+We currently emit
+	imull $3, 4(%esp), %eax
+
+Perhaps this is what we really should generate instead? Is imull three or four
+cycles? Note: ICC generates this:
+	movl	4(%esp), %eax
+	leal	(%eax,%eax,2), %eax
+
+The current instruction priority is based on pattern complexity. The former is
+more "complex" because it folds a load so the latter will not be emitted.
+
+Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
+should always try to match LEA first since the LEA matching code does some
+estimate to determine whether the match is profitable.
+
+However, if we care more about code size, then imull is better. It's two bytes
+shorter than movl + leal.
+
+On a Pentium M, both variants have the same characteristics with regard
+to throughput; however, the multiplication has a latency of four cycles, as
+opposed to two cycles for the movl+lea variant.
+
+//===---------------------------------------------------------------------===//
+
+__builtin_ffs codegen is messy.
+
+int ffs_(unsigned X) { return __builtin_ffs(X); }
+
+llvm produces:
+ffs_:
+        movl    4(%esp), %ecx
+        bsfl    %ecx, %eax
+        movl    $32, %edx
+        cmove   %edx, %eax
+        incl    %eax
+        xorl    %edx, %edx
+        testl   %ecx, %ecx
+        cmove   %edx, %eax
+        ret
+
+vs gcc:
+
+_ffs_:
+        movl    $-1, %edx
+        bsfl    4(%esp), %eax
+        cmove   %edx, %eax
+        addl    $1, %eax
+        ret
+
+Another example of __builtin_ffs (use predsimplify to eliminate a select):
+
+int foo (unsigned long j) {
+  if (j)
+    return __builtin_ffs (j) - 1;
+  else
+    return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+It appears gcc places string data with linkonce linkage in
+.section __TEXT,__const_coal,coalesced instead of
+.section __DATA,__const_coal,coalesced.
+Take a look at darwin.h, there are other Darwin assembler directives that we
+do not make use of.
+
+//===---------------------------------------------------------------------===//
+
+define i32 @foo(i32* %a, i32 %t) {
+entry:
+	br label %cond_true
+
+cond_true:		; preds = %cond_true, %entry
+	%x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]		; <i32> [#uses=3]
+	%t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]		; <i32> [#uses=1]
+	%tmp2 = getelementptr i32* %a, i32 %x.0.0		; <i32*> [#uses=1]
+	%tmp3 = load i32* %tmp2		; <i32> [#uses=1]
+	%tmp5 = add i32 %t_addr.0.0, %x.0.0		; <i32> [#uses=1]
+	%tmp7 = add i32 %tmp5, %tmp3		; <i32> [#uses=2]
+	%tmp9 = add i32 %x.0.0, 1		; <i32> [#uses=2]
+	%tmp = icmp sgt i32 %tmp9, 39		; <i1> [#uses=1]
+	br i1 %tmp, label %bb12, label %cond_true
+
+bb12:		; preds = %cond_true
+	ret i32 %tmp7
+}
+is pessimized by -loop-reduce and -indvars
+
+//===---------------------------------------------------------------------===//
+
+u32 to float conversion improvement:
+
+float uint32_2_float( unsigned u ) {
+  float fl = (int) (u & 0xffff);
+  float fh = (int) (u >> 16);
+  fh *= 0x1.0p16f;
+  return fh + fl;
+}
+
+00000000        subl    $0x04,%esp
+00000003        movl    0x08(%esp,1),%eax
+00000007        movl    %eax,%ecx
+00000009        shrl    $0x10,%ecx
+0000000c        cvtsi2ss        %ecx,%xmm0
+00000010        andl    $0x0000ffff,%eax
+00000015        cvtsi2ss        %eax,%xmm1
+00000019        mulss   0x00000078,%xmm0
+00000021        addss   %xmm1,%xmm0
+00000025        movss   %xmm0,(%esp,1)
+0000002a        flds    (%esp,1)
+0000002d        addl    $0x04,%esp
+00000030        ret
+
+//===---------------------------------------------------------------------===//
+
+When using fastcc abi, align stack slot of argument of type double on 8 byte
+boundary to improve performance.
+
+//===---------------------------------------------------------------------===//
+
+Codegen:
+
+int f(int a, int b) {
+  if (a == 4 || a == 6)
+    b++;
+  return b;
+}
+
+
+as:
+
+or eax, 2
+cmp eax, 6
+jz label
+
+//===---------------------------------------------------------------------===//
+
+GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
+simplifications for integer "x cmp y ? a : b".  For example, instead of:
+
+int G;
+void f(int X, int Y) {
+  G = X < 0 ? 14 : 13;
+}
+
+compiling to:
+
+_f:
+        movl $14, %eax
+        movl $13, %ecx
+        movl 4(%esp), %edx
+        testl %edx, %edx
+        cmovl %eax, %ecx
+        movl %ecx, _G
+        ret
+
+it could be:
+_f:
+        movl    4(%esp), %eax
+        sarl    $31, %eax
+        notl    %eax
+        addl    $14, %eax
+        movl    %eax, _G
+        ret
+
+etc.
+
+Another is:
+int usesbb(unsigned int a, unsigned int b) {
+       return (a < b ? -1 : 0);
+}
+to:
+_usesbb:
+	movl	8(%esp), %eax
+	cmpl	%eax, 4(%esp)
+	sbbl	%eax, %eax
+	ret
+
+instead of:
+_usesbb:
+	xorl	%eax, %eax
+	movl	8(%esp), %ecx
+	cmpl	%ecx, 4(%esp)
+	movl	$4294967295, %ecx
+	cmovb	%ecx, %eax
+	ret
+
+//===---------------------------------------------------------------------===//
+
+Consider the expansion of:
+
+define i32 @test3(i32 %X) {
+        %tmp1 = urem i32 %X, 255
+        ret i32 %tmp1
+}
+
+Currently it compiles to:
+
+...
+        movl $2155905153, %ecx
+        movl 8(%esp), %esi
+        movl %esi, %eax
+        mull %ecx
+...
+
+This could be "reassociated" into:
+
+        movl $2155905153, %eax
+        movl 8(%esp), %ecx
+        mull %ecx
+
+to avoid the copy.  In fact, the existing two-address stuff would do this
+except that mul isn't a commutative 2-addr instruction.  I guess this has
+to be done at isel time based on the #uses to mul?
+
+//===---------------------------------------------------------------------===//
+
+Make sure the instruction which starts a loop does not cross a cacheline
+boundary. This requires knowing the exact length of each machine instruction.
+That is somewhat complicated, but doable. Example 256.bzip2:
+
+In the new trace, the hot loop has an instruction which crosses a cacheline
+boundary.  In addition to potential cache misses, this can't help decoding as I
+imagine there has to be some kind of complicated decoder reset and realignment
+to grab the bytes from the next cacheline.
+
+532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
+942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
+937  937 0x3d0a incl     %esi
+3    3   0x3d0b cmpb     %bl, %dl
+27   27  0x3d0d jnz      0x000062db <main+11707>
+
+//===---------------------------------------------------------------------===//
+
+In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
+
+//===---------------------------------------------------------------------===//
+
+This could be a single 16-bit load.
+
+int f(char *p) {
+    if ((p[0] == 1) & (p[1] == 2)) return 1;
+    return 0;
+}
+
+//===---------------------------------------------------------------------===//
+
+We should inline lrintf and probably other libc functions.
+
+//===---------------------------------------------------------------------===//
+
+Use the FLAGS values from arithmetic instructions more.  For example, compile:
+
+int add_zf(int *x, int y, int a, int b) {
+     if ((*x += y) == 0)
+          return a;
+     else
+          return b;
+}
+
+to:
+       addl    %esi, (%rdi)
+       movl    %edx, %eax
+       cmovne  %ecx, %eax
+       ret
+instead of:
+
+_add_zf:
+        addl (%rdi), %esi
+        movl %esi, (%rdi)
+        testl %esi, %esi
+        cmove %edx, %ecx
+        movl %ecx, %eax
+        ret
+
+As another example, compile function f2 in test/CodeGen/X86/cmp-test.ll
+without a test instruction.
+
+//===---------------------------------------------------------------------===//
+
+These two functions have identical effects:
+
+unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
+unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
+
+We currently compile them to:
+
+_f:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        movl 8(%esp), %edx
+        cmpl %edx, %ecx
+        jne LBB1_2      #UnifiedReturnBlock
+LBB1_1: #cond_true
+        addl $2, %eax
+        ret
+LBB1_2: #UnifiedReturnBlock
+        movl %ecx, %eax
+        ret
+_f2:
+        movl 4(%esp), %eax
+        movl %eax, %ecx
+        incl %ecx
+        cmpl 8(%esp), %ecx
+        sete %cl
+        movzbl %cl, %ecx
+        leal 1(%ecx,%eax), %eax
+        ret
+
+both of which are inferior to GCC's:
+
+_f:
+        movl    4(%esp), %edx
+        leal    1(%edx), %eax
+        addl    $2, %edx
+        cmpl    8(%esp), %eax
+        cmove   %edx, %eax
+        ret
+_f2:
+        movl    4(%esp), %eax
+        addl    $1, %eax
+        xorl    %edx, %edx
+        cmpl    8(%esp), %eax
+        sete    %dl
+        addl    %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void test(int X) {
+  if (X) abort();
+}
+
+is currently compiled to:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne LBB1_1
+        addl $12, %esp
+        ret
+LBB1_1:
+        call L_abort$stub
+
+It would be better to produce:
+
+_test:
+        subl $12, %esp
+        cmpl $0, 16(%esp)
+        jne L_abort$stub
+        addl $12, %esp
+        ret
+
+This can be applied to any no-return function call that takes no arguments etc.
+Alternatively, the stack save/restore logic could be shrink-wrapped, producing
+something like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        subl $12, %esp
+        call L_abort$stub
+
+Both are useful in different situations.  Finally, it could be shrink-wrapped
+and tail called, like this:
+
+_test:
+        cmpl $0, 4(%esp)
+        jne LBB1_1
+        ret
+LBB1_1:
+        pop %eax   # realign stack.
+        call L_abort$stub
+
+Though this probably isn't worth it.
+
+//===---------------------------------------------------------------------===//
+
+Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
+a neg instead of a sub instruction.  Consider:
+
+int test(char X) { return 7-X; }
+
+we currently produce:
+_test:
+        movl $7, %eax
+        movsbl 4(%esp), %ecx
+        subl %ecx, %eax
+        ret
+
+We would use one fewer register if codegen'd as:
+
+        movsbl 4(%esp), %eax
+	neg %eax
+        add $7, %eax
+        ret
+
+Note that this isn't beneficial if the load can be folded into the sub.  In
+this case, we want a sub:
+
+int test(int X) { return 7-X; }
+_test:
+        movl $7, %eax
+        subl 4(%esp), %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Leaf functions that require one 4-byte spill slot have a prolog like this:
+
+_foo:
+        pushl   %esi
+        subl    $4, %esp
+...
+and an epilog like this:
+        addl    $4, %esp
+        popl    %esi
+        ret
+
+It would be smaller, and potentially faster, to push eax on entry and to
+pop into a dummy register instead of using addl/subl of esp.  Just don't pop 
+into any return registers :)
+
+//===---------------------------------------------------------------------===//
+
+The X86 backend should fold (branch (or (setcc, setcc))) into multiple 
+branches.  We generate really poor code for:
+
+double testf(double a) {
+       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
+}
+
+For example, the entry BB is:
+
+_testf:
+        subl    $20, %esp
+        pxor    %xmm0, %xmm0
+        movsd   24(%esp), %xmm1
+        ucomisd %xmm0, %xmm1
+        setnp   %al
+        sete    %cl
+        testb   %cl, %al
+        jne     LBB1_5  # UnifiedReturnBlock
+LBB1_1: # cond_true
+
+
+it would be better to replace the last four instructions with:
+
+	jp LBB1_1
+	je LBB1_5
+LBB1_1:
+
+We also codegen the inner ?: into a diamond:
+
+       cvtss2sd        LCPI1_0(%rip), %xmm2
+        cvtss2sd        LCPI1_1(%rip), %xmm3
+        ucomisd %xmm1, %xmm0
+        ja      LBB1_3  # cond_true
+LBB1_2: # cond_true
+        movapd  %xmm3, %xmm2
+LBB1_3: # cond_true
+        movapd  %xmm2, %xmm0
+        ret
+
+We should sink the load into xmm3 into the LBB1_2 block.  This should
+be pretty easy, and will nuke all the copies.
+
+//===---------------------------------------------------------------------===//
+
+This:
+        #include <algorithm>
+        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
+        { return std::make_pair(a + b, a + b < a); }
+        bool no_overflow(unsigned a, unsigned b)
+        { return !full_add(a, b).second; }
+
+Should compile to:
+
+
+        _Z11no_overflowjj:
+                addl    %edi, %esi
+                setae   %al
+                ret
+
+FIXME: That code looks wrong; bool return is normally defined as zext.
+
+on x86-64, not:
+
+__Z11no_overflowjj:
+        addl    %edi, %esi
+        cmpl    %edi, %esi
+        setae   %al
+        movzbl  %al, %eax
+        ret
+
+
+//===---------------------------------------------------------------------===//
+
+The following code:
+
+bb114.preheader:		; preds = %cond_next94
+	%tmp231232 = sext i16 %tmp62 to i32		; <i32> [#uses=1]
+	%tmp233 = sub i32 32, %tmp231232		; <i32> [#uses=1]
+	%tmp245246 = sext i16 %tmp65 to i32		; <i32> [#uses=1]
+	%tmp252253 = sext i16 %tmp68 to i32		; <i32> [#uses=1]
+	%tmp254 = sub i32 32, %tmp252253		; <i32> [#uses=1]
+	%tmp553554 = bitcast i16* %tmp37 to i8*		; <i8*> [#uses=2]
+	%tmp583584 = sext i16 %tmp98 to i32		; <i32> [#uses=1]
+	%tmp585 = sub i32 32, %tmp583584		; <i32> [#uses=1]
+	%tmp614615 = sext i16 %tmp101 to i32		; <i32> [#uses=1]
+	%tmp621622 = sext i16 %tmp104 to i32		; <i32> [#uses=1]
+	%tmp623 = sub i32 32, %tmp621622		; <i32> [#uses=1]
+	br label %bb114
+
+produces:
+
+LBB3_5:	# bb114.preheader
+	movswl	-68(%ebp), %eax
+	movl	$32, %ecx
+	movl	%ecx, -80(%ebp)
+	subl	%eax, -80(%ebp)
+	movswl	-52(%ebp), %eax
+	movl	%ecx, -84(%ebp)
+	subl	%eax, -84(%ebp)
+	movswl	-70(%ebp), %eax
+	movl	%ecx, -88(%ebp)
+	subl	%eax, -88(%ebp)
+	movswl	-50(%ebp), %eax
+	subl	%eax, %ecx
+	movl	%ecx, -76(%ebp)
+	movswl	-42(%ebp), %eax
+	movl	%eax, -92(%ebp)
+	movswl	-66(%ebp), %eax
+	movl	%eax, -96(%ebp)
+	movw	$0, -98(%ebp)
+
+This appears to be bad because the RA is not folding the store to the stack 
+slot into the movl.  The above instructions could be:
+	movl    $32, -80(%ebp)
+...
+	movl    $32, -84(%ebp)
+...
+This seems like a cross between remat and spill folding.
+
+This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
+change, so we could simply subtract %eax from %ecx first and then use %ecx (or
+vice-versa).
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+	%tmp659 = icmp slt i16 %tmp654, 0		; <i1> [#uses=1]
+	br i1 %tmp659, label %cond_true662, label %cond_next715
+
+produces this:
+
+	testw	%cx, %cx
+	movswl	%cx, %esi
+	jns	LBB4_109	# cond_next715
+
+Shark tells us that using %cx in the testw instruction is sub-optimal. It
+suggests using the 32-bit register (which is what ICC uses).
+
+//===---------------------------------------------------------------------===//
+
+We compile this:
+
+void compare (long long foo) {
+  if (foo < 4294967297LL)
+    abort();
+}
+
+to:
+
+compare:
+        subl    $4, %esp
+        cmpl    $0, 8(%esp)
+        setne   %al
+        movzbw  %al, %ax
+        cmpl    $1, 12(%esp)
+        setg    %cl
+        movzbw  %cl, %cx
+        cmove   %ax, %cx
+        testb   $1, %cl
+        jne     .LBB1_2 # UnifiedReturnBlock
+.LBB1_1:        # ifthen
+        call    abort
+.LBB1_2:        # UnifiedReturnBlock
+        addl    $4, %esp
+        ret
+
+(also really horrible code on ppc).  This is due to the expand code for 64-bit
+compares.  GCC produces multiple branches, which is much nicer:
+
+compare:
+        subl    $12, %esp
+        movl    20(%esp), %edx
+        movl    16(%esp), %eax
+        decl    %edx
+        jle     .L7
+.L5:
+        addl    $12, %esp
+        ret
+        .p2align 4,,7
+.L7:
+        jl      .L4
+        cmpl    $0, %eax
+        .p2align 4,,8
+        ja      .L5
+.L4:
+        .p2align 4,,9
+        call    abort
+
+//===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place for
+non-tail call optimized calls) that source from the callers arguments
+or  that source from a virtual register (also possibly sourcing from
+callers arguments).
+This is done to prevent overwriting of parameters (see example
+below) that might be used later.
+
+example:  
+
+int callee(int32, int64); 
+int caller(int32 arg1, int32 arg2) { 
+  int64 local = arg2 * 2; 
+  return callee(arg2, (int64)local); 
+}
+
+[arg1]          [!arg2 no longer valid since we moved local onto it]
+[arg2]      ->  [(int64)
+[RETADDR]        local  ]
+
+Moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+
+ - Analyse the actual parameters of the callee to see which would
+   overwrite a caller parameter which is used by the callee and only
+   push them onto the top of the stack.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+       return callee(arg1,arg2);
+   }
+
+   Here we don't need to write any variables to the top of the stack
+   since they don't overwrite each other.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+       return callee(arg2,arg1);
+   }
+
+   Here we need to push the arguments because they overwrite each
+   other.
+
+//===---------------------------------------------------------------------===//
+
+main ()
+{
+  int i = 0;
+  unsigned long int z = 0;
+
+  do {
+    z -= 0x00004000;
+    i++;
+    if (i > 0x00040000)
+      abort ();
+  } while (z > 0);
+  exit (0);
+}
+
+gcc compiles this to:
+
+_main:
+	subl	$28, %esp
+	xorl	%eax, %eax
+	jmp	L2
+L3:
+	cmpl	$262144, %eax
+	je	L10
+L2:
+	addl	$1, %eax
+	cmpl	$262145, %eax
+	jne	L3
+	call	L_abort$stub
+L10:
+	movl	$0, (%esp)
+	call	L_exit$stub
+
+llvm:
+
+_main:
+	subl	$12, %esp
+	movl	$1, %eax
+	movl	$16384, %ecx
+LBB1_1:	# bb
+	cmpl	$262145, %eax
+	jge	LBB1_4	# cond_true
+LBB1_2:	# cond_next
+	incl	%eax
+	addl	$4294950912, %ecx
+	cmpl	$16384, %ecx
+	jne	LBB1_1	# bb
+LBB1_3:	# bb11
+	xorl	%eax, %eax
+	addl	$12, %esp
+	ret
+LBB1_4:	# cond_true
+	call	L_abort$stub
+
+1. LSR should rewrite the first cmp with induction variable %ecx.
+2. DAG combiner should fold
+        leal    1(%eax), %edx
+        cmpl    $262145, %edx
+   =>
+        cmpl    $262144, %eax
+
+//===---------------------------------------------------------------------===//
+
+define i64 @test(double %X) {
+	%Y = fptosi double %X to i64
+	ret i64 %Y
+}
+
+compiles to:
+
+_test:
+	subl	$20, %esp
+	movsd	24(%esp), %xmm0
+	movsd	%xmm0, 8(%esp)
+	fldl	8(%esp)
+	fisttpll	(%esp)
+	movl	4(%esp), %edx
+	movl	(%esp), %eax
+	addl	$20, %esp
+	#FP_REG_KILL
+	ret
+
+This should just fldl directly from the input stack slot.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+int foo (int x) { return (x & 65535) | 255; }
+
+Should compile into:
+
+_foo:
+        movzwl  4(%esp), %eax
+        orl     $255, %eax
+        ret
+
+instead of:
+_foo:
+        movl    $255, %eax
+        orl     4(%esp), %eax
+        andl    $65535, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+We're codegen'ing multiply of long longs inefficiently:
+
+unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
+  return arg1 *  arg2;
+}
+
+We compile to (fomit-frame-pointer):
+
+_LLM:
+	pushl	%esi
+	movl	8(%esp), %ecx
+	movl	16(%esp), %esi
+	movl	%esi, %eax
+	mull	%ecx
+	imull	12(%esp), %esi
+	addl	%edx, %esi
+	imull	20(%esp), %ecx
+	movl	%esi, %edx
+	addl	%ecx, %edx
+	popl	%esi
+	ret
+
+This looks like a scheduling deficiency and lack of remat of the load from
+the argument area.  ICC apparently produces:
+
+        movl      8(%esp), %ecx
+        imull     12(%esp), %ecx
+        movl      16(%esp), %eax
+        imull     4(%esp), %eax 
+        addl      %eax, %ecx  
+        movl      4(%esp), %eax
+        mull      12(%esp) 
+        addl      %ecx, %edx
+        ret
+
+Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
+
+//===---------------------------------------------------------------------===//
+
+We can fold a store into "zeroing a reg".  Instead of:
+
+xorl    %eax, %eax
+movl    %eax, 124(%esp)
+
+we should get:
+
+movl    $0, 124(%esp)
+
+if the flags of the xor are dead.
+
+Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
+be folded into: shl [mem], 1
+
+//===---------------------------------------------------------------------===//
+
+This testcase misses a read/modify/write opportunity (from PR1425):
+
+void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
+    int i;
+    for(i=0; i<width; i++)
+        b1[i] += (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We compile it down to:
+
+LBB1_2:	# bb
+	movl	(%esi,%edi,4), %ebx
+	addl	(%ecx,%edi,4), %ebx
+	addl	(%edx,%edi,4), %ebx
+	movl	%ebx, (%ecx,%edi,4)
+	incl	%edi
+	cmpl	%eax, %edi
+	jne	LBB1_2	# bb
+
+the inner loop should add to the memory location (%ecx,%edi,4), saving
+a mov.  Something like:
+
+        movl    (%esi,%edi,4), %ebx
+        addl    (%edx,%edi,4), %ebx
+        addl    %ebx, (%ecx,%edi,4)
+
+Here is another interesting example:
+
+void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
+    int i;
+    for(i=0; i<width; i++)
+        b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
+}
+
+We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:
+
+LBB9_2:	# bb
+	movl	(%ecx,%edi,4), %ebx
+	subl	(%esi,%edi,4), %ebx
+	subl	(%edx,%edi,4), %ebx
+	movl	%ebx, (%ecx,%edi,4)
+	incl	%edi
+	cmpl	%eax, %edi
+	jne	LBB9_2	# bb
+
+Additionally, LSR should rewrite the exit condition of these loops to use
+a stride-4 IV, which would allow all the scales in the loop to go away.
+This would result in smaller code and more efficient microops.
+
+//===---------------------------------------------------------------------===//
+
+In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
+or and instruction, for example:
+
+	xorpd	LCPI1_0, %xmm2
+
+However, if xmm2 gets spilled, we end up with really ugly code like this:
+
+	movsd	(%esp), %xmm0
+	xorpd	LCPI1_0, %xmm0
+	movsd	%xmm0, (%esp)
+
+Since we 'know' that this is a 'neg', we can actually "fold" the spill into
+the neg/abs instruction, turning it into an *integer* operation, like this:
+
+	xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)
+
+you could also use xorb, but xorl is less likely to lead to a partial register
+stall.  Here is a contrived testcase:
+
+double a, b, c;
+void test(double *P) {
+  double X = *P;
+  a = X;
+  bar();
+  X = -X;
+  b = X;
+  bar();
+  c = X;
+}
+
+//===---------------------------------------------------------------------===//
+
+handling llvm.memory.barrier on pre SSE2 cpus
+
+should generate:
+lock ; mov %esp, %esp
+
+//===---------------------------------------------------------------------===//
+
+The generated code on x86 for checking for signed overflow on a multiply the
+obvious way is much longer than it needs to be.
+
+int x(int a, int b) {
+  long long prod = (long long)a*b;
+  return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
+}
+
+See PR2053 for more details.
+
+//===---------------------------------------------------------------------===//
+
+We should investigate using cdq/cltd (effect: edx = sar eax, 31)
+more aggressively; it should cost the same as a move+shift on any modern
+processor, but it's a lot shorter. Downside is that it puts more
+pressure on register allocation because it has fixed operands.
+
+Example:
+int abs(int x) {return x < 0 ? -x : x;}
+
+gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
+abs:
+        movl    4(%esp), %eax
+        cltd
+        xorl    %edx, %eax
+        subl    %edx, %eax
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Consider:
+int test(unsigned long a, unsigned long b) { return -(a < b); }
+
+We currently compile this to:
+
+define i32 @test(i32 %a, i32 %b) nounwind  {
+	%tmp3 = icmp ult i32 %a, %b		; <i1> [#uses=1]
+	%tmp34 = zext i1 %tmp3 to i32		; <i32> [#uses=1]
+	%tmp5 = sub i32 0, %tmp34		; <i32> [#uses=1]
+	ret i32 %tmp5
+}
+
+and
+
+_test:
+	movl	8(%esp), %eax
+	cmpl	%eax, 4(%esp)
+	setb	%al
+	movzbl	%al, %eax
+	negl	%eax
+	ret
+
+Several deficiencies here.  First, we should instcombine zext+neg into sext:
+
+define i32 @test2(i32 %a, i32 %b) nounwind  {
+	%tmp3 = icmp ult i32 %a, %b		; <i1> [#uses=1]
+	%tmp34 = sext i1 %tmp3 to i32		; <i32> [#uses=1]
+	ret i32 %tmp34
+}
+
+However, before we can do that, we have to fix the bad codegen that we get for
+sext from bool:
+
+_test2:
+	movl	8(%esp), %eax
+	cmpl	%eax, 4(%esp)
+	setb	%al
+	movzbl	%al, %eax
+	shll	$31, %eax
+	sarl	$31, %eax
+	ret
+
+This code should be at least as good as the code above.  Once this is fixed, we
+can optimize this specific case even more to:
+
+	movl	8(%esp), %eax
+	xorl	%ecx, %ecx
+        cmpl    %eax, 4(%esp)
+        sbbl    %ecx, %ecx
+
+//===---------------------------------------------------------------------===//
+
+Take the following code (from 
+http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
+
+extern unsigned char first_one[65536];
+int FirstOnet(unsigned long long arg1)
+{
+  if (arg1 >> 48)
+    return (first_one[arg1 >> 48]);
+  return 0;
+}
+
+
+The following code is currently generated:
+FirstOnet:
+        movl    8(%esp), %eax
+        cmpl    $65536, %eax
+        movl    4(%esp), %ecx
+        jb      .LBB1_2 # UnifiedReturnBlock
+.LBB1_1:        # ifthen
+        shrl    $16, %eax
+        movzbl  first_one(%eax), %eax
+        ret
+.LBB1_2:        # UnifiedReturnBlock
+        xorl    %eax, %eax
+        ret
+
+There are a few possible improvements here:
+1. We should be able to eliminate the dead load into %ecx
+2. We could change the "movl 8(%esp), %eax" into
+   "movzwl 10(%esp), %eax"; this lets us change the cmpl
+   into a testl, which is shorter, and eliminate the shift.
+
+We could also in theory eliminate the branch by using a conditional
+for the address of the load, but that seems unlikely to be worthwhile
+in general.
+
+//===---------------------------------------------------------------------===//
+
+We compile this function:
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
+entry:
+	%tmp2 = icmp eq i8 %d, 0		; <i1> [#uses=1]
+	br i1 %tmp2, label %bb7, label %bb
+
+bb:		; preds = %entry
+	%tmp6 = add i32 %b, %a		; <i32> [#uses=1]
+	ret i32 %tmp6
+
+bb7:		; preds = %entry
+	%tmp10 = sub i32 %a, %c		; <i32> [#uses=1]
+	ret i32 %tmp10
+}
+
+to:
+
+_foo:
+	cmpb	$0, 16(%esp)
+	movl	12(%esp), %ecx
+	movl	8(%esp), %eax
+	movl	4(%esp), %edx
+	je	LBB1_2	# bb7
+LBB1_1:	# bb
+	addl	%edx, %eax
+	ret
+LBB1_2:	# bb7
+	movl	%edx, %eax
+	subl	%ecx, %eax
+	ret
+
+The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
+if it commuted the addl in LBB1_1.
+
+//===---------------------------------------------------------------------===//
+
+See rdar://4653682.
+
+From flops:
+
+LBB1_15:        # bb310
+        cvtss2sd        LCPI1_0, %xmm1
+        addsd   %xmm1, %xmm0
+        movsd   176(%esp), %xmm2
+        mulsd   %xmm0, %xmm2
+        movapd  %xmm2, %xmm3
+        mulsd   %xmm3, %xmm3
+        movapd  %xmm3, %xmm4
+        mulsd   LCPI1_23, %xmm4
+        addsd   LCPI1_24, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_25, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_26, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_27, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   LCPI1_28, %xmm4
+        mulsd   %xmm3, %xmm4
+        addsd   %xmm1, %xmm4
+        mulsd   %xmm2, %xmm4
+        movsd   152(%esp), %xmm1
+        addsd   %xmm4, %xmm1
+        movsd   %xmm1, 152(%esp)
+        incl    %eax
+        cmpl    %eax, %esi
+        jge     LBB1_15 # bb310
+LBB1_16:        # bb358.loopexit
+        movsd   152(%esp), %xmm0
+        addsd   %xmm0, %xmm0
+        addsd   LCPI1_22, %xmm0
+        movsd   %xmm0, 152(%esp)
+
+Rather than spilling the result of the last addsd in the loop, we should have
+inserted a copy to split the interval (one for the duration of the loop, one
+extending to the fall through). The register pressure in the loop isn't high
+enough to warrant the spill.
+
+Also check why xmm7 is not used at all in the function.
+
+//===---------------------------------------------------------------------===//
+
+Legalize loses track of the fact that bools are always zero extended when in
+memory.  This causes us to compile abort_gzip (from 164.gzip) from:
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin8"
+@in_exit.4870.b = internal global i1 false		; <i1*> [#uses=2]
+define fastcc void @abort_gzip() noreturn nounwind  {
+entry:
+	%tmp.b.i = load i1* @in_exit.4870.b		; <i1> [#uses=1]
+	br i1 %tmp.b.i, label %bb.i, label %bb4.i
+bb.i:		; preds = %entry
+	tail call void @exit( i32 1 ) noreturn nounwind 
+	unreachable
+bb4.i:		; preds = %entry
+	store i1 true, i1* @in_exit.4870.b
+	tail call void @exit( i32 1 ) noreturn nounwind 
+	unreachable
+}
+declare void @exit(i32) noreturn nounwind 
+
+into:
+
+_abort_gzip:
+	subl	$12, %esp
+	movb	_in_exit.4870.b, %al
+	notb	%al
+	testb	$1, %al
+	jne	LBB1_2	## bb4.i
+LBB1_1:	## bb.i
+  ...
+
+//===---------------------------------------------------------------------===//
+
+We compile:
+
+int test(int x, int y) {
+  return x-y-1;
+}
+
+into (-m64):
+
+_test:
+	decl	%edi
+	movl	%edi, %eax
+	subl	%esi, %eax
+	ret
+
+it would be better to codegen as: x+~y  (notl+addl)
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+int foo(const char *str,...)
+{
+ __builtin_va_list a; int x;
+ __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
+ return x;
+}
+
+gets compiled into this on x86-64:
+	subq    $200, %rsp
+        movaps  %xmm7, 160(%rsp)
+        movaps  %xmm6, 144(%rsp)
+        movaps  %xmm5, 128(%rsp)
+        movaps  %xmm4, 112(%rsp)
+        movaps  %xmm3, 96(%rsp)
+        movaps  %xmm2, 80(%rsp)
+        movaps  %xmm1, 64(%rsp)
+        movaps  %xmm0, 48(%rsp)
+        movq    %r9, 40(%rsp)
+        movq    %r8, 32(%rsp)
+        movq    %rcx, 24(%rsp)
+        movq    %rdx, 16(%rsp)
+        movq    %rsi, 8(%rsp)
+        leaq    (%rsp), %rax
+        movq    %rax, 192(%rsp)
+        leaq    208(%rsp), %rax
+        movq    %rax, 184(%rsp)
+        movl    $48, 180(%rsp)
+        movl    $8, 176(%rsp)
+        movl    176(%rsp), %eax
+        cmpl    $47, %eax
+        jbe     .LBB1_3 # bb
+.LBB1_1:        # bb3
+        movq    184(%rsp), %rcx
+        leaq    8(%rcx), %rax
+        movq    %rax, 184(%rsp)
+.LBB1_2:        # bb4
+        movl    (%rcx), %eax
+        addq    $200, %rsp
+        ret
+.LBB1_3:        # bb
+        movl    %eax, %ecx
+        addl    $8, %eax
+        addq    192(%rsp), %rcx
+        movl    %eax, 176(%rsp)
+        jmp     .LBB1_2 # bb4
+
+gcc 4.3 generates:
+	subq    $96, %rsp
+.LCFI0:
+        leaq    104(%rsp), %rax
+        movq    %rsi, -80(%rsp)
+        movl    $8, -120(%rsp)
+        movq    %rax, -112(%rsp)
+        leaq    -88(%rsp), %rax
+        movq    %rax, -104(%rsp)
+        movl    $8, %eax
+        cmpl    $48, %eax
+        jb      .L6
+        movq    -112(%rsp), %rdx
+        movl    (%rdx), %eax
+        addq    $96, %rsp
+        ret
+        .p2align 4,,10
+        .p2align 3
+.L6:
+        mov     %eax, %edx
+        addq    -104(%rsp), %rdx
+        addl    $8, %eax
+        movl    %eax, -120(%rsp)
+        movl    (%rdx), %eax
+        addq    $96, %rsp
+        ret
+
+and it gets compiled into this on x86:
+	pushl   %ebp
+        movl    %esp, %ebp
+        subl    $4, %esp
+        leal    12(%ebp), %eax
+        movl    %eax, -4(%ebp)
+        leal    16(%ebp), %eax
+        movl    %eax, -4(%ebp)
+        movl    12(%ebp), %eax
+        addl    $4, %esp
+        popl    %ebp
+        ret
+
+gcc 4.3 generates:
+	pushl   %ebp
+        movl    %esp, %ebp
+        movl    12(%ebp), %eax
+        popl    %ebp
+        ret
+
+//===---------------------------------------------------------------------===//
+
+Teach tblgen not to check bitconvert source type in some cases. This allows us
+to consolidate the following patterns in X86InstrMMX.td:
+
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+There are other cases in various td files.
+
+//===---------------------------------------------------------------------===//
+
+Take something like the following on x86-32:
+unsigned a(unsigned long long x, unsigned y) {return x % y;}
+
+We currently generate a libcall, but we really shouldn't: the expansion is
+shorter and likely faster than the libcall.  The expected code is something
+like the following:
+
+	movl	12(%ebp), %eax
+	movl	16(%ebp), %ecx
+	xorl	%edx, %edx
+	divl	%ecx
+	movl	8(%ebp), %eax
+	divl	%ecx
+	movl	%edx, %eax
+	ret
+
+A similar code sequence works for division.
+
+//===---------------------------------------------------------------------===//
+
+These should compile to the same code, but the latter codegens to useless
+instructions on X86. This may be a trivial dag combine (GCC PR7061):
+
+struct s1 { unsigned char a, b; };
+unsigned long f1(struct s1 x) {
+    return x.a + x.b;
+}
+struct s2 { unsigned a: 8, b: 8; };
+unsigned long f2(struct s2 x) {
+    return x.a + x.b;
+}
+
+//===---------------------------------------------------------------------===//
+
+We currently compile this:
+
+define i32 @func1(i32 %v1, i32 %v2) nounwind {
+entry:
+  %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
+  %sum = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %normal
+normal:
+  ret i32 %sum
+overflow:
+  call void @llvm.trap()
+  unreachable
+}
+declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
+declare void @llvm.trap()
+
+to:
+
+_func1:
+	movl	4(%esp), %eax
+	addl	8(%esp), %eax
+	jo	LBB1_2	## overflow
+LBB1_1:	## normal
+	ret
+LBB1_2:	## overflow
+	ud2
+
+it would be nice to produce "into" someday.
+
+//===---------------------------------------------------------------------===//
+
+This code:
+
+void vec_mpys1(int y[], const int x[], int scaler) {
+int i;
+for (i = 0; i < 150; i++)
+ y[i] += (((long long)scaler * (long long)x[i]) >> 31);
+}
+
+Compiles to this loop with GCC 3.x:
+
+.L5:
+	movl	%ebx, %eax
+	imull	(%edi,%ecx,4)
+	shrdl	$31, %edx, %eax
+	addl	%eax, (%esi,%ecx,4)
+	incl	%ecx
+	cmpl	$149, %ecx
+	jle	.L5
+
+llvm-gcc compiles it to the much uglier:
+
+LBB1_1:	## bb1
+	movl	24(%esp), %eax
+	movl	(%eax,%edi,4), %ebx
+	movl	%ebx, %ebp
+	imull	%esi, %ebp
+	movl	%ebx, %eax
+	mull	%ecx
+	addl	%ebp, %edx
+	sarl	$31, %ebx
+	imull	%ecx, %ebx
+	addl	%edx, %ebx
+	shldl	$1, %eax, %ebx
+	movl	20(%esp), %eax
+	addl	%ebx, (%eax,%edi,4)
+	incl	%edi
+	cmpl	$150, %edi
+	jne	LBB1_1	## bb1
+
+The issue is that we hoist the cast of "scaler" to long long outside of the
+loop, the value comes into the loop as two values, and
+RegsForValue::getCopyFromRegs doesn't know how to put an AssertSext on the
+constructed BUILD_PAIR which represents the cast value.
+
+//===---------------------------------------------------------------------===//
+
+Test instructions can be eliminated by using EFLAGS values from arithmetic
+instructions. This is currently not done for mul, and, or, xor, neg, shl,
+sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
+for read-modify-write instructions. It is also currently not done if the
+OF or CF flags are needed.
+
+The shift operators have the complication that when the shift count is
+zero, EFLAGS is not set, so they can only subsume a test instruction if
+the shift count is known to be non-zero. Also, using the EFLAGS value
+from a shift is apparently very slow on some x86 implementations.
+
+In read-modify-write instructions, the root node in the isel match is
+the store, and isel has no way for the use of the EFLAGS result of the
+arithmetic to be remapped to the new node.
+
+Add and subtract instructions set OF on signed overflow and CF on unsigned
+overflow, while test instructions always clear OF and CF. In order to
+replace a test with an add or subtract in a situation where OF or CF is
+needed, codegen must be able to prove that the operation cannot see
+signed or unsigned overflow, respectively.
+
+//===---------------------------------------------------------------------===//
+
+memcpy/memmove do not lower to SSE copies when possible.  A silly example is:
+define <16 x float> @foo(<16 x float> %A) nounwind {
+	%tmp = alloca <16 x float>, align 16
+	%tmp2 = alloca <16 x float>, align 16
+	store <16 x float> %A, <16 x float>* %tmp
+	%s = bitcast <16 x float>* %tmp to i8*
+	%s2 = bitcast <16 x float>* %tmp2 to i8*
+	call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
+	%R = load <16 x float>* %tmp2
+	ret <16 x float> %R
+}
+
+declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+which compiles to:
+
+_foo:
+	subl	$140, %esp
+	movaps	%xmm3, 112(%esp)
+	movaps	%xmm2, 96(%esp)
+	movaps	%xmm1, 80(%esp)
+	movaps	%xmm0, 64(%esp)
+	movl	60(%esp), %eax
+	movl	%eax, 124(%esp)
+	movl	56(%esp), %eax
+	movl	%eax, 120(%esp)
+	movl	52(%esp), %eax
+        <many many more 32-bit copies>
+      	movaps	(%esp), %xmm0
+	movaps	16(%esp), %xmm1
+	movaps	32(%esp), %xmm2
+	movaps	48(%esp), %xmm3
+	addl	$140, %esp
+	ret
+
+On Nehalem, it may even be cheaper to just use movups when unaligned than to
+fall back to lower-granularity chunks.
+
+//===---------------------------------------------------------------------===//
+
+Implement processor-specific optimizations for parity with GCC on these
+processors.  GCC does two optimizations:
+
+1. ix86_pad_returns inserts a noop before ret instructions if immediately
+   preceded by a conditional branch or is the target of a jump.
+2. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
+   code contains more than 3 branches.
+   
+The first one is done for all AMDs, Core2, and "Generic"
+The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
+  Core 2, and "Generic"
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int a(int x) { return (x & 127) > 31; }
+
+Current output:
+	movl	4(%esp), %eax
+	andl	$127, %eax
+	cmpl	$31, %eax
+	seta	%al
+	movzbl	%al, %eax
+	ret
+
+Ideal output:
+	xorl	%eax, %eax
+	testl	$96, 4(%esp)
+	setne	%al
+	ret
+
+This should definitely be done in instcombine, canonicalizing the range
+condition into a != condition.  We get this IR:
+
+define i32 @a(i32 %x) nounwind readnone {
+entry:
+	%0 = and i32 %x, 127		; <i32> [#uses=1]
+	%1 = icmp ugt i32 %0, 31		; <i1> [#uses=1]
+	%2 = zext i1 %1 to i32		; <i32> [#uses=1]
+	ret i32 %2
+}
+
+Instcombine prefers to strength reduce relational comparisons to equality
+comparisons when possible, this should be another case of that.  This could
+be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
+looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
+be redesigned to use ComputeMaskedBits and friends.
+
+
+//===---------------------------------------------------------------------===//
+Testcase:
+int x(int a) { return (a&0xf0)>>4; }
+
+Current output:
+	movl	4(%esp), %eax
+	shrl	$4, %eax
+	andl	$15, %eax
+	ret
+
+Ideal output:
+	movzbl	4(%esp), %eax
+	shrl	$4, %eax
+	ret
+
+//===---------------------------------------------------------------------===//
+
+Testcase:
+int x(int a) { return (a & 0x80) ? 0x100 : 0; }
+int y(int a) { return (a & 0x80) *2; }
+
+Current:
+	testl	$128, 4(%esp)
+	setne	%al
+	movzbl	%al, %eax
+	shll	$8, %eax
+	ret
+
+Better:
+	movl	4(%esp), %eax
+	addl	%eax, %eax
+	andl	$256, %eax
+	ret
+
+This is another general instcombine transformation that is profitable on all
+targets.  In LLVM IR, these functions look like this:
+
+define i32 @x(i32 %a) nounwind readnone {
+entry:
+	%0 = and i32 %a, 128
+	%1 = icmp eq i32 %0, 0
+	%iftmp.0.0 = select i1 %1, i32 0, i32 256
+	ret i32 %iftmp.0.0
+}
+
+define i32 @y(i32 %a) nounwind readnone {
+entry:
+	%0 = shl i32 %a, 1
+	%1 = and i32 %0, 256
+	ret i32 %1
+}
+
+Replacing an icmp+select with a shift should always be considered profitable in
+instcombine.
+
+//===---------------------------------------------------------------------===//
+
+Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
+properly.
+
+When the return value is not used (i.e. only care about the value in the
+memory), x86 does not have to use add to implement these. Instead, it can use
+add, sub, inc, dec instructions with the "lock" prefix.
+
+This is currently implemented using a bit of instruction selection trick. The
+issue is the target independent pattern produces one output and a chain and we
+want to map it into one that just output a chain. The current trick is to select
+it into a MERGE_VALUES with the first definition being an implicit_def. The
+proper solution is to add new ISD opcodes for the no-output variant. DAG
+combiner can then transform the node before it gets to target node selection.
+
+Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
+fact these instructions are identical to the non-lock versions. We need a way to
+add target specific information to target nodes and have this information
+carried over to machine instructions. Asm printer (or JIT) can use this
+information to add the "lock" prefix.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/TargetInfo/CMakeLists.txt b/lib/Target/X86/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..90be9f5
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMX86Info
+  X86TargetInfo.cpp
+  )
+
+add_dependencies(LLVMX86Info X86CodeGenTable_gen)
diff --git a/lib/Target/X86/TargetInfo/Makefile b/lib/Target/X86/TargetInfo/Makefile
new file mode 100644
index 0000000..9858e6a
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/X86/TargetInfo/Makefile ------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMX86Info
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
new file mode 100644
index 0000000..08d4d84
--- /dev/null
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -0,0 +1,23 @@
+//===-- X86TargetInfo.cpp - X86 Target Implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheX86_32Target, llvm::TheX86_64Target;
+
+extern "C" void LLVMInitializeX86TargetInfo() {
+  // Register both X86 flavors with the TargetRegistry; both support the JIT.
+  // 32-bit is registered first, then 64-bit, matching the original order.
+  RegisterTarget<Triple::x86, /*HasJIT=*/true>
+    RegX86_32(TheX86_32Target, "x86", "32-bit X86: Pentium-Pro and above");
+  RegisterTarget<Triple::x86_64, /*HasJIT=*/true>
+    RegX86_64(TheX86_64Target, "x86-64", "64-bit X86: EM64T and AMD64");
+}
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
new file mode 100644
index 0000000..1d17a05
--- /dev/null
+++ b/lib/Target/X86/X86.h
@@ -0,0 +1,75 @@
+//===-- X86.h - Top-level interface for X86 representation ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the x86
+// target library, as used by the LLVM JIT.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_X86_H
+#define TARGET_X86_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+
+class X86TargetMachine;
+class FunctionPass;
+class MachineCodeEmitter;
+class MCCodeEmitter;
+class JITCodeEmitter;
+class Target;
+class formatted_raw_ostream;
+
+/// createX86ISelDag - This pass converts a legalized DAG into a 
+/// X86-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *createX86ISelDag(X86TargetMachine &TM,
+                               CodeGenOpt::Level OptLevel);
+
+/// createX86FloatingPointStackifierPass - This function returns a pass which
+/// converts floating point register references and pseudo instructions into
+/// floating point stack references and physical instructions.
+///
+FunctionPass *createX86FloatingPointStackifierPass();
+
+/// createX87FPRegKillInserterPass - This function returns a pass which
+/// inserts FP_REG_KILL instructions where needed.
+///
+FunctionPass *createX87FPRegKillInserterPass();
+
+/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
+/// to the specified MCE object.
+FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
+                                          JITCodeEmitter &JCE);
+
+MCCodeEmitter *createHeinousX86MCCodeEmitter(const Target &, TargetMachine &TM);
+MCCodeEmitter *createX86_32MCCodeEmitter(const Target &, TargetMachine &TM);
+MCCodeEmitter *createX86_64MCCodeEmitter(const Target &, TargetMachine &TM);
+
+/// createX86EmitCodeToMemory - Returns a pass that converts a register
+/// allocated function into raw machine code in a dynamically
+/// allocated chunk of memory.
+///
+FunctionPass *createEmitX86CodeToMemory();
+
+extern Target TheX86_32Target, TheX86_64Target;
+
+} // End llvm namespace
+
+// Defines symbolic names for X86 registers.  This defines a mapping from
+// register name to register number.
+//
+#include "X86GenRegisterNames.inc"
+
+// Defines symbolic names for the X86 instructions.
+//
+#include "X86GenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
new file mode 100644
index 0000000..7919559
--- /dev/null
+++ b/lib/Target/X86/X86.td
@@ -0,0 +1,220 @@
+//===- X86.td - Target definition file for the Intel X86 ---*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This is a target description file for the Intel i386 architecture, referred to
+// here as the "X86" architecture.
+//
+//===----------------------------------------------------------------------===//
+
+// Get the target-independent interfaces which we are implementing...
+//
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// X86 Subtarget features.
+//===----------------------------------------------------------------------===//
+
+def FeatureCMOV    : SubtargetFeature<"cmov","HasCMov", "true",
+                                      "Enable conditional move instructions">;
+
+
+def FeatureMMX     : SubtargetFeature<"mmx","X86SSELevel", "MMX",
+                                      "Enable MMX instructions">;
+def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
+                                      "Enable SSE instructions",
+                                      // SSE codegen depends on cmovs, and all
+                                      // SSE1+ processors support them. 
+                                      [FeatureMMX, FeatureCMOV]>;
+def FeatureSSE2    : SubtargetFeature<"sse2", "X86SSELevel", "SSE2",
+                                      "Enable SSE2 instructions",
+                                      [FeatureSSE1]>;
+def FeatureSSE3    : SubtargetFeature<"sse3", "X86SSELevel", "SSE3",
+                                      "Enable SSE3 instructions",
+                                      [FeatureSSE2]>;
+def FeatureSSSE3   : SubtargetFeature<"ssse3", "X86SSELevel", "SSSE3",
+                                      "Enable SSSE3 instructions",
+                                      [FeatureSSE3]>;
+def FeatureSSE41   : SubtargetFeature<"sse41", "X86SSELevel", "SSE41",
+                                      "Enable SSE 4.1 instructions",
+                                      [FeatureSSSE3]>;
+def FeatureSSE42   : SubtargetFeature<"sse42", "X86SSELevel", "SSE42",
+                                      "Enable SSE 4.2 instructions",
+                                      [FeatureSSE41]>;
+def Feature3DNow   : SubtargetFeature<"3dnow", "X863DNowLevel", "ThreeDNow",
+                                      "Enable 3DNow! instructions">;
+def Feature3DNowA  : SubtargetFeature<"3dnowa", "X863DNowLevel", "ThreeDNowA",
+                                      "Enable 3DNow! Athlon instructions",
+                                      [Feature3DNow]>;
+// All x86-64 hardware has SSE2, but we don't mark SSE2 as an implied
+// feature, because SSE2 can be disabled (e.g. for compiling OS kernels)
+// without disabling 64-bit mode.
+def Feature64Bit   : SubtargetFeature<"64bit", "HasX86_64", "true",
+                                      "Support 64-bit instructions">;
+def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
+                                       "Bit testing of memory is slow">;
+def FeatureSSE4A   : SubtargetFeature<"sse4a", "HasSSE4A", "true",
+                                      "Support SSE 4a instructions">;
+
+def FeatureAVX     : SubtargetFeature<"avx", "HasAVX", "true",
+                                      "Enable AVX instructions">;
+def FeatureFMA3    : SubtargetFeature<"fma3", "HasFMA3", "true",
+                                     "Enable three-operand fused multiple-add">;
+def FeatureFMA4    : SubtargetFeature<"fma4", "HasFMA4", "true",
+                                      "Enable four-operand fused multiple-add">;
+def FeatureVectorUAMem : SubtargetFeature<"vector-unaligned-mem",
+                                          "HasVectorUAMem", "true",
+                 "Allow unaligned memory operands on vector/SIMD instructions">;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",         []>;
+def : Proc<"i386",            []>;
+def : Proc<"i486",            []>;
+def : Proc<"i586",            []>;
+def : Proc<"pentium",         []>;
+def : Proc<"pentium-mmx",     [FeatureMMX]>;
+def : Proc<"i686",            []>;
+def : Proc<"pentiumpro",      [FeatureCMOV]>;
+def : Proc<"pentium2",        [FeatureMMX, FeatureCMOV]>;
+def : Proc<"pentium3",        [FeatureSSE1]>;
+def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;
+def : Proc<"pentium4",        [FeatureSSE2]>;
+def : Proc<"x86-64",          [FeatureSSE2,   Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"yonah",           [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"prescott",        [FeatureSSE3, FeatureSlowBTMem]>;
+def : Proc<"nocona",          [FeatureSSE3,   Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"core2",           [FeatureSSSE3,  Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"penryn",          [FeatureSSE41,  Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"atom",            [FeatureSSE3,   Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"corei7",          [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"nehalem",         [FeatureSSE42,  Feature64Bit, FeatureSlowBTMem]>;
+// Sandy Bridge does not have FMA
+def : Proc<"sandybridge",     [FeatureSSE42,  FeatureAVX,   Feature64Bit]>;
+
+def : Proc<"k6",              [FeatureMMX]>;
+def : Proc<"k6-2",            [FeatureMMX,    Feature3DNow]>;
+def : Proc<"k6-3",            [FeatureMMX,    Feature3DNow]>;
+def : Proc<"athlon",          [FeatureMMX,    Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-tbird",    [FeatureMMX,    Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-4",        [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-xp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"athlon-mp",       [FeatureSSE1,   Feature3DNowA, FeatureSlowBTMem]>;
+def : Proc<"k8",              [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"opteron",         [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"athlon64",        [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"athlon-fx",       [FeatureSSE2,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"k8-sse3",         [FeatureSSE3,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"opteron-sse3",    [FeatureSSE3,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"athlon64-sse3",   [FeatureSSE3,   Feature3DNowA, Feature64Bit,
+                               FeatureSlowBTMem]>;
+def : Proc<"amdfam10",        [FeatureSSE3,   FeatureSSE4A,
+                               Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"barcelona",       [FeatureSSE3,   FeatureSSE4A,
+                               Feature3DNowA, Feature64Bit, FeatureSlowBTMem]>;
+// NOTE(review): the original lists Feature3DNowA twice in each feature list;
+// the duplicate is redundant and has been dropped (behavior is unchanged).
+def : Proc<"istanbul",        [Feature3DNowA, Feature64Bit, FeatureSSE4A]>;
+def : Proc<"shanghai",        [Feature3DNowA, Feature64Bit, FeatureSSE4A]>;
+
+def : Proc<"winchip-c6",      [FeatureMMX]>;
+def : Proc<"winchip2",        [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3",              [FeatureMMX, Feature3DNow]>;
+def : Proc<"c3-2",            [FeatureSSE1]>;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo {
+
+  // Define how we want to layout our TargetSpecific information field... This
+  // should be kept up-to-date with the fields in the X86InstrInfo.h file.
+  let TSFlagsFields = ["FormBits",
+                       "hasOpSizePrefix",
+                       "hasAdSizePrefix",
+                       "Prefix",
+                       "hasREX_WPrefix",
+                       "ImmTypeBits",
+                       "FPFormBits",
+                       "hasLockPrefix",
+                       "SegOvrBits",
+                       "Opcode"];
+  let TSFlagsShifts = [0,
+                       6,
+                       7,
+                       8,
+                       12,
+                       13,
+                       16,
+                       19,
+                       20,
+                       24];
+}
+
+//===----------------------------------------------------------------------===//
+// Calling Conventions
+//===----------------------------------------------------------------------===//
+
+include "X86CallingConv.td"
+
+
+//===----------------------------------------------------------------------===//
+// Assembly Printers
+//===----------------------------------------------------------------------===//
+
+// Currently the X86 assembly parser only supports ATT syntax.
+def ATTAsmParser : AsmParser {
+  string AsmParserClassName  = "ATTAsmParser";
+  int Variant = 0;
+
+  // Discard comments in assembly strings.
+  string CommentDelimiter = "#";
+
+  // Recognize hard coded registers.
+  string RegisterPrefix = "%";
+}
+
+// The X86 target supports two different syntaxes for emitting machine code.
+// This is controlled by the -x86-asm-syntax={att|intel}
+def ATTAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "ATTInstPrinter";
+  int Variant = 0;
+}
+def IntelAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "IntelInstPrinter";
+  int Variant = 1;
+}
+
+def X86 : Target {
+  // Information about the instructions...
+  let InstructionSet = X86InstrInfo;
+
+  let AssemblyParsers = [ATTAsmParser];
+
+  let AssemblyWriters = [ATTAsmWriter, IntelAsmWriter];
+}
diff --git a/lib/Target/X86/X86COFF.h b/lib/Target/X86/X86COFF.h
new file mode 100644
index 0000000..0a8e4e6
--- /dev/null
+++ b/lib/Target/X86/X86COFF.h
@@ -0,0 +1,95 @@
+//===--- X86COFF.h - Some definitions from COFF documentations ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file just defines some symbols found in COFF documentation. They are
+// used to emit function type information for COFF targets (Cygwin/Mingw32).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_H
+#define X86COFF_H
+
+namespace COFF 
+{
+/// Storage class tells where and what the symbol represents
+enum StorageClass {
+  C_EFCN =   -1,  ///< Physical end of function
+  C_NULL    = 0,  ///< No symbol
+  C_AUTO    = 1,  ///< Automatic variable
+  C_EXT     = 2,  ///< External symbol
+  C_STAT    = 3,  ///< Static
+  C_REG     = 4,  ///< Register variable
+  C_EXTDEF  = 5,  ///< External definition
+  C_LABEL   = 6,  ///< Label
+  C_ULABEL  = 7,  ///< Undefined label
+  C_MOS     = 8,  ///< Member of structure
+  C_ARG     = 9,  ///< Function argument
+  C_STRTAG  = 10, ///< Structure tag
+  C_MOU     = 11, ///< Member of union
+  C_UNTAG   = 12, ///< Union tag
+  C_TPDEF   = 13, ///< Type definition
+  C_USTATIC = 14, ///< Undefined static
+  C_ENTAG   = 15, ///< Enumeration tag
+  C_MOE     = 16, ///< Member of enumeration
+  C_REGPARM = 17, ///< Register parameter
+  C_FIELD   = 18, ///< Bit field
+
+  C_BLOCK  = 100, ///< ".bb" or ".eb" - beginning or end of block
+  C_FCN    = 101, ///< ".bf" or ".ef" - beginning or end of function
+  C_EOS    = 102, ///< End of structure
+  C_FILE   = 103, ///< File name
+  C_LINE   = 104, ///< Line number, reformatted as symbol
+  C_ALIAS  = 105, ///< Duplicate tag
+  C_HIDDEN = 106  ///< External symbol in dmert public lib
+};
+
+/// The type of the symbol. This is made up of a base type and a derived type.
+/// For example, pointer to int is "pointer to T" and "int"
+enum SymbolType {
+  T_NULL   = 0,  ///< No type info
+  T_ARG    = 1,  ///< Void function argument (only used by compiler)
+  T_VOID   = 1,  ///< The same as above. Just named differently in some specs.
+  T_CHAR   = 2,  ///< Character
+  T_SHORT  = 3,  ///< Short integer
+  T_INT    = 4,  ///< Integer
+  T_LONG   = 5,  ///< Long integer
+  T_FLOAT  = 6,  ///< Floating point
+  T_DOUBLE = 7,  ///< Double-precision floating point
+  T_STRUCT = 8,  ///< Structure
+  T_UNION  = 9,  ///< Union
+  T_ENUM   = 10, ///< Enumeration
+  T_MOE    = 11, ///< Member of enumeration
+  T_UCHAR  = 12, ///< Unsigned character
+  T_USHORT = 13, ///< Unsigned short
+  T_UINT   = 14, ///< Unsigned integer
+  T_ULONG  = 15  ///< Unsigned long
+};
+
+/// Derived type of symbol (combined with the base SymbolType via the
+/// masks/shift below).
+enum SymbolDerivedType {
+  DT_NON = 0, ///< No derived type
+  DT_PTR = 1, ///< Pointer to T
+  DT_FCN = 2, ///< Function returning T
+  DT_ARY = 3  ///< Array of T
+};
+
+/// Masks for extracting parts of type (octal literals: 017 = low 4 bits,
+/// 060 = bits 4-5).
+enum SymbolTypeMasks {
+  N_BTMASK = 017, ///< Mask for base type
+  N_TMASK  = 060  ///< Mask for derived type
+};
+
+/// Offsets of parts of type
+enum Shifts {
+  N_BTSHFT = 4 ///< Type is formed as (base + derived << N_BTSHFT)
+};
+
+}
+
+#endif // X86COFF_H
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
new file mode 100644
index 0000000..ea52795
--- /dev/null
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
@@ -0,0 +1,126 @@
+//===-- llvm/CodeGen/X86COFFMachineModuleInfo.cpp -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86COFFMachineModuleInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Constructor/destructor are trivial: the MMI reference is unused, and the
+// stub set and function-info map start out empty.
+X86COFFMachineModuleInfo::X86COFFMachineModuleInfo(const MachineModuleInfo &) {
+}
+X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
+}
+
+/// AddFunctionInfo - Record (or overwrite) the cached per-function info for F;
+/// DecorateCygMingName consults this map when decorating names.
+void X86COFFMachineModuleInfo::AddFunctionInfo(const Function *F,
+                                            const X86MachineFunctionInfo &Val) {
+  FunctionInfoMap[F] = Val;
+}
+
+
+
+/// calculateFunctionInfo - Build an X86MachineFunctionInfo recording the
+/// decoration style (StdCall/FastCall) and the total argument bytes the
+/// callee pops on return.  Each argument is rounded up to a 4-byte (DWORD)
+/// boundary.  For any other calling convention a default-constructed Info
+/// is returned (no decoration, no pop count).
+static X86MachineFunctionInfo calculateFunctionInfo(const Function *F,
+                                                    const TargetData &TD) {
+  X86MachineFunctionInfo Info;
+  uint64_t Size = 0;
+  
+  switch (F->getCallingConv()) {
+  case CallingConv::X86_StdCall:
+    Info.setDecorationStyle(StdCall);
+    break;
+  case CallingConv::X86_FastCall:
+    Info.setDecorationStyle(FastCall);
+    break;
+  default:
+    return Info;
+  }
+  
+  // NOTE(review): argNum starts at 1 -- paramHasAttr indices appear to be
+  // 1-based (0 reserved for the return value); confirm against Attribute API.
+  unsigned argNum = 1;
+  for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+       AI != AE; ++AI, ++argNum) {
+    const Type* Ty = AI->getType();
+    
+    // 'Dereference' type in case of byval parameter attribute
+    if (F->paramHasAttr(argNum, Attribute::ByVal))
+      Ty = cast<PointerType>(Ty)->getElementType();
+    
+    // Size should be aligned to DWORD boundary
+    Size += ((TD.getTypeAllocSize(Ty) + 3)/4)*4;
+  }
+  
+  // We're not supporting tooooo huge arguments :)
+  Info.setBytesToPopOnReturn((unsigned int)Size);
+  return Info;
+}
+
+
+/// DecorateCygMingName - Query FunctionInfoMap and use this information for
+/// various name decorations for Cygwin and MingW.  For stdcall/fastcall
+/// functions this appends an '@<bytes-popped>' suffix in place, and for
+/// fastcall additionally replaces (or prepends) a leading '_' with '@'.
+/// Declarations are also recorded in CygMingStubs for later type emission.
+void X86COFFMachineModuleInfo::DecorateCygMingName(SmallVectorImpl<char> &Name,
+                                                   const GlobalValue *GV,
+                                                   const TargetData &TD) {
+  const Function *F = dyn_cast<Function>(GV);
+  if (!F) return;
+  
+  // Save function name for later type emission.
+  if (F->isDeclaration())
+    CygMingStubs.insert(StringRef(Name.data(), Name.size()));
+  
+  // We don't want to decorate non-stdcall or non-fastcall functions right now
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC != CallingConv::X86_StdCall && CC != CallingConv::X86_FastCall)
+    return;
+  
+  const X86MachineFunctionInfo *Info;
+  
+  FMFInfoMap::const_iterator info_item = FunctionInfoMap.find(F);
+  if (info_item == FunctionInfoMap.end()) {
+    // Calculate apropriate function info and populate map
+    FunctionInfoMap[F] = calculateFunctionInfo(F, TD);
+    Info = &FunctionInfoMap[F];
+  } else {
+    Info = &info_item->second;
+  }
+  
+  if (Info->getDecorationStyle() == None) return;
+  const FunctionType *FT = F->getFunctionType();
+  
+  // "Pure" variadic functions do not receive @0 suffix.
+  // NOTE(review): the guard below also adds the suffix to vararg functions
+  // with zero (or only an sret) fixed parameters -- confirm this matches the
+  // intent of the comment above.
+  if (!FT->isVarArg() || FT->getNumParams() == 0 ||
+      (FT->getNumParams() == 1 && F->hasStructRetAttr()))
+    raw_svector_ostream(Name) << '@' << Info->getBytesToPopOnReturn();
+  
+  if (Info->getDecorationStyle() == FastCall) {
+    if (Name[0] == '_')
+      Name[0] = '@';
+    else
+      Name.insert(Name.begin(), '@');
+  }    
+}
+
+/// DecorateCygMingName - Query FunctionInfoMap and use this information for
+/// various name decorations for Cygwin and MingW.  This overload copies the
+/// symbol's name into a scratch buffer, decorates it via the
+/// SmallVectorImpl overload, and replaces Name with a (possibly new) symbol
+/// for the decorated string.
+void X86COFFMachineModuleInfo::DecorateCygMingName(MCSymbol *&Name,
+                                                   MCContext &Ctx,
+                                                   const GlobalValue *GV,
+                                                   const TargetData &TD) {
+  SmallString<128> NameStr(Name->getName().begin(), Name->getName().end());
+  DecorateCygMingName(NameStr, GV, TD);
+  
+  Name = Ctx.GetOrCreateSymbol(NameStr.str());
+}
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
new file mode 100644
index 0000000..0e2009e
--- /dev/null
+++ b/lib/Target/X86/X86COFFMachineModuleInfo.h
@@ -0,0 +1,68 @@
+//===-- llvm/CodeGen/X86COFFMachineModuleInfo.h -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an MMI implementation for X86 COFF (windows) targets.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86COFF_MACHINEMODULEINFO_H
+#define X86COFF_MACHINEMODULEINFO_H
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/StringSet.h"
+#include "X86MachineFunctionInfo.h"
+
+namespace llvm {
+  class X86MachineFunctionInfo;
+  class TargetData;
+  
+/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
+/// for X86 COFF targets.
+class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
+  // Names of declared (external) functions, recorded during name decoration
+  // for later type emission.
+  StringSet<> CygMingStubs;
+  
+  // We have to propagate some information about MachineFunction to
+  // AsmPrinter. It's ok, when we're printing the function, since we have
+  // access to MachineFunction and can get the appropriate MachineFunctionInfo.
+  // Unfortunately, this is not possible when we're printing reference to
+  // Function (e.g. calling it and so on). Even more, there is no way to get the
+  // corresponding MachineFunctions: it can even be not created at all. That's
+  // why we should use additional structure, when we're collecting all necessary
+  // information.
+  //
+  // This structure is using e.g. for name decoration for stdcall & fastcall'ed
+  // function, since we have to use arguments' size for decoration.
+  typedef std::map<const Function*, X86MachineFunctionInfo> FMFInfoMap;
+  FMFInfoMap FunctionInfoMap;
+  
+public:
+  X86COFFMachineModuleInfo(const MachineModuleInfo &);
+  ~X86COFFMachineModuleInfo();
+  
+  
+  /// DecorateCygMingName - Apply Cygwin/MingW stdcall/fastcall name
+  /// decoration; the MCSymbol overload rebuilds the symbol in Ctx.
+  void DecorateCygMingName(MCSymbol* &Name, MCContext &Ctx,
+                           const GlobalValue *GV, const TargetData &TD);
+  void DecorateCygMingName(SmallVectorImpl<char> &Name, const GlobalValue *GV,
+                           const TargetData &TD);
+  
+  /// AddFunctionInfo - Cache per-function info for later decoration queries.
+  void AddFunctionInfo(const Function *F, const X86MachineFunctionInfo &Val);
+  
+
+  // Iteration over the recorded stub names.
+  typedef StringSet<>::const_iterator stub_iterator;
+  stub_iterator stub_begin() const { return CygMingStubs.begin(); }
+  stub_iterator stub_end() const { return CygMingStubs.end(); }
+
+  
+};
+
+
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
new file mode 100644
index 0000000..12d3d04
--- /dev/null
+++ b/lib/Target/X86/X86CallingConv.td
@@ -0,0 +1,322 @@
+//===- X86CallingConv.td - Calling Conventions X86 32/64 ---*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This describes the calling conventions for the X86-32 and X86-64
+// architectures.
+//
+//===----------------------------------------------------------------------===//
+
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+/// F is spliced verbatim into a C++ predicate evaluated on the current
+/// function's X86Subtarget (e.g. F = "hasSSE2()").
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<X86Subtarget>().", F), A>;
+
+//===----------------------------------------------------------------------===//
+// Return Value Calling Conventions
+//===----------------------------------------------------------------------===//
+
+// Return-value conventions common to all X86 CC's.
+def RetCC_X86Common : CallingConv<[
+  // Scalar values are returned in AX first, then DX.  For i8, the ABI
+  // requires the values to be in AL and AH, however this code uses AL and DL
+  // instead. This is because using AH for the second register conflicts with
+  // the way LLVM does multiple return values -- a return of {i16,i8} would end
+  // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+  // for functions that return two i8 values are currently expected to pack the
+  // values into an i16 (which uses AX, and thus AL:AH).
+  CCIfType<[i8] , CCAssignToReg<[AL, DL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>,
+  CCIfType<[i64], CCAssignToReg<[RAX, RDX]>>,
+  
+  // Vector types are returned in XMM0 and XMM1, when they fit.  XMM2 and XMM3
+  // can only be used by ABI non-compliant code. If the target doesn't have XMM
+  // registers, it won't have vector types.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+  // MMX vector types are always returned in MM0. If the target doesn't have
+  // MM0, it doesn't support these vector types.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToReg<[MM0]>>,
+
+  // Long double types are always returned in ST0 (even with SSE); a second
+  // f80 value, if any, lands in ST1.
+  CCIfType<[f80], CCAssignToReg<[ST0, ST1]>>
+]>;
+
+// X86-32 C return-value convention.
+def RetCC_X86_32_C : CallingConv<[
+  // The X86-32 calling convention returns FP values in ST0, unless marked
+  // with "inreg" (used here to distinguish one kind of reg from another,
+  // weirdly; this is really the sse-regparm calling convention) in which
+  // case they use XMM0, otherwise it is the same as the common X86 calling
+  // conv.
+  CCIfInReg<CCIfSubtarget<"hasSSE2()",
+    CCIfType<[f32, f64], CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+  // FP values not caught above default to the x87 stack registers.
+  CCIfType<[f32,f64], CCAssignToReg<[ST0, ST1]>>,
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-32 FastCC return-value convention.
+def RetCC_X86_32_Fast : CallingConv<[
+  // The X86-32 fastcc returns 1, 2, or 3 FP values in XMM0-2 if the target has
+  // SSE2.
+  // This can happen when a float, 2 x float, or 3 x float vector is split by
+  // target lowering, and is returned in 1-3 sse regs.
+  CCIfType<[f32], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+  CCIfType<[f64], CCIfSubtarget<"hasSSE2()", CCAssignToReg<[XMM0,XMM1,XMM2]>>>,
+
+  // For integers, ECX can be used as an extra return register
+  // (on top of the AX/DX pairs used by the common convention).
+  CCIfType<[i8],  CCAssignToReg<[AL, DL, CL]>>,
+  CCIfType<[i16], CCAssignToReg<[AX, DX, CX]>>,
+  CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>,
+
+  // Otherwise, it is the same as the common X86 calling convention.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-64 C return-value convention.
+def RetCC_X86_64_C : CallingConv<[
+  // The X86-64 calling convention always returns FP values in XMM0.
+  CCIfType<[f32], CCAssignToReg<[XMM0, XMM1]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0, XMM1]>>,
+
+  // MMX vector types are always returned in XMM0 except for v1i64 which is
+  // returned in RAX. This disagrees with ABI documentation but is bug
+  // compatible with gcc.
+  CCIfType<[v1i64], CCAssignToReg<[RAX]>>,
+  CCIfType<[v8i8, v4i16, v2i32, v2f32], CCAssignToReg<[XMM0, XMM1]>>,
+  // Integers and everything else follow the common convention.
+  CCDelegateTo<RetCC_X86Common>
+]>;
+
+// X86-Win64 C return-value convention.
+def RetCC_X86_Win64_C : CallingConv<[
+  // The X86-Win64 calling convention always returns __m64 values in RAX
+  // (bitconverted to i64 so the common i64 rule assigns the GPR).
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCBitConvertToType<i64>>,
+
+  // And FP in XMM0 only.
+  CCIfType<[f32], CCAssignToReg<[XMM0]>>,
+  CCIfType<[f64], CCAssignToReg<[XMM0]>>,
+
+  // Otherwise, everything is the same as 'normal' X86-64 C CC.
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+
+// This is the root return-value convention for the X86-32 backend.
+// Dispatches purely on the IR-level calling convention of the function.
+def RetCC_X86_32 : CallingConv<[
+  // If FastCC, use RetCC_X86_32_Fast.
+  CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
+  // Otherwise, use RetCC_X86_32_C.
+  CCDelegateTo<RetCC_X86_32_C>
+]>;
+
+// This is the root return-value convention for the X86-64 backend.
+// Dispatches on the OS flavor of the subtarget.
+def RetCC_X86_64 : CallingConv<[
+  // Mingw64 and native Win64 use Win64 CC
+  CCIfSubtarget<"isTargetWin64()", CCDelegateTo<RetCC_X86_Win64_C>>,
+
+  // Otherwise, drop to normal X86-64 CC
+  CCDelegateTo<RetCC_X86_64_C>
+]>;
+
+// This is the return-value convention used for the entire X86 backend.
+// Splits on pointer width first; the per-width conventions refine further.
+def RetCC_X86 : CallingConv<[
+  CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
+  CCDelegateTo<RetCC_X86_32>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+
+def CC_X86_64_C : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // The first 6 v1i64 vector arguments are passed in GPRs on Darwin
+  // (bitconverted to i64 so the i64 rule below assigns the register).
+  CCIfType<[v1i64],
+            CCIfSubtarget<"isTargetDarwin()",
+            CCBitConvertToType<i64>>>,
+
+  // The first 6 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D, R9D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8 , R9 ]>>,
+
+  // The first 8 MMX (except for v1i64) vector arguments are passed in XMM
+  // registers on Darwin.
+  CCIfType<[v8i8, v4i16, v2i32, v2f32],
+            CCIfSubtarget<"isTargetDarwin()",
+            CCIfSubtarget<"hasSSE2()",
+            CCPromoteToType<v2i64>>>>,
+
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+            CCIfSubtarget<"hasSSE1()",
+            CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>,
+ 
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+  
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget. (NOTE(review): the 0,0 arguments appear to mean
+  // "target-dependent size/alignment" -- confirm against the TableGen docs.)
+  CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32], CCAssignToStack<8, 8>>
+]>;
+
+// Calling convention used on Win64
+def CC_X86_Win64_C : CallingConv<[
+  // FIXME: Handle byval stuff.
+  // FIXME: Handle varargs.
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // 128 bit vectors are passed by pointer
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
+
+  // The first 4 MMX vector arguments are passed in GPRs
+  // (bitconverted to i64 so the i64 rule below picks the register).
+  CCIfType<[v8i8, v4i16, v2i32, v1i64, v2f32],
+           CCBitConvertToType<i64>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  // Each GPR shadows the XMM register of the same slot, per the Win64 ABI.
+  CCIfType<[i32], CCAssignToRegWithShadow<[ECX , EDX , R8D , R9D ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+  CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8  , R9  ],
+                                          [XMM0, XMM1, XMM2, XMM3]>>,
+
+  // The first 4 FP/Vector arguments are passed in XMM registers,
+  // shadowing the GPR of the same slot.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+                                   [RCX , RDX , R8  , R9  ]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Long doubles get stack slots whose size and alignment depends on the
+  // subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 0>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// X86 C Calling Convention
+//===----------------------------------------------------------------------===//
+
+/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
+/// values are spilled on the stack, and the first 4 vector values go in XMM
+/// regs.  The per-CC conventions below delegate here after their own rules.
+def CC_X86_32_Common : CallingConv<[
+  // Handles byval parameters.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // The first 3 float or double arguments, if marked 'inreg' and if the call
+  // is not a vararg call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>>,
+
+  // The first 3 __m64 (except for v1i64) vector arguments are passed in mmx
+  // registers if the call is not a vararg call.
+  CCIfNotVarArg<CCIfType<[v8i8, v4i16, v2i32, v2f32],
+                CCAssignToReg<[MM0, MM1, MM2]>>>,
+
+  // Integer/Float values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
+  
+  // Doubles get 8-byte slots that are 4-byte aligned.
+  CCIfType<[f64], CCAssignToStack<8, 4>>,
+
+  // Long doubles get slots whose size depends on the subtarget.
+  CCIfType<[f80], CCAssignToStack<0, 4>>,
+
+  // The first 4 SSE vector arguments are passed in XMM registers.
+  CCIfNotVarArg<CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>>,
+
+  // Other SSE vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 4-byte aligned. They are
+  // passed in the parameter area.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 4>>]>;
+
+// X86-32 C (cdecl) argument convention.
+def CC_X86_32_C : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in ECX.
+  CCIfNest<CCAssignToReg<[ECX]>>,
+
+  // The first 3 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX, ECX]>>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+// X86-32 __fastcall argument convention.
+def CC_X86_32_FastCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
+
+// X86-32 LLVM-internal fastcc argument convention.
+def CC_X86_32_FastCC : CallingConv<[
+  // Handles byval parameters.  Note that we can't rely on the delegation
+  // to CC_X86_32_Common for this because that happens after code that
+  // puts arguments in registers.
+  CCIfByVal<CCPassByVal<4, 4>>,
+
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in EAX.
+  CCIfNest<CCAssignToReg<[EAX]>>,
+
+  // The first 2 integer arguments are passed in ECX/EDX
+  CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>,
+
+  // The first 3 float or double arguments, if the call is not a vararg
+  // call and if SSE2 is available, are passed in SSE registers.
+  CCIfNotVarArg<CCIfType<[f32,f64],
+                CCIfSubtarget<"hasSSE2()",
+                CCAssignToReg<[XMM0,XMM1,XMM2]>>>>,
+
+  // Doubles get 8-byte slots that are 8-byte aligned (stricter than the
+  // 4-byte alignment CC_X86_32_Common would give them).
+  CCIfType<[f64], CCAssignToStack<8, 8>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
new file mode 100644
index 0000000..f0bceb1
--- /dev/null
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -0,0 +1,1198 @@
+//===-- X86/X86CodeEmitter.cpp - Convert X86 code to machine code ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the pass that transforms the X86 machine instructions into
+// relocatable machine code.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "X86Relocations.h"
+#include "X86.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+STATISTIC(NumEmitted, "Number of machine instructions emitted");
+
+namespace {
+  /// Emitter - MachineFunction pass that translates X86 MachineInstrs into
+  /// bytes via the templated CodeEmitter interface (e.g. JITCodeEmitter),
+  /// recording relocations for addresses not known until later.
+  template<class CodeEmitter>
+  class Emitter : public MachineFunctionPass {
+    const X86InstrInfo  *II;      // Instruction descriptions (set per function)
+    const TargetData    *TD;      // Type sizes/alignments (set per function)
+    X86TargetMachine    &TM;
+    CodeEmitter         &MCE;     // Destination byte stream
+    intptr_t PICBaseOffset;       // Offset used for picrel relocation constants
+    bool Is64BitMode;
+    bool IsPIC;
+  public:
+    static char ID;
+    // This constructor defers II/TD/Is64BitMode setup to runOnMachineFunction.
+    explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
+      : MachineFunctionPass(&ID), II(0), TD(0), TM(tm), 
+      MCE(mce), PICBaseOffset(0), Is64BitMode(false),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+    // This constructor supplies them up front.
+    Emitter(X86TargetMachine &tm, CodeEmitter &mce,
+            const X86InstrInfo &ii, const TargetData &td, bool is64)
+      : MachineFunctionPass(&ID), II(&ii), TD(&td), TM(tm), 
+      MCE(mce), PICBaseOffset(0), Is64BitMode(is64),
+      IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
+
+    bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const {
+      return "X86 Machine Code Emitter";
+    }
+
+    void emitInstruction(const MachineInstr &MI,
+                         const TargetInstrDesc *Desc);
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+  private:
+    // Helpers for emitting addresses that need relocations.
+    void emitPCRelativeBlockAddress(MachineBasicBlock *MBB);
+    void emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+                           intptr_t Disp = 0, intptr_t PCAdj = 0,
+                           bool Indirect = false);
+    void emitExternalSymbolAddress(const char *ES, unsigned Reloc);
+    void emitConstPoolAddress(unsigned CPI, unsigned Reloc, intptr_t Disp = 0,
+                              intptr_t PCAdj = 0);
+    void emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                              intptr_t PCAdj = 0);
+
+    void emitDisplacementField(const MachineOperand *RelocOp, int DispVal,
+                               intptr_t Adj = 0, bool IsPCRel = true);
+
+    // Helpers for emitting instruction encoding fields.
+    void emitRegModRMByte(unsigned ModRMReg, unsigned RegOpcodeField);
+    void emitRegModRMByte(unsigned RegOpcodeField);
+    void emitSIBByte(unsigned SS, unsigned Index, unsigned Base);
+    void emitConstant(uint64_t Val, unsigned Size);
+
+    void emitMemModRMByte(const MachineInstr &MI,
+                          unsigned Op, unsigned RegOpcodeField,
+                          intptr_t PCAdj = 0);
+
+    unsigned getX86RegNum(unsigned RegNo) const;
+  };
+
+template<class CodeEmitter>
+  char Emitter<CodeEmitter>::ID = 0;
+} // end anonymous namespace.
+
+/// createX86JITCodeEmitterPass - Return a pass that emits the collected X86
+/// code to the specified JITCodeEmitter object.
+FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
+                                                JITCodeEmitter &JCE) {
+  return new Emitter<JITCodeEmitter>(TM, JCE);
+}
+
+// Emit every instruction of MF through MCE, retrying the whole function as
+// long as finishFunction() asks for it.
+template<class CodeEmitter>
+bool Emitter<CodeEmitter>::runOnMachineFunction(MachineFunction &MF) {
+ 
+  MCE.setModuleInfo(&getAnalysis<MachineModuleInfo>());
+  
+  // Refresh cached target state; the default constructor leaves these unset.
+  II = TM.getInstrInfo();
+  TD = TM.getTargetData();
+  Is64BitMode = TM.getSubtarget<X86Subtarget>().is64Bit();
+  IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+  
+  // NOTE(review): finishFunction() returning true appears to request
+  // re-emission (e.g. the output buffer was too small) -- hence the loop.
+  do {
+    DEBUG(dbgs() << "JITTing function '" 
+          << MF.getFunction()->getName() << "'\n");
+    MCE.startFunction(MF);
+    for (MachineFunction::iterator MBB = MF.begin(), E = MF.end(); 
+         MBB != E; ++MBB) {
+      MCE.StartMachineBasicBlock(MBB);
+      for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end();
+           I != E; ++I) {
+        const TargetInstrDesc &Desc = I->getDesc();
+        emitInstruction(*I, &Desc);
+        // MOVPC32r is basically a call plus a pop instruction.
+        if (Desc.getOpcode() == X86::MOVPC32r)
+          emitInstruction(*I, &II->get(X86::POP32r));
+        NumEmitted++;  // Keep track of the # of mi's emitted
+      }
+    }
+  } while (MCE.finishFunction(MF));
+
+  return false;
+}
+
+/// emitPCRelativeBlockAddress - This method keeps track of the information
+/// necessary to resolve the address of this block later and emits a dummy
+/// value.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitPCRelativeBlockAddress(MachineBasicBlock *MBB) {
+  // Remember where this reference was and where it is to so we can
+  // deal with it later.
+  MCE.addRelocation(MachineRelocation::getBB(MCE.getCurrentPCOffset(),
+                                             X86::reloc_pcrel_word, MBB));
+  // Placeholder word; patched once the MBB's address is known.
+  MCE.emitWordLE(0);
+}
+
+/// emitGlobalAddress - Emit the specified address to the code stream assuming
+/// this is part of a "take the address of a global" instruction.
+///
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitGlobalAddress(GlobalValue *GV, unsigned Reloc,
+                                intptr_t Disp /* = 0 */,
+                                intptr_t PCAdj /* = 0 */,
+                                bool Indirect /* = false */) {
+  // The relocation constant depends on the kind: PIC-relative relocations
+  // carry the PIC base offset, PC-relative ones the PC adjustment, and
+  // absolute ones the displacement itself.
+  intptr_t RelocCST = Disp;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MachineRelocation MR = Indirect
+    ? MachineRelocation::getIndirectSymbol(MCE.getCurrentPCOffset(), Reloc,
+                                           GV, RelocCST, false)
+    : MachineRelocation::getGV(MCE.getCurrentPCOffset(), Reloc,
+                               GV, RelocCST, false);
+  MCE.addRelocation(MR);
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(Disp);
+  else
+    MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitExternalSymbolAddress - Arrange for the address of an external symbol to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.  Emits a zero placeholder to be fixed up by the relocation.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitExternalSymbolAddress(const char *ES,
+                                                     unsigned Reloc) {
+  intptr_t RelocCST = (Reloc == X86::reloc_picrel_word) ? PICBaseOffset : 0;
+
+  // X86 never needs stubs because instruction selection will always pick
+  // an instruction sequence that is large enough to hold any address
+  // to a symbol.
+  // (see X86ISelLowering.cpp, near 2039: X86TargetLowering::LowerCall)
+  bool NeedStub = false;
+  MCE.addRelocation(MachineRelocation::getExtSym(MCE.getCurrentPCOffset(),
+                                                 Reloc, ES, RelocCST,
+                                                 0, NeedStub));
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(0);
+  else
+    MCE.emitWordLE(0);
+}
+
+/// emitConstPoolAddress - Arrange for the address of a constant pool entry
+/// to be emitted to the current location in the function, and allow it to be PC
+/// relative.
+///
+/// \param CPI   constant pool index being referenced.
+/// \param Reloc X86::reloc_* kind; picrel seeds the relocation constant with
+///              the PIC base offset, pcrel with \p PCAdj.
+/// \param Disp  displacement emitted inline; the relocated value is added to
+///              it when the relocation is resolved.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstPoolAddress(unsigned CPI, unsigned Reloc,
+                                   intptr_t Disp /* = 0 */,
+                                   intptr_t PCAdj /* = 0 */) {
+  intptr_t RelocCST = 0;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MCE.addRelocation(MachineRelocation::getConstPool(MCE.getCurrentPCOffset(),
+                                                    Reloc, CPI, RelocCST));
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(Disp);
+  else
+    MCE.emitWordLE((int32_t)Disp);
+}
+
+/// emitJumpTableAddress - Arrange for the address of a jump table to
+/// be emitted to the current location in the function, and allow it to be PC
+/// relative.
+///
+/// Mirrors emitConstPoolAddress, but for jump table index \p JTI and always
+/// emits a zero placeholder (the resolved address fully replaces it).
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitJumpTableAddress(unsigned JTI, unsigned Reloc,
+                                   intptr_t PCAdj /* = 0 */) {
+  intptr_t RelocCST = 0;
+  if (Reloc == X86::reloc_picrel_word)
+    RelocCST = PICBaseOffset;
+  else if (Reloc == X86::reloc_pcrel_word)
+    RelocCST = PCAdj;
+  MCE.addRelocation(MachineRelocation::getJumpTable(MCE.getCurrentPCOffset(),
+                                                    Reloc, JTI, RelocCST));
+  // The relocated value will be added to the displacement
+  if (Reloc == X86::reloc_absolute_dword)
+    MCE.emitDWordLE(0);
+  else
+    MCE.emitWordLE(0);
+}
+
+/// getX86RegNum - Map an LLVM virtual/physical register number to the 3-bit
+/// x86 register field value used in ModR/M and SIB bytes (delegates to
+/// X86RegisterInfo).
+template<class CodeEmitter>
+unsigned Emitter<CodeEmitter>::getX86RegNum(unsigned RegNo) const {
+  return X86RegisterInfo::getX86RegNum(RegNo);
+}
+
+/// ModRMByte - Assemble a ModR/M byte from its three fields:
+/// bits 7-6 = Mod, bits 5-3 = Reg/Opcode, bits 2-0 = R/M.
+inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                      unsigned RM) {
+  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+  return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+/// emitRegModRMByte - Emit a ModR/M byte with Mod=3 (register-direct
+/// addressing) where the R/M field names register \p ModRMReg.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned ModRMReg,
+                                            unsigned RegOpcodeFld){
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)));
+}
+
+/// emitRegModRMByte - Emit a register-direct ModR/M byte with R/M = 0; used
+/// by instructions (e.g. lfence/mfence) whose R/M field is fixed.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitRegModRMByte(unsigned RegOpcodeFld) {
+  MCE.emitByte(ModRMByte(3, RegOpcodeFld, 0));
+}
+
+/// emitSIBByte - Emit a SIB byte: bits 7-6 = scale (SS), bits 5-3 = index
+/// register, bits 2-0 = base register.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitSIBByte(unsigned SS, 
+                                       unsigned Index,
+                                       unsigned Base) {
+  // SIB byte is in the same format as the ModRMByte...
+  MCE.emitByte(ModRMByte(SS, Index, Base));
+}
+
+/// emitConstant - Emit the low \p Size bytes of \p Val to the code stream.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitConstant(uint64_t Val, unsigned Size) {
+  // Output the constant in little endian byte order...
+  for (unsigned i = 0; i != Size; ++i) {
+    MCE.emitByte(Val & 255);
+    Val >>= 8;
+  }
+}
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit 
+/// sign-extended field (i.e. round-trips through a signed char).
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+/// gvNeedsNonLazyPtr - Return true if the global-value operand \p GVOp must be
+/// referenced through a non-lazy-pointer stub instead of directly.
+static bool gvNeedsNonLazyPtr(const MachineOperand &GVOp,
+                              const TargetMachine &TM) {
+  // For Darwin-64, simulate the linktime GOT by using the same non-lazy-pointer
+  // mechanism as 32-bit mode.
+  if (TM.getSubtarget<X86Subtarget>().is64Bit() && 
+      !TM.getSubtarget<X86Subtarget>().isTargetDarwin())
+    return false;
+  
+  // Return true if this is a reference to a stub containing the address of the
+  // global, not the global itself.
+  return isGlobalStubReference(GVOp.getTargetFlags());
+}
+
+/// emitDisplacementField - Emit the 32-bit displacement of a memory operand.
+///
+/// \param RelocOp operand needing a relocation (global/symbol/constpool/JT),
+///                or null for a plain immediate displacement.
+/// \param DispVal immediate displacement used when \p RelocOp is null.
+/// \param Adj     PC adjustment forwarded to pcrel relocations.
+/// \param IsPCRel whether to prefer a pcrel relocation kind in 64-bit mode.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitDisplacementField(const MachineOperand *RelocOp,
+                                                 int DispVal,
+                                                 intptr_t Adj /* = 0 */,
+                                                 bool IsPCRel /* = true */) {
+  // If this is a simple integer displacement that doesn't require a relocation,
+  // emit it now.
+  if (!RelocOp) {
+    emitConstant(DispVal, 4);
+    return;
+  }
+
+  // Otherwise, this is something that requires a relocation.  Emit it as such
+  // now.
+  unsigned RelocType = Is64BitMode ?
+    (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+    : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+  if (RelocOp->isGlobal()) {
+    // In 64-bit static small code model, we could potentially emit absolute.
+    // But it's probably not beneficial. If the MCE supports using RIP directly
+    // do it, otherwise fallback to absolute (this is determined by IsPCRel). 
+    //  89 05 00 00 00 00     mov    %eax,0(%rip)  # PC-relative
+    //  89 04 25 00 00 00 00  mov    %eax,0x0      # Absolute
+    bool Indirect = gvNeedsNonLazyPtr(*RelocOp, TM);
+    emitGlobalAddress(RelocOp->getGlobal(), RelocType, RelocOp->getOffset(),
+                      Adj, Indirect);
+  } else if (RelocOp->isSymbol()) {
+    emitExternalSymbolAddress(RelocOp->getSymbolName(), RelocType);
+  } else if (RelocOp->isCPI()) {
+    emitConstPoolAddress(RelocOp->getIndex(), RelocType,
+                         RelocOp->getOffset(), Adj);
+  } else {
+    assert(RelocOp->isJTI() && "Unexpected machine operand!");
+    emitJumpTableAddress(RelocOp->getIndex(), RelocType, Adj);
+  }
+}
+
+/// emitMemModRMByte - Emit the ModR/M byte (and, when required, the SIB byte
+/// and displacement) for a 5-operand x86 memory reference starting at operand
+/// index \p Op: [Base, Scale, IndexReg, Disp, Segment].
+/// \p RegOpcodeField is placed in the Reg/Opcode field of the ModR/M byte;
+/// \p PCAdj is forwarded to pcrel displacement relocations.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
+                                            unsigned Op,unsigned RegOpcodeField,
+                                            intptr_t PCAdj) {
+  const MachineOperand &Op3 = MI.getOperand(Op+3);
+  int DispVal = 0;
+  const MachineOperand *DispForReloc = 0;
+  
+  // Figure out what sort of displacement we have to handle here.
+  if (Op3.isGlobal()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isSymbol()) {
+    DispForReloc = &Op3;
+  } else if (Op3.isCPI()) {
+    // Constant pool entries can be folded to a concrete address only when the
+    // MCE resolves addresses eagerly and we're in 32-bit non-PIC mode.
+    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getConstantPoolEntryAddress(Op3.getIndex());
+      DispVal += Op3.getOffset();
+    }
+  } else if (Op3.isJTI()) {
+    if (!MCE.earlyResolveAddresses() || Is64BitMode || IsPIC) {
+      DispForReloc = &Op3;
+    } else {
+      DispVal += MCE.getJumpTableEntryAddress(Op3.getIndex());
+    }
+  } else {
+    DispVal = Op3.getImm();
+  }
+
+  const MachineOperand &Base     = MI.getOperand(Op);
+  const MachineOperand &Scale    = MI.getOperand(Op+1);
+  const MachineOperand &IndexReg = MI.getOperand(Op+2);
+
+  unsigned BaseReg = Base.getReg();
+
+  // Indicate that the displacement will use a pcrel or absolute reference
+  // by default. MCEs able to resolve addresses on-the-fly use pcrel by default
+  // while others, unless explicitly asked to use RIP, use absolute references.
+  bool IsPCRel = MCE.earlyResolveAddresses() ? true : false;
+
+  // Is a SIB byte needed?
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can 
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 && 
+      // The SIB byte must be used if the base is ESP/RSP.
+      BaseReg != X86::ESP && BaseReg != X86::RSP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!Is64BitMode || BaseReg != 0)) {
+    if (BaseReg == 0 ||          // [disp32]     in X86-32 mode
+        BaseReg == X86::RIP) {   // [disp32+RIP] in X86-64 mode
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, 5));
+      emitDisplacementField(DispForReloc, DispVal, PCAdj, true);
+      return;
+    }
+    
+    unsigned BaseRegNo = getX86RegNum(BaseReg);
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
+      MCE.emitByte(ModRMByte(0, RegOpcodeField, BaseRegNo));
+      return;
+    }
+    
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (!DispForReloc && isDisp8(DispVal)) {
+      MCE.emitByte(ModRMByte(1, RegOpcodeField, BaseRegNo));
+      emitConstant(DispVal, 1);
+      return;
+    }
+    
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, BaseRegNo));
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+    return;
+  }
+  
+  // Otherwise we need a SIB byte, so start by outputting the ModR/M byte first.
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=4, to JUST get the index, scale, and displacement.
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+    ForceDisp32 = true;
+  } else if (DispForReloc) {
+    // Emit the normal disp32 encoding.
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+    ForceDisp32 = true;
+  } else if (DispVal == 0 && getX86RegNum(BaseReg) != N86::EBP) {
+    // Emit no displacement ModR/M byte
+    MCE.emitByte(ModRMByte(0, RegOpcodeField, 4));
+  } else if (isDisp8(DispVal)) {
+    // Emit the disp8 encoding...
+    MCE.emitByte(ModRMByte(1, RegOpcodeField, 4));
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else {
+    // Emit the normal disp32 encoding...
+    MCE.emitByte(ModRMByte(2, RegOpcodeField, 4));
+  }
+
+  // Calculate what the SS field value should be...
+  static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel 
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    emitSIBByte(SS, IndexRegNo, 5);
+  } else {
+    unsigned BaseRegNo = getX86RegNum(BaseReg);
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = getX86RegNum(IndexReg.getReg());
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    emitSIBByte(SS, IndexRegNo, BaseRegNo);
+  }
+
+  // Do we need to output a displacement?
+  if (ForceDisp8) {
+    emitConstant(DispVal, 1);
+  } else if (DispVal != 0 || ForceDisp32) {
+    emitDisplacementField(DispForReloc, DispVal, PCAdj, IsPCRel);
+  }
+}
+
+/// emitInstruction - Encode machine instruction \p MI (described by \p Desc)
+/// into the code stream: prefixes (lock/segment/rep/opsize/adsize/REX/0F
+/// escapes) first, then the opcode and operands according to the
+/// X86II::FormMask encoding form.  Only the misspelled diagnostic string
+/// ("psuedo" -> "pseudo") differs from the previous revision.
+template<class CodeEmitter>
+void Emitter<CodeEmitter>::emitInstruction(const MachineInstr &MI,
+                                           const TargetInstrDesc *Desc) {
+  DEBUG(dbgs() << MI);
+
+  MCE.processDebugLoc(MI.getDebugLoc(), true);
+
+  unsigned Opcode = Desc->Opcode;
+
+  // Emit the lock opcode prefix as needed.
+  if (Desc->TSFlags & X86II::LOCK)
+    MCE.emitByte(0xF0);
+
+  // Emit segment override opcode prefix as needed.
+  switch (Desc->TSFlags & X86II::SegOvrMask) {
+  case X86II::FS:
+    MCE.emitByte(0x64);
+    break;
+  case X86II::GS:
+    MCE.emitByte(0x65);
+    break;
+  default: llvm_unreachable("Invalid segment!");
+  case 0: break;  // No segment override!
+  }
+
+  // Emit the repeat opcode prefix as needed.
+  if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP)
+    MCE.emitByte(0xF3);
+
+  // Emit the operand size opcode prefix as needed.
+  if (Desc->TSFlags & X86II::OpSize)
+    MCE.emitByte(0x66);
+
+  // Emit the address size opcode prefix as needed.
+  if (Desc->TSFlags & X86II::AdSize)
+    MCE.emitByte(0x67);
+
+  bool Need0FPrefix = false;
+  switch (Desc->TSFlags & X86II::Op0Mask) {
+  case X86II::TB:  // Two-byte opcode prefix
+  case X86II::T8:  // 0F 38
+  case X86II::TA:  // 0F 3A
+    Need0FPrefix = true;
+    break;
+  case X86II::TF: // F2 0F 38
+    MCE.emitByte(0xF2);
+    Need0FPrefix = true;
+    break;
+  case X86II::REP: break; // already handled.
+  case X86II::XS:   // F3 0F
+    MCE.emitByte(0xF3);
+    Need0FPrefix = true;
+    break;
+  case X86II::XD:   // F2 0F
+    MCE.emitByte(0xF2);
+    Need0FPrefix = true;
+    break;
+  case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
+  case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
+    MCE.emitByte(0xD8+
+                 (((Desc->TSFlags & X86II::Op0Mask)-X86II::D8)
+                                   >> X86II::Op0Shift));
+    break; // Two-byte opcode prefix
+  default: llvm_unreachable("Invalid prefix!");
+  case 0: break;  // No prefix!
+  }
+
+  // Handle REX prefix.
+  if (Is64BitMode) {
+    if (unsigned REX = X86InstrInfo::determineREX(MI))
+      MCE.emitByte(0x40 | REX);
+  }
+
+  // 0x0F escape code must be emitted just before the opcode.
+  if (Need0FPrefix)
+    MCE.emitByte(0x0F);
+
+  switch (Desc->TSFlags & X86II::Op0Mask) {
+  case X86II::TF:    // F2 0F 38
+  case X86II::T8:    // 0F 38
+    MCE.emitByte(0x38);
+    break;
+  case X86II::TA:    // 0F 3A
+    MCE.emitByte(0x3A);
+    break;
+  }
+
+  // If this is a two-address instruction, skip one of the register operands.
+  unsigned NumOps = Desc->getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
+    ++CurOp;
+  else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+    // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+    --NumOps;
+
+  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(Desc->TSFlags);
+  switch (Desc->TSFlags & X86II::FormMask) {
+  default:
+    llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
+  case X86II::Pseudo:
+    // Remember the current PC offset, this is the PIC relocation
+    // base address.
+    switch (Opcode) {
+    default: 
+      llvm_unreachable("pseudo instructions should be removed before code"
+                       " emission");
+      break;
+    case TargetOpcode::INLINEASM:
+      // We allow inline assembler nodes with empty bodies - they can
+      // implicitly define registers, which is ok for JIT.
+      if (MI.getOperand(0).getSymbolName()[0])
+        llvm_report_error("JIT does not support inline asm!");
+      break;
+    case TargetOpcode::DBG_LABEL:
+    case TargetOpcode::EH_LABEL:
+    case TargetOpcode::GC_LABEL:
+      MCE.emitLabel(MI.getOperand(0).getImm());
+      break;
+    case TargetOpcode::IMPLICIT_DEF:
+    case TargetOpcode::KILL:
+    case X86::FP_REG_KILL:
+      break;
+    case X86::MOVPC32r: {
+      // This emits the "call" portion of this pseudo instruction.
+      MCE.emitByte(BaseOpcode);
+      emitConstant(0, X86II::getSizeOfImm(Desc->TSFlags));
+      // Remember PIC base.
+      PICBaseOffset = (intptr_t) MCE.getCurrentPCOffset();
+      X86JITInfo *JTI = TM.getJITInfo();
+      JTI->setPICBase(MCE.getCurrentPCValue());
+      break;
+    }
+    }
+    CurOp = NumOps;
+    break;
+  case X86II::RawFrm: {
+    MCE.emitByte(BaseOpcode);
+
+    if (CurOp == NumOps)
+      break;
+      
+    const MachineOperand &MO = MI.getOperand(CurOp++);
+
+    DEBUG(dbgs() << "RawFrm CurOp " << CurOp << "\n");
+    DEBUG(dbgs() << "isMBB " << MO.isMBB() << "\n");
+    DEBUG(dbgs() << "isGlobal " << MO.isGlobal() << "\n");
+    DEBUG(dbgs() << "isSymbol " << MO.isSymbol() << "\n");
+    DEBUG(dbgs() << "isImm " << MO.isImm() << "\n");
+
+    if (MO.isMBB()) {
+      emitPCRelativeBlockAddress(MO.getMBB());
+      break;
+    }
+    
+    if (MO.isGlobal()) {
+      emitGlobalAddress(MO.getGlobal(), X86::reloc_pcrel_word,
+                        MO.getOffset(), 0);
+      break;
+    }
+    
+    if (MO.isSymbol()) {
+      emitExternalSymbolAddress(MO.getSymbolName(), X86::reloc_pcrel_word);
+      break;
+    }
+
+    // FIXME: Only used by hackish MCCodeEmitter, remove when dead.
+    if (MO.isJTI()) {
+      emitJumpTableAddress(MO.getIndex(), X86::reloc_pcrel_word);
+      break;
+    }
+    
+    assert(MO.isImm() && "Unknown RawFrm operand!");
+    if (Opcode == X86::CALLpcrel32 || Opcode == X86::CALL64pcrel32) {
+      // Fix up immediate operand for pc relative calls.
+      intptr_t Imm = (intptr_t)MO.getImm();
+      Imm = Imm - MCE.getCurrentPCValue() - 4;
+      emitConstant(Imm, X86II::getSizeOfImm(Desc->TSFlags));
+    } else
+      emitConstant(MO.getImm(), X86II::getSizeOfImm(Desc->TSFlags));
+    break;
+  }
+      
+  case X86II::AddRegFrm: {
+    MCE.emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++).getReg()));
+    
+    if (CurOp == NumOps)
+      break;
+      
+    const MachineOperand &MO1 = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO1.isImm()) {
+      emitConstant(MO1.getImm(), Size);
+      break;
+    }
+    
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64ri64i32)
+      rt = X86::reloc_absolute_word;  // FIXME: add X86II flag?
+    // This should not occur on Darwin for relocatable objects.
+    if (Opcode == X86::MOV64ri)
+      rt = X86::reloc_absolute_dword;  // FIXME: add X86II flag?
+    if (MO1.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+                        Indirect);
+    } else if (MO1.isSymbol())
+      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+    else if (MO1.isCPI())
+      emitConstPoolAddress(MO1.getIndex(), rt);
+    else if (MO1.isJTI())
+      emitJumpTableAddress(MO1.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRMDestReg: {
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp+1).getReg()));
+    CurOp += 2;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(Desc->TSFlags));
+    break;
+  }
+  case X86II::MRMDestMem: {
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp,
+                     getX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands)
+                                  .getReg()));
+    CurOp +=  X86AddrNumOperands + 1;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(Desc->TSFlags));
+    break;
+  }
+
+  case X86II::MRMSrcReg:
+    MCE.emitByte(BaseOpcode);
+    emitRegModRMByte(MI.getOperand(CurOp+1).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    CurOp += 2;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(Desc->TSFlags));
+    break;
+
+  case X86II::MRMSrcMem: {
+    // FIXME: Maybe lea should have its own form?
+    int AddrOperands;
+    if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+        Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+      AddrOperands = X86AddrNumOperands - 1; // No segment register
+    else
+      AddrOperands = X86AddrNumOperands;
+
+    intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+      X86II::getSizeOfImm(Desc->TSFlags) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp+1, getX86RegNum(MI.getOperand(CurOp).getReg()),
+                     PCAdj);
+    CurOp += AddrOperands + 1;
+    if (CurOp != NumOps)
+      emitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(Desc->TSFlags));
+    break;
+  }
+
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r: {
+    MCE.emitByte(BaseOpcode);
+
+    // Special handling of lfence, mfence, monitor, and mwait.
+    if (Desc->getOpcode() == X86::LFENCE ||
+        Desc->getOpcode() == X86::MFENCE ||
+        Desc->getOpcode() == X86::MONITOR ||
+        Desc->getOpcode() == X86::MWAIT) {
+      emitRegModRMByte((Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+
+      switch (Desc->getOpcode()) {
+      default: break;
+      case X86::MONITOR:
+        MCE.emitByte(0xC8);
+        break;
+      case X86::MWAIT:
+        MCE.emitByte(0xC9);
+        break;
+      }
+    } else {
+      emitRegModRMByte(MI.getOperand(CurOp++).getReg(),
+                       (Desc->TSFlags & X86II::FormMask)-X86II::MRM0r);
+    }
+
+    if (CurOp == NumOps)
+      break;
+    
+    const MachineOperand &MO1 = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO1.isImm()) {
+      emitConstant(MO1.getImm(), Size);
+      break;
+    }
+    
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64ri32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO1.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+                        Indirect);
+    } else if (MO1.isSymbol())
+      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+    else if (MO1.isCPI())
+      emitConstPoolAddress(MO1.getIndex(), rt);
+    else if (MO1.isJTI())
+      emitJumpTableAddress(MO1.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    intptr_t PCAdj = (CurOp + X86AddrNumOperands != NumOps) ?
+      (MI.getOperand(CurOp+X86AddrNumOperands).isImm() ? 
+          X86II::getSizeOfImm(Desc->TSFlags) : 4) : 0;
+
+    MCE.emitByte(BaseOpcode);
+    emitMemModRMByte(MI, CurOp, (Desc->TSFlags & X86II::FormMask)-X86II::MRM0m,
+                     PCAdj);
+    CurOp += X86AddrNumOperands;
+
+    if (CurOp == NumOps)
+      break;
+    
+    const MachineOperand &MO = MI.getOperand(CurOp++);
+    unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
+    if (MO.isImm()) {
+      emitConstant(MO.getImm(), Size);
+      break;
+    }
+    
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64mi32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO, TM);
+      emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+                        Indirect);
+    } else if (MO.isSymbol())
+      emitExternalSymbolAddress(MO.getSymbolName(), rt);
+    else if (MO.isCPI())
+      emitConstPoolAddress(MO.getIndex(), rt);
+    else if (MO.isJTI())
+      emitJumpTableAddress(MO.getIndex(), rt);
+    break;
+  }
+
+  case X86II::MRMInitReg:
+    MCE.emitByte(BaseOpcode);
+    // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
+    emitRegModRMByte(MI.getOperand(CurOp).getReg(),
+                     getX86RegNum(MI.getOperand(CurOp).getReg()));
+    ++CurOp;
+    break;
+  }
+
+  if (!Desc->isVariadic() && CurOp != NumOps) {
+#ifndef NDEBUG
+    dbgs() << "Cannot encode all operands of: " << MI << "\n";
+#endif
+    llvm_unreachable(0);
+  }
+
+  MCE.processDebugLoc(MI.getDebugLoc(), false);
+}
+
+// Adapt the Emitter / CodeEmitter interfaces to MCCodeEmitter.
+//
+// FIXME: This is a total hack designed to allow work on llvm-mc to proceed
+// without being blocked on various cleanups needed to support a clean interface
+// to instruction encoding.
+//
+// Look away!
+
+#include "llvm/DerivedTypes.h"
+
+namespace {
+// MCSingleInstructionCodeEmitter - A throwaway MachineCodeEmitter that
+// captures the encoding of exactly one instruction into a fixed local buffer
+// and translates MachineRelocations into MCFixups against the operands of the
+// MCInst currently being encoded.
+class MCSingleInstructionCodeEmitter : public MachineCodeEmitter {
+  uint8_t Data[256];                    // Scratch buffer for one instruction.
+  const MCInst *CurrentInst;            // Instruction whose operands back fixups.
+  SmallVectorImpl<MCFixup> *FixupList;  // Where addRelocation appends fixups.
+
+public:
+  MCSingleInstructionCodeEmitter() { reset(0, 0); }
+
+  // Rebind the emitter to a new instruction/fixup list and rewind the buffer.
+  void reset(const MCInst *Inst, SmallVectorImpl<MCFixup> *Fixups) {
+    CurrentInst = Inst;
+    FixupList = Fixups;
+    BufferBegin = Data;
+    BufferEnd = array_endof(Data);
+    CurBufferPtr = Data;
+  }
+
+  // Return the bytes emitted so far as a StringRef view into Data.
+  StringRef str() {
+    return StringRef(reinterpret_cast<char*>(BufferBegin),
+                     CurBufferPtr - BufferBegin);
+  }
+
+  virtual void startFunction(MachineFunction &F) {}
+  virtual bool finishFunction(MachineFunction &F) { return false; }
+  virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) {}
+  virtual bool earlyResolveAddresses() const { return false; }
+  // Convert a relocation recorded by the legacy emitter into an MCFixup on
+  // the corresponding MCInst operand.
+  virtual void addRelocation(const MachineRelocation &MR) {
+    unsigned Offset = 0, OpIndex = 0, Kind = MR.getRelocationType();
+
+    // This form is only used in one case, for branches.
+    if (MR.isBasicBlock()) {
+      Offset = unsigned(MR.getMachineCodeOffset());
+      OpIndex = 0;
+    } else {
+      assert(MR.isJumpTableIndex() && "Unexpected relocation!");
+
+      Offset = unsigned(MR.getMachineCodeOffset());
+
+      // The operand index is encoded as the first byte of the fake operand.
+      OpIndex = MR.getJumpTableIndex();
+    }
+
+    MCOperand Op = CurrentInst->getOperand(OpIndex);
+    assert(Op.isExpr() && "FIXME: Not yet implemented!");
+    FixupList->push_back(MCFixup::Create(Offset, Op.getExpr(),
+                                     MCFixupKind(FirstTargetFixupKind + Kind)));
+  }
+  virtual void setModuleInfo(MachineModuleInfo* Info) {}
+
+  // Interface functions which should never get called in our usage.
+
+  virtual void emitLabel(uint64_t LabelID) {
+    assert(0 && "Unexpected code emitter call!");
+  }
+  virtual uintptr_t getConstantPoolEntryAddress(unsigned Index) const {
+    assert(0 && "Unexpected code emitter call!");
+    return 0;
+  }
+  virtual uintptr_t getJumpTableEntryAddress(unsigned Index) const {
+    assert(0 && "Unexpected code emitter call!");
+    return 0;
+  }
+  virtual uintptr_t getMachineBasicBlockAddress(MachineBasicBlock *MBB) const {
+    assert(0 && "Unexpected code emitter call!");
+    return 0;
+  }
+  virtual uintptr_t getLabelAddress(uint64_t LabelID) const {
+    assert(0 && "Unexpected code emitter call!");
+    return 0;
+  }
+};
+
+// X86MCCodeEmitter - Adapter that encodes MCInsts by converting them back to
+// MachineInstrs and running the legacy Emitter over them.  Self-described as a
+// temporary hack for llvm-mc bring-up.
+// Fix over previous revision: the constructor allocates DummyTD with new but
+// the destructor never freed it; 'delete DummyTD' is now added to the dtor to
+// plug the leak.
+class X86MCCodeEmitter : public MCCodeEmitter {
+  X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+
+private:
+  X86TargetMachine &TM;
+  llvm::Function *DummyF;               // Placeholder function owning DummyMF.
+  TargetData *DummyTD;                  // Empty layout fed to the Emitter.
+  mutable llvm::MachineFunction *DummyMF;
+  llvm::MachineBasicBlock *DummyMBB;    // Owned by DummyMF.
+  
+  MCSingleInstructionCodeEmitter *InstrEmitter;
+  Emitter<MachineCodeEmitter> *Emit;
+
+public:
+  X86MCCodeEmitter(X86TargetMachine &_TM) : TM(_TM) {
+    // Verily, thou shouldst avert thine eyes.
+    const llvm::FunctionType *FTy =
+      FunctionType::get(llvm::Type::getVoidTy(getGlobalContext()), false);
+    DummyF = Function::Create(FTy, GlobalValue::InternalLinkage);
+    DummyTD = new TargetData("");
+    DummyMF = new MachineFunction(DummyF, TM, 0);
+    DummyMBB = DummyMF->CreateMachineBasicBlock();
+
+    InstrEmitter = new MCSingleInstructionCodeEmitter();
+    Emit = new Emitter<MachineCodeEmitter>(TM, *InstrEmitter, 
+                                           *TM.getInstrInfo(),
+                                           *DummyTD, false);
+  }
+  ~X86MCCodeEmitter() {
+    delete Emit;
+    delete InstrEmitter;
+    delete DummyMF;
+    delete DummyF;
+    delete DummyTD;  // Was leaked: allocated in the ctor, never freed.
+  }
+
+  unsigned getNumFixupKinds() const {
+    return 5;
+  }
+
+  // Describe the five X86::reloc_* fixup kinds (offset 0; 32 bits each except
+  // the 64-bit absolute_dword).
+  MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    static MCFixupKindInfo Infos[] = {
+      { "reloc_pcrel_word", 0, 4 * 8 },
+      { "reloc_picrel_word", 0, 4 * 8 },
+      { "reloc_absolute_word", 0, 4 * 8 },
+      { "reloc_absolute_word_sext", 0, 4 * 8 },
+      { "reloc_absolute_dword", 0, 8 * 8 }
+    };
+
+    assert(Kind >= FirstTargetFixupKind && Kind < MaxTargetFixupKind &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+
+  // Append MCInst operand Start to Instr as a register operand; false if the
+  // operand is missing or not a register.
+  bool AddRegToInstr(const MCInst &MI, MachineInstr *Instr,
+                     unsigned Start) const {
+    if (Start + 1 > MI.getNumOperands())
+      return false;
+
+    const MCOperand &Op = MI.getOperand(Start);
+    if (!Op.isReg()) return false;
+
+    Instr->addOperand(MachineOperand::CreateReg(Op.getReg(), false));
+    return true;
+  }
+
+  // Append MCInst operand Start to Instr as an immediate; non-constant
+  // expressions are smuggled through as a fake jump-table index whose value
+  // is the operand index (decoded again in addRelocation).
+  bool AddImmToInstr(const MCInst &MI, MachineInstr *Instr,
+                     unsigned Start) const {
+    if (Start + 1 > MI.getNumOperands())
+      return false;
+
+    const MCOperand &Op = MI.getOperand(Start);
+    if (Op.isImm()) {
+      Instr->addOperand(MachineOperand::CreateImm(Op.getImm()));
+      return true;
+    }
+    if (!Op.isExpr())
+      return false;
+
+    const MCExpr *Expr = Op.getExpr();
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr)) {
+      Instr->addOperand(MachineOperand::CreateImm(CE->getValue()));
+      return true;
+    }
+
+    // Fake this as an external symbol to the code emitter to add a relocation
+    // entry we will recognize.
+    Instr->addOperand(MachineOperand::CreateJTI(Start, 0));
+    return true;
+  }
+
+  // LEA-style memory operand: base, scale, index, disp (no segment).
+  bool AddLMemToInstr(const MCInst &MI, MachineInstr *Instr,
+                     unsigned Start) const {
+    return (AddRegToInstr(MI, Instr, Start + 0) &&
+            AddImmToInstr(MI, Instr, Start + 1) &&
+            AddRegToInstr(MI, Instr, Start + 2) &&
+            AddImmToInstr(MI, Instr, Start + 3));
+  }
+
+  // Full memory operand: base, scale, index, disp, segment.
+  bool AddMemToInstr(const MCInst &MI, MachineInstr *Instr,
+                     unsigned Start) const {
+    return (AddRegToInstr(MI, Instr, Start + 0) &&
+            AddImmToInstr(MI, Instr, Start + 1) &&
+            AddRegToInstr(MI, Instr, Start + 2) &&
+            AddImmToInstr(MI, Instr, Start + 3) &&
+            AddRegToInstr(MI, Instr, Start + 4));
+  }
+
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const {
+    // Don't look yet!
+
+    // Convert the MCInst to a MachineInstr so we can (ab)use the regular
+    // emitter.
+    const X86InstrInfo &II = *TM.getInstrInfo();
+    const TargetInstrDesc &Desc = II.get(MI.getOpcode());    
+    MachineInstr *Instr = DummyMF->CreateMachineInstr(Desc, DebugLoc());
+    DummyMBB->push_back(Instr);
+
+    unsigned Opcode = MI.getOpcode();
+    unsigned NumOps = MI.getNumOperands();
+    unsigned CurOp = 0;
+    bool AddTied = false;
+    if (NumOps > 1 && Desc.getOperandConstraint(1, TOI::TIED_TO) != -1)
+      AddTied = true;
+    else if (NumOps > 2 && 
+             Desc.getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+      // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+      --NumOps;
+
+    bool OK = true;
+    switch (Desc.TSFlags & X86II::FormMask) {
+    case X86II::MRMDestReg:
+    case X86II::MRMSrcReg:
+      // Matching doesn't fill this in completely, we have to choose operand 0
+      // for a tied register.
+      OK &= AddRegToInstr(MI, Instr, CurOp++);
+      if (AddTied)
+        OK &= AddRegToInstr(MI, Instr, CurOp++ - 1);
+      OK &= AddRegToInstr(MI, Instr, CurOp++);
+      if (CurOp < NumOps)
+        OK &= AddImmToInstr(MI, Instr, CurOp);
+      break;
+
+    case X86II::RawFrm:
+      if (CurOp < NumOps) {
+        // Hack to make branches work.
+        if (!(Desc.TSFlags & X86II::ImmMask) &&
+            MI.getOperand(0).isExpr() &&
+            isa<MCSymbolRefExpr>(MI.getOperand(0).getExpr()))
+          Instr->addOperand(MachineOperand::CreateMBB(DummyMBB));
+        else
+          OK &= AddImmToInstr(MI, Instr, CurOp);
+      }
+      break;
+
+    case X86II::AddRegFrm:
+      // Matching doesn't fill this in completely, we have to choose operand 0
+      // for a tied register.
+      OK &= AddRegToInstr(MI, Instr, CurOp++);
+      if (AddTied)
+        OK &= AddRegToInstr(MI, Instr, CurOp++ - 1);
+      if (CurOp < NumOps)
+        OK &= AddImmToInstr(MI, Instr, CurOp);
+      break;
+
+    case X86II::MRM0r: case X86II::MRM1r:
+    case X86II::MRM2r: case X86II::MRM3r:
+    case X86II::MRM4r: case X86II::MRM5r:
+    case X86II::MRM6r: case X86II::MRM7r:
+      // Matching doesn't fill this in completely, we have to choose operand 0
+      // for a tied register.
+      OK &= AddRegToInstr(MI, Instr, CurOp++);
+      if (AddTied)
+        OK &= AddRegToInstr(MI, Instr, CurOp++ - 1);
+      if (CurOp < NumOps)
+        OK &= AddImmToInstr(MI, Instr, CurOp);
+      break;
+      
+    case X86II::MRM0m: case X86II::MRM1m:
+    case X86II::MRM2m: case X86II::MRM3m:
+    case X86II::MRM4m: case X86II::MRM5m:
+    case X86II::MRM6m: case X86II::MRM7m:
+      OK &= AddMemToInstr(MI, Instr, CurOp); CurOp += 5;
+      if (CurOp < NumOps)
+        OK &= AddImmToInstr(MI, Instr, CurOp);
+      break;
+
+    case X86II::MRMSrcMem:
+      // Matching doesn't fill this in completely, we have to choose operand 0
+      // for a tied register.
+      OK &= AddRegToInstr(MI, Instr, CurOp++);
+      if (AddTied)
+        OK &= AddRegToInstr(MI, Instr, CurOp++ - 1);
+      if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+          Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+        OK &= AddLMemToInstr(MI, Instr, CurOp);
+      else
+        OK &= AddMemToInstr(MI, Instr, CurOp);
+      break;
+
+    case X86II::MRMDestMem:
+      OK &= AddMemToInstr(MI, Instr, CurOp); CurOp += 5;
+      OK &= AddRegToInstr(MI, Instr, CurOp);
+      break;
+
+    default:
+    case X86II::MRMInitReg:
+    case X86II::Pseudo:
+      OK = false;
+      break;
+    }
+
+    if (!OK) {
+      dbgs() << "couldn't convert inst '";
+      MI.dump();
+      dbgs() << "' to machine instr:\n";
+      Instr->dump();
+    }
+
+    InstrEmitter->reset(&MI, &Fixups);
+    if (OK)
+      Emit->emitInstruction(*Instr, &Desc);
+    OS << InstrEmitter->str();
+
+    Instr->eraseFromParent();
+  }
+};
+}
+
+#include "llvm/Support/CommandLine.h"
+
+// Hidden developer flag: when set, createHeinousX86MCCodeEmitter returns the
+// new MC-based encoder instead of this file's legacy adapter.
+static cl::opt<bool> EnableNewEncoder("enable-new-x86-encoder",
+                                      cl::ReallyHidden);
+
+
+// Ok, now you can look.
+/// createHeinousX86MCCodeEmitter - Factory for the X86 MCCodeEmitter.  By
+/// default returns the legacy adapter above; with -enable-new-x86-encoder it
+/// returns the new 32- or 64-bit MC encoder, selected by pointer size.
+MCCodeEmitter *llvm::createHeinousX86MCCodeEmitter(const Target &T,
+                                                   TargetMachine &TM) {
+  
+  // FIXME: Remove the heinous one when the new one works.
+  if (EnableNewEncoder) {
+    if (TM.getTargetData()->getPointerSize() == 4)
+      return createX86_32MCCodeEmitter(T, TM);
+    return createX86_64MCCodeEmitter(T, TM);
+  }
+
+  return new X86MCCodeEmitter(static_cast<X86TargetMachine&>(TM));
+}
diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm
new file mode 100644
index 0000000..f321778
--- /dev/null
+++ b/lib/Target/X86/X86CompilationCallback_Win64.asm
@@ -0,0 +1,68 @@
+;;===-- X86CompilationCallback_Win64.asm - Implement Win64 JIT callback ---===
+;;
+;;                     The LLVM Compiler Infrastructure
+;;
+;; This file is distributed under the University of Illinois Open Source
+;; License. See LICENSE.TXT for details.
+;;
+;;===----------------------------------------------------------------------===
+;;
+;; This file implements the JIT interfaces for the X86 target.
+;;
+;;===----------------------------------------------------------------------===
+
+extrn X86CompilationCallback2: PROC
+
+.code
+; X86CompilationCallback - lazy-compilation stub entry point for Win64.
+; Preserves the four integer and four XMM argument registers, calls
+; X86CompilationCallback2(caller frame, return address), then restores all
+; saved state and returns.
+X86CompilationCallback proc
+    push    rbp
+
+    ; Save RSP.
+    mov     rbp, rsp
+
+    ; Save all int arg registers
+    ; WARNING: We cannot use register spill area - we're generating stubs by hands!
+    push    rcx
+    push    rdx
+    push    r8
+    push    r9
+
+    ; Align stack on 16-byte boundary.
+    and     rsp, -16
+
+    ; Save all XMM arg registers. Also allocate reg spill area.
+    sub     rsp, 96
+    movaps  [rsp   +32],  xmm0
+    movaps  [rsp+16+32],  xmm1
+    movaps  [rsp+32+32],  xmm2
+    movaps  [rsp+48+32],  xmm3
+
+    ; JIT callee
+
+    ; Pass prev frame and return address.
+    mov     rcx, rbp
+    mov     rdx, qword ptr [rbp+8]
+    call    X86CompilationCallback2
+
+    ; Restore all XMM arg registers.
+    movaps  xmm3, [rsp+48+32]
+    movaps  xmm2, [rsp+32+32]
+    movaps  xmm1, [rsp+16+32]
+    movaps  xmm0, [rsp   +32]
+
+    ; Restore RSP.
+    ; RBP was captured before the four integer pushes, so this also discards
+    ; the alignment padding and the XMM spill area in one step.
+    mov     rsp, rbp
+
+    ; Restore all int arg registers
+    ; Step RSP back down over the four 8-byte pushes so they can be popped.
+    sub     rsp, 32
+    pop     r9
+    pop     r8
+    pop     rdx
+    pop     rcx
+
+    ; Restore RBP.
+    pop     rbp
+    ret
+X86CompilationCallback endp
+
+End
diff --git a/lib/Target/X86/X86ELFWriterInfo.cpp b/lib/Target/X86/X86ELFWriterInfo.cpp
new file mode 100644
index 0000000..1597d2b
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.cpp
@@ -0,0 +1,153 @@
+//===-- X86ELFWriterInfo.cpp - ELF Writer Info for the X86 backend --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86ELFWriterInfo.h"
+#include "X86Relocations.h"
+#include "llvm/Function.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//  Implementation of the X86ELFWriterInfo class
+//===----------------------------------------------------------------------===//
+
+X86ELFWriterInfo::X86ELFWriterInfo(TargetMachine &TM)
+  : TargetELFWriterInfo(TM) {
+  // Pick the ELF e_machine value from the target's pointer width.
+  EMachine = TM.getTargetData()->getPointerSizeInBits() == 64 ? EM_X86_64
+                                                              : EM_386;
+}
+
+X86ELFWriterInfo::~X86ELFWriterInfo() {}
+
+unsigned X86ELFWriterInfo::getRelocationType(unsigned MachineRelTy) const {
+  // Map the object-code-independent machine relocation onto the concrete
+  // ELF relocation type for the current (32- or 64-bit) object format.
+  if (is64Bit) {
+    switch(MachineRelTy) {
+    case X86::reloc_pcrel_word:
+      return R_X86_64_PC32;
+    case X86::reloc_absolute_word:
+      return R_X86_64_32;
+    case X86::reloc_absolute_word_sext:
+      return R_X86_64_32S;
+    case X86::reloc_absolute_dword:
+      return R_X86_64_64;
+    case X86::reloc_picrel_word:
+    default:
+      llvm_unreachable("unknown x86_64 machine relocation type");
+    }
+  } else {
+    switch(MachineRelTy) {
+    case X86::reloc_pcrel_word:
+      return R_386_PC32;
+    case X86::reloc_absolute_word:
+      return R_386_32;
+    case X86::reloc_absolute_word_sext:
+    case X86::reloc_absolute_dword:
+    case X86::reloc_picrel_word:
+    default:
+      llvm_unreachable("unknown x86 machine relocation type");
+    }
+  }
+  // Not reached; every unhandled case hits llvm_unreachable above.
+  return 0;
+}
+
+long int X86ELFWriterInfo::getDefaultAddendForRelTy(unsigned RelTy,
+                                                    long int Modifier) const {
+  // PC-relative words resolve relative to the end of the 4-byte field, hence
+  // the -4 correction; absolute relocations use the modifier unchanged.
+  if (is64Bit) {
+    switch(RelTy) {
+    case R_X86_64_PC32: return Modifier - 4;
+    case R_X86_64_32:
+    case R_X86_64_32S:
+    case R_X86_64_64:
+      return Modifier;
+    default:
+      llvm_unreachable("unknown x86_64 relocation type");
+    }
+  } else {
+    switch(RelTy) {
+      case R_386_PC32: return Modifier - 4;
+      case R_386_32: return Modifier;
+    default:
+      llvm_unreachable("unknown x86 relocation type");
+    }
+  }
+  // Not reached.
+  return 0;
+}
+
+unsigned X86ELFWriterInfo::getRelocationTySize(unsigned RelTy) const {
+  // Width in bits of the field patched by each relocation type.
+  if (is64Bit) {
+    switch(RelTy) {
+      case R_X86_64_PC32:
+      case R_X86_64_32:
+      case R_X86_64_32S:
+        return 32;
+      case R_X86_64_64:
+        return 64;
+    default:
+      llvm_unreachable("unknown x86_64 relocation type");
+    }
+  } else {
+    switch(RelTy) {
+      case R_386_PC32:
+      case R_386_32:
+        return 32;
+    default:
+      llvm_unreachable("unknown x86 relocation type");
+    }
+  }
+  // Not reached.
+  return 0;
+}
+
+bool X86ELFWriterInfo::isPCRelativeRel(unsigned RelTy) const {
+  // Only the *_PC32 forms are PC-relative; all absolute forms are not.
+  if (is64Bit) {
+    switch(RelTy) {
+      case R_X86_64_PC32:
+        return true;
+      case R_X86_64_32:
+      case R_X86_64_32S:
+      case R_X86_64_64:
+        return false;
+    default:
+      llvm_unreachable("unknown x86_64 relocation type");
+    }
+  } else {
+    switch(RelTy) {
+      case R_386_PC32:
+        return true;
+      case R_386_32:
+        return false;
+    default:
+      llvm_unreachable("unknown x86 relocation type");
+    }
+  }
+  // Not reached.  Was `return 0;` — use the bool literal in a bool function.
+  return false;
+}
+
+unsigned X86ELFWriterInfo::getAbsoluteLabelMachineRelTy() const {
+  // 64-bit targets reference absolute labels with a full 8-byte relocation;
+  // 32-bit targets use a 4-byte one.
+  if (is64Bit)
+    return X86::reloc_absolute_dword;
+  return X86::reloc_absolute_word;
+}
+
+long int X86ELFWriterInfo::computeRelocation(unsigned SymOffset,
+                                             unsigned RelOffset,
+                                             unsigned RelTy) const {
+  // A PC-relative word resolves to the distance from the end of the 4-byte
+  // relocated field to the symbol.
+  if (RelTy == R_X86_64_PC32 || RelTy == R_386_PC32)
+    return SymOffset - (RelOffset + 4);
+
+  // Was `assert("...")`: asserting a string literal is always true, so the
+  // check could never fire.  Assert on failure explicitly instead.
+  assert(0 && "computeRelocation unknown for this relocation type");
+  return 0;
+}
diff --git a/lib/Target/X86/X86ELFWriterInfo.h b/lib/Target/X86/X86ELFWriterInfo.h
new file mode 100644
index 0000000..342e6e6
--- /dev/null
+++ b/lib/Target/X86/X86ELFWriterInfo.h
@@ -0,0 +1,76 @@
+//===-- X86ELFWriterInfo.h - ELF Writer Info for X86 ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements ELF writer information for the X86 backend.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_ELF_WRITER_INFO_H
+#define X86_ELF_WRITER_INFO_H
+
+#include "llvm/Target/TargetELFWriterInfo.h"
+
+namespace llvm {
+
+  class X86ELFWriterInfo : public TargetELFWriterInfo {
+
+    // ELF Relocation types for X86
+    enum X86RelocationType {
+      R_386_NONE = 0,
+      R_386_32   = 1,
+      R_386_PC32 = 2
+    };
+
+    // ELF Relocation types for X86_64
+    enum X86_64RelocationType {
+      R_X86_64_NONE = 0,
+      R_X86_64_64   = 1,
+      R_X86_64_PC32 = 2,
+      R_X86_64_32   = 10,
+      R_X86_64_32S  = 11,
+      R_X86_64_PC64 = 24
+    };
+
+  public:
+    X86ELFWriterInfo(TargetMachine &TM);
+    virtual ~X86ELFWriterInfo();
+
+    /// getRelocationType - Returns the target specific ELF Relocation type.
+    /// 'MachineRelTy' contains the object code independent relocation type
+    virtual unsigned getRelocationType(unsigned MachineRelTy) const;
+
+    /// hasRelocationAddend - True if the target uses an addend in the
+    /// ELF relocation entry.
+    virtual bool hasRelocationAddend() const { return is64Bit ? true : false; }
+
+    /// getDefaultAddendForRelTy - Gets the default addend value for a
+    /// relocation entry based on the target ELF relocation type.
+    virtual long int getDefaultAddendForRelTy(unsigned RelTy,
+                                              long int Modifier = 0) const;
+
+    /// getRelTySize - Returns the size of relocatable field in bits
+    virtual unsigned getRelocationTySize(unsigned RelTy) const;
+
+    /// isPCRelativeRel - True if the relocation type is pc relative
+    virtual bool isPCRelativeRel(unsigned RelTy) const;
+
+    /// getAbsoluteLabelMachineRelTy - Returns the machine relocation type
+    /// used to reference an absolute label, e.g. a jumptable entry.
+    virtual unsigned getAbsoluteLabelMachineRelTy() const;
+
+    /// computeRelocation - Some relocatable fields could be relocated
+    /// directly, avoiding the relocation symbol emission, compute the
+    /// final relocation value for this symbol.
+    virtual long int computeRelocation(unsigned SymOffset, unsigned RelOffset,
+                                       unsigned RelTy) const;
+  };
+
+} // end llvm namespace
+
+#endif // X86_ELF_WRITER_INFO_H
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
new file mode 100644
index 0000000..ea398e9
--- /dev/null
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -0,0 +1,1717 @@
+//===-- X86FastISel.cpp - X86 FastISel implementation ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86-specific support for the FastISel class. Much
+// of the target-specific code is generated by tablegen in the file
+// X86GenFastISel.inc, which is #included here.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CallingConv.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Target/TargetOptions.h"
+using namespace llvm;
+
+namespace {
+  
+class X86FastISel : public FastISel {
+  /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const X86Subtarget *Subtarget;
+
+  /// StackPtr - Register used as the stack pointer.
+  ///
+  unsigned StackPtr;
+
+  /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 
+  /// floating point ops.
+  /// When SSE is available, use it for f32 operations.
+  /// When SSE2 is available, use it for f64 operations.
+  bool X86ScalarSSEf64;
+  bool X86ScalarSSEf32;
+
+public:
+  explicit X86FastISel(MachineFunction &mf,
+                       MachineModuleInfo *mmi,
+                       DwarfWriter *dw,
+                       DenseMap<const Value *, unsigned> &vm,
+                       DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+                       DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+                       , SmallSet<Instruction*, 8> &cil
+#endif
+                       )
+    : FastISel(mf, mmi, dw, vm, bm, am
+#ifndef NDEBUG
+               , cil
+#endif
+               ) {
+    // Cache the subtarget facts consulted throughout instruction selection.
+    Subtarget = &TM.getSubtarget<X86Subtarget>();
+    StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+    X86ScalarSSEf64 = Subtarget->hasSSE2();
+    X86ScalarSSEf32 = Subtarget->hasSSE1();
+  }
+
+  virtual bool TargetSelectInstruction(Instruction *I);
+
+// Tablegen-generated fast-path selection routines (see file header comment).
+#include "X86GenFastISel.inc"
+
+private:
+  bool X86FastEmitCompare(Value *LHS, Value *RHS, EVT VT);
+  
+  bool X86FastEmitLoad(EVT VT, const X86AddressMode &AM, unsigned &RR);
+
+  bool X86FastEmitStore(EVT VT, Value *Val,
+                        const X86AddressMode &AM);
+  bool X86FastEmitStore(EVT VT, unsigned Val,
+                        const X86AddressMode &AM);
+
+  bool X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT,
+                         unsigned &ResultReg);
+  
+  bool X86SelectAddress(Value *V, X86AddressMode &AM);
+  bool X86SelectCallAddress(Value *V, X86AddressMode &AM);
+
+  bool X86SelectLoad(Instruction *I);
+  
+  bool X86SelectStore(Instruction *I);
+
+  bool X86SelectCmp(Instruction *I);
+
+  bool X86SelectZExt(Instruction *I);
+
+  bool X86SelectBranch(Instruction *I);
+
+  bool X86SelectShift(Instruction *I);
+
+  bool X86SelectSelect(Instruction *I);
+
+  bool X86SelectTrunc(Instruction *I);
+ 
+  bool X86SelectFPExt(Instruction *I);
+  bool X86SelectFPTrunc(Instruction *I);
+
+  bool X86SelectExtractValue(Instruction *I);
+
+  bool X86VisitIntrinsicCall(IntrinsicInst &I);
+  bool X86SelectCall(Instruction *I);
+
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isTailCall = false);
+
+  const X86InstrInfo *getInstrInfo() const {
+    return getTargetMachine()->getInstrInfo();
+  }
+  const X86TargetMachine *getTargetMachine() const {
+    return static_cast<const X86TargetMachine *>(&TM);
+  }
+
+  unsigned TargetMaterializeConstant(Constant *C);
+
+  unsigned TargetMaterializeAlloca(AllocaInst *C);
+
+  /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+  /// computed in an SSE register, not on the X87 floating point stack.
+  bool isScalarFPTypeInSSEReg(EVT VT) const {
+    return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 is when SSE2
+      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 is when SSE1
+  }
+
+  bool isTypeLegal(const Type *Ty, EVT &VT, bool AllowI1 = false);
+};
+  
+} // end anonymous namespace.
+
+bool X86FastISel::isTypeLegal(const Type *Ty, EVT &VT, bool AllowI1) {
+  VT = TLI.getValueType(Ty, /*HandleUnknown=*/true);
+  // Bail on anything we cannot map to a simple machine value type.
+  if (VT == MVT::Other || !VT.isSimple())
+    return false;
+
+  // Floating point is only handled when it lives in SSE registers; x87
+  // (which all f80 values, and f32/f64 without SSE/SSE2, would need)
+  // requires additional work.
+  if ((VT == MVT::f32 && !X86ScalarSSEf32) ||
+      (VT == MVT::f64 && !X86ScalarSSEf64) ||
+      VT == MVT::f80)
+    return false;
+
+  // Accept i1 when the caller allows it; otherwise only legal types.  The
+  // instruction selector on x86-32 also contains the 64-bit instructions
+  // from x86-64 under the assumption i64 won't appear if unsupported.
+  if (AllowI1 && VT == MVT::i1)
+    return true;
+  return TLI.isTypeLegal(VT);
+}
+
+#include "X86GenCallingConv.inc"
+
+/// CCAssignFnForCall - Selects the correct CCAssignFn for a given calling
+/// convention.
+CCAssignFn *X86FastISel::CCAssignFnForCall(CallingConv::ID CC,
+                                           bool isTailCall) {
+  // NOTE: the parameter was misspelled "isTaillCall" vs. the declaration's
+  // "isTailCall"; it is currently unused in this definition.
+  // On x86-64 the assigner is fixed by the OS ABI, not by CC.
+  if (Subtarget->is64Bit()) {
+    if (Subtarget->isTargetWin64())
+      return CC_X86_Win64_C;
+    return CC_X86_64_C;
+  }
+
+  // 32-bit: pick the convention-specific assigner, defaulting to C.
+  if (CC == CallingConv::X86_FastCall)
+    return CC_X86_32_FastCall;
+  if (CC == CallingConv::Fast)
+    return CC_X86_32_FastCC;
+  return CC_X86_32_C;
+}
+
+/// X86FastEmitLoad - Emit a machine instruction to load a value of type VT.
+/// The address is either pre-computed, i.e. Ptr, or a GlobalAddress, i.e. GV.
+/// Return true and the result register by reference if it is possible.
+bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
+                                  unsigned &ResultReg) {
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return false;
+  case MVT::i1:
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = X86::GR8RegisterClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = X86::GR16RegisterClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = X86::GR32RegisterClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = X86::GR64RegisterClass;
+    break;
+  case MVT::f32:
+    // SSE register load when available, x87 load otherwise.
+    if (Subtarget->hasSSE1()) {
+      Opc = X86::MOVSSrm;
+      RC  = X86::FR32RegisterClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = X86::RFP32RegisterClass;
+    }
+    break;
+  case MVT::f64:
+    if (Subtarget->hasSSE2()) {
+      Opc = X86::MOVSDrm;
+      RC  = X86::FR64RegisterClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = X86::RFP64RegisterClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return false;
+  }
+
+  // Emit the load from the computed address into a fresh virtual register.
+  ResultReg = createResultReg(RC);
+  addFullAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+  return true;
+}
+
+/// X86FastEmitStore - Emit a machine instruction to store a value Val of
+/// type VT. The address is either pre-computed, consisted of a base ptr, Ptr
+/// and a displacement offset, or a GlobalAddress,
+/// i.e. V. Return true if it is possible.
+bool
+X86FastISel::X86FastEmitStore(EVT VT, unsigned Val,
+                              const X86AddressMode &AM) {
+  // Get opcode and regclass of the output for the given store instruction.
+  unsigned Opc = 0;
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f80: // No f80 support yet.
+  default: return false;
+  case MVT::i1: {
+    // Mask out all but lowest bit.
+    unsigned AndResult = createResultReg(X86::GR8RegisterClass);
+    BuildMI(MBB, DL,
+            TII.get(X86::AND8ri), AndResult).addReg(Val).addImm(1);
+    Val = AndResult;
+  }
+  // FALLTHROUGH, handling i1 as i8.
+  case MVT::i8:  Opc = X86::MOV8mr;  break;
+  case MVT::i16: Opc = X86::MOV16mr; break;
+  case MVT::i32: Opc = X86::MOV32mr; break;
+  case MVT::i64: Opc = X86::MOV64mr; break; // Must be in x86-64 mode.
+  case MVT::f32:
+    // SSE store when available, x87 store otherwise.
+    Opc = Subtarget->hasSSE1() ? X86::MOVSSmr : X86::ST_Fp32m;
+    break;
+  case MVT::f64:
+    Opc = Subtarget->hasSSE2() ? X86::MOVSDmr : X86::ST_Fp64m;
+    break;
+  }
+  
+  // Emit the store of Val to the computed address.
+  addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM).addReg(Val);
+  return true;
+}
+
+bool X86FastISel::X86FastEmitStore(EVT VT, Value *Val,
+                                   const X86AddressMode &AM) {
+  // Store Val to AM, folding a constant value into an immediate store when
+  // the type permits; otherwise materialize it and do a register store.
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Val))
+    Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext()));
+  
+  // If this is a store of a simple constant, fold the constant into the store.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    unsigned Opc = 0;
+    bool Signed = true;
+    switch (VT.getSimpleVT().SimpleTy) {
+    default: break;
+    case MVT::i1:  Signed = false;     // FALLTHROUGH to handle as i8.
+    case MVT::i8:  Opc = X86::MOV8mi;  break;
+    case MVT::i16: Opc = X86::MOV16mi; break;
+    case MVT::i32: Opc = X86::MOV32mi; break;
+    case MVT::i64:
+      // Must be a 32-bit sign extended value.
+      if ((int)CI->getSExtValue() == CI->getSExtValue())
+        Opc = X86::MOV64mi32;
+      break;
+    }
+    
+    if (Opc) {
+      addFullAddress(BuildMI(MBB, DL, TII.get(Opc)), AM)
+                             .addImm(Signed ? CI->getSExtValue() :
+                                              CI->getZExtValue());
+      return true;
+    }
+  }
+  
+  // Not a foldable constant: put the value in a register and store that.
+  unsigned ValReg = getRegForValue(Val);
+  if (ValReg == 0)
+    return false;    
+ 
+  return X86FastEmitStore(VT, ValReg, AM);
+}
+
+/// X86FastEmitExtend - Emit a machine instruction to extend a value Src of
+/// type SrcVT to type DstVT using the specified extension opcode Opc (e.g.
+/// ISD::SIGN_EXTEND).
+bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
+                                    unsigned Src, EVT SrcVT,
+                                    unsigned &ResultReg) {
+  // Let the target-independent emitter pick the extension instruction;
+  // a result register of 0 means it could not.
+  unsigned Reg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, Src);
+  if (Reg == 0)
+    return false;
+  ResultReg = Reg;
+  return true;
+}
+
+/// X86SelectAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectAddress(Value *V, X86AddressMode &AM) {
+  // Decode V into an operator code plus its User so that casts, constant
+  // adds, and GEPs can be looked through recursively below.
+  User *U = NULL;
+  unsigned Opcode = Instruction::UserOp1;
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    Opcode = I->getOpcode();
+    U = I;
+  } else if (ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts.
+    return X86SelectAddress(U->getOperand(0), AM);
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return X86SelectAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::Alloca: {
+    // Do static allocas.
+    const AllocaInst *A = cast<AllocaInst>(V);
+    DenseMap<const AllocaInst*, int>::iterator SI = StaticAllocaMap.find(A);
+    if (SI != StaticAllocaMap.end()) {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = SI->second;
+      return true;
+    }
+    break;
+  }
+
+  case Instruction::Add: {
+    // Adds of constants are common and easy enough.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(U->getOperand(1))) {
+      uint64_t Disp = (int32_t)AM.Disp + (uint64_t)CI->getSExtValue();
+      // They have to fit in the 32-bit signed displacement field though.
+      if (isInt32(Disp)) {
+        AM.Disp = (uint32_t)Disp;
+        return X86SelectAddress(U->getOperand(0), AM);
+      }
+    }
+    break;
+  }
+
+  case Instruction::GetElementPtr: {
+    // Pattern-match simple GEPs.
+    uint64_t Disp = (int32_t)AM.Disp;
+    unsigned IndexReg = AM.IndexReg;
+    unsigned Scale = AM.Scale;
+    gep_type_iterator GTI = gep_type_begin(U);
+    // Iterate through the indices, folding what we can. Constants can be
+    // folded, and one dynamic index can be handled, if the scale is supported.
+    for (User::op_iterator i = U->op_begin() + 1, e = U->op_end();
+         i != e; ++i, ++GTI) {
+      Value *Op = *i;
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        // Struct indices are always constant; add the field offset.
+        const StructLayout *SL = TD.getStructLayout(STy);
+        unsigned Idx = cast<ConstantInt>(Op)->getZExtValue();
+        Disp += SL->getElementOffset(Idx);
+      } else {
+        uint64_t S = TD.getTypeAllocSize(GTI.getIndexedType());
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+          // Constant-offset addressing.
+          Disp += CI->getSExtValue() * S;
+        } else if (IndexReg == 0 &&
+                   (!AM.GV || !Subtarget->isPICStyleRIPRel()) &&
+                   (S == 1 || S == 2 || S == 4 || S == 8)) {
+          // Scaled-index addressing.
+          Scale = S;
+          IndexReg = getRegForGEPIndex(Op);
+          if (IndexReg == 0)
+            return false;
+        } else
+          // Unsupported.
+          goto unsupported_gep;
+      }
+    }
+    // Check for displacement overflow.
+    if (!isInt32(Disp))
+      break;
+    // Ok, the GEP indices were covered by constant-offset and scaled-index
+    // addressing. Update the address state and move on to examining the base.
+    AM.IndexReg = IndexReg;
+    AM.Scale = Scale;
+    AM.Disp = (uint32_t)Disp;
+    return X86SelectAddress(U->getOperand(0), AM);
+  unsupported_gep:
+    // Ok, the GEP indices weren't all covered.
+    break;
+  }
+  }
+
+  // Handle constant address.
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // RIP-relative addresses can't have additional register operands.
+    if (Subtarget->isPICStyleRIPRel() &&
+        (AM.Base.Reg != 0 || AM.IndexReg != 0))
+      return false;
+
+    // Can't handle TLS yet.
+    if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal())
+        return false;
+
+    // Okay, we've committed to selecting this global. Set up the basic address.
+    AM.GV = GV;
+    
+    // Allow the subtarget to classify the global.
+    unsigned char GVFlags = Subtarget->ClassifyGlobalReference(GV, TM);
+
+    // If this reference is relative to the pic base, set it now.
+    if (isGlobalRelativeToPICBase(GVFlags)) {
+      // FIXME: How do we know Base.Reg is free??
+      AM.Base.Reg = getInstrInfo()->getGlobalBaseReg(&MF);
+    }
+    
+    // Unless the ABI requires an extra load, return a direct reference to
+    // the global.
+    if (!isGlobalStubReference(GVFlags)) {
+      if (Subtarget->isPICStyleRIPRel()) {
+        // Use rip-relative addressing if we can.  Above we verified that the
+        // base and index registers are unused.
+        assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+        AM.Base.Reg = X86::RIP;
+      }
+      AM.GVOpFlags = GVFlags;
+      return true;
+    }
+    
+    // Ok, we need to do a load from a stub.  If we've already loaded from this
+    // stub, reuse the loaded pointer, otherwise emit the load now.
+    DenseMap<const Value*, unsigned>::iterator I = LocalValueMap.find(V);
+    unsigned LoadReg;
+    if (I != LocalValueMap.end() && I->second != 0) {
+      LoadReg = I->second;
+    } else {
+      // Issue load from stub.
+      unsigned Opc = 0;
+      const TargetRegisterClass *RC = NULL;
+      X86AddressMode StubAM;
+      StubAM.Base.Reg = AM.Base.Reg;
+      StubAM.GV = GV;
+      StubAM.GVOpFlags = GVFlags;
+
+      if (TLI.getPointerTy() == MVT::i64) {
+        Opc = X86::MOV64rm;
+        RC  = X86::GR64RegisterClass;
+        
+        if (Subtarget->isPICStyleRIPRel())
+          StubAM.Base.Reg = X86::RIP;
+      } else {
+        Opc = X86::MOV32rm;
+        RC  = X86::GR32RegisterClass;
+      }
+      
+      LoadReg = createResultReg(RC);
+      addFullAddress(BuildMI(MBB, DL, TII.get(Opc), LoadReg), StubAM);
+      
+      // Prevent loading GV stub multiple times in same MBB.
+      LocalValueMap[V] = LoadReg;
+    }
+    
+    // Now construct the final address. Note that the Disp, Scale,
+    // and Index values may already be set here.
+    AM.Base.Reg = LoadReg;
+    AM.GV = 0;
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+/// X86SelectCallAddress - Attempt to fill in an address from the given value.
+///
+bool X86FastISel::X86SelectCallAddress(Value *V, X86AddressMode &AM) {
+  // Like X86SelectAddress, but for call targets: only no-op casts are looked
+  // through (no alloca/add/GEP folding), and no stub load is ever emitted --
+  // the only stub case, DLLImport, is rejected below.
+  User *U = NULL;
+  unsigned Opcode = Instruction::UserOp1;
+  if (Instruction *I = dyn_cast<Instruction>(V)) {
+    Opcode = I->getOpcode();
+    U = I;
+  } else if (ConstantExpr *C = dyn_cast<ConstantExpr>(V)) {
+    Opcode = C->getOpcode();
+    U = C;
+  }
+
+  switch (Opcode) {
+  default: break;
+  case Instruction::BitCast:
+    // Look past bitcasts.
+    return X86SelectCallAddress(U->getOperand(0), AM);
+
+  case Instruction::IntToPtr:
+    // Look past no-op inttoptrs.
+    if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+
+  case Instruction::PtrToInt:
+    // Look past no-op ptrtoints.
+    if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
+      return X86SelectCallAddress(U->getOperand(0), AM);
+    break;
+  }
+
+  // Handle constant address.
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(V)) {
+    // Can't handle alternate code models yet.
+    if (TM.getCodeModel() != CodeModel::Small)
+      return false;
+
+    // RIP-relative addresses can't have additional register operands.
+    if (Subtarget->isPICStyleRIPRel() &&
+        (AM.Base.Reg != 0 || AM.IndexReg != 0))
+      return false;
+
+    // Can't handle TLS or DLLImport.
+    if (GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+      if (GVar->isThreadLocal() || GVar->hasDLLImportLinkage())
+        return false;
+
+    // Okay, we've committed to selecting this global. Set up the basic address.
+    AM.GV = GV;
+    
+    // No ABI requires an extra load for anything other than DLLImport, which
+    // we rejected above. Return a direct reference to the global.
+    if (Subtarget->isPICStyleRIPRel()) {
+      // Use rip-relative addressing if we can.  Above we verified that the
+      // base and index registers are unused.
+      assert(AM.Base.Reg == 0 && AM.IndexReg == 0);
+      AM.Base.Reg = X86::RIP;
+    } else if (Subtarget->isPICStyleStubPIC()) {
+      AM.GVOpFlags = X86II::MO_PIC_BASE_OFFSET;
+    } else if (Subtarget->isPICStyleGOT()) {
+      AM.GVOpFlags = X86II::MO_GOTOFF;
+    }
+    
+    return true;
+  }
+
+  // If all else fails, try to materialize the value in a register.
+  if (!AM.GV || !Subtarget->isPICStyleRIPRel()) {
+    if (AM.Base.Reg == 0) {
+      AM.Base.Reg = getRegForValue(V);
+      return AM.Base.Reg != 0;
+    }
+    if (AM.IndexReg == 0) {
+      assert(AM.Scale == 1 && "Scale with no index!");
+      AM.IndexReg = getRegForValue(V);
+      return AM.IndexReg != 0;
+    }
+  }
+
+  return false;
+}
+
+
+/// X86SelectStore - Select and emit code to implement store instructions.
+bool X86FastISel::X86SelectStore(Instruction* I) {
+  // Only handle types we can store (i1 is allowed; it is narrowed on emit).
+  EVT VT;
+  if (!isTypeLegal(I->getOperand(0)->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  // Compute an addressing mode for the pointer operand.
+  X86AddressMode AM;
+  if (!X86SelectAddress(I->getOperand(1), AM))
+    return false;
+
+  return X86FastEmitStore(VT, I->getOperand(0), AM);
+}
+
+/// X86SelectLoad - Select and emit code to implement load instructions.
+///
+bool X86FastISel::X86SelectLoad(Instruction *I)  {
+  // Only handle types we can load (i1 is allowed).
+  EVT VT;
+  if (!isTypeLegal(I->getType(), VT, /*AllowI1=*/true))
+    return false;
+
+  // Compute an addressing mode for the pointer operand.
+  X86AddressMode AM;
+  if (!X86SelectAddress(I->getOperand(0), AM))
+    return false;
+
+  // Emit the load and publish its result register for later uses of I.
+  unsigned Reg = 0;
+  if (!X86FastEmitLoad(VT, AM, Reg))
+    return false;
+  UpdateValueMap(I, Reg);
+  return true;
+}
+
+/// X86ChooseCmpOpcode - Return the register/register compare opcode for the
+/// given type (UCOMIS for scalar FP), or 0 if the type is unsupported.
+static unsigned X86ChooseCmpOpcode(EVT VT) {
+  switch (VT.getSimpleVT().SimpleTy) {
+  default:       return 0;
+  case MVT::i8:  return X86::CMP8rr;
+  case MVT::i16: return X86::CMP16rr;
+  case MVT::i32: return X86::CMP32rr;
+  case MVT::i64: return X86::CMP64rr;
+  case MVT::f32: return X86::UCOMISSrr;
+  case MVT::f64: return X86::UCOMISDrr;
+  }
+}
+
+/// X86ChooseCmpImmediateOpcode - If we have a comparison with RHS as the RHS
+/// of the comparison, return an opcode that works for the compare (e.g.
+/// CMP32ri) otherwise return 0.
+static unsigned X86ChooseCmpImmediateOpcode(EVT VT, ConstantInt *RHSC) {
+  // RHSC only matters for i64, where the immediate must fit in 32 bits.
+  switch (VT.getSimpleVT().SimpleTy) {
+  // Otherwise, we can't fold the immediate into this comparison.
+  default: return 0;
+  case MVT::i8: return X86::CMP8ri;
+  case MVT::i16: return X86::CMP16ri;
+  case MVT::i32: return X86::CMP32ri;
+  case MVT::i64:
+    // 64-bit comparisons are only valid if the immediate fits in a 32-bit sext
+    // field.
+    if ((int)RHSC->getSExtValue() == RHSC->getSExtValue())
+      return X86::CMP64ri32;
+    return 0;
+  }
+}
+
+bool X86FastISel::X86FastEmitCompare(Value *Op0, Value *Op1, EVT VT) {
+  // Emit a compare of Op0 against Op1, folding an immediate RHS into the
+  // compare when the type allows it.
+  unsigned Op0Reg = getRegForValue(Op0);
+  if (Op0Reg == 0) return false;
+  
+  // Handle 'null' like i32/i64 0.
+  if (isa<ConstantPointerNull>(Op1))
+    Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext()));
+  
+  // We have two options: compare with register or immediate.  If the RHS of
+  // the compare is an immediate that we can fold into this compare, use
+  // CMPri, otherwise use CMPrr.
+  if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+    if (unsigned CompareImmOpc = X86ChooseCmpImmediateOpcode(VT, Op1C)) {
+      BuildMI(MBB, DL, TII.get(CompareImmOpc)).addReg(Op0Reg)
+                                          .addImm(Op1C->getSExtValue());
+      return true;
+    }
+  }
+  
+  unsigned CompareOpc = X86ChooseCmpOpcode(VT);
+  if (CompareOpc == 0) return false;
+    
+  unsigned Op1Reg = getRegForValue(Op1);
+  if (Op1Reg == 0) return false;
+  BuildMI(MBB, DL, TII.get(CompareOpc)).addReg(Op0Reg).addReg(Op1Reg);
+  
+  return true;
+}
+
/// X86SelectCmp - Select and emit code for an icmp/fcmp instruction,
/// producing an i8 (0/1) result in a GR8 register via a compare followed by
/// a SETcc.  FCMP_OEQ and FCMP_UNE need two SETcc instructions combined with
/// AND/OR; the remaining predicates map to a single SETcc, possibly after
/// swapping the operands.  Returns false for predicates it cannot handle.
bool X86FastISel::X86SelectCmp(Instruction *I) {
  CmpInst *CI = cast<CmpInst>(I);

  EVT VT;
  if (!isTypeLegal(I->getOperand(0)->getType(), VT))
    return false;

  unsigned ResultReg = createResultReg(&X86::GR8RegClass);
  unsigned SetCCOpc;
  bool SwapArgs;  // false -> compare Op0, Op1.  true -> compare Op1, Op0.
  switch (CI->getPredicate()) {
  case CmpInst::FCMP_OEQ: {
    // Ordered-equal has no single SETcc: it is (ZF set) AND (PF clear),
    // i.e. equal and not unordered.
    if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
      return false;
    
    unsigned EReg = createResultReg(&X86::GR8RegClass);
    unsigned NPReg = createResultReg(&X86::GR8RegClass);
    BuildMI(MBB, DL, TII.get(X86::SETEr), EReg);
    BuildMI(MBB, DL, TII.get(X86::SETNPr), NPReg);
    BuildMI(MBB, DL, 
            TII.get(X86::AND8rr), ResultReg).addReg(NPReg).addReg(EReg);
    UpdateValueMap(I, ResultReg);
    return true;
  }
  case CmpInst::FCMP_UNE: {
    // Unordered-not-equal is (ZF clear) OR (PF set): not equal, or unordered.
    if (!X86FastEmitCompare(CI->getOperand(0), CI->getOperand(1), VT))
      return false;

    unsigned NEReg = createResultReg(&X86::GR8RegClass);
    unsigned PReg = createResultReg(&X86::GR8RegClass);
    BuildMI(MBB, DL, TII.get(X86::SETNEr), NEReg);
    BuildMI(MBB, DL, TII.get(X86::SETPr), PReg);
    BuildMI(MBB, DL, TII.get(X86::OR8rr), ResultReg).addReg(PReg).addReg(NEReg);
    UpdateValueMap(I, ResultReg);
    return true;
  }
  // Single-SETcc predicates.  Some FP predicates swap operands so the
  // unsigned-flavored condition codes (A/AE/B/BE) give the right answer.
  case CmpInst::FCMP_OGT: SwapArgs = false; SetCCOpc = X86::SETAr;  break;
  case CmpInst::FCMP_OGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
  case CmpInst::FCMP_OLT: SwapArgs = true;  SetCCOpc = X86::SETAr;  break;
  case CmpInst::FCMP_OLE: SwapArgs = true;  SetCCOpc = X86::SETAEr; break;
  case CmpInst::FCMP_ONE: SwapArgs = false; SetCCOpc = X86::SETNEr; break;
  case CmpInst::FCMP_ORD: SwapArgs = false; SetCCOpc = X86::SETNPr; break;
  case CmpInst::FCMP_UNO: SwapArgs = false; SetCCOpc = X86::SETPr;  break;
  case CmpInst::FCMP_UEQ: SwapArgs = false; SetCCOpc = X86::SETEr;  break;
  case CmpInst::FCMP_UGT: SwapArgs = true;  SetCCOpc = X86::SETBr;  break;
  case CmpInst::FCMP_UGE: SwapArgs = true;  SetCCOpc = X86::SETBEr; break;
  case CmpInst::FCMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr;  break;
  case CmpInst::FCMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
  
  case CmpInst::ICMP_EQ:  SwapArgs = false; SetCCOpc = X86::SETEr;  break;
  case CmpInst::ICMP_NE:  SwapArgs = false; SetCCOpc = X86::SETNEr; break;
  case CmpInst::ICMP_UGT: SwapArgs = false; SetCCOpc = X86::SETAr;  break;
  case CmpInst::ICMP_UGE: SwapArgs = false; SetCCOpc = X86::SETAEr; break;
  case CmpInst::ICMP_ULT: SwapArgs = false; SetCCOpc = X86::SETBr;  break;
  case CmpInst::ICMP_ULE: SwapArgs = false; SetCCOpc = X86::SETBEr; break;
  case CmpInst::ICMP_SGT: SwapArgs = false; SetCCOpc = X86::SETGr;  break;
  case CmpInst::ICMP_SGE: SwapArgs = false; SetCCOpc = X86::SETGEr; break;
  case CmpInst::ICMP_SLT: SwapArgs = false; SetCCOpc = X86::SETLr;  break;
  case CmpInst::ICMP_SLE: SwapArgs = false; SetCCOpc = X86::SETLEr; break;
  default:
    return false;
  }

  Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
  if (SwapArgs)
    std::swap(Op0, Op1);

  // Emit a compare of Op0/Op1.
  if (!X86FastEmitCompare(Op0, Op1, VT))
    return false;
  
  BuildMI(MBB, DL, TII.get(SetCCOpc), ResultReg);
  UpdateValueMap(I, ResultReg);
  return true;
}
+
+bool X86FastISel::X86SelectZExt(Instruction *I) {
+  // Handle zero-extension from i1 to i8, which is common.
+  if (I->getType()->isInteger(8) &&
+      I->getOperand(0)->getType()->isInteger(1)) {
+    unsigned ResultReg = getRegForValue(I->getOperand(0));
+    if (ResultReg == 0) return false;
+    // Set the high bits to zero.
+    ResultReg = FastEmitZExtFromI1(MVT::i8, ResultReg);
+    if (ResultReg == 0) return false;
+    UpdateValueMap(I, ResultReg);
+    return true;
+  }
+
+  return false;
+}
+
+
/// X86SelectBranch - Select and emit code for a conditional branch.  Tries,
/// in order: (1) folding a single-use CmpInst condition into a compare plus
/// Jcc, (2) folding an extractvalue of an add-with-overflow intrinsic into a
/// JO/JB off the flags left by the SETO/SETB, and (3) a generic TEST of the
/// i1 condition followed by JNE.
bool X86FastISel::X86SelectBranch(Instruction *I) {
  // Unconditional branches are selected by tablegen-generated code.
  // Handle a conditional branch.
  BranchInst *BI = cast<BranchInst>(I);
  MachineBasicBlock *TrueMBB = MBBMap[BI->getSuccessor(0)];
  MachineBasicBlock *FalseMBB = MBBMap[BI->getSuccessor(1)];

  // Fold the common case of a conditional branch with a comparison.
  // Single-use only: otherwise the compare's i8 result is needed elsewhere
  // and will be materialized anyway.
  if (CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
    if (CI->hasOneUse()) {
      EVT VT = TLI.getValueType(CI->getOperand(0)->getType());

      // Try to take advantage of fallthrough opportunities: if the true
      // block immediately follows this one, branch on the inverse predicate
      // so the taken edge is the non-fallthrough one.
      CmpInst::Predicate Predicate = CI->getPredicate();
      if (MBB->isLayoutSuccessor(TrueMBB)) {
        std::swap(TrueMBB, FalseMBB);
        Predicate = CmpInst::getInversePredicate(Predicate);
      }

      bool SwapArgs;  // false -> compare Op0, Op1.  true -> compare Op1, Op0.
      unsigned BranchOpc; // Opcode to jump on, e.g. "X86::JA"

      switch (Predicate) {
      case CmpInst::FCMP_OEQ:
        // OEQ has no single Jcc; invert to UNE (two branches, see below).
        std::swap(TrueMBB, FalseMBB);
        Predicate = CmpInst::FCMP_UNE;
        // FALL THROUGH
      case CmpInst::FCMP_UNE: SwapArgs = false; BranchOpc = X86::JNE; break;
      case CmpInst::FCMP_OGT: SwapArgs = false; BranchOpc = X86::JA;  break;
      case CmpInst::FCMP_OGE: SwapArgs = false; BranchOpc = X86::JAE; break;
      case CmpInst::FCMP_OLT: SwapArgs = true;  BranchOpc = X86::JA;  break;
      case CmpInst::FCMP_OLE: SwapArgs = true;  BranchOpc = X86::JAE; break;
      case CmpInst::FCMP_ONE: SwapArgs = false; BranchOpc = X86::JNE; break;
      case CmpInst::FCMP_ORD: SwapArgs = false; BranchOpc = X86::JNP; break;
      case CmpInst::FCMP_UNO: SwapArgs = false; BranchOpc = X86::JP;  break;
      case CmpInst::FCMP_UEQ: SwapArgs = false; BranchOpc = X86::JE;  break;
      case CmpInst::FCMP_UGT: SwapArgs = true;  BranchOpc = X86::JB;  break;
      case CmpInst::FCMP_UGE: SwapArgs = true;  BranchOpc = X86::JBE; break;
      case CmpInst::FCMP_ULT: SwapArgs = false; BranchOpc = X86::JB;  break;
      case CmpInst::FCMP_ULE: SwapArgs = false; BranchOpc = X86::JBE; break;
          
      case CmpInst::ICMP_EQ:  SwapArgs = false; BranchOpc = X86::JE;  break;
      case CmpInst::ICMP_NE:  SwapArgs = false; BranchOpc = X86::JNE; break;
      case CmpInst::ICMP_UGT: SwapArgs = false; BranchOpc = X86::JA;  break;
      case CmpInst::ICMP_UGE: SwapArgs = false; BranchOpc = X86::JAE; break;
      case CmpInst::ICMP_ULT: SwapArgs = false; BranchOpc = X86::JB;  break;
      case CmpInst::ICMP_ULE: SwapArgs = false; BranchOpc = X86::JBE; break;
      case CmpInst::ICMP_SGT: SwapArgs = false; BranchOpc = X86::JG;  break;
      case CmpInst::ICMP_SGE: SwapArgs = false; BranchOpc = X86::JGE; break;
      case CmpInst::ICMP_SLT: SwapArgs = false; BranchOpc = X86::JL;  break;
      case CmpInst::ICMP_SLE: SwapArgs = false; BranchOpc = X86::JLE; break;
      default:
        return false;
      }
      
      Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
      if (SwapArgs)
        std::swap(Op0, Op1);

      // Emit a compare of the LHS and RHS, setting the flags.
      if (!X86FastEmitCompare(Op0, Op1, VT))
        return false;
      
      BuildMI(MBB, DL, TII.get(BranchOpc)).addMBB(TrueMBB);

      if (Predicate == CmpInst::FCMP_UNE) {
        // X86 requires a second branch to handle UNE (and OEQ,
        // which is mapped to UNE above).
        BuildMI(MBB, DL, TII.get(X86::JP)).addMBB(TrueMBB);
      }

      FastEmitBranch(FalseMBB);
      MBB->addSuccessor(TrueMBB);
      return true;
    }
  } else if (ExtractValueInst *EI =
             dyn_cast<ExtractValueInst>(BI->getCondition())) {
    // Check to see if the branch instruction is from an "arithmetic with
    // overflow" intrinsic. The main way these intrinsics are used is:
    //
    //   %t = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
    //   %sum = extractvalue { i32, i1 } %t, 0
    //   %obit = extractvalue { i32, i1 } %t, 1
    //   br i1 %obit, label %overflow, label %normal
    //
    // The %sum and %obit are converted in an ADD and a SETO/SETB before
    // reaching the branch. Therefore, we search backwards through the MBB
    // looking for the SETO/SETB instruction. If an instruction modifies the
    // EFLAGS register before we reach the SETO/SETB instruction, then we can't
    // convert the branch into a JO/JB instruction.
    if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(EI->getAggregateOperand())){
      if (CI->getIntrinsicID() == Intrinsic::sadd_with_overflow ||
          CI->getIntrinsicID() == Intrinsic::uadd_with_overflow) {
        const MachineInstr *SetMI = 0;
        unsigned Reg = lookUpRegForValue(EI);

        // Walk backwards looking for the instruction that last defined Reg.
        for (MachineBasicBlock::const_reverse_iterator
               RI = MBB->rbegin(), RE = MBB->rend(); RI != RE; ++RI) {
          const MachineInstr &MI = *RI;

          if (MI.modifiesRegister(Reg)) {
            unsigned Src, Dst, SrcSR, DstSR;

            // Look through register-to-register copies to the real def.
            if (getInstrInfo()->isMoveInstr(MI, Src, Dst, SrcSR, DstSR)) {
              Reg = Src;
              continue;
            }

            SetMI = &MI;
            break;
          }

          // Bail if anything in between could have clobbered EFLAGS.
          const TargetInstrDesc &TID = MI.getDesc();
          if (TID.hasUnmodeledSideEffects() ||
              TID.hasImplicitDefOfPhysReg(X86::EFLAGS))
            break;
        }

        if (SetMI) {
          unsigned OpCode = SetMI->getOpcode();

          // SETO (signed overflow) -> JO; SETB (unsigned carry) -> JB.
          if (OpCode == X86::SETOr || OpCode == X86::SETBr) {
            BuildMI(MBB, DL, TII.get(OpCode == X86::SETOr ? X86::JO : X86::JB))
              .addMBB(TrueMBB);
            FastEmitBranch(FalseMBB);
            MBB->addSuccessor(TrueMBB);
            return true;
          }
        }
      }
    }
  }

  // Otherwise do a clumsy setcc and re-test it.
  unsigned OpReg = getRegForValue(BI->getCondition());
  if (OpReg == 0) return false;

  BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(OpReg).addReg(OpReg);
  BuildMI(MBB, DL, TII.get(X86::JNE)).addMBB(TrueMBB);
  FastEmitBranch(FalseMBB);
  MBB->addSuccessor(TrueMBB);
  return true;
}
+
/// X86SelectShift - Select and emit code for an integer shift (lshr, ashr,
/// shl) of i8/i16/i32/i64.  A constant shift amount is folded into the
/// immediate form of the instruction; a variable amount is copied into
/// CL (via the width-matched C-register) as x86's variable shifts require.
bool X86FastISel::X86SelectShift(Instruction *I) {
  // CReg: the C-register of matching width for a variable shift amount.
  // OpReg: the shift-by-CL opcode; OpImm: the shift-by-immediate opcode.
  unsigned CReg = 0, OpReg = 0, OpImm = 0;
  const TargetRegisterClass *RC = NULL;
  if (I->getType()->isInteger(8)) {
    CReg = X86::CL;
    RC = &X86::GR8RegClass;
    switch (I->getOpcode()) {
    case Instruction::LShr: OpReg = X86::SHR8rCL; OpImm = X86::SHR8ri; break;
    case Instruction::AShr: OpReg = X86::SAR8rCL; OpImm = X86::SAR8ri; break;
    case Instruction::Shl:  OpReg = X86::SHL8rCL; OpImm = X86::SHL8ri; break;
    default: return false;
    }
  } else if (I->getType()->isInteger(16)) {
    CReg = X86::CX;
    RC = &X86::GR16RegClass;
    switch (I->getOpcode()) {
    case Instruction::LShr: OpReg = X86::SHR16rCL; OpImm = X86::SHR16ri; break;
    case Instruction::AShr: OpReg = X86::SAR16rCL; OpImm = X86::SAR16ri; break;
    case Instruction::Shl:  OpReg = X86::SHL16rCL; OpImm = X86::SHL16ri; break;
    default: return false;
    }
  } else if (I->getType()->isInteger(32)) {
    CReg = X86::ECX;
    RC = &X86::GR32RegClass;
    switch (I->getOpcode()) {
    case Instruction::LShr: OpReg = X86::SHR32rCL; OpImm = X86::SHR32ri; break;
    case Instruction::AShr: OpReg = X86::SAR32rCL; OpImm = X86::SAR32ri; break;
    case Instruction::Shl:  OpReg = X86::SHL32rCL; OpImm = X86::SHL32ri; break;
    default: return false;
    }
  } else if (I->getType()->isInteger(64)) {
    CReg = X86::RCX;
    RC = &X86::GR64RegClass;
    switch (I->getOpcode()) {
    case Instruction::LShr: OpReg = X86::SHR64rCL; OpImm = X86::SHR64ri; break;
    case Instruction::AShr: OpReg = X86::SAR64rCL; OpImm = X86::SAR64ri; break;
    case Instruction::Shl:  OpReg = X86::SHL64rCL; OpImm = X86::SHL64ri; break;
    default: return false;
    }
  } else {
    return false;
  }

  EVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
  if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
    return false;

  unsigned Op0Reg = getRegForValue(I->getOperand(0));
  if (Op0Reg == 0) return false;
  
  // Fold immediate in shl(x,3).
  if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
    unsigned ResultReg = createResultReg(RC);
    BuildMI(MBB, DL, TII.get(OpImm), 
            ResultReg).addReg(Op0Reg).addImm(CI->getZExtValue() & 0xff);
    UpdateValueMap(I, ResultReg);
    return true;
  }
  
  // Variable amount: copy it into the width-matched C-register.
  unsigned Op1Reg = getRegForValue(I->getOperand(1));
  if (Op1Reg == 0) return false;
  TII.copyRegToReg(*MBB, MBB->end(), CReg, Op1Reg, RC, RC);

  // The shift instruction uses X86::CL. If we defined a super-register
  // of X86::CL, emit an EXTRACT_SUBREG to precisely describe what
  // we're doing here.
  if (CReg != X86::CL)
    BuildMI(MBB, DL, TII.get(TargetOpcode::EXTRACT_SUBREG), X86::CL)
      .addReg(CReg).addImm(X86::SUBREG_8BIT);

  unsigned ResultReg = createResultReg(RC);
  BuildMI(MBB, DL, TII.get(OpReg), ResultReg).addReg(Op0Reg);
  UpdateValueMap(I, ResultReg);
  return true;
}
+
+bool X86FastISel::X86SelectSelect(Instruction *I) {
+  EVT VT = TLI.getValueType(I->getType(), /*HandleUnknown=*/true);
+  if (VT == MVT::Other || !isTypeLegal(I->getType(), VT))
+    return false;
+  
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  if (VT.getSimpleVT() == MVT::i16) {
+    Opc = X86::CMOVE16rr;
+    RC = &X86::GR16RegClass;
+  } else if (VT.getSimpleVT() == MVT::i32) {
+    Opc = X86::CMOVE32rr;
+    RC = &X86::GR32RegClass;
+  } else if (VT.getSimpleVT() == MVT::i64) {
+    Opc = X86::CMOVE64rr;
+    RC = &X86::GR64RegClass;
+  } else {
+    return false; 
+  }
+
+  unsigned Op0Reg = getRegForValue(I->getOperand(0));
+  if (Op0Reg == 0) return false;
+  unsigned Op1Reg = getRegForValue(I->getOperand(1));
+  if (Op1Reg == 0) return false;
+  unsigned Op2Reg = getRegForValue(I->getOperand(2));
+  if (Op2Reg == 0) return false;
+
+  BuildMI(MBB, DL, TII.get(X86::TEST8rr)).addReg(Op0Reg).addReg(Op0Reg);
+  unsigned ResultReg = createResultReg(RC);
+  BuildMI(MBB, DL, TII.get(Opc), ResultReg).addReg(Op1Reg).addReg(Op2Reg);
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectFPExt(Instruction *I) {
+  // fpext from float to double.
+  if (Subtarget->hasSSE2() &&
+      I->getType()->isDoubleTy()) {
+    Value *V = I->getOperand(0);
+    if (V->getType()->isFloatTy()) {
+      unsigned OpReg = getRegForValue(V);
+      if (OpReg == 0) return false;
+      unsigned ResultReg = createResultReg(X86::FR64RegisterClass);
+      BuildMI(MBB, DL, TII.get(X86::CVTSS2SDrr), ResultReg).addReg(OpReg);
+      UpdateValueMap(I, ResultReg);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectFPTrunc(Instruction *I) {
+  if (Subtarget->hasSSE2()) {
+    if (I->getType()->isFloatTy()) {
+      Value *V = I->getOperand(0);
+      if (V->getType()->isDoubleTy()) {
+        unsigned OpReg = getRegForValue(V);
+        if (OpReg == 0) return false;
+        unsigned ResultReg = createResultReg(X86::FR32RegisterClass);
+        BuildMI(MBB, DL, TII.get(X86::CVTSD2SSrr), ResultReg).addReg(OpReg);
+        UpdateValueMap(I, ResultReg);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
+bool X86FastISel::X86SelectTrunc(Instruction *I) {
+  if (Subtarget->is64Bit())
+    // All other cases should be handled by the tblgen generated code.
+    return false;
+  EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+  EVT DstVT = TLI.getValueType(I->getType());
+  
+  // This code only handles truncation to byte right now.
+  if (DstVT != MVT::i8 && DstVT != MVT::i1)
+    // All other cases should be handled by the tblgen generated code.
+    return false;
+  if (SrcVT != MVT::i16 && SrcVT != MVT::i32)
+    // All other cases should be handled by the tblgen generated code.
+    return false;
+
+  unsigned InputReg = getRegForValue(I->getOperand(0));
+  if (!InputReg)
+    // Unhandled operand.  Halt "fast" selection and bail.
+    return false;
+
+  // First issue a copy to GR16_ABCD or GR32_ABCD.
+  unsigned CopyOpc = (SrcVT == MVT::i16) ? X86::MOV16rr : X86::MOV32rr;
+  const TargetRegisterClass *CopyRC = (SrcVT == MVT::i16)
+    ? X86::GR16_ABCDRegisterClass : X86::GR32_ABCDRegisterClass;
+  unsigned CopyReg = createResultReg(CopyRC);
+  BuildMI(MBB, DL, TII.get(CopyOpc), CopyReg).addReg(InputReg);
+
+  // Then issue an extract_subreg.
+  unsigned ResultReg = FastEmitInst_extractsubreg(MVT::i8,
+                                                  CopyReg, X86::SUBREG_8BIT);
+  if (!ResultReg)
+    return false;
+
+  UpdateValueMap(I, ResultReg);
+  return true;
+}
+
+bool X86FastISel::X86SelectExtractValue(Instruction *I) {
+  ExtractValueInst *EI = cast<ExtractValueInst>(I);
+  Value *Agg = EI->getAggregateOperand();
+
+  if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(Agg)) {
+    switch (CI->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::sadd_with_overflow:
+    case Intrinsic::uadd_with_overflow:
+      // Cheat a little. We know that the registers for "add" and "seto" are
+      // allocated sequentially. However, we only keep track of the register
+      // for "add" in the value map. Use extractvalue's index to get the
+      // correct register for "seto".
+      UpdateValueMap(I, lookUpRegForValue(Agg) + *EI->idx_begin());
+      return true;
+    }
+  }
+
+  return false;
+}
+
/// X86VisitIntrinsicCall - Select and emit code for the intrinsic calls that
/// fast-isel handles: dbg.declare, trap, and the signed/unsigned
/// add-with-overflow intrinsics.  Returns false for everything else.
bool X86FastISel::X86VisitIntrinsicCall(IntrinsicInst &I) {
  // FIXME: Handle more intrinsics.
  switch (I.getIntrinsicID()) {
  default: return false;
  case Intrinsic::dbg_declare: {
    DbgDeclareInst *DI = cast<DbgDeclareInst>(&I);
    X86AddressMode AM;
    assert(DI->getAddress() && "Null address should be checked earlier!");
    if (!X86SelectAddress(DI->getAddress(), AM))
      return false;
    const TargetInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
    // Emit a DBG_VALUE referencing the variable's address and its metadata.
    addFullAddress(BuildMI(MBB, DL, II), AM).addImm(0).
                                        addMetadata(DI->getVariable());
    return true;
  }
  case Intrinsic::trap: {
    BuildMI(MBB, DL, TII.get(X86::TRAP));
    return true;
  }
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow: {
    // Replace "add with overflow" intrinsics with an "add" instruction followed
    // by a seto/setc instruction. Later on, when the "extractvalue"
    // instructions are encountered, we use the fact that two registers were
    // created sequentially to get the correct registers for the "sum" and the
    // "overflow bit".
    const Function *Callee = I.getCalledFunction();
    // The "sum" element of the {iN, i1} return struct.
    const Type *RetTy =
      cast<StructType>(Callee->getReturnType())->getTypeAtIndex(unsigned(0));

    EVT VT;
    if (!isTypeLegal(RetTy, VT))
      return false;

    Value *Op1 = I.getOperand(1);
    Value *Op2 = I.getOperand(2);
    unsigned Reg1 = getRegForValue(Op1);
    unsigned Reg2 = getRegForValue(Op2);

    if (Reg1 == 0 || Reg2 == 0)
      // FIXME: Handle values *not* in registers.
      return false;

    // Only i32/i64 adds are handled.
    unsigned OpC = 0;
    if (VT == MVT::i32)
      OpC = X86::ADD32rr;
    else if (VT == MVT::i64)
      OpC = X86::ADD64rr;
    else
      return false;

    unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
    BuildMI(MBB, DL, TII.get(OpC), ResultReg).addReg(Reg1).addReg(Reg2);
    unsigned DestReg1 = UpdateValueMap(&I, ResultReg);

    // If the add with overflow is an intra-block value then we just want to
    // create temporaries for it like normal.  If it is a cross-block value then
    // UpdateValueMap will return the cross-block register used.  Since we
    // *really* want the value to be live in the register pair known by
    // UpdateValueMap, we have to use DestReg1+1 as the destination register in
    // the cross block case.  In the non-cross-block case, we should just make
    // another register for the value.
    if (DestReg1 != ResultReg)
      ResultReg = DestReg1+1;
    else
      ResultReg = createResultReg(TLI.getRegClassFor(MVT::i8));
    
    // SETO for signed overflow, SETB (carry) for unsigned overflow.
    unsigned Opc = X86::SETBr;
    if (I.getIntrinsicID() == Intrinsic::sadd_with_overflow)
      Opc = X86::SETOr;
    BuildMI(MBB, DL, TII.get(Opc), ResultReg);
    return true;
  }
  }
}
+
/// X86SelectCall - Select and emit code for a call instruction.  Handles
/// simple direct and register-indirect calls under the C, fastcc, and
/// X86_FastCall conventions: dispatches intrinsics to X86VisitIntrinsicCall,
/// lowers arguments per the calling convention (with sext/zext/aext/bitcast
/// promotion), brackets the call with CALLSEQ_START/END, and copies the
/// return value out (including the x87-stack-to-XMM dance for FP returns).
/// Returns false to bail out to the slow path on anything unsupported.
bool X86FastISel::X86SelectCall(Instruction *I) {
  CallInst *CI = cast<CallInst>(I);
  Value *Callee = I->getOperand(0);

  // Can't handle inline asm yet.
  if (isa<InlineAsm>(Callee))
    return false;

  // Handle intrinsic calls.
  if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI))
    return X86VisitIntrinsicCall(*II);

  // Handle only C and fastcc calling conventions for now.
  CallSite CS(CI);
  CallingConv::ID CC = CS.getCallingConv();
  if (CC != CallingConv::C &&
      CC != CallingConv::Fast &&
      CC != CallingConv::X86_FastCall)
    return false;

  // fastcc with -tailcallopt is intended to provide a guaranteed
  // tail call optimization. Fastisel doesn't know how to do that.
  if (CC == CallingConv::Fast && GuaranteedTailCallOpt)
    return false;

  // Let SDISel handle vararg functions.
  const PointerType *PT = cast<PointerType>(CS.getCalledValue()->getType());
  const FunctionType *FTy = cast<FunctionType>(PT->getElementType());
  if (FTy->isVarArg())
    return false;

  // Handle *simple* calls for now.
  const Type *RetTy = CS.getType();
  EVT RetVT;
  if (RetTy->isVoidTy())
    RetVT = MVT::isVoid;
  else if (!isTypeLegal(RetTy, RetVT, true))
    return false;

  // Materialize callee address in a register. FIXME: GV address can be
  // handled with a CALLpcrel32 instead.
  X86AddressMode CalleeAM;
  if (!X86SelectCallAddress(Callee, CalleeAM))
    return false;
  unsigned CalleeOp = 0;
  GlobalValue *GV = 0;
  if (CalleeAM.GV != 0) {
    GV = CalleeAM.GV;
  } else if (CalleeAM.Base.Reg != 0) {
    CalleeOp = CalleeAM.Base.Reg;
  } else
    return false;

  // Allow calls which produce i1 results.
  // The i1 result is widened to i8 and masked back down after the call.
  bool AndToI1 = false;
  if (RetVT == MVT::i1) {
    RetVT = MVT::i8;
    AndToI1 = true;
  }

  // Deal with call operands first.
  SmallVector<Value*, 8> ArgVals;
  SmallVector<unsigned, 8> Args;
  SmallVector<EVT, 8> ArgVTs;
  SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
  Args.reserve(CS.arg_size());
  ArgVals.reserve(CS.arg_size());
  ArgVTs.reserve(CS.arg_size());
  ArgFlags.reserve(CS.arg_size());
  for (CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
       i != e; ++i) {
    unsigned Arg = getRegForValue(*i);
    if (Arg == 0)
      return false;
    ISD::ArgFlagsTy Flags;
    // Attribute indices are 1-based for parameters (0 is the return value).
    unsigned AttrInd = i - CS.arg_begin() + 1;
    if (CS.paramHasAttr(AttrInd, Attribute::SExt))
      Flags.setSExt();
    if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
      Flags.setZExt();

    // FIXME: Only handle *easy* calls for now.
    if (CS.paramHasAttr(AttrInd, Attribute::InReg) ||
        CS.paramHasAttr(AttrInd, Attribute::StructRet) ||
        CS.paramHasAttr(AttrInd, Attribute::Nest) ||
        CS.paramHasAttr(AttrInd, Attribute::ByVal))
      return false;

    const Type *ArgTy = (*i)->getType();
    EVT ArgVT;
    if (!isTypeLegal(ArgTy, ArgVT))
      return false;
    unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy);
    Flags.setOrigAlign(OriginalAlignment);

    Args.push_back(Arg);
    ArgVals.push_back(*i);
    ArgVTs.push_back(ArgVT);
    ArgFlags.push_back(Flags);
  }

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, false, TM, ArgLocs, I->getParent()->getContext());
  CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();

  // Issue CALLSEQ_START
  unsigned AdjStackDown = TM.getRegisterInfo()->getCallFrameSetupOpcode();
  BuildMI(MBB, DL, TII.get(AdjStackDown)).addImm(NumBytes);

  // Process argument: walk the register/memloc assignments, inserting
  // copies / loads.
  SmallVector<unsigned, 4> RegArgs;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    unsigned Arg = Args[VA.getValNo()];
    EVT ArgVT = ArgVTs[VA.getValNo()];
  
    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt: {
      bool Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
                                       Arg, ArgVT, Arg);
      // Self-assignment silences 'unused variable' warnings in NDEBUG builds.
      assert(Emitted && "Failed to emit a sext!"); Emitted=Emitted;
      Emitted = true;
      ArgVT = VA.getLocVT();
      break;
    }
    case CCValAssign::ZExt: {
      bool Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
                                       Arg, ArgVT, Arg);
      assert(Emitted && "Failed to emit a zext!"); Emitted=Emitted;
      Emitted = true;
      ArgVT = VA.getLocVT();
      break;
    }
    case CCValAssign::AExt: {
      // Any-extend: try ANY_EXTEND first, then fall back to ZERO_EXTEND and
      // SIGN_EXTEND, since any of them produces a valid any-extended value.
      bool Emitted = X86FastEmitExtend(ISD::ANY_EXTEND, VA.getLocVT(),
                                       Arg, ArgVT, Arg);
      if (!Emitted)
        Emitted = X86FastEmitExtend(ISD::ZERO_EXTEND, VA.getLocVT(),
                                    Arg, ArgVT, Arg);
      if (!Emitted)
        Emitted = X86FastEmitExtend(ISD::SIGN_EXTEND, VA.getLocVT(),
                                    Arg, ArgVT, Arg);
      
      assert(Emitted && "Failed to emit a aext!"); Emitted=Emitted;
      ArgVT = VA.getLocVT();
      break;
    }
    case CCValAssign::BCvt: {
      unsigned BC = FastEmit_r(ArgVT.getSimpleVT(), VA.getLocVT().getSimpleVT(),
                               ISD::BIT_CONVERT, Arg);
      assert(BC != 0 && "Failed to emit a bitcast!");
      Arg = BC;
      ArgVT = VA.getLocVT();
      break;
    }
    }
    
    if (VA.isRegLoc()) {
      TargetRegisterClass* RC = TLI.getRegClassFor(ArgVT);
      bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), VA.getLocReg(),
                                      Arg, RC, RC);
      assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
      Emitted = true;
      RegArgs.push_back(VA.getLocReg());
    } else {
      // Stack argument: store it at its assigned offset from the stack
      // pointer.
      unsigned LocMemOffset = VA.getLocMemOffset();
      X86AddressMode AM;
      AM.Base.Reg = StackPtr;
      AM.Disp = LocMemOffset;
      Value *ArgVal = ArgVals[VA.getValNo()];
      
      // If this is a really simple value, emit this with the Value* version of
      // X86FastEmitStore.  If it isn't simple, we don't want to do this, as it
      // can cause us to reevaluate the argument.
      if (isa<ConstantInt>(ArgVal) || isa<ConstantPointerNull>(ArgVal))
        X86FastEmitStore(ArgVT, ArgVal, AM);
      else
        X86FastEmitStore(ArgVT, Arg, AM);
    }
  }

  // ELF / PIC requires GOT in the EBX register before function calls via PLT
  // GOT pointer.  
  if (Subtarget->isPICStyleGOT()) {
    TargetRegisterClass *RC = X86::GR32RegisterClass;
    unsigned Base = getInstrInfo()->getGlobalBaseReg(&MF);
    bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), X86::EBX, Base, RC, RC);
    assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
    Emitted = true;
  }
  
  // Issue the call.
  MachineInstrBuilder MIB;
  if (CalleeOp) {
    // Register-indirect call.
    unsigned CallOpc = Subtarget->is64Bit() ? X86::CALL64r : X86::CALL32r;
    MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addReg(CalleeOp);
    
  } else {
    // Direct call.
    assert(GV && "Not a direct call");
    unsigned CallOpc =
      Subtarget->is64Bit() ? X86::CALL64pcrel32 : X86::CALLpcrel32;
    
    // See if we need any target-specific flags on the GV operand.
    unsigned char OpFlags = 0;
    
    // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
    // external symbols most go through the PLT in PIC mode.  If the symbol
    // has hidden or protected visibility, or if it is static or local, then
    // we don't need to use the PLT - we can directly call it.
    if (Subtarget->isTargetELF() &&
        TM.getRelocationModel() == Reloc::PIC_ &&
        GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               (GV->isDeclaration() || GV->isWeakForLinker()) &&
               Subtarget->getDarwinVers() < 9) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }
    
    
    MIB = BuildMI(MBB, DL, TII.get(CallOpc)).addGlobalAddress(GV, 0, OpFlags);
  }

  // Add an implicit use GOT pointer in EBX.
  if (Subtarget->isPICStyleGOT())
    MIB.addReg(X86::EBX);

  // Add implicit physical register uses to the call.
  for (unsigned i = 0, e = RegArgs.size(); i != e; ++i)
    MIB.addReg(RegArgs[i]);

  // Issue CALLSEQ_END
  unsigned AdjStackUp = TM.getRegisterInfo()->getCallFrameDestroyOpcode();
  BuildMI(MBB, DL, TII.get(AdjStackUp)).addImm(NumBytes).addImm(0);

  // Now handle call return value (if any).
  if (RetVT.getSimpleVT().SimpleTy != MVT::isVoid) {
    SmallVector<CCValAssign, 16> RVLocs;
    CCState CCInfo(CC, false, TM, RVLocs, I->getParent()->getContext());
    CCInfo.AnalyzeCallResult(RetVT, RetCC_X86);

    // Copy all of the result registers out of their specified physreg.
    assert(RVLocs.size() == 1 && "Can't handle multi-value calls!");
    EVT CopyVT = RVLocs[0].getValVT();
    TargetRegisterClass* DstRC = TLI.getRegClassFor(CopyVT);
    TargetRegisterClass *SrcRC = DstRC;
    
    // If this is a call to a function that returns an fp value on the x87 fp
    // stack, but where we prefer to use the value in xmm registers, copy it
    // out as F80 and use a truncate to move it from fp stack reg to xmm reg.
    if ((RVLocs[0].getLocReg() == X86::ST0 ||
         RVLocs[0].getLocReg() == X86::ST1) &&
        isScalarFPTypeInSSEReg(RVLocs[0].getValVT())) {
      CopyVT = MVT::f80;
      SrcRC = X86::RSTRegisterClass;
      DstRC = X86::RFP80RegisterClass;
    }

    unsigned ResultReg = createResultReg(DstRC);
    bool Emitted = TII.copyRegToReg(*MBB, MBB->end(), ResultReg,
                                    RVLocs[0].getLocReg(), DstRC, SrcRC);
    assert(Emitted && "Failed to emit a copy instruction!"); Emitted=Emitted;
    Emitted = true;
    if (CopyVT != RVLocs[0].getValVT()) {
      // Round the F80 the right size, which also moves to the appropriate xmm
      // register. This is accomplished by storing the F80 value in memory and
      // then loading it back. Ewww...
      EVT ResVT = RVLocs[0].getValVT();
      unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
      unsigned MemSize = ResVT.getSizeInBits()/8;
      int FI = MFI.CreateStackObject(MemSize, MemSize, false);
      addFrameReference(BuildMI(MBB, DL, TII.get(Opc)), FI).addReg(ResultReg);
      DstRC = ResVT == MVT::f32
        ? X86::FR32RegisterClass : X86::FR64RegisterClass;
      Opc = ResVT == MVT::f32 ? X86::MOVSSrm : X86::MOVSDrm;
      ResultReg = createResultReg(DstRC);
      addFrameReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg), FI);
    }

    if (AndToI1) {
      // Mask out all but lowest bit for some call which produces an i1.
      unsigned AndResult = createResultReg(X86::GR8RegisterClass);
      BuildMI(MBB, DL, 
              TII.get(X86::AND8ri), AndResult).addReg(ResultReg).addImm(1);
      ResultReg = AndResult;
    }

    UpdateValueMap(I, ResultReg);
  }

  return true;
}
+
+
+/// TargetSelectInstruction - Attempt to fast-select the IR instruction I by
+/// dispatching on its opcode to the matching X86Select* helper.  Returns true
+/// if I was fully selected, false to let the caller fall back to the normal
+/// SelectionDAG-based instruction selector.
+bool
+X86FastISel::TargetSelectInstruction(Instruction *I)  {
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::Load:
+    return X86SelectLoad(I);
+  case Instruction::Store:
+    return X86SelectStore(I);
+  case Instruction::ICmp:
+  case Instruction::FCmp:
+    return X86SelectCmp(I);
+  case Instruction::ZExt:
+    return X86SelectZExt(I);
+  case Instruction::Br:
+    return X86SelectBranch(I);
+  case Instruction::Call:
+    return X86SelectCall(I);
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::Shl:
+    return X86SelectShift(I);
+  case Instruction::Select:
+    return X86SelectSelect(I);
+  case Instruction::Trunc:
+    return X86SelectTrunc(I);
+  case Instruction::FPExt:
+    return X86SelectFPExt(I);
+  case Instruction::FPTrunc:
+    return X86SelectFPTrunc(I);
+  case Instruction::ExtractValue:
+    return X86SelectExtractValue(I);
+  case Instruction::IntToPtr: // Deliberate fall-through.
+  case Instruction::PtrToInt: {
+    // Pointer<->int casts select as a zero-extend, a truncate, or a register
+    // reuse depending on the relative bit widths of the two types.
+    EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType());
+    EVT DstVT = TLI.getValueType(I->getType());
+    if (DstVT.bitsGT(SrcVT))
+      return X86SelectZExt(I);
+    if (DstVT.bitsLT(SrcVT))
+      return X86SelectTrunc(I);
+    // Same width: the cast is a no-op, reuse the source register.
+    unsigned Reg = getRegForValue(I->getOperand(0));
+    if (Reg == 0) return false;
+    UpdateValueMap(I, Reg);
+    return true;
+  }
+  }
+
+  return false;
+}
+
+/// TargetMaterializeConstant - Emit code materializing the constant C into a
+/// fresh virtual register.  Returns the result register, or 0 if the constant
+/// cannot be handled here (illegal type, f80, or unhandled global form).
+unsigned X86FastISel::TargetMaterializeConstant(Constant *C) {
+  EVT VT;
+  if (!isTypeLegal(C->getType(), VT))
+    return 0;  // 0 is the "no register" sentinel for an unsigned return.
+
+  // Get opcode and regclass of the output for the given load instruction.
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = NULL;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: return 0;
+  case MVT::i8:
+    Opc = X86::MOV8rm;
+    RC  = X86::GR8RegisterClass;
+    break;
+  case MVT::i16:
+    Opc = X86::MOV16rm;
+    RC  = X86::GR16RegisterClass;
+    break;
+  case MVT::i32:
+    Opc = X86::MOV32rm;
+    RC  = X86::GR32RegisterClass;
+    break;
+  case MVT::i64:
+    // Must be in x86-64 mode.
+    Opc = X86::MOV64rm;
+    RC  = X86::GR64RegisterClass;
+    break;
+  case MVT::f32:
+    if (Subtarget->hasSSE1()) {
+      Opc = X86::MOVSSrm;
+      RC  = X86::FR32RegisterClass;
+    } else {
+      Opc = X86::LD_Fp32m;
+      RC  = X86::RFP32RegisterClass;
+    }
+    break;
+  case MVT::f64:
+    if (Subtarget->hasSSE2()) {
+      Opc = X86::MOVSDrm;
+      RC  = X86::FR64RegisterClass;
+    } else {
+      Opc = X86::LD_Fp64m;
+      RC  = X86::RFP64RegisterClass;
+    }
+    break;
+  case MVT::f80:
+    // No f80 support yet.
+    return 0;
+  }
+
+  // Materialize addresses with LEA instructions.
+  if (isa<GlobalValue>(C)) {
+    X86AddressMode AM;
+    if (X86SelectAddress(C, AM)) {
+      if (TLI.getPointerTy() == MVT::i32)
+        Opc = X86::LEA32r;
+      else
+        Opc = X86::LEA64r;
+      unsigned ResultReg = createResultReg(RC);
+      addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+      return ResultReg;
+    }
+    return 0;
+  }
+
+  // MachineConstantPool wants an explicit alignment.
+  unsigned Align = TD.getPrefTypeAlignment(C->getType());
+  if (Align == 0) {
+    // Alignment of vector types.  FIXME!
+    Align = TD.getTypeAllocSize(C->getType());
+  }
+
+  // x86-32 PIC requires a PIC base register for constant pools.
+  unsigned PICBase = 0;
+  unsigned char OpFlag = 0;
+  if (Subtarget->isPICStyleStubPIC()) { // Not dynamic-no-pic
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+    PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+  } else if (Subtarget->isPICStyleGOT()) {
+    OpFlag = X86II::MO_GOTOFF;
+    PICBase = getInstrInfo()->getGlobalBaseReg(&MF);
+  } else if (Subtarget->isPICStyleRIPRel() &&
+             TM.getCodeModel() == CodeModel::Small) {
+    PICBase = X86::RIP;
+  }
+
+  // Create the load from the constant pool.
+  unsigned MCPOffset = MCP.getConstantPoolIndex(C, Align);
+  unsigned ResultReg = createResultReg(RC);
+  addConstantPoolReference(BuildMI(MBB, DL, TII.get(Opc), ResultReg),
+                           MCPOffset, PICBase, OpFlag);
+
+  return ResultReg;
+}
+
+/// TargetMaterializeAlloca - Compute the address of a static alloca into a
+/// register with an LEA; returns the result register, or 0 on failure.
+unsigned X86FastISel::TargetMaterializeAlloca(AllocaInst *C) {
+  // Fail on dynamic allocas. At this point, getRegForValue has already
+  // checked its CSE maps, so if we're here trying to handle a dynamic
+  // alloca, we're not going to succeed. X86SelectAddress has a
+  // check for dynamic allocas, because it's called directly from
+  // various places, but TargetMaterializeAlloca also needs a check
+  // in order to avoid recursion between getRegForValue,
+  // X86SelectAddress, and TargetMaterializeAlloca.
+  if (!StaticAllocaMap.count(C))
+    return 0;
+
+  X86AddressMode AM;
+  if (!X86SelectAddress(C, AM))
+    return 0;
+  // The alloca's address is a frame index; compute it with LEA of the
+  // pointer-sized register class.
+  unsigned Opc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
+  TargetRegisterClass* RC = TLI.getRegClassFor(TLI.getPointerTy());
+  unsigned ResultReg = createResultReg(RC);
+  addLeaAddress(BuildMI(MBB, DL, TII.get(Opc), ResultReg), AM);
+  return ResultReg;
+}
+
+namespace llvm {
+  /// X86::createFastISel - Factory entry point for the X86 fast instruction
+  /// selector; forwards the per-function state maps (value map, basic block
+  /// map, alloca map) straight to the X86FastISel constructor.
+  llvm::FastISel *X86::createFastISel(MachineFunction &mf,
+                        MachineModuleInfo *mmi,
+                        DwarfWriter *dw,
+                        DenseMap<const Value *, unsigned> &vm,
+                        DenseMap<const BasicBlock *, MachineBasicBlock *> &bm,
+                        DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+                        , SmallSet<Instruction*, 8> &cil
+#endif
+                        ) {
+    return new X86FastISel(mf, mmi, dw, vm, bm, am
+#ifndef NDEBUG
+                           , cil
+#endif
+                           );
+  }
+}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
new file mode 100644
index 0000000..6d6fe77
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -0,0 +1,1207 @@
+//===-- X86FloatingPoint.cpp - Floating point Reg -> Stack converter ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which converts floating point instructions from
+// virtual registers into register stack instructions.  This pass uses live
+// variable information to indicate where the FPn registers are used and their
+// lifetimes.
+//
+// This pass is hampered by the lack of decent CFG manipulation routines for
+// machine code.  In particular, this wants to be able to split critical edges
+// as necessary, traverse the machine basic block CFG in depth-first order, and
+// allow there to be multiple machine basic blocks for each LLVM basicblock
+// (needed for critical edge splitting).
+//
+// In particular, this pass currently barfs on critical edges.  Because of this,
+// it requires the instruction selector to insert FP_REG_KILL instructions on
+// the exits of any basic block that has critical edges going from it, or which
+// branch to a critical basic block.
+//
+// FIXME: this is not implemented yet.  The stackifier pass only works on local
+// basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumFXCH, "Number of fxch instructions inserted");
+STATISTIC(NumFP  , "Number of floating point instructions");
+
+namespace {
+  /// FPS - The FP stackifier pass.  Rewrites abstract FP0-FP6 virtual stack
+  /// register references into concrete x87 register-stack (ST(i))
+  /// instructions, modeling the hardware stack with Stack[], RegMap[] and
+  /// StackTop as it walks each block.
+  struct FPS : public MachineFunctionPass {
+    static char ID;
+    FPS() : MachineFunctionPass(&ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const { return "X86 FP Stackifier"; }
+
+  private:
+    const TargetInstrInfo *TII; // Machine instruction info.
+    MachineBasicBlock *MBB;     // Current basic block
+    unsigned Stack[8];          // FP<n> Registers in each stack slot...
+    unsigned RegMap[8];         // Track which stack slot contains each register
+    unsigned StackTop;          // The current top of the FP stack.
+
+    // dumpStack - Print the simulated stack and sanity-check that Stack[]
+    // and RegMap[] agree.
+    void dumpStack() const {
+      dbgs() << "Stack contents:";
+      for (unsigned i = 0; i != StackTop; ++i) {
+        dbgs() << " FP" << Stack[i];
+        assert(RegMap[Stack[i]] == i && "Stack[] doesn't match RegMap[]!");
+      }
+      dbgs() << "\n";
+    }
+  private:
+    /// isStackEmpty - Return true if the FP stack is empty.
+    bool isStackEmpty() const {
+      return StackTop == 0;
+    }
+    
+    // getSlot - Return the stack slot number a particular register number is
+    // in.
+    unsigned getSlot(unsigned RegNo) const {
+      assert(RegNo < 8 && "Regno out of range!");
+      return RegMap[RegNo];
+    }
+
+    // getStackEntry - Return the X86::FP<n> register in register ST(i).
+    unsigned getStackEntry(unsigned STi) const {
+      assert(STi < StackTop && "Access past stack top!");
+      return Stack[StackTop-1-STi];
+    }
+
+    // getSTReg - Return the X86::ST(i) register which contains the specified
+    // FP<RegNo> register.
+    unsigned getSTReg(unsigned RegNo) const {
+      return StackTop - 1 - getSlot(RegNo) + llvm::X86::ST0;
+    }
+
+    // pushReg - Push the specified FP<n> register onto the stack.
+    void pushReg(unsigned Reg) {
+      assert(Reg < 8 && "Register number out of range!");
+      assert(StackTop < 8 && "Stack overflow!");
+      Stack[StackTop] = Reg;
+      RegMap[Reg] = StackTop++;
+    }
+
+    // isAtTop - Return true if RegNo currently lives in ST(0).
+    bool isAtTop(unsigned RegNo) const { return getSlot(RegNo) == StackTop-1; }
+    // moveToTop - Bring RegNo to ST(0), emitting an FXCH before I if it is
+    // not already there, and updating the simulated stack to match.
+    void moveToTop(unsigned RegNo, MachineBasicBlock::iterator I) {
+      MachineInstr *MI = I;
+      DebugLoc dl = MI->getDebugLoc();
+      if (isAtTop(RegNo)) return;
+      
+      unsigned STReg = getSTReg(RegNo);
+      unsigned RegOnTop = getStackEntry(0);
+
+      // Swap the slots the regs are in.
+      std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+
+      // Swap stack slot contents.
+      assert(RegMap[RegOnTop] < StackTop);
+      std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+
+      // Emit an fxch to update the runtime processors version of the state.
+      BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(STReg);
+      NumFXCH++;
+    }
+
+    // duplicateToTop - Push a copy of RegNo onto the top of the stack with an
+    // FLD, tracking the new copy under the name AsReg.
+    void duplicateToTop(unsigned RegNo, unsigned AsReg, MachineInstr *I) {
+      DebugLoc dl = I->getDebugLoc();
+      unsigned STReg = getSTReg(RegNo);
+      pushReg(AsReg);   // New register on top of stack
+
+      BuildMI(*MBB, I, dl, TII->get(X86::LD_Frr)).addReg(STReg);
+    }
+
+    // popStackAfter - Pop the current value off of the top of the FP stack
+    // after the specified instruction.
+    void popStackAfter(MachineBasicBlock::iterator &I);
+
+    // freeStackSlotAfter - Free the specified register from the register stack,
+    // so that it is no longer in a register.  If the register is currently at
+    // the top of the stack, we just pop the current instruction, otherwise we
+    // store the current top-of-stack into the specified slot, then pop the top
+    // of stack.
+    void freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned Reg);
+
+    bool processBasicBlock(MachineFunction &MF, MachineBasicBlock &MBB);
+
+    // Per-category handlers invoked from processBasicBlock, keyed on the
+    // X86II::FPTypeMask bits of each instruction's TSFlags.
+    void handleZeroArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFP(MachineBasicBlock::iterator &I);
+    void handleOneArgFPRW(MachineBasicBlock::iterator &I);
+    void handleTwoArgFP(MachineBasicBlock::iterator &I);
+    void handleCompareFP(MachineBasicBlock::iterator &I);
+    void handleCondMovFP(MachineBasicBlock::iterator &I);
+    void handleSpecialFP(MachineBasicBlock::iterator &I);
+  };
+  char FPS::ID = 0;
+}
+
+/// createX86FloatingPointStackifierPass - Public factory for the FP
+/// stackifier pass.
+FunctionPass *llvm::createX86FloatingPointStackifierPass() { return new FPS(); }
+
+/// getFPReg - Map an X86::FP<n> register operand to its index n.
+/// For example, this returns 3 for X86::FP3.
+static unsigned getFPReg(const MachineOperand &MO) {
+  assert(MO.isReg() && "Expected an FP register!");
+  const unsigned PhysReg = MO.getReg();
+  assert(PhysReg >= X86::FP0 && PhysReg <= X86::FP6 && "Expected FP register!");
+  return PhysReg - X86::FP0;
+}
+
+
+/// runOnMachineFunction - Loop over all of the basic blocks, transforming FP
+/// register references into FP stack references.
+///
+bool FPS::runOnMachineFunction(MachineFunction &MF) {
+  // We only need to run this pass if there are any FP registers used in this
+  // function.  If it is all integer, there is nothing for us to do!
+  bool FPIsUsed = false;
+
+  assert(X86::FP6 == X86::FP0+6 && "Register enums aren't sorted right!");
+  for (unsigned i = 0; i <= 6; ++i)
+    if (MF.getRegInfo().isPhysRegUsed(X86::FP0+i)) {
+      FPIsUsed = true;
+      break;
+    }
+
+  // Early exit.
+  if (!FPIsUsed) return false;
+
+  TII = MF.getTarget().getInstrInfo();
+  StackTop = 0;
+
+  // Process the function in depth first order so that we process at least one
+  // of the predecessors for every reachable block in the function.
+  SmallPtrSet<MachineBasicBlock*, 8> Processed;
+  MachineBasicBlock *Entry = MF.begin();
+
+  bool Changed = false;
+  for (df_ext_iterator<MachineBasicBlock*, SmallPtrSet<MachineBasicBlock*, 8> >
+         I = df_ext_begin(Entry, Processed), E = df_ext_end(Entry, Processed);
+       I != E; ++I)
+    Changed |= processBasicBlock(MF, **I);
+
+  // Process any unreachable blocks in arbitrary order now.
+  if (MF.size() == Processed.size())
+    return Changed;
+
+  // Processed.insert returns true only for blocks the DFS above missed.
+  for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
+    if (Processed.insert(BB))
+      Changed |= processBasicBlock(MF, *BB);
+  
+  return Changed;
+}
+
+/// processBasicBlock - Loop over all of the instructions in the basic block,
+/// transforming FP instructions into their stack form.
+///
+bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
+  bool Changed = false;
+  MBB = &BB;
+
+  for (MachineBasicBlock::iterator I = BB.begin(); I != BB.end(); ++I) {
+    MachineInstr *MI = I;
+    unsigned Flags = MI->getDesc().TSFlags;
+    
+    unsigned FPInstClass = Flags & X86II::FPTypeMask;
+    // Inline asm may touch FP registers arbitrarily; route it through the
+    // SpecialFP handler.
+    if (MI->isInlineAsm())
+      FPInstClass = X86II::SpecialFP;
+    
+    if (FPInstClass == X86II::NotFP)
+      continue;  // Efficiently ignore non-fp insts!
+
+    // Remember the instruction before MI so the debug dump below can locate
+    // the range of instructions inserted while handling it.
+    MachineInstr *PrevMI = 0;
+    if (I != BB.begin())
+      PrevMI = prior(I);
+
+    ++NumFP;  // Keep track of # of pseudo instrs
+    DEBUG(dbgs() << "\nFPInst:\t" << *MI);
+
+    // Get dead variables list now because the MI pointer may be deleted as part
+    // of processing!
+    SmallVector<unsigned, 8> DeadRegs;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      const MachineOperand &MO = MI->getOperand(i);
+      if (MO.isReg() && MO.isDead())
+        DeadRegs.push_back(MO.getReg());
+    }
+
+    switch (FPInstClass) {
+    case X86II::ZeroArgFP:  handleZeroArgFP(I); break;
+    case X86II::OneArgFP:   handleOneArgFP(I);  break;  // fstp ST(0)
+    case X86II::OneArgFPRW: handleOneArgFPRW(I); break; // ST(0) = fsqrt(ST(0))
+    case X86II::TwoArgFP:   handleTwoArgFP(I);  break;
+    case X86II::CompareFP:  handleCompareFP(I); break;
+    case X86II::CondMovFP:  handleCondMovFP(I); break;
+    case X86II::SpecialFP:  handleSpecialFP(I); break;
+    default: llvm_unreachable("Unknown FP Type!");
+    }
+
+    // Check to see if any of the values defined by this instruction are dead
+    // after definition.  If so, pop them.
+    for (unsigned i = 0, e = DeadRegs.size(); i != e; ++i) {
+      unsigned Reg = DeadRegs[i];
+      if (Reg >= X86::FP0 && Reg <= X86::FP6) {
+        DEBUG(dbgs() << "Register FP#" << Reg-X86::FP0 << " is dead!\n");
+        freeStackSlotAfter(I, Reg-X86::FP0);
+      }
+    }
+
+    // Print out all of the instructions expanded to if -debug
+    // NOTE(review): the trailing empty while loop below only advances Start
+    // past the inserted range -- it looks like only the first inserted
+    // instruction is actually printed.  Confirm whether that is intended.
+    DEBUG(
+      MachineBasicBlock::iterator PrevI(PrevMI);
+      if (I == PrevI) {
+        dbgs() << "Just deleted pseudo instruction\n";
+      } else {
+        MachineBasicBlock::iterator Start = I;
+        // Rewind to first instruction newly inserted.
+        while (Start != BB.begin() && prior(Start) != PrevI) --Start;
+        dbgs() << "Inserted instructions:\n\t";
+        Start->print(dbgs(), &MF.getTarget());
+        while (++Start != llvm::next(I)) {}
+      }
+      dumpStack();
+    );
+
+    Changed = true;
+  }
+
+  assert(isStackEmpty() && "Stack not empty at end of basic block?");
+  return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Efficient Lookup Table Support
+//===----------------------------------------------------------------------===//
+
+namespace {
+  // TableEntry - One row of an opcode translation table: maps the 'from'
+  // opcode to the 'to' opcode.  The comparison operators order entries (and
+  // allow mixed entry/opcode comparisons) by 'from', so a table of these can
+  // be binary searched with std::lower_bound.
+  struct TableEntry {
+    unsigned from;
+    unsigned to;
+    bool operator<(const TableEntry &TE) const { return from < TE.from; }
+    friend bool operator<(const TableEntry &TE, unsigned V) {
+      return TE.from < V;
+    }
+    friend bool operator<(unsigned V, const TableEntry &TE) {
+      return V < TE.from;
+    }
+  };
+}
+
+#ifndef NDEBUG
+/// TableIsSorted - Return true if every entry is strictly less than its
+/// successor, i.e. the table is sorted with no duplicate 'from' opcodes.
+static bool TableIsSorted(const TableEntry *Table, unsigned NumEntries) {
+  // Start from 1 and compare adjacent pairs: this avoids the unsigned
+  // underflow of 'NumEntries-1' (and the resulting out-of-bounds reads)
+  // when NumEntries == 0.
+  for (unsigned i = 1; i < NumEntries; ++i)
+    if (!(Table[i-1] < Table[i]))
+      return false;
+  return true;
+}
+#endif
+
+/// Lookup - Binary-search the sorted Table (of N entries) for Opcode.
+/// Returns the mapped 'to' opcode, or -1 if Opcode has no entry.
+static int Lookup(const TableEntry *Table, unsigned N, unsigned Opcode) {
+  const TableEntry *End = Table + N;
+  const TableEntry *Pos = std::lower_bound(Table, End, Opcode);
+  if (Pos == End || Pos->from != Opcode)
+    return -1;
+  return Pos->to;
+}
+
+// ASSERT_SORTED(TABLE) - Debug-only check, performed once per table (cached
+// in a function-local static flag), that TABLE is sorted so Lookup's binary
+// search is valid.  Expands to nothing when NDEBUG is defined.
+#ifdef NDEBUG
+#define ASSERT_SORTED(TABLE)
+#else
+#define ASSERT_SORTED(TABLE)                                              \
+  { static bool TABLE##Checked = false;                                   \
+    if (!TABLE##Checked) {                                                \
+       assert(TableIsSorted(TABLE, array_lengthof(TABLE)) &&              \
+              "All lookup tables must be sorted for efficient access!");  \
+       TABLE##Checked = true;                                             \
+    }                                                                     \
+  }
+#endif
+
+//===----------------------------------------------------------------------===//
+// Register File -> Register Stack Mapping Methods
+//===----------------------------------------------------------------------===//
+
+// OpcodeTable - Sorted map of register instructions to their stack version.
+// The first element is an register file pseudo instruction, the second is the
+// concrete X86 instruction which uses the register stack.
+//
+// Note: entries must remain sorted by the first opcode; Lookup binary
+// searches this table, and ASSERT_SORTED verifies the order in debug builds.
+//
+static const TableEntry OpcodeTable[] = {
+  { X86::ABS_Fp32     , X86::ABS_F     },
+  { X86::ABS_Fp64     , X86::ABS_F     },
+  { X86::ABS_Fp80     , X86::ABS_F     },
+  { X86::ADD_Fp32m    , X86::ADD_F32m  },
+  { X86::ADD_Fp64m    , X86::ADD_F64m  },
+  { X86::ADD_Fp64m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m32  , X86::ADD_F32m  },
+  { X86::ADD_Fp80m64  , X86::ADD_F64m  },
+  { X86::ADD_FpI16m32 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m64 , X86::ADD_FI16m },
+  { X86::ADD_FpI16m80 , X86::ADD_FI16m },
+  { X86::ADD_FpI32m32 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m64 , X86::ADD_FI32m },
+  { X86::ADD_FpI32m80 , X86::ADD_FI32m },
+  { X86::CHS_Fp32     , X86::CHS_F     },
+  { X86::CHS_Fp64     , X86::CHS_F     },
+  { X86::CHS_Fp80     , X86::CHS_F     },
+  { X86::CMOVBE_Fp32  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp64  , X86::CMOVBE_F  },
+  { X86::CMOVBE_Fp80  , X86::CMOVBE_F  },
+  { X86::CMOVB_Fp32   , X86::CMOVB_F   },
+  { X86::CMOVB_Fp64   , X86::CMOVB_F   },
+  { X86::CMOVB_Fp80   , X86::CMOVB_F   },
+  { X86::CMOVE_Fp32   , X86::CMOVE_F   },
+  { X86::CMOVE_Fp64   , X86::CMOVE_F   },
+  { X86::CMOVE_Fp80   , X86::CMOVE_F   },
+  { X86::CMOVNBE_Fp32 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp64 , X86::CMOVNBE_F },
+  { X86::CMOVNBE_Fp80 , X86::CMOVNBE_F },
+  { X86::CMOVNB_Fp32  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp64  , X86::CMOVNB_F  },
+  { X86::CMOVNB_Fp80  , X86::CMOVNB_F  },
+  { X86::CMOVNE_Fp32  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp64  , X86::CMOVNE_F  },
+  { X86::CMOVNE_Fp80  , X86::CMOVNE_F  },
+  { X86::CMOVNP_Fp32  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp64  , X86::CMOVNP_F  },
+  { X86::CMOVNP_Fp80  , X86::CMOVNP_F  },
+  { X86::CMOVP_Fp32   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp64   , X86::CMOVP_F   },
+  { X86::CMOVP_Fp80   , X86::CMOVP_F   },
+  { X86::COS_Fp32     , X86::COS_F     },
+  { X86::COS_Fp64     , X86::COS_F     },
+  { X86::COS_Fp80     , X86::COS_F     },
+  { X86::DIVR_Fp32m   , X86::DIVR_F32m },
+  { X86::DIVR_Fp64m   , X86::DIVR_F64m },
+  { X86::DIVR_Fp64m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m32 , X86::DIVR_F32m },
+  { X86::DIVR_Fp80m64 , X86::DIVR_F64m },
+  { X86::DIVR_FpI16m32, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m64, X86::DIVR_FI16m},
+  { X86::DIVR_FpI16m80, X86::DIVR_FI16m},
+  { X86::DIVR_FpI32m32, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m64, X86::DIVR_FI32m},
+  { X86::DIVR_FpI32m80, X86::DIVR_FI32m},
+  { X86::DIV_Fp32m    , X86::DIV_F32m  },
+  { X86::DIV_Fp64m    , X86::DIV_F64m  },
+  { X86::DIV_Fp64m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m32  , X86::DIV_F32m  },
+  { X86::DIV_Fp80m64  , X86::DIV_F64m  },
+  { X86::DIV_FpI16m32 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m64 , X86::DIV_FI16m },
+  { X86::DIV_FpI16m80 , X86::DIV_FI16m },
+  { X86::DIV_FpI32m32 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m64 , X86::DIV_FI32m },
+  { X86::DIV_FpI32m80 , X86::DIV_FI32m },
+  { X86::ILD_Fp16m32  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m64  , X86::ILD_F16m  },
+  { X86::ILD_Fp16m80  , X86::ILD_F16m  },
+  { X86::ILD_Fp32m32  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m64  , X86::ILD_F32m  },
+  { X86::ILD_Fp32m80  , X86::ILD_F32m  },
+  { X86::ILD_Fp64m32  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m64  , X86::ILD_F64m  },
+  { X86::ILD_Fp64m80  , X86::ILD_F64m  },
+  { X86::ISTT_Fp16m32 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m64 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp16m80 , X86::ISTT_FP16m},
+  { X86::ISTT_Fp32m32 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m64 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp32m80 , X86::ISTT_FP32m},
+  { X86::ISTT_Fp64m32 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m64 , X86::ISTT_FP64m},
+  { X86::ISTT_Fp64m80 , X86::ISTT_FP64m},
+  { X86::IST_Fp16m32  , X86::IST_F16m  },
+  { X86::IST_Fp16m64  , X86::IST_F16m  },
+  { X86::IST_Fp16m80  , X86::IST_F16m  },
+  { X86::IST_Fp32m32  , X86::IST_F32m  },
+  { X86::IST_Fp32m64  , X86::IST_F32m  },
+  { X86::IST_Fp32m80  , X86::IST_F32m  },
+  { X86::IST_Fp64m32  , X86::IST_FP64m },
+  { X86::IST_Fp64m64  , X86::IST_FP64m },
+  { X86::IST_Fp64m80  , X86::IST_FP64m },
+  { X86::LD_Fp032     , X86::LD_F0     },
+  { X86::LD_Fp064     , X86::LD_F0     },
+  { X86::LD_Fp080     , X86::LD_F0     },
+  { X86::LD_Fp132     , X86::LD_F1     },
+  { X86::LD_Fp164     , X86::LD_F1     },
+  { X86::LD_Fp180     , X86::LD_F1     },
+  { X86::LD_Fp32m     , X86::LD_F32m   },
+  { X86::LD_Fp32m64   , X86::LD_F32m   },
+  { X86::LD_Fp32m80   , X86::LD_F32m   },
+  { X86::LD_Fp64m     , X86::LD_F64m   },
+  { X86::LD_Fp64m80   , X86::LD_F64m   },
+  { X86::LD_Fp80m     , X86::LD_F80m   },
+  { X86::MUL_Fp32m    , X86::MUL_F32m  },
+  { X86::MUL_Fp64m    , X86::MUL_F64m  },
+  { X86::MUL_Fp64m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m32  , X86::MUL_F32m  },
+  { X86::MUL_Fp80m64  , X86::MUL_F64m  },
+  { X86::MUL_FpI16m32 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m64 , X86::MUL_FI16m },
+  { X86::MUL_FpI16m80 , X86::MUL_FI16m },
+  { X86::MUL_FpI32m32 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m64 , X86::MUL_FI32m },
+  { X86::MUL_FpI32m80 , X86::MUL_FI32m },
+  { X86::SIN_Fp32     , X86::SIN_F     },
+  { X86::SIN_Fp64     , X86::SIN_F     },
+  { X86::SIN_Fp80     , X86::SIN_F     },
+  { X86::SQRT_Fp32    , X86::SQRT_F    },
+  { X86::SQRT_Fp64    , X86::SQRT_F    },
+  { X86::SQRT_Fp80    , X86::SQRT_F    },
+  { X86::ST_Fp32m     , X86::ST_F32m   },
+  { X86::ST_Fp64m     , X86::ST_F64m   },
+  { X86::ST_Fp64m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m32   , X86::ST_F32m   },
+  { X86::ST_Fp80m64   , X86::ST_F64m   },
+  { X86::ST_FpP80m    , X86::ST_FP80m  },
+  { X86::SUBR_Fp32m   , X86::SUBR_F32m },
+  { X86::SUBR_Fp64m   , X86::SUBR_F64m },
+  { X86::SUBR_Fp64m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m32 , X86::SUBR_F32m },
+  { X86::SUBR_Fp80m64 , X86::SUBR_F64m },
+  { X86::SUBR_FpI16m32, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m64, X86::SUBR_FI16m},
+  { X86::SUBR_FpI16m80, X86::SUBR_FI16m},
+  { X86::SUBR_FpI32m32, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m64, X86::SUBR_FI32m},
+  { X86::SUBR_FpI32m80, X86::SUBR_FI32m},
+  { X86::SUB_Fp32m    , X86::SUB_F32m  },
+  { X86::SUB_Fp64m    , X86::SUB_F64m  },
+  { X86::SUB_Fp64m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m32  , X86::SUB_F32m  },
+  { X86::SUB_Fp80m64  , X86::SUB_F64m  },
+  { X86::SUB_FpI16m32 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m64 , X86::SUB_FI16m },
+  { X86::SUB_FpI16m80 , X86::SUB_FI16m },
+  { X86::SUB_FpI32m32 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m64 , X86::SUB_FI32m },
+  { X86::SUB_FpI32m80 , X86::SUB_FI32m },
+  { X86::TST_Fp32     , X86::TST_F     },
+  { X86::TST_Fp64     , X86::TST_F     },
+  { X86::TST_Fp80     , X86::TST_F     },
+  { X86::UCOM_FpIr32  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr64  , X86::UCOM_FIr  },
+  { X86::UCOM_FpIr80  , X86::UCOM_FIr  },
+  { X86::UCOM_Fpr32   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr64   , X86::UCOM_Fr   },
+  { X86::UCOM_Fpr80   , X86::UCOM_Fr   },
+};
+
+/// getConcreteOpcode - Translate an FP register-file pseudo opcode into the
+/// corresponding concrete register-stack opcode via OpcodeTable.
+static unsigned getConcreteOpcode(unsigned Opcode) {
+  ASSERT_SORTED(OpcodeTable);
+  const int Result = Lookup(OpcodeTable, array_lengthof(OpcodeTable), Opcode);
+  assert(Result != -1 && "FP Stack instruction not in OpcodeTable!");
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Methods
+//===----------------------------------------------------------------------===//
+
+// PopTable - Sorted map of instructions to their popping version.  The first
+// element is an instruction, the second is the version which pops.
+//
+// Note: like OpcodeTable, this must stay sorted by the first opcode so
+// Lookup's binary search works (ASSERT_SORTED checks it in debug builds).
+//
+static const TableEntry PopTable[] = {
+  { X86::ADD_FrST0 , X86::ADD_FPrST0  },
+
+  { X86::DIVR_FrST0, X86::DIVR_FPrST0 },
+  { X86::DIV_FrST0 , X86::DIV_FPrST0  },
+
+  { X86::IST_F16m  , X86::IST_FP16m   },
+  { X86::IST_F32m  , X86::IST_FP32m   },
+
+  { X86::MUL_FrST0 , X86::MUL_FPrST0  },
+
+  { X86::ST_F32m   , X86::ST_FP32m    },
+  { X86::ST_F64m   , X86::ST_FP64m    },
+  { X86::ST_Frr    , X86::ST_FPrr     },
+
+  { X86::SUBR_FrST0, X86::SUBR_FPrST0 },
+  { X86::SUB_FrST0 , X86::SUB_FPrST0  },
+
+  { X86::UCOM_FIr  , X86::UCOM_FIPr   },
+
+  { X86::UCOM_FPr  , X86::UCOM_FPPr   },
+  { X86::UCOM_Fr   , X86::UCOM_FPr    },
+};
+
+/// popStackAfter - Pop the current value off of the top of the FP stack after
+/// the specified instruction.  This attempts to be sneaky and combine the pop
+/// into the instruction itself if possible.  The iterator is left pointing to
+/// the last instruction, be it a new pop instruction inserted, or the old
+/// instruction if it was modified in place.
+///
+void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
+  MachineInstr* MI = I;
+  DebugLoc dl = MI->getDebugLoc();
+  ASSERT_SORTED(PopTable);
+  assert(StackTop > 0 && "Cannot pop empty stack!");
+  RegMap[Stack[--StackTop]] = ~0;     // Update state
+
+  // Check to see if there is a popping version of this instruction...
+  int Opcode = Lookup(PopTable, array_lengthof(PopTable), I->getOpcode());
+  if (Opcode != -1) {
+    I->setDesc(TII->get(Opcode));
+    // NOTE(review): UCOM_FPPr apparently takes no explicit register operand,
+    // hence the removal when converting from the one-operand form -- confirm
+    // against the instruction definition.
+    if (Opcode == X86::UCOM_FPPr)
+      I->RemoveOperand(0);
+  } else {    // Insert an explicit pop
+    // No popping variant exists: emit "fstp st(0)" to discard the top.
+    I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
+  }
+}
+
+/// freeStackSlotAfter - Free the specified register from the register stack, so
+/// that it is no longer in a register.  If the register is currently at the top
+/// of the stack, we just pop the current instruction, otherwise we store the
+/// current top-of-stack into the specified slot, then pop the top of stack.
+void FPS::freeStackSlotAfter(MachineBasicBlock::iterator &I, unsigned FPRegNo) {
+  if (getStackEntry(0) == FPRegNo) {  // already at the top of stack? easy.
+    popStackAfter(I);
+    return;
+  }
+
+  // Otherwise, store the top of stack into the dead slot, killing the operand
+  // without having to add in an explicit xchg then pop.
+  //
+  unsigned STReg    = getSTReg(FPRegNo);
+  unsigned OldSlot  = getSlot(FPRegNo);
+  unsigned TopReg   = Stack[StackTop-1];
+  // Move the top-of-stack value into the slot being freed and shrink the
+  // stack by one; ~0 marks the vacated entries as invalid.
+  Stack[OldSlot]    = TopReg;
+  RegMap[TopReg]    = OldSlot;
+  RegMap[FPRegNo]   = ~0;
+  Stack[--StackTop] = ~0;
+  MachineInstr *MI  = I;
+  DebugLoc dl = MI->getDebugLoc();
+  // "fstp st(i)" stores ST(0) into st(i) and pops, matching the bookkeeping
+  // updates above.
+  I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(STReg);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Instruction transformation implementation
+//===----------------------------------------------------------------------===//
+
+/// handleZeroArgFP - ST(0) = fld0    ST(0) = flds <mem>
+///
+/// Handles instructions that read no FP stack operands and push one result.
+void FPS::handleZeroArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned DestReg = getFPReg(MI->getOperand(0));
+
+  // Change from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(0);   // Remove the explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+  
+  // Result gets pushed on the stack.
+  pushReg(DestReg);
+}
+
+/// handleOneArgFP - fst <mem>, ST(0)
+///
+/// Handles instructions that read one FP stack operand (possibly popping it)
+/// and produce no FP stack result, e.g. stores to memory and ftst.
+void FPS::handleOneArgFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert((NumOps == X86AddrNumOperands + 1 || NumOps == 1) &&
+         "Can only handle fst* & ftst instructions!");
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(NumOps-1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  // FISTP64m is strange because there isn't a non-popping versions.
+  // If we have one _and_ we don't want to pop the operand, duplicate the value
+  // on the stack instead of moving it.  This ensure that popping the value is
+  // always ok.
+  // Ditto FISTTP16m, FISTTP32m, FISTTP64m, ST_FpP80m.
+  //
+  if (!KillsSrc &&
+      (MI->getOpcode() == X86::IST_Fp64m32 ||
+       MI->getOpcode() == X86::ISTT_Fp16m32 ||
+       MI->getOpcode() == X86::ISTT_Fp32m32 ||
+       MI->getOpcode() == X86::ISTT_Fp64m32 ||
+       MI->getOpcode() == X86::IST_Fp64m64 ||
+       MI->getOpcode() == X86::ISTT_Fp16m64 ||
+       MI->getOpcode() == X86::ISTT_Fp32m64 ||
+       MI->getOpcode() == X86::ISTT_Fp64m64 ||
+       MI->getOpcode() == X86::IST_Fp64m80 ||
+       MI->getOpcode() == X86::ISTT_Fp16m80 ||
+       MI->getOpcode() == X86::ISTT_Fp32m80 ||
+       MI->getOpcode() == X86::ISTT_Fp64m80 ||
+       MI->getOpcode() == X86::ST_FpP80m)) {
+    duplicateToTop(Reg, 7 /*temp register*/, I);
+  } else {
+    moveToTop(Reg, I);            // Move to the top of the stack...
+  }
+  
+  // Convert from the pseudo instruction to the concrete instruction.
+  MI->RemoveOperand(NumOps-1);    // Remove explicit ST(0) operand
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // These concrete opcodes always pop; update the simulated stack directly
+  // rather than emitting a separate pop instruction.
+  if (MI->getOpcode() == X86::IST_FP64m ||
+      MI->getOpcode() == X86::ISTT_FP16m ||
+      MI->getOpcode() == X86::ISTT_FP32m ||
+      MI->getOpcode() == X86::ISTT_FP64m ||
+      MI->getOpcode() == X86::ST_FP80m) {
+    assert(StackTop > 0 && "Stack empty??");
+    --StackTop;
+  } else if (KillsSrc) { // Last use of operand?
+    popStackAfter(I);
+  }
+}
+
+
+/// handleOneArgFPRW: Handle instructions that read from the top of stack and
+/// replace the value with a newly computed value.  These instructions may have
+/// non-fp operands after their FP operands.
+///
+///  Examples:
+///     R1 = fchs R2
+///     R1 = fadd R2, [mem]
+///
+void FPS::handleOneArgFPRW(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+#ifndef NDEBUG
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  assert(NumOps >= 2 && "FPRW instructions must have 2 ops!!");
+#endif
+
+  // Is this the last use of the source register?
+  unsigned Reg = getFPReg(MI->getOperand(1));
+  bool KillsSrc = MI->killsRegister(X86::FP0+Reg);
+
+  if (KillsSrc) {
+    // If this is the last use of the source register, just make sure it's on
+    // the top of the stack.
+    moveToTop(Reg, I);
+    assert(StackTop > 0 && "Stack cannot be empty!");
+    // Pop the source and push the destination: the physical stack slot is
+    // reused in place, only the FP-register bookkeeping changes.
+    --StackTop;
+    pushReg(getFPReg(MI->getOperand(0)));
+  } else {
+    // If this is not the last use of the source register, _copy_ it to the top
+    // of the stack.
+    duplicateToTop(Reg, getFPReg(MI->getOperand(0)), I);
+  }
+
+  // Change from the pseudo instruction to the concrete instruction.
+  // Order matters: removing operand 1 first keeps operand 0's index valid.
+  MI->RemoveOperand(1);   // Drop the source operand.
+  MI->RemoveOperand(0);   // Drop the destination operand.
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define tables of various ways to map pseudo instructions
+//
+// NOTE: Each table must be kept sorted by its first (pseudo opcode) field,
+// because Lookup does a binary search (verified by ASSERT_SORTED).  All four
+// tables must also contain exactly the same pseudo opcodes, since
+// handleTwoArgFP selects a table at runtime but looks up the same opcode.
+//
+
+// ForwardST0Table - Map: A = B op C  into: ST(0) = ST(0) op ST(i)
+static const TableEntry ForwardST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r },
+  { X86::ADD_Fp64  , X86::ADD_FST0r },
+  { X86::ADD_Fp80  , X86::ADD_FST0r },
+  { X86::DIV_Fp32  , X86::DIV_FST0r },
+  { X86::DIV_Fp64  , X86::DIV_FST0r },
+  { X86::DIV_Fp80  , X86::DIV_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r },
+  { X86::MUL_Fp64  , X86::MUL_FST0r },
+  { X86::MUL_Fp80  , X86::MUL_FST0r },
+  { X86::SUB_Fp32  , X86::SUB_FST0r },
+  { X86::SUB_Fp64  , X86::SUB_FST0r },
+  { X86::SUB_Fp80  , X86::SUB_FST0r },
+};
+
+// ReverseST0Table - Map: A = B op C  into: ST(0) = ST(i) op ST(0)
+static const TableEntry ReverseST0Table[] = {
+  { X86::ADD_Fp32  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FST0r  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FST0r  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FST0r },
+  { X86::DIV_Fp64  , X86::DIVR_FST0r },
+  { X86::DIV_Fp80  , X86::DIVR_FST0r },
+  { X86::MUL_Fp32  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FST0r  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FST0r  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FST0r },
+  { X86::SUB_Fp64  , X86::SUBR_FST0r },
+  { X86::SUB_Fp80  , X86::SUBR_FST0r },
+};
+
+// ForwardSTiTable - Map: A = B op C  into: ST(i) = ST(0) op ST(i)
+static const TableEntry ForwardSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp64  , X86::ADD_FrST0  },   // commutative
+  { X86::ADD_Fp80  , X86::ADD_FrST0  },   // commutative
+  { X86::DIV_Fp32  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp64  , X86::DIVR_FrST0 },
+  { X86::DIV_Fp80  , X86::DIVR_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp64  , X86::MUL_FrST0  },   // commutative
+  { X86::MUL_Fp80  , X86::MUL_FrST0  },   // commutative
+  { X86::SUB_Fp32  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp64  , X86::SUBR_FrST0 },
+  { X86::SUB_Fp80  , X86::SUBR_FrST0 },
+};
+
+// ReverseSTiTable - Map: A = B op C  into: ST(i) = ST(i) op ST(0)
+static const TableEntry ReverseSTiTable[] = {
+  { X86::ADD_Fp32  , X86::ADD_FrST0 },
+  { X86::ADD_Fp64  , X86::ADD_FrST0 },
+  { X86::ADD_Fp80  , X86::ADD_FrST0 },
+  { X86::DIV_Fp32  , X86::DIV_FrST0 },
+  { X86::DIV_Fp64  , X86::DIV_FrST0 },
+  { X86::DIV_Fp80  , X86::DIV_FrST0 },
+  { X86::MUL_Fp32  , X86::MUL_FrST0 },
+  { X86::MUL_Fp64  , X86::MUL_FrST0 },
+  { X86::MUL_Fp80  , X86::MUL_FrST0 },
+  { X86::SUB_Fp32  , X86::SUB_FrST0 },
+  { X86::SUB_Fp64  , X86::SUB_FrST0 },
+  { X86::SUB_Fp80  , X86::SUB_FrST0 },
+};
+
+
+/// handleTwoArgFP - Handle instructions like FADD and friends which are virtual
+/// instructions which need to be simplified and possibly transformed.
+///
+/// Result: ST(0) = fsub  ST(0), ST(i)
+///         ST(i) = fsub  ST(0), ST(i)
+///         ST(0) = fsubr ST(0), ST(i)
+///         ST(i) = fsubr ST(0), ST(i)
+///
+void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  assert(NumOperands == 3 && "Illegal TwoArgFP instruction!");
+  unsigned Dest = getFPReg(MI->getOperand(0));
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+  DebugLoc dl = MI->getDebugLoc();
+
+  unsigned TOS = getStackEntry(0);
+
+  // One of our operands must be on the top of the stack.  If neither is yet, we
+  // need to move one.
+  if (Op0 != TOS && Op1 != TOS) {   // No operand at TOS?
+    // We can choose to move either operand to the top of the stack.  If one of
+    // the operands is killed by this instruction, we want that one so that we
+    // can update right on top of the old version.
+    if (KillsOp0) {
+      moveToTop(Op0, I);         // Move dead operand to TOS.
+      TOS = Op0;
+    } else if (KillsOp1) {
+      moveToTop(Op1, I);
+      TOS = Op1;
+    } else {
+      // All of the operands are live after this instruction executes, so we
+      // cannot update on top of any operand.  Because of this, we must
+      // duplicate one of the stack elements to the top.  It doesn't matter
+      // which one we pick.
+      //
+      duplicateToTop(Op0, Dest, I);
+      Op0 = TOS = Dest;
+      KillsOp0 = true;
+    }
+  } else if (!KillsOp0 && !KillsOp1) {
+    // If we DO have one of our operands at the top of the stack, but we don't
+    // have a dead operand, we must duplicate one of the operands to a new slot
+    // on the stack.
+    duplicateToTop(Op0, Dest, I);
+    Op0 = TOS = Dest;
+    KillsOp0 = true;
+  }
+
+  // Now we know that one of our operands is on the top of the stack, and at
+  // least one of our operands is killed by this instruction.
+  assert((TOS == Op0 || TOS == Op1) && (KillsOp0 || KillsOp1) &&
+         "Stack conditions not set up right!");
+
+  // We decide which form to use based on what is on the top of the stack, and
+  // which operand is killed by this instruction.
+  const TableEntry *InstTable;
+  unsigned InstTableSize;  // Entry count of the table actually selected.
+  bool isForward = TOS == Op0;
+  bool updateST0 = (TOS == Op0 && !KillsOp1) || (TOS == Op1 && !KillsOp0);
+  if (updateST0) {
+    if (isForward) {
+      InstTable = ForwardST0Table;
+      InstTableSize = array_lengthof(ForwardST0Table);
+    } else {
+      InstTable = ReverseST0Table;
+      InstTableSize = array_lengthof(ReverseST0Table);
+    }
+  } else {
+    if (isForward) {
+      InstTable = ForwardSTiTable;
+      InstTableSize = array_lengthof(ForwardSTiTable);
+    } else {
+      InstTable = ReverseSTiTable;
+      InstTableSize = array_lengthof(ReverseSTiTable);
+    }
+  }
+
+  // Use the size of the selected table, not unconditionally that of
+  // ForwardST0Table; this stays correct even if the tables ever diverge
+  // in length.
+  int Opcode = Lookup(InstTable, InstTableSize, MI->getOpcode());
+  assert(Opcode != -1 && "Unknown TwoArgFP pseudo instruction!");
+
+  // NotTOS - The register which is not on the top of stack...
+  unsigned NotTOS = (TOS == Op0) ? Op1 : Op0;
+
+  // Replace the old instruction with a new instruction
+  MBB->remove(I++);
+  I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+
+  // If both operands are killed, pop one off of the stack in addition to
+  // overwriting the other one.
+  if (KillsOp0 && KillsOp1 && Op0 != Op1) {
+    assert(!updateST0 && "Should have updated other operand!");
+    popStackAfter(I);   // Pop the top of stack
+  }
+
+  // Update stack information so that we know the destination register is now on
+  // the stack.
+  unsigned UpdatedSlot = getSlot(updateST0 ? TOS : NotTOS);
+  assert(UpdatedSlot < StackTop && Dest < 7);
+  Stack[UpdatedSlot]   = Dest;
+  RegMap[Dest]         = UpdatedSlot;
+  MBB->getParent()->DeleteMachineInstr(MI); // Remove the old instruction
+}
+
+/// handleCompareFP - Handle FUCOM and FUCOMI instructions, which have two FP
+/// register arguments and no explicit destinations.
+///
+void FPS::handleCompareFP(MachineBasicBlock::iterator &I) {
+  ASSERT_SORTED(ForwardST0Table); ASSERT_SORTED(ReverseST0Table);
+  ASSERT_SORTED(ForwardSTiTable); ASSERT_SORTED(ReverseSTiTable);
+  MachineInstr *MI = I;
+
+  unsigned NumOperands = MI->getDesc().getNumOperands();
+  assert(NumOperands == 2 && "Illegal FUCOM* instruction!");
+  unsigned Op0 = getFPReg(MI->getOperand(NumOperands-2));
+  unsigned Op1 = getFPReg(MI->getOperand(NumOperands-1));
+  bool KillsOp0 = MI->killsRegister(X86::FP0+Op0);
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+  // Make sure the first operand is on the top of stack, the other one can be
+  // anywhere.
+  moveToTop(Op0, I);
+
+  // Change from the pseudo instruction to the concrete instruction.
+  // Operand 0 is rewritten to name ST(i) for the second source; the first
+  // source becomes the implicit ST(0) operand and its explicit form is
+  // dropped.
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->RemoveOperand(1);
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+
+  // If any of the operands are killed by this instruction, free them.
+  if (KillsOp0) freeStackSlotAfter(I, Op0);
+  if (KillsOp1 && Op0 != Op1) freeStackSlotAfter(I, Op1);
+}
+
+/// handleCondMovFP - Handle two address conditional move instructions.  These
+/// instructions move a st(i) register to st(0) iff a condition is true.  These
+/// instructions require that the first operand is at the top of the stack, but
+/// otherwise don't modify the stack at all.
+void FPS::handleCondMovFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+
+  // Operand 0 is the destination (tied to operand 1); operand 2 is the FP
+  // source being conditionally moved in.
+  unsigned Op0 = getFPReg(MI->getOperand(0));
+  unsigned Op1 = getFPReg(MI->getOperand(2));
+  bool KillsOp1 = MI->killsRegister(X86::FP0+Op1);
+
+  // The first operand *must* be on the top of the stack.
+  moveToTop(Op0, I);
+
+  // Change the second operand to the stack register that the operand is in.
+  // Change from the pseudo instruction to the concrete instruction.
+  // After the two removals a single operand remains at index 0; it is
+  // rewritten to the concrete ST(i) register for the source.
+  MI->RemoveOperand(0);
+  MI->RemoveOperand(1);
+  MI->getOperand(0).setReg(getSTReg(Op1));
+  MI->setDesc(TII->get(getConcreteOpcode(MI->getOpcode())));
+  
+  // If we kill the second operand, make sure to pop it from the stack.
+  if (Op0 != Op1 && KillsOp1) {
+    // Get this value off of the register stack.
+    freeStackSlotAfter(I, Op1);
+  }
+}
+
+
+/// handleSpecialFP - Handle special instructions which behave unlike other
+/// floating point instructions.  This is primarily intended for use by pseudo
+/// instructions.  Unless a case returns early, the pseudo itself is erased
+/// from the block at the bottom of this function.
+///
+void FPS::handleSpecialFP(MachineBasicBlock::iterator &I) {
+  MachineInstr *MI = I;
+  DebugLoc dl = MI->getDebugLoc();
+  switch (MI->getOpcode()) {
+  default: llvm_unreachable("Unknown SpecialFP instruction!");
+  case X86::FpGET_ST0_32:// Appears immediately after a call returning FP type!
+  case X86::FpGET_ST0_64:// Appears immediately after a call returning FP type!
+  case X86::FpGET_ST0_80:// Appears immediately after a call returning FP type!
+    assert(StackTop == 0 && "Stack should be empty after a call!");
+    pushReg(getFPReg(MI->getOperand(0)));
+    break;
+  case X86::FpGET_ST1_32:// Appears immediately after a call returning FP type!
+  case X86::FpGET_ST1_64:// Appears immediately after a call returning FP type!
+  case X86::FpGET_ST1_80:{// Appears immediately after a call returning FP type!
+    // FpGET_ST1 should occur right after a FpGET_ST0 for a call or inline asm.
+    // The pattern we expect is:
+    //  CALL
+    //  FP1 = FpGET_ST0
+    //  FP4 = FpGET_ST1
+    //
+    // At this point, we've pushed FP1 on the top of stack, so it should be
+    // present if it isn't dead.  If it was dead, we already emitted a pop to
+    // remove it from the stack and StackTop = 0.
+    
+    // Push FP4 as top of stack next.
+    pushReg(getFPReg(MI->getOperand(0)));
+
+    // If StackTop was 0 before we pushed our operand, then ST(0) must have been
+    // dead.  In this case, the ST(1) value is the only thing that is live, so
+    // it should be on the TOS (after the pop that was emitted) and is.  Just
+    // continue in this case.
+    if (StackTop == 1)
+      break;
+    
+    // Because pushReg just pushed ST(1) as TOS, we now have to swap the two top
+    // elements so that our accounting is correct.
+    unsigned RegOnTop = getStackEntry(0);
+    unsigned RegNo = getStackEntry(1);
+    
+    // Swap the slots the regs are in.
+    std::swap(RegMap[RegNo], RegMap[RegOnTop]);
+    
+    // Swap stack slot contents.
+    assert(RegMap[RegOnTop] < StackTop);
+    std::swap(Stack[RegMap[RegOnTop]], Stack[StackTop-1]);
+    break;
+  }
+  case X86::FpSET_ST0_32:
+  case X86::FpSET_ST0_64:
+  case X86::FpSET_ST0_80: {
+    unsigned Op0 = getFPReg(MI->getOperand(0));
+
+    // FpSET_ST0_80 is generated by copyRegToReg for both function return
+    // and inline assembly with the "st" constraint. In the latter case,
+    // it is possible for ST(0) to be alive after this instruction.
+    if (!MI->killsRegister(X86::FP0 + Op0)) {
+      // Duplicate Op0
+      duplicateToTop(0, 7 /*temp register*/, I);
+    } else {
+      moveToTop(Op0, I);
+    }
+    --StackTop;   // "Forget" we have something on the top of stack!
+    break;
+  }
+  case X86::FpSET_ST1_32:
+  case X86::FpSET_ST1_64:
+  case X86::FpSET_ST1_80:
+    // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them.
+    if (StackTop == 1) {
+      BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+      NumFXCH++;
+      StackTop = 0;
+      break;
+    }
+    assert(StackTop == 2 && "Stack should have two element on it to return!");
+    --StackTop;   // "Forget" we have something on the top of stack!
+    break;
+  case X86::MOV_Fp3232:
+  case X86::MOV_Fp3264:
+  case X86::MOV_Fp6432:
+  case X86::MOV_Fp6464: 
+  case X86::MOV_Fp3280:
+  case X86::MOV_Fp6480:
+  case X86::MOV_Fp8032:
+  case X86::MOV_Fp8064: 
+  case X86::MOV_Fp8080: {
+    const MachineOperand &MO1 = MI->getOperand(1);
+    unsigned SrcReg = getFPReg(MO1);
+
+    const MachineOperand &MO0 = MI->getOperand(0);
+    // These can be created due to inline asm. Two address pass can introduce
+    // copies from RFP registers to virtual registers.
+    if (MO0.getReg() == X86::ST0 && SrcReg == 0) {
+      assert(MO1.isKill());
+      // Treat %ST0<def> = MOV_Fp8080 %FP0<kill>
+      // like  FpSET_ST0_80 %FP0<kill>, %ST0<imp-def>
+      assert((StackTop == 1 || StackTop == 2)
+             && "Stack should have one or two element on it to return!");
+      --StackTop;   // "Forget" we have something on the top of stack!
+      break;
+    } else if (MO0.getReg() == X86::ST1 && SrcReg == 1) {
+      assert(MO1.isKill());
+      // Treat %ST1<def> = MOV_Fp8080 %FP1<kill>
+      // like  FpSET_ST1_80 %FP0<kill>, %ST1<imp-def>
+      // StackTop can be 1 if a FpSET_ST0_* was before this. Exchange them.
+      if (StackTop == 1) {
+        BuildMI(*MBB, I, dl, TII->get(X86::XCH_F)).addReg(X86::ST1);
+        NumFXCH++;
+        StackTop = 0;
+        break;
+      }
+      assert(StackTop == 2 && "Stack should have two element on it to return!");
+      --StackTop;   // "Forget" we have something on the top of stack!
+      break;
+    }
+
+    unsigned DestReg = getFPReg(MO0);
+    if (MI->killsRegister(X86::FP0+SrcReg)) {
+      // If the input operand is killed, we can just change the owner of the
+      // incoming stack slot into the result.
+      unsigned Slot = getSlot(SrcReg);
+      assert(Slot < 7 && DestReg < 7 && "FpMOV operands invalid!");
+      Stack[Slot] = DestReg;
+      RegMap[DestReg] = Slot;
+
+    } else {
+      // For FMOV we just duplicate the specified value to a new stack slot.
+      // This could be made better, but would require substantial changes.
+      duplicateToTop(SrcReg, DestReg, I);
+    }
+    }
+    break;
+  case TargetOpcode::INLINEASM: {
+    // The inline asm MachineInstr currently only *uses* FP registers for the
+    // 'f' constraint.  These should be turned into the current ST(x) register
+    // in the machine instr.  Also, any kills should be explicitly popped after
+    // the inline asm.
+    // At most 7 FP registers can exist, so a fixed-size kill buffer suffices.
+    unsigned Kills[7];
+    unsigned NumKills = 0;
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      assert(Op.isUse() && "Only handle inline asm uses right now");
+      
+      unsigned FPReg = getFPReg(Op);
+      Op.setReg(getSTReg(FPReg));
+      
+      // If we kill this operand, make sure to pop it from the stack after the
+      // asm.  We just remember it for now, and pop them all off at the end in
+      // a batch.
+      if (Op.isKill())
+        Kills[NumKills++] = FPReg;
+    }
+
+    // If this asm kills any FP registers (is the last use of them) we must
+    // explicitly emit pop instructions for them.  Do this now after the asm has
+    // executed so that the ST(x) numbers are not off (which would happen if we
+    // did this inline with operand rewriting).
+    //
+    // Note: this might be a non-optimal pop sequence.  We might be able to do
+    // better by trying to pop in stack order or something.
+    MachineBasicBlock::iterator InsertPt = MI;
+    while (NumKills)
+      freeStackSlotAfter(InsertPt, Kills[--NumKills]);
+
+    // Don't delete the inline asm!
+    return;
+  }
+      
+  case X86::RET:
+  case X86::RETI:
+    // If RET has an FP register use operand, pass the first one in ST(0) and
+    // the second one in ST(1).
+    if (isStackEmpty()) return;  // Quick check to see if any are possible.
+    
+    // Find the register operands.
+    unsigned FirstFPRegOp = ~0U, SecondFPRegOp = ~0U;
+    
+    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+      MachineOperand &Op = MI->getOperand(i);
+      if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
+        continue;
+      // FP Register uses must be kills unless there are two uses of the same
+      // register, in which case only one will be a kill.
+      assert(Op.isUse() &&
+             (Op.isKill() ||                        // Marked kill.
+              getFPReg(Op) == FirstFPRegOp ||       // Second instance.
+              MI->killsRegister(Op.getReg())) &&    // Later use is marked kill.
+             "Ret only defs operands, and values aren't live beyond it");
+
+      if (FirstFPRegOp == ~0U)
+        FirstFPRegOp = getFPReg(Op);
+      else {
+        assert(SecondFPRegOp == ~0U && "More than two fp operands!");
+        SecondFPRegOp = getFPReg(Op);
+      }
+
+      // Remove the operand so that later passes don't see it.
+      MI->RemoveOperand(i);
+      --i, --e;
+    }
+    
+    // There are only four possibilities here:
+    // 1) we are returning a single FP value.  In this case, it has to be in
+    //    ST(0) already, so just declare success by removing the value from the
+    //    FP Stack.
+    if (SecondFPRegOp == ~0U) {
+      // Assert that the top of stack contains the right FP register.
+      assert(StackTop == 1 && FirstFPRegOp == getStackEntry(0) &&
+             "Top of stack not the right register for RET!");
+      
+      // Ok, everything is good, mark the value as not being on the stack
+      // anymore so that our assertion about the stack being empty at end of
+      // block doesn't fire.
+      StackTop = 0;
+      return;
+    }
+    
+    // Otherwise, we are returning two values:
+    // 2) If returning the same value for both, we only have one thing in the FP
+    //    stack.  Consider:  RET FP1, FP1
+    if (StackTop == 1) {
+      assert(FirstFPRegOp == SecondFPRegOp && FirstFPRegOp == getStackEntry(0)&&
+             "Stack misconfiguration for RET!");
+      
+      // Duplicate the TOS so that we return it twice.  Just pick some other FPx
+      // register to hold it.
+      unsigned NewReg = (FirstFPRegOp+1)%7;
+      duplicateToTop(FirstFPRegOp, NewReg, MI);
+      FirstFPRegOp = NewReg;
+    }
+    
+    /// Okay we know we have two different FPx operands now:
+    assert(StackTop == 2 && "Must have two values live!");
+    
+    /// 3) If SecondFPRegOp is currently in ST(0) and FirstFPRegOp is currently
+    ///    in ST(1).  In this case, emit an fxch.
+    if (getStackEntry(0) == SecondFPRegOp) {
+      assert(getStackEntry(1) == FirstFPRegOp && "Unknown regs live");
+      moveToTop(FirstFPRegOp, MI);
+    }
+    
+    /// 4) Finally, FirstFPRegOp must be in ST(0) and SecondFPRegOp must be in
+    /// ST(1).  Just remove both from our understanding of the stack and return.
+    assert(getStackEntry(0) == FirstFPRegOp && "Unknown regs live");
+    assert(getStackEntry(1) == SecondFPRegOp && "Unknown regs live");
+    StackTop = 0;
+    return;
+  }
+
+  I = MBB->erase(I);  // Remove the pseudo instruction
+  --I;
+}
diff --git a/lib/Target/X86/X86FloatingPointRegKill.cpp b/lib/Target/X86/X86FloatingPointRegKill.cpp
new file mode 100644
index 0000000..34a0045
--- /dev/null
+++ b/lib/Target/X86/X86FloatingPointRegKill.cpp
@@ -0,0 +1,140 @@
+//===-- X86FloatingPointRegKill.cpp - FP_REG_KILL inserter ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts FP_REG_KILL instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-codegen"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/Instructions.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumFPKill, "Number of FP_REG_KILL instructions added");
+
+namespace {
+  /// FPRegKiller - Machine function pass that inserts an FP_REG_KILL
+  /// instruction before the terminator of every basic block that defines
+  /// (or feeds, via successor PHIs) x87 floating point virtual registers.
+  struct FPRegKiller : public MachineFunctionPass {
+    static char ID;
+    FPRegKiller() : MachineFunctionPass(&ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // This pass only inserts instructions, so the CFG and the
+      // dominator/loop analyses remain valid.
+      AU.setPreservesCFG();
+      AU.addPreservedID(MachineLoopInfoID);
+      AU.addPreservedID(MachineDominatorsID);
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
+    virtual bool runOnMachineFunction(MachineFunction &MF);
+
+    virtual const char *getPassName() const { return "X86 FP_REG_KILL inserter"; }
+  };
+  char FPRegKiller::ID = 0;
+}
+
+/// createX87FPRegKillInserterPass - Public factory for the FP_REG_KILL
+/// insertion pass; the caller takes ownership of the returned pass.
+FunctionPass *llvm::createX87FPRegKillInserterPass() {
+  return new FPRegKiller();
+}
+
+/// runOnMachineFunction - Scan every basic block for definitions (or incoming
+/// PHI values in successors) of x87 FP values; insert an FP_REG_KILL before
+/// the terminator of each such block.  Returns true if any were inserted.
+bool FPRegKiller::runOnMachineFunction(MachineFunction &MF) {
+  // If we are emitting FP stack code, scan the basic block to determine if this
+  // block defines any FP values.  If so, put an FP_REG_KILL instruction before
+  // the terminator of the block.
+
+  // Note that FP stack instructions are used in all modes for long double,
+  // so we always need to do this check.
+  // Also note that it's possible for an FP stack register to be live across
+  // an instruction that produces multiple basic blocks (SSE CMOV) so we
+  // must check all the generated basic blocks.
+
+  // Scan all of the machine instructions in these MBBs, checking for FP
+  // stores.  (RFP32 and RFP64 will not exist in SSE mode, but RFP80 might.)
+
+  // Fast-path: If nothing is using the x87 registers, we don't need to do
+  // any scanning.
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  if (MRI.getRegClassVirtRegs(X86::RFP80RegisterClass).empty() &&
+      MRI.getRegClassVirtRegs(X86::RFP64RegisterClass).empty() &&
+      MRI.getRegClassVirtRegs(X86::RFP32RegisterClass).empty())
+    return false;
+
+  bool Changed = false;
+  const X86Subtarget &Subtarget = MF.getTarget().getSubtarget<X86Subtarget>();
+  MachineFunction::iterator MBBI = MF.begin();
+  MachineFunction::iterator EndMBB = MF.end();
+  for (; MBBI != EndMBB; ++MBBI) {
+    MachineBasicBlock *MBB = MBBI;
+    
+    // If this block returns, ignore it.  We don't want to insert an FP_REG_KILL
+    // before the return.
+    if (!MBB->empty()) {
+      MachineBasicBlock::iterator EndI = MBB->end();
+      --EndI;
+      if (EndI->getDesc().isReturn())
+        continue;
+    }
+    
+    // Look for any instruction that defines a virtual register of an
+    // RFP32/64/80 register class; stop at the first hit.
+    bool ContainsFPCode = false;
+    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+         !ContainsFPCode && I != E; ++I) {
+      if (I->getNumOperands() != 0 && I->getOperand(0).isReg()) {
+        const TargetRegisterClass *clas;
+        for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op) {
+          if (I->getOperand(op).isReg() && I->getOperand(op).isDef() &&
+            TargetRegisterInfo::isVirtualRegister(I->getOperand(op).getReg()) &&
+              ((clas = MRI.getRegClass(I->getOperand(op).getReg())) == 
+                 X86::RFP32RegisterClass ||
+               clas == X86::RFP64RegisterClass ||
+               clas == X86::RFP80RegisterClass)) {
+            ContainsFPCode = true;
+            break;
+          }
+        }
+      }
+    }
+    // Check PHI nodes in successor blocks.  These PHI's will be lowered to have
+    // a copy of the input value in this block.  In SSE mode, we only care about
+    // 80-bit values.
+    if (!ContainsFPCode) {
+      // Final check, check LLVM BB's that are successors to the LLVM BB
+      // corresponding to BB for FP PHI nodes.
+      const BasicBlock *LLVMBB = MBB->getBasicBlock();
+      const PHINode *PN;
+      for (succ_const_iterator SI = succ_begin(LLVMBB), E = succ_end(LLVMBB);
+           !ContainsFPCode && SI != E; ++SI) {
+        // PHI nodes are clustered at the top of each block, so stop at the
+        // first non-PHI instruction.
+        for (BasicBlock::const_iterator II = SI->begin();
+             (PN = dyn_cast<PHINode>(II)); ++II) {
+          // x87 is needed for x86_fp80 always, for float without SSE1, and
+          // for double without SSE2.
+          if (PN->getType()==Type::getX86_FP80Ty(LLVMBB->getContext()) ||
+              (!Subtarget.hasSSE1() && PN->getType()->isFloatingPoint()) ||
+              (!Subtarget.hasSSE2() &&
+                PN->getType()==Type::getDoubleTy(LLVMBB->getContext()))) {
+            ContainsFPCode = true;
+            break;
+          }
+        }
+      }
+    }
+    // Finally, if we found any FP code, emit the FP_REG_KILL instruction.
+    if (ContainsFPCode) {
+      BuildMI(*MBB, MBBI->getFirstTerminator(), DebugLoc::getUnknownLoc(),
+              MF.getTarget().getInstrInfo()->get(X86::FP_REG_KILL));
+      ++NumFPKill;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
new file mode 100644
index 0000000..e44ce421
--- /dev/null
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -0,0 +1,2163 @@
+//===- X86ISelDAGToDAG.cpp - A DAG pattern matching inst selector for X86 -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a DAG pattern matching instruction selector for X86,
+// converting from a legalized dag to a X86 dag.
+//
+//===----------------------------------------------------------------------===//
+
+// Force NDEBUG on in any optimized build on Darwin.
+//
+// FIXME: This is a huge hack, to work around ridiculously awful compile times
+// on this file with gcc-4.2 on Darwin, in Release mode.
+#if (!defined(__llvm__) && defined(__APPLE__) && \
+     defined(__OPTIMIZE__) && !defined(NDEBUG))
+#define NDEBUG
+#endif
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/GlobalValue.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
+
+//===----------------------------------------------------------------------===//
+//                      Pattern Matcher Implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+  /// X86ISelAddressMode - This corresponds to X86AddressMode, but uses
+  /// SDValue's instead of register numbers for the leaves of the matched
+  /// tree.  It accumulates the components of an x86 memory operand
+  /// (base + scale*index + disp + optional segment) as matching proceeds.
+  struct X86ISelAddressMode {
+    enum {
+      RegBase,        // Base is a (possibly null) virtual/physical register.
+      FrameIndexBase  // Base is a stack frame index.
+    } BaseType;
+
+    struct {            // This is really a union, discriminated by BaseType!
+      SDValue Reg;
+      int FrameIndex;
+    } Base;
+
+    unsigned Scale;     // Multiplier for IndexReg; hardware allows 1, 2, 4, 8.
+    SDValue IndexReg; 
+    int32_t Disp;       // Immediate displacement (32-bit even in 64-bit mode).
+    SDValue Segment;    // Segment register override, if any.
+    // At most one of the following symbolic displacements may be set;
+    // see hasSymbolicDisplacement().
+    GlobalValue *GV;
+    Constant *CP;
+    BlockAddress *BlockAddr;
+    const char *ES;     // External symbol name, or null.
+    int JT;             // Jump table index, or -1 if none.
+    unsigned Align;    // CP alignment.
+    unsigned char SymbolFlags;  // X86II::MO_*
+
+    // Default-construct to an empty mode: register base with no registers,
+    // scale 1, zero displacement, and no symbolic component.
+    X86ISelAddressMode()
+      : BaseType(RegBase), Scale(1), IndexReg(), Disp(0),
+        Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0),
+        SymbolFlags(X86II::MO_NO_FLAG) {
+    }
+
+    /// hasSymbolicDisplacement - Return true if any symbolic displacement
+    /// (global, constant pool, external symbol, jump table, or block
+    /// address) has already been matched into this mode.
+    bool hasSymbolicDisplacement() const {
+      return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
+    }
+    
+    /// hasBaseOrIndexReg - Return true if a base or index register has
+    /// already been matched.
+    bool hasBaseOrIndexReg() const {
+      return IndexReg.getNode() != 0 || Base.Reg.getNode() != 0;
+    }
+    
+    /// isRIPRelative - Return true if this addressing mode is already RIP
+    /// relative.
+    bool isRIPRelative() const {
+      if (BaseType != RegBase) return false;
+      if (RegisterSDNode *RegNode =
+            dyn_cast_or_null<RegisterSDNode>(Base.Reg.getNode()))
+        return RegNode->getReg() == X86::RIP;
+      return false;
+    }
+    
+    /// setBaseReg - Set the base to the given register, switching BaseType
+    /// back to RegBase if it was a frame index.
+    void setBaseReg(SDValue Reg) {
+      BaseType = RegBase;
+      Base.Reg = Reg;
+    }
+
+    /// dump - Debugging helper: print every component of the mode to dbgs().
+    void dump() {
+      dbgs() << "X86ISelAddressMode " << this << '\n';
+      dbgs() << "Base.Reg ";
+      if (Base.Reg.getNode() != 0)
+        Base.Reg.getNode()->dump(); 
+      else
+        dbgs() << "nul";
+      dbgs() << " Base.FrameIndex " << Base.FrameIndex << '\n'
+             << " Scale" << Scale << '\n'
+             << "IndexReg ";
+      if (IndexReg.getNode() != 0)
+        IndexReg.getNode()->dump();
+      else
+        dbgs() << "nul"; 
+      dbgs() << " Disp " << Disp << '\n'
+             << "GV ";
+      if (GV)
+        GV->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << " CP ";
+      if (CP)
+        CP->dump();
+      else
+        dbgs() << "nul";
+      dbgs() << '\n'
+             << "ES ";
+      if (ES)
+        dbgs() << ES;
+      else
+        dbgs() << "nul";
+      dbgs() << " JT" << JT << " Align" << Align << '\n';
+    }
+  };
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  /// ISel - X86 specific code to select X86 machine instructions for
+  /// SelectionDAG operations.
+  ///
+  class X86DAGToDAGISel : public SelectionDAGISel {
+    /// X86Lowering - This object fully describes how to lower LLVM code to an
+    /// X86-specific SelectionDAG.
+    X86TargetLowering &X86Lowering;
+
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+
+    /// OptForSize - If true, selector should try to optimize for code size
+    /// instead of performance.
+    bool OptForSize;
+
+  public:
+    explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
+      : SelectionDAGISel(tm, OptLevel),
+        X86Lowering(*tm.getTargetLowering()),
+        Subtarget(&tm.getSubtarget<X86Subtarget>()),
+        OptForSize(false) {}
+
+    virtual const char *getPassName() const {
+      return "X86 DAG->DAG Instruction Selection";
+    }
+
+    /// InstructionSelect - This callback is invoked by
+    /// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+    virtual void InstructionSelect();
+
+    virtual void EmitFunctionEntryCode(Function &Fn, MachineFunction &MF);
+
+    virtual
+      bool IsLegalAndProfitableToFold(SDNode *N, SDNode *U, SDNode *Root) const;
+
+// Include the pieces autogenerated from the target description.
+#include "X86GenDAGISel.inc"
+
+  private:
+    // Main entry point for selecting a single node, plus specialized
+    // selectors for 64-bit atomics on 32-bit targets.
+    SDNode *Select(SDNode *N);
+    SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
+    SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
+
+    // Addressing-mode matchers.  Each returns true on FAILURE (no match)
+    // and false on success, accumulating components into the AM argument.
+    bool MatchSegmentBaseAddress(SDValue N, X86ISelAddressMode &AM);
+    bool MatchLoad(SDValue N, X86ISelAddressMode &AM);
+    bool MatchWrapper(SDValue N, X86ISelAddressMode &AM);
+    bool MatchAddress(SDValue N, X86ISelAddressMode &AM);
+    bool MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
+                                 unsigned Depth);
+    bool MatchAddressBase(SDValue N, X86ISelAddressMode &AM);
+    // Complex-pattern selectors used by the generated matcher; these
+    // produce the final operand SDValues for a memory reference.
+    bool SelectAddr(SDNode *Op, SDValue N, SDValue &Base,
+                    SDValue &Scale, SDValue &Index, SDValue &Disp,
+                    SDValue &Segment);
+    bool SelectLEAAddr(SDNode *Op, SDValue N, SDValue &Base,
+                       SDValue &Scale, SDValue &Index, SDValue &Disp);
+    bool SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base,
+                       SDValue &Scale, SDValue &Index, SDValue &Disp);
+    bool SelectScalarSSELoad(SDNode *Op, SDValue Pred,
+                             SDValue N, SDValue &Base, SDValue &Scale,
+                             SDValue &Index, SDValue &Disp,
+                             SDValue &Segment,
+                             SDValue &InChain, SDValue &OutChain);
+    bool TryFoldLoad(SDNode *P, SDValue N,
+                     SDValue &Base, SDValue &Scale,
+                     SDValue &Index, SDValue &Disp,
+                     SDValue &Segment);
+    // DAG preprocessing passes run before selection; see their definitions.
+    void PreprocessForRMW();
+    void PreprocessForFPConvert();
+
+    /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+    /// inline asm expressions.
+    virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                              char ConstraintCode,
+                                              std::vector<SDValue> &OutOps);
+    
+    void EmitSpecialCodeForMain(MachineBasicBlock *BB, MachineFrameInfo *MFI);
+
+    /// getAddressOperands - Convert a matched X86ISelAddressMode into the
+    /// five operand SDValues (Base, Scale, Index, Disp, Segment) that X86
+    /// memory-operand patterns expect.
+    inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, 
+                                   SDValue &Scale, SDValue &Index,
+                                   SDValue &Disp, SDValue &Segment) {
+      Base  = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ?
+        CurDAG->getTargetFrameIndex(AM.Base.FrameIndex, TLI.getPointerTy()) :
+        AM.Base.Reg;
+      Scale = getI8Imm(AM.Scale);
+      Index = AM.IndexReg;
+      // These are 32-bit even in 64-bit mode since RIP relative offset
+      // is 32-bit.
+      if (AM.GV)
+        Disp = CurDAG->getTargetGlobalAddress(AM.GV, MVT::i32, AM.Disp,
+                                              AM.SymbolFlags);
+      else if (AM.CP)
+        Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
+                                             AM.Align, AM.Disp, AM.SymbolFlags);
+      else if (AM.ES)
+        Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
+      else if (AM.JT != -1)
+        Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
+      else if (AM.BlockAddr)
+        Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
+                                       true, AM.SymbolFlags);
+      else
+        Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
+
+      // Register 0 encodes "no segment override".
+      if (AM.Segment.getNode())
+        Segment = AM.Segment;
+      else
+        Segment = CurDAG->getRegister(0, MVT::i32);
+    }
+
+    /// getI8Imm - Return a target constant with the specified value, of type
+    /// i8.
+    inline SDValue getI8Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i8);
+    }
+
+    /// getI16Imm - Return a target constant with the specified value, of type
+    /// i16.
+    inline SDValue getI16Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i16);
+    }
+
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    /// getGlobalBaseReg - Return an SDNode that returns the value of
+    /// the global base register. Output instructions required to
+    /// initialize the global base register, if necessary.
+    ///
+    SDNode *getGlobalBaseReg();
+
+    /// getTargetMachine - Return a reference to the TargetMachine, casted
+    /// to the target-specific type.
+    const X86TargetMachine &getTargetMachine() {
+      return static_cast<const X86TargetMachine &>(TM);
+    }
+
+    /// getInstrInfo - Return a reference to the TargetInstrInfo, casted
+    /// to the target-specific type.
+    const X86InstrInfo *getInstrInfo() {
+      return getTargetMachine().getInstrInfo();
+    }
+
+#ifndef NDEBUG
+    // Indentation level for the debug trace printed during selection.
+    unsigned Indent;
+#endif
+  };
+}
+
+
+/// IsLegalAndProfitableToFold - Decide whether folding node N into its user
+/// U (with pattern root Root) is worthwhile.  Beyond the generic legality
+/// check, this rejects folds into arithmetic root nodes when a cheaper
+/// alternative exists (8-bit immediate or TLS-address operand).
+bool X86DAGToDAGISel::IsLegalAndProfitableToFold(SDNode *N, SDNode *U,
+                                                 SDNode *Root) const {
+  // At -O0, never fold: keep selection simple and debuggable.
+  if (OptLevel == CodeGenOpt::None) return false;
+
+  if (U == Root)
+    switch (U->getOpcode()) {
+    default: break;
+    case X86ISD::ADD:
+    case X86ISD::SUB:
+    case X86ISD::AND:
+    case X86ISD::XOR:
+    case X86ISD::OR:
+    case ISD::ADD:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR: {
+      SDValue Op1 = U->getOperand(1);
+
+      // If the other operand is a 8-bit immediate we should fold the immediate
+      // instead. This reduces code size.
+      // e.g.
+      // movl 4(%esp), %eax
+      // addl $4, %eax
+      // vs.
+      // movl $4, %eax
+      // addl 4(%esp), %eax
+      // The former is 2 bytes shorter. In case where the increment is 1, then
+      // the saving can be 4 bytes (by using incl %eax).
+      if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(Op1))
+        if (Imm->getAPIntValue().isSignedIntN(8))
+          return false;
+
+      // If the other operand is a TLS address, we should fold it instead.
+      // This produces
+      // movl    %gs:0, %eax
+      // leal    i@NTPOFF(%eax), %eax
+      // instead of
+      // movl    $i@NTPOFF, %eax
+      // addl    %gs:0, %eax
+      // if the block also has an access to a second TLS address this will save
+      // a load.
+      // FIXME: This is probably also true for non TLS addresses.
+      if (Op1.getOpcode() == X86ISD::Wrapper) {
+        SDValue Val = Op1.getOperand(0);
+        if (Val.getOpcode() == ISD::TargetGlobalTLSAddress)
+          return false;
+      }
+    }
+    }
+
+  // Proceed to 'generic' cycle finder code
+  return SelectionDAGISel::IsLegalAndProfitableToFold(N, U, Root);
+}
+
+/// MoveBelowTokenFactor - Replace TokenFactor operand with load's chain operand
+/// and move load below the TokenFactor. Replace store's chain operand with
+/// load's chain result.  This rewires Load/TokenFactor/Store so that the
+/// store's chain goes through the load, enabling RMW pattern selection.
+static void MoveBelowTokenFactor(SelectionDAG *CurDAG, SDValue Load,
+                                 SDValue Store, SDValue TF) {
+  // Rebuild the TokenFactor operand list, substituting the load's own
+  // incoming chain for the load itself.
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0, e = TF.getNode()->getNumOperands(); i != e; ++i)
+    if (Load.getNode() == TF.getOperand(i).getNode())
+      Ops.push_back(Load.getOperand(0));
+    else
+      Ops.push_back(TF.getOperand(i));
+  SDValue NewTF = CurDAG->UpdateNodeOperands(TF, &Ops[0], Ops.size());
+  // Re-chain the load on top of the updated TokenFactor, and the store on
+  // top of the load's output chain (value 1).
+  SDValue NewLoad = CurDAG->UpdateNodeOperands(Load, NewTF,
+                                               Load.getOperand(1),
+                                               Load.getOperand(2));
+  CurDAG->UpdateNodeOperands(Store, NewLoad.getValue(1), Store.getOperand(1),
+                             Store.getOperand(2), Store.getOperand(3));
+}
+
+/// isRMWLoad - Return true if N is a load that's part of RMW sub-DAG.  The 
+/// chain produced by the load must only be used by the store's chain operand,
+/// otherwise this may produce a cycle in the DAG.
+/// 
+static bool isRMWLoad(SDValue N, SDValue Chain, SDValue Address,
+                      SDValue &Load) {
+  // Look through a single-use BIT_CONVERT wrapping the load.
+  if (N.getOpcode() == ISD::BIT_CONVERT) {
+    if (!N.hasOneUse())
+      return false;
+    N = N.getOperand(0);
+  }
+
+  // Only plain, non-volatile, unindexed loads qualify.
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(N);
+  if (!LD || LD->isVolatile())
+    return false;
+  if (LD->getAddressingMode() != ISD::UNINDEXED)
+    return false;
+
+  // Sign/zero-extending loads are rejected; any-extend is acceptable.
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType != ISD::NON_EXTLOAD && ExtType != ISD::EXTLOAD)
+    return false;
+
+  // The load's value must have one use, its chain result (value 1) must
+  // have exactly one use, it must load from the store's address, and its
+  // chain must feed the store's chain (via Chain) to avoid creating a cycle.
+  if (N.hasOneUse() &&
+      LD->hasNUsesOfValue(1, 1) &&
+      N.getOperand(1) == Address &&
+      LD->isOperandOf(Chain.getNode())) {
+    Load = N;
+    return true;
+  }
+  return false;
+}
+
+/// MoveBelowCallSeqStart - Replace CALLSEQ_START operand with load's chain
+/// operand and move load below the call's chain operand.  After this the
+/// call is chained on the load's output chain, so the callee-address load
+/// can be folded into the call instruction.
+static void MoveBelowCallSeqStart(SelectionDAG *CurDAG, SDValue Load,
+                                  SDValue Call, SDValue CallSeqStart) {
+  SmallVector<SDValue, 8> Ops;
+  SDValue Chain = CallSeqStart.getOperand(0);
+  if (Chain.getNode() == Load.getNode())
+    // CALLSEQ_START chains directly on the load: splice in the load's
+    // incoming chain instead.
+    Ops.push_back(Load.getOperand(0));
+  else {
+    // Otherwise the load must be one operand of an intervening TokenFactor;
+    // rebuild that TokenFactor with the load replaced by its own chain.
+    assert(Chain.getOpcode() == ISD::TokenFactor &&
+           "Unexpected CallSeqStart chain operand");
+    for (unsigned i = 0, e = Chain.getNumOperands(); i != e; ++i)
+      if (Chain.getOperand(i).getNode() == Load.getNode())
+        Ops.push_back(Load.getOperand(0));
+      else
+        Ops.push_back(Chain.getOperand(i));
+    SDValue NewChain =
+      CurDAG->getNode(ISD::TokenFactor, Load.getDebugLoc(),
+                      MVT::Other, &Ops[0], Ops.size());
+    Ops.clear();
+    Ops.push_back(NewChain);
+  }
+  // Rewrite CALLSEQ_START with the new chain, keeping its other operands.
+  for (unsigned i = 1, e = CallSeqStart.getNumOperands(); i != e; ++i)
+    Ops.push_back(CallSeqStart.getOperand(i));
+  CurDAG->UpdateNodeOperands(CallSeqStart, &Ops[0], Ops.size());
+  // Re-chain the load on the call's old chain operand...
+  CurDAG->UpdateNodeOperands(Load, Call.getOperand(0),
+                             Load.getOperand(1), Load.getOperand(2));
+  // ...and the call on the load's output chain (value 1).
+  Ops.clear();
+  Ops.push_back(SDValue(Load.getNode(), 1));
+  for (unsigned i = 1, e = Call.getNode()->getNumOperands(); i != e; ++i)
+    Ops.push_back(Call.getOperand(i));
+  CurDAG->UpdateNodeOperands(Call, &Ops[0], Ops.size());
+}
+
+/// isCalleeLoad - Return true if call address is a load and it can be
+/// moved below CALLSEQ_START and the chains leading up to the call.
+/// Return the CALLSEQ_START by reference as a second output.
+static bool isCalleeLoad(SDValue Callee, SDValue &Chain) {
+  // The callee must not be the chain itself and must be used only by the
+  // call; otherwise moving it could duplicate or reorder work.
+  if (Callee.getNode() == Chain.getNode() || !Callee.hasOneUse())
+    return false;
+  // Only plain, non-volatile, unindexed, non-extending loads qualify.
+  LoadSDNode *LD = dyn_cast<LoadSDNode>(Callee.getNode());
+  if (!LD ||
+      LD->isVolatile() ||
+      LD->getAddressingMode() != ISD::UNINDEXED ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return false;
+
+  // Now let's find the callseq_start.
+  while (Chain.getOpcode() != ISD::CALLSEQ_START) {
+    // Every link on the way must be single-use, or the rewiring in
+    // MoveBelowCallSeqStart would affect other users.
+    if (!Chain.hasOneUse())
+      return false;
+    Chain = Chain.getOperand(0);
+  }
+  
+  // The load's chain must feed the CALLSEQ_START either directly or via a
+  // TokenFactor whose only use of the load's chain result is here.
+  if (Chain.getOperand(0).getNode() == Callee.getNode())
+    return true;
+  if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
+      Callee.getValue(1).isOperandOf(Chain.getOperand(0).getNode()) &&
+      Callee.getValue(1).hasOneUse())
+    return true;
+  return false;
+}
+
+
+/// PreprocessForRMW - Preprocess the DAG to make instruction selection better.
+/// This is only run if not in -O0 mode.
+/// This allows the instruction selector to pick more read-modify-write
+/// instructions. This is a common case:
+///
+///     [Load chain]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///      /      \-
+///     /         |
+/// [TokenFactor] [Op]
+///     ^          ^
+///     |          |
+///      \        /
+///       \      /
+///       [Store]
+///
+/// The fact the store's chain operand != load's chain will prevent the
+/// (store (op (load))) instruction from being selected. We can transform it to:
+///
+///     [Load chain]
+///         ^
+///         |
+///    [TokenFactor]
+///         ^
+///         |
+///       [Load]
+///       ^    ^
+///       |    |
+///       |     \- 
+///       |       | 
+///       |     [Op]
+///       |       ^
+///       |       |
+///       \      /
+///        \    /
+///       [Store]
+// See the diagram comment above for the transformation this performs: the
+// load feeding a store's value operand is re-chained so that the
+// (store (op (load))) pattern becomes selectable as a single RMW
+// instruction.  Call-address loads get the analogous CALLSEQ_START
+// treatment.
+void X86DAGToDAGISel::PreprocessForRMW() {
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+         E = CurDAG->allnodes_end(); I != E; ++I) {
+    if (I->getOpcode() == X86ISD::CALL) {
+      /// Also try moving call address load from outside callseq_start to just
+      /// before the call to allow it to be folded.
+      ///
+      ///     [Load chain]
+      ///         ^
+      ///         |
+      ///       [Load]
+      ///       ^    ^
+      ///       |    |
+      ///      /      \--
+      ///     /          |
+      ///[CALLSEQ_START] |
+      ///     ^          |
+      ///     |          |
+      /// [LOAD/C2Reg]   |
+      ///     |          |
+      ///      \        /
+      ///       \      /
+      ///       [CALL]
+      SDValue Chain = I->getOperand(0);
+      SDValue Load  = I->getOperand(1);
+      if (!isCalleeLoad(Load, Chain))
+        continue;
+      MoveBelowCallSeqStart(CurDAG, Load, SDValue(I, 0), Chain);
+      ++NumLoadMoved;
+      continue;
+    }
+
+    // Only plain (non-truncating) stores whose chain is a TokenFactor are
+    // candidates for the RMW rewrite.
+    if (!ISD::isNON_TRUNCStore(I))
+      continue;
+    SDValue Chain = I->getOperand(0);
+
+    if (Chain.getNode()->getOpcode() != ISD::TokenFactor)
+      continue;
+
+    // N1 = stored value, N2 = store address.  Scalar FP stored values are
+    // skipped; the value must be single-use to be absorbed into the store.
+    SDValue N1 = I->getOperand(1);
+    SDValue N2 = I->getOperand(2);
+    if ((N1.getValueType().isFloatingPoint() &&
+         !N1.getValueType().isVector()) ||
+        !N1.hasOneUse())
+      continue;
+
+    bool RModW = false;
+    SDValue Load;
+    unsigned Opcode = N1.getNode()->getOpcode();
+    switch (Opcode) {
+    // Commutative-style ops: the load may be either operand.
+    case ISD::ADD:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::ADDC:
+    case ISD::ADDE:
+    case ISD::VECTOR_SHUFFLE: {
+      SDValue N10 = N1.getOperand(0);
+      SDValue N11 = N1.getOperand(1);
+      RModW = isRMWLoad(N10, Chain, N2, Load);
+      if (!RModW)
+        RModW = isRMWLoad(N11, Chain, N2, Load);
+      break;
+    }
+    // Non-commutative ops: only the first operand may be the load.
+    case ISD::SUB:
+    case ISD::SHL:
+    case ISD::SRA:
+    case ISD::SRL:
+    case ISD::ROTL:
+    case ISD::ROTR:
+    case ISD::SUBC:
+    case ISD::SUBE:
+    case X86ISD::SHLD:
+    case X86ISD::SHRD: {
+      SDValue N10 = N1.getOperand(0);
+      RModW = isRMWLoad(N10, Chain, N2, Load);
+      break;
+    }
+    }
+
+    if (RModW) {
+      MoveBelowTokenFactor(CurDAG, Load, SDValue(I, 0), Chain);
+      ++NumLoadMoved;
+      // The rewiring must not have introduced a chain cycle.
+      checkForCycles(I);
+    }
+  }
+}
+
+
+/// PreprocessForFPConvert - Walk over the dag lowering fpround and fpextend
+/// nodes that target the FP stack to be store and load to the stack.  This is a
+/// gross hack.  We would like to simply mark these as being illegal, but when
+/// we do that, legalize produces these when it expands calls, then expands
+/// these in the same legalize pass.  We would like dag combine to be able to
+/// hack on these between the call expansion and the node legalization.  As such
+/// this pass basically does "really late" legalization of these inline with the
+/// X86 isel pass.
+// See the comment above: lower FP_ROUND/FP_EXTEND that cross the x87/SSE
+// boundary into an explicit truncstore + extload through a stack slot.
+void X86DAGToDAGISel::PreprocessForFPConvert() {
+  for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
+       E = CurDAG->allnodes_end(); I != E; ) {
+    SDNode *N = I++;  // Preincrement iterator to avoid invalidation issues.
+    if (N->getOpcode() != ISD::FP_ROUND && N->getOpcode() != ISD::FP_EXTEND)
+      continue;
+    
+    // If the source and destination are SSE registers, then this is a legal
+    // conversion that should not be lowered.
+    EVT SrcVT = N->getOperand(0).getValueType();
+    EVT DstVT = N->getValueType(0);
+    bool SrcIsSSE = X86Lowering.isScalarFPTypeInSSEReg(SrcVT);
+    bool DstIsSSE = X86Lowering.isScalarFPTypeInSSEReg(DstVT);
+    if (SrcIsSSE && DstIsSSE)
+      continue;
+
+    if (!SrcIsSSE && !DstIsSSE) {
+      // If this is an FPStack extension, it is a noop.
+      if (N->getOpcode() == ISD::FP_EXTEND)
+        continue;
+      // If this is a value-preserving FPStack truncation, it is a noop.
+      // (Operand 1 of FP_ROUND is the "trunc is value-preserving" flag.)
+      if (N->getConstantOperandVal(1))
+        continue;
+    }
+   
+    // Here we could have an FP stack truncation or an FPStack <-> SSE convert.
+    // FPStack has extload and truncstore.  SSE can fold direct loads into other
+    // operations.  Based on this, decide what we want to do.
+    EVT MemVT;
+    if (N->getOpcode() == ISD::FP_ROUND)
+      MemVT = DstVT;  // FP_ROUND must use DstVT, we can't do a 'trunc load'.
+    else
+      MemVT = SrcIsSSE ? SrcVT : DstVT;
+    
+    SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+    DebugLoc dl = N->getDebugLoc();
+    
+    // FIXME: optimize the case where the src/dest is a load or store?
+    SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl,
+                                          N->getOperand(0),
+                                          MemTmp, NULL, 0, MemVT);
+    SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
+                                        NULL, 0, MemVT);
+
+    // We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
+    // extload we created.  This will cause general havok on the dag because
+    // anything below the conversion could be folded into other existing nodes.
+    // To avoid invalidating 'I', back it up to the convert node.
+    --I;
+    CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+    
+    // Now that we did that, the node is dead.  Increment the iterator to the
+    // next node to process, then delete N.
+    ++I;
+    CurDAG->DeleteNode(N);
+  }  
+}
+
+/// InstructionSelect - This callback is invoked by SelectionDAGISel
+/// when it has created a SelectionDAG for us to codegen.  It runs the
+/// preprocessing passes, then drives pattern selection over the DAG.
+void X86DAGToDAGISel::InstructionSelect() {
+  // Honor the function-level optimize-for-size attribute.
+  const Function *F = MF->getFunction();
+  OptForSize = F->hasFnAttr(Attribute::OptimizeForSize);
+
+  if (OptLevel != CodeGenOpt::None)
+    PreprocessForRMW();
+
+  // FIXME: This should only happen when not compiled with -O0.
+  PreprocessForFPConvert();
+
+  // Codegen the basic block.
+#ifndef NDEBUG
+  DEBUG(dbgs() << "===== Instruction selection begins:\n");
+  Indent = 0;
+#endif
+  SelectRoot(*CurDAG);
+#ifndef NDEBUG
+  DEBUG(dbgs() << "===== Instruction selection ends:\n");
+#endif
+
+  // Selection can orphan nodes; sweep them before machine-code emission.
+  CurDAG->RemoveDeadNodes();
+}
+
+/// EmitSpecialCodeForMain - Emit any code that needs to be executed only in
+/// the main function.
+// On Cygwin/MinGW targets, 'main' must call the runtime's __main hook
+// before anything else; other targets need nothing here.
+void X86DAGToDAGISel::EmitSpecialCodeForMain(MachineBasicBlock *BB,
+                                             MachineFrameInfo *MFI) {
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  if (Subtarget->isTargetCygMing())
+    BuildMI(BB, DebugLoc::getUnknownLoc(),
+            TII->get(X86::CALLpcrel32)).addExternalSymbol("__main");
+}
+
+/// EmitFunctionEntryCode - Emit target-specific prologue code at the top
+/// of the function's entry block; only externally-linked 'main' needs any.
+void X86DAGToDAGISel::EmitFunctionEntryCode(Function &Fn, MachineFunction &MF) {
+  // If this is main, emit special code for main.
+  MachineBasicBlock *BB = MF.begin();
+  if (Fn.hasExternalLinkage() && Fn.getName() == "main")
+    EmitSpecialCodeForMain(BB, MF.getFrameInfo());
+}
+
+
+/// MatchSegmentBaseAddress - Fold a SegmentBaseAddress node's segment
+/// register into the addressing mode.  Returns false on success; true if
+/// the mode already has a (different) segment and cannot take this one.
+bool X86DAGToDAGISel::MatchSegmentBaseAddress(SDValue N,
+                                              X86ISelAddressMode &AM) {
+  assert(N.getOpcode() == X86ISD::SegmentBaseAddress);
+  SDValue Segment = N.getOperand(0);
+
+  if (AM.Segment.getNode() == 0) {
+    AM.Segment = Segment;
+    return false;
+  }
+
+  return true;
+}
+
+/// MatchLoad - Try to fold a load of a segment base address into the
+/// addressing mode.  Returns false on success, true on failure.
+bool X86DAGToDAGISel::MatchLoad(SDValue N, X86ISelAddressMode &AM) {
+  // This optimization is valid because the GNU TLS model defines that
+  // gs:0 (or fs:0 on X86-64) contains its own address.
+  // For more information see http://people.redhat.com/drepper/tls.pdf
+
+  SDValue Address = N.getOperand(1);
+  if (Address.getOpcode() == X86ISD::SegmentBaseAddress &&
+      !MatchSegmentBaseAddress (Address, AM))
+    return false;
+
+  return true;
+}
+
+/// MatchWrapper - Try to match X86ISD::Wrapper and X86ISD::WrapperRIP nodes
+/// into an addressing mode.  These wrap things that will resolve down into a
+/// symbol reference.  If no match is possible, this returns true, otherwise it
+/// returns false.
+// (See the header comment above.)  Matches Wrapper/WrapperRIP symbolic
+// operands (global, constant pool, external symbol, jump table, or block
+// address) into the addressing mode's displacement.
+bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
+  // If the addressing mode already has a symbol as the displacement, we can
+  // never match another symbol.
+  if (AM.hasSymbolicDisplacement())
+    return true;
+
+  SDValue N0 = N.getOperand(0);
+  CodeModel::Model M = TM.getCodeModel();
+
+  // Handle X86-64 rip-relative addresses.  We check this before checking direct
+  // folding because RIP is preferable to non-RIP accesses.
+  if (Subtarget->is64Bit() &&
+      // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+      // they cannot be folded into immediate fields.
+      // FIXME: This can be improved for kernel and other models?
+      (M == CodeModel::Small || M == CodeModel::Kernel) &&
+      // Base and index reg must be 0 in order to use %rip as base and lowering
+      // must allow RIP.
+      !AM.hasBaseOrIndexReg() && N.getOpcode() == X86ISD::WrapperRIP) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      // Combined offset must still fit the code model's 32-bit reach.
+      int64_t Offset = AM.Disp + G->getOffset();
+      if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
+      AM.GV = G->getGlobal();
+      AM.Disp = Offset;
+      AM.SymbolFlags = G->getTargetFlags();
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      int64_t Offset = AM.Disp + CP->getOffset();
+      if (!X86::isOffsetSuitableForCodeModel(Offset, M)) return true;
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.Disp = Offset;
+      AM.SymbolFlags = CP->getTargetFlags();
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else {
+      AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+      AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+    }
+
+    if (N.getOpcode() == X86ISD::WrapperRIP)
+      AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
+    return false;
+  }
+
+  // Handle the case when globals fit in our immediate field: This is true for
+  // X86-32 always and X86-64 when in -static -mcmodel=small mode.  In 64-bit
+  // mode, this results in a non-RIP-relative computation.
+  if (!Subtarget->is64Bit() ||
+      ((M == CodeModel::Small || M == CodeModel::Kernel) &&
+       TM.getRelocationModel() == Reloc::Static)) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(N0)) {
+      AM.GV = G->getGlobal();
+      AM.Disp += G->getOffset();
+      AM.SymbolFlags = G->getTargetFlags();
+    } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
+      AM.CP = CP->getConstVal();
+      AM.Align = CP->getAlignment();
+      AM.Disp += CP->getOffset();
+      AM.SymbolFlags = CP->getTargetFlags();
+    } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
+      AM.ES = S->getSymbol();
+      AM.SymbolFlags = S->getTargetFlags();
+    } else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
+      AM.JT = J->getIndex();
+      AM.SymbolFlags = J->getTargetFlags();
+    } else {
+      AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
+      AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
+    }
+    return false;
+  }
+
+  // No way to fold this symbol into the addressing mode.
+  return true;
+}
+
+/// MatchAddress - Add the specified node to the specified addressing mode,
+/// returning true if it cannot be done.  This just pattern matches for the
+/// addressing mode.
+// (See the header comment above.)  Top-level entry: runs the recursive
+// matcher, then applies two encoding-size post-processing tweaks.
+bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
+  if (MatchAddressRecursively(N, AM, 0))
+    return true;
+
+  // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has
+  // a smaller encoding and avoids a scaled-index.
+  if (AM.Scale == 2 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base.Reg.getNode() == 0) {
+    AM.Base.Reg = AM.IndexReg;
+    AM.Scale = 1;
+  }
+
+  // Post-processing: Convert foo to foo(%rip), even in non-PIC mode,
+  // because it has a smaller encoding.
+  // TODO: Which other code models can use this?
+  if (TM.getCodeModel() == CodeModel::Small &&
+      Subtarget->is64Bit() &&
+      AM.Scale == 1 &&
+      AM.BaseType == X86ISelAddressMode::RegBase &&
+      AM.Base.Reg.getNode() == 0 &&
+      AM.IndexReg.getNode() == 0 &&
+      AM.SymbolFlags == X86II::MO_NO_FLAG &&
+      AM.hasSymbolicDisplacement())
+    AM.Base.Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
+
+  return false;
+}
+
/// MatchAddressRecursively - Recursively walk the expression rooted at N,
/// folding as much of it as possible into the addressing mode AM.  Returns
/// true if N cannot be folded (the caller is expected to keep its own
/// backup of AM), false on success.  Depth bounds the recursion.
bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
                                              unsigned Depth) {
  bool is64Bit = Subtarget->is64Bit();
  DebugLoc dl = N.getDebugLoc();
  DEBUG({
      dbgs() << "MatchAddress: ";
      AM.dump();
    });
  // Limit recursion.
  if (Depth > 5)
    return MatchAddressBase(N, AM);

  CodeModel::Model M = TM.getCodeModel();

  // If this is already a %rip relative address, we can only merge immediates
  // into it.  Instead of handling this in every case, we handle it here.
  // RIP relative addressing: %rip + 32-bit displacement!
  if (AM.isRIPRelative()) {
    // FIXME: JumpTable and ExternalSymbol address currently don't like
    // displacements.  It isn't very important, but this should be fixed for
    // consistency.
    if (!AM.ES && AM.JT != -1) return true;

    // Only a constant may be merged into a RIP-relative address, and only if
    // the resulting displacement still fits the code model.
    if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N)) {
      int64_t Val = AM.Disp + Cst->getSExtValue();
      if (X86::isOffsetSuitableForCodeModel(Val, M,
                                            AM.hasSymbolicDisplacement())) {
        AM.Disp = Val;
        return false;
      }
    }
    return true;
  }

  switch (N.getOpcode()) {
  default: break;
  case ISD::Constant: {
    // Fold a constant into the displacement, if it fits the code model.
    uint64_t Val = cast<ConstantSDNode>(N)->getSExtValue();
    if (!is64Bit ||
        X86::isOffsetSuitableForCodeModel(AM.Disp + Val, M,
                                          AM.hasSymbolicDisplacement())) {
      AM.Disp += Val;
      return false;
    }
    break;
  }

  case X86ISD::SegmentBaseAddress:
    if (!MatchSegmentBaseAddress(N, AM))
      return false;
    break;

  case X86ISD::Wrapper:
  case X86ISD::WrapperRIP:
    if (!MatchWrapper(N, AM))
      return false;
    break;

  case ISD::LOAD:
    if (!MatchLoad(N, AM))
      return false;
    break;

  case ISD::FrameIndex:
    // A frame index can serve as the base if the base slot is still free.
    if (AM.BaseType == X86ISelAddressMode::RegBase
        && AM.Base.Reg.getNode() == 0) {
      AM.BaseType = X86ISelAddressMode::FrameIndexBase;
      AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
      return false;
    }
    break;

  case ISD::SHL:
    // A left shift by 1/2/3 becomes the scale field (x2/x4/x8).
    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
      break;
      
    if (ConstantSDNode
          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
      unsigned Val = CN->getZExtValue();
      // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
      // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
      // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
      if (Val == 1 || Val == 2 || Val == 3) {
        AM.Scale = 1 << Val;
        SDValue ShVal = N.getNode()->getOperand(0);

        // Okay, we know that we have a scale by now.  However, if the scaled
        // value is an add of something and a constant, we can fold the
        // constant into the disp field here.
        if (ShVal.getNode()->getOpcode() == ISD::ADD &&
            isa<ConstantSDNode>(ShVal.getNode()->getOperand(1))) {
          AM.IndexReg = ShVal.getNode()->getOperand(0);
          ConstantSDNode *AddVal =
            cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
          uint64_t Disp = AM.Disp + (AddVal->getSExtValue() << Val);
          if (!is64Bit ||
              X86::isOffsetSuitableForCodeModel(Disp, M,
                                                AM.hasSymbolicDisplacement()))
            AM.Disp = Disp;
          else
            AM.IndexReg = ShVal;
        } else {
          AM.IndexReg = ShVal;
        }
        return false;
      }
    // NOTE(review): this break sits inside the dyn_cast if, so when the
    // shift amount is NOT a constant, control falls through into the
    // SMUL_LOHI/MUL cases below.  That appears harmless today only because
    // the MUL transform requires a constant RHS of 3/5/9 -- confirm this
    // fallthrough is intended.
    break;
    }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    // A mul_lohi where we need the low part can be folded as a plain multiply.
    if (N.getResNo() != 0) break;
    // FALL THROUGH
  case ISD::MUL:
  case X86ISD::MUL_IMM:
    // X*[3,5,9] -> X+X*[2,4,8]
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        AM.Base.Reg.getNode() == 0 &&
        AM.IndexReg.getNode() == 0) {
      if (ConstantSDNode
            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
        if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
            CN->getZExtValue() == 9) {
          AM.Scale = unsigned(CN->getZExtValue())-1;

          SDValue MulVal = N.getNode()->getOperand(0);
          SDValue Reg;

          // Okay, we know that we have a scale by now.  However, if the scaled
          // value is an add of something and a constant, we can fold the
          // constant into the disp field here.
          if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
            Reg = MulVal.getNode()->getOperand(0);
            ConstantSDNode *AddVal =
              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
            uint64_t Disp = AM.Disp + AddVal->getSExtValue() *
                                      CN->getZExtValue();
            if (!is64Bit ||
                X86::isOffsetSuitableForCodeModel(Disp, M,
                                                  AM.hasSymbolicDisplacement()))
              AM.Disp = Disp;
            else
              Reg = N.getNode()->getOperand(0);
          } else {
            Reg = N.getNode()->getOperand(0);
          }

          // Same register used as both base and index: base + index*(N-1).
          AM.IndexReg = AM.Base.Reg = Reg;
          return false;
        }
    }
    break;

  case ISD::SUB: {
    // Given A-B, if A can be completely folded into the address and
    // the index field with the index field unused, use -B as the index.
    // This is a win if a has multiple parts that can be folded into
    // the address. Also, this saves a mov if the base register has
    // other uses, since it avoids a two-address sub instruction, however
    // it costs an additional mov if the index register has other uses.

    // Test if the LHS of the sub can be folded.
    X86ISelAddressMode Backup = AM;
    if (MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
      AM = Backup;
      break;
    }
    // Test if the index field is free for use.
    if (AM.IndexReg.getNode() || AM.isRIPRelative()) {
      AM = Backup;
      break;
    }
    int Cost = 0;
    SDValue RHS = N.getNode()->getOperand(1);
    // If the RHS involves a register with multiple uses, this
    // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
    if (!RHS.getNode()->hasOneUse() ||
        RHS.getNode()->getOpcode() == ISD::CopyFromReg ||
        RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
        RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
        (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
         RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
      ++Cost;
    // If the base is a register with multiple uses, this
    // transformation may save a mov.
    if ((AM.BaseType == X86ISelAddressMode::RegBase &&
         AM.Base.Reg.getNode() &&
         !AM.Base.Reg.getNode()->hasOneUse()) ||
        AM.BaseType == X86ISelAddressMode::FrameIndexBase)
      --Cost;
    // If the folded LHS was interesting, this transformation saves
    // address arithmetic.  (The booleans below sum as 0/1 integers.)
    if ((AM.hasSymbolicDisplacement() && !Backup.hasSymbolicDisplacement()) +
        ((AM.Disp != 0) && (Backup.Disp == 0)) +
        (AM.Segment.getNode() && !Backup.Segment.getNode()) >= 2)
      --Cost;
    // If it doesn't look like it may be an overall win, don't do it.
    if (Cost >= 0) {
      AM = Backup;
      break;
    }

    // Ok, the transformation is legal and appears profitable. Go for it.
    SDValue Zero = CurDAG->getConstant(0, N.getValueType());
    SDValue Neg = CurDAG->getNode(ISD::SUB, dl, N.getValueType(), Zero, RHS);
    AM.IndexReg = Neg;
    AM.Scale = 1;

    // Insert the new nodes into the topological ordering.
    if (Zero.getNode()->getNodeId() == -1 ||
        Zero.getNode()->getNodeId() > N.getNode()->getNodeId()) {
      CurDAG->RepositionNode(N.getNode(), Zero.getNode());
      Zero.getNode()->setNodeId(N.getNode()->getNodeId());
    }
    if (Neg.getNode()->getNodeId() == -1 ||
        Neg.getNode()->getNodeId() > N.getNode()->getNodeId()) {
      CurDAG->RepositionNode(N.getNode(), Neg.getNode());
      Neg.getNode()->setNodeId(N.getNode()->getNodeId());
    }
    return false;
  }

  case ISD::ADD: {
    // Try folding both operands, in both orders; restore AM between tries.
    X86ISelAddressMode Backup = AM;
    if (!MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1) &&
        !MatchAddressRecursively(N.getNode()->getOperand(1), AM, Depth+1))
      return false;
    AM = Backup;
    if (!MatchAddressRecursively(N.getNode()->getOperand(1), AM, Depth+1) &&
        !MatchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1))
      return false;
    AM = Backup;

    // If we couldn't fold both operands into the address at the same time,
    // see if we can just put each operand into a register and fold at least
    // the add.
    if (AM.BaseType == X86ISelAddressMode::RegBase &&
        !AM.Base.Reg.getNode() &&
        !AM.IndexReg.getNode()) {
      AM.Base.Reg = N.getNode()->getOperand(0);
      AM.IndexReg = N.getNode()->getOperand(1);
      AM.Scale = 1;
      return false;
    }
    break;
  }

  case ISD::OR:
    // Handle "X | C" as "X + C" iff X is known to have C bits clear.
    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
      X86ISelAddressMode Backup = AM;
      uint64_t Offset = CN->getSExtValue();
      // Start with the LHS as an addr mode.
      if (!MatchAddressRecursively(N.getOperand(0), AM, Depth+1) &&
          // Address could not have picked a GV address for the displacement.
          AM.GV == NULL &&
          // On x86-64, the resultant disp must fit in 32-bits.
          (!is64Bit ||
           X86::isOffsetSuitableForCodeModel(AM.Disp + Offset, M,
                                             AM.hasSymbolicDisplacement())) &&
          // Check to see if the LHS & C is zero.
          CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
        AM.Disp += Offset;
        return false;
      }
      AM = Backup;
    }
    break;
      
  case ISD::AND: {
    // Perform some heroic transforms on an and of a constant-count shift
    // with a constant to enable use of the scaled offset field.

    SDValue Shift = N.getOperand(0);
    if (Shift.getNumOperands() != 2) break;

    // Scale must not be used already.
    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;

    SDValue X = Shift.getOperand(0);
    ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N.getOperand(1));
    ConstantSDNode *C1 = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
    if (!C1 || !C2) break;

    // Handle "(X >> (8-C1)) & C2" as "(X >> 8) & 0xff)" if safe. This
    // allows us to convert the shift and and into an h-register extract and
    // a scaled index.
    if (Shift.getOpcode() == ISD::SRL && Shift.hasOneUse()) {
      unsigned ScaleLog = 8 - C1->getZExtValue();
      if (ScaleLog > 0 && ScaleLog < 4 &&
          C2->getZExtValue() == (UINT64_C(0xff) << ScaleLog)) {
        SDValue Eight = CurDAG->getConstant(8, MVT::i8);
        SDValue Mask = CurDAG->getConstant(0xff, N.getValueType());
        SDValue Srl = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
                                      X, Eight);
        SDValue And = CurDAG->getNode(ISD::AND, dl, N.getValueType(),
                                      Srl, Mask);
        SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8);
        SDValue Shl = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
                                      And, ShlCount);

        // Insert the new nodes into the topological ordering.
        if (Eight.getNode()->getNodeId() == -1 ||
            Eight.getNode()->getNodeId() > X.getNode()->getNodeId()) {
          CurDAG->RepositionNode(X.getNode(), Eight.getNode());
          Eight.getNode()->setNodeId(X.getNode()->getNodeId());
        }
        if (Mask.getNode()->getNodeId() == -1 ||
            Mask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
          CurDAG->RepositionNode(X.getNode(), Mask.getNode());
          Mask.getNode()->setNodeId(X.getNode()->getNodeId());
        }
        if (Srl.getNode()->getNodeId() == -1 ||
            Srl.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
          CurDAG->RepositionNode(Shift.getNode(), Srl.getNode());
          Srl.getNode()->setNodeId(Shift.getNode()->getNodeId());
        }
        if (And.getNode()->getNodeId() == -1 ||
            And.getNode()->getNodeId() > N.getNode()->getNodeId()) {
          CurDAG->RepositionNode(N.getNode(), And.getNode());
          And.getNode()->setNodeId(N.getNode()->getNodeId());
        }
        // NOTE(review): ShlCount is compared against X's id but assigned
        // N's id -- looks asymmetric vs. the other blocks; confirm intended.
        if (ShlCount.getNode()->getNodeId() == -1 ||
            ShlCount.getNode()->getNodeId() > X.getNode()->getNodeId()) {
          CurDAG->RepositionNode(X.getNode(), ShlCount.getNode());
          ShlCount.getNode()->setNodeId(N.getNode()->getNodeId());
        }
        if (Shl.getNode()->getNodeId() == -1 ||
            Shl.getNode()->getNodeId() > N.getNode()->getNodeId()) {
          CurDAG->RepositionNode(N.getNode(), Shl.getNode());
          Shl.getNode()->setNodeId(N.getNode()->getNodeId());
        }
        CurDAG->ReplaceAllUsesWith(N, Shl);
        AM.IndexReg = And;
        AM.Scale = (1 << ScaleLog);
        return false;
      }
    }

    // Handle "(X << C1) & C2" as "(X & (C2>>C1)) << C1" if safe and if this
    // allows us to fold the shift into this addressing mode.
    if (Shift.getOpcode() != ISD::SHL) break;

    // Not likely to be profitable if either the AND or SHIFT node has more
    // than one use (unless all uses are for address computation). Besides,
    // isel mechanism requires their node ids to be reused.
    if (!N.hasOneUse() || !Shift.hasOneUse())
      break;
    
    // Verify that the shift amount is something we can fold.
    unsigned ShiftCst = C1->getZExtValue();
    if (ShiftCst != 1 && ShiftCst != 2 && ShiftCst != 3)
      break;
    
    // Get the new AND mask, this folds to a constant.
    SDValue NewANDMask = CurDAG->getNode(ISD::SRL, dl, N.getValueType(),
                                         SDValue(C2, 0), SDValue(C1, 0));
    SDValue NewAND = CurDAG->getNode(ISD::AND, dl, N.getValueType(), X, 
                                     NewANDMask);
    SDValue NewSHIFT = CurDAG->getNode(ISD::SHL, dl, N.getValueType(),
                                       NewAND, SDValue(C1, 0));

    // Insert the new nodes into the topological ordering.
    if (C1->getNodeId() > X.getNode()->getNodeId()) {
      CurDAG->RepositionNode(X.getNode(), C1);
      C1->setNodeId(X.getNode()->getNodeId());
    }
    if (NewANDMask.getNode()->getNodeId() == -1 ||
        NewANDMask.getNode()->getNodeId() > X.getNode()->getNodeId()) {
      CurDAG->RepositionNode(X.getNode(), NewANDMask.getNode());
      NewANDMask.getNode()->setNodeId(X.getNode()->getNodeId());
    }
    if (NewAND.getNode()->getNodeId() == -1 ||
        NewAND.getNode()->getNodeId() > Shift.getNode()->getNodeId()) {
      CurDAG->RepositionNode(Shift.getNode(), NewAND.getNode());
      NewAND.getNode()->setNodeId(Shift.getNode()->getNodeId());
    }
    if (NewSHIFT.getNode()->getNodeId() == -1 ||
        NewSHIFT.getNode()->getNodeId() > N.getNode()->getNodeId()) {
      CurDAG->RepositionNode(N.getNode(), NewSHIFT.getNode());
      NewSHIFT.getNode()->setNodeId(N.getNode()->getNodeId());
    }

    CurDAG->ReplaceAllUsesWith(N, NewSHIFT);
    
    AM.Scale = 1 << ShiftCst;
    AM.IndexReg = NewAND;
    return false;
  }
  }

  // Nothing special matched; fall back to using N itself as base or index.
  return MatchAddressBase(N, AM);
}
+
+/// MatchAddressBase - Helper for MatchAddress. Add the specified node to the
+/// specified addressing mode without any further recursion.
+bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
+  // Is the base register already occupied?
+  if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base.Reg.getNode()) {
+    // If so, check to see if the scale index register is set.
+    if (AM.IndexReg.getNode() == 0) {
+      AM.IndexReg = N;
+      AM.Scale = 1;
+      return false;
+    }
+
+    // Otherwise, we cannot select it.
+    return true;
+  }
+
+  // Default, generate it as a register.
+  AM.BaseType = X86ISelAddressMode::RegBase;
+  AM.Base.Reg = N;
+  return false;
+}
+
+/// SelectAddr - returns true if it is able pattern match an addressing mode.
+/// It returns the operands which make up the maximal addressing mode it can
+/// match by reference.
+bool X86DAGToDAGISel::SelectAddr(SDNode *Op, SDValue N, SDValue &Base,
+                                 SDValue &Scale, SDValue &Index,
+                                 SDValue &Disp, SDValue &Segment) {
+  X86ISelAddressMode AM;
+  if (MatchAddress(N, AM))
+    return false;
+
+  EVT VT = N.getValueType();
+  if (AM.BaseType == X86ISelAddressMode::RegBase) {
+    if (!AM.Base.Reg.getNode())
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  }
+
+  if (!AM.IndexReg.getNode())
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectScalarSSELoad - Match a scalar SSE load.  In particular, we want to
+/// match a load whose top elements are either undef or zeros.  The load flavor
+/// is derived from the type of N, which is either v4f32 or v2f64.
+bool X86DAGToDAGISel::SelectScalarSSELoad(SDNode *Op, SDValue Pred,
+                                          SDValue N, SDValue &Base,
+                                          SDValue &Scale, SDValue &Index,
+                                          SDValue &Disp, SDValue &Segment,
+                                          SDValue &InChain,
+                                          SDValue &OutChain) {
+  if (N.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    InChain = N.getOperand(0).getValue(1);
+    if (ISD::isNON_EXTLoad(InChain.getNode()) &&
+        InChain.getValue(0).hasOneUse() &&
+        N.hasOneUse() &&
+        IsLegalAndProfitableToFold(N.getNode(), Pred.getNode(), Op)) {
+      LoadSDNode *LD = cast<LoadSDNode>(InChain);
+      if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+        return false;
+      OutChain = LD->getChain();
+      return true;
+    }
+  }
+
+  // Also handle the case where we explicitly require zeros in the top
+  // elements.  This is a vector shuffle from the zero vector.
+  if (N.getOpcode() == X86ISD::VZEXT_MOVL && N.getNode()->hasOneUse() &&
+      // Check to see if the top elements are all zeros (or bitcast of zeros).
+      N.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && 
+      N.getOperand(0).getNode()->hasOneUse() &&
+      ISD::isNON_EXTLoad(N.getOperand(0).getOperand(0).getNode()) &&
+      N.getOperand(0).getOperand(0).hasOneUse()) {
+    // Okay, this is a zero extending load.  Fold it.
+    LoadSDNode *LD = cast<LoadSDNode>(N.getOperand(0).getOperand(0));
+    if (!SelectAddr(Op, LD->getBasePtr(), Base, Scale, Index, Disp, Segment))
+      return false;
+    OutChain = LD->getChain();
+    InChain = SDValue(LD, 1);
+    return true;
+  }
+  return false;
+}
+
+
+/// SelectLEAAddr - it calls SelectAddr and determines if the maximal addressing
+/// mode it matches can be cost effectively emitted as an LEA instruction.
+bool X86DAGToDAGISel::SelectLEAAddr(SDNode *Op, SDValue N,
+                                    SDValue &Base, SDValue &Scale,
+                                    SDValue &Index, SDValue &Disp) {
+  X86ISelAddressMode AM;
+
+  // Set AM.Segment to prevent MatchAddress from using one. LEA doesn't support
+  // segments.
+  SDValue Copy = AM.Segment;
+  SDValue T = CurDAG->getRegister(0, MVT::i32);
+  AM.Segment = T;
+  if (MatchAddress(N, AM))
+    return false;
+  assert (T == AM.Segment);
+  AM.Segment = Copy;
+
+  EVT VT = N.getValueType();
+  unsigned Complexity = 0;
+  if (AM.BaseType == X86ISelAddressMode::RegBase)
+    if (AM.Base.Reg.getNode())
+      Complexity = 1;
+    else
+      AM.Base.Reg = CurDAG->getRegister(0, VT);
+  else if (AM.BaseType == X86ISelAddressMode::FrameIndexBase)
+    Complexity = 4;
+
+  if (AM.IndexReg.getNode())
+    Complexity++;
+  else
+    AM.IndexReg = CurDAG->getRegister(0, VT);
+
+  // Don't match just leal(,%reg,2). It's cheaper to do addl %reg, %reg, or with
+  // a simple shift.
+  if (AM.Scale > 1)
+    Complexity++;
+
+  // FIXME: We are artificially lowering the criteria to turn ADD %reg, $GA
+  // to a LEA. This is determined with some expermentation but is by no means
+  // optimal (especially for code size consideration). LEA is nice because of
+  // its three-address nature. Tweak the cost function again when we can run
+  // convertToThreeAddress() at register allocation time.
+  if (AM.hasSymbolicDisplacement()) {
+    // For X86-64, we should always use lea to materialize RIP relative
+    // addresses.
+    if (Subtarget->is64Bit())
+      Complexity = 4;
+    else
+      Complexity += 2;
+  }
+
+  if (AM.Disp && (AM.Base.Reg.getNode() || AM.IndexReg.getNode()))
+    Complexity++;
+
+  // If it isn't worth using an LEA, reject it.
+  if (Complexity <= 2)
+    return false;
+  
+  SDValue Segment;
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+/// SelectTLSADDRAddr - This is only run on TargetGlobalTLSAddress nodes.
+bool X86DAGToDAGISel::SelectTLSADDRAddr(SDNode *Op, SDValue N, SDValue &Base,
+                                        SDValue &Scale, SDValue &Index,
+                                        SDValue &Disp) {
+  assert(Op->getOpcode() == X86ISD::TLSADDR);
+  assert(N.getOpcode() == ISD::TargetGlobalTLSAddress);
+  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+  
+  X86ISelAddressMode AM;
+  AM.GV = GA->getGlobal();
+  AM.Disp += GA->getOffset();
+  AM.Base.Reg = CurDAG->getRegister(0, N.getValueType());
+  AM.SymbolFlags = GA->getTargetFlags();
+
+  if (N.getValueType() == MVT::i32) {
+    AM.Scale = 1;
+    AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32);
+  } else {
+    AM.IndexReg = CurDAG->getRegister(0, MVT::i64);
+  }
+  
+  SDValue Segment;
+  getAddressOperands(AM, Base, Scale, Index, Disp, Segment);
+  return true;
+}
+
+
+bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N,
+                                  SDValue &Base, SDValue &Scale,
+                                  SDValue &Index, SDValue &Disp,
+                                  SDValue &Segment) {
+  if (ISD::isNON_EXTLoad(N.getNode()) &&
+      N.hasOneUse() &&
+      IsLegalAndProfitableToFold(N.getNode(), P, P))
+    return SelectAddr(P, N.getOperand(1), Base, Scale, Index, Disp, Segment);
+  return false;
+}
+
+/// getGlobalBaseReg - Return an SDNode that returns the value of
+/// the global base register. Output instructions required to
+/// initialize the global base register, if necessary.
+///
+SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
+  unsigned GlobalBaseReg = getInstrInfo()->getGlobalBaseReg(MF);
+  return CurDAG->getRegister(GlobalBaseReg, TLI.getPointerTy()).getNode();
+}
+
+static SDNode *FindCallStartFromCall(SDNode *Node) {
+  if (Node->getOpcode() == ISD::CALLSEQ_START) return Node;
+    assert(Node->getOperand(0).getValueType() == MVT::Other &&
+         "Node doesn't have a token chain argument!");
+  return FindCallStartFromCall(Node->getOperand(0).getNode());
+}
+
+SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
+  SDValue Chain = Node->getOperand(0);
+  SDValue In1 = Node->getOperand(1);
+  SDValue In2L = Node->getOperand(2);
+  SDValue In2H = Node->getOperand(3);
+  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+  if (!SelectAddr(In1.getNode(), In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
+    return NULL;
+  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
+  const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
+  SDNode *ResNode = CurDAG->getMachineNode(Opc, Node->getDebugLoc(),
+                                           MVT::i32, MVT::i32, MVT::Other, Ops,
+                                           array_lengthof(Ops));
+  cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp + 1);
+  return ResNode;
+}
+
/// SelectAtomicLoadAdd - Select an ATOMIC_LOAD_ADD whose result is unused
/// into a "lock add/sub/inc/dec" memory instruction.  Returns the new node,
/// or 0 if the pattern does not apply (result used, or address unmatchable).
SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
  // If the loaded value is used, the plain atomic instruction is needed.
  if (Node->hasAnyUseOfValue(0))
    return 0;

  // Optimize common patterns for __sync_add_and_fetch and
  // __sync_sub_and_fetch where the result is not used. This allows us
  // to use "lock" version of add, sub, inc, dec instructions.
  // FIXME: Do not use special instructions but instead add the "lock"
  // prefix to the target node somehow. The extra information will then be
  // transferred to machine instruction and it denotes the prefix.
  SDValue Chain = Node->getOperand(0);
  SDValue Ptr = Node->getOperand(1);
  SDValue Val = Node->getOperand(2);
  SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
  if (!SelectAddr(Ptr.getNode(), Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
    return 0;

  // Classify the addend: +1 -> inc, -1 -> dec, negative constant -> sub of
  // the negated value, (0 - x) -> sub of x, otherwise add.
  bool isInc = false, isDec = false, isSub = false, isCN = false;
  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
  if (CN) {
    isCN = true;
    int64_t CNVal = CN->getSExtValue();
    if (CNVal == 1)
      isInc = true;
    else if (CNVal == -1)
      isDec = true;
    else if (CNVal >= 0)
      Val = CurDAG->getTargetConstant(CNVal, NVT);
    else {
      isSub = true;
      Val = CurDAG->getTargetConstant(-CNVal, NVT);
    }
  } else if (Val.hasOneUse() &&
             Val.getOpcode() == ISD::SUB &&
             X86::isZeroNode(Val.getOperand(0))) {
    isSub = true;
    Val = Val.getOperand(1);
  }

  // Pick the "lock" opcode for the operation kind and value type.
  unsigned Opc = 0;
  switch (NVT.getSimpleVT().SimpleTy) {
  default: return 0;
  case MVT::i8:
    if (isInc)
      Opc = X86::LOCK_INC8m;
    else if (isDec)
      Opc = X86::LOCK_DEC8m;
    else if (isSub) {
      if (isCN)
        Opc = X86::LOCK_SUB8mi;
      else
        Opc = X86::LOCK_SUB8mr;
    } else {
      if (isCN)
        Opc = X86::LOCK_ADD8mi;
      else
        Opc = X86::LOCK_ADD8mr;
    }
    break;
  case MVT::i16:
    if (isInc)
      Opc = X86::LOCK_INC16m;
    else if (isDec)
      Opc = X86::LOCK_DEC16m;
    else if (isSub) {
      if (isCN) {
        if (Predicate_i16immSExt8(Val.getNode()))
          Opc = X86::LOCK_SUB16mi8;
        else
          Opc = X86::LOCK_SUB16mi;
      } else
        Opc = X86::LOCK_SUB16mr;
    } else {
      if (isCN) {
        if (Predicate_i16immSExt8(Val.getNode()))
          Opc = X86::LOCK_ADD16mi8;
        else
          Opc = X86::LOCK_ADD16mi;
      } else
        Opc = X86::LOCK_ADD16mr;
    }
    break;
  case MVT::i32:
    if (isInc)
      Opc = X86::LOCK_INC32m;
    else if (isDec)
      Opc = X86::LOCK_DEC32m;
    else if (isSub) {
      if (isCN) {
        if (Predicate_i32immSExt8(Val.getNode()))
          Opc = X86::LOCK_SUB32mi8;
        else
          Opc = X86::LOCK_SUB32mi;
      } else
        Opc = X86::LOCK_SUB32mr;
    } else {
      if (isCN) {
        if (Predicate_i32immSExt8(Val.getNode()))
          Opc = X86::LOCK_ADD32mi8;
        else
          Opc = X86::LOCK_ADD32mi;
      } else
        Opc = X86::LOCK_ADD32mr;
    }
    break;
  case MVT::i64:
    // NOTE(review): when isCN is set but the constant fits neither imm8 nor
    // imm32, Opc stays the *mr form while Val is a TargetConstant -- confirm
    // that a register operand is still produced correctly in that case.
    if (isInc)
      Opc = X86::LOCK_INC64m;
    else if (isDec)
      Opc = X86::LOCK_DEC64m;
    else if (isSub) {
      Opc = X86::LOCK_SUB64mr;
      if (isCN) {
        if (Predicate_i64immSExt8(Val.getNode()))
          Opc = X86::LOCK_SUB64mi8;
        else if (Predicate_i64immSExt32(Val.getNode()))
          Opc = X86::LOCK_SUB64mi32;
      }
    } else {
      Opc = X86::LOCK_ADD64mr;
      if (isCN) {
        if (Predicate_i64immSExt8(Val.getNode()))
          Opc = X86::LOCK_ADD64mi8;
        else if (Predicate_i64immSExt32(Val.getNode()))
          Opc = X86::LOCK_ADD64mi32;
      }
    }
    break;
  }

  // Build the machine node.  Result 0 (the old loaded value) is unused, so
  // it is satisfied with an IMPLICIT_DEF; result 1 is the output chain.
  DebugLoc dl = Node->getDebugLoc();
  SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
                                                 dl, NVT), 0);
  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
  MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
  if (isInc || isDec) {
    // inc/dec take no value operand.
    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
    SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0);
    cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
    SDValue RetVals[] = { Undef, Ret };
    return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
  } else {
    SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
    SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
    cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
    SDValue RetVals[] = { Undef, Ret };
    return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
  }
}
+
/// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
/// any uses which require the SF or OF bits to be accurate.
/// Returns true only when every user is a CopyToReg into EFLAGS whose flag
/// consumers are all unsigned/equality conditions; anything unrecognized is
/// conservatively treated as needing SF/OF (returns false).
static bool HasNoSignedComparisonUses(SDNode *N) {
  // Examine each user of the node.
  for (SDNode::use_iterator UI = N->use_begin(),
         UE = N->use_end(); UI != UE; ++UI) {
    // Only examine CopyToReg uses.
    if (UI->getOpcode() != ISD::CopyToReg)
      return false;
    // Only examine CopyToReg uses that copy to EFLAGS.
    if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() !=
          X86::EFLAGS)
      return false;
    // Examine each user of the CopyToReg use.
    for (SDNode::use_iterator FlagUI = UI->use_begin(),
           FlagUE = UI->use_end(); FlagUI != FlagUE; ++FlagUI) {
      // Only examine the Flag result.
      if (FlagUI.getUse().getResNo() != 1) continue;
      // Anything unusual: assume conservatively.
      if (!FlagUI->isMachineOpcode()) return false;
      // Examine the opcode of the user.
      switch (FlagUI->getMachineOpcode()) {
      // These comparisons don't treat the most significant bit specially.
      case X86::SETAr: case X86::SETAEr: case X86::SETBr: case X86::SETBEr:
      case X86::SETEr: case X86::SETNEr: case X86::SETPr: case X86::SETNPr:
      case X86::SETAm: case X86::SETAEm: case X86::SETBm: case X86::SETBEm:
      case X86::SETEm: case X86::SETNEm: case X86::SETPm: case X86::SETNPm:
      case X86::JA: case X86::JAE: case X86::JB: case X86::JBE:
      case X86::JE: case X86::JNE: case X86::JP: case X86::JNP:
      case X86::CMOVA16rr: case X86::CMOVA16rm:
      case X86::CMOVA32rr: case X86::CMOVA32rm:
      case X86::CMOVA64rr: case X86::CMOVA64rm:
      case X86::CMOVAE16rr: case X86::CMOVAE16rm:
      case X86::CMOVAE32rr: case X86::CMOVAE32rm:
      case X86::CMOVAE64rr: case X86::CMOVAE64rm:
      case X86::CMOVB16rr: case X86::CMOVB16rm:
      case X86::CMOVB32rr: case X86::CMOVB32rm:
      case X86::CMOVB64rr: case X86::CMOVB64rm:
      case X86::CMOVBE16rr: case X86::CMOVBE16rm:
      case X86::CMOVBE32rr: case X86::CMOVBE32rm:
      case X86::CMOVBE64rr: case X86::CMOVBE64rm:
      case X86::CMOVE16rr: case X86::CMOVE16rm:
      case X86::CMOVE32rr: case X86::CMOVE32rm:
      case X86::CMOVE64rr: case X86::CMOVE64rm:
      case X86::CMOVNE16rr: case X86::CMOVNE16rm:
      case X86::CMOVNE32rr: case X86::CMOVNE32rm:
      case X86::CMOVNE64rr: case X86::CMOVNE64rm:
      case X86::CMOVNP16rr: case X86::CMOVNP16rm:
      case X86::CMOVNP32rr: case X86::CMOVNP32rm:
      case X86::CMOVNP64rr: case X86::CMOVNP64rm:
      case X86::CMOVP16rr: case X86::CMOVP16rm:
      case X86::CMOVP32rr: case X86::CMOVP32rm:
      case X86::CMOVP64rr: case X86::CMOVP64rm:
        continue;
      // Anything else: assume conservatively.
      default: return false;
      }
    }
  }
  return true;
}
+
+/// Select - Main instruction-selection hook for a single DAG node.
+/// Handles the cases that need hand-written selection (the global base
+/// register, 64-bit atomic pseudos, widening multiply and divide that use
+/// implicit registers, and TEST narrowing for X86ISD::CMP); everything
+/// else falls through to the TableGen-generated matcher via SelectCode.
+/// Returns the replacement node, or NULL when the node was handled
+/// in place (or was already selected).
+SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
+  EVT NVT = Node->getValueType(0);
+  unsigned Opc, MOpc;
+  unsigned Opcode = Node->getOpcode();
+  DebugLoc dl = Node->getDebugLoc();
+  
+#ifndef NDEBUG
+  DEBUG({
+      dbgs() << std::string(Indent, ' ') << "Selecting: ";
+      Node->dump(CurDAG);
+      dbgs() << '\n';
+    });
+  // Indent only tracks nesting depth for the debug dumps in this function.
+  Indent += 2;
+#endif
+
+  if (Node->isMachineOpcode()) {
+#ifndef NDEBUG
+    DEBUG({
+        dbgs() << std::string(Indent-2, ' ') << "== ";
+        Node->dump(CurDAG);
+        dbgs() << '\n';
+      });
+    Indent -= 2;
+#endif
+    return NULL;   // Already selected.
+  }
+
+  switch (Opcode) {
+  default: break;
+  case X86ISD::GlobalBaseReg:
+    return getGlobalBaseReg();
+
+  // 64-bit atomic read-modify-write operations are selected to pseudo
+  // instructions that operate on a register pair.
+  case X86ISD::ATOMOR64_DAG:
+    return SelectAtomic64(Node, X86::ATOMOR6432);
+  case X86ISD::ATOMXOR64_DAG:
+    return SelectAtomic64(Node, X86::ATOMXOR6432);
+  case X86ISD::ATOMADD64_DAG:
+    return SelectAtomic64(Node, X86::ATOMADD6432);
+  case X86ISD::ATOMSUB64_DAG:
+    return SelectAtomic64(Node, X86::ATOMSUB6432);
+  case X86ISD::ATOMNAND64_DAG:
+    return SelectAtomic64(Node, X86::ATOMNAND6432);
+  case X86ISD::ATOMAND64_DAG:
+    return SelectAtomic64(Node, X86::ATOMAND6432);
+  case X86ISD::ATOMSWAP64_DAG:
+    return SelectAtomic64(Node, X86::ATOMSWAP6432);
+
+  case ISD::ATOMIC_LOAD_ADD: {
+    // Try the custom atomic-add selection; if it declines (returns null),
+    // fall through to the generated matcher.
+    SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
+    if (RetVal)
+      return RetVal;
+    break;
+  }
+
+  case ISD::SMUL_LOHI:
+  case ISD::UMUL_LOHI: {
+    // Widening multiply: x86 MUL/IMUL take one explicit operand and use
+    // the accumulator register pair implicitly, so select it by hand.
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = Opcode == ISD::SMUL_LOHI;
+    if (!isSigned) {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::MUL8r;  MOpc = X86::MUL8m;  break;
+      case MVT::i16: Opc = X86::MUL16r; MOpc = X86::MUL16m; break;
+      case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
+      case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
+      }
+    } else {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IMUL8r;  MOpc = X86::IMUL8m;  break;
+      case MVT::i16: Opc = X86::IMUL16r; MOpc = X86::IMUL16m; break;
+      case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
+      case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
+      }
+    }
+
+    // LoReg/HiReg are the implicit result registers for this width.
+    unsigned LoReg, HiReg;
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:  LoReg = X86::AL;  HiReg = X86::AH;  break;
+    case MVT::i16: LoReg = X86::AX;  HiReg = X86::DX;  break;
+    case MVT::i32: LoReg = X86::EAX; HiReg = X86::EDX; break;
+    case MVT::i64: LoReg = X86::RAX; HiReg = X86::RDX; break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    // Multiply is commutative, so if only N0 is a foldable load, swap the
+    // operands to put the load in the memory-operand position.
+    if (!foldedLoad) {
+      foldedLoad = TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+      if (foldedLoad)
+        std::swap(N0, N1);
+    }
+
+    // Feed the first operand into the implicit low register.
+    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
+                                            N0, SDValue()).getValue(1);
+
+    if (foldedLoad) {
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      SDNode *CNode =
+        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+                               array_lengthof(Ops));
+      InFlag = SDValue(CNode, 1);
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+    } else {
+      InFlag =
+        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+    }
+
+    // Copy the low half of the result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                LoReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 0), Result);
+#ifndef NDEBUG
+      DEBUG({
+          dbgs() << std::string(Indent-2, ' ') << "=> ";
+          Result.getNode()->dump(CurDAG);
+          dbgs() << '\n';
+        });
+#endif
+    }
+    // Copy the high half of the result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      SDValue Result;
+      if (HiReg == X86::AH && Subtarget->is64Bit()) {
+        // Prevent use of AH in a REX instruction by referencing AX instead.
+        // Shift it down 8 bits.
+        Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                        X86::AX, MVT::i16, InFlag);
+        InFlag = Result.getValue(2);
+        Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                                Result,
+                                   CurDAG->getTargetConstant(8, MVT::i8)), 0);
+        // Then truncate it down to i8.
+        Result = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT, dl,
+                                                MVT::i8, Result);
+      } else {
+        Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                        HiReg, NVT, InFlag);
+        InFlag = Result.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 1), Result);
+#ifndef NDEBUG
+      DEBUG({
+          dbgs() << std::string(Indent-2, ' ') << "=> ";
+          Result.getNode()->dump(CurDAG);
+          dbgs() << '\n';
+        });
+#endif
+    }
+
+#ifndef NDEBUG
+    Indent -= 2;
+#endif
+
+    return NULL;
+  }
+
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    // Combined divide+remainder: x86 DIV/IDIV produce the quotient in the
+    // low register and the remainder in the high register, so both results
+    // come from a single instruction.
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    bool isSigned = Opcode == ISD::SDIVREM;
+    if (!isSigned) {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
+      case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
+      case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
+      case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+      }
+    } else {
+      switch (NVT.getSimpleVT().SimpleTy) {
+      default: llvm_unreachable("Unsupported VT!");
+      case MVT::i8:  Opc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
+      case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+      case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+      case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+      }
+    }
+
+    // ClrOpcode/ClrReg zero the high half (unsigned case); SExtOpcode
+    // sign-extends the low half into the high half (signed case).
+    // For i8 there is no clear opcode; the i8 special case below zero-
+    // extends directly into AX instead.
+    unsigned LoReg, HiReg, ClrReg;
+    unsigned ClrOpcode, SExtOpcode;
+    switch (NVT.getSimpleVT().SimpleTy) {
+    default: llvm_unreachable("Unsupported VT!");
+    case MVT::i8:
+      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
+      ClrOpcode  = 0;
+      SExtOpcode = X86::CBW;
+      break;
+    case MVT::i16:
+      LoReg = X86::AX;  HiReg = X86::DX;
+      ClrOpcode  = X86::MOV16r0; ClrReg = X86::DX;
+      SExtOpcode = X86::CWD;
+      break;
+    case MVT::i32:
+      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
+      ClrOpcode  = X86::MOV32r0;
+      SExtOpcode = X86::CDQ;
+      break;
+    case MVT::i64:
+      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
+      ClrOpcode  = X86::MOV64r0;
+      SExtOpcode = X86::CQO;
+      break;
+    }
+
+    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+    bool foldedLoad = TryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
+    bool signBitIsZero = CurDAG->SignBitIsZero(N0);
+
+    SDValue InFlag;
+    if (NVT == MVT::i8 && (!isSigned || signBitIsZero)) {
+      // Special case for div8, just use a move with zero extension to AX to
+      // clear the upper 8 bits (AH).
+      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Move, Chain;
+      if (TryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX16rm8, dl, MVT::i16,
+                                         MVT::Other, Ops,
+                                         array_lengthof(Ops)), 0);
+        Chain = Move.getValue(1);
+        ReplaceUses(N0.getValue(1), Chain);
+      } else {
+        Move =
+          SDValue(CurDAG->getMachineNode(X86::MOVZX16rr8, dl, MVT::i16, N0),0);
+        Chain = CurDAG->getEntryNode();
+      }
+      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, Move, SDValue());
+      InFlag = Chain.getValue(1);
+    } else {
+      InFlag =
+        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
+                             LoReg, N0, SDValue()).getValue(1);
+      if (isSigned && !signBitIsZero) {
+        // Sign extend the low part into the high part.
+        InFlag =
+          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Flag, InFlag),0);
+      } else {
+        // Zero out the high part, effectively zero extending the input.
+        SDValue ClrNode =
+          SDValue(CurDAG->getMachineNode(ClrOpcode, dl, NVT), 0);
+        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
+                                      ClrNode, InFlag).getValue(1);
+      }
+    }
+
+    if (foldedLoad) {
+      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
+                        InFlag };
+      SDNode *CNode =
+        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Flag, Ops,
+                               array_lengthof(Ops));
+      InFlag = SDValue(CNode, 1);
+      // Update the chain.
+      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+    } else {
+      InFlag =
+        SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Flag, N1, InFlag), 0);
+    }
+
+    // Copy the division (low) result, if it is needed.
+    if (!SDValue(Node, 0).use_empty()) {
+      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                                LoReg, NVT, InFlag);
+      InFlag = Result.getValue(2);
+      ReplaceUses(SDValue(Node, 0), Result);
+#ifndef NDEBUG
+      DEBUG({
+          dbgs() << std::string(Indent-2, ' ') << "=> ";
+          Result.getNode()->dump(CurDAG);
+          dbgs() << '\n';
+        });
+#endif
+    }
+    // Copy the remainder (high) result, if it is needed.
+    if (!SDValue(Node, 1).use_empty()) {
+      SDValue Result;
+      if (HiReg == X86::AH && Subtarget->is64Bit()) {
+        // Prevent use of AH in a REX instruction by referencing AX instead.
+        // Shift it down 8 bits.
+        Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                        X86::AX, MVT::i16, InFlag);
+        InFlag = Result.getValue(2);
+        Result = SDValue(CurDAG->getMachineNode(X86::SHR16ri, dl, MVT::i16,
+                                      Result,
+                                      CurDAG->getTargetConstant(8, MVT::i8)),
+                         0);
+        // Then truncate it down to i8.
+        Result = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT, dl,
+                                                MVT::i8, Result);
+      } else {
+        Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
+                                        HiReg, NVT, InFlag);
+        InFlag = Result.getValue(2);
+      }
+      ReplaceUses(SDValue(Node, 1), Result);
+#ifndef NDEBUG
+      DEBUG({
+          dbgs() << std::string(Indent-2, ' ') << "=> ";
+          Result.getNode()->dump(CurDAG);
+          dbgs() << '\n';
+        });
+#endif
+    }
+
+#ifndef NDEBUG
+    Indent -= 2;
+#endif
+
+    return NULL;
+  }
+
+  case X86ISD::CMP: {
+    SDValue N0 = Node->getOperand(0);
+    SDValue N1 = Node->getOperand(1);
+
+    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
+    // use a smaller encoding.
+    if (N0.getNode()->getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+        N0.getValueType() != MVT::i8 &&
+        X86::isZeroNode(N1)) {
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+      if (!C) break;
+
+      // For example, convert "testl %eax, $8" to "testb %al, $8"
+      if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
+          (!(C->getZExtValue() & 0x80) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // On x86-32, only the ABCD registers have 8-bit subregisters.
+        if (!Subtarget->is64Bit()) {
+          TargetRegisterClass *TRC = 0;
+          switch (N0.getValueType().getSimpleVT().SimpleTy) {
+          case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+          case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+          default: llvm_unreachable("Unsupported TEST operand type!");
+          }
+          SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+          Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                               Reg.getValueType(), Reg, RC), 0);
+        }
+
+        // Extract the l-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb.
+        return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32, Subreg, Imm);
+      }
+
+      // For example, "testl %eax, $2048" to "testb %ah, $8".
+      if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           HasNoSignedComparisonUses(Node))) {
+        // Shift the immediate right by 8 bits.
+        SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
+                                                       MVT::i8);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Put the value in an ABCD register.
+        TargetRegisterClass *TRC = 0;
+        switch (N0.getValueType().getSimpleVT().SimpleTy) {
+        case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
+        case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
+        case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
+        default: llvm_unreachable("Unsupported TEST operand type!");
+        }
+        SDValue RC = CurDAG->getTargetConstant(TRC->getID(), MVT::i32);
+        Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
+                                             Reg.getValueType(), Reg, RC), 0);
+
+        // Extract the h-register.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_8BIT_HI, dl,
+                                                        MVT::i8, Reg);
+
+        // Emit a testb. No special NOREX tricks are needed since there's
+        // only one GPR operand!
+        return CurDAG->getMachineNode(X86::TEST8ri, dl, MVT::i32,
+                                      Subreg, ShiftedImm);
+      }
+
+      // For example, "testl %eax, $32776" to "testw %ax, $32776".
+      if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
+          N0.getValueType() != MVT::i16 &&
+          (!(C->getZExtValue() & 0x8000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i16);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 16-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_16BIT, dl,
+                                                        MVT::i16, Reg);
+
+        // Emit a testw.
+        return CurDAG->getMachineNode(X86::TEST16ri, dl, MVT::i32, Subreg, Imm);
+      }
+
+      // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
+      if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
+          N0.getValueType() == MVT::i64 &&
+          (!(C->getZExtValue() & 0x80000000) ||
+           HasNoSignedComparisonUses(Node))) {
+        SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), MVT::i32);
+        SDValue Reg = N0.getNode()->getOperand(0);
+
+        // Extract the 32-bit subregister.
+        SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::SUBREG_32BIT, dl,
+                                                        MVT::i32, Reg);
+
+        // Emit a testl.
+        return CurDAG->getMachineNode(X86::TEST32ri, dl, MVT::i32, Subreg, Imm);
+      }
+    }
+    break;
+  }
+  }
+
+  // No custom handling matched; defer to the auto-generated matcher.
+  SDNode *ResNode = SelectCode(Node);
+
+#ifndef NDEBUG
+  DEBUG({
+      dbgs() << std::string(Indent-2, ' ') << "=> ";
+      if (ResNode == NULL || ResNode == Node)
+        Node->dump(CurDAG);
+      else
+        ResNode->dump(CurDAG);
+      dbgs() << '\n';
+    });
+  Indent -= 2;
+#endif
+
+  return ResNode;
+}
+
+bool X86DAGToDAGISel::
+SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                             std::vector<SDValue> &OutOps) {
+  // Only the 'm' (memory) constraint is handled here; 'o' (offsetable),
+  // 'v' (not offsetable), and anything unrecognized report failure by
+  // returning true.
+  if (ConstraintCode != 'm')
+    return true;
+
+  // Decompose the operand into the five standard x86 address components.
+  SDValue Base, Scale, Index, Disp, Segment;
+  if (!SelectAddr(Op.getNode(), Op, Base, Scale, Index, Disp, Segment))
+    return true;
+
+  OutOps.push_back(Base);
+  OutOps.push_back(Scale);
+  OutOps.push_back(Index);
+  OutOps.push_back(Disp);
+  OutOps.push_back(Segment);
+  return false;
+}
+
+/// createX86ISelDag - Factory for the X86 DAG-to-DAG instruction selection
+/// pass.  The returned pass converts a legalized selection DAG into an
+/// X86-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createX86ISelDag(X86TargetMachine &TM,
+                                     llvm::CodeGenOpt::Level OptLevel) {
+  X86DAGToDAGISel *Selector = new X86DAGToDAGISel(TM, OptLevel);
+  return Selector;
+}
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
new file mode 100644
index 0000000..515bc84
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -0,0 +1,10273 @@
+//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-isel"
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86ISelLowering.h"
+#include "X86MCTargetExpr.h"
+#include "X86TargetMachine.h"
+#include "X86TargetObjectFile.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/VectorExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Counts how many calls this file lowers as tail calls (printed by -stats).
+STATISTIC(NumTailCalls, "Number of tail calls");
+
+// DisableMMX - Hidden command-line flag that disables all use of MMX.
+static cl::opt<bool>
+DisableMMX("disable-mmx", cl::Hidden, cl::desc("Disable use of MMX"));
+
+// Disable16Bit - 16-bit operations typically have a larger encoding than
+// corresponding 32-bit instructions, and 16-bit code is slow on some
+// processors. This is an experimental flag to disable 16-bit operations
+// (which forces them to be Legalized to 32-bit operations).
+static cl::opt<bool>
+Disable16Bit("disable-16bit", cl::Hidden,
+             cl::desc("Disable use of 16-bit instructions"));
+
+// Forward declarations.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                       SDValue V2);
+
+static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) {
+  // Pick the TargetLoweringObjectFile implementation that matches the
+  // object-file format of the subtarget being compiled for.
+  const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+  switch (Subtarget.TargetType) {
+  case X86Subtarget::isDarwin:
+    // Darwin uses Mach-O; pick the 32- or 64-bit flavor.
+    if (Subtarget.is64Bit())
+      return new X8664_MachoTargetObjectFile();
+    return new X8632_MachoTargetObjectFile();
+  case X86Subtarget::isELF:
+    return new TargetLoweringObjectFileELF();
+  case X86Subtarget::isMingw:
+  case X86Subtarget::isCygwin:
+  case X86Subtarget::isWindows:
+    // All Windows-style environments use COFF.
+    return new TargetLoweringObjectFileCOFF();
+  default:
+    llvm_unreachable("unknown subtarget type");
+  }
+}
+
+X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
+  : TargetLowering(TM, createTLOF(TM)) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  X86ScalarSSEf64 = Subtarget->hasSSE2();
+  X86ScalarSSEf32 = Subtarget->hasSSE1();
+  X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
+  RegInfo = TM.getRegisterInfo();
+  TD = getTargetData();
+
+  // Set up the TargetLowering object.
+
+  // X86 is weird, it always uses i8 for shift amounts and setcc results.
+  setShiftAmountType(MVT::i8);
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setSchedulingPreference(SchedulingForRegPressure);
+  setStackPointerRegisterToSaveRestore(X86StackPtr);
+
+  if (Subtarget->isTargetDarwin()) {
+    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
+    setUseUnderscoreSetJmp(false);
+    setUseUnderscoreLongJmp(false);
+  } else if (Subtarget->isTargetMingw()) {
+    // MS runtime is weird: it exports _setjmp, but longjmp!
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(false);
+  } else {
+    setUseUnderscoreSetJmp(true);
+    setUseUnderscoreLongJmp(true);
+  }
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i8, X86::GR8RegisterClass);
+  if (!Disable16Bit)
+    addRegisterClass(MVT::i16, X86::GR16RegisterClass);
+  addRegisterClass(MVT::i32, X86::GR32RegisterClass);
+  if (Subtarget->is64Bit())
+    addRegisterClass(MVT::i64, X86::GR64RegisterClass);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  // We don't accept any truncstore of integer registers.
+  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
+  if (!Disable16Bit)
+    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
+  if (!Disable16Bit)
+    setTruncStoreAction(MVT::i32, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
+  setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
+
+  // SETOEQ and SETUNE require checking two conditions.
+  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
+  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+
+  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+  // operation.
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
+  setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
+  } else if (!UseSoftFloat) {
+    if (X86ScalarSSEf64) {
+      // We have an impenetrably clever algorithm for ui64->double only.
+      setOperationAction(ISD::UINT_TO_FP   , MVT::i64  , Custom);
+    }
+    // We have an algorithm for SSE2, and we turn this into a 64-bit
+    // FILD for other targets.
+    setOperationAction(ISD::UINT_TO_FP   , MVT::i32  , Custom);
+  }
+
+  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
+  setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
+
+  if (!UseSoftFloat) {
+    // SSE has no i16 to fp conversion, only i32
+    if (X86ScalarSSEf32) {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+      // f32 and f64 cases are Legal, f80 case is not
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    } else {
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
+      setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
+    }
+  } else {
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
+    setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
+  }
+
+  // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
+  // are Legal, f80 is custom lowered.
+  setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
+  setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
+
+  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+  // this operation.
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
+
+  if (X86ScalarSSEf32) {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
+    // f32 and f64 cases are Legal, f80 case is not
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  } else {
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
+    setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
+  }
+
+  // Handle FP_TO_UINT by promoting the destination to a larger signed
+  // conversion.
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
+  setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
+
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
+    setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
+  } else if (!UseSoftFloat) {
+    if (X86ScalarSSEf32 && !Subtarget->hasSSE3())
+      // Expand FP_TO_UINT into a select.
+      // FIXME: We would like to use a Custom expander here eventually to do
+      // the optimal thing for SSE vs. the default expansion in the legalizer.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
+    else
+      // With SSE3 we can use fisttpll to convert to a signed i64; without
+      // SSE, we're stuck with a fistpll.
+      setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
+  }
+
+  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
+  if (!X86ScalarSSEf64) {
+    setOperationAction(ISD::BIT_CONVERT      , MVT::f32  , Expand);
+    setOperationAction(ISD::BIT_CONVERT      , MVT::i32  , Expand);
+  }
+
+  // Scalar integer divide and remainder are lowered to use operations that
+  // produce two results, to match the available instructions. This exposes
+  // the two-result form to trivial CSE, which is able to combine x/y and x%y
+  // into a single instruction.
+  //
+  // Scalar integer multiply-high is also lowered to use two-result
+  // operations, to match the available instructions. However, plain multiply
+  // (low) operations are left as Legal, as there are single-result
+  // instructions for this in x86. Using the two-result multiply instructions
+  // when both high and low results are needed must be arranged by dagcombine.
+  setOperationAction(ISD::MULHS           , MVT::i8    , Expand);
+  setOperationAction(ISD::MULHU           , MVT::i8    , Expand);
+  setOperationAction(ISD::SDIV            , MVT::i8    , Expand);
+  setOperationAction(ISD::UDIV            , MVT::i8    , Expand);
+  setOperationAction(ISD::SREM            , MVT::i8    , Expand);
+  setOperationAction(ISD::UREM            , MVT::i8    , Expand);
+  setOperationAction(ISD::MULHS           , MVT::i16   , Expand);
+  setOperationAction(ISD::MULHU           , MVT::i16   , Expand);
+  setOperationAction(ISD::SDIV            , MVT::i16   , Expand);
+  setOperationAction(ISD::UDIV            , MVT::i16   , Expand);
+  setOperationAction(ISD::SREM            , MVT::i16   , Expand);
+  setOperationAction(ISD::UREM            , MVT::i16   , Expand);
+  setOperationAction(ISD::MULHS           , MVT::i32   , Expand);
+  setOperationAction(ISD::MULHU           , MVT::i32   , Expand);
+  setOperationAction(ISD::SDIV            , MVT::i32   , Expand);
+  setOperationAction(ISD::UDIV            , MVT::i32   , Expand);
+  setOperationAction(ISD::SREM            , MVT::i32   , Expand);
+  setOperationAction(ISD::UREM            , MVT::i32   , Expand);
+  setOperationAction(ISD::MULHS           , MVT::i64   , Expand);
+  setOperationAction(ISD::MULHU           , MVT::i64   , Expand);
+  setOperationAction(ISD::SDIV            , MVT::i64   , Expand);
+  setOperationAction(ISD::UDIV            , MVT::i64   , Expand);
+  setOperationAction(ISD::SREM            , MVT::i64   , Expand);
+  setOperationAction(ISD::UREM            , MVT::i64   , Expand);
+
+  setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
+  setOperationAction(ISD::BR_CC            , MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC        , MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
+  setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f32  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f64  , Expand);
+  setOperationAction(ISD::FREM             , MVT::f80  , Expand);
+  setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
+
+  setOperationAction(ISD::CTPOP            , MVT::i8   , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i8   , Custom);
+  setOperationAction(ISD::CTLZ             , MVT::i8   , Custom);
+  setOperationAction(ISD::CTPOP            , MVT::i16  , Expand);
+  if (Disable16Bit) {
+    setOperationAction(ISD::CTTZ           , MVT::i16  , Expand);
+    setOperationAction(ISD::CTLZ           , MVT::i16  , Expand);
+  } else {
+    setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
+  }
+  setOperationAction(ISD::CTPOP            , MVT::i32  , Expand);
+  setOperationAction(ISD::CTTZ             , MVT::i32  , Custom);
+  setOperationAction(ISD::CTLZ             , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::CTPOP          , MVT::i64  , Expand);
+    setOperationAction(ISD::CTTZ           , MVT::i64  , Custom);
+    setOperationAction(ISD::CTLZ           , MVT::i64  , Custom);
+  }
+
+  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
+  setOperationAction(ISD::BSWAP            , MVT::i16  , Expand);
+
+  // These should be promoted to a larger select which is supported.
+  setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
+  // X86 wants to expand cmov itself.
+  setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
+  if (Disable16Bit)
+    setOperationAction(ISD::SELECT        , MVT::i16  , Expand);
+  else
+    setOperationAction(ISD::SELECT        , MVT::i16  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
+  setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
+  if (Disable16Bit)
+    setOperationAction(ISD::SETCC         , MVT::i16  , Expand);
+  else
+    setOperationAction(ISD::SETCC         , MVT::i16  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
+  setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
+    setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
+  }
+  setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
+
+  // Darwin ABI issue.
+  setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
+  setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
+  setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
+  setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
+    setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
+    setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
+    setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
+    setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
+  }
+  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
+  setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
+  setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
+    setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
+  }
+
+  if (Subtarget->hasSSE1())
+    setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
+
+  if (!Subtarget->hasSSE2())
+    setOperationAction(ISD::MEMBARRIER    , MVT::Other, Expand);
+
+  // Expand certain atomics
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i8, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i16, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i64, Custom);
+
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i8, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i16, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i32, Custom);
+  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+
+  if (!Subtarget->is64Bit()) {
+    setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i64, Custom);
+    setOperationAction(ISD::ATOMIC_SWAP, MVT::i64, Custom);
+  }
+
+  // FIXME - use subtarget debug flags
+  if (!Subtarget->isTargetDarwin() &&
+      !Subtarget->isTargetELF() &&
+      !Subtarget->isTargetCygMing()) {
+    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
+  }
+
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i64, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i64, Expand);
+  setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
+  setOperationAction(ISD::EHSELECTION,   MVT::i32, Expand);
+  if (Subtarget->is64Bit()) {
+    setExceptionPointerRegister(X86::RAX);
+    setExceptionSelectorRegister(X86::RDX);
+  } else {
+    setExceptionPointerRegister(X86::EAX);
+    setExceptionSelectorRegister(X86::EDX);
+  }
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
+  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
+
+  setOperationAction(ISD::TRAMPOLINE, MVT::Other, Custom);
+
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+
+  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
+  setOperationAction(ISD::VASTART           , MVT::Other, Custom);
+  setOperationAction(ISD::VAEND             , MVT::Other, Expand);
+  if (Subtarget->is64Bit()) {
+    setOperationAction(ISD::VAARG           , MVT::Other, Custom);
+    setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
+  } else {
+    setOperationAction(ISD::VAARG           , MVT::Other, Expand);
+    setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
+  }
+
+  setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
+  if (Subtarget->is64Bit())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
+  if (Subtarget->isTargetCygMing())
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+  else
+    setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+
+  if (!UseSoftFloat && X86ScalarSSEf64) {
+    // f32 and f64 use SSE.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+    addRegisterClass(MVT::f64, X86::FR64RegisterClass);
+
+    // Use ANDPD to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f64, Custom);
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f64, Custom);
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    // Use ANDPD and ORPD to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN , MVT::f64, Expand);
+    setOperationAction(ISD::FCOS , MVT::f64, Expand);
+    setOperationAction(ISD::FSIN , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+    // Expand FP immediates into loads from the stack, except for the special
+    // cases we handle.
+    addLegalFPImmediate(APFloat(+0.0)); // xorpd
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+  } else if (!UseSoftFloat && X86ScalarSSEf32) {
+    // Use SSE for f32, x87 for f64.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f32, X86::FR32RegisterClass);
+    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+
+    // Use ANDPS to simulate FABS.
+    setOperationAction(ISD::FABS , MVT::f32, Custom);
+
+    // Use XORP to simulate FNEG.
+    setOperationAction(ISD::FNEG , MVT::f32, Custom);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+
+    // Use ANDPS and ORPS to simulate FCOPYSIGN.
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+    // We don't support sin/cos/fmod
+    setOperationAction(ISD::FSIN , MVT::f32, Expand);
+    setOperationAction(ISD::FCOS , MVT::f32, Expand);
+
+    // Special cases we handle for FP constants.
+    addLegalFPImmediate(APFloat(+0.0f)); // xorps
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+
+    if (!UnsafeFPMath) {
+      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
+      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
+    }
+  } else if (!UseSoftFloat) {
+    // f32 and f64 in x87.
+    // Set up the FP register classes.
+    addRegisterClass(MVT::f64, X86::RFP64RegisterClass);
+    addRegisterClass(MVT::f32, X86::RFP32RegisterClass);
+
+    setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
+    setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
+
+    if (!UnsafeFPMath) {
+      setOperationAction(ISD::FSIN           , MVT::f64  , Expand);
+      setOperationAction(ISD::FCOS           , MVT::f64  , Expand);
+    }
+    addLegalFPImmediate(APFloat(+0.0)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
+    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
+    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
+    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
+    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
+  }
+
+  // Long double always uses X87.
+  if (!UseSoftFloat) {
+    addRegisterClass(MVT::f80, X86::RFP80RegisterClass);
+    setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
+    {
+      bool ignored;
+      APFloat TmpFlt(+0.0);
+      TmpFlt.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+                     &ignored);
+      addLegalFPImmediate(TmpFlt);  // FLD0
+      TmpFlt.changeSign();
+      addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
+      APFloat TmpFlt2(+1.0);
+      TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
+                      &ignored);
+      addLegalFPImmediate(TmpFlt2);  // FLD1
+      TmpFlt2.changeSign();
+      addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
+    }
+
+    if (!UnsafeFPMath) {
+      setOperationAction(ISD::FSIN           , MVT::f80  , Expand);
+      setOperationAction(ISD::FCOS           , MVT::f80  , Expand);
+    }
+  }
+
+  // Always use a library call for pow.
+  setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
+  setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
+
+  setOperationAction(ISD::FLOG, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
+  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP, MVT::f80, Expand);
+  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
+
+  // First set operation action for all vector types to either promote
+  // (for widening) or expand (for scalarization). Then we will selectively
+  // turn on ones that can be effectively codegen'd.
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+    setOperationAction(ISD::ADD , (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SUB , (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FADD, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FNEG, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FSUB, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::MUL , (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FMUL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SDIV, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UDIV, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FDIV, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::LOAD, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::VECTOR_SHUFFLE, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT,(MVT::SimpleValueType)VT,Expand);
+    setOperationAction(ISD::EXTRACT_SUBVECTOR,(MVT::SimpleValueType)VT,Expand);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,(MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FABS, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FSIN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FCOS, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FPOWI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FSQRT, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SDIVREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UDIVREM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FPOW, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTPOP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTTZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::CTLZ, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SHL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SRA, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SRL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ROTL, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ROTR, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::VSETCC, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FLOG, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FLOG2, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FLOG10, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FEXP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FEXP2, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FP_TO_UINT, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FP_TO_SINT, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SINT_TO_FP, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,Expand);
+    setOperationAction(ISD::TRUNCATE,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SIGN_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ZERO_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ANY_EXTEND,  (MVT::SimpleValueType)VT, Expand);
+    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+      setTruncStoreAction((MVT::SimpleValueType)VT,
+                          (MVT::SimpleValueType)InnerVT, Expand);
+    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+  }
+
+  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
+  // with -msoft-float, disable use of MMX as well.
+  if (!UseSoftFloat && !DisableMMX && Subtarget->hasMMX()) {
+    addRegisterClass(MVT::v8i8,  X86::VR64RegisterClass);
+    addRegisterClass(MVT::v4i16, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v2i32, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v2f32, X86::VR64RegisterClass);
+    addRegisterClass(MVT::v1i64, X86::VR64RegisterClass);
+
+    setOperationAction(ISD::ADD,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::SUB,                MVT::v8i8,  Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i32, Legal);
+    setOperationAction(ISD::SUB,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::MULHS,              MVT::v4i16, Legal);
+    setOperationAction(ISD::MUL,                MVT::v4i16, Legal);
+
+    setOperationAction(ISD::AND,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::AND,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::AND,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::AND,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::OR,                 MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v4i16, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v2i32, Promote);
+    AddPromotedToType (ISD::OR,                 MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::OR,                 MVT::v1i64, Legal);
+
+    setOperationAction(ISD::XOR,                MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v4i16, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v2i32, Promote);
+    AddPromotedToType (ISD::XOR,                MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::XOR,                MVT::v1i64, Legal);
+
+    setOperationAction(ISD::LOAD,               MVT::v8i8,  Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v8i8,  MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v4i16, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v4i16, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v2i32, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v2i32, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v2f32, Promote);
+    AddPromotedToType (ISD::LOAD,               MVT::v2f32, MVT::v1i64);
+    setOperationAction(ISD::LOAD,               MVT::v1i64, Legal);
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i8,  Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i16, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f32, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v1i64, Custom);
+
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8i8,  Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v1i64, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2f32, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Custom);
+
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i16, Custom);
+
+    setOperationAction(ISD::SELECT,             MVT::v8i8, Promote);
+    setOperationAction(ISD::SELECT,             MVT::v4i16, Promote);
+    setOperationAction(ISD::SELECT,             MVT::v2i32, Promote);
+    setOperationAction(ISD::SELECT,             MVT::v1i64, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v8i8, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v4i16, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v2i32, Custom);
+  }
+
+  if (!UseSoftFloat && Subtarget->hasSSE1()) {
+    addRegisterClass(MVT::v4f32, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
+    setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v4f32, Custom);
+  }
+
+  if (!UseSoftFloat && Subtarget->hasSSE2()) {
+    addRegisterClass(MVT::v2f64, X86::VR128RegisterClass);
+
+    // FIXME: Unfortunately -soft-float and -no-implicit-float mean XMM
+    // registers cannot be used even for integer operations.
+    addRegisterClass(MVT::v16i8, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v8i16, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v4i32, X86::VR128RegisterClass);
+    addRegisterClass(MVT::v2i64, X86::VR128RegisterClass);
+
+    setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
+    setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
+    setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
+    setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
+    setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
+
+    setOperationAction(ISD::VSETCC,             MVT::v2f64, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v16i8, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v8i16, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v4i32, Custom);
+
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
+    setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2f64, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v2i64, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i8, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i16, Custom);
+    setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i32, Custom);
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; ++i) {
+      EVT VT = (MVT::SimpleValueType)i;
+      // Do not attempt to custom lower non-power-of-2 vectors
+      if (!isPowerOf2_32(VT.getVectorNumElements()))
+        continue;
+      // Do not attempt to custom lower non-128-bit vectors
+      if (!VT.is128BitVector())
+        continue;
+      setOperationAction(ISD::BUILD_VECTOR,
+                         VT.getSimpleVT().SimpleTy, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,
+                         VT.getSimpleVT().SimpleTy, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT,
+                         VT.getSimpleVT().SimpleTy, Custom);
+    }
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
+
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
+    }
+
+    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v2i64; i++) {
+      MVT::SimpleValueType SVT = (MVT::SimpleValueType)i;
+      EVT VT = SVT;
+
+      // Do not attempt to promote non-128-bit vectors
+      if (!VT.is128BitVector()) {
+        continue;
+      }
+      setOperationAction(ISD::AND,    SVT, Promote);
+      AddPromotedToType (ISD::AND,    SVT, MVT::v2i64);
+      setOperationAction(ISD::OR,     SVT, Promote);
+      AddPromotedToType (ISD::OR,     SVT, MVT::v2i64);
+      setOperationAction(ISD::XOR,    SVT, Promote);
+      AddPromotedToType (ISD::XOR,    SVT, MVT::v2i64);
+      setOperationAction(ISD::LOAD,   SVT, Promote);
+      AddPromotedToType (ISD::LOAD,   SVT, MVT::v2i64);
+      setOperationAction(ISD::SELECT, SVT, Promote);
+      AddPromotedToType (ISD::SELECT, SVT, MVT::v2i64);
+    }
+
+    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+
+    // Custom lower v2i64 and v2f64 selects.
+    setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
+    setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
+    setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
+
+    setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
+    setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
+    if (!DisableMMX && Subtarget->hasMMX()) {
+      setOperationAction(ISD::FP_TO_SINT,         MVT::v2i32, Custom);
+      setOperationAction(ISD::SINT_TO_FP,         MVT::v2i32, Custom);
+    }
+  }
+
+  if (Subtarget->hasSSE41()) {
+    // FIXME: Do we need to handle scalar-to-vector here?
+    setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
+
+    // i8 and i16 vectors are custom, because the source register and
+    // source memory operand types are not the same width.  f32 vectors are
+    // custom since the immediate controlling the insert encodes additional
+    // information.
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
+
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Legal);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Legal);
+    }
+  }
+
+  if (Subtarget->hasSSE42()) {
+    setOperationAction(ISD::VSETCC,             MVT::v2i64, Custom);
+  }
+
+  if (!UseSoftFloat && Subtarget->hasAVX()) {
+    addRegisterClass(MVT::v8f32, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4f64, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v8i32, X86::VR256RegisterClass);
+    addRegisterClass(MVT::v4i64, X86::VR256RegisterClass);
+
+    setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v8i32, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
+    setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
+    //setOperationAction(ISD::BUILD_VECTOR,       MVT::v8f32, Custom);
+    //setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v8f32, Custom);
+    //setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8f32, Custom);
+    //setOperationAction(ISD::SELECT,             MVT::v8f32, Custom);
+    //setOperationAction(ISD::VSETCC,             MVT::v8f32, Custom);
+
+    // Operations to consider commented out -v16i16 v32i8
+    //setOperationAction(ISD::ADD,                MVT::v16i16, Legal);
+    setOperationAction(ISD::ADD,                MVT::v8i32, Custom);
+    setOperationAction(ISD::ADD,                MVT::v4i64, Custom);
+    //setOperationAction(ISD::SUB,                MVT::v32i8, Legal);
+    //setOperationAction(ISD::SUB,                MVT::v16i16, Legal);
+    setOperationAction(ISD::SUB,                MVT::v8i32, Custom);
+    setOperationAction(ISD::SUB,                MVT::v4i64, Custom);
+    //setOperationAction(ISD::MUL,                MVT::v16i16, Legal);
+    setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
+    setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
+    setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
+
+    setOperationAction(ISD::VSETCC,             MVT::v4f64, Custom);
+    // setOperationAction(ISD::VSETCC,             MVT::v32i8, Custom);
+    // setOperationAction(ISD::VSETCC,             MVT::v16i16, Custom);
+    setOperationAction(ISD::VSETCC,             MVT::v8i32, Custom);
+
+    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v32i8, Custom);
+    // setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i16, Custom);
+    // setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i16, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i32, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8f32, Custom);
+
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f64, Custom);
+    setOperationAction(ISD::BUILD_VECTOR,       MVT::v4i64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f64, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4i64, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f64, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f64, Custom);
+
+#if 0
+    // Not sure we want to do this since there are no 256-bit integer
+    // operations in AVX
+
+    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
+    // This includes 256-bit vectors
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; ++i) {
+      EVT VT = (MVT::SimpleValueType)i;
+
+      // Do not attempt to custom lower non-power-of-2 vectors
+      if (!isPowerOf2_32(VT.getVectorNumElements()))
+        continue;
+
+      setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
+      setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+    }
+
+    if (Subtarget->is64Bit()) {
+      setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i64, Custom);
+      setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i64, Custom);
+    }
+#endif
+
+#if 0
+    // Not sure we want to do this since there are no 256-bit integer
+    // operations in AVX
+
+    // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
+    // Including 256-bit vectors
+    for (unsigned i = (unsigned)MVT::v16i8; i != (unsigned)MVT::v4i64; i++) {
+      EVT VT = (MVT::SimpleValueType)i;
+
+      if (!VT.is256BitVector()) {
+        continue;
+      }
+      setOperationAction(ISD::AND,    VT, Promote);
+      AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
+      setOperationAction(ISD::OR,     VT, Promote);
+      AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
+      setOperationAction(ISD::XOR,    VT, Promote);
+      AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
+      setOperationAction(ISD::LOAD,   VT, Promote);
+      AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
+      setOperationAction(ISD::SELECT, VT, Promote);
+      AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
+    }
+
+    setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+#endif
+  }
+
+  // We want to custom lower some of our intrinsics.
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
+  // Add/Sub/Mul with overflow operations are custom lowered.
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SADDO, MVT::i64, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i64, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i64, Custom);
+  setOperationAction(ISD::SMULO, MVT::i32, Custom);
+  setOperationAction(ISD::SMULO, MVT::i64, Custom);
+
+  if (!Subtarget->is64Bit()) {
+    // These libcalls are not available in 32-bit.
+    setLibcallName(RTLIB::SHL_I128, 0);
+    setLibcallName(RTLIB::SRL_I128, 0);
+    setLibcallName(RTLIB::SRA_I128, 0);
+  }
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+  setTargetDAGCombine(ISD::BUILD_VECTOR);
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::SHL);
+  setTargetDAGCombine(ISD::SRA);
+  setTargetDAGCombine(ISD::SRL);
+  setTargetDAGCombine(ISD::OR);
+  setTargetDAGCombine(ISD::STORE);
+  setTargetDAGCombine(ISD::MEMBARRIER);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  if (Subtarget->is64Bit())
+    setTargetDAGCombine(ISD::MUL);
+
+  computeRegisterProperties();
+
+  // Divide and remainder operations have no vector equivalent and can
+  // trap. Do a custom widening for these operations in which we never
+  // generate more divides/remainders than the original vector width.
+  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+    if (!isTypeLegal((MVT::SimpleValueType)VT)) {
+      setOperationAction(ISD::SDIV, (MVT::SimpleValueType) VT, Custom);
+      setOperationAction(ISD::UDIV, (MVT::SimpleValueType) VT, Custom);
+      setOperationAction(ISD::SREM, (MVT::SimpleValueType) VT, Custom);
+      setOperationAction(ISD::UREM, (MVT::SimpleValueType) VT, Custom);
+    }
+  }
+
+  // FIXME: These should be based on subtarget info. Plus, the values should
+  // be smaller when we are in optimizing for size mode.
+  maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+  maxStoresPerMemcpy = 16; // For @llvm.memcpy -> sequence of stores
+  maxStoresPerMemmove = 3; // For @llvm.memmove -> sequence of stores
+  setPrefLoopAlignment(16);
+  benefitFromCodePlacementOpt = true;
+}
+
+
+/// getSetCCResultType - Return the value type to use for ISD::SETCC results.
+/// Always returns MVT::i8 regardless of the compared type VT (presumably
+/// because x86 SETcc instructions produce a byte — confirm against ISA docs).
+MVT::SimpleValueType X86TargetLowering::getSetCCResultType(EVT VT) const {
+  return MVT::i8;
+}
+
+
+/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
+/// the desired ByVal argument alignment. Recursively walks Ty, raising
+/// MaxAlign to 16 when a 128-bit vector member is found anywhere inside
+/// the aggregate; 16 is the cap, so the walk stops as soon as it is reached.
+static void getMaxByValAlign(const Type *Ty, unsigned &MaxAlign) {
+  // Already at the cap; no member can raise the alignment further.
+  if (MaxAlign == 16)
+    return;
+  if (const VectorType *VTy = dyn_cast<VectorType>(Ty)) {
+    // A 128-bit vector member forces 16-byte alignment.
+    if (VTy->getBitWidth() == 128)
+      MaxAlign = 16;
+  } else if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+    // Arrays: the element type alone determines the alignment.
+    unsigned EltAlign = 0;
+    getMaxByValAlign(ATy->getElementType(), EltAlign);
+    if (EltAlign > MaxAlign)
+      MaxAlign = EltAlign;
+  } else if (const StructType *STy = dyn_cast<StructType>(Ty)) {
+    // Structs: take the max over all fields, stopping early at the cap.
+    for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+      unsigned EltAlign = 0;
+      getMaxByValAlign(STy->getElementType(i), EltAlign);
+      if (EltAlign > MaxAlign)
+        MaxAlign = EltAlign;
+      if (MaxAlign == 16)
+        break;
+    }
+  }
+  // NOTE(review): this trailing return is redundant and could be dropped.
+  return;
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. For X86, aggregates
+/// that contain SSE vectors are placed at 16-byte boundaries while the rest
+/// are at 4-byte boundaries.
+unsigned X86TargetLowering::getByValTypeAlignment(const Type *Ty) const {
+  if (Subtarget->is64Bit()) {
+    // Max of 8 and alignment of type.
+    unsigned TyAlign = TD->getABITypeAlignment(Ty);
+    if (TyAlign > 8)
+      return TyAlign;
+    return 8;
+  }
+
+  // 32-bit: 4-byte default; only scan for SSE vector members when SSE is
+  // available, since getMaxByValAlign can only raise this to 16.
+  unsigned Align = 4;
+  if (Subtarget->hasSSE1())
+    getMaxByValAlign(Ty, Align);
+  return Align;
+}
+
+/// getOptimalMemOpType - Returns the target specific optimal type for load
+/// and store operations as a result of memset, memcpy, and memmove
+/// lowering. It returns MVT::iAny if SelectionDAG should be responsible for
+/// determining it.
+EVT
+X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned Align,
+                                       bool isSrcConst, bool isSrcStr,
+                                       SelectionDAG &DAG) const {
+  // FIXME: This turns off use of xmm stores for memset/memcpy on targets like
+  // linux.  This is because the stack realignment code can't handle certain
+  // cases like PR2962.  This should be removed when PR2962 is fixed.
+  const Function *F = DAG.getMachineFunction().getFunction();
+  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
+  if (!NoImplicitFloatOps && Subtarget->getStackAlignment() >= 16) {
+    // Prefer 128-bit vector moves for constant/string sources of at least
+    // 16 bytes: integer vectors with SSE2, float vectors with only SSE1.
+    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE2() && Size >= 16)
+      return MVT::v4i32;
+    if ((isSrcConst || isSrcStr) && Subtarget->hasSSE1() && Size >= 16)
+      return MVT::v4f32;
+  }
+  // Otherwise fall back to the widest scalar GPR move available.
+  if (Subtarget->is64Bit() && Size >= 8)
+    return MVT::i64;
+  return MVT::i32;
+}
+
+/// getJumpTableEncoding - Return the entry encoding for a jump table in the
+/// current function.  The returned value is a member of the
+/// MachineJumpTableInfo::JTEntryKind enum.
+unsigned X86TargetLowering::getJumpTableEncoding() const {
+  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
+  // symbol.  The custom entry kind is lowered by LowerCustomJumpTableEntry
+  // below.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      Subtarget->isPICStyleGOT())
+    return MachineJumpTableInfo::EK_Custom32;
+  
+  // Otherwise, use the normal jump table encoding heuristics.
+  return TargetLowering::getJumpTableEncoding();
+}
+
+/// getPICBaseSymbol - Return the X86-32 PIC base.  The label is named
+/// "<private-prefix><function-number>$pb", so it is unique per function.
+MCSymbol *
+X86TargetLowering::getPICBaseSymbol(const MachineFunction *MF,
+                                    MCContext &Ctx) const {
+  const MCAsmInfo &MAI = *getTargetMachine().getMCAsmInfo();
+  return Ctx.GetOrCreateSymbol(Twine(MAI.getPrivateGlobalPrefix())+
+                               Twine(MF->getFunctionNumber())+"$pb");
+}
+
+
+/// LowerCustomJumpTableEntry - Produce the MCExpr for one EK_Custom32 jump
+/// table entry (see getJumpTableEncoding above): a @GOTOFF reference to the
+/// target basic block's symbol.  Only reached in GOT PIC mode.
+const MCExpr *
+X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                                             const MachineBasicBlock *MBB,
+                                             unsigned uid,MCContext &Ctx) const{
+  assert(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+         Subtarget->isPICStyleGOT());
+  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
+  // entries.
+  return X86MCTargetExpr::Create(MBB->getSymbol(Ctx),
+                                 X86MCTargetExpr::GOTOFF, Ctx);
+}
+
+/// getPICJumpTableRelocBase - Returns relocation base for the given PIC
+/// jumptable.
+SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+                                                    SelectionDAG &DAG) const {
+  // On X86-32 the jump table entries are relative to the global base
+  // register node rather than to the table itself.
+  if (!Subtarget->is64Bit())
+    // This doesn't have DebugLoc associated with it, but is not really the
+    // same as a Register.
+    return DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc::getUnknownLoc(),
+                       getPointerTy());
+  return Table;
+}
+
+/// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
+/// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
+/// MCExpr.
+const MCExpr *X86TargetLowering::
+getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
+                             MCContext &Ctx) const {
+  // X86-64 uses RIP relative addressing based on the jump table label.
+  if (Subtarget->isPICStyleRIPRel())
+    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
+
+  // Otherwise, the reference is relative to the PIC base.
+  return MCSymbolRefExpr::Create(getPICBaseSymbol(MF, Ctx), Ctx);
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function:
+/// 2^4 = 16 bytes normally, 2^0 = 1 byte when optimizing for size.
+unsigned X86TargetLowering::getFunctionAlignment(const Function *F) const {
+  return F->hasFnAttr(Attribute::OptimizeForSize) ? 0 : 4;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "X86GenCallingConv.inc"
+
+/// CanLowerReturn - Check whether every value in OutTys can be assigned a
+/// return location by the X86 return-value calling convention (RetCC_X86),
+/// i.e. whether LowerReturn below will succeed for this signature.
+bool 
+X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<EVT> &OutTys,
+                        const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
+                        SelectionDAG &DAG) {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  return CCInfo.CheckReturn(OutTys, ArgsFlags, RetCC_X86);
+}
+
+/// LowerReturn - Lower an outgoing return: assign each return value a
+/// location via RetCC_X86, copy the values into their registers (or push
+/// them as RET operands for ST0/ST1), and emit the X86ISD::RET_FLAG node.
+SDValue
+X86TargetLowering::LowerReturn(SDValue Chain,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               DebugLoc dl, SelectionDAG &DAG) {
+
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC_X86);
+
+  // Add the regs to the liveout set for the function.
+  MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+  for (unsigned i = 0; i != RVLocs.size(); ++i)
+    if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
+      MRI.addLiveOut(RVLocs[i].getLocReg());
+
+  // Glue value chaining the CopyToReg nodes to the RET; empty until the
+  // first copy is emitted.
+  SDValue Flag;
+
+  SmallVector<SDValue, 6> RetOps;
+  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  // Operand #1 = Bytes To Pop
+  RetOps.push_back(DAG.getTargetConstant(getBytesToPopOnReturn(), MVT::i16));
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue ValToCopy = Outs[i].Val;
+
+    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
+    // the RET instruction and handled by the FP Stackifier.
+    if (VA.getLocReg() == X86::ST0 ||
+        VA.getLocReg() == X86::ST1) {
+      // If this is a copy from an xmm register to ST(0), use an FPExtend to
+      // change the value to the FP stack register class.
+      if (isScalarFPTypeInSSEReg(VA.getValVT()))
+        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
+      RetOps.push_back(ValToCopy);
+      // Don't emit a copytoreg.
+      continue;
+    }
+
+    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
+    // which is returned in RAX / RDX.
+    if (Subtarget->is64Bit()) {
+      EVT ValVT = ValToCopy.getValueType();
+      if (ValVT.isVector() && ValVT.getSizeInBits() == 64) {
+        ValToCopy = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, ValToCopy);
+        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1)
+          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy);
+      }
+    }
+
+    // Chain + glue the copy so the register is live into the RET.
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
+    Flag = Chain.getValue(1);
+  }
+
+  // The x86-64 ABI for returning structs by value requires that we copy
+  // the sret argument into %rax for the return. We saved the argument into
+  // a virtual register in the entry block, so now we copy the value out
+  // and into %rax.
+  if (Subtarget->is64Bit() &&
+      DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+    if (!Reg) {
+      // Can happen if the sret virtual register was never created (see the
+      // matching logic in LowerFormalArguments).
+      Reg = MRI.createVirtualRegister(getRegClassFor(MVT::i64));
+      FuncInfo->setSRetReturnReg(Reg);
+    }
+    SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
+
+    Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
+    Flag = Chain.getValue(1);
+
+    // RAX now acts like a return value.
+    MRI.addLiveOut(X86::RAX);
+  }
+
+  RetOps[0] = Chain;  // Update chain.
+
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
+
+  return DAG.getNode(X86ISD::RET_FLAG, dl,
+                     MVT::Other, &RetOps[0], RetOps.size());
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+///
+SDValue
+X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                   CallingConv::ID CallConv, bool isVarArg,
+                                   const SmallVectorImpl<ISD::InputArg> &Ins,
+                                   DebugLoc dl, SelectionDAG &DAG,
+                                   SmallVectorImpl<SDValue> &InVals) {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  bool Is64Bit = Subtarget->is64Bit();
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    EVT CopyVT = VA.getValVT();
+
+    // If this is x86-64, and we disabled SSE, we can't return FP values
+    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
+        ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
+      llvm_report_error("SSE register return with SSE disabled");
+    }
+
+    // If this is a call to a function that returns an fp value on the floating
+    // point stack, but where we prefer to use the value in xmm registers, copy
+    // it out as F80 and use a truncate to move it from fp stack reg to xmm reg.
+    if ((VA.getLocReg() == X86::ST0 ||
+         VA.getLocReg() == X86::ST1) &&
+        isScalarFPTypeInSSEReg(VA.getValVT())) {
+      CopyVT = MVT::f80;
+    }
+
+    SDValue Val;
+    if (Is64Bit && CopyVT.isVector() && CopyVT.getSizeInBits() == 64) {
+      // For x86-64, MMX values are returned in XMM0 / XMM1 except for v1i64.
+      if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
+        // Returned in an XMM register: copy as v2i64 and extract lane 0.
+        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                                   MVT::v2i64, InFlag).getValue(1);
+        Val = Chain.getValue(0);
+        Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+                          Val, DAG.getConstant(0, MVT::i64));
+      } else {
+        // v1i64 comes back in a GPR; copy out as a plain i64.
+        Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                                   MVT::i64, InFlag).getValue(1);
+        Val = Chain.getValue(0);
+      }
+      Val = DAG.getNode(ISD::BIT_CONVERT, dl, CopyVT, Val);
+    } else {
+      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
+                                 CopyVT, InFlag).getValue(1);
+      Val = Chain.getValue(0);
+    }
+    // Thread the glue so successive CopyFromReg nodes stay ordered.
+    InFlag = Chain.getValue(2);
+
+    if (CopyVT != VA.getValVT()) {
+      // Round the F80 the right size, which also moves to the appropriate xmm
+      // register.
+      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
+                        // This truncation won't change the value.
+                        DAG.getIntPtrConstant(1));
+    }
+
+    InVals.push_back(Val);
+  }
+
+  return Chain;
+}
+
+
+//===----------------------------------------------------------------------===//
+//                C & StdCall & Fast Calling Convention implementation
+//===----------------------------------------------------------------------===//
+//  The StdCall calling convention is the standard used by much of the Windows
+//  API. It differs from the C calling convention in just one respect: the
+//  callee, not the caller, cleans up the stack. Symbols are also decorated in
+//  a particular way. It does not support any vector arguments.
+//  For info on the fast calling convention, see the Fast Calling Convention
+//  (tail call) implementation, LowerX86_32FastCCCallTo.
+
+/// CallIsStructReturn - Determines whether a call uses struct return
+/// semantics, i.e. whether its first outgoing argument carries the SRet flag.
+static bool CallIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
+  if (Outs.empty())
+    return false;
+
+  return Outs[0].Flags.isSRet();
+}
+
+/// ArgsAreStructReturn - Determines whether a function uses struct
+/// return semantics, i.e. whether its first incoming argument carries the
+/// SRet flag (the mirror of CallIsStructReturn for the callee side).
+static bool
+ArgsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
+  if (Ins.empty())
+    return false;
+
+  return Ins[0].Flags.isSRet();
+}
+
+/// IsCalleePop - Determines whether the callee is required to pop its
+/// own arguments. Callee pop is necessary to support tail calls.
+bool X86TargetLowering::IsCalleePop(bool IsVarArg, CallingConv::ID CallingConv){
+  // Variadic functions can never use callee-pop: the callee does not know
+  // how many bytes were pushed.
+  if (IsVarArg)
+    return false;
+
+  switch (CallingConv) {
+  default:
+    return false;
+  case CallingConv::X86_StdCall:
+    return !Subtarget->is64Bit();
+  case CallingConv::X86_FastCall:
+    return !Subtarget->is64Bit();
+  case CallingConv::Fast:
+    // fastcc only pops when tail-call optimization is guaranteed.
+    return GuaranteedTailCallOpt;
+  }
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for a the
+/// given CallingConvention value.
+CCAssignFn *X86TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
+  // 64-bit targets ignore CC and are distinguished only by Win64 vs SysV.
+  if (Subtarget->is64Bit()) {
+    if (Subtarget->isTargetWin64())
+      return CC_X86_Win64_C;
+    else
+      return CC_X86_64_C;
+  }
+
+  // 32-bit: fastcall and fastcc get dedicated tables; everything else uses
+  // the generic 32-bit C convention.
+  if (CC == CallingConv::X86_FastCall)
+    return CC_X86_32_FastCall;
+  else if (CC == CallingConv::Fast)
+    return CC_X86_32_FastCC;
+  else
+    return CC_X86_32_C;
+}
+
+/// NameDecorationForCallConv - Selects the appropriate decoration to
+/// apply to a MachineFunction containing a given calling convention.
+/// Only fastcall and stdcall carry a decoration; everything else is None.
+NameDecorationStyle
+X86TargetLowering::NameDecorationForCallConv(CallingConv::ID CallConv) {
+  if (CallConv == CallingConv::X86_FastCall)
+    return FastCall;
+  else if (CallConv == CallingConv::X86_StdCall)
+    return StdCall;
+  return None;
+}
+
+
+/// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
+/// by "Src" to address "Dst" with size and alignment information specified by
+/// the specific parameter attribute. The copy will be passed as a byval
+/// function parameter.  Returns the memcpy's output chain.
+static SDValue
+CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
+                          ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
+                          DebugLoc dl) {
+  SDValue SizeNode     = DAG.getConstant(Flags.getByValSize(), MVT::i32);
+  // AlwaysInline so the copy is never turned into a libcall to memcpy.
+  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
+                       /*AlwaysInline=*/true, NULL, 0, NULL, 0);
+}
+
+/// FuncIsMadeTailCallSafe - Return true if the function is being made into
+/// a tailcall target by changing its ABI.  True only for fastcc functions
+/// when -tailcallopt (GuaranteedTailCallOpt) is enabled.
+static bool FuncIsMadeTailCallSafe(CallingConv::ID CC) {
+  return GuaranteedTailCallOpt && CC == CallingConv::Fast;
+}
+
+/// LowerMemArgument - Lower the i'th incoming argument, which CC analysis
+/// assigned to a stack slot, into either a frame index (byval) or a load
+/// from a fixed stack object.
+SDValue
+X86TargetLowering::LowerMemArgument(SDValue Chain,
+                                    CallingConv::ID CallConv,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    MachineFrameInfo *MFI,
+                                    unsigned i) {
+  // Create the nodes corresponding to a load from this parameter slot.
+  ISD::ArgFlagsTy Flags = Ins[i].Flags;
+  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(CallConv);
+  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
+  EVT ValVT;
+
+  // If value is passed by pointer we have address passed instead of the value
+  // itself.
+  if (VA.getLocInfo() == CCValAssign::Indirect)
+    ValVT = VA.getLocVT();
+  else
+    ValVT = VA.getValVT();
+
+  // FIXME: For now, all byval parameter objects are marked mutable. This can be
+  // changed with more analysis.
+  // In case of tail call optimization mark all arguments mutable. Since they
+  // could be overwritten by lowering of arguments in case of a tail call.
+  if (Flags.isByVal()) {
+    // byval: the caller placed the aggregate itself on the stack, so the
+    // argument value is just the address of that fixed object.
+    int FI = MFI->CreateFixedObject(Flags.getByValSize(),
+                                    VA.getLocMemOffset(), isImmutable, false);
+    return DAG.getFrameIndex(FI, getPointerTy());
+  } else {
+    // Ordinary stack argument: create the fixed object and load from it.
+    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
+                                    VA.getLocMemOffset(), isImmutable, false);
+    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+    return DAG.getLoad(ValVT, dl, Chain, FIN,
+                       PseudoSourceValue::getFixedStack(FI), 0);
+  }
+}
+
+/// LowerFormalArguments - Lower the incoming arguments of the current
+/// function: assign locations via CCAssignFnForNode, copy register
+/// arguments out of their physregs, materialize stack arguments via
+/// LowerMemArgument, spill vararg registers to the register-save area
+/// (x86-64), and record callee-pop / sret bookkeeping.
+SDValue
+X86TargetLowering::LowerFormalArguments(SDValue Chain,
+                                        CallingConv::ID CallConv,
+                                        bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                        DebugLoc dl,
+                                        SelectionDAG &DAG,
+                                        SmallVectorImpl<SDValue> &InVals) {
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+
+  // Cygwin/MinGW "main" always gets a frame pointer.
+  const Function* Fn = MF.getFunction();
+  if (Fn->hasExternalLinkage() &&
+      Subtarget->isTargetCygMing() &&
+      Fn->getName() == "main")
+    FuncInfo->setForceFramePointer(true);
+
+  // Decorate the function name.
+  FuncInfo->setDecorationStyle(NameDecorationForCallConv(CallConv));
+
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool Is64Bit = Subtarget->is64Bit();
+  bool IsWin64 = Subtarget->isTargetWin64();
+
+  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
+         "Var args not supported with calling convention fastcc");
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
+
+  unsigned LastVal = ~0U;
+  SDValue ArgValue;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
+    // places.
+    assert(VA.getValNo() != LastVal &&
+           "Don't support value assigned to multiple locs yet");
+    LastVal = VA.getValNo();
+
+    if (VA.isRegLoc()) {
+      // Pick the register class matching the location type.
+      EVT RegVT = VA.getLocVT();
+      TargetRegisterClass *RC = NULL;
+      if (RegVT == MVT::i32)
+        RC = X86::GR32RegisterClass;
+      else if (Is64Bit && RegVT == MVT::i64)
+        RC = X86::GR64RegisterClass;
+      else if (RegVT == MVT::f32)
+        RC = X86::FR32RegisterClass;
+      else if (RegVT == MVT::f64)
+        RC = X86::FR64RegisterClass;
+      else if (RegVT.isVector() && RegVT.getSizeInBits() == 128)
+        RC = X86::VR128RegisterClass;
+      else if (RegVT.isVector() && RegVT.getSizeInBits() == 64)
+        RC = X86::VR64RegisterClass;
+      else
+        llvm_unreachable("Unknown argument type!");
+
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
+
+      // If this is an 8 or 16-bit value, it is really passed promoted to 32
+      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
+      // right size.
+      if (VA.getLocInfo() == CCValAssign::SExt)
+        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::ZExt)
+        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
+                               DAG.getValueType(VA.getValVT()));
+      else if (VA.getLocInfo() == CCValAssign::BCvt)
+        ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+
+      if (VA.isExtInLoc()) {
+        // Handle MMX values passed in XMM regs.
+        if (RegVT.isVector()) {
+          ArgValue = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+                                 ArgValue, DAG.getConstant(0, MVT::i64));
+          ArgValue = DAG.getNode(ISD::BIT_CONVERT, dl, VA.getValVT(), ArgValue);
+        } else
+          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
+      }
+    } else {
+      assert(VA.isMemLoc());
+      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
+    }
+
+    // If value is passed via pointer - do a load.
+    if (VA.getLocInfo() == CCValAssign::Indirect)
+      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, NULL, 0);
+
+    InVals.push_back(ArgValue);
+  }
+
+  // The x86-64 ABI for returning structs by value requires that we copy
+  // the sret argument into %rax for the return. Save the argument into
+  // a virtual register so that we can access it from the return points.
+  if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
+    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+    if (!Reg) {
+      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+      FuncInfo->setSRetReturnReg(Reg);
+    }
+    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+  }
+
+  unsigned StackSize = CCInfo.getNextStackOffset();
+  // Align stack specially for tail calls.
+  if (FuncIsMadeTailCallSafe(CallConv))
+    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
+
+  // If the function takes variable number of arguments, make a frame index for
+  // the start of the first vararg value... for expansion of llvm.va_start.
+  if (isVarArg) {
+    if (Is64Bit || CallConv != CallingConv::X86_FastCall) {
+      VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize, true, false);
+    }
+    if (Is64Bit) {
+      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
+
+      // FIXME: We should really autogenerate these arrays
+      static const unsigned GPR64ArgRegsWin64[] = {
+        X86::RCX, X86::RDX, X86::R8,  X86::R9
+      };
+      static const unsigned XMMArgRegsWin64[] = {
+        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3
+      };
+      static const unsigned GPR64ArgRegs64Bit[] = {
+        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
+      };
+      static const unsigned XMMArgRegs64Bit[] = {
+        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+      };
+      const unsigned *GPR64ArgRegs, *XMMArgRegs;
+
+      if (IsWin64) {
+        TotalNumIntRegs = 4; TotalNumXMMRegs = 4;
+        GPR64ArgRegs = GPR64ArgRegsWin64;
+        XMMArgRegs = XMMArgRegsWin64;
+      } else {
+        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
+        GPR64ArgRegs = GPR64ArgRegs64Bit;
+        XMMArgRegs = XMMArgRegs64Bit;
+      }
+      // How many parameter registers were consumed by named arguments; the
+      // remainder must be spilled for va_arg.
+      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
+                                                       TotalNumIntRegs);
+      unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs,
+                                                       TotalNumXMMRegs);
+
+      bool NoImplicitFloatOps = Fn->hasFnAttr(Attribute::NoImplicitFloat);
+      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
+             "SSE register cannot be used when SSE is disabled!");
+      assert(!(NumXMMRegs && UseSoftFloat && NoImplicitFloatOps) &&
+             "SSE register cannot be used when SSE is disabled!");
+      if (UseSoftFloat || NoImplicitFloatOps || !Subtarget->hasSSE1())
+        // Kernel mode asks for SSE to be disabled, so don't push them
+        // on the stack.
+        TotalNumXMMRegs = 0;
+
+      // For X86-64, if there are vararg parameters that are passed via
+      // registers, then we must store them to their spots on the stack so they
+      // may be loaded by dereferencing the result of va_next.
+      VarArgsGPOffset = NumIntRegs * 8;
+      VarArgsFPOffset = TotalNumIntRegs * 8 + NumXMMRegs * 16;
+      RegSaveFrameIndex = MFI->CreateStackObject(TotalNumIntRegs * 8 +
+                                                 TotalNumXMMRegs * 16, 16,
+                                                 false);
+
+      // Store the integer parameter registers.
+      SmallVector<SDValue, 8> MemOps;
+      SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+      unsigned Offset = VarArgsGPOffset;
+      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
+        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
+                                  DAG.getIntPtrConstant(Offset));
+        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
+                                     X86::GR64RegisterClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
+        SDValue Store =
+          DAG.getStore(Val.getValue(1), dl, Val, FIN,
+                       PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
+                       Offset);
+        MemOps.push_back(Store);
+        Offset += 8;
+      }
+
+      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
+        // Now store the XMM (fp + vector) parameter registers.  These are
+        // emitted as one VASTART_SAVE_XMM_REGS node, guarded at runtime by
+        // AL (which the ABI uses to pass the number of vector registers).
+        SmallVector<SDValue, 11> SaveXMMOps;
+        SaveXMMOps.push_back(Chain);
+
+        unsigned AL = MF.addLiveIn(X86::AL, X86::GR8RegisterClass);
+        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
+        SaveXMMOps.push_back(ALVal);
+
+        SaveXMMOps.push_back(DAG.getIntPtrConstant(RegSaveFrameIndex));
+        SaveXMMOps.push_back(DAG.getIntPtrConstant(VarArgsFPOffset));
+
+        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
+          unsigned VReg = MF.addLiveIn(XMMArgRegs[NumXMMRegs],
+                                       X86::VR128RegisterClass);
+          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
+          SaveXMMOps.push_back(Val);
+        }
+        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
+                                     MVT::Other,
+                                     &SaveXMMOps[0], SaveXMMOps.size()));
+      }
+
+      if (!MemOps.empty())
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            &MemOps[0], MemOps.size());
+    }
+  }
+
+  // Some CCs need callee pop.
+  if (IsCalleePop(isVarArg, CallConv)) {
+    BytesToPopOnReturn  = StackSize; // Callee pops everything.
+  } else {
+    BytesToPopOnReturn  = 0; // Callee pops nothing.
+    // If this is an sret function, the return should pop the hidden pointer.
+    if (!Is64Bit && CallConv != CallingConv::Fast && ArgsAreStructReturn(Ins))
+      BytesToPopOnReturn = 4;
+  }
+
+  if (!Is64Bit) {
+    RegSaveFrameIndex = 0xAAAAAAA;   // RegSaveFrameIndex is X86-64 only.
+    if (CallConv == CallingConv::X86_FastCall)
+      VarArgsFrameIndex = 0xAAAAAAA;   // fastcc functions can't have varargs.
+  }
+
+  FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
+
+  return Chain;
+}
+
+/// LowerMemOpCallTo - Emit the store (or byval copy) for one outgoing call
+/// argument that was assigned a stack location by CC analysis.
+SDValue
+X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
+                                    SDValue StackPtr, SDValue Arg,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    const CCValAssign &VA,
+                                    ISD::ArgFlagsTy Flags) {
+  // Win64 reserves 32 bytes of shadow space before the first stack argument.
+  const unsigned FirstStackArgOffset = (Subtarget->isTargetWin64() ? 32 : 0);
+  unsigned LocMemOffset = FirstStackArgOffset + VA.getLocMemOffset();
+  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
+  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
+  if (Flags.isByVal()) {
+    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
+  }
+  return DAG.getStore(Chain, dl, Arg, PtrOff,
+                      PseudoSourceValue::getStack(), LocMemOffset);
+}
+
+/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
+/// optimization is performed and it is required.
+///
+/// On return, OutRetAddr holds the loaded return-address value; the function
+/// result is the chain output (result #1) of that load node.
+/// NOTE(review): IsTailCall, Is64Bit and FPDiff are unused here — the caller
+/// already guards the call on isTailCall && FPDiff.
+SDValue
+X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
+                                           SDValue &OutRetAddr, SDValue Chain,
+                                           bool IsTailCall, bool Is64Bit,
+                                           int FPDiff, DebugLoc dl) {
+  // Adjust the Return address stack slot.
+  EVT VT = getPointerTy();
+  // Frame index of the (possibly newly created) return-address slot.
+  OutRetAddr = getReturnAddressFrameIndex(DAG);
+
+  // Load the "old" Return address.
+  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, NULL, 0);
+  return SDValue(OutRetAddr.getNode(), 1);
+}
+
+/// EmitTailCallStoreRetAddr - Store the previously loaded return address into
+/// its relocated stack slot when a tail call moves the stack frame
+/// (FPDiff != 0); otherwise return the chain untouched.
+static SDValue
+EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
+                         SDValue Chain, SDValue RetAddrFrIdx,
+                         bool Is64Bit, int FPDiff, DebugLoc dl) {
+  // Nothing to do unless the return-address slot actually moved.
+  if (FPDiff == 0)
+    return Chain;
+
+  // Reserve a fixed stack object for the relocated return address and store
+  // the old value into it.
+  int SlotSize = Is64Bit ? 8 : 4;
+  EVT SlotVT = Is64Bit ? MVT::i64 : MVT::i32;
+  int RetFI =
+    MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff - SlotSize,
+                                         true, false);
+  SDValue NewSlot = DAG.getFrameIndex(RetFI, SlotVT);
+  return DAG.getStore(Chain, dl, RetAddrFrIdx, NewSlot,
+                      PseudoSourceValue::getFixedStack(RetFI), 0);
+}
+
+/// LowerCall - Lower an outgoing call: analyze operands per the calling
+/// convention, emit the register copies and stack stores for the arguments,
+/// materialize the callee address, then emit either an X86ISD::CALL (with a
+/// matching CALLSEQ_START/END pair) or, for tail calls, an X86ISD::TC_RETURN.
+/// isTailCall is in-out: it is cleared if the call turns out not to be
+/// eligible for tail call optimization.
+SDValue
+X86TargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                             CallingConv::ID CallConv, bool isVarArg,
+                             bool &isTailCall,
+                             const SmallVectorImpl<ISD::OutputArg> &Outs,
+                             const SmallVectorImpl<ISD::InputArg> &Ins,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             SmallVectorImpl<SDValue> &InVals) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool Is64Bit        = Subtarget->is64Bit();
+  bool IsStructRet    = CallIsStructReturn(Outs);
+  bool IsSibcall      = false;
+
+  if (isTailCall) {
+    // Check if it's really possible to do a tail call.
+    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
+                                                   Outs, Ins, DAG);
+
+    // Sibcalls are automatically detected tailcalls which do not require
+    // ABI changes.
+    if (!GuaranteedTailCallOpt && isTailCall)
+      IsSibcall = true;
+
+    if (isTailCall)
+      ++NumTailCalls;
+  }
+
+  assert(!(isVarArg && CallConv == CallingConv::Fast) &&
+         "Var args not supported with calling convention fastcc");
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+  if (IsSibcall)
+    // This is a sibcall. The memory operands are available in caller's
+    // own caller's stack.
+    NumBytes = 0;
+  else if (GuaranteedTailCallOpt && CallConv == CallingConv::Fast)
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
+
+  // FPDiff: how far the return-address slot moves for a non-sibcall tail
+  // call (callee needs more or less stack than the caller provides).
+  int FPDiff = 0;
+  if (isTailCall && !IsSibcall) {
+    // Lower arguments at fp - stackoffset + fpdiff.
+    unsigned NumBytesCallerPushed =
+      MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+    FPDiff = NumBytesCallerPushed - NumBytes;
+
+    // Set the delta of movement of the returnaddr stackslot.
+    // But only set if delta is greater than previous delta.
+    if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+      MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+  }
+
+  if (!IsSibcall)
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true));
+
+  SDValue RetAddrFrIdx;
+  // Load return address for tail calls.
+  if (isTailCall && FPDiff)
+    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
+                                    Is64Bit, FPDiff, dl);
+
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
+  SDValue StackPtr;
+
+  // Walk the register/memloc assignments, inserting copies/loads.  In the case
+  // of tail call optimization arguments are handled later.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    EVT RegVT = VA.getLocVT();
+    SDValue Arg = Outs[i].Val;
+    ISD::ArgFlagsTy Flags = Outs[i].Flags;
+    bool isByVal = Flags.isByVal();
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::AExt:
+      if (RegVT.isVector() && RegVT.getSizeInBits() == 128) {
+        // Special case: passing MMX values in XMM registers.
+        Arg = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i64, Arg);
+        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
+        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
+      } else
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BIT_CONVERT, dl, RegVT, Arg);
+      break;
+    case CCValAssign::Indirect: {
+      // Store the argument to a spill slot and pass its address instead.
+      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
+      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
+                           PseudoSourceValue::getFixedStack(FI), 0);
+      Arg = SpillSlot;
+      break;
+    }
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else if (!IsSibcall && (!isTailCall || isByVal)) {
+      // Stack arguments for non-tail calls (and byval args of tail calls)
+      // are stored now; other tail-call stack args are placed later.
+      assert(VA.isMemLoc());
+      if (StackPtr.getNode() == 0)
+        StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, getPointerTy());
+      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
+                                             dl, DAG, VA, Flags));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDValue InFlag;
+  // Tail call byval lowering might overwrite argument registers so in case of
+  // tail call optimization the copies to registers are lowered later.
+  if (!isTailCall)
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+
+  if (Subtarget->isPICStyleGOT()) {
+    // ELF / PIC requires GOT in the EBX register before function calls via PLT
+    // GOT pointer.
+    if (!isTailCall) {
+      Chain = DAG.getCopyToReg(Chain, dl, X86::EBX,
+                               DAG.getNode(X86ISD::GlobalBaseReg,
+                                           DebugLoc::getUnknownLoc(),
+                                           getPointerTy()),
+                               InFlag);
+      InFlag = Chain.getValue(1);
+    } else {
+      // If we are tail calling and generating PIC/GOT style code load the
+      // address of the callee into ECX. The value in ecx is used as target of
+      // the tail jump. This is done to circumvent the ebx/callee-saved problem
+      // for tail calls on PIC/GOT architectures. Normally we would just put the
+      // address of GOT into ebx and then call target@PLT. But for tail calls
+      // ebx would be restored (since ebx is callee saved) before jumping to the
+      // target@PLT.
+
+      // Note: The actual moving to ECX is done further down.
+      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+      if (G && !G->getGlobal()->hasHiddenVisibility() &&
+          !G->getGlobal()->hasProtectedVisibility())
+        Callee = LowerGlobalAddress(Callee, DAG);
+      else if (isa<ExternalSymbolSDNode>(Callee))
+        Callee = LowerExternalSymbol(Callee, DAG);
+    }
+  }
+
+  if (Is64Bit && isVarArg) {
+    // From AMD64 ABI document:
+    // For calls that may call functions that use varargs or stdargs
+    // (prototype-less calls or calls to functions containing ellipsis (...) in
+    // the declaration) %al is used as hidden argument to specify the number
+    // of SSE registers used. The contents of %al do not need to match exactly
+    // the number of registers, but must be an upper bound on the number of SSE
+    // registers used and is in the range 0 - 8 inclusive.
+
+    // FIXME: Verify this on Win64
+    // Count the number of XMM registers allocated.
+    static const unsigned XMMArgRegs[] = {
+      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
+      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
+    };
+    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
+    assert((Subtarget->hasSSE1() || !NumXMMRegs)
+           && "SSE registers cannot be used when SSE is disabled");
+
+    Chain = DAG.getCopyToReg(Chain, dl, X86::AL,
+                             DAG.getConstant(NumXMMRegs, MVT::i8), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+
+  // For tail calls lower the arguments to the 'real' stack slot.
+  if (isTailCall) {
+    // Force all the incoming stack arguments to be loaded from the stack
+    // before any new outgoing arguments are stored to the stack, because the
+    // outgoing stack slots may alias the incoming argument stack slots, and
+    // the alias isn't otherwise explicit. This is slightly more conservative
+    // than necessary, because it means that each store effectively depends
+    // on every argument instead of just those arguments it would clobber.
+    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
+
+    SmallVector<SDValue, 8> MemOpChains2;
+    SDValue FIN;
+    int FI = 0;
+    // Do not flag preceding copytoreg stuff together with the following stuff.
+    InFlag = SDValue();
+    if (GuaranteedTailCallOpt) {
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        if (VA.isRegLoc())
+          continue;
+        assert(VA.isMemLoc());
+        SDValue Arg = Outs[i].Val;
+        ISD::ArgFlagsTy Flags = Outs[i].Flags;
+        // Create frame index for the slot in the callee-adjusted frame.
+        int32_t Offset = VA.getLocMemOffset()+FPDiff;
+        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true, false);
+        FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+        if (Flags.isByVal()) {
+          // Copy relative to framepointer.
+          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+          if (StackPtr.getNode() == 0)
+            StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr,
+                                          getPointerTy());
+          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                           ArgChain,
+                                                           Flags, DAG, dl));
+        } else {
+          // Store relative to framepointer.
+          MemOpChains2.push_back(
+            DAG.getStore(ArgChain, dl, Arg, FIN,
+                         PseudoSourceValue::getFixedStack(FI), 0));
+        }
+      }
+    }
+
+    if (!MemOpChains2.empty())
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                          &MemOpChains2[0], MemOpChains2.size());
+
+    // Copy arguments to their registers.
+    for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+      Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
+                               RegsToPass[i].second, InFlag);
+      InFlag = Chain.getValue(1);
+    }
+    InFlag =SDValue();
+
+    // Store the return address to the appropriate stack slot.
+    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit,
+                                     FPDiff, dl);
+  }
+
+  // Turn the callee into a Target* node where possible so legalize leaves it
+  // alone; track whether we did, since tail calls to other callees must be
+  // forced through a register below.
+  bool WasGlobalOrExternal = false;
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
+    // In the 64-bit large code model, we have to make all calls
+    // through a register, since the call instruction's 32-bit
+    // pc-relative offset may not be large enough to hold the whole
+    // address.
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    WasGlobalOrExternal = true;
+    // If the callee is a GlobalAddress node (quite common, every direct call
+    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
+    // it.
+
+    // We should use extra load for direct calls to dllimported functions in
+    // non-JIT mode.
+    GlobalValue *GV = G->getGlobal();
+    if (!GV->hasDLLImportLinkage()) {
+      unsigned char OpFlags = 0;
+
+      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
+      // external symbols must go through the PLT in PIC mode.  If the symbol
+      // has hidden or protected visibility, or if it is static or local, then
+      // we don't need to use the PLT - we can directly call it.
+      if (Subtarget->isTargetELF() &&
+          getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
+        OpFlags = X86II::MO_PLT;
+      } else if (Subtarget->isPICStyleStubAny() &&
+               (GV->isDeclaration() || GV->isWeakForLinker()) &&
+               Subtarget->getDarwinVers() < 9) {
+        // PC-relative references to external symbols should go through $stub,
+        // unless we're building with the leopard linker or later, which
+        // automatically synthesizes these stubs.
+        OpFlags = X86II::MO_DARWIN_STUB;
+      }
+
+      Callee = DAG.getTargetGlobalAddress(GV, getPointerTy(),
+                                          G->getOffset(), OpFlags);
+    }
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    WasGlobalOrExternal = true;
+    unsigned char OpFlags = 0;
+
+    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to external
+    // symbols should go through the PLT.
+    if (Subtarget->isTargetELF() &&
+        getTargetMachine().getRelocationModel() == Reloc::PIC_) {
+      OpFlags = X86II::MO_PLT;
+    } else if (Subtarget->isPICStyleStubAny() &&
+             Subtarget->getDarwinVers() < 9) {
+      // PC-relative references to external symbols should go through $stub,
+      // unless we're building with the leopard linker or later, which
+      // automatically synthesizes these stubs.
+      OpFlags = X86II::MO_DARWIN_STUB;
+    }
+
+    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
+                                         OpFlags);
+  }
+
+  if (isTailCall && !WasGlobalOrExternal) {
+    // Force the address into a caller-saved register since the
+    // tailcall must happen after callee-saved registers are popped.
+    // FIXME: Give it a special register class that contains caller-saved
+    // register instead?
+    unsigned TCReg = Is64Bit ? X86::R11 : X86::EAX;
+    Chain = DAG.getCopyToReg(Chain,  dl,
+                             DAG.getRegister(TCReg, getPointerTy()),
+                             Callee,InFlag);
+    Callee = DAG.getRegister(TCReg, getPointerTy());
+  }
+
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDValue, 8> Ops;
+
+  if (!IsSibcall && isTailCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                           DAG.getIntPtrConstant(0, true), InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  if (isTailCall)
+    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  // Add an implicit use GOT pointer in EBX.
+  if (!isTailCall && Subtarget->isPICStyleGOT())
+    Ops.push_back(DAG.getRegister(X86::EBX, getPointerTy()));
+
+  // Add an implicit use of AL for x86 vararg functions.
+  if (Is64Bit && isVarArg)
+    Ops.push_back(DAG.getRegister(X86::AL, MVT::i8));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  if (isTailCall) {
+    // If this is the first return lowered for this function, add the regs
+    // to the liveout set for the function.
+    if (MF.getRegInfo().liveout_empty()) {
+      SmallVector<CCValAssign, 16> RVLocs;
+      CCState CCInfo(CallConv, isVarArg, getTargetMachine(), RVLocs,
+                     *DAG.getContext());
+      CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
+      for (unsigned i = 0; i != RVLocs.size(); ++i)
+        if (RVLocs[i].isRegLoc())
+          MF.getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+    }
+
+    assert(((Callee.getOpcode() == ISD::Register &&
+               (cast<RegisterSDNode>(Callee)->getReg() == X86::EAX ||
+                cast<RegisterSDNode>(Callee)->getReg() == X86::R11)) ||
+              Callee.getOpcode() == ISD::TargetExternalSymbol ||
+              Callee.getOpcode() == ISD::TargetGlobalAddress) &&
+           "Expecting a global address, external symbol, or scratch register");
+
+    return DAG.getNode(X86ISD::TC_RETURN, dl,
+                       NodeTys, &Ops[0], Ops.size());
+  }
+
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  unsigned NumBytesForCalleeToPush;
+  if (IsCalleePop(isVarArg, CallConv))
+    NumBytesForCalleeToPush = NumBytes;    // Callee pops everything
+  else if (!Is64Bit && CallConv != CallingConv::Fast && IsStructRet)
+    // If this is a call to a struct-return function, the callee
+    // pops the hidden struct pointer, so we have to push it back.
+    // This is common for Darwin/X86, Linux & Mingw32 targets.
+    NumBytesForCalleeToPush = 4;
+  else
+    NumBytesForCalleeToPush = 0;  // Callee pops nothing.
+
+  // Returns a flag for retval copy to use.
+  if (!IsSibcall) {
+    Chain = DAG.getCALLSEQ_END(Chain,
+                               DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(NumBytesForCalleeToPush,
+                                                     true),
+                               InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+
+//===----------------------------------------------------------------------===//
+//                Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+//  Like the stdcall convention, the callee cleans up the arguments, except
+//  that ECX is reserved for storing the tail called function address. Only 2 registers are
+//  free for argument passing (inreg). Tail call optimization is performed
+//  provided:
+//                * tailcallopt is enabled
+//                * caller/callee are fastcc
+//  On X86_64 architecture with GOT-style position independent code only local
+//  (within module) calls are supported at the moment.
+//  To keep the stack aligned according to platform abi the function
+//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
+//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+//  If a tail called function callee has more arguments than the caller the
+//  caller needs to make sure that there is room to move the RETADDR to. This is
+//  achieved by reserving an area the size of the argument delta right after the
+//  original RETADDR, but before the saved framepointer or the spilled registers
+//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+//  stack layout:
+//    arg1
+//    arg2
+//    RETADDR
+//    [ new RETADDR
+//      move area ]
+//    (possible EBP)
+//    ESI
+//    EDI
+//    local1 ..
+
+/// GetAlignedArgumentStackSize - Round the argument stack size up so that,
+/// once the return address (one pointer) is pushed on top of it, the stack
+/// stays aligned to the target's stack alignment (e.g. 16n + 12 for a 16-byte
+/// alignment with 4-byte pointers).
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                                        SelectionDAG& DAG) {
+  const TargetFrameInfo &FrameInfo =
+    *DAG.getMachineFunction().getTarget().getFrameInfo();
+  unsigned Align = FrameInfo.getStackAlignment();
+  uint64_t Mask = Align - 1;
+  uint64_t PtrSize = TD->getPointerSize();
+
+  // We want: adjusted % Align == Align - PtrSize, so the return-address push
+  // restores full alignment.
+  uint64_t WantedResidue = Align - PtrSize;
+  int64_t Size = StackSize;
+  uint64_t Residue = Size & Mask;
+  if (Residue <= WantedResidue) {
+    // Pad up within the current alignment block.
+    Size += WantedResidue - Residue;
+  } else {
+    // Round down to the block boundary, then go one full block further and
+    // add the wanted residue.
+    Size = (Size & ~Mask) + Align + WantedResidue;
+  }
+  return Size;
+}
+
+/// MatchingStackOffset - Return true if the given stack call argument is
+/// already available in the same position (relatively) of the caller's
+/// incoming argument stack.
+static
+bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
+                         MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
+                         const X86InstrInfo *TII) {
+  int FrameIdx;
+  if (Arg.getOpcode() == ISD::CopyFromReg) {
+    // Trace the virtual register back to its defining instruction to find
+    // the frame index the value originated from.
+    unsigned Reg = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+    if (Reg == 0 || TargetRegisterInfo::isPhysicalRegister(Reg))
+      return false;
+    MachineInstr *DefMI = MRI->getVRegDef(Reg);
+    if (DefMI == 0)
+      return false;
+    if (Flags.isByVal()) {
+      // For byval arguments we expect an LEA of a frame index whose object
+      // size matches the byval size.
+      unsigned DefOpc = DefMI->getOpcode();
+      if (DefOpc != X86::LEA32r && DefOpc != X86::LEA64r)
+        return false;
+      if (!DefMI->getOperand(1).isFI())
+        return false;
+      FrameIdx = DefMI->getOperand(1).getIndex();
+      if (MFI->getObjectSize(FrameIdx) != Flags.getByValSize())
+        return false;
+    } else if (!TII->isLoadFromStackSlot(DefMI, FrameIdx)) {
+      return false;
+    }
+  } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
+    // A direct load: its base pointer must be a frame index.
+    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ld->getBasePtr());
+    if (FINode == 0)
+      return false;
+    FrameIdx = FINode->getIndex();
+  } else {
+    return false;
+  }
+
+  // The slot must be a fixed (incoming-argument) object at exactly the
+  // offset the callee expects.
+  return MFI->isFixedObjectIndex(FrameIdx) &&
+         Offset == MFI->getObjectOffset(FrameIdx);
+}
+
+/// IsEligibleForTailCallOptimization - Check whether the call is eligible
+/// for tail call optimization. Targets which want to do tail call
+/// optimization should implement this function.
+///
+/// Returns true either for a guaranteed (-tailcallopt, fastcc-to-fastcc)
+/// tail call or for an ABI-neutral sibcall whose stack arguments are already
+/// in place in the caller's frame.
+bool
+X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
+                                                     CallingConv::ID CalleeCC,
+                                                     bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                                     SelectionDAG& DAG) const {
+  // Only C and fastcc callees can be tail-called at all.
+  if (CalleeCC != CallingConv::Fast &&
+      CalleeCC != CallingConv::C)
+    return false;
+
+  // If -tailcallopt is specified, make fastcc functions tail-callable.
+  const Function *CallerF = DAG.getMachineFunction().getFunction();
+  if (GuaranteedTailCallOpt) {
+    if (CalleeCC == CallingConv::Fast &&
+        CallerF->getCallingConv() == CalleeCC)
+      return true;
+    return false;
+  }
+
+  // Look for obvious safe cases to perform tail call optimization that do not
+  // require ABI changes. This is what gcc calls sibcall.
+
+  // Do not tail call optimize vararg calls for now.
+  if (isVarArg)
+    return false;
+
+  // If the callee takes no arguments then go on to check the results of the
+  // call.
+  if (!Outs.empty()) {
+    // Check if stack adjustment is needed. For now, do not do this if any
+    // argument is passed on the stack.
+    SmallVector<CCValAssign, 16> ArgLocs;
+    CCState CCInfo(CalleeCC, isVarArg, getTargetMachine(),
+                   ArgLocs, *DAG.getContext());
+    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+    if (CCInfo.getNextStackOffset()) {
+      MachineFunction &MF = DAG.getMachineFunction();
+      // If the caller pops its own arguments, the frame cannot be reused.
+      if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
+        return false;
+      if (Subtarget->isTargetWin64())
+        // Win64 ABI has additional complications.
+        return false;
+
+      // Check if the arguments are already laid out in the right way as
+      // the caller's fixed stack objects.
+      MachineFrameInfo *MFI = MF.getFrameInfo();
+      const MachineRegisterInfo *MRI = &MF.getRegInfo();
+      const X86InstrInfo *TII =
+        ((X86TargetMachine&)getTargetMachine()).getInstrInfo();
+      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+        CCValAssign &VA = ArgLocs[i];
+        ISD::ArgFlagsTy Flags = Outs[i].Flags;
+        // Indirect arguments need a new spill slot, which a sibcall
+        // cannot provide.
+        if (VA.getLocInfo() == CCValAssign::Indirect)
+          return false;
+        if (!VA.isRegLoc()) {
+          SDValue Arg = Outs[i].Val;
+          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
+                                   MFI, MRI, TII))
+            return false;
+        }
+      }
+    }
+  }
+
+  return true;
+}
+
+/// createFastISel - Create the X86 FastISel instance; a thin forwarder to
+/// X86::createFastISel that passes through the value/block/alloca maps.
+/// NOTE(review): the 'cil' SmallSet parameter only exists in !NDEBUG builds,
+/// matching the factory's signature.
+FastISel *
+X86TargetLowering::createFastISel(MachineFunction &mf, MachineModuleInfo *mmo,
+                            DwarfWriter *dw,
+                            DenseMap<const Value *, unsigned> &vm,
+                            DenseMap<const BasicBlock*, MachineBasicBlock*> &bm,
+                            DenseMap<const AllocaInst *, int> &am
+#ifndef NDEBUG
+                          , SmallSet<Instruction*, 8> &cil
+#endif
+                                  ) {
+  return X86::createFastISel(mf, mmo, dw, vm, bm, am
+#ifndef NDEBUG
+                             , cil
+#endif
+                             );
+}
+
+
+//===----------------------------------------------------------------------===//
+//                           Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+
+/// getReturnAddressFrameIndex - Return a frame-index node for the slot that
+/// holds the return address (one pointer below the incoming stack pointer).
+/// The fixed object is created lazily and its index cached in the function
+/// info so repeated queries reuse the same slot.
+SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) {
+  MachineFunction &Fn = DAG.getMachineFunction();
+  X86MachineFunctionInfo *FuncInfo = Fn.getInfo<X86MachineFunctionInfo>();
+  int RAIdx = FuncInfo->getRAIndex();
+
+  if (RAIdx == 0) {
+    // Not created yet: reserve a pointer-sized fixed object just below SP.
+    uint64_t PtrBytes = TD->getPointerSize();
+    RAIdx = Fn.getFrameInfo()->CreateFixedObject(PtrBytes, -PtrBytes,
+                                                 true, false);
+    FuncInfo->setRAIndex(RAIdx);
+  }
+
+  return DAG.getFrameIndex(RAIdx, getPointerTy());
+}
+
+
+/// isOffsetSuitableForCodeModel - Decide whether a constant offset can be
+/// folded into an addressing mode under the given code model, considering
+/// whether a symbolic displacement is also present.
+bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
+                                       bool hasSymbolicDisplacement) {
+  // The displacement field is only 32 bits wide.
+  if (!isInt32(Offset))
+    return false;
+
+  // Without a symbolic displacement there are no further restrictions.
+  if (!hasSymbolicDisplacement)
+    return true;
+
+  // For small code model we assume the last object ends 16MB before the end
+  // of the 31-bit boundary, so large negative constants are acceptable since
+  // all objects live in the positive half of the address space.
+  if (M == CodeModel::Small)
+    return Offset < 16*1024*1024;
+
+  // For kernel code model all objects reside in the negative half of the
+  // 32-bit address space, so negative offsets are rejected while pretty
+  // large positive ones are fine.
+  if (M == CodeModel::Kernel)
+    return Offset > 0;
+
+  // FIXME: Some tweaks might be needed for medium code model.
+  return false;
+}
+
+/// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
+/// specific condition code, returning the condition code and the LHS/RHS of the
+/// comparison to make.
+static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
+                               SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
+  if (!isFP) {
+    // Integer comparisons against constants: rewrite a few patterns into
+    // cheaper sign-flag tests against zero.
+    if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+      if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
+        // X > -1   -> X == 0, jump !sign.
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        return X86::COND_NS;
+      } else if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
+        // X < 0   -> X == 0, jump on sign.
+        return X86::COND_S;
+      } else if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
+        // X < 1   -> X <= 0
+        RHS = DAG.getConstant(0, RHS.getValueType());
+        return X86::COND_LE;
+      }
+    }
+
+    // Direct mapping for the remaining integer condition codes.
+    switch (SetCCOpcode) {
+    default: llvm_unreachable("Invalid integer condition!");
+    case ISD::SETEQ:  return X86::COND_E;
+    case ISD::SETGT:  return X86::COND_G;
+    case ISD::SETGE:  return X86::COND_GE;
+    case ISD::SETLT:  return X86::COND_L;
+    case ISD::SETLE:  return X86::COND_LE;
+    case ISD::SETNE:  return X86::COND_NE;
+    case ISD::SETULT: return X86::COND_B;
+    case ISD::SETUGT: return X86::COND_A;
+    case ISD::SETULE: return X86::COND_BE;
+    case ISD::SETUGE: return X86::COND_AE;
+    }
+  }
+
+  // First determine if it is required or is profitable to flip the operands.
+
+  // If LHS is a foldable load, but RHS is not, flip the condition.
+  if ((ISD::isNON_EXTLoad(LHS.getNode()) && LHS.hasOneUse()) &&
+      !(ISD::isNON_EXTLoad(RHS.getNode()) && RHS.hasOneUse())) {
+    SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
+    std::swap(LHS, RHS);
+  }
+
+  // These FP condition codes are implemented via the swapped comparison
+  // (see the "flipped" cases in the switch below).
+  switch (SetCCOpcode) {
+  default: break;
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    std::swap(LHS, RHS);
+    break;
+  }
+
+  // On a floating point condition, the flags are set as follows:
+  // ZF  PF  CF   op
+  //  0 | 0 | 0 | X > Y
+  //  0 | 0 | 1 | X < Y
+  //  1 | 0 | 0 | X == Y
+  //  1 | 1 | 1 | unordered
+  switch (SetCCOpcode) {
+  default: llvm_unreachable("Condcode should be pre-legalized away");
+  case ISD::SETUEQ:
+  case ISD::SETEQ:   return X86::COND_E;
+  case ISD::SETOLT:              // flipped
+  case ISD::SETOGT:
+  case ISD::SETGT:   return X86::COND_A;
+  case ISD::SETOLE:              // flipped
+  case ISD::SETOGE:
+  case ISD::SETGE:   return X86::COND_AE;
+  case ISD::SETUGT:              // flipped
+  case ISD::SETULT:
+  case ISD::SETLT:   return X86::COND_B;
+  case ISD::SETUGE:              // flipped
+  case ISD::SETULE:
+  case ISD::SETLE:   return X86::COND_BE;
+  case ISD::SETONE:
+  case ISD::SETNE:   return X86::COND_NE;
+  case ISD::SETUO:   return X86::COND_P;
+  case ISD::SETO:    return X86::COND_NP;
+  case ISD::SETOEQ:
+  case ISD::SETUNE:  return X86::COND_INVALID;
+  }
+}
+
+/// hasFPCMov - is there a floating point cmov for the specific X86 condition
+/// code. Current x86 isa includes the following FP cmov instructions:
+/// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
+static bool hasFPCMov(unsigned X86CC) {
+  switch (X86CC) {
+  default:
+    return false;
+  case X86::COND_B:
+  case X86::COND_BE:
+  case X86::COND_E:
+  case X86::COND_P:
+  case X86::COND_A:
+  case X86::COND_AE:
+  case X86::COND_NE:
+  case X86::COND_NP:
+    return true;
+  }
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
+    if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
+      return true;
+  }
+  return false;
+}
+
/// isUndefOrInRange - Return true if Val is undef (negative) or if its value
/// falls within the half-open range [Low, Hi).  (The previous comment claimed
/// the range was (L, H], which contradicted the code.)
static bool isUndefOrInRange(int Val, int Low, int Hi) {
  return (Val < 0) || (Val >= Low && Val < Hi);
}
+
/// isUndefOrEqual - Val is either less than zero (undef) or equal to the
/// specified value.
static bool isUndefOrEqual(int Val, int CmpVal) {
  return Val < 0 || Val == CmpVal;
}
+
+/// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFD or PSHUFW.  That is, it doesn't reference
+/// the second operand.
+static bool isPSHUFDMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  if (VT == MVT::v4f32 || VT == MVT::v4i32 || VT == MVT::v4i16)
+    return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4);
+  if (VT == MVT::v2f64 || VT == MVT::v2i64)
+    return (Mask[0] < 2 && Mask[1] < 2);
+  return false;
+}
+
+bool X86::isPSHUFDMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isPSHUFDMask(M, N->getValueType(0));
+}
+
+/// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFHW.
+static bool isPSHUFHWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  if (VT != MVT::v8i16)
+    return false;
+
+  // Lower quadword copied in order or undef.
+  for (int i = 0; i != 4; ++i)
+    if (Mask[i] >= 0 && Mask[i] != i)
+      return false;
+
+  // Upper quadword shuffled.
+  for (int i = 4; i != 8; ++i)
+    if (Mask[i] >= 0 && (Mask[i] < 4 || Mask[i] > 7))
+      return false;
+
+  return true;
+}
+
+bool X86::isPSHUFHWMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isPSHUFHWMask(M, N->getValueType(0));
+}
+
+/// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
+/// is suitable for input to PSHUFLW.
+static bool isPSHUFLWMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  if (VT != MVT::v8i16)
+    return false;
+
+  // Upper quadword copied in order.
+  for (int i = 4; i != 8; ++i)
+    if (Mask[i] >= 0 && Mask[i] != i)
+      return false;
+
+  // Lower quadword shuffled.
+  for (int i = 0; i != 4; ++i)
+    if (Mask[i] >= 4)
+      return false;
+
+  return true;
+}
+
+bool X86::isPSHUFLWMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isPSHUFLWMask(M, N->getValueType(0));
+}
+
/// isPALIGNRMask - Return true if the node specifies a shuffle of elements that
/// is suitable for input to PALIGNR, i.e. a mask that selects one consecutive
/// run of elements out of the concatenation of the two operands.
static bool isPALIGNRMask(const SmallVectorImpl<int> &Mask, EVT VT,
                          bool hasSSSE3) {
  int i, e = VT.getVectorNumElements();

  // Do not handle v2i64 / v2f64 shuffles with palignr.
  if (e < 4 || !hasSSSE3)
    return false;

  // Skip leading undef elements to find the first defined mask entry.
  for (i = 0; i != e; ++i)
    if (Mask[i] >= 0)
      break;

  // All undef, not a palignr.
  if (i == e)
    return false;

  // Determine if it's ok to perform a palignr with only the LHS, since we
  // don't have access to the actual shuffle elements to see if RHS is undef.
  bool Unary = Mask[i] < (int)e;
  bool NeedsUnary = false;

  // s is the shift amount (in elements) implied by the first defined entry.
  int s = Mask[i] - i;

  // Check the rest of the elements to see if they are consecutive.
  for (++i; i != e; ++i) {
    int m = Mask[i];
    if (m < 0) 
      continue;

    // Unary stays true only while every defined entry reads the first
    // operand; NeedsUnary is latched once an entry wraps below the start.
    Unary = Unary && (m < (int)e);
    NeedsUnary = NeedsUnary || (m < s);

    if (NeedsUnary && !Unary)
      return false;
    // Unary case: indices wrap modulo e via the & (e-1) trick (relies on e
    // being a power of two).  Binary case: indices continue linearly into
    // the second operand.
    if (Unary && m != ((s+i) & (e-1)))
      return false;
    if (!Unary && m != (s+i))
      return false;
  }
  return true;
}
+
+bool X86::isPALIGNRMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isPALIGNRMask(M, N->getValueType(0), true);
+}
+
+/// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to SHUFP*.
+static bool isSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  int NumElems = VT.getVectorNumElements();
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  int Half = NumElems / 2;
+  for (int i = 0; i < Half; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, NumElems))
+      return false;
+  for (int i = Half; i < NumElems; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+      return false;
+
+  return true;
+}
+
+bool X86::isSHUFPMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isSHUFPMask(M, N->getValueType(0));
+}
+
+/// isCommutedSHUFP - Returns true if the shuffle mask is exactly
+/// the reverse of what x86 shuffles want. x86 shuffles requires the lower
+/// half elements to come from vector 1 (which would equal the dest.) and
+/// the upper half to come from vector 2.
+static bool isCommutedSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  int NumElems = VT.getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  int Half = NumElems / 2;
+  for (int i = 0; i < Half; ++i)
+    if (!isUndefOrInRange(Mask[i], NumElems, NumElems*2))
+      return false;
+  for (int i = Half; i < NumElems; ++i)
+    if (!isUndefOrInRange(Mask[i], 0, NumElems))
+      return false;
+  return true;
+}
+
+static bool isCommutedSHUFP(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return isCommutedSHUFPMask(M, N->getValueType(0));
+}
+
+/// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVHLPS.
+bool X86::isMOVHLPSMask(ShuffleVectorSDNode *N) {
+  if (N->getValueType(0).getVectorNumElements() != 4)
+    return false;
+
+  // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
+  return isUndefOrEqual(N->getMaskElt(0), 6) &&
+         isUndefOrEqual(N->getMaskElt(1), 7) &&
+         isUndefOrEqual(N->getMaskElt(2), 2) &&
+         isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
+/// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
+/// <2, 3, 2, 3>
+bool X86::isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N) {
+  unsigned NumElems = N->getValueType(0).getVectorNumElements();
+  
+  if (NumElems != 4)
+    return false;
+  
+  return isUndefOrEqual(N->getMaskElt(0), 2) &&
+  isUndefOrEqual(N->getMaskElt(1), 3) &&
+  isUndefOrEqual(N->getMaskElt(2), 2) &&
+  isUndefOrEqual(N->getMaskElt(3), 3);
+}
+
+/// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
+bool X86::isMOVLPMask(ShuffleVectorSDNode *N) {
+  unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), i + NumElems))
+      return false;
+
+  for (unsigned i = NumElems/2; i < NumElems; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), i))
+      return false;
+
+  return true;
+}
+
+/// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVLHPS.
+bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) {
+  unsigned NumElems = N->getValueType(0).getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), i))
+      return false;
+
+  for (unsigned i = 0; i < NumElems/2; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i + NumElems/2), i + NumElems))
+      return false;
+
+  return true;
+}
+
+/// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKL.
+static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                         bool V2IsSplat = false) {
+  int NumElts = VT.getVectorNumElements();
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+    int BitI  = Mask[i];
+    int BitI1 = Mask[i+1];
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (V2IsSplat) {
+      if (!isUndefOrEqual(BitI1, NumElts))
+        return false;
+    } else {
+      if (!isUndefOrEqual(BitI1, j + NumElts))
+        return false;
+    }
+  }
+  return true;
+}
+
+bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to UNPCKH.
+static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                         bool V2IsSplat = false) {
+  int NumElts = VT.getVectorNumElements();
+  if (NumElts != 2 && NumElts != 4 && NumElts != 8 && NumElts != 16)
+    return false;
+
+  for (int i = 0, j = 0; i != NumElts; i += 2, ++j) {
+    int BitI  = Mask[i];
+    int BitI1 = Mask[i+1];
+    if (!isUndefOrEqual(BitI, j + NumElts/2))
+      return false;
+    if (V2IsSplat) {
+      if (isUndefOrEqual(BitI1, NumElts))
+        return false;
+    } else {
+      if (!isUndefOrEqual(BitI1, j + NumElts/2 + NumElts))
+        return false;
+    }
+  }
+  return true;
+}
+
+bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
+}
+
+/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
+/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
+/// <0, 0, 1, 1>
+static bool isUNPCKL_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  int NumElems = VT.getVectorNumElements();
+  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+    return false;
+
+  for (int i = 0, j = 0; i != NumElems; i += 2, ++j) {
+    int BitI  = Mask[i];
+    int BitI1 = Mask[i+1];
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (!isUndefOrEqual(BitI1, j))
+      return false;
+  }
+  return true;
+}
+
+bool X86::isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isUNPCKL_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
+/// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
+/// <2, 2, 3, 3>
+static bool isUNPCKH_v_undef_Mask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  int NumElems = VT.getVectorNumElements();
+  if (NumElems != 2 && NumElems != 4 && NumElems != 8 && NumElems != 16)
+    return false;
+
+  for (int i = 0, j = NumElems / 2; i != NumElems; i += 2, ++j) {
+    int BitI  = Mask[i];
+    int BitI1 = Mask[i+1];
+    if (!isUndefOrEqual(BitI, j))
+      return false;
+    if (!isUndefOrEqual(BitI1, j))
+      return false;
+  }
+  return true;
+}
+
+bool X86::isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isUNPCKH_v_undef_Mask(M, N->getValueType(0));
+}
+
+/// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSS,
+/// MOVSD, and MOVD, i.e. setting the lowest element.
+static bool isMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT) {
+  if (VT.getVectorElementType().getSizeInBits() < 32)
+    return false;
+
+  int NumElts = VT.getVectorNumElements();
+
+  if (!isUndefOrEqual(Mask[0], NumElts))
+    return false;
+
+  for (int i = 1; i < NumElts; ++i)
+    if (!isUndefOrEqual(Mask[i], i))
+      return false;
+
+  return true;
+}
+
+bool X86::isMOVLMask(ShuffleVectorSDNode *N) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return ::isMOVLMask(M, N->getValueType(0));
+}
+
+/// isCommutedMOVL - Returns true if the shuffle mask is except the reverse
+/// of what x86 movss want. X86 movs requires the lowest  element to be lowest
+/// element of vector 2 and the other elements to come from vector 1 in order.
+static bool isCommutedMOVLMask(const SmallVectorImpl<int> &Mask, EVT VT,
+                               bool V2IsSplat = false, bool V2IsUndef = false) {
+  int NumOps = VT.getVectorNumElements();
+  if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
+    return false;
+
+  if (!isUndefOrEqual(Mask[0], 0))
+    return false;
+
+  for (int i = 1; i < NumOps; ++i)
+    if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
+          (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
+          (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
+      return false;
+
+  return true;
+}
+
+static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
+                           bool V2IsUndef = false) {
+  SmallVector<int, 8> M;
+  N->getMask(M);
+  return isCommutedMOVLMask(M, N->getValueType(0), V2IsSplat, V2IsUndef);
+}
+
+/// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
+bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N) {
+  if (N->getValueType(0).getVectorNumElements() != 4)
+    return false;
+
+  // Expect 1, 1, 3, 3
+  for (unsigned i = 0; i < 2; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt >= 0 && Elt != 1)
+      return false;
+  }
+
+  bool HasHi = false;
+  for (unsigned i = 2; i < 4; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt >= 0 && Elt != 3)
+      return false;
+    if (Elt == 3)
+      HasHi = true;
+  }
+  // Don't use movshdup if it can be done with a shufps.
+  // FIXME: verify that matching u, u, 3, 3 is what we want.
+  return HasHi;
+}
+
+/// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
+bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N) {
+  if (N->getValueType(0).getVectorNumElements() != 4)
+    return false;
+
+  // Expect 0, 0, 2, 2
+  for (unsigned i = 0; i < 2; ++i)
+    if (N->getMaskElt(i) > 0)
+      return false;
+
+  bool HasHi = false;
+  for (unsigned i = 2; i < 4; ++i) {
+    int Elt = N->getMaskElt(i);
+    if (Elt >= 0 && Elt != 2)
+      return false;
+    if (Elt == 2)
+      HasHi = true;
+  }
+  // Don't use movsldup if it can be done with a shufps.
+  return HasHi;
+}
+
+/// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to MOVDDUP.
+bool X86::isMOVDDUPMask(ShuffleVectorSDNode *N) {
+  int e = N->getValueType(0).getVectorNumElements() / 2;
+
+  for (int i = 0; i < e; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(i), i))
+      return false;
+  for (int i = 0; i < e; ++i)
+    if (!isUndefOrEqual(N->getMaskElt(e+i), i))
+      return false;
+  return true;
+}
+
/// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
unsigned X86::getShuffleSHUFImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  int NumOperands = SVOp->getValueType(0).getVectorNumElements();

  // Four elements pack 2 bits each; two elements pack 1 bit each.
  unsigned Shift = (NumOperands == 4) ? 2 : 1;
  unsigned Mask = 0;
  // Walk the mask from the last element down so element 0 ends up in the
  // lowest bits of the immediate.
  for (int i = 0; i < NumOperands; ++i) {
    int Val = SVOp->getMaskElt(NumOperands-i-1);
    if (Val < 0) Val = 0; // undef: any lane works, pick 0
    if (Val >= NumOperands) Val -= NumOperands; // fold V2 indices to lane numbers
    Mask |= Val;
    if (i != NumOperands - 1)
      Mask <<= Shift;
  }
  return Mask;
}
+
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
unsigned X86::getShufflePSHUFHWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the last 4.
  // Walk from element 7 down to 4 so element 4 lands in the low two bits.
  for (unsigned i = 7; i >= 4; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= (Val - 4); // re-base indices 4..7 to 0..3; undef contributes 0
    if (i != 4)
      Mask <<= 2;
  }
  return Mask;
}
+
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
unsigned X86::getShufflePSHUFLWImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  unsigned Mask = 0;
  // 8 nodes, but we only care about the first 4.
  // Walk from element 3 down to 0 so element 0 lands in the low two bits.
  for (int i = 3; i >= 0; --i) {
    int Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      Mask |= Val; // undef contributes 0
    if (i != 0)
      Mask <<= 2;
  }
  return Mask;
}
+
/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
unsigned X86::getShufflePALIGNRImmediate(SDNode *N) {
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
  EVT VVT = N->getValueType(0);
  // Element size in bytes -- PALIGNR's immediate counts bytes.
  unsigned EltSize = VVT.getVectorElementType().getSizeInBits() >> 3;
  int Val = 0;

  // Find the first non-undef mask element; its offset from its position is
  // the shift amount in elements.
  unsigned i, e;
  for (i = 0, e = VVT.getVectorNumElements(); i != e; ++i) {
    Val = SVOp->getMaskElt(i);
    if (Val >= 0)
      break;
  }
  // NOTE(review): if every element is undef, i == e here and Val is the last
  // (negative) mask entry; callers presumably reject all-undef masks first.
  return (Val - i) * EltSize;
}
+
+/// isZeroNode - Returns true if Elt is a constant zero or a floating point
+/// constant +0.0.
+bool X86::isZeroNode(SDValue Elt) {
+  return ((isa<ConstantSDNode>(Elt) &&
+           cast<ConstantSDNode>(Elt)->getZExtValue() == 0) ||
+          (isa<ConstantFPSDNode>(Elt) &&
+           cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
+}
+
+/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
+/// their permute mask.
+static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
+                                    SelectionDAG &DAG) {
+  EVT VT = SVOp->getValueType(0);
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> MaskVec;
+
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int idx = SVOp->getMaskElt(i);
+    if (idx < 0)
+      MaskVec.push_back(idx);
+    else if (idx < (int)NumElems)
+      MaskVec.push_back(idx + NumElems);
+    else
+      MaskVec.push_back(idx - NumElems);
+  }
+  return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(1),
+                              SVOp->getOperand(0), &MaskVec[0]);
+}
+
+/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
+/// the two vector operands have swapped position.
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
+  unsigned NumElems = VT.getVectorNumElements();
+  for (unsigned i = 0; i != NumElems; ++i) {
+    int idx = Mask[i];
+    if (idx < 0)
+      continue;
+    else if (idx < (int)NumElems)
+      Mask[i] = idx + NumElems;
+    else
+      Mask[i] = idx - NumElems;
+  }
+}
+
+/// ShouldXformToMOVHLPS - Return true if the node should be transformed to
+/// match movhlps. The lower half elements should come from upper half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order).
+static bool ShouldXformToMOVHLPS(ShuffleVectorSDNode *Op) {
+  if (Op->getValueType(0).getVectorNumElements() != 4)
+    return false;
+  for (unsigned i = 0, e = 2; i != e; ++i)
+    if (!isUndefOrEqual(Op->getMaskElt(i), i+2))
+      return false;
+  for (unsigned i = 2; i != 4; ++i)
+    if (!isUndefOrEqual(Op->getMaskElt(i), i+4))
+      return false;
+  return true;
+}
+
+/// isScalarLoadToVector - Returns true if the node is a scalar load that
+/// is promoted to a vector. It also returns the LoadSDNode by reference if
+/// required.
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+  if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
+    return false;
+  N = N->getOperand(0).getNode();
+  if (!ISD::isNON_EXTLoad(N))
+    return false;
+  if (LD)
+    *LD = cast<LoadSDNode>(N);
+  return true;
+}
+
+/// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
+/// match movlp{s|d}. The lower half elements should come from lower half of
+/// V1 (and in order), and the upper half elements should come from the upper
+/// half of V2 (and in order). And since V1 will become the source of the
+/// MOVLP, it must be either a vector load or a scalar load to vector.
+static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
+                               ShuffleVectorSDNode *Op) {
+  if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
+    return false;
+  // Is V2 is a vector load, don't do this transformation. We will try to use
+  // load folding shufps op.
+  if (ISD::isNON_EXTLoad(V2))
+    return false;
+
+  unsigned NumElems = Op->getValueType(0).getVectorNumElements();
+
+  if (NumElems != 2 && NumElems != 4)
+    return false;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i)
+    if (!isUndefOrEqual(Op->getMaskElt(i), i))
+      return false;
+  for (unsigned i = NumElems/2; i != NumElems; ++i)
+    if (!isUndefOrEqual(Op->getMaskElt(i), i+NumElems))
+      return false;
+  return true;
+}
+
+/// isSplatVector - Returns true if N is a BUILD_VECTOR node whose elements are
+/// all the same.
+static bool isSplatVector(SDNode *N) {
+  if (N->getOpcode() != ISD::BUILD_VECTOR)
+    return false;
+
+  SDValue SplatValue = N->getOperand(0);
+  for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
+    if (N->getOperand(i) != SplatValue)
+      return false;
+  return true;
+}
+
/// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
/// to an zero vector.
/// FIXME: move to dag combiner / method on ShuffleVectorSDNode
static bool isZeroShuffle(ShuffleVectorSDNode *N) {
  SDValue V1 = N->getOperand(0);
  SDValue V2 = N->getOperand(1);
  unsigned NumElems = N->getValueType(0).getVectorNumElements();
  for (unsigned i = 0; i != NumElems; ++i) {
    int Idx = N->getMaskElt(i);
    if (Idx >= (int)NumElems) {
      // Element comes from V2: it is zero if V2 is undef, an all-zeros
      // build_vector, or a build_vector whose selected element is zero.
      unsigned Opc = V2.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
        return false;
    } else if (Idx >= 0) {
      // Element comes from V1: same reasoning applied to V1's operands.
      unsigned Opc = V1.getOpcode();
      if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
        continue;
      if (Opc != ISD::BUILD_VECTOR ||
          !X86::isZeroNode(V1.getOperand(Idx)))
        return false;
    }
    // Undef mask entries (Idx < 0) are compatible with a zero vector.
  }
  return true;
}
+
+/// getZeroVector - Returns a vector of specified type with all zero elements.
+///
+static SDValue getZeroVector(EVT VT, bool HasSSE2, SelectionDAG &DAG,
+                             DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Always build zero vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+  // type.  This ensures they get CSE'd.
+  SDValue Vec;
+  if (VT.getSizeInBits() == 64) { // MMX
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+  } else if (HasSSE2) {  // SSE2
+    SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+  } else { // SSE1
+    SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+/// getOnesVector - Returns a vector of specified type with all bits set.
+///
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, DebugLoc dl) {
+  assert(VT.isVector() && "Expected a vector type");
+
+  // Always build ones vectors as <4 x i32> or <2 x i32> bitcasted to their dest
+  // type.  This ensures they get CSE'd.
+  SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
+  SDValue Vec;
+  if (VT.getSizeInBits() == 64)  // MMX
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2i32, Cst, Cst);
+  else                                              // SSE
+    Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Vec);
+}
+
+
/// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
/// that point to V2 points to its first element.
static SDValue NormalizeMask(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  EVT VT = SVOp->getValueType(0);
  unsigned NumElems = VT.getVectorNumElements();

  bool Changed = false;
  SmallVector<int, 8> MaskVec;
  SVOp->getMask(MaskVec);

  for (unsigned i = 0; i != NumElems; ++i) {
    // Indices strictly greater than NumElems reference V2's element 1..N-1;
    // rewrite them to NumElems (V2's first element).  An index equal to
    // NumElems already is the first element, so '>' (not '>=') is correct.
    if (MaskVec[i] > (int)NumElems) {
      MaskVec[i] = NumElems;
      Changed = true;
    }
  }
  // Only build a new shuffle node if the mask actually changed.
  if (Changed)
    return DAG.getVectorShuffle(VT, SVOp->getDebugLoc(), SVOp->getOperand(0),
                                SVOp->getOperand(1), &MaskVec[0]);
  return SDValue(SVOp, 0);
}
+
+/// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
+/// operation of specified width.
+static SDValue getMOVL(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                       SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  Mask.push_back(NumElems);
+  for (unsigned i = 1; i != NumElems; ++i)
+    Mask.push_back(i);
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
+static SDValue getUnpackl(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
+    Mask.push_back(i);
+    Mask.push_back(i + NumElems);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
+/// getUnpackhMask - Returns a vector_shuffle node for an unpackh operation.
+static SDValue getUnpackh(SelectionDAG &DAG, DebugLoc dl, EVT VT, SDValue V1,
+                          SDValue V2) {
+  unsigned NumElems = VT.getVectorNumElements();
+  unsigned Half = NumElems/2;
+  SmallVector<int, 8> Mask;
+  for (unsigned i = 0; i != Half; ++i) {
+    Mask.push_back(i + Half);
+    Mask.push_back(i + NumElems + Half);
+  }
+  return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
+}
+
/// PromoteSplat - Promote a splat of v4f32, v8i16 or v16i8 to v4i32.
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG,
                            bool HasSSE2) {
  // NOTE(review): HasSSE2 is unused in this function.
  // Nothing to promote for 4 or fewer elements.
  if (SV->getValueType(0).getVectorNumElements() <= 4)
    return SDValue(SV, 0);

  EVT PVT = MVT::v4f32;
  EVT VT = SV->getValueType(0);
  DebugLoc dl = SV->getDebugLoc();
  SDValue V1 = SV->getOperand(0);
  int NumElems = VT.getVectorNumElements();
  int EltNo = SV->getSplatIndex();

  // unpack elements to the correct location
  // Repeatedly unpack V1 with itself, halving the logical element count each
  // step, until the splatted element lives within a 4-element group.
  while (NumElems > 4) {
    if (EltNo < NumElems/2) {
      V1 = getUnpackl(DAG, dl, VT, V1, V1);
    } else {
      V1 = getUnpackh(DAG, dl, VT, V1, V1);
      // The element moved into the low half; adjust its index to match.
      EltNo -= NumElems/2;
    }
    NumElems >>= 1;
  }

  // Perform the splat.
  // Broadcast the chosen element across all four lanes of a v4f32, then
  // bitcast the result back to the original type.
  int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, PVT, V1);
  V1 = DAG.getVectorShuffle(PVT, dl, V1, DAG.getUNDEF(PVT), &SplatMask[0]);
  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, V1);
}
+
+/// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
+/// vector of zero or undef vector.  This produces a shuffle where the low
+/// element of V2 is swizzled into the zero/undef vector, landing at element
+/// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
+static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
+                                             bool isZero, bool HasSSE2,
+                                             SelectionDAG &DAG) {
+  EVT VT = V2.getValueType();
+  SDValue V1 = isZero
+    ? getZeroVector(VT, HasSSE2, DAG, V2.getDebugLoc()) : DAG.getUNDEF(VT);
+  unsigned NumElems = VT.getVectorNumElements();
+  SmallVector<int, 16> MaskVec;
+  for (unsigned i = 0; i != NumElems; ++i)
+    // If this is the insertion idx, put the low elt of V2 here.
+    MaskVec.push_back(i == Idx ? NumElems : i);
+  return DAG.getVectorShuffle(VT, V2.getDebugLoc(), V1, V2, &MaskVec[0]);
+}
+
/// getNumOfConsecutiveZeros - Return the number of elements in a result of
/// a shuffle that is zero.
static
unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, int NumElems,
                                  bool Low, SelectionDAG &DAG) {
  unsigned NumZeros = 0;
  // Scan from the low end of the mask (Low == true) or from the high end.
  for (int i = 0; i < NumElems; ++i) {
    unsigned Index = Low ? i : NumElems-i-1;
    int Idx = SVOp->getMaskElt(Index);
    if (Idx < 0) {
      // Undef counts as zero for this purpose.
      ++NumZeros;
      continue;
    }
    // Otherwise count it only if the scalar it selects is a known zero.
    SDValue Elt = DAG.getShuffleScalarElt(SVOp, Index);
    if (Elt.getNode() && X86::isZeroNode(Elt))
      ++NumZeros;
    else
      break; // the run of zeros ends at the first non-zero element
  }
  return NumZeros;
}
+
/// isVectorShift - Returns true if the shuffle can be implemented as a
/// logical left or right shift of a vector.  On success sets isLeft, ShVal
/// (the operand being shifted) and ShAmt (the shift amount in elements).
/// FIXME: split into pslldqi, psrldqi, palignr variants.
static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
                          bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
  int NumElems = SVOp->getValueType(0).getVectorNumElements();

  // Try a left shift first: zeros at the low end of the result.
  isLeft = true;
  unsigned NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, true, DAG);
  if (!NumZeros) {
    // No low zeros; try a right shift (zeros at the high end).
    isLeft = false;
    NumZeros = getNumOfConsecutiveZeros(SVOp, NumElems, false, DAG);
    if (!NumZeros)
      return false;
  }
  // The remaining elements must form a consecutive run out of a single
  // source operand; a mix of V1 and V2 cannot be produced by one shift.
  bool SeenV1 = false;
  bool SeenV2 = false;
  for (int i = NumZeros; i < NumElems; ++i) {
    int Val = isLeft ? (i - NumZeros) : i;
    int Idx = SVOp->getMaskElt(isLeft ? i : (i - NumZeros));
    if (Idx < 0)
      continue; // undef matches any shifted-in position
    if (Idx < NumElems)
      SeenV1 = true;
    else {
      // Re-base V2 indices so they can be compared against Val below.
      Idx -= NumElems;
      SeenV2 = true;
    }
    if (Idx != Val)
      return false;
  }
  if (SeenV1 && SeenV2)
    return false;

  // Report which operand is shifted and by how many elements.
  ShVal = SeenV1 ? SVOp->getOperand(0) : SVOp->getOperand(1);
  ShAmt = NumZeros;
  return true;
}
+
+
/// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
///
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
                                       unsigned NumNonZero, unsigned NumZero,
                                       SelectionDAG &DAG, TargetLowering &TLI) {
  // Too many non-zero elements for this strategy; return an empty SDValue
  // so the caller can pick another lowering.
  if (NumNonZero > 8)
    return SDValue();

  DebugLoc dl = Op.getDebugLoc();
  SDValue V(0, 0);
  bool First = true;
  // Build the result as a v8i16: combine each pair of adjacent i8 elements
  // into one i16 lane, then bitcast back to v16i8 at the end.  NonZeros is a
  // bitmask with bit i set when element i is non-zero.
  for (unsigned i = 0; i < 16; ++i) {
    bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
    if (ThisIsNonZero && First) {
      // Lazily create the base vector at the first non-zero element: zeros
      // if any zero elements must be materialized, undef otherwise.
      if (NumZero)
        V = getZeroVector(MVT::v8i16, true, DAG, dl);
      else
        V = DAG.getUNDEF(MVT::v8i16);
      First = false;
    }

    // Process each pair once, at its odd index.
    if ((i & 1) != 0) {
      SDValue ThisElt(0, 0), LastElt(0, 0);
      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
      if (LastIsNonZero) {
        // Even element becomes the low byte of the i16 lane.
        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
                              MVT::i16, Op.getOperand(i-1));
      }
      if (ThisIsNonZero) {
        // Odd element becomes the high byte: zero-extend, shift, then OR
        // in the low byte if it is also non-zero.
        ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
                              ThisElt, DAG.getConstant(8, MVT::i8));
        if (LastIsNonZero)
          ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
      } else
        ThisElt = LastElt;

      // Insert only when at least one byte of the pair is non-zero.
      if (ThisElt.getNode())
        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
                        DAG.getIntPtrConstant(i/2));
    }
  }

  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V);
}
+
+/// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
+///
+/// Inserts each non-zero operand into a lazily-created base vector with
+/// INSERT_VECTOR_ELT.  NonZeros is a bitmask of the non-zero operands.
+/// Gives up (returns an empty SDValue) when more than 4 elements are
+/// non-zero, since a constant-pool load is likely cheaper in that case.
+static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
+                                       unsigned NumNonZero, unsigned NumZero,
+                                       SelectionDAG &DAG, TargetLowering &TLI) {
+  if (NumNonZero > 4)
+    return SDValue();
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Result(0, 0);
+  bool NeedBase = true;
+  for (unsigned i = 0; i != 8; ++i) {
+    // Zero (or undef) elements need no insert.
+    if ((NonZeros & (1 << i)) == 0)
+      continue;
+    // Materialize the base vector at the first non-zero element: all-zeros
+    // if any element must be zero, otherwise undef.
+    if (NeedBase) {
+      Result = NumZero ? getZeroVector(MVT::v8i16, true, DAG, dl)
+                       : DAG.getUNDEF(MVT::v8i16);
+      NeedBase = false;
+    }
+    Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+                         MVT::v8i16, Result, Op.getOperand(i),
+                         DAG.getIntPtrConstant(i));
+  }
+
+  return Result;
+}
+
+/// getVShift - Return a vector logical shift node.
+///
+/// Bitcasts SrcOp to the i64-element type matching its width, emits an
+/// X86ISD::VSHL or VSRL by NumBits, and bitcasts the result back to VT.
+static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
+                         unsigned NumBits, SelectionDAG &DAG,
+                         const TargetLowering &TLI, DebugLoc dl) {
+  // 64-bit (MMX) vectors shift as v1i64; 128-bit vectors shift as v2i64.
+  EVT ShVT = (VT.getSizeInBits() == 64) ? MVT::v1i64 : MVT::v2i64;
+  unsigned Opc = isLeft ? X86ISD::VSHL : X86ISD::VSRL;
+  SDValue Amt = DAG.getConstant(NumBits, TLI.getShiftAmountTy());
+  SDValue Shifted = DAG.getNode(Opc, dl, ShVT,
+                                DAG.getNode(ISD::BIT_CONVERT, dl, ShVT, SrcOp),
+                                Amt);
+  // Convert the shifted value back to the caller's vector type.
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Shifted);
+}
+
+/// LowerAsSplatVectorLoad - If SrcOp is a suitable scalar load from a
+/// sufficiently aligned stack slot, replace the scalar load + splat with a
+/// full vector load and a shuffle that broadcasts the wanted element.
+/// Returns an empty SDValue when the transformation does not apply.
+SDValue
+X86TargetLowering::LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                          SelectionDAG &DAG) {
+  
+  // Check if the scalar load can be widened into a vector load. And if
+  // the address is "base + cst" see if the cst can be "absorbed" into
+  // the shuffle mask.
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
+    SDValue Ptr = LD->getBasePtr();
+    // Only simple, non-volatile loads of a 32-bit scalar qualify.
+    if (!ISD::isNormalLoad(LD) || LD->isVolatile())
+      return SDValue();
+    EVT PVT = LD->getValueType(0);
+    if (PVT != MVT::i32 && PVT != MVT::f32)
+      return SDValue();
+
+    // The address must be a frame index, optionally plus a constant offset.
+    int FI = -1;
+    int64_t Offset = 0;
+    if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
+      FI = FINode->getIndex();
+      Offset = 0;
+    } else if (Ptr.getOpcode() == ISD::ADD &&
+               isa<ConstantSDNode>(Ptr.getOperand(1)) &&
+               isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+      FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+      Offset = Ptr.getConstantOperandVal(1);
+      Ptr = Ptr.getOperand(0);
+    } else {
+      return SDValue();
+    }
+
+    SDValue Chain = LD->getChain();
+    // Make sure the stack object alignment is at least 16.
+    // NOTE(review): the alignment is raised here even though the offset
+    // checks below may still reject the transformation -- that widens the
+    // object's alignment as a side effect; confirm this is acceptable.
+    MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+    if (DAG.InferPtrAlignment(Ptr) < 16) {
+      if (MFI->isFixedObjectIndex(FI)) {
+        // Can't change the alignment. FIXME: It's possible to compute
+        // the exact stack offset and reference FI + adjust offset instead.
+        // If someone *really* cares about this. That's the way to implement it.
+        return SDValue();
+      } else {
+        MFI->setObjectAlignment(FI, 16);
+      }
+    }
+
+    // (Offset % 16) must be multiple of 4. Then address is then
+    // Ptr + (Offset & ~15).
+    if (Offset < 0)
+      return SDValue();
+    if ((Offset % 16) & 3)
+      return SDValue();
+    // Load the whole 16-byte chunk containing the scalar.
+    int64_t StartOffset = Offset & ~15;
+    if (StartOffset)
+      Ptr = DAG.getNode(ISD::ADD, Ptr.getDebugLoc(), Ptr.getValueType(),
+                        Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
+
+    // Broadcast the 32-bit element the original load referenced.
+    int EltNo = (Offset - StartOffset) >> 2;
+    int Mask[4] = { EltNo, EltNo, EltNo, EltNo };
+    EVT VT = (PVT == MVT::i32) ? MVT::v4i32 : MVT::v4f32;
+    SDValue V1 = DAG.getLoad(VT, dl, Chain, Ptr,LD->getSrcValue(),0);
+    // Canonicalize it to a v4i32 shuffle.
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32, V1);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                       DAG.getVectorShuffle(MVT::v4i32, dl, V1,
+                                            DAG.getUNDEF(MVT::v4i32), &Mask[0]));
+  }
+
+  return SDValue();
+}
+
+/// LowerBUILD_VECTOR - Custom lower a BUILD_VECTOR node.  Tries, in order:
+/// canonical all-zero/all-one vectors, single-non-zero-element idioms
+/// (movd/movq/movss/movsd and vector shifts), splat loads, per-width insert
+/// sequences, and unpack trees.  Returns an empty SDValue to fall back to
+/// default expansion (typically a constant-pool load).
+SDValue
+X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // All zero's are handled with pxor, all one's are handled with pcmpeqd.
+  if (ISD::isBuildVectorAllZeros(Op.getNode())
+      || ISD::isBuildVectorAllOnes(Op.getNode())) {
+    // Canonicalize this to either <4 x i32> or <2 x i32> (SSE vs MMX) to
+    // 1) ensure the zero vectors are CSE'd, and 2) ensure that i64 scalars are
+    // eliminated on x86-32 hosts.
+    if (Op.getValueType() == MVT::v4i32 || Op.getValueType() == MVT::v2i32)
+      return Op;
+
+    if (ISD::isBuildVectorAllOnes(Op.getNode()))
+      return getOnesVector(Op.getValueType(), DAG, dl);
+    return getZeroVector(Op.getValueType(), Subtarget->hasSSE2(), DAG, dl);
+  }
+
+  EVT VT = Op.getValueType();
+  EVT ExtVT = VT.getVectorElementType();
+  unsigned EVTBits = ExtVT.getSizeInBits();
+
+  // Classify every element: count zeros and non-zeros, record the non-zero
+  // positions in the NonZeros bitmask, collect the distinct values, and note
+  // whether all elements are constants.
+  unsigned NumElems = Op.getNumOperands();
+  unsigned NumZero  = 0;
+  unsigned NumNonZero = 0;
+  unsigned NonZeros = 0;
+  bool IsAllConstants = true;
+  SmallSet<SDValue, 8> Values;
+  for (unsigned i = 0; i < NumElems; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.getOpcode() == ISD::UNDEF)
+      continue;
+    Values.insert(Elt);
+    if (Elt.getOpcode() != ISD::Constant &&
+        Elt.getOpcode() != ISD::ConstantFP)
+      IsAllConstants = false;
+    if (X86::isZeroNode(Elt))
+      NumZero++;
+    else {
+      NonZeros |= (1 << i);
+      NumNonZero++;
+    }
+  }
+
+  if (NumNonZero == 0) {
+    // All undef vector. Return an UNDEF.  All zero vectors were handled above.
+    return DAG.getUNDEF(VT);
+  }
+
+  // Special case for single non-zero, non-undef, element.
+  if (NumNonZero == 1) {
+    unsigned Idx = CountTrailingZeros_32(NonZeros);
+    SDValue Item = Op.getOperand(Idx);
+
+    // If this is an insertion of an i64 value on x86-32, and if the top bits of
+    // the value are obviously zero, truncate the value to i32 and do the
+    // insertion that way.  Only do this if the value is non-constant or if the
+    // value is a constant being inserted into element 0.  It is cheaper to do
+    // a constant pool load than it is to do a movd + shuffle.
+    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
+        (!IsAllConstants || Idx == 0)) {
+      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+        // Handle MMX and SSE both.
+        EVT VecVT = VT == MVT::v2i64 ? MVT::v4i32 : MVT::v2i32;
+        unsigned VecElts = VT == MVT::v2i64 ? 4 : 2;
+
+        // Truncate the value (which may itself be a constant) to i32, and
+        // convert it to a vector with movd (S2V+shuffle to zero extend).
+        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
+        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+                                           Subtarget->hasSSE2(), DAG);
+
+        // Now we have our 32-bit value zero extended in the low element of
+        // a vector.  If Idx != 0, swizzle it into place.
+        if (Idx != 0) {
+          SmallVector<int, 4> Mask;
+          Mask.push_back(Idx);
+          for (unsigned i = 1; i != VecElts; ++i)
+            Mask.push_back(i);
+          Item = DAG.getVectorShuffle(VecVT, dl, Item,
+                                      DAG.getUNDEF(Item.getValueType()),
+                                      &Mask[0]);
+        }
+        return DAG.getNode(ISD::BIT_CONVERT, dl, Op.getValueType(), Item);
+      }
+    }
+
+    // If we have a constant or non-constant insertion into the low element of
+    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
+    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
+    // depending on what the source datatype is.
+    if (Idx == 0) {
+      if (NumZero == 0) {
+        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+      } else if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
+          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
+        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget->hasSSE2(),
+                                           DAG);
+      } else if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
+        // Sub-32-bit scalars: zero-extend to i32 first, insert into the
+        // matching 32-bit-element vector type, then bitcast back to VT.
+        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
+        EVT MiddleVT = VT.getSizeInBits() == 64 ? MVT::v2i32 : MVT::v4i32;
+        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MiddleVT, Item);
+        Item = getShuffleVectorZeroOrUndef(Item, 0, true,
+                                           Subtarget->hasSSE2(), DAG);
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT, Item);
+      }
+    }
+
+    // Is it a vector logical left shift?
+    if (NumElems == 2 && Idx == 1 &&
+        X86::isZeroNode(Op.getOperand(0)) &&
+        !X86::isZeroNode(Op.getOperand(1))) {
+      unsigned NumBits = VT.getSizeInBits();
+      return getVShift(true, VT,
+                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   VT, Op.getOperand(1)),
+                       NumBits/2, DAG, *this, dl);
+    }
+
+    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
+      return SDValue();
+
+    // Otherwise, if this is a vector with i32 or f32 elements, and the element
+    // is a non-constant being inserted into an element other than the low one,
+    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
+    // movd/movss) to move this into the low element, then shuffle it into
+    // place.
+    if (EVTBits == 32) {
+      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
+
+      // Turn it into a shuffle of zero and zero-extended scalar to vector.
+      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0,
+                                         Subtarget->hasSSE2(), DAG);
+      SmallVector<int, 8> MaskVec;
+      for (unsigned i = 0; i < NumElems; i++)
+        MaskVec.push_back(i == Idx ? 0 : 1);
+      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
+    }
+  }
+
+  // Splat is obviously ok. Let legalizer expand it to a shuffle.
+  if (Values.size() == 1) {
+    if (EVTBits == 32) {
+      // Instead of a shuffle like this:
+      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
+      // Check if it's possible to issue this instead.
+      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue Item = Op.getOperand(Idx);
+      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
+        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
+    }
+    return SDValue();
+  }
+
+  // A vector full of immediates; various special cases are already
+  // handled, so this is best done with a single constant-pool load.
+  if (IsAllConstants)
+    return SDValue();
+
+  // Let legalizer expand 2-wide build_vectors.
+  if (EVTBits == 64) {
+    if (NumNonZero == 1) {
+      // One half is zero or undef.
+      unsigned Idx = CountTrailingZeros_32(NonZeros);
+      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
+                                 Op.getOperand(Idx));
+      return getShuffleVectorZeroOrUndef(V2, Idx, true,
+                                         Subtarget->hasSSE2(), DAG);
+    }
+    return SDValue();
+  }
+
+  // If element VT is < 32 bits, convert it to inserts into a zero vector.
+  if (EVTBits == 8 && NumElems == 16) {
+    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        *this);
+    if (V.getNode()) return V;
+  }
+
+  if (EVTBits == 16 && NumElems == 8) {
+    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
+                                        *this);
+    if (V.getNode()) return V;
+  }
+
+  // If element VT is == 32 bits, turn it into a number of shuffles.
+  SmallVector<SDValue, 8> V;
+  V.resize(NumElems);
+  // Four elements with at least one known zero: build each element as a zero
+  // vector or scalar_to_vector, combine adjacent pairs with movl/unpckl,
+  // then merge the two pair results with a single shuffle.
+  if (NumElems == 4 && NumZero > 0) {
+    for (unsigned i = 0; i < 4; ++i) {
+      bool isZero = !(NonZeros & (1 << i));
+      if (isZero)
+        V[i] = getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+      else
+        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+    }
+
+    // Combine each pair; the 2-bit field of NonZeros says which elements of
+    // the pair are non-zero.
+    for (unsigned i = 0; i < 2; ++i) {
+      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
+        default: break;
+        case 0:
+          V[i] = V[i*2];  // Must be a zero vector.
+          break;
+        case 1:
+          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
+          break;
+        case 2:
+          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
+          break;
+        case 3:
+          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
+          break;
+      }
+    }
+
+    SmallVector<int, 8> MaskVec;
+    bool Reverse = (NonZeros & 0x3) == 2;
+    for (unsigned i = 0; i < 2; ++i)
+      MaskVec.push_back(Reverse ? 1-i : i);
+    Reverse = ((NonZeros & (0x3 << 2)) >> 2) == 2;
+    for (unsigned i = 0; i < 2; ++i)
+      MaskVec.push_back(Reverse ? 1-i+NumElems : i+NumElems);
+    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
+  }
+
+  if (Values.size() > 2) {
+    // If we have SSE 4.1, Expand into a number of inserts unless the number of
+    // values to be inserted is equal to the number of elements, in which case
+    // use the unpack code below in the hopes of matching the consecutive elts
+    // load merge pattern for shuffles.
+    // FIXME: We could probably just check that here directly.
+    if (Values.size() < NumElems && VT.getSizeInBits() == 128 &&
+        getSubtarget()->hasSSE41()) {
+      V[0] = DAG.getUNDEF(VT);
+      for (unsigned i = 0; i < NumElems; ++i)
+        if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
+          V[0] = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V[0],
+                             Op.getOperand(i), DAG.getIntPtrConstant(i));
+      return V[0];
+    }
+    // Expand into a number of unpckl*.
+    // e.g. for v4f32
+    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
+    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
+    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
+    for (unsigned i = 0; i < NumElems; ++i)
+      V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+    NumElems >>= 1;
+    while (NumElems != 0) {
+      for (unsigned i = 0; i < NumElems; ++i)
+        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + NumElems]);
+      NumElems >>= 1;
+    }
+    return V[0];
+  }
+
+  return SDValue();
+}
+
+/// LowerCONCAT_VECTORS - Lower a concatenation of two 64-bit (MMX) vectors
+/// into a single 128-bit vector value.
+SDValue
+X86TargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
+  // We support concatenating two MMX registers and placing them in an XMM
+  // register.  This is better than doing a stack convert.
+  DebugLoc dl = Op.getDebugLoc();
+  EVT ResVT = Op.getValueType();
+  assert(Op.getNumOperands() == 2);
+  assert(ResVT == MVT::v2i64 || ResVT == MVT::v4i32 ||
+         ResVT == MVT::v8i16 || ResVT == MVT::v16i8);
+  int Mask[2];
+  // Move the first 64-bit operand into the low half of an XMM value.
+  SDValue InVec = DAG.getNode(ISD::BIT_CONVERT,dl, MVT::v1i64, Op.getOperand(0));
+  SDValue VecOp = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+  InVec = Op.getOperand(1);
+  if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    // The second operand is a single scalar: insert it into the upper half
+    // of the result directly.
+    // NOTE(review): the insert index NumElts/2+1 looks like it should be
+    // NumElts/2 (element 0 of the upper half lands there) -- confirm against
+    // the patterns callers actually produce.
+    unsigned NumElts = ResVT.getVectorNumElements();
+    VecOp = DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+    VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ResVT, VecOp,
+                       InVec.getOperand(0), DAG.getIntPtrConstant(NumElts/2+1));
+  } else {
+    // Otherwise move the second operand into another XMM value and combine
+    // the two low quadwords with a <0, 2> v2i64 shuffle.
+    InVec = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v1i64, InVec);
+    SDValue VecOp2 = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, InVec);
+    Mask[0] = 0; Mask[1] = 2;
+    VecOp = DAG.getVectorShuffle(MVT::v2i64, dl, VecOp, VecOp2, Mask);
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT, VecOp);
+}
+
+// v8i16 shuffles - Prefer shuffles in the following order:
+// 1. [all]   pshuflw, pshufhw, optional move
+// 2. [ssse3] 1 x pshufb
+// 3. [ssse3] 2 x pshufb + 1 x por
+// 4. [all]   mov + pshuflw + pshufhw + N x (pextrw + pinsrw)
+static
+SDValue LowerVECTOR_SHUFFLEv8i16(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 8> MaskVals;
+
+  // Determine if more than 1 of the words in each of the low and high quadwords
+  // of the result come from the same quadword of one of the two inputs.  Undef
+  // mask values count as coming from any quadword, for better codegen.
+  SmallVector<unsigned, 4> LoQuad(4);
+  SmallVector<unsigned, 4> HiQuad(4);
+  BitVector InputQuads(4);
+  for (unsigned i = 0; i < 8; ++i) {
+    SmallVectorImpl<unsigned> &Quad = i < 4 ? LoQuad : HiQuad;
+    int EltIdx = SVOp->getMaskElt(i);
+    MaskVals.push_back(EltIdx);
+    // An undef element scores every source quad, since it can come from any.
+    if (EltIdx < 0) {
+      ++Quad[0];
+      ++Quad[1];
+      ++Quad[2];
+      ++Quad[3];
+      continue;
+    }
+    ++Quad[EltIdx / 4];
+    InputQuads.set(EltIdx / 4);
+  }
+
+  // Pick the source quad that contributes the most words to the result's low
+  // half; it must supply at least two words to be worthwhile (MaxQuad starts
+  // at 1).
+  int BestLoQuad = -1;
+  unsigned MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (LoQuad[i] > MaxQuad) {
+      BestLoQuad = i;
+      MaxQuad = LoQuad[i];
+    }
+  }
+
+  // Likewise for the high half of the result.
+  int BestHiQuad = -1;
+  MaxQuad = 1;
+  for (unsigned i = 0; i < 4; ++i) {
+    if (HiQuad[i] > MaxQuad) {
+      BestHiQuad = i;
+      MaxQuad = HiQuad[i];
+    }
+  }
+
+  // For SSSE3, If all 8 words of the result come from only 1 quadword of each
+  // of the two input vectors, shuffle them into one input vector so only a
+  // single pshufb instruction is necessary. If There are more than 2 input
+  // quads, disable the next transformation since it does not help SSSE3.
+  bool V1Used = InputQuads[0] || InputQuads[1];
+  bool V2Used = InputQuads[2] || InputQuads[3];
+  if (TLI.getSubtarget()->hasSSSE3()) {
+    if (InputQuads.count() == 2 && V1Used && V2Used) {
+      BestLoQuad = InputQuads.find_first();
+      BestHiQuad = InputQuads.find_next(BestLoQuad);
+    }
+    if (InputQuads.count() > 2) {
+      BestLoQuad = -1;
+      BestHiQuad = -1;
+    }
+  }
+
+  // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update
+  // the shuffle mask.  If a quad is scored as -1, that means that it contains
+  // words from all 4 input quadwords.
+  SDValue NewV;
+  if (BestLoQuad >= 0 || BestHiQuad >= 0) {
+    // Shuffle the two chosen source quadwords into the low/high halves of a
+    // single v2i64, producing one vector that holds the needed words.
+    SmallVector<int, 8> MaskV;
+    MaskV.push_back(BestLoQuad < 0 ? 0 : BestLoQuad);
+    MaskV.push_back(BestHiQuad < 0 ? 1 : BestHiQuad);
+    NewV = DAG.getVectorShuffle(MVT::v2i64, dl,
+                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V1),
+                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64, V2), &MaskV[0]);
+    NewV = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, NewV);
+
+    // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the
+    // source words for the shuffle, to aid later transformations.
+    bool AllWordsInNewV = true;
+    bool InOrder[2] = { true, true };
+    for (unsigned i = 0; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx != (int)i)
+        InOrder[i/4] = false;
+      if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad)
+        continue;
+      AllWordsInNewV = false;
+      break;
+    }
+
+    bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV;
+    if (AllWordsInNewV) {
+      // Remap each mask element to its position within NewV: BestLoQuad's
+      // words are now 0-3 and BestHiQuad's words are now 4-7.
+      for (int i = 0; i != 8; ++i) {
+        int idx = MaskVals[i];
+        if (idx < 0)
+          continue;
+        idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4;
+        // A misplaced low word rules out pshufhw; a misplaced high word rules
+        // out pshuflw.
+        if ((idx != i) && idx < 4)
+          pshufhw = false;
+        if ((idx != i) && idx > 3)
+          pshuflw = false;
+      }
+      V1 = NewV;
+      V2Used = false;
+      BestLoQuad = 0;
+      BestHiQuad = 1;
+    }
+
+    // If we've eliminated the use of V2, and the new mask is a pshuflw or
+    // pshufhw, that's as cheap as it gets.  Return the new shuffle.
+    if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) {
+      return DAG.getVectorShuffle(MVT::v8i16, dl, NewV,
+                                  DAG.getUNDEF(MVT::v8i16), &MaskVals[0]);
+    }
+  }
+
+  // If we have SSSE3, and all words of the result are from 1 input vector,
+  // case 2 is generated, otherwise case 3 is generated.  If no SSSE3
+  // is present, fall back to case 4.
+  if (TLI.getSubtarget()->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+
+    // If we have elements from both input vectors, set the high bit of the
+    // shuffle mask element to zero out elements that come from V2 in the V1
+    // mask, and elements that come from V1 in the V2 mask, so that the two
+    // results can be OR'd together.
+    bool TwoInputs = V1Used && V2Used;
+    // NOTE(review): an undef mask value (-1) makes EltIdx negative here and
+    // emits the bytes 0xFE,0xFF (high bit set, so pshufb zeroes the result
+    // byte) -- harmless for undef elements, but confirm it is intentional.
+    for (unsigned i = 0; i != 8; ++i) {
+      int EltIdx = MaskVals[i] * 2;
+      if (TwoInputs && (EltIdx >= 16)) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        continue;
+      }
+      pshufbMask.push_back(DAG.getConstant(EltIdx,   MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(EltIdx+1, MVT::i8));
+    }
+    V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V1);
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    if (!TwoInputs)
+      return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 8; ++i) {
+      int EltIdx = MaskVals[i] * 2;
+      if (EltIdx < 16) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        continue;
+      }
+      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+      pshufbMask.push_back(DAG.getConstant(EltIdx - 15, MVT::i8));
+    }
+    V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, V2);
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+  }
+
+  // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order,
+  // and update MaskVals with new element order.
+  BitVector InOrder(8);
+  if (BestLoQuad >= 0) {
+    SmallVector<int, 8> MaskV;
+    for (int i = 0; i != 4; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        MaskV.push_back(-1);
+        InOrder.set(i);
+      } else if ((idx / 4) == BestLoQuad) {
+        MaskV.push_back(idx & 3);
+        InOrder.set(i);
+      } else {
+        MaskV.push_back(-1);
+      }
+    }
+    for (unsigned i = 4; i != 8; ++i)
+      MaskV.push_back(i);
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
+  }
+
+  // If BestHi >= 0, generate a pshufhw to put the high elements in order,
+  // and update MaskVals with the new element order.
+  if (BestHiQuad >= 0) {
+    SmallVector<int, 8> MaskV;
+    for (unsigned i = 0; i != 4; ++i)
+      MaskV.push_back(i);
+    for (unsigned i = 4; i != 8; ++i) {
+      int idx = MaskVals[i];
+      if (idx < 0) {
+        MaskV.push_back(-1);
+        InOrder.set(i);
+      } else if ((idx / 4) == BestHiQuad) {
+        MaskV.push_back((idx & 3) + 4);
+        InOrder.set(i);
+      } else {
+        MaskV.push_back(-1);
+      }
+    }
+    NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
+                                &MaskV[0]);
+  }
+
+  // In case BestHi & BestLo were both -1, which means each quadword has a word
+  // from each of the four input quadwords, calculate the InOrder bitvector now
+  // before falling through to the insert/extract cleanup.
+  if (BestLoQuad == -1 && BestHiQuad == -1) {
+    NewV = V1;
+    for (int i = 0; i != 8; ++i)
+      if (MaskVals[i] < 0 || MaskVals[i] == i)
+        InOrder.set(i);
+  }
+
+  // The other elements are put in the right place using pextrw and pinsrw.
+  for (unsigned i = 0; i != 8; ++i) {
+    if (InOrder[i])
+      continue;
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0)
+      continue;
+    SDValue ExtOp = (EltIdx < 8)
+    ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1,
+                  DAG.getIntPtrConstant(EltIdx))
+    : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2,
+                  DAG.getIntPtrConstant(EltIdx - 8));
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp,
+                       DAG.getIntPtrConstant(i));
+  }
+  return NewV;
+}
+
+// v16i8 shuffles - Prefer shuffles in the following order:
+// 1. [ssse3] 1 x pshufb
+// 2. [ssse3] 2 x pshufb + 1 x por
+// 3. [all]   v8i16 shuffle + N x pextrw + rotate + pinsrw
+static
+SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG, X86TargetLowering &TLI) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  SmallVector<int, 16> MaskVals;
+  SVOp->getMask(MaskVals);
+
+  // If we have SSSE3, case 1 is generated when all result bytes come from
+  // one of  the inputs.  Otherwise, case 2 is generated.  If no SSSE3 is
+  // present, fall back to case 3.
+  // FIXME: kill V2Only once shuffles are canonizalized by getNode.
+  bool V1Only = true;
+  bool V2Only = true;
+  for (unsigned i = 0; i < 16; ++i) {
+    int EltIdx = MaskVals[i];
+    if (EltIdx < 0)
+      continue;
+    // Mask values 0-15 select from V1; 16-31 select from V2.
+    if (EltIdx < 16)
+      V2Only = false;
+    else
+      V1Only = false;
+  }
+
+  // If SSSE3, use 1 pshufb instruction per vector with elements in the result.
+  if (TLI.getSubtarget()->hasSSSE3()) {
+    SmallVector<SDValue,16> pshufbMask;
+
+    // If all result elements are from one input vector, then only translate
+    // undef mask values to 0x80 (zero out result) in the pshufb mask.
+    //
+    // Otherwise, we have elements from both input vectors, and must zero out
+    // elements that come from V2 in the first mask, and V1 in the second mask
+    // so that we can OR them together.
+    bool TwoInputs = !(V1Only || V2Only);
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      if (EltIdx < 0 || (TwoInputs && EltIdx >= 16)) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        continue;
+      }
+      pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8));
+    }
+    // If all the elements are from V2, assign it to V1 and return after
+    // building the first pshufb.
+    if (V2Only)
+      V1 = V2;
+    V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    if (!TwoInputs)
+      return V1;
+
+    // Calculate the shuffle mask for the second input, shuffle it, and
+    // OR it with the first shuffled input.
+    pshufbMask.clear();
+    for (unsigned i = 0; i != 16; ++i) {
+      int EltIdx = MaskVals[i];
+      if (EltIdx < 16) {
+        pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+        continue;
+      }
+      pshufbMask.push_back(DAG.getConstant(EltIdx - 16, MVT::i8));
+    }
+    V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
+                     DAG.getNode(ISD::BUILD_VECTOR, dl,
+                                 MVT::v16i8, &pshufbMask[0], 16));
+    return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
+  }
+
+  // No SSSE3 - Calculate in place words and then fix all out of place words
+  // With 0-16 extracts & inserts.  Worst case is 16 bytes out of order from
+  // the 16 different words that comprise the two doublequadword input vectors.
+  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V1);
+  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v8i16, V2);
+  SDValue NewV = V2Only ? V2 : V1;
+  for (int i = 0; i != 8; ++i) {
+    int Elt0 = MaskVals[i*2];
+    int Elt1 = MaskVals[i*2+1];
+
+    // This word of the result is all undef, skip it.
+    if (Elt0 < 0 && Elt1 < 0)
+      continue;
+
+    // This word of the result is already in the correct place, skip it.
+    if (V1Only && (Elt0 == i*2) && (Elt1 == i*2+1))
+      continue;
+    if (V2Only && (Elt0 == i*2+16) && (Elt1 == i*2+17))
+      continue;
+
+    SDValue Elt0Src = Elt0 < 16 ? V1 : V2;
+    SDValue Elt1Src = Elt1 < 16 ? V1 : V2;
+    SDValue InsElt;
+
+    // NOTE(review): the extract indices below use EltN / 2 even when the byte
+    // comes from V2 (EltN >= 16), which yields a word index >= 8 -- out of
+    // range for a v8i16 source.  Looks like it should be (EltN - 16) / 2 for
+    // V2-sourced bytes; confirm against the shuffle masks that reach here.
+
+    // If Elt0 and Elt1 are defined and consecutive, with Elt0 at an even
+    // byte, the pair maps to one whole source word: extract it with a single
+    // pextrw and insert it directly.
+    if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                        DAG.getIntPtrConstant(i));
+      continue;
+    }
+
+    // If Elt1 is defined, extract it from the appropriate source.  If the
+    // source byte is not also odd, shift the extracted word left 8 bits
+    // otherwise clear the bottom 8 bits if we need to do an or.
+    if (Elt1 >= 0) {
+      InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src,
+                           DAG.getIntPtrConstant(Elt1 / 2));
+      if ((Elt1 & 1) == 0)
+        InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt,
+                             DAG.getConstant(8, TLI.getShiftAmountTy()));
+      else if (Elt0 >= 0)
+        InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt,
+                             DAG.getConstant(0xFF00, MVT::i16));
+    }
+    // If Elt0 is defined, extract it from the appropriate source.  If the
+    // source byte is not also even, shift the extracted word right 8 bits. If
+    // Elt1 was also defined, OR the extracted values together before
+    // inserting them in the result.
+    if (Elt0 >= 0) {
+      SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                                    Elt0Src, DAG.getIntPtrConstant(Elt0 / 2));
+      if ((Elt0 & 1) != 0)
+        InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0,
+                              DAG.getConstant(8, TLI.getShiftAmountTy()));
+      else if (Elt1 >= 0)
+        InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0,
+                             DAG.getConstant(0x00FF, MVT::i16));
+      InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0)
+                         : InsElt0;
+    }
+    NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt,
+                       DAG.getIntPtrConstant(i));
+  }
+  return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v16i8, NewV);
+}
+
+/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
+/// ones, or rewriting v4i32 / v2f32 as 2 wide ones if possible. This can be
+/// done when every pair / quad of shuffle mask elements point to elements in
+/// the right sequence. e.g.
+/// vector_shuffle <>, <>, < 3, 4, | 10, 11, | 0, 1, | 14, 15>
+///
+/// Returns the rewritten (bitconverted) shuffle on success, or a null SDValue
+/// if some group of elements is not contiguous and group-aligned within one
+/// source vector.
+static
+SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
+                                 SelectionDAG &DAG,
+                                 TargetLowering &TLI, DebugLoc dl) {
+  EVT VT = SVOp->getValueType(0);
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  unsigned NumElems = VT.getVectorNumElements();
+  // Halve 4-element shuffles; reduce 8/16-element shuffles to 4 elements.
+  unsigned NewWidth = (NumElems == 4) ? 2 : 4;
+  EVT MaskVT = MVT::getIntVectorWithNumElements(NewWidth);
+  EVT MaskEltVT = MaskVT.getVectorElementType();
+  EVT NewVT = MaskVT;
+  switch (VT.getSimpleVT().SimpleTy) {
+  // NOTE: in -Asserts builds the default case falls through and picks
+  // v2f64; the assert documents that only the four types below are expected.
+  default: assert(false && "Unexpected!");
+  case MVT::v4f32: NewVT = MVT::v2f64; break;
+  case MVT::v4i32: NewVT = MVT::v2i64; break;
+  case MVT::v8i16: NewVT = MVT::v4i32; break;
+  case MVT::v16i8: NewVT = MVT::v4i32; break;
+  }
+
+  // For the 2-wide case pick the 64-bit element type by domain. This
+  // restates the v4f32/v4i32 entries of the switch above.
+  if (NewWidth == 2) {
+    if (VT.isInteger())
+      NewVT = MVT::v2i64;
+    else
+      NewVT = MVT::v2f64;
+  }
+  // Each new mask element stands for 'Scale' consecutive old elements.
+  int Scale = NumElems / NewWidth;
+  SmallVector<int, 8> MaskVec;
+  for (unsigned i = 0; i < NumElems; i += Scale) {
+    int StartIdx = -1;
+    for (int j = 0; j < Scale; ++j) {
+      int EltIdx = SVOp->getMaskElt(i+j);
+      // Undef elements are compatible with any group.
+      if (EltIdx < 0)
+        continue;
+      // The first defined element fixes the Scale-aligned group; every later
+      // defined element must continue it in sequence, else give up.
+      if (StartIdx == -1)
+        StartIdx = EltIdx - (EltIdx % Scale);
+      if (EltIdx != StartIdx + j)
+        return SDValue();
+    }
+    if (StartIdx == -1)
+      MaskVec.push_back(-1);
+    else
+      MaskVec.push_back(StartIdx / Scale);
+  }
+
+  // Reinterpret both inputs at the wider element type and emit the narrower
+  // shuffle.
+  V1 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V1);
+  V2 = DAG.getNode(ISD::BIT_CONVERT, dl, NewVT, V2);
+  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
+}
+
+/// getVZextMovL - Return a zero-extending vector move low node.
+///
+/// Emits X86ISD::VZEXT_MOVL of SrcOp bitconverted to OpVT, bitconverted back
+/// to VT. For f64/f32 vectors whose source is not a scalar load, it prefers
+/// building the move in the integer domain (movd/movq), because the
+/// register-register forms of movss/movsd do not clear the upper bits.
+static SDValue getVZextMovL(EVT VT, EVT OpVT,
+                            SDValue SrcOp, SelectionDAG &DAG,
+                            const X86Subtarget *Subtarget, DebugLoc dl) {
+  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    LoadSDNode *LD = NULL;
+    // Look through scalar_to_vector for an underlying load; otherwise check
+    // whether SrcOp itself is a load.
+    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
+      LD = dyn_cast<LoadSDNode>(SrcOp);
+    if (!LD) {
+      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
+      // instead.
+      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
+      // An i64 integer move is only available on 64-bit targets.
+      if ((ExtVT.SimpleTy != MVT::i64 || Subtarget->is64Bit()) &&
+          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+          SrcOp.getOperand(0).getOpcode() == ISD::BIT_CONVERT &&
+          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
+        // PR2108
+        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
+        // Strip the scalar_to_vector(bit_convert(x)) and rebuild the move in
+        // the integer domain, then bitconvert the result back to VT.
+        return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   OpVT,
+                                                   SrcOp.getOperand(0)
+                                                          .getOperand(0))));
+      }
+    }
+  }
+
+  // Generic case: bitconvert to OpVT, zero-extend-move the low element, and
+  // bitconvert to the requested result type.
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
+                                 DAG.getNode(ISD::BIT_CONVERT, dl,
+                                             OpVT, SrcOp)));
+}
+
+/// LowerVECTOR_SHUFFLE_4wide - Handle all 4 wide cases with a number of
+/// shuffles.
+static SDValue
+LowerVECTOR_SHUFFLE_4wide(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  DebugLoc dl = SVOp->getDebugLoc();
+  EVT VT = SVOp->getValueType(0);
+
+  // Locs[i] records where result element i comes from: .first selects the
+  // intermediate shuffle (or -1 for undef) and .second the slot within it.
+  SmallVector<std::pair<int, int>, 8> Locs;
+  Locs.resize(4);
+  SmallVector<int, 8> Mask1(4U, -1);
+  SmallVector<int, 8> PermMask;
+  SVOp->getMask(PermMask);
+
+  // Count how many mask elements refer to V1 (NumLo) and to V2 (NumHi),
+  // gathering V1 elements into the low half of Mask1 and V2 elements into
+  // the high half.
+  unsigned NumHi = 0;
+  unsigned NumLo = 0;
+  for (unsigned i = 0; i != 4; ++i) {
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else {
+      assert(Idx < 8 && "Invalid VECTOR_SHUFFLE index!");
+      if (Idx < 4) {
+        Locs[i] = std::make_pair(0, NumLo);
+        Mask1[NumLo] = Idx;
+        NumLo++;
+      } else {
+        Locs[i] = std::make_pair(1, NumHi);
+        if (2+NumHi < 4)
+          Mask1[2+NumHi] = Idx;
+        NumHi++;
+      }
+    }
+  }
+
+  if (NumLo <= 2 && NumHi <= 2) {
+    // If no more than two elements come from either vector. This can be
+    // implemented with two shuffles. First shuffle gather the elements.
+    // The second shuffle, which takes the first shuffle as both of its
+    // vector operands, put the elements into the right order.
+    V1 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    SmallVector<int, 8> Mask2(4U, -1);
+
+    for (unsigned i = 0; i != 4; ++i) {
+      if (Locs[i].first == -1)
+        continue;
+      else {
+        // Elements gathered into the low half of V1 sit at indices 0..1,
+        // those gathered into the high half at 2..3.
+        unsigned Idx = (i < 2) ? 0 : 4;
+        Idx += Locs[i].first * 2 + Locs[i].second;
+        Mask2[i] = Idx;
+      }
+    }
+
+    return DAG.getVectorShuffle(VT, dl, V1, V1, &Mask2[0]);
+  } else if (NumLo == 3 || NumHi == 3) {
+    // Otherwise, we must have three elements from one vector, call it X, and
+    // one element from the other, call it Y.  First, use a shufps to build an
+    // intermediate vector with the one element from Y and the element from X
+    // that will be in the same half in the final destination (the indexes don't
+    // matter). Then, use a shufps to build the final vector, taking the half
+    // containing the element from Y from the intermediate, and the other half
+    // from X.
+    if (NumHi == 3) {
+      // Normalize it so the 3 elements come from V1.
+      CommuteVectorShuffleMask(PermMask, VT);
+      std::swap(V1, V2);
+    }
+
+    // Find the element from V2.
+    unsigned HiIndex;
+    for (HiIndex = 0; HiIndex < 3; ++HiIndex) {
+      int Val = PermMask[HiIndex];
+      if (Val < 0)
+        continue;
+      if (Val >= 4)
+        break;
+    }
+
+    // Build the intermediate: the V2 element plus its same-half partner
+    // from V1.
+    Mask1[0] = PermMask[HiIndex];
+    Mask1[1] = -1;
+    Mask1[2] = PermMask[HiIndex^1];
+    Mask1[3] = -1;
+    V2 = DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+
+    if (HiIndex >= 2) {
+      // The V2 element lands in the high half of the result.
+      Mask1[0] = PermMask[0];
+      Mask1[1] = PermMask[1];
+      Mask1[2] = HiIndex & 1 ? 6 : 4;
+      Mask1[3] = HiIndex & 1 ? 4 : 6;
+      return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask1[0]);
+    } else {
+      // The V2 element lands in the low half; swap the shuffle operands so
+      // the intermediate supplies the low half.
+      Mask1[0] = HiIndex & 1 ? 2 : 0;
+      Mask1[1] = HiIndex & 1 ? 0 : 2;
+      Mask1[2] = PermMask[2];
+      Mask1[3] = PermMask[3];
+      if (Mask1[2] >= 0)
+        Mask1[2] += 4;
+      if (Mask1[3] >= 0)
+        Mask1[3] += 4;
+      return DAG.getVectorShuffle(VT, dl, V2, V1, &Mask1[0]);
+    }
+  }
+
+  // Break it into (shuffle shuffle_hi, shuffle_lo).
+  Locs.clear();
+  // clear() leaves the SmallVector empty, but the loop below stores through
+  // Locs[i] unconditionally; re-establish the four slots so the indexing
+  // stays in bounds.
+  Locs.resize(4);
+  SmallVector<int,8> LoMask(4U, -1);
+  SmallVector<int,8> HiMask(4U, -1);
+
+  SmallVector<int,8> *MaskPtr = &LoMask;
+  unsigned MaskIdx = 0;
+  unsigned LoIdx = 0;
+  unsigned HiIdx = 2;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (i == 2) {
+      // Switch to filling the high-half shuffle's mask.
+      MaskPtr = &HiMask;
+      MaskIdx = 1;
+      LoIdx = 0;
+      HiIdx = 2;
+    }
+    int Idx = PermMask[i];
+    if (Idx < 0) {
+      Locs[i] = std::make_pair(-1, -1);
+    } else if (Idx < 4) {
+      Locs[i] = std::make_pair(MaskIdx, LoIdx);
+      (*MaskPtr)[LoIdx] = Idx;
+      LoIdx++;
+    } else {
+      Locs[i] = std::make_pair(MaskIdx, HiIdx);
+      (*MaskPtr)[HiIdx] = Idx;
+      HiIdx++;
+    }
+  }
+
+  // Combine the two half shuffles with a final shuffle that routes each
+  // result element to the slot recorded in Locs.
+  SDValue LoShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &LoMask[0]);
+  SDValue HiShuffle = DAG.getVectorShuffle(VT, dl, V1, V2, &HiMask[0]);
+  SmallVector<int, 8> MaskOps;
+  for (unsigned i = 0; i != 4; ++i) {
+    if (Locs[i].first == -1) {
+      MaskOps.push_back(-1);
+    } else {
+      unsigned Idx = Locs[i].first * 4 + Locs[i].second;
+      MaskOps.push_back(Idx);
+    }
+  }
+  return DAG.getVectorShuffle(VT, dl, LoShuffle, HiShuffle, &MaskOps[0]);
+}
+
+/// LowerVECTOR_SHUFFLE - Lower a VECTOR_SHUFFLE node by matching it against
+/// the x86 shuffle instructions' mask predicates, canonicalizing (commuting)
+/// the operands when that exposes a match, and finally falling back to the
+/// element-type-specific expansions. The order of the checks below matters.
+SDValue
+X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned NumElems = VT.getVectorNumElements();
+  // 64-bit vectors are MMX; many transforms below are SSE-only.
+  bool isMMX = VT.getSizeInBits() == 64;
+  bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
+  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
+  bool V1IsSplat = false;
+  bool V2IsSplat = false;
+
+  // A shuffle producing all zeros folds to a zero vector.
+  if (isZeroShuffle(SVOp))
+    return getZeroVector(VT, Subtarget->hasSSE2(), DAG, dl);
+
+  // Promote splats to v4f32.
+  if (SVOp->isSplat()) {
+    if (isMMX || NumElems < 4)
+      return Op;
+    return PromoteSplat(SVOp, DAG, Subtarget->hasSSE2());
+  }
+
+  // If the shuffle can be profitably rewritten as a narrower shuffle, then
+  // do it!
+  if (VT == MVT::v8i16 || VT == MVT::v16i8) {
+    SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+    if (NewOp.getNode())
+      return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                         LowerVECTOR_SHUFFLE(NewOp, DAG));
+  } else if ((VT == MVT::v4i32 || (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+    // FIXME: Figure out a cleaner way to do this.
+    // Try to make use of movq to zero out the top part.
+    if (ISD::isBuildVectorAllZeros(V2.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+      if (NewOp.getNode()) {
+        if (isCommutedMOVL(cast<ShuffleVectorSDNode>(NewOp), true, false))
+          return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(0),
+                              DAG, Subtarget, dl);
+      }
+    } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
+      SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, *this, dl);
+      if (NewOp.getNode() && X86::isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)))
+        return getVZextMovL(VT, NewOp.getValueType(), NewOp.getOperand(1),
+                            DAG, Subtarget, dl);
+    }
+  }
+
+  // PSHUFD can handle the mask directly; keep the node as-is for isel.
+  if (X86::isPSHUFDMask(SVOp))
+    return Op;
+
+  // Check if this can be converted into a logical shift.
+  bool isLeft = false;
+  unsigned ShAmt = 0;
+  SDValue ShVal;
+  bool isShift = getSubtarget()->hasSSE2() &&
+    isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt);
+  if (isShift && ShVal.hasOneUse()) {
+    // If the shifted value has multiple uses, it may be cheaper to use
+    // v_set0 + movlhps or movhlps, etc.
+    EVT EltVT = VT.getVectorElementType();
+    // Convert the element count into a bit count for the vector shift.
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+  }
+
+  if (X86::isMOVLMask(SVOp)) {
+    if (V1IsUndef)
+      return V2;
+    if (ISD::isBuildVectorAllZeros(V1.getNode()))
+      return getVZextMovL(VT, VT, V2, DAG, Subtarget, dl);
+    if (!isMMX)
+      return Op;
+  }
+
+  // FIXME: fold these into legal mask.
+  if (!isMMX && (X86::isMOVSHDUPMask(SVOp) ||
+                 X86::isMOVSLDUPMask(SVOp) ||
+                 X86::isMOVHLPSMask(SVOp) ||
+                 X86::isMOVLHPSMask(SVOp) ||
+                 X86::isMOVLPMask(SVOp)))
+    return Op;
+
+  // Commuting the operands may expose a MOVHLPS / MOVLP match.
+  if (ShouldXformToMOVHLPS(SVOp) ||
+      ShouldXformToMOVLP(V1.getNode(), V2.getNode(), SVOp))
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  if (isShift) {
+    // No better options. Use a vshl / vsrl.
+    EVT EltVT = VT.getVectorElementType();
+    ShAmt *= EltVT.getSizeInBits();
+    return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
+  }
+
+  bool Commuted = false;
+  // FIXME: This should also accept a bitcast of a splat?  Be careful, not
+  // 1,1,1,1 -> v8i16 though.
+  V1IsSplat = isSplatVector(V1.getNode());
+  V2IsSplat = isSplatVector(V2.getNode());
+
+  // Canonicalize the splat or undef, if present, to be on the RHS.
+  if ((V1IsSplat || V1IsUndef) && !(V2IsSplat || V2IsUndef)) {
+    Op = CommuteVectorShuffle(SVOp, DAG);
+    SVOp = cast<ShuffleVectorSDNode>(Op);
+    V1 = SVOp->getOperand(0);
+    V2 = SVOp->getOperand(1);
+    std::swap(V1IsSplat, V2IsSplat);
+    std::swap(V1IsUndef, V2IsUndef);
+    Commuted = true;
+  }
+
+  if (isCommutedMOVL(SVOp, V2IsSplat, V2IsUndef)) {
+    // Shuffling low element of v1 into undef, just return v1.
+    if (V2IsUndef)
+      return V1;
+    // If V2 is a splat, the mask may be malformed such as <4,3,3,3>, which
+    // the instruction selector will not match, so get a canonical MOVL with
+    // swapped operands to undo the commute.
+    return getMOVL(DAG, dl, VT, V2, V1);
+  }
+
+  // Unpack masks are directly selectable.
+  if (X86::isUNPCKL_v_undef_Mask(SVOp) ||
+      X86::isUNPCKH_v_undef_Mask(SVOp) ||
+      X86::isUNPCKLMask(SVOp) ||
+      X86::isUNPCKHMask(SVOp))
+    return Op;
+
+  if (V2IsSplat) {
+    // Normalize mask so all entries that point to V2 points to its first
+    // element then try to match unpck{h|l} again. If match, return a
+    // new vector_shuffle with the corrected mask.
+    SDValue NewMask = NormalizeMask(SVOp, DAG);
+    ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask);
+    if (NSVOp != SVOp) {
+      if (X86::isUNPCKLMask(NSVOp, true)) {
+        return NewMask;
+      } else if (X86::isUNPCKHMask(NSVOp, true)) {
+        return NewMask;
+      }
+    }
+  }
+
+  if (Commuted) {
+    // Commute is back and try unpck* again.
+    // FIXME: this seems wrong.
+    SDValue NewOp = CommuteVectorShuffle(SVOp, DAG);
+    ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp);
+    if (X86::isUNPCKL_v_undef_Mask(NewSVOp) ||
+        X86::isUNPCKH_v_undef_Mask(NewSVOp) ||
+        X86::isUNPCKLMask(NewSVOp) ||
+        X86::isUNPCKHMask(NewSVOp))
+      return NewOp;
+  }
+
+  // FIXME: for mmx, bitcast v2i32 to v4i16 for shuffle.
+
+  // Normalize the node to match x86 shuffle ops if needed
+  if (!isMMX && V2.getOpcode() != ISD::UNDEF && isCommutedSHUFP(SVOp))
+    return CommuteVectorShuffle(SVOp, DAG);
+
+  // Check for legal shuffle and return?
+  SmallVector<int, 16> PermMask;
+  SVOp->getMask(PermMask);
+  if (isShuffleMaskLegal(PermMask, VT))
+    return Op;
+
+  // Handle v8i16 specifically since SSE can do byte extraction and insertion.
+  if (VT == MVT::v8i16) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv8i16(SVOp, DAG, *this);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  if (VT == MVT::v16i8) {
+    SDValue NewOp = LowerVECTOR_SHUFFLEv16i8(SVOp, DAG, *this);
+    if (NewOp.getNode())
+      return NewOp;
+  }
+
+  // Handle all 4 wide cases with a number of shuffles except for MMX.
+  if (NumElems == 4 && !isMMX)
+    return LowerVECTOR_SHUFFLE_4wide(SVOp, DAG);
+
+  // No lowering found; let the caller expand.
+  return SDValue();
+}
+
+/// Lower EXTRACT_VECTOR_ELT using the SSE4.1 extract instructions
+/// (pextrb/pextrw/extractps). Returns a null SDValue when no SSE4.1 form is
+/// profitable so the caller can fall back to the generic lowering.
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
+                                                SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  if (VT.getSizeInBits() == 8) {
+    // PEXTRB yields an i32; AssertZext records the zero-extension before the
+    // truncate back to the i8 result type.
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
+                                    Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                    DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  } else if (VT.getSizeInBits() == 16) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    // If Idx is 0, it's cheaper to do a move instead of a pextrw.
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BIT_CONVERT, dl,
+                                                 MVT::v4i32,
+                                                 Op.getOperand(0)),
+                                     Op.getOperand(1)));
+    // As above: PEXTRW to i32, assert the zero-extension, truncate to i16.
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
+                                    Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
+                                    DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  } else if (VT == MVT::f32) {
+    // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
+    // the result back to FR32 register. It's only worth matching if the
+    // result has a single use which is a store or a bitcast to i32.  And in
+    // the case of a store, it's not worth it if the index is a constant 0,
+    // because a MOVSSmr can be used instead, which is smaller and faster.
+    if (!Op.hasOneUse())
+      return SDValue();
+    SDNode *User = *Op.getNode()->use_begin();
+    if ((User->getOpcode() != ISD::STORE ||
+         (isa<ConstantSDNode>(Op.getOperand(1)) &&
+          cast<ConstantSDNode>(Op.getOperand(1))->isNullValue())) &&
+        (User->getOpcode() != ISD::BIT_CONVERT ||
+         User->getValueType(0) != MVT::i32))
+      return SDValue();
+    // Do the extract in the i32 domain and bitconvert back to f32.
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                  DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4i32,
+                                              Op.getOperand(0)),
+                                              Op.getOperand(1));
+    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::f32, Extract);
+  } else if (VT == MVT::i32) {
+    // ExtractPS works with constant index.
+    if (isa<ConstantSDNode>(Op.getOperand(1)))
+      return Op;
+  }
+  return SDValue();
+}
+
+
+/// Lower EXTRACT_VECTOR_ELT with a constant index. Tries the SSE4.1 forms
+/// first, then falls back to pextrw for 16-bit elements and to
+/// shuffle-to-lane-0 sequences for 32- and 64-bit elements.
+SDValue
+X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  // Only constant extraction indices are handled here.
+  if (!isa<ConstantSDNode>(Op.getOperand(1)))
+    return SDValue();
+
+  if (Subtarget->hasSSE41()) {
+    SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  // TODO: handle v16i8.
+  if (VT.getSizeInBits() == 16) {
+    SDValue Vec = Op.getOperand(0);
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    // Element 0 can be read with a plain 32-bit move instead of a pextrw.
+    if (Idx == 0)
+      return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
+                         DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                     DAG.getNode(ISD::BIT_CONVERT, dl,
+                                                 MVT::v4i32, Vec),
+                                     Op.getOperand(1)));
+    // Transform it so it match pextrw which produces a 32-bit result.
+    EVT EltVT = MVT::i32;
+    SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
+                                    Op.getOperand(0), Op.getOperand(1));
+    SDValue Assert  = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
+                                    DAG.getValueType(VT));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+  } else if (VT.getSizeInBits() == 32) {
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // SHUFPS the element to the lowest double word, then movss.
+    // The explicit cast avoids a narrowing conversion in the aggregate
+    // initializer, which C++0x rejects.
+    int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
+    EVT VVT = Op.getOperand(0).getValueType();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  } else if (VT.getSizeInBits() == 64) {
+    // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
+    // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
+    //        to match extract_elt for f64.
+    unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    if (Idx == 0)
+      return Op;
+
+    // UNPCKHPD the element to the lowest double word, then movsd.
+    // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
+    // to a f64mem, the whole operation is folded into a single MOVHPDmr.
+    int Mask[2] = { 1, -1 };
+    EVT VVT = Op.getOperand(0).getValueType();
+    SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
+                                       DAG.getUNDEF(VVT), Mask);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
+                       DAG.getIntPtrConstant(0));
+  }
+
+  return SDValue();
+}
+
+/// Lower INSERT_VECTOR_ELT using the SSE4.1 insert instructions
+/// (pinsrb/pinsrw/insertps/pinsrd). All forms require a constant index;
+/// returns a null SDValue when nothing matches.
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG){
+  EVT VT = Op.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+  DebugLoc dl = Op.getDebugLoc();
+
+  SDValue Vec = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+
+  // Every case below requires a constant insertion index.
+  if (!isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  unsigned EltBits = EltVT.getSizeInBits();
+  if (EltBits == 8 || EltBits == 16) {
+    unsigned Opc = (EltBits == 8) ? X86ISD::PINSRB : X86ISD::PINSRW;
+    // pinsr{b,w} expects a GR32 source and an i32 index; widen both if
+    // necessary.
+    if (Elt.getValueType() != MVT::i32)
+      Elt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Elt);
+    if (Idx.getValueType() != MVT::i32)
+      Idx = DAG.getIntPtrConstant(cast<ConstantSDNode>(Idx)->getZExtValue());
+    return DAG.getNode(Opc, dl, VT, Vec, Elt, Idx);
+  }
+
+  if (EltVT == MVT::f32) {
+    // Bits [7:6] of the constant are the source select.  This will always be
+    //  zero here.  The DAG Combiner may combine an extract_elt index into these
+    //  bits.  For example (insert (extract, 3), 2) could be matched by putting
+    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
+    // Bits [5:4] of the constant are the destination select.  This is the
+    //  value of the incoming immediate.
+    // Bits [3:0] of the constant are the zero mask.  The DAG Combiner may
+    //   combine either bitwise AND or insert of float 0.0 to set these bits.
+    Idx = DAG.getIntPtrConstant(cast<ConstantSDNode>(Idx)->getZExtValue() << 4);
+    // Create this as a scalar to vector..
+    Elt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Elt);
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, Vec, Elt, Idx);
+  }
+
+  if (EltVT == MVT::i32)
+    // PINSR* works with constant index.
+    return Op;
+
+  return SDValue();
+}
+
+/// Lower INSERT_VECTOR_ELT. Uses the SSE4.1 path when available; otherwise
+/// only 16-bit elements with a constant index (pinsrw) are handled.
+SDValue
+X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  EVT EltVT = VT.getVectorElementType();
+
+  if (Subtarget->hasSSE41())
+    return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG);
+
+  // There is no pre-SSE4.1 byte insert.
+  if (EltVT == MVT::i8)
+    return SDValue();
+
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Vec = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+
+  if (EltVT.getSizeInBits() != 16 || !isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  // pinsrw expects a 16-bit value in a GR32 as its second argument and an
+  // i32 index; widen both as needed.
+  if (Elt.getValueType() != MVT::i32)
+    Elt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Elt);
+  if (Idx.getValueType() != MVT::i32)
+    Idx = DAG.getIntPtrConstant(cast<ConstantSDNode>(Idx)->getZExtValue());
+  return DAG.getNode(X86ISD::PINSRW, dl, VT, Vec, Elt, Idx);
+}
+
+/// Lower SCALAR_TO_VECTOR. v2f32 is built in the integer domain; v1i64 from
+/// an i64 is passed through; everything else widens the scalar to i32,
+/// inserts into a matching integer vector, and bitconverts to the result.
+SDValue
+X86TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  EVT ResVT = Op.getValueType();
+  SDValue Scalar = Op.getOperand(0);
+
+  if (ResVT == MVT::v2f32) {
+    SDValue AsInt = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::i32, Scalar);
+    SDValue IntVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i32, AsInt);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f32, IntVec);
+  }
+
+  if (ResVT == MVT::v1i64 && Scalar.getValueType() == MVT::i64)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i64, Scalar);
+
+  // Widen the scalar to i32 and pick the integer vector type of the same
+  // total width as the result: v4i32 for the 128-bit types, v2i32 otherwise.
+  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
+  EVT VecVT = (ResVT == MVT::v16i8 || ResVT == MVT::v8i16) ? EVT(MVT::v4i32)
+                                                           : EVT(MVT::v2i32);
+  return DAG.getNode(ISD::BIT_CONVERT, dl, ResVT,
+                     DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, AnyExt));
+}
+
+// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
+// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// one of the above mentioned nodes. It has to be wrapped because otherwise
+// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
+// be used to form addressing mode. These wrapped nodes will be selected
+// into MOV32ri.
+SDValue
+X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  // Pick the wrapper node and, for non-RIP-relative PIC styles, the operand
+  // flag that makes the constant-pool reference $g-relative.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  DebugLoc DL = CP->getDebugLoc();
+  SDValue Result =
+    DAG.getNode(WrapperKind, DL, getPointerTy(),
+                DAG.getTargetConstantPool(CP->getConstVal(), getPointerTy(),
+                                          CP->getAlignment(), CP->getOffset(),
+                                          OpFlag));
+
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag)
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc::getUnknownLoc(), getPointerTy()),
+                         Result);
+
+  return Result;
+}
+
+SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) {
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  // Pick the wrapper node and, for non-RIP-relative PIC styles, the operand
+  // flag that makes the jump-table reference $g-relative.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  DebugLoc DL = JT->getDebugLoc();
+  SDValue Result =
+    DAG.getNode(WrapperKind, DL, getPointerTy(),
+                DAG.getTargetJumpTable(JT->getIndex(), getPointerTy(),
+                                       OpFlag));
+
+  // With PIC, the address is actually $g + Offset.
+  if (OpFlag)
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc::getUnknownLoc(), getPointerTy()),
+                         Result);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) {
+  const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  // Pick the wrapper node and, for non-RIP-relative PIC styles, the operand
+  // flag that makes the symbol reference $g-relative.
+  unsigned char OpFlag = 0;
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (Subtarget->isPICStyleRIPRel() &&
+      (M == CodeModel::Small || M == CodeModel::Kernel))
+    WrapperKind = X86ISD::WrapperRIP;
+  else if (Subtarget->isPICStyleGOT())
+    OpFlag = X86II::MO_GOTOFF;
+  else if (Subtarget->isPICStyleStubPIC())
+    OpFlag = X86II::MO_PIC_BASE_OFFSET;
+
+  DebugLoc DL = Op.getDebugLoc();
+  SDValue Result =
+    DAG.getNode(WrapperKind, DL, getPointerTy(),
+                DAG.getTargetExternalSymbol(Sym, getPointerTy(), OpFlag));
+
+  // With PIC, the address is actually $g + Offset.
+  if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+      !Subtarget->is64Bit())
+    Result = DAG.getNode(ISD::ADD, DL, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg,
+                                     DebugLoc::getUnknownLoc(),
+                                     getPointerTy()),
+                         Result);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) {
+  // Create the TargetBlockAddress node with whatever flags the subtarget
+  // requires for block-address references.
+  BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  unsigned char OpFlags = Subtarget->ClassifyBlockAddressReference();
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+  DebugLoc dl = Op.getDebugLoc();
+
+  SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), /*isTarget=*/true,
+                                       OpFlags);
+
+  // RIP-relative addressing is only valid in the small and kernel code
+  // models; otherwise use the plain wrapper.
+  bool UseRIP = Subtarget->isPICStyleRIPRel() &&
+                (M == CodeModel::Small || M == CodeModel::Kernel);
+  Result = DAG.getNode(UseRIP ? X86ISD::WrapperRIP : X86ISD::Wrapper, dl,
+                       getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags))
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+                                      int64_t Offset,
+                                      SelectionDAG &DAG) const {
+  unsigned char OpFlags =
+    Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  // Fold the offset into the TargetGlobalAddress when the reference is a
+  // plain static one and the code model permits it; otherwise keep the
+  // offset around for the explicit ADD at the end.
+  SDValue Result;
+  if (OpFlags == X86II::MO_NO_FLAG &&
+      X86::isOffsetSuitableForCodeModel(Offset, M)) {
+    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), Offset);
+    Offset = 0;
+  } else {
+    Result = DAG.getTargetGlobalAddress(GV, getPointerTy(), 0, OpFlags);
+  }
+
+  // RIP-relative addressing is only valid in the small and kernel code
+  // models.
+  bool UseRIP = Subtarget->isPICStyleRIPRel() &&
+                (M == CodeModel::Small || M == CodeModel::Kernel);
+  Result = DAG.getNode(UseRIP ? X86ISD::WrapperRIP : X86ISD::Wrapper, dl,
+                       getPointerTy(), Result);
+
+  // With PIC, the address is actually $g + Offset.
+  if (isGlobalRelativeToPICBase(OpFlags))
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                         DAG.getNode(X86ISD::GlobalBaseReg, dl, getPointerTy()),
+                         Result);
+
+  // Globals accessed through a stub need a load to produce the real address.
+  if (isGlobalStubReference(OpFlags))
+    Result = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Result,
+                         PseudoSourceValue::getGOT(), 0);
+
+  // Emit any non-zero offset that was not folded above.
+  if (Offset != 0)
+    Result = DAG.getNode(ISD::ADD, dl, getPointerTy(), Result,
+                         DAG.getConstant(Offset, getPointerTy()));
+
+  return Result;
+}
+
+SDValue
+X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) {
+  // Unpack the global and its constant offset from the node, then defer to
+  // the worker overload that performs the actual lowering.
+  GlobalAddressSDNode *Node = cast<GlobalAddressSDNode>(Op);
+  return LowerGlobalAddress(Node->getGlobal(), Op.getDebugLoc(),
+                            Node->getOffset(), DAG);
+}
+
+static SDValue
+GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
+           SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
+           unsigned char OperandFlags) {
+  // Emit an X86ISD::TLSADDR node for GA (tagged with OperandFlags, e.g.
+  // MO_TLSGD) and return a copy out of ReturnReg, which holds the computed
+  // address once the node is selected.  InFlag, when non-null, glues the
+  // node to a preceding copy (the 32-bit PIC-base setup in EBX).
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  DebugLoc dl = GA->getDebugLoc();
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(),
+                                           GA->getValueType(0),
+                                           GA->getOffset(),
+                                           OperandFlags);
+  if (InFlag) {
+    SDValue Ops[] = { Chain,  TGA, *InFlag };
+    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 3);
+  } else {
+    SDValue Ops[]  = { Chain, TGA };
+    Chain = DAG.getNode(X86ISD::TLSADDR, dl, NodeTys, Ops, 2);
+  }
+
+  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
+  MFI->setHasCalls(true);
+
+  // Result 1 of the TLSADDR node is the glue; use it to order the register
+  // copy directly after the call.
+  SDValue Flag = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
+static SDValue
+LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  SDValue InFlag;
+  DebugLoc dl = GA->getDebugLoc();  // ? function entry point might be better
+  // 32-bit general-dynamic TLS requires the PIC base (GOT pointer) in EBX
+  // before the TLSADDR call; the copy is glued to the call via InFlag.
+  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
+                                     DAG.getNode(X86ISD::GlobalBaseReg,
+                                                 DebugLoc::getUnknownLoc(),
+                                                 PtrVT), InFlag);
+  InFlag = Chain.getValue(1);
+
+  // The resulting address is returned in EAX.
+  return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
+static SDValue
+LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                const EVT PtrVT) {
+  // Unlike the 32-bit path, no PIC-base register setup (and hence no glue)
+  // is emitted here; the TLSADDR hangs directly off the entry chain and the
+  // computed address comes back in RAX.
+  SDValue EntryChain = DAG.getEntryNode();
+  return GetTLSADDR(DAG, EntryChain, GA, NULL, PtrVT,
+                    X86::RAX, X86II::MO_TLSGD);
+}
+
+// Lower ISD::GlobalTLSAddress using the "initial exec" (for no-pic) or
+// "local exec" model.
+static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
+                                   const EVT PtrVT, TLSModel::Model model,
+                                   bool is64Bit) {
+  DebugLoc dl = GA->getDebugLoc();
+  // Get the Thread Pointer, read through the FS (64-bit) or GS (32-bit)
+  // segment base.
+  SDValue Base = DAG.getNode(X86ISD::SegmentBaseAddress,
+                             DebugLoc::getUnknownLoc(), PtrVT,
+                             DAG.getRegister(is64Bit? X86::FS : X86::GS,
+                                             MVT::i32));
+
+  SDValue ThreadPointer = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Base,
+                                      NULL, 0);
+
+  // Pick the relocation flag for the symbol operand according to the TLS
+  // model and target width.
+  unsigned char OperandFlags = 0;
+  // Most TLS accesses are not RIP relative, even on x86-64.  One exception is
+  // initialexec.
+  unsigned WrapperKind = X86ISD::Wrapper;
+  if (model == TLSModel::LocalExec) {
+    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
+  } else if (is64Bit) {
+    assert(model == TLSModel::InitialExec);
+    OperandFlags = X86II::MO_GOTTPOFF;
+    WrapperKind = X86ISD::WrapperRIP;
+  } else {
+    assert(model == TLSModel::InitialExec);
+    OperandFlags = X86II::MO_INDNTPOFF;
+  }
+
+  // emit "addl x@ntpoff,%eax" (local exec) or "addl x@indntpoff,%eax" (initial
+  // exec)
+  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), GA->getValueType(0),
+                                           GA->getOffset(), OperandFlags);
+  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
+
+  // Initial-exec offsets live in the GOT and must be loaded; local-exec
+  // offsets are link-time constants and need no load.
+  if (model == TLSModel::InitialExec)
+    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
+                         PseudoSourceValue::getGOT(), 0);
+
+  // The address of the thread local variable is the add of the thread
+  // pointer with the offset of the variable.
+  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
+}
+
+SDValue
+X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) {
+  // TODO: implement the "local dynamic" model
+  // TODO: implement the "initial exec"model for pic executables
+  assert(Subtarget->isTargetELF() &&
+         "TLS not implemented for non-ELF targets");
+  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
+  const GlobalValue *GV = GA->getGlobal();
+
+  // If GV is an alias then use the aliasee for determining
+  // thread-localness.
+  // NOTE: the inner 'GA' deliberately shadows the GlobalAddressSDNode above;
+  // only GV is rewritten here — the SDNode is still what gets lowered below.
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+    GV = GA->resolveAliasedGlobal(false);
+
+  // Dispatch on the TLS model implied by GV and the relocation model.
+  TLSModel::Model model = getTLSModel(GV,
+                                      getTargetMachine().getRelocationModel());
+
+  switch (model) {
+  case TLSModel::GeneralDynamic:
+  case TLSModel::LocalDynamic: // not implemented
+    if (Subtarget->is64Bit())
+      return LowerToTLSGeneralDynamicModel64(GA, DAG, getPointerTy());
+    return LowerToTLSGeneralDynamicModel32(GA, DAG, getPointerTy());
+
+  case TLSModel::InitialExec:
+  case TLSModel::LocalExec:
+    return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
+                               Subtarget->is64Bit());
+  }
+
+  llvm_unreachable("Unreachable");
+  return SDValue();
+}
+
+
+/// LowerShift - Lower SRA_PARTS and friends, which return two i32 values and
+/// take a 2 x i32 value to shift plus a shift amount.
+SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+  bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt  = Op.getOperand(2);
+  // Tmp1 is the fill value used when the shift amount is >= VTBits: the
+  // sign-extension of the high part for SRA, zero otherwise.
+  SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
+                                     DAG.getConstant(VTBits - 1, MVT::i8))
+                       : DAG.getConstant(0, VT);
+
+  // Tmp2/Tmp3 are the double-shift (SHLD/SHRD) and plain-shift results,
+  // which are correct for shift amounts < VTBits.
+  SDValue Tmp2, Tmp3;
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
+    Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  } else {
+    Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
+    Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, ShAmt);
+  }
+
+  // Test the VTBits bit of the shift amount to detect amounts >= VTBits.
+  SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
+                                DAG.getConstant(VTBits, MVT::i8));
+  SDValue Cond = DAG.getNode(X86ISD::CMP, dl, VT,
+                             AndNode, DAG.getConstant(0, MVT::i8));
+
+  // Select between the small-amount and large-amount results with CMOVs
+  // conditioned on that bit.
+  SDValue Hi, Lo;
+  SDValue CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+  SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
+  SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
+
+  if (Op.getOpcode() == ISD::SHL_PARTS) {
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+  } else {
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+  }
+
+  // Return the two halves as a merged pair (Lo first, Hi second).
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  EVT SrcVT = Op.getOperand(0).getValueType();
+
+  // Vector conversions: only v2i32 -> v2f64 is accepted (returned as-is);
+  // anything else is not lowered here.
+  if (SrcVT.isVector()) {
+    if (SrcVT == MVT::v2i32 && Op.getValueType() == MVT::v2f64) {
+      return Op;
+    }
+    return SDValue();
+  }
+
+  assert(SrcVT.getSimpleVT() <= MVT::i64 && SrcVT.getSimpleVT() >= MVT::i16 &&
+         "Unknown SINT_TO_FP to lower!");
+
+  // These are really Legal; return the operand so the caller accepts it as
+  // Legal.
+  if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
+    return Op;
+  if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
+      Subtarget->is64Bit()) {
+    return Op;
+  }
+
+  // Otherwise spill the integer to a stack slot and convert via FILD.
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Size = SrcVT.getSizeInBits()/8;
+  MachineFunction &MF = DAG.getMachineFunction();
+  int SSFI = MF.getFrameInfo()->CreateStackObject(Size, Size, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                               StackSlot,
+                               PseudoSourceValue::getFixedStack(SSFI), 0);
+  return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+}
+
+SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+                                     SDValue StackSlot,
+                                     SelectionDAG &DAG) {
+  // Build the FILD that loads the integer at StackSlot and converts it to
+  // Op's FP type.  When the result is wanted in an SSE register, the x87
+  // result is stored back to a fresh slot and reloaded (see FIXME below).
+  DebugLoc dl = Op.getDebugLoc();
+  SDVTList Tys;
+  bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+  if (useSSE)
+    // FILD_FLAG additionally produces a flag so the FST can be glued to it.
+    Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Flag);
+  else
+    Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+  SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
+  SDValue Result = DAG.getNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, dl,
+                               Tys, Ops, array_lengthof(Ops));
+
+  if (useSSE) {
+    Chain = Result.getValue(1);
+    SDValue InFlag = Result.getValue(2);
+
+    // FIXME: Currently the FST is flagged to the FILD_FLAG. This
+    // shouldn't be necessary except that RFP cannot be live across
+    // multiple blocks. When stackifier is fixed, they can be uncoupled.
+    MachineFunction &MF = DAG.getMachineFunction();
+    int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
+    SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+    Tys = DAG.getVTList(MVT::Other);
+    SDValue Ops[] = {
+      Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
+    };
+    Chain = DAG.getNode(X86ISD::FST, dl, Tys, Ops, array_lengthof(Ops));
+    // Reload the stored value; this load is what lands in the SSE register.
+    Result = DAG.getLoad(Op.getValueType(), dl, Chain, StackSlot,
+                         PseudoSourceValue::getFixedStack(SSFI), 0);
+  }
+
+  return Result;
+}
+
+// LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) {
+  // This algorithm is not obvious. Here it is in C code, more or less:
+  /*
+    double uint64_to_double( uint32_t hi, uint32_t lo ) {
+      static const __m128i exp = { 0x4330000045300000ULL, 0 };
+      static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
+
+      // Copy ints to xmm registers.
+      __m128i xh = _mm_cvtsi32_si128( hi );
+      __m128i xl = _mm_cvtsi32_si128( lo );
+
+      // Combine into low half of a single xmm register.
+      __m128i x = _mm_unpacklo_epi32( xh, xl );
+      __m128d d;
+      double sd;
+
+      // Merge in appropriate exponents to give the integer bits the right
+      // magnitude.
+      x = _mm_unpacklo_epi32( x, exp );
+
+      // Subtract away the biases to deal with the IEEE-754 double precision
+      // implicit 1.
+      d = _mm_sub_pd( (__m128d) x, bias );
+
+      // All conversions up to here are exact. The correctly rounded result is
+      // calculated using the current rounding mode using the following
+      // horizontal add.
+      d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
+      _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
+                                // store doesn't really need to be here (except
+                                // maybe to zero the other double)
+      return sd;
+    }
+  */
+
+  DebugLoc dl = Op.getDebugLoc();
+  LLVMContext *Context = DAG.getContext();
+
+  // Build some magic constants: CV0 carries the exponent words described as
+  // 'exp' above; CV1 carries the matching f64 biases (2^84 and 2^52).
+  std::vector<Constant*> CV0;
+  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
+  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
+  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
+  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
+  Constant *C0 = ConstantVector::get(CV0);
+  SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
+
+  std::vector<Constant*> CV1;
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
+  Constant *C1 = ConstantVector::get(CV1);
+  SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
+
+  // Move the high (XR1) and low (XR2) 32-bit halves of the i64 operand into
+  // the low lanes of two v4i32 values, then interleave them.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                        Op.getOperand(0),
+                                        DAG.getIntPtrConstant(1)));
+  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                        Op.getOperand(0),
+                                        DAG.getIntPtrConstant(0)));
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+  SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
+                              PseudoSourceValue::getConstantPool(), 0,
+                              false, 16);
+  // Merge in the exponent words, reinterpret as v2f64, and subtract the
+  // biases (the _mm_unpacklo_epi32/_mm_sub_pd steps from the C sketch).
+  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
+  SDValue XR2F = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Unpck2);
+  SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
+                              PseudoSourceValue::getConstantPool(), 0,
+                              false, 16);
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+
+  // Add the halves; easiest way is to swap them into another reg first.
+  int ShufMask[2] = { 1, -1 };
+  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
+                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
+  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
+  // The scalar result lives in element 0.
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+                     DAG.getIntPtrConstant(0));
+}
+
+// LowerUINT_TO_FP_i32 - 32-bit unsigned integer to float expansion.
+SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // FP constant to bias correct the final result: the f64 bit pattern for
+  // 2^52, whose mantissa field will hold the 32-bit integer after the OR.
+  SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL),
+                                   MVT::f64);
+
+  // Load the 32-bit value into an XMM register.
+  SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
+                             DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                                         Op.getOperand(0),
+                                         DAG.getIntPtrConstant(0)));
+
+  // Reinterpret the zero-extended integer as the low f64 lane.
+  Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                     DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Load),
+                     DAG.getIntPtrConstant(0));
+
+  // Or the load with the bias.
+  SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64,
+                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Load)),
+                           DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                                   MVT::v2f64, Bias)));
+  Or = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
+                   DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2f64, Or),
+                   DAG.getIntPtrConstant(0));
+
+  // Subtract the bias, leaving the exact f64 value of the unsigned int.
+  SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
+
+  // Handle final rounding: narrow or widen to the requested FP type.
+  EVT DestVT = Op.getValueType();
+
+  if (DestVT.bitsLT(MVT::f64)) {
+    return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+                       DAG.getIntPtrConstant(0));
+  } else if (DestVT.bitsGT(MVT::f64)) {
+    return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+  }
+
+  // Already f64; no rounding needed.
+  return Sub;
+}
+
+SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  SDValue N0 = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Since UINT_TO_FP is marked Custom rather than Legal, the dag combiner
+  // won't optimize it to a SINT_TO_FP when the sign bit is known zero, so
+  // perform that optimization here.
+  if (DAG.SignBitIsZero(N0))
+    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
+
+  EVT SrcVT = N0.getValueType();
+  if (SrcVT == MVT::i64) {
+    // We only handle SSE2 f64 target here; caller can expand the rest.
+    if (Op.getValueType() != MVT::f64 || !X86ScalarSSEf64)
+      return SDValue();
+
+    return LowerUINT_TO_FP_i64(Op, DAG);
+  } else if (SrcVT == MVT::i32 && X86ScalarSSEf64) {
+    return LowerUINT_TO_FP_i32(Op, DAG);
+  }
+
+  assert(SrcVT == MVT::i32 && "Unknown UINT_TO_FP to lower!");
+
+  // Make a 64-bit buffer, and use it to build an FILD: the i32 value goes
+  // at offset 0 and the word at offset 4 is zeroed, so the buffer holds the
+  // zero-extended (hence non-negative) 64-bit value for the signed FILD.
+  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+  SDValue WordOff = DAG.getConstant(4, getPointerTy());
+  SDValue OffsetSlot = DAG.getNode(ISD::ADD, dl,
+                                   getPointerTy(), StackSlot, WordOff);
+  SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
+                                StackSlot, NULL, 0);
+  SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, MVT::i32),
+                                OffsetSlot, NULL, 0);
+  return BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+}
+
+std::pair<SDValue,SDValue> X86TargetLowering::
+FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned) {
+  // Lower FP_TO_SINT/FP_TO_UINT to an FP_TO_INT*_IN_MEM node that stores
+  // the converted integer to a stack slot.  Returns the (chain, slot) pair
+  // the caller loads the result from, or a pair of null SDValues when the
+  // node is really Legal and should be left untouched.
+  DebugLoc dl = Op.getDebugLoc();
+
+  EVT DstTy = Op.getValueType();
+
+  // Unsigned i32 results are computed by converting to i64 instead; the
+  // caller then loads only the low 32 bits from the slot.
+  if (!IsSigned) {
+    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
+    DstTy = MVT::i64;
+  }
+
+  assert(DstTy.getSimpleVT() <= MVT::i64 &&
+         DstTy.getSimpleVT() >= MVT::i16 &&
+         "Unknown FP_TO_SINT to lower!");
+
+  // These are really Legal.
+  if (DstTy == MVT::i32 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+  if (Subtarget->is64Bit() &&
+      DstTy == MVT::i64 &&
+      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
+    return std::make_pair(SDValue(), SDValue());
+
+  // We lower FP->sint64 into FISTP64, followed by a load, all to a temporary
+  // stack slot.
+  MachineFunction &MF = DAG.getMachineFunction();
+  unsigned MemSize = DstTy.getSizeInBits()/8;
+  int SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+
+  // Pick the in-memory conversion node matching the destination width.
+  unsigned Opc;
+  switch (DstTy.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
+  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
+  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
+  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
+  }
+
+  SDValue Chain = DAG.getEntryNode();
+  SDValue Value = Op.getOperand(0);
+  if (isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType())) {
+    // SSE values must first be spilled and reloaded onto the x87 stack via
+    // FLD before the FIST-style node can consume them.
+    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
+    Chain = DAG.getStore(Chain, dl, Value, StackSlot,
+                         PseudoSourceValue::getFixedStack(SSFI), 0);
+    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
+    SDValue Ops[] = {
+      Chain, StackSlot, DAG.getValueType(Op.getOperand(0).getValueType())
+    };
+    Value = DAG.getNode(X86ISD::FLD, dl, Tys, Ops, 3);
+    Chain = Value.getValue(1);
+    // The original slot now holds the FP spill, so allocate a fresh slot
+    // for the integer result.
+    SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
+    StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
+  }
+
+  // Build the FP_TO_INT*_IN_MEM
+  SDValue Ops[] = { Chain, Value, StackSlot };
+  SDValue FIST = DAG.getNode(Opc, dl, MVT::Other, Ops, 3);
+
+  return std::make_pair(FIST, StackSlot);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+
+  // Vector conversions: only v2f64 -> v2i32 is handled (returned as-is);
+  // everything else is left unlowered.
+  if (VT.isVector()) {
+    bool Handled = VT == MVT::v2i32 &&
+                   Op.getOperand(0).getValueType() == MVT::v2f64;
+    return Handled ? Op : SDValue();
+  }
+
+  // Convert through a stack slot via an in-memory FIST-style node.  A null
+  // FIST means the helper decided the node is actually Legal as-is.
+  std::pair<SDValue,SDValue> Conv = FP_TO_INTHelper(Op, DAG, true);
+  SDValue FIST = Conv.first;
+  SDValue Slot = Conv.second;
+  if (!FIST.getNode())
+    return Op;
+
+  // Reload the converted integer from the stack slot.
+  return DAG.getLoad(VT, Op.getDebugLoc(), FIST, Slot, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) {
+  // Reuse the shared helper in unsigned mode; unlike the signed case it is
+  // never allowed to bail out here.
+  std::pair<SDValue,SDValue> Conv = FP_TO_INTHelper(Op, DAG, false);
+  SDValue FIST = Conv.first;
+  SDValue Slot = Conv.second;
+  assert(FIST.getNode() && "Unexpected failure");
+
+  // Reload the converted integer from the temporary stack slot.
+  DebugLoc dl = Op.getDebugLoc();
+  return DAG.getLoad(Op.getValueType(), dl, FIST, Slot, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  // For vectors the mask is keyed off the element type.
+  EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+
+  // Build a 16-byte constant whose every element clears only the sign bit.
+  Constant *MaskElt;
+  unsigned NumElts;
+  if (EltVT == MVT::f64) {
+    MaskElt = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+    NumElts = 2;
+  } else {
+    MaskElt = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+    NumElts = 4;
+  }
+  std::vector<Constant*> CV(NumElts, MaskElt);
+  SDValue CPIdx = DAG.getConstantPool(ConstantVector::get(CV),
+                                      getPointerTy(), 16);
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             PseudoSourceValue::getConstantPool(), 0,
+                             false, 16);
+  // fabs == clear the sign bit with an FP AND.
+  return DAG.getNode(X86ISD::FAND, dl, VT, Op.getOperand(0), Mask);
+}
+
+SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) {
+  LLVMContext *Context = DAG.getContext();
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+
+  // Build a 16-byte constant whose every element has only the sign bit set.
+  Constant *SignElt;
+  unsigned NumElts;
+  if (EltVT == MVT::f64) {
+    SignElt = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
+    NumElts = 2;
+  } else {
+    SignElt = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
+    NumElts = 4;
+  }
+  std::vector<Constant*> CV(NumElts, SignElt);
+  SDValue CPIdx = DAG.getConstantPool(ConstantVector::get(CV),
+                                      getPointerTy(), 16);
+  SDValue Mask = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                             PseudoSourceValue::getConstantPool(), 0,
+                             false, 16);
+
+  // fneg == flip the sign bit with an XOR.  Scalars use X86ISD::FXOR
+  // directly; vectors are punned through v2i64 for a plain integer XOR.
+  if (!VT.isVector())
+    return DAG.getNode(X86ISD::FXOR, dl, VT, Op.getOperand(0), Mask);
+  return DAG.getNode(ISD::BIT_CONVERT, dl, VT,
+                     DAG.getNode(ISD::XOR, dl, MVT::v2i64,
+                                 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+                                             Op.getOperand(0)),
+                                 DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v2i64,
+                                             Mask)));
+}
+
+SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
+  // Lower FCOPYSIGN: combine the magnitude of Op0 with the sign of Op1
+  // using constant-pool sign masks and the FP logic nodes FAND/FOR.
+  LLVMContext *Context = DAG.getContext();
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  DebugLoc dl = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+  EVT SrcVT = Op1.getValueType();
+
+  // If second operand is smaller, extend it first.
+  if (SrcVT.bitsLT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_EXTEND, dl, VT, Op1);
+    SrcVT = VT;
+  }
+  // And if it is bigger, shrink it first.  (FP_ROUND's trailing 1 operand
+  // asserts the value is unchanged by the truncation.)
+  if (SrcVT.bitsGT(VT)) {
+    Op1 = DAG.getNode(ISD::FP_ROUND, dl, VT, Op1, DAG.getIntPtrConstant(1));
+    SrcVT = VT;
+  }
+
+  // At this point the operands and the result should have the same
+  // type, and that won't be f80 since that is not custom lowered.
+
+  // First get the sign bit of second operand.
+  std::vector<Constant*> CV;
+  if (SrcVT == MVT::f64) {
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
+  } else {
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+  }
+  Constant *C = ConstantVector::get(CV);
+  SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Mask1 = DAG.getLoad(SrcVT, dl, DAG.getEntryNode(), CPIdx,
+                                PseudoSourceValue::getConstantPool(), 0,
+                                false, 16);
+  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, SrcVT, Op1, Mask1);
+
+  // Shift sign bit right or left if the two operands have different types.
+  // NOTE(review): SrcVT was normalized to VT above, so this branch appears
+  // unreachable — confirm before relying on it.
+  if (SrcVT.bitsGT(VT)) {
+    // Op0 is MVT::f32, Op1 is MVT::f64.
+    SignBit = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, SignBit);
+    SignBit = DAG.getNode(X86ISD::FSRL, dl, MVT::v2f64, SignBit,
+                          DAG.getConstant(32, MVT::i32));
+    SignBit = DAG.getNode(ISD::BIT_CONVERT, dl, MVT::v4f32, SignBit);
+    SignBit = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, SignBit,
+                          DAG.getIntPtrConstant(0));
+  }
+
+  // Clear first operand sign bit.
+  CV.clear();
+  if (VT == MVT::f64) {
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
+  } else {
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+    CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+  }
+  C = ConstantVector::get(CV);
+  CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
+  SDValue Mask2 = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
+                                PseudoSourceValue::getConstantPool(), 0,
+                                false, 16);
+  SDValue Val = DAG.getNode(X86ISD::FAND, dl, VT, Op0, Mask2);
+
+  // Or the value with the sign bit.
+  return DAG.getNode(X86ISD::FOR, dl, VT, Val, SignBit);
+}
+
+/// Emit nodes that will be selected as "test Op0,Op0", or something
+/// equivalent.  Prefers reusing EFLAGS already produced by Op's defining
+/// arithmetic node over emitting a separate TEST/CMP.
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+                                    SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+
+  // CF and OF aren't always set the way we want. Determine which
+  // of these we need.
+  bool NeedCF = false;
+  bool NeedOF = false;
+  switch (X86CC) {
+  case X86::COND_A: case X86::COND_AE:
+  case X86::COND_B: case X86::COND_BE:
+    NeedCF = true;
+    break;
+  case X86::COND_G: case X86::COND_GE:
+  case X86::COND_L: case X86::COND_LE:
+  case X86::COND_O: case X86::COND_NO:
+    NeedOF = true;
+    break;
+  default: break;
+  }
+
+  // See if we can use the EFLAGS value from the operand instead of
+  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
+  // we prove that the arithmetic won't overflow, we can't use OF or CF.
+  if (Op.getResNo() == 0 && !NeedOF && !NeedCF) {
+    unsigned Opcode = 0;
+    unsigned NumOperands = 0;
+    switch (Op.getNode()->getOpcode()) {
+    case ISD::ADD:
+      // Due to an isel shortcoming, be conservative if this add is likely to
+      // be selected as part of a load-modify-store instruction. When the root
+      // node in a match is a store, isel doesn't know how to remap non-chain
+      // non-flag uses of other nodes in the match, such as the ADD in this
+      // case. This leads to the ADD being left around and reselected, with
+      // the result being two adds in the output.
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI)
+        if (UI->getOpcode() == ISD::STORE)
+          goto default_case;
+      if (ConstantSDNode *C =
+            dyn_cast<ConstantSDNode>(Op.getNode()->getOperand(1))) {
+        // An add of one will be selected as an INC.
+        if (C->getAPIntValue() == 1) {
+          Opcode = X86ISD::INC;
+          NumOperands = 1;
+          break;
+        }
+        // An add of negative one (subtract of one) will be selected as a DEC.
+        if (C->getAPIntValue().isAllOnesValue()) {
+          Opcode = X86ISD::DEC;
+          NumOperands = 1;
+          break;
+        }
+      }
+      // Otherwise use a regular EFLAGS-setting add.
+      Opcode = X86ISD::ADD;
+      NumOperands = 2;
+      break;
+    case ISD::AND: {
+      // If the primary result of the AND isn't used, don't bother using
+      // X86ISD::AND, because a TEST instruction will be better.
+      bool NonFlagUse = false;
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+             UE = Op.getNode()->use_end(); UI != UE; ++UI) {
+        SDNode *User = *UI;
+        unsigned UOpNo = UI.getOperandNo();
+        if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+          // Look pass truncate.
+          UOpNo = User->use_begin().getOperandNo();
+          User = *User->use_begin();
+        }
+        if (User->getOpcode() != ISD::BRCOND &&
+            User->getOpcode() != ISD::SETCC &&
+            (User->getOpcode() != ISD::SELECT || UOpNo != 0)) {
+          NonFlagUse = true;
+          break;
+        }
+      }
+      // Only flag-style uses found: emit the plain TEST below.
+      if (!NonFlagUse)
+        break;
+    }
+    // FALL THROUGH
+    case ISD::SUB:
+    case ISD::OR:
+    case ISD::XOR:
+      // Due to the ISEL shortcoming noted above, be conservative if this op is
+      // likely to be selected as part of a load-modify-store instruction.
+      for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
+           UE = Op.getNode()->use_end(); UI != UE; ++UI)
+        if (UI->getOpcode() == ISD::STORE)
+          goto default_case;
+      // Otherwise use a regular EFLAGS-setting instruction.
+      switch (Op.getNode()->getOpcode()) {
+      case ISD::SUB: Opcode = X86ISD::SUB; break;
+      case ISD::OR:  Opcode = X86ISD::OR;  break;
+      case ISD::XOR: Opcode = X86ISD::XOR; break;
+      case ISD::AND: Opcode = X86ISD::AND; break;
+      default: llvm_unreachable("unexpected operator!");
+      }
+      NumOperands = 2;
+      break;
+    case X86ISD::ADD:
+    case X86ISD::SUB:
+    case X86ISD::INC:
+    case X86ISD::DEC:
+    case X86ISD::OR:
+    case X86ISD::XOR:
+    case X86ISD::AND:
+      // These X86 nodes already produce EFLAGS as result 1; reuse it.
+      return SDValue(Op.getNode(), 1);
+    default:
+    default_case:
+      break;
+    }
+    // Rebuild the node with the chosen EFLAGS-producing opcode, replace all
+    // uses of the old node, and hand back the flag result (result 1).
+    if (Opcode != 0) {
+      SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+      SmallVector<SDValue, 4> Ops;
+      for (unsigned i = 0; i != NumOperands; ++i)
+        Ops.push_back(Op.getOperand(i));
+      SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+      DAG.ReplaceAllUsesWith(Op, New);
+      return SDValue(New.getNode(), 1);
+    }
+  }
+
+  // Otherwise just emit a CMP with 0, which is the TEST pattern.
+  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
+                     DAG.getConstant(0, Op.getValueType()));
+}
+
+/// Emit nodes that will be selected as "cmp Op0,Op1", or something
+/// equivalent.
+SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                                   SelectionDAG &DAG) {
+  // Comparing against constant zero is really a TEST; let EmitTest pick the
+  // cheapest flag-producing form for that case.
+  ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op1);
+  if (RHSC && RHSC->getAPIntValue() == 0)
+    return EmitTest(Op0, X86CC, DAG);
+
+  // Otherwise emit an explicit CMP producing EFLAGS (modeled as i32).
+  DebugLoc dl = Op0.getDebugLoc();
+  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+}
+
+/// LowerToBT - Result of 'and' is compared against zero. Turn it into a BT
+/// node if it's possible.  Three shapes of the AND are recognized:
+///   (and X, (shl 1, N)), (and (shl 1, N), X), and (and (srl X, N), 1);
+/// each is a single-bit test of bit N of X.
+static SDValue LowerToBT(SDValue Op0, ISD::CondCode CC,
+                         DebugLoc dl, SelectionDAG &DAG) {
+  // BitBase is the value being tested; BitIdx is the bit number.
+  SDValue BitBase, BitIdx;
+
+  SDValue A0 = Op0.getOperand(0);
+  SDValue A1 = Op0.getOperand(1);
+  if (A1.getOpcode() == ISD::SHL) {
+    // (and X, (shl 1, N)) -> BT(X, N)
+    ConstantSDNode *ShlLHS = dyn_cast<ConstantSDNode>(A1.getOperand(0));
+    if (ShlLHS && ShlLHS->getZExtValue() == 1) {
+      BitBase = A0;
+      BitIdx = A1.getOperand(1);
+    }
+  } else if (A0.getOpcode() == ISD::SHL) {
+    // (and (shl 1, N), X) -> BT(X, N)
+    ConstantSDNode *ShlLHS = dyn_cast<ConstantSDNode>(A0.getOperand(0));
+    if (ShlLHS && ShlLHS->getZExtValue() == 1) {
+      BitBase = A1;
+      BitIdx = A0.getOperand(1);
+    }
+  } else if (A1.getOpcode() == ISD::Constant) {
+    // (and (srl X, N), 1) -> BT(X, N)
+    ConstantSDNode *AndRHS = cast<ConstantSDNode>(A1);
+    if (AndRHS->getZExtValue() == 1 && A0.getOpcode() == ISD::SRL) {
+      BitBase = A0.getOperand(0);
+      BitIdx = A0.getOperand(1);
+    }
+  }
+
+  if (!BitBase.getNode())
+    return SDValue();
+
+  // If LHS is i8, promote it to i16 with any_extend.  There is no i8 BT
+  // instruction.  Since the shift amount is in-range-or-undefined, we know
+  // that doing a bittest on the i16 value is ok.  We extend to i32 because
+  // the encoding for the i16 version is larger than the i32 version.
+  if (BitBase.getValueType() == MVT::i8)
+    BitBase = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BitBase);
+
+  // If the operand types disagree, extend the shift amount to match.  Since
+  // BT ignores high bits (like shifts) we can use anyextend.
+  if (BitBase.getValueType() != BitIdx.getValueType())
+    BitIdx = DAG.getNode(ISD::ANY_EXTEND, dl, BitBase.getValueType(), BitIdx);
+
+  // BT sets CF to the tested bit: != 0 maps to COND_B, == 0 to COND_AE.
+  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, BitBase, BitIdx);
+  unsigned Cond = (CC == ISD::SETEQ) ? X86::COND_AE : X86::COND_B;
+  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                     DAG.getConstant(Cond, MVT::i8), BT);
+}
+
+/// LowerSETCC - Lower a scalar ISD::SETCC (i8 result) to an X86ISD::SETCC
+/// over a flags-producing node, trying a BT pattern first.
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+  // Optimize to BT if possible.
+  // Lower (X & (1 << N)) == 0 to BT(X, N).
+  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+  bool IsZeroCmp = Op1.getOpcode() == ISD::Constant &&
+                   cast<ConstantSDNode>(Op1)->getZExtValue() == 0;
+  if (IsZeroCmp && Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    SDValue BTSetCC = LowerToBT(Op0, CC, dl, DAG);
+    if (BTSetCC.getNode())
+      return BTSetCC;
+  }
+
+  bool isFP = Op1.getValueType().isFloatingPoint();
+  unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+  if (X86CC == X86::COND_INVALID)
+    return SDValue();
+
+  SDValue Flags = EmitCmp(Op0, Op1, X86CC, DAG);
+
+  // Use sbb x, x to materialize carry bit into a GPR.
+  if (X86CC == X86::COND_B) {
+    SDValue Carry = DAG.getNode(X86ISD::SETCC_CARRY, dl, MVT::i8,
+                                DAG.getConstant(X86CC, MVT::i8), Flags);
+    return DAG.getNode(ISD::AND, dl, MVT::i8, Carry,
+                       DAG.getConstant(1, MVT::i8));
+  }
+
+  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                     DAG.getConstant(X86CC, MVT::i8), Flags);
+}
+
+/// LowerVSETCC - Lower a vector comparison.  FP vectors become CMPPS/CMPPD
+/// with a predicate immediate (two compares ORed/ANDed for SETUEQ/SETONE);
+/// integer vectors are built from PCMPEQ/PCMPGT, with operand swaps,
+/// sign-bit flips (SSE has no unsigned compares) and a final logical-not
+/// as needed.  Fix: removed a dead local (`SDValue Cond;`) that was
+/// declared but never used, and annotated the intentional fallthroughs.
+SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0);
+  SDValue Op1 = Op.getOperand(1);
+  SDValue CC = Op.getOperand(2);
+  EVT VT = Op.getValueType();
+  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
+  bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (isFP) {
+    // SSECC is the CMPPS/CMPPD predicate immediate; 8 means "no single
+    // predicate matches" (handled specially below).
+    unsigned SSECC = 8;
+    EVT VT0 = Op0.getValueType();
+    assert(VT0 == MVT::v4f32 || VT0 == MVT::v2f64);
+    unsigned Opc = VT0 == MVT::v4f32 ? X86ISD::CMPPS : X86ISD::CMPPD;
+    bool Swap = false;
+
+    switch (SetCCOpcode) {
+    default: break;
+    case ISD::SETOEQ:
+    case ISD::SETEQ:  SSECC = 0; break;
+    case ISD::SETOGT:
+    case ISD::SETGT: Swap = true; // Fallthrough
+    case ISD::SETLT:
+    case ISD::SETOLT: SSECC = 1; break;
+    case ISD::SETOGE:
+    case ISD::SETGE: Swap = true; // Fallthrough
+    case ISD::SETLE:
+    case ISD::SETOLE: SSECC = 2; break;
+    case ISD::SETUO:  SSECC = 3; break;
+    case ISD::SETUNE:
+    case ISD::SETNE:  SSECC = 4; break;
+    case ISD::SETULE: Swap = true; // Fallthrough
+    case ISD::SETUGE: SSECC = 5; break;
+    case ISD::SETULT: Swap = true; // Fallthrough
+    case ISD::SETUGT: SSECC = 6; break;
+    case ISD::SETO:   SSECC = 7; break;
+    }
+    if (Swap)
+      std::swap(Op0, Op1);
+
+    // In the two special cases we can't handle, emit two comparisons.
+    if (SSECC == 8) {
+      if (SetCCOpcode == ISD::SETUEQ) {
+        // UEQ == unordered-or-equal: (unord Op0,Op1) | (eq Op0,Op1).
+        SDValue UNORD, EQ;
+        UNORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(3, MVT::i8));
+        EQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(0, MVT::i8));
+        return DAG.getNode(ISD::OR, dl, VT, UNORD, EQ);
+      }
+      else if (SetCCOpcode == ISD::SETONE) {
+        // ONE == ordered-and-not-equal: (ord Op0,Op1) & (neq Op0,Op1).
+        SDValue ORD, NEQ;
+        ORD = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(7, MVT::i8));
+        NEQ = DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(4, MVT::i8));
+        return DAG.getNode(ISD::AND, dl, VT, ORD, NEQ);
+      }
+      llvm_unreachable("Illegal FP comparison");
+    }
+    // Handle all other FP comparisons here.
+    return DAG.getNode(Opc, dl, VT, Op0, Op1, DAG.getConstant(SSECC, MVT::i8));
+  }
+
+  // We are handling one of the integer comparisons here.  Since SSE only has
+  // GT and EQ comparisons for integer, swapping operands and multiple
+  // operations may be required for some comparisons.
+  unsigned Opc = 0, EQOpc = 0, GTOpc = 0;
+  bool Swap = false, Invert = false, FlipSigns = false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: break;
+  case MVT::v8i8:
+  case MVT::v16i8: EQOpc = X86ISD::PCMPEQB; GTOpc = X86ISD::PCMPGTB; break;
+  case MVT::v4i16:
+  case MVT::v8i16: EQOpc = X86ISD::PCMPEQW; GTOpc = X86ISD::PCMPGTW; break;
+  case MVT::v2i32:
+  case MVT::v4i32: EQOpc = X86ISD::PCMPEQD; GTOpc = X86ISD::PCMPGTD; break;
+  case MVT::v2i64: EQOpc = X86ISD::PCMPEQQ; GTOpc = X86ISD::PCMPGTQ; break;
+  }
+
+  switch (SetCCOpcode) {
+  default: break;
+  case ISD::SETNE:  Invert = true; // Fallthrough
+  case ISD::SETEQ:  Opc = EQOpc; break;
+  case ISD::SETLT:  Swap = true; // Fallthrough
+  case ISD::SETGT:  Opc = GTOpc; break;
+  case ISD::SETGE:  Swap = true; // Fallthrough
+  case ISD::SETLE:  Opc = GTOpc; Invert = true; break;
+  case ISD::SETULT: Swap = true; // Fallthrough
+  case ISD::SETUGT: Opc = GTOpc; FlipSigns = true; break;
+  case ISD::SETUGE: Swap = true; // Fallthrough
+  case ISD::SETULE: Opc = GTOpc; FlipSigns = true; Invert = true; break;
+  }
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // Since SSE has no unsigned integer comparisons, we need to flip  the sign
+  // bits of the inputs before performing those operations.
+  if (FlipSigns) {
+    EVT EltVT = VT.getVectorElementType();
+    SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
+                                      EltVT);
+    std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
+    SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
+                                    SignBits.size());
+    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
+    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
+  }
+
+  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
+
+  // If the logical-not of the result is required, perform that now.
+  if (Invert)
+    Result = DAG.getNOT(dl, Result, VT);
+
+  return Result;
+}
+
+// isX86LogicalCmp - Return true if opcode is a X86 logical comparison,
+// i.e. a node whose result (or result #1 for the arithmetic nodes) can be
+// used directly as EFLAGS input.
+static bool isX86LogicalCmp(SDValue Op) {
+  unsigned Opc = Op.getNode()->getOpcode();
+
+  // Explicit comparison nodes.
+  switch (Opc) {
+  default: break;
+  case X86ISD::CMP:
+  case X86ISD::COMI:
+  case X86ISD::UCOMI:
+    return true;
+  }
+
+  // Arithmetic nodes expose their flags as result number 1.
+  if (Op.getResNo() != 1)
+    return false;
+  switch (Opc) {
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::SMUL:
+  case X86ISD::UMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// LowerSELECT - Lower ISD::SELECT to X86ISD::CMOV.  Tries to reuse an
+/// existing flags-producing node as the CMOV's condition input; only when
+/// nothing reusable is found does it emit a fresh TEST (addTest path).
+SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) {
+  bool addTest = true;
+  SDValue Cond  = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue CC;
+
+  // Custom-lower a SETCC condition first so the pattern matching below sees
+  // X86ISD::SETCC / SETCC_CARRY nodes.
+  if (Cond.getOpcode() == ISD::SETCC) {
+    SDValue NewCond = LowerSETCC(Cond, DAG);
+    if (NewCond.getNode())
+      Cond = NewCond;
+  }
+
+  // (select (x == 0), -1, 0) -> (sign_bit (x - 1))
+  // Matched as: COND_E setcc of (cmp x, 0) selecting all-ones vs. zero.
+  // Rewritten as (cmp x, 1) + SETCC_CARRY(COND_B): the borrow from x-1 is
+  // set exactly when x == 0, and SETCC_CARRY materializes it as 0/-1.
+  SDValue Op1 = Op.getOperand(1);
+  SDValue Op2 = Op.getOperand(2);
+  if (Cond.getOpcode() == X86ISD::SETCC &&
+      cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue() == X86::COND_E) {
+    SDValue Cmp = Cond.getOperand(1);
+    if (Cmp.getOpcode() == X86ISD::CMP) {
+      ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(Op1);
+      ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
+      ConstantSDNode *RHSC =
+        dyn_cast<ConstantSDNode>(Cmp.getOperand(1).getNode());
+      if (N1C && N1C->isAllOnesValue() &&
+          N2C && N2C->isNullValue() &&
+          RHSC && RHSC->isNullValue()) {
+        SDValue CmpOp0 = Cmp.getOperand(0);
+        Cmp = DAG.getNode(X86ISD::CMP, dl, CmpOp0.getValueType(),
+                          CmpOp0, DAG.getConstant(1, CmpOp0.getValueType()));
+        return DAG.getNode(X86ISD::SETCC_CARRY, dl, Op.getValueType(),
+                           DAG.getConstant(X86::COND_B, MVT::i8), Cmp);
+      }
+    }
+  }
+
+  // Look pass (and (setcc_carry (cmp ...)), 1).
+  // Strip the mask and pattern-match on the SETCC_CARRY node directly.
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1) 
+      Cond = Cond.getOperand(0);
+  }
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  if (Cond.getOpcode() == X86ISD::SETCC ||
+      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    EVT VT = Op.getValueType();
+
+    // x87 (FPStack) CMOV only supports a subset of condition codes; check
+    // via hasFPCMov before reusing the compare.
+    bool IllegalFPCMov = false;
+    if (VT.isFloatingPoint() && !VT.isVector() &&
+        !isScalarFPTypeInSSEReg(VT))  // FPStack?
+      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
+
+    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
+        Opc == X86ISD::BT) { // FIXME
+      Cond = Cmp;
+      addTest = false;
+    }
+  }
+
+  if (addTest) {
+    // Look pass the truncate.
+    if (Cond.getOpcode() == ISD::TRUNCATE)
+      Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+      if (NewSetCC.getNode()) {
+        // Reuse the SETCC's condition-code operand and its BT flags input.
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  // No reusable flags found: emit an explicit test against zero.
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    Cond = EmitTest(Cond, X86::COND_NE, DAG);
+  }
+
+  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
+  // condition is true.
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Flag);
+  SDValue Ops[] = { Op2, Op1, CC, Cond };
+  return DAG.getNode(X86ISD::CMOV, dl, VTs, Ops, array_lengthof(Ops));
+}
+
+// isAndOrOfSetCCs - Return true if node is an ISD::AND or ISD::OR of two
+// X86ISD::SETCC nodes each of which has no other use apart from the
+// AND / OR.  On success Opc is set to the AND/OR opcode.
+static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
+  Opc = Op.getOpcode();
+  if (Opc != ISD::OR && Opc != ISD::AND)
+    return false;
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  return LHS.getOpcode() == X86ISD::SETCC && LHS.hasOneUse() &&
+         RHS.getOpcode() == X86ISD::SETCC && RHS.hasOneUse();
+}
+
+// isXor1OfSetCC - Return true if node is an ISD::XOR of a X86ISD::SETCC and
+// 1 and that the SETCC node has a single use.
+static bool isXor1OfSetCC(SDValue Op) {
+  if (Op.getOpcode() != ISD::XOR)
+    return false;
+  const ConstantSDNode *XorRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!XorRHS || XorRHS->getAPIntValue() != 1)
+    return false;
+  SDValue SetCC = Op.getOperand(0);
+  return SetCC.getOpcode() == X86ISD::SETCC && SetCC.hasOneUse();
+}
+
+/// LowerBRCOND - Lower ISD::BRCOND to X86ISD::BRCOND.  Like LowerSELECT,
+/// tries to reuse an existing flags-producing node as the branch condition;
+/// also recognizes AND/OR/XOR-of-setcc patterns that can be folded into one
+/// or two conditional branches instead of materialized booleans.
+SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
+  bool addTest = true;
+  SDValue Chain = Op.getOperand(0);
+  SDValue Cond  = Op.getOperand(1);
+  SDValue Dest  = Op.getOperand(2);
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue CC;
+
+  // Custom-lower a SETCC condition first so the matching below sees
+  // X86ISD::SETCC / SETCC_CARRY nodes.
+  if (Cond.getOpcode() == ISD::SETCC) {
+    SDValue NewCond = LowerSETCC(Cond, DAG);
+    if (NewCond.getNode())
+      Cond = NewCond;
+  }
+#if 0
+  // FIXME: LowerXALUO doesn't handle these!!
+  else if (Cond.getOpcode() == X86ISD::ADD  ||
+           Cond.getOpcode() == X86ISD::SUB  ||
+           Cond.getOpcode() == X86ISD::SMUL ||
+           Cond.getOpcode() == X86ISD::UMUL)
+    Cond = LowerXALUO(Cond, DAG);
+#endif
+
+  // Look pass (and (setcc_carry (cmp ...)), 1).
+  // Strip the mask and pattern-match on the SETCC_CARRY node directly.
+  if (Cond.getOpcode() == ISD::AND &&
+      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
+    if (C && C->getAPIntValue() == 1) 
+      Cond = Cond.getOperand(0);
+  }
+
+  // If condition flag is set by a X86ISD::CMP, then use it as the condition
+  // setting operand in place of the X86ISD::SETCC.
+  if (Cond.getOpcode() == X86ISD::SETCC ||
+      Cond.getOpcode() == X86ISD::SETCC_CARRY) {
+    CC = Cond.getOperand(0);
+
+    SDValue Cmp = Cond.getOperand(1);
+    unsigned Opc = Cmp.getOpcode();
+    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
+    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
+      Cond = Cmp;
+      addTest = false;
+    } else {
+      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
+      default: break;
+      case X86::COND_O:
+      case X86::COND_B:
+        // These can only come from an arithmetic instruction with overflow,
+        // e.g. SADDO, UADDO.
+        Cond = Cond.getNode()->getOperand(1);
+        addTest = false;
+        break;
+      }
+    }
+  } else {
+    unsigned CondOpc;
+    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
+      // Cmp is the flags input shared (we require) by both SETCC operands.
+      SDValue Cmp = Cond.getOperand(0).getOperand(1);
+      if (CondOpc == ISD::OR) {
+        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
+        // two branches instead of an explicit OR instruction with a
+        // separate test.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp)) {
+          // First branch on the LHS condition; the fallthrough then
+          // branches on the RHS condition via the common epilogue below.
+          CC = Cond.getOperand(0).getOperand(0);
+          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                              Chain, Dest, CC, Cmp);
+          CC = Cond.getOperand(1).getOperand(0);
+          Cond = Cmp;
+          addTest = false;
+        }
+      } else { // ISD::AND
+        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
+        // two branches instead of an explicit AND instruction with a
+        // separate test. However, we only do this if this block doesn't
+        // have a fall-through edge, because this requires an explicit
+        // jmp when the condition is false.
+        if (Cmp == Cond.getOperand(1).getOperand(1) &&
+            isX86LogicalCmp(Cmp) &&
+            Op.getNode()->hasOneUse()) {
+          // Invert the LHS condition so a *failed* LHS test branches away.
+          X86::CondCode CCode =
+            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+          CCode = X86::GetOppositeBranchCondition(CCode);
+          CC = DAG.getConstant(CCode, MVT::i8);
+          SDValue User = SDValue(*Op.getNode()->use_begin(), 0);
+          // Look for an unconditional branch following this conditional branch.
+          // We need this because we need to reverse the successors in order
+          // to implement FCMP_OEQ.
+          if (User.getOpcode() == ISD::BR) {
+            SDValue FalseBB = User.getOperand(1);
+            // Retarget the unconditional branch to Dest and branch to the
+            // old false block when either inverted condition holds.
+            SDValue NewBR =
+              DAG.UpdateNodeOperands(User, User.getOperand(0), Dest);
+            assert(NewBR == User);
+            Dest = FalseBB;
+
+            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                                Chain, Dest, CC, Cmp);
+            // Intentionally shadows the outer CCode: this is the inverted
+            // RHS condition for the second branch.
+            X86::CondCode CCode =
+              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
+            CCode = X86::GetOppositeBranchCondition(CCode);
+            CC = DAG.getConstant(CCode, MVT::i8);
+            Cond = Cmp;
+            addTest = false;
+          }
+        }
+      }
+    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
+      // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
+      // It should be transformed during dag combiner except when the condition
+      // is set by a arithmetics with overflow node.
+      X86::CondCode CCode =
+        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
+      CCode = X86::GetOppositeBranchCondition(CCode);
+      CC = DAG.getConstant(CCode, MVT::i8);
+      Cond = Cond.getOperand(0).getOperand(1);
+      addTest = false;
+    }
+  }
+
+  if (addTest) {
+    // Look pass the truncate.
+    if (Cond.getOpcode() == ISD::TRUNCATE)
+      Cond = Cond.getOperand(0);
+
+    // We know the result of AND is compared against zero. Try to match
+    // it to BT.
+    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { 
+      SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG);
+      if (NewSetCC.getNode()) {
+        CC = NewSetCC.getOperand(0);
+        Cond = NewSetCC.getOperand(1);
+        addTest = false;
+      }
+    }
+  }
+
+  // No reusable flags found: emit an explicit test against zero.
+  if (addTest) {
+    CC = DAG.getConstant(X86::COND_NE, MVT::i8);
+    Cond = EmitTest(Cond, X86::COND_NE, DAG);
+  }
+  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
+                     Chain, Dest, CC, Cond);
+}
+
+
+// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
+// Calls to _alloca is needed to probe the stack when allocating more than 4k
+// bytes in one go. Touching the stack at 4K increments is necessary to ensure
+// that the guard pages used by the OS virtual memory manager are allocated in
+// correct sequence.
+//
+// The requested size is passed in EAX; the adjusted stack pointer is read
+// back after the call and returned together with the output chain.
+SDValue
+X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+                                           SelectionDAG &DAG) {
+  assert(Subtarget->isTargetCygMing() &&
+         "This should be used only on Cygwin/Mingw targets");
+  DebugLoc dl = Op.getDebugLoc();
+
+  // Get the inputs.
+  SDValue Chain = Op.getOperand(0);
+  SDValue Size  = Op.getOperand(1);
+  // FIXME: Ensure alignment here
+
+  SDValue Flag;
+
+  EVT IntPtr = getPointerTy();
+  EVT SPTy = Subtarget->is64Bit() ? MVT::i64 : MVT::i32;
+
+  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true));
+
+  // Pass the allocation size to _alloca in EAX, glued into the call.
+  Chain = DAG.getCopyToReg(Chain, dl, X86::EAX, Size, Flag);
+  Flag = Chain.getValue(1);
+
+  // Call _alloca; EAX and the stack pointer are listed as register uses.
+  SDVTList  NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDValue Ops[] = { Chain,
+                      DAG.getTargetExternalSymbol("_alloca", IntPtr),
+                      DAG.getRegister(X86::EAX, IntPtr),
+                      DAG.getRegister(X86StackPtr, SPTy),
+                      Flag };
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops, 5);
+  Flag = Chain.getValue(1);
+
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getIntPtrConstant(0, true),
+                             DAG.getIntPtrConstant(0, true),
+                             Flag);
+
+  // The allocated block's address is the post-call stack pointer.
+  Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, SPTy).getValue(1);
+
+  // Return { pointer, chain }.
+  SDValue Ops1[2] = { Chain.getValue(0), Chain };
+  return DAG.getMergeValues(Ops1, 2, dl);
+}
+
+/// EmitTargetCodeForMemset - Lower memset to REP STOS when the length is a
+/// small-enough constant and the destination is at least DWORD aligned;
+/// uses a specialized bzero entry point for zeroing when one exists, and
+/// otherwise defers to the generic (libc) lowering by returning SDValue().
+/// Fix: removed the TwoRepStos path -- the flag was initialized to false
+/// and never set, making that branch dead code.
+SDValue
+X86TargetLowering::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                           SDValue Chain,
+                                           SDValue Dst, SDValue Src,
+                                           SDValue Size, unsigned Align,
+                                           const Value *DstSV,
+                                           uint64_t DstSVOff) {
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  // If not DWORD aligned or size is more than the threshold, call the library.
+  // The libc version is likely to be faster for these cases. It can use the
+  // address value and run time information about the CPU.
+  if ((Align & 3) != 0 ||
+      !ConstantSize ||
+      ConstantSize->getZExtValue() >
+        getSubtarget()->getMaxInlineSizeThreshold()) {
+    SDValue InFlag(0, 0);
+
+    // Check to see if there is a specialized entry-point for memory zeroing.
+    ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+
+    if (const char *bzeroEntry =  V &&
+        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+      // Emit a call to bzero(dst, size) instead of memset.
+      EVT IntPtr = getPointerTy();
+      const Type *IntPtrTy = TD->getIntPtrType(*DAG.getContext());
+      TargetLowering::ArgListTy Args;
+      TargetLowering::ArgListEntry Entry;
+      Entry.Node = Dst;
+      Entry.Ty = IntPtrTy;
+      Args.push_back(Entry);
+      Entry.Node = Size;
+      Args.push_back(Entry);
+      std::pair<SDValue,SDValue> CallResult =
+        LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()),
+                    false, false, false, false,
+                    0, CallingConv::C, false, /*isReturnValueUsed=*/false,
+                    DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl,
+                    DAG.GetOrdering(Chain.getNode()));
+      return CallResult.second;
+    }
+
+    // Otherwise have the target-independent code call memset.
+    return SDValue();
+  }
+
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  SDValue InFlag(0, 0);
+  EVT AVT;
+  SDValue Count;
+  ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
+  unsigned BytesLeft = 0;
+  if (ValC) {
+    unsigned ValReg;
+    uint64_t Val = ValC->getZExtValue() & 255;
+
+    // If the value is a constant, then we can potentially use larger sets.
+    // Replicate the byte into a WORD/DWORD/QWORD pattern matching the
+    // alignment, so REP STOS can store wider elements.
+    switch (Align & 3) {
+    case 2:   // WORD aligned
+      AVT = MVT::i16;
+      ValReg = X86::AX;
+      Val = (Val << 8) | Val;
+      break;
+    case 0:  // DWORD aligned
+      AVT = MVT::i32;
+      ValReg = X86::EAX;
+      Val = (Val << 8)  | Val;
+      Val = (Val << 16) | Val;
+      if (Subtarget->is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
+        AVT = MVT::i64;
+        ValReg = X86::RAX;
+        Val = (Val << 32) | Val;
+      }
+      break;
+    default:  // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal);
+      break;
+    }
+
+    // For multi-byte elements, count whole elements and remember the
+    // trailing bytes to be handled after the REP STOS.
+    if (AVT.bitsGT(MVT::i8)) {
+      unsigned UBytes = AVT.getSizeInBits() / 8;
+      Count = DAG.getIntPtrConstant(SizeVal / UBytes);
+      BytesLeft = SizeVal % UBytes;
+    }
+
+    Chain  = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, AVT),
+                              InFlag);
+    InFlag = Chain.getValue(1);
+  } else {
+    // Non-constant byte value: byte-sized REP STOSB with the value in AL.
+    AVT = MVT::i8;
+    Count  = DAG.getIntPtrConstant(SizeVal);
+    Chain  = DAG.getCopyToReg(Chain, dl, X86::AL, Src, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // REP STOS takes the element count in RCX/ECX and the destination in
+  // RDI/EDI; the glue (flag) chain keeps the copies adjacent to the node.
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+                                                              X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+                                                              X86::EDI,
+                            Dst, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+
+  if (BytesLeft) {
+    // Handle the last 1 - 7 bytes with a recursive, smaller memset.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT AddrVT = Dst.getValueType();
+    EVT SizeVT = Size.getValueType();
+
+    Chain = DAG.getMemset(Chain, dl,
+                          DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+                                      DAG.getConstant(Offset, AddrVT)),
+                          Src,
+                          DAG.getConstant(BytesLeft, SizeVT),
+                          Align, DstSV, DstSVOff + Offset);
+  }
+
+  // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
+  return Chain;
+}
+
+/// EmitTargetCodeForMemcpy - Lower memcpy to REP MOVS when the length is a
+/// constant within the subtarget's inline threshold (or AlwaysInline) and
+/// the pointers are at least DWORD aligned; otherwise returns SDValue() to
+/// defer to the generic lowering.
+SDValue
+X86TargetLowering::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                      SDValue Chain, SDValue Dst, SDValue Src,
+                                      SDValue Size, unsigned Align,
+                                      bool AlwaysInline,
+                                      const Value *DstSV, uint64_t DstSVOff,
+                                      const Value *SrcSV, uint64_t SrcSVOff) {
+  // This requires the copy size to be a constant, preferrably
+  // within a subtarget-specific limit.
+  ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+  if (!ConstantSize)
+    return SDValue();
+  uint64_t SizeVal = ConstantSize->getZExtValue();
+  if (!AlwaysInline && SizeVal > getSubtarget()->getMaxInlineSizeThreshold())
+    return SDValue();
+
+  /// If not DWORD aligned, call the library.
+  if ((Align & 3) != 0)
+    return SDValue();
+
+  // DWORD aligned
+  EVT AVT = MVT::i32;
+  if (Subtarget->is64Bit() && ((Align & 0x7) == 0))  // QWORD aligned
+    AVT = MVT::i64;
+
+  // Copy whole elements with REP MOVS; trailing bytes are handled below.
+  unsigned UBytes = AVT.getSizeInBits() / 8;
+  unsigned CountVal = SizeVal / UBytes;
+  SDValue Count = DAG.getIntPtrConstant(CountVal);
+  unsigned BytesLeft = SizeVal % UBytes;
+
+  // REP MOVS takes count in RCX/ECX, destination in RDI/EDI and source in
+  // RSI/ESI; the glue (flag) chain keeps the register copies adjacent.
+  SDValue InFlag(0, 0);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
+                                                              X86::ECX,
+                            Count, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RDI :
+                                                             X86::EDI,
+                            Dst, InFlag);
+  InFlag = Chain.getValue(1);
+  Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RSI :
+                                                              X86::ESI,
+                            Src, InFlag);
+  InFlag = Chain.getValue(1);
+
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
+                                array_lengthof(Ops));
+
+  SmallVector<SDValue, 4> Results;
+  Results.push_back(RepMovs);
+  if (BytesLeft) {
+    // Handle the last 1 - 7 bytes with a recursive, smaller memcpy.
+    unsigned Offset = SizeVal - BytesLeft;
+    EVT DstVT = Dst.getValueType();
+    EVT SrcVT = Src.getValueType();
+    EVT SizeVT = Size.getValueType();
+    Results.push_back(DAG.getMemcpy(Chain, dl,
+                                    DAG.getNode(ISD::ADD, dl, DstVT, Dst,
+                                                DAG.getConstant(Offset, DstVT)),
+                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
+                                                DAG.getConstant(Offset, SrcVT)),
+                                    DAG.getConstant(BytesLeft, SizeVT),
+                                    Align, AlwaysInline,
+                                    DstSV, DstSVOff + Offset,
+                                    SrcSV, SrcSVOff + Offset));
+  }
+
+  // Join the REP MOVS chain and the tail copy chain.
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &Results[0], Results.size());
+}
+
+/// LowerVASTART - Lower ISD::VASTART.  On x86-32 the va_list is a single
+/// pointer; on x86-64 it is the four-field __va_list_tag initialized below.
+SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) {
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (!Subtarget->is64Bit()) {
+    // vastart just stores the address of the VarArgsFrameIndex slot into the
+    // memory location argument.
+    SDValue FR = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+    return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1), SV, 0);
+  }
+
+  // __va_list_tag:
+  //   gp_offset         (0 - 6 * 8)
+  //   fp_offset         (48 - 48 + 8 * 16)
+  //   overflow_arg_area (point to parameters coming in memory).
+  //   reg_save_area
+  // Field offsets written below: 0, 4, 8 and 16 bytes respectively.
+  SmallVector<SDValue, 8> MemOps;
+  SDValue FIN = Op.getOperand(1);
+  // Store gp_offset (i32 at offset 0).
+  SDValue Store = DAG.getStore(Op.getOperand(0), dl,
+                                 DAG.getConstant(VarArgsGPOffset, MVT::i32),
+                                 FIN, SV, 0);
+  MemOps.push_back(Store);
+
+  // Store fp_offset (i32 at offset 4).
+  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(4));
+  Store = DAG.getStore(Op.getOperand(0), dl,
+                       DAG.getConstant(VarArgsFPOffset, MVT::i32),
+                       FIN, SV, 0);
+  MemOps.push_back(Store);
+
+  // Store ptr to overflow_arg_area (pointer at offset 8).
+  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(4));
+  SDValue OVFIN = DAG.getFrameIndex(VarArgsFrameIndex, getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), dl, OVFIN, FIN, SV, 0);
+  MemOps.push_back(Store);
+
+  // Store ptr to reg_save_area (pointer at offset 16).
+  FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                    FIN, DAG.getIntPtrConstant(8));
+  SDValue RSFIN = DAG.getFrameIndex(RegSaveFrameIndex, getPointerTy());
+  Store = DAG.getStore(Op.getOperand(0), dl, RSFIN, FIN, SV, 0);
+  MemOps.push_back(Store);
+  // All four stores are independent; tie them together with a TokenFactor.
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     &MemOps[0], MemOps.size());
+}
+
+/// LowerVAARG - Custom lowering of VAARG for the x86-64 va_list is not
+/// implemented yet; report a fatal error rather than miscompiling.
+/// Fix: dropped three locals (Chain, SrcPtr, SrcSV) that were extracted
+/// from Op but never used before the unconditional error.
+SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }.
+  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_arg!");
+  (void)Op;
+
+  llvm_report_error("VAArgInst is not yet implemented for x86-64!");
+  return SDValue();
+}
+
+SDValue X86TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) {
+  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, so va_copy is just
+  // a 24-byte, 8-byte-aligned memcpy of the struct.
+  assert(Subtarget->is64Bit() && "This code only handles 64-bit va_copy!");
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Chain  = Op.getOperand(0);
+  SDValue DstPtr = Op.getOperand(1);
+  SDValue SrcPtr = Op.getOperand(2);
+  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  return DAG.getMemcpy(Chain, dl, DstPtr, SrcPtr,
+                       DAG.getIntPtrConstant(24), 8, false,
+                       DstSV, 0, SrcSV, 0);
+}
+
/// LowerINTRINSIC_WO_CHAIN - Custom-lower the chainless X86 intrinsics that
/// need DAG surgery: the scalar comi/ucomi comparisons, the SSE4.1 ptest
/// family, and the immediate-count vector shifts whose count operand turns
/// out not to be a constant. Everything else is left for pattern matching.
SDValue
X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
  DebugLoc dl = Op.getDebugLoc();
  // Operand 0 of INTRINSIC_WO_CHAIN is the intrinsic ID constant.
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  switch (IntNo) {
  default: return SDValue();    // Don't custom lower most intrinsics.
  // Comparison intrinsics.
  case Intrinsic::x86_sse_comieq_ss:
  case Intrinsic::x86_sse_comilt_ss:
  case Intrinsic::x86_sse_comile_ss:
  case Intrinsic::x86_sse_comigt_ss:
  case Intrinsic::x86_sse_comige_ss:
  case Intrinsic::x86_sse_comineq_ss:
  case Intrinsic::x86_sse_ucomieq_ss:
  case Intrinsic::x86_sse_ucomilt_ss:
  case Intrinsic::x86_sse_ucomile_ss:
  case Intrinsic::x86_sse_ucomigt_ss:
  case Intrinsic::x86_sse_ucomige_ss:
  case Intrinsic::x86_sse_ucomineq_ss:
  case Intrinsic::x86_sse2_comieq_sd:
  case Intrinsic::x86_sse2_comilt_sd:
  case Intrinsic::x86_sse2_comile_sd:
  case Intrinsic::x86_sse2_comigt_sd:
  case Intrinsic::x86_sse2_comige_sd:
  case Intrinsic::x86_sse2_comineq_sd:
  case Intrinsic::x86_sse2_ucomieq_sd:
  case Intrinsic::x86_sse2_ucomilt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomigt_sd:
  case Intrinsic::x86_sse2_ucomile_sd:
  case Intrinsic::x86_sse2_ucomineq_sd: {
    // Map the intrinsic onto a COMI/UCOMI compare node plus an ISD
    // condition code for the flag test.
    unsigned Opc = 0;
    ISD::CondCode CC = ISD::SETCC_INVALID;
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse_comieq_ss:
    case Intrinsic::x86_sse2_comieq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_comilt_ss:
    case Intrinsic::x86_sse2_comilt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_comile_ss:
    case Intrinsic::x86_sse2_comile_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_comigt_ss:
    case Intrinsic::x86_sse2_comigt_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_comige_ss:
    case Intrinsic::x86_sse2_comige_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_comineq_ss:
    case Intrinsic::x86_sse2_comineq_sd:
      Opc = X86ISD::COMI;
      CC = ISD::SETNE;
      break;
    case Intrinsic::x86_sse_ucomieq_ss:
    case Intrinsic::x86_sse2_ucomieq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETEQ;
      break;
    case Intrinsic::x86_sse_ucomilt_ss:
    case Intrinsic::x86_sse2_ucomilt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLT;
      break;
    case Intrinsic::x86_sse_ucomile_ss:
    case Intrinsic::x86_sse2_ucomile_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETLE;
      break;
    case Intrinsic::x86_sse_ucomigt_ss:
    case Intrinsic::x86_sse2_ucomigt_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGT;
      break;
    case Intrinsic::x86_sse_ucomige_ss:
    case Intrinsic::x86_sse2_ucomige_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETGE;
      break;
    case Intrinsic::x86_sse_ucomineq_ss:
    case Intrinsic::x86_sse2_ucomineq_sd:
      Opc = X86ISD::UCOMI;
      CC = ISD::SETNE;
      break;
    }

    // Emit the compare, then materialize the flag as an i8 SETCC and
    // zero-extend it to the i32 the intrinsic is declared to return.
    // TranslateX86CC may swap LHS/RHS to express the condition.
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned X86CC = TranslateX86CC(CC, true, LHS, RHS, DAG);
    assert(X86CC != X86::COND_INVALID && "Unexpected illegal condition!");
    SDValue Cond = DAG.getNode(Opc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                DAG.getConstant(X86CC, MVT::i8), Cond);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  // ptest intrinsics. The intrinsic these come from are designed to return
  // an integer value, not just an instruction so lower it to the ptest
  // pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:{
    unsigned X86CC = 0;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_sse41_ptestz:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse41_ptestc:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse41_ptestnzc:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    // ptest sets EFLAGS; read the relevant flag back via SETCC and widen
    // to the declared i32 result.
    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    SDValue Test = DAG.getNode(X86ISD::PTEST, dl, MVT::i32, LHS, RHS);
    SDValue CC = DAG.getConstant(X86CC, MVT::i8);
    SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, CC, Test);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  // Fix vector shift instructions where the last operand is a non-immediate
  // i32 value.
  case Intrinsic::x86_sse2_pslli_w:
  case Intrinsic::x86_sse2_pslli_d:
  case Intrinsic::x86_sse2_pslli_q:
  case Intrinsic::x86_sse2_psrli_w:
  case Intrinsic::x86_sse2_psrli_d:
  case Intrinsic::x86_sse2_psrli_q:
  case Intrinsic::x86_sse2_psrai_w:
  case Intrinsic::x86_sse2_psrai_d:
  case Intrinsic::x86_mmx_pslli_w:
  case Intrinsic::x86_mmx_pslli_d:
  case Intrinsic::x86_mmx_pslli_q:
  case Intrinsic::x86_mmx_psrli_w:
  case Intrinsic::x86_mmx_psrli_d:
  case Intrinsic::x86_mmx_psrli_q:
  case Intrinsic::x86_mmx_psrai_w:
  case Intrinsic::x86_mmx_psrai_d: {
    // Constant counts match the immediate-form patterns directly; no
    // custom lowering needed.
    SDValue ShAmt = Op.getOperand(2);
    if (isa<ConstantSDNode>(ShAmt))
      return SDValue();

    // Otherwise rewrite the *_i (immediate) intrinsic as the corresponding
    // register-count intrinsic, with the count placed in a vector.
    unsigned NewIntNo = 0;
    EVT ShAmtVT = MVT::v4i32;
    switch (IntNo) {
    case Intrinsic::x86_sse2_pslli_w:
      NewIntNo = Intrinsic::x86_sse2_psll_w;
      break;
    case Intrinsic::x86_sse2_pslli_d:
      NewIntNo = Intrinsic::x86_sse2_psll_d;
      break;
    case Intrinsic::x86_sse2_pslli_q:
      NewIntNo = Intrinsic::x86_sse2_psll_q;
      break;
    case Intrinsic::x86_sse2_psrli_w:
      NewIntNo = Intrinsic::x86_sse2_psrl_w;
      break;
    case Intrinsic::x86_sse2_psrli_d:
      NewIntNo = Intrinsic::x86_sse2_psrl_d;
      break;
    case Intrinsic::x86_sse2_psrli_q:
      NewIntNo = Intrinsic::x86_sse2_psrl_q;
      break;
    case Intrinsic::x86_sse2_psrai_w:
      NewIntNo = Intrinsic::x86_sse2_psra_w;
      break;
    case Intrinsic::x86_sse2_psrai_d:
      NewIntNo = Intrinsic::x86_sse2_psra_d;
      break;
    default: {
      // Remaining cases are the MMX forms, which use a v2i32 count vector.
      ShAmtVT = MVT::v2i32;
      switch (IntNo) {
      case Intrinsic::x86_mmx_pslli_w:
        NewIntNo = Intrinsic::x86_mmx_psll_w;
        break;
      case Intrinsic::x86_mmx_pslli_d:
        NewIntNo = Intrinsic::x86_mmx_psll_d;
        break;
      case Intrinsic::x86_mmx_pslli_q:
        NewIntNo = Intrinsic::x86_mmx_psll_q;
        break;
      case Intrinsic::x86_mmx_psrli_w:
        NewIntNo = Intrinsic::x86_mmx_psrl_w;
        break;
      case Intrinsic::x86_mmx_psrli_d:
        NewIntNo = Intrinsic::x86_mmx_psrl_d;
        break;
      case Intrinsic::x86_mmx_psrli_q:
        NewIntNo = Intrinsic::x86_mmx_psrl_q;
        break;
      case Intrinsic::x86_mmx_psrai_w:
        NewIntNo = Intrinsic::x86_mmx_psra_w;
        break;
      case Intrinsic::x86_mmx_psrai_d:
        NewIntNo = Intrinsic::x86_mmx_psra_d;
        break;
      default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
      }
      break;
    }
    }

    // The vector shift intrinsics with scalars uses 32b shift amounts but
    // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
    // to be zero.
    SDValue ShOps[4];
    ShOps[0] = ShAmt;
    ShOps[1] = DAG.getConstant(0, MVT::i32);
    if (ShAmtVT == MVT::v4i32) {
      ShOps[2] = DAG.getUNDEF(MVT::i32);
      ShOps[3] = DAG.getUNDEF(MVT::i32);
      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 4);
    } else {
      ShAmt =  DAG.getNode(ISD::BUILD_VECTOR, dl, ShAmtVT, &ShOps[0], 2);
    }

    // Bitcast the count vector to the operand type and re-emit as the
    // register-count intrinsic.
    EVT VT = Op.getValueType();
    ShAmt = DAG.getNode(ISD::BIT_CONVERT, dl, VT, ShAmt);
    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(NewIntNo, MVT::i32),
                       Op.getOperand(1), ShAmt);
  }
  }
}
+
+SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) {
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  DebugLoc dl = Op.getDebugLoc();
+
+  if (Depth > 0) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset =
+      DAG.getConstant(TD->getPointerSize(),
+                      Subtarget->is64Bit() ? MVT::i64 : MVT::i32);
+    return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, dl, getPointerTy(),
+                                   FrameAddr, Offset),
+                       NULL, 0);
+  }
+
+  // Just load the return address.
+  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
+  return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(),
+                     RetAddrFI, NULL, 0);
+}
+
+SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+  EVT VT = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();  // FIXME probably not meaningful
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP;
+  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, NULL, 0);
+  return FrameAddr;
+}
+
+SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
+                                                     SelectionDAG &DAG) {
+  return DAG.getIntPtrConstant(2*TD->getPointerSize());
+}
+
/// LowerEH_RETURN - Lower ISD::EH_RETURN: store the landing-pad handler
/// address into the (offset-adjusted) return-address slot and emit the
/// X86ISD::EH_RETURN pseudo with the store address pinned in ECX/RCX.
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG)
{
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain     = Op.getOperand(0);
  SDValue Offset    = Op.getOperand(1); // stack-adjustment offset
  SDValue Handler   = Op.getOperand(2); // new return address (the handler)
  DebugLoc dl       = Op.getDebugLoc();

  SDValue Frame = DAG.getRegister(Subtarget->is64Bit() ? X86::RBP : X86::EBP,
                                  getPointerTy());
  unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX);

  // The return-address slot is one pointer-size above the frame pointer;
  // SUB of a negative constant is ADD of the pointer size.
  SDValue StoreAddr = DAG.getNode(ISD::SUB, dl, getPointerTy(), Frame,
                                  DAG.getIntPtrConstant(-TD->getPointerSize()));
  StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, NULL, 0);
  // Pass the store address to the pseudo in a fixed register and mark it
  // live-out so it survives to the epilogue.
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
  MF.getRegInfo().addLiveOut(StoreAddrReg);

  return DAG.getNode(X86ISD::EH_RETURN, dl,
                     MVT::Other,
                     Chain, DAG.getRegister(StoreAddrReg, getPointerTy()));
}
+
/// LowerTRAMPOLINE - Materialize a nested-function trampoline by storing
/// raw machine-code bytes into the caller-provided memory: the trampoline
/// loads the 'nest' parameter into a fixed register and jumps to the real
/// function.
SDValue X86TargetLowering::LowerTRAMPOLINE(SDValue Op,
                                             SelectionDAG &DAG) {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  DebugLoc dl  = Op.getDebugLoc();

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();

  if (Subtarget->is64Bit()) {
    // x86-64 layout (offsets into the trampoline):
    //   0: movabsq $FPtr, %r11   (2 opcode bytes + 8-byte immediate)
    //  10: movabsq $Nest, %r10   (2 opcode bytes + 8-byte immediate)
    //  20: jmpq *%r11            (2 opcode bytes + ModRM byte)
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = RegInfo->getX86RegNum(X86::R10);
    const unsigned char N86R11 = RegInfo->getX86RegNum(X86::R11);

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    // The two opcode bytes are stored as a single little-endian i16.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, MVT::i64));
    OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, TrmpAddr, 2, false, 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 10);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, MVT::i64));
    OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 12, false, 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, MVT::i16),
                                Addr, TrmpAddr, 20);

    // ModRM byte selecting r11 as the jump target.
    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, MVT::i8), Addr,
                                TrmpAddr, 22);

    // Merge-values result: the trampoline address plus the token factor of
    // all six byte stores.
    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6) };
    return DAG.getMergeValues(Ops, 2, dl);
  } else {
    // 32-bit layout:
    //   0: mov $Nest, %reg   (1 opcode byte + 4-byte immediate)
    //   5: jmp rel32         (1 opcode byte + 4-byte displacement)
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      const FunctionType *FTy = Func->getFunctionType();
      const AttrListPtr &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.paramHasAttr(Idx, Attribute::InReg))
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;

        if (InRegCount > 2) {
          llvm_report_error("Nest register in use - reduce number of inreg parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    // Disp is the PC-relative displacement for the jmp: target minus the
    // address of the instruction following the 5-byte jmp (offset 10).
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = RegInfo->getX86RegNum(NestReg);
    OutChains[0] = DAG.getStore(Root, dl,
                                DAG.getConstant(MOV32ri|N86Reg, MVT::i8),
                                Trmp, TrmpAddr, 0);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, MVT::i32));
    OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, TrmpAddr, 1, false, 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, MVT::i8), Addr,
                                TrmpAddr, 5, false, 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, MVT::i32));
    OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, TrmpAddr, 6, false, 1);

    SDValue Ops[] =
      { Trmp, DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4) };
    return DAG.getMergeValues(Ops, 2, dl);
  }
}
+
/// LowerFLT_ROUNDS_ - Read the x87 control word from memory and convert its
/// rounding-mode field to the FLT_ROUNDS encoding.
SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetMachine &TM = MF.getTarget();
  const TargetFrameInfo &TFI = *TM.getFrameInfo();
  unsigned StackAlignment = TFI.getStackAlignment();
  EVT VT = Op.getValueType();
  DebugLoc dl = Op.getDebugLoc();

  // Save FP Control Word to stack slot
  // FNSTCW writes a 16-bit value, hence the 2-byte slot.
  int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());

  SDValue Chain = DAG.getNode(X86ISD::FNSTCW16m, dl, MVT::Other,
                              DAG.getEntryNode(), StackSlot);

  // Load FP Control Word from stack slot
  SDValue CWD = DAG.getLoad(MVT::i16, dl, Chain, StackSlot, NULL, 0);

  // Transform as necessary
  // CWD1 = (FPSR & 0x800) >> 11, CWD2 = (FPSR & 0x400) >> 9; or-ing and
  // adding 1 (mod 4) realizes the table above.
  SDValue CWD1 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x800, MVT::i16)),
                DAG.getConstant(11, MVT::i8));
  SDValue CWD2 =
    DAG.getNode(ISD::SRL, dl, MVT::i16,
                DAG.getNode(ISD::AND, dl, MVT::i16,
                            CWD, DAG.getConstant(0x400, MVT::i16)),
                DAG.getConstant(9, MVT::i8));

  SDValue RetVal =
    DAG.getNode(ISD::AND, dl, MVT::i16,
                DAG.getNode(ISD::ADD, dl, MVT::i16,
                            DAG.getNode(ISD::OR, dl, MVT::i16, CWD1, CWD2),
                            DAG.getConstant(1, MVT::i16)),
                DAG.getConstant(3, MVT::i16));


  // The computation is i16; truncate or zero-extend to the requested type.
  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
}
+
+SDValue X86TargetLowering::LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    // Zero extend to i32 since there is not an i8 bsr.
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
+
+  // If src is zero (i.e. bsr sets ZF), returns NumBits.
+  SDValue Ops[] = {
+    Op,
+    DAG.getConstant(NumBits+NumBits-1, OpVT),
+    DAG.getConstant(X86::COND_E, MVT::i8),
+    Op.getValue(1)
+  };
+  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+
+  // Finally xor with NumBits-1.
+  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
+SDValue X86TargetLowering::LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
+  EVT OpVT = VT;
+  unsigned NumBits = VT.getSizeInBits();
+  DebugLoc dl = Op.getDebugLoc();
+
+  Op = Op.getOperand(0);
+  if (VT == MVT::i8) {
+    OpVT = MVT::i32;
+    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
+  }
+
+  // Issue a bsf (scan bits forward) which also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
+  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op);
+
+  // If src is zero (i.e. bsf sets ZF), returns NumBits.
+  SDValue Ops[] = {
+    Op,
+    DAG.getConstant(NumBits, OpVT),
+    DAG.getConstant(X86::COND_E, MVT::i8),
+    Op.getValue(1)
+  };
+  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+
+  if (VT == MVT::i8)
+    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
+  return Op;
+}
+
/// LowerMUL_V2I64 - SSE2 has no v2i64 multiply instruction; synthesize it
/// from three pmuludq (32x32->64) partial products, shifting the two
/// cross terms up by 32 bits before summing.
SDValue X86TargetLowering::LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  assert(VT == MVT::v2i64 && "Only know how to lower V2I64 multiply");
  DebugLoc dl = Op.getDebugLoc();

  //  ulong2 Ahi = __builtin_ia32_psrlqi128( a, 32);
  //  ulong2 Bhi = __builtin_ia32_psrlqi128( b, 32);
  //  ulong2 AloBlo = __builtin_ia32_pmuludq128( a, b );
  //  ulong2 AloBhi = __builtin_ia32_pmuludq128( a, Bhi );
  //  ulong2 AhiBlo = __builtin_ia32_pmuludq128( Ahi, b );
  //
  //  AloBhi = __builtin_ia32_psllqi128( AloBhi, 32 );
  //  AhiBlo = __builtin_ia32_psllqi128( AhiBlo, 32 );
  //  return AloBlo + AloBhi + AhiBlo;

  SDValue A = Op.getOperand(0);
  SDValue B = Op.getOperand(1);

  // High 32-bit halves of each 64-bit lane.
  SDValue Ahi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       A, DAG.getConstant(32, MVT::i32));
  SDValue Bhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                       B, DAG.getConstant(32, MVT::i32));
  // pmuludq multiplies the low 32 bits of each lane into a 64-bit result.
  SDValue AloBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, B);
  SDValue AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       A, Bhi);
  SDValue AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pmulu_dq, MVT::i32),
                       Ahi, B);
  // Shift the cross terms into the high half before accumulating; the
  // Ahi*Bhi term would land entirely above bit 63 and is dropped.
  AloBhi = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AloBhi, DAG.getConstant(32, MVT::i32));
  AhiBlo = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
                       DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                       AhiBlo, DAG.getConstant(32, MVT::i32));
  SDValue Res = DAG.getNode(ISD::ADD, dl, VT, AloBlo, AloBhi);
  Res = DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
  return Res;
}
+
+
+SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
+  // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
+  // looks for this combo and may remove the "setcc" instruction if the "setcc"
+  // has only one use.
+  SDNode *N = Op.getNode();
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  unsigned BaseOp = 0;
+  unsigned Cond = 0;
+  DebugLoc dl = Op.getDebugLoc();
+
+  switch (Op.getOpcode()) {
+  default: llvm_unreachable("Unknown ovf instruction!");
+  case ISD::SADDO:
+    // A subtract of one will be selected as a INC. Note that INC doesn't
+    // set CF, so we can't do this for UADDO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+      if (C->getAPIntValue() == 1) {
+        BaseOp = X86ISD::INC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UADDO:
+    BaseOp = X86ISD::ADD;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SSUBO:
+    // A subtract of one will be selected as a DEC. Note that DEC doesn't
+    // set CF, so we can't do this for USUBO.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+      if (C->getAPIntValue() == 1) {
+        BaseOp = X86ISD::DEC;
+        Cond = X86::COND_O;
+        break;
+      }
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_O;
+    break;
+  case ISD::USUBO:
+    BaseOp = X86ISD::SUB;
+    Cond = X86::COND_B;
+    break;
+  case ISD::SMULO:
+    BaseOp = X86ISD::SMUL;
+    Cond = X86::COND_O;
+    break;
+  case ISD::UMULO:
+    BaseOp = X86ISD::UMUL;
+    Cond = X86::COND_B;
+    break;
+  }
+
+  // Also sets EFLAGS.
+  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
+  SDValue Sum = DAG.getNode(BaseOp, dl, VTs, LHS, RHS);
+
+  SDValue SetCC =
+    DAG.getNode(X86ISD::SETCC, dl, N->getValueType(1),
+                DAG.getConstant(Cond, MVT::i32), SDValue(Sum.getNode(), 1));
+
+  DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SetCC);
+  return Sum;
+}
+
+SDValue X86TargetLowering::LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG) {
+  EVT T = Op.getValueType();
+  DebugLoc dl = Op.getDebugLoc();
+  unsigned Reg = 0;
+  unsigned size = 0;
+  switch(T.getSimpleVT().SimpleTy) {
+  default:
+    assert(false && "Invalid value type!");
+  case MVT::i8:  Reg = X86::AL;  size = 1; break;
+  case MVT::i16: Reg = X86::AX;  size = 2; break;
+  case MVT::i32: Reg = X86::EAX; size = 4; break;
+  case MVT::i64:
+    assert(Subtarget->is64Bit() && "Node not type legal!");
+    Reg = X86::RAX; size = 8;
+    break;
+  }
+  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), dl, Reg,
+                                    Op.getOperand(2), SDValue());
+  SDValue Ops[] = { cpIn.getValue(0),
+                    Op.getOperand(1),
+                    Op.getOperand(3),
+                    DAG.getTargetConstant(size, MVT::i8),
+                    cpIn.getValue(1) };
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDValue Result = DAG.getNode(X86ISD::LCMPXCHG_DAG, dl, Tys, Ops, 5);
+  SDValue cpOut =
+    DAG.getCopyFromReg(Result.getValue(0), dl, Reg, T, Result.getValue(1));
+  return cpOut;
+}
+
+SDValue X86TargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
+                                                 SelectionDAG &DAG) {
+  assert(Subtarget->is64Bit() && "Result not type legalized?");
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SDValue TheChain = Op.getOperand(0);
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
+  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
+  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
+                                   rax.getValue(2));
+  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
+                            DAG.getConstant(32, MVT::i8));
+  SDValue Ops[] = {
+    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
+    rdx.getValue(1)
+  };
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue X86TargetLowering::LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  EVT T = Node->getValueType(0);
+  SDValue negOp = DAG.getNode(ISD::SUB, dl, T,
+                              DAG.getConstant(0, T), Node->getOperand(2));
+  return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, dl,
+                       cast<AtomicSDNode>(Node)->getMemoryVT(),
+                       Node->getOperand(0),
+                       Node->getOperand(1), negOp,
+                       cast<AtomicSDNode>(Node)->getSrcValue(),
+                       cast<AtomicSDNode>(Node)->getAlignment());
+}
+
/// LowerOperation - Provide custom lowering hooks for some operations.
/// Pure dispatch: each opcode marked Custom for this target is forwarded to
/// its dedicated Lower* helper.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op,DAG);
  case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
  case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShift(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::FP_TO_SINT:         return LowerFP_TO_SINT(Op, DAG);
  case ISD::FP_TO_UINT:         return LowerFP_TO_UINT(Op, DAG);
  case ISD::FABS:               return LowerFABS(Op, DAG);
  case ISD::FNEG:               return LowerFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::VSETCC:             return LowerVSETCC(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
                                return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::TRAMPOLINE:         return LowerTRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:               return LowerCTLZ(Op, DAG);
  case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
  // LowerMUL_V2I64 asserts the type is v2i64, so only that MUL reaches here.
  case ISD::MUL:                return LowerMUL_V2I64(Op, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, DAG);
  }
}
+
/// ReplaceATOMIC_BINARY_64 - Expand an i64 atomic binary operation on a
/// 32-bit target: split the i64 operand into lo/hi i32 halves, emit the
/// target-specific NewOp mem-intrinsic node on the pair, and rebuild the
/// i64 result with BUILD_PAIR.
void X86TargetLowering::
ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
                        SelectionDAG &DAG, unsigned NewOp) {
  EVT T = Node->getValueType(0);
  DebugLoc dl = Node->getDebugLoc();
  assert (T == MVT::i64 && "Only know how to expand i64 atomics");

  SDValue Chain = Node->getOperand(0);
  SDValue In1 = Node->getOperand(1);   // the pointer operand
  // Split the i64 value operand into its two i32 halves.
  SDValue In2L = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(0));
  SDValue In2H = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
                             Node->getOperand(2), DAG.getIntPtrConstant(1));
  SDValue Ops[] = { Chain, In1, In2L, In2H };
  SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
  // Carry the original memory operand so alias info/volatility survive.
  SDValue Result =
    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, 4, MVT::i64,
                            cast<MemSDNode>(Node)->getMemOperand());
  // Result 0/1 are the lo/hi halves of the returned value; result 2 is
  // the output chain.
  SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
  Results.push_back(Result.getValue(2));
}
+
/// ReplaceNodeResults - Replace a node with an illegal result type
/// with a new node built out of custom code.  Results is filled with the
/// replacement value(s) for each result of N, in order.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue>&Results,
                                           SelectionDAG &DAG) {
  DebugLoc dl = N->getDebugLoc();
  switch (N->getOpcode()) {
  default:
    assert(false && "Do not know how to custom type legalize this operation!");
    return;
  case ISD::FP_TO_SINT: {
    // Lower via FP_TO_INTHelper: an in-memory conversion through a stack
    // slot, followed by a reload of the integer result.
    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode() != 0) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot, NULL, 0));
    }
    return;
  }
  case ISD::READCYCLECOUNTER: {
    // RDTSC leaves its result in EAX:EDX; read both registers off the
    // chain/flag and merge them into a single i64.
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue TheChain = N->getOperand(0);
    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
                                     rd.getValue(1));
    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
                                     eax.getValue(2));
    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
    SDValue Ops[] = { eax, edx };
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops, 2));
    Results.push_back(edx.getValue(1));
    return;
  }
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM: {
    // Vector div/rem: unroll into scalar operations, one per element of
    // the widened type.
    EVT WidenVT = getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
    Results.push_back(DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements()));
    return;
  }
  case ISD::ATOMIC_CMP_SWAP: {
    // Expand i64 cmpxchg via LCMPXCHG8B: comparand in EAX:EDX, new value
    // in EBX:ECX, result read back from EAX:EDX.  The CopyToReg/CopyFromReg
    // nodes are chained (and glued via the extra value) so they stay in
    // order around the cmpxchg.
    EVT T = N->getValueType(0);
    assert (T == MVT::i64 && "Only know how to expand i64 Cmp and Swap");
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(0, MVT::i32));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(2),
                        DAG.getConstant(1, MVT::i32));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl, X86::EAX, cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl, X86::EDX, cpInH,
                             cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(0, MVT::i32));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, N->getOperand(3),
                          DAG.getConstant(1, MVT::i32));
    swapInL = DAG.getCopyToReg(cpInH.getValue(0), dl, X86::EBX, swapInL,
                               cpInH.getValue(1));
    swapInH = DAG.getCopyToReg(swapInL.getValue(0), dl, X86::ECX, swapInH,
                               swapInL.getValue(1));
    SDValue Ops[] = { swapInH.getValue(0),
                      N->getOperand(1),
                      swapInH.getValue(1) };
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Flag);
    SDValue Result = DAG.getNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, 3);
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, X86::EAX,
                                        MVT::i32, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, X86::EDX,
                                        MVT::i32, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
    Results.push_back(cpOutH.getValue(1));
    return;
  }
  // The remaining 64-bit atomics all share the ReplaceATOMIC_BINARY_64
  // expansion; only the target opcode differs.
  case ISD::ATOMIC_LOAD_ADD:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMADD64_DAG);
    return;
  case ISD::ATOMIC_LOAD_AND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_NAND:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMNAND64_DAG);
    return;
  case ISD::ATOMIC_LOAD_OR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMOR64_DAG);
    return;
  case ISD::ATOMIC_LOAD_SUB:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSUB64_DAG);
    return;
  case ISD::ATOMIC_LOAD_XOR:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMXOR64_DAG);
    return;
  case ISD::ATOMIC_SWAP:
    ReplaceATOMIC_BINARY_64(N, Results, DAG, X86ISD::ATOMSWAP64_DAG);
    return;
  }
}
+
/// getTargetNodeName - Map an X86ISD opcode to its human-readable name for
/// debug printing.  Returns NULL for opcodes without an entry here.
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch (Opcode) {
  default: return NULL;
  case X86ISD::BSF:                return "X86ISD::BSF";
  case X86ISD::BSR:                return "X86ISD::BSR";
  case X86ISD::SHLD:               return "X86ISD::SHLD";
  case X86ISD::SHRD:               return "X86ISD::SHRD";
  case X86ISD::FAND:               return "X86ISD::FAND";
  case X86ISD::FOR:                return "X86ISD::FOR";
  case X86ISD::FXOR:               return "X86ISD::FXOR";
  case X86ISD::FSRL:               return "X86ISD::FSRL";
  case X86ISD::FILD:               return "X86ISD::FILD";
  case X86ISD::FILD_FLAG:          return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD:                return "X86ISD::FLD";
  case X86ISD::FST:                return "X86ISD::FST";
  case X86ISD::CALL:               return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG:          return "X86ISD::RDTSC_DAG";
  case X86ISD::BT:                 return "X86ISD::BT";
  case X86ISD::CMP:                return "X86ISD::CMP";
  case X86ISD::COMI:               return "X86ISD::COMI";
  case X86ISD::UCOMI:              return "X86ISD::UCOMI";
  case X86ISD::SETCC:              return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY:        return "X86ISD::SETCC_CARRY";
  case X86ISD::CMOV:               return "X86ISD::CMOV";
  case X86ISD::BRCOND:             return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG:           return "X86ISD::RET_FLAG";
  case X86ISD::REP_STOS:           return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS:           return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg:      return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper:            return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP:         return "X86ISD::WrapperRIP";
  case X86ISD::PEXTRB:             return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW:             return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
  case X86ISD::PINSRB:             return "X86ISD::PINSRB";
  case X86ISD::PINSRW:             return "X86ISD::PINSRW";
  case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
  case X86ISD::FMAX:               return "X86ISD::FMAX";
  case X86ISD::FMIN:               return "X86ISD::FMIN";
  case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
  case X86ISD::FRCP:               return "X86ISD::FRCP";
  case X86ISD::TLSADDR:            return "X86ISD::TLSADDR";
  case X86ISD::SegmentBaseAddress: return "X86ISD::SegmentBaseAddress";
  case X86ISD::EH_RETURN:          return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN:          return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m:          return "X86ISD::FNSTCW16m";
  case X86ISD::LCMPXCHG_DAG:       return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG:      return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::ATOMADD64_DAG:      return "X86ISD::ATOMADD64_DAG";
  case X86ISD::ATOMSUB64_DAG:      return "X86ISD::ATOMSUB64_DAG";
  case X86ISD::ATOMOR64_DAG:       return "X86ISD::ATOMOR64_DAG";
  case X86ISD::ATOMXOR64_DAG:      return "X86ISD::ATOMXOR64_DAG";
  case X86ISD::ATOMAND64_DAG:      return "X86ISD::ATOMAND64_DAG";
  case X86ISD::ATOMNAND64_DAG:     return "X86ISD::ATOMNAND64_DAG";
  case X86ISD::VZEXT_MOVL:         return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD:         return "X86ISD::VZEXT_LOAD";
  case X86ISD::VSHL:               return "X86ISD::VSHL";
  case X86ISD::VSRL:               return "X86ISD::VSRL";
  case X86ISD::CMPPD:              return "X86ISD::CMPPD";
  case X86ISD::CMPPS:              return "X86ISD::CMPPS";
  case X86ISD::PCMPEQB:            return "X86ISD::PCMPEQB";
  case X86ISD::PCMPEQW:            return "X86ISD::PCMPEQW";
  case X86ISD::PCMPEQD:            return "X86ISD::PCMPEQD";
  case X86ISD::PCMPEQQ:            return "X86ISD::PCMPEQQ";
  case X86ISD::PCMPGTB:            return "X86ISD::PCMPGTB";
  case X86ISD::PCMPGTW:            return "X86ISD::PCMPGTW";
  case X86ISD::PCMPGTD:            return "X86ISD::PCMPGTD";
  case X86ISD::PCMPGTQ:            return "X86ISD::PCMPGTQ";
  case X86ISD::ADD:                return "X86ISD::ADD";
  case X86ISD::SUB:                return "X86ISD::SUB";
  case X86ISD::SMUL:               return "X86ISD::SMUL";
  case X86ISD::UMUL:               return "X86ISD::UMUL";
  case X86ISD::INC:                return "X86ISD::INC";
  case X86ISD::DEC:                return "X86ISD::DEC";
  case X86ISD::OR:                 return "X86ISD::OR";
  case X86ISD::XOR:                return "X86ISD::XOR";
  case X86ISD::AND:                return "X86ISD::AND";
  case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
  case X86ISD::PTEST:              return "X86ISD::PTEST";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  }
}
+
+// isLegalAddressingMode - Return true if the addressing mode represented
+// by AM is legal for this target, for a load/store of the specified type.
+bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                              const Type *Ty) const {
+  // X86 supports extremely general addressing modes.
+  CodeModel::Model M = getTargetMachine().getCodeModel();
+
+  // X86 allows a sign-extended 32-bit immediate field as a displacement.
+  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+    return false;
+
+  if (AM.BaseGV) {
+    unsigned GVFlags =
+      Subtarget->ClassifyGlobalReference(AM.BaseGV, getTargetMachine());
+
+    // If a reference to this global requires an extra load, we can't fold it.
+    if (isGlobalStubReference(GVFlags))
+      return false;
+
+    // If BaseGV requires a register for the PIC base, we cannot also have a
+    // BaseReg specified.
+    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
+      return false;
+
+    // If lower 4G is not available, then we must use rip-relative addressing.
+    if (Subtarget->is64Bit() && (AM.BaseOffs || AM.Scale > 1))
+      return false;
+  }
+
+  switch (AM.Scale) {
+  case 0:
+  case 1:
+  case 2:
+  case 4:
+  case 8:
+    // These scales always work.
+    break;
+  case 3:
+  case 5:
+  case 9:
+    // These scales are formed with basereg+scalereg.  Only accept if there is
+    // no basereg yet.
+    if (AM.HasBaseReg)
+      return false;
+    break;
+  default:  // Other stuff never works.
+    return false;
+  }
+
+  return true;
+}
+
+
+bool X86TargetLowering::isTruncateFree(const Type *Ty1, const Type *Ty2) const {
+  if (!Ty1->isInteger() || !Ty2->isInteger())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  if (NumBits1 <= NumBits2)
+    return false;
+  return Subtarget->is64Bit() || NumBits1 < 64;
+}
+
+bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  if (NumBits1 <= NumBits2)
+    return false;
+  return Subtarget->is64Bit() || NumBits1 < 64;
+}
+
+bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return Ty1->isInteger(32) && Ty2->isInteger(64) && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
+  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
+  // i16 instructions are longer (0x66 prefix) and potentially slower.
+  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
+}
+
+/// isShuffleMaskLegal - Targets can use this to indicate that they only
+/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
+/// are assumed to be legal.
+bool
+X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                      EVT VT) const {
+  // Only do shuffles on 128-bit vector types for now.
+  if (VT.getSizeInBits() == 64)
+    return false;
+
+  // FIXME: pshufb, blends, shifts.
+  return (VT.getVectorNumElements() == 2 ||
+          ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
+          isMOVLMask(M, VT) ||
+          isSHUFPMask(M, VT) ||
+          isPSHUFDMask(M, VT) ||
+          isPSHUFHWMask(M, VT) ||
+          isPSHUFLWMask(M, VT) ||
+          isPALIGNRMask(M, VT, Subtarget->hasSSSE3()) ||
+          isUNPCKLMask(M, VT) ||
+          isUNPCKHMask(M, VT) ||
+          isUNPCKL_v_undef_Mask(M, VT) ||
+          isUNPCKH_v_undef_Mask(M, VT));
+}
+
+bool
+X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                          EVT VT) const {
+  unsigned NumElts = VT.getVectorNumElements();
+  // FIXME: This collection of masks seems suspect.
+  if (NumElts == 2)
+    return true;
+  if (NumElts == 4 && VT.getSizeInBits() == 128) {
+    return (isMOVLMask(Mask, VT)  ||
+            isCommutedMOVLMask(Mask, VT, true) ||
+            isSHUFPMask(Mask, VT) ||
+            isCommutedSHUFPMask(Mask, VT));
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Scheduler Hooks
+//===----------------------------------------------------------------------===//
+
// private utility function
//
// Expand a pseudo atomic bitwise instruction into a load / op / lock-cmpxchg
// retry loop.  regOpc/immOpc are the opcodes for the bitwise op with a
// register or immediate source; LoadOpc/CXchgOpc/copyOpc/notOpc are the
// load, lock-cmpxchg, reg-reg copy and complement opcodes to use; EAXreg is
// the accumulator register implicitly used by the cmpxchg; RC is the
// register class for the temporaries; if invSrc is set, the loaded value is
// complemented (via notOpc) before the bitwise op.  Returns the block
// control should continue in (nextMBB).
MachineBasicBlock *
X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpc,
                                                       unsigned immOpc,
                                                       unsigned LoadOpc,
                                                       unsigned CXchgOpc,
                                                       unsigned copyOpc,
                                                       unsigned notOpc,
                                                       unsigned EAXreg,
                                                       TargetRegisterClass *RC,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB:
  //   newMBB:
  //     ld  t1 = [bitinstr.addr]
  //     op  t2 = t1, [bitinstr.val]
  //     mov EAX = t1
  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
  //     bz  newMBB
  //     fallthrough -->nextMBB
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Move all successors to thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and fall through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  // Insert instructions into newMBB based on incoming instruction
  assert(bInstr->getNumOperands() < X86AddrNumOperands + 4 &&
         "unexpected number of operands");
  DebugLoc dl = bInstr->getDebugLoc();
  MachineOperand& destOper = bInstr->getOperand(0);
  MachineOperand* argOpers[2 + X86AddrNumOperands];
  // Operand 0 is the destination; the rest (address + value) are collected
  // into argOpers.
  int numArgs = bInstr->getNumOperands() - 1;
  for (int i=0; i < numArgs; ++i)
    argOpers[i] = &bInstr->getOperand(i+1);

  // argOpers[0 .. lastAddrIndx] form the memory address; the value operand
  // follows immediately after.
  int lastAddrIndx = X86AddrNumOperands - 1;
  int valArgIndx = lastAddrIndx + 1;

  // t1 = load [addr]
  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  // Optionally complement the loaded value first (tt aliases t1 otherwise).
  unsigned tt = F->getRegInfo().createVirtualRegister(RC);
  if (invSrc) {
    MIB = BuildMI(newMBB, dl, TII->get(notOpc), tt).addReg(t1);
  }
  else
    tt = t1;

  // t2 = tt OP value (register or immediate form as appropriate).
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
  MIB.addReg(tt);
  (*MIB).addOperand(*argOpers[valArgIndx]);

  // The cmpxchg compares against the accumulator, so seed it with t1.
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), EAXreg);
  MIB.addReg(t1);

  MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MIB.addReg(t2);
  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  // The result (the previous memory value) comes back in the accumulator.
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), destOper.getReg());
  MIB.addReg(EAXreg);

  // insert branch: retry if the cmpxchg failed.
  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);

  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
  return nextMBB;
}
+
// private utility function:  64 bit atomics on 32 bit host.
//
// Expand a pseudo 64-bit atomic bitwise instruction into a LCMPXCHG8B retry
// loop.  regOpcL/regOpcH and immOpcL/immOpcH are the opcodes applied to the
// low and high 32-bit halves (register and immediate forms); if invSrc is
// set, both halves of the loaded value are complemented before the op.
// Returns the block control should continue in (nextMBB).
MachineBasicBlock *
X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
                                                       MachineBasicBlock *MBB,
                                                       unsigned regOpcL,
                                                       unsigned regOpcH,
                                                       unsigned immOpcL,
                                                       unsigned immOpcH,
                                                       bool invSrc) const {
  // For the atomic bitwise operator, we generate
  //   thisMBB (instructions are in pairs, except cmpxchg8b)
  //     ld t1,t2 = [bitinstr.addr]
  //   newMBB:
  //     out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
  //     op  t5, t6 <- out1, out2, [bitinstr.val]
  //      (for SWAP, substitute:  mov t5, t6 <- [bitinstr.val])
  //     mov ECX, EBX <- t5, t6
  //     mov EAX, EDX <- t1, t2
  //     cmpxchg8b [bitinstr.addr]  [EAX, EDX, EBX, ECX implicit]
  //     mov t3, t4 <- EAX, EDX
  //     bz  newMBB
  //     result in out1, out2
  //     fallthrough -->nextMBB

  const TargetRegisterClass *RC = X86::GR32RegisterClass;
  const unsigned LoadOpc = X86::MOV32rm;
  const unsigned copyOpc = X86::MOV32rr;
  const unsigned NotOpc = X86::NOT32r;
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;

  /// First build the CFG
  MachineFunction *F = MBB->getParent();
  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, newMBB);
  F->insert(MBBIter, nextMBB);

  // Move all successors to thisMBB to nextMBB
  nextMBB->transferSuccessors(thisMBB);

  // Update thisMBB to fall through to newMBB
  thisMBB->addSuccessor(newMBB);

  // newMBB jumps to itself and fall through to nextMBB
  newMBB->addSuccessor(nextMBB);
  newMBB->addSuccessor(newMBB);

  DebugLoc dl = bInstr->getDebugLoc();
  // Insert instructions into newMBB based on incoming instruction
  // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
  assert(bInstr->getNumOperands() < X86AddrNumOperands + 14 &&
         "unexpected number of operands");
  // Operands 0/1 are the low/high destination halves; the address and value
  // operands follow and are collected into argOpers.
  MachineOperand& dest1Oper = bInstr->getOperand(0);
  MachineOperand& dest2Oper = bInstr->getOperand(1);
  MachineOperand* argOpers[2 + X86AddrNumOperands];
  for (int i=0; i < 2 + X86AddrNumOperands; ++i)
    argOpers[i] = &bInstr->getOperand(i+2);

  // x86 address has 5 operands: base, index, scale, displacement, and segment.
  int lastAddrIndx = X86AddrNumOperands - 1;

  // In thisMBB, load both 32-bit halves of the memory value.
  unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
  MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);
  unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
  MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
  // add 4 to displacement.
  for (int i=0; i <= lastAddrIndx-2; ++i)
    (*MIB).addOperand(*argOpers[i]);
  MachineOperand newOp3 = *(argOpers[3]);
  if (newOp3.isImm())
    newOp3.setImm(newOp3.getImm()+4);
  else
    newOp3.setOffset(newOp3.getOffset()+4);
  (*MIB).addOperand(newOp3);
  (*MIB).addOperand(*argOpers[lastAddrIndx]);

  // t3/4 are defined later, at the bottom of the loop
  unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
  // PHIs merge the initial loads (from thisMBB) with the values read back
  // after a failed cmpxchg8b (from newMBB).
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
    .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
  BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
    .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);

  // The subsequent operations should be using the destination registers of
  // the PHI instructions; t1/t2 are re-purposed to name those inputs
  // (possibly complemented, for invSrc).
  if (invSrc) {
    t1 = F->getRegInfo().createVirtualRegister(RC);
    t2 = F->getRegInfo().createVirtualRegister(RC);
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t1).addReg(dest1Oper.getReg());
    MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t2).addReg(dest2Oper.getReg());
  } else {
    t1 = dest1Oper.getReg();
    t2 = dest2Oper.getReg();
  }

  // Apply the low/high-half ops to produce the proposed new value t5:t6.
  int valArgIndx = lastAddrIndx + 1;
  assert((argOpers[valArgIndx]->isReg() ||
          argOpers[valArgIndx]->isImm()) &&
         "invalid operand");
  unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
  unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
  if (argOpers[valArgIndx]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
  // For SWAP (plain MOV32rr) there is no first source operand.
  if (regOpcL != X86::MOV32rr)
    MIB.addReg(t1);
  (*MIB).addOperand(*argOpers[valArgIndx]);
  assert(argOpers[valArgIndx + 1]->isReg() ==
         argOpers[valArgIndx]->isReg());
  assert(argOpers[valArgIndx + 1]->isImm() ==
         argOpers[valArgIndx]->isImm());
  if (argOpers[valArgIndx + 1]->isReg())
    MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
  else
    MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
  if (regOpcH != X86::MOV32rr)
    MIB.addReg(t2);
  (*MIB).addOperand(*argOpers[valArgIndx + 1]);

  // cmpxchg8b compares against EDX:EAX and, on success, stores ECX:EBX.
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EAX);
  MIB.addReg(t1);
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EDX);
  MIB.addReg(t2);

  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::EBX);
  MIB.addReg(t5);
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), X86::ECX);
  MIB.addReg(t6);

  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
  for (int i=0; i <= lastAddrIndx; ++i)
    (*MIB).addOperand(*argOpers[i]);

  assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
  (*MIB).setMemRefs(bInstr->memoperands_begin(),
                    bInstr->memoperands_end());

  // Read back the (possibly updated) memory value for the next iteration.
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t3);
  MIB.addReg(X86::EAX);
  MIB = BuildMI(newMBB, dl, TII->get(copyOpc), t4);
  MIB.addReg(X86::EDX);

  // insert branch: retry if the cmpxchg8b failed.
  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);

  F->DeleteMachineInstr(bInstr);   // The pseudo instruction is gone now.
  return nextMBB;
}
+
+// private utility function
+MachineBasicBlock *
+X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
+                                                      MachineBasicBlock *MBB,
+                                                      unsigned cmovOpc) const {
+  // For the atomic min/max operator, we generate
+  //   thisMBB:
+  //   newMBB:
+  //     ld t1 = [min/max.addr]
+  //     mov t2 = [min/max.val]
+  //     cmp  t1, t2
+  //     cmov[cond] t2 = t1
+  //     mov EAX = t1
+  //     lcs dest = [bitinstr.addr], t2  [EAX is implicit]
+  //     bz   newMBB
+  //     fallthrough -->nextMBB
+  //
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+  MachineFunction::iterator MBBIter = MBB;
+  ++MBBIter;
+
+  /// First build the CFG
+  MachineFunction *F = MBB->getParent();
+  MachineBasicBlock *thisMBB = MBB;
+  MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  F->insert(MBBIter, newMBB);
+  F->insert(MBBIter, nextMBB);
+
+  // Move all successors of thisMBB to nextMBB
+  nextMBB->transferSuccessors(thisMBB);
+
+  // Update thisMBB to fall through to newMBB
+  thisMBB->addSuccessor(newMBB);
+
+  // newMBB jumps to newMBB and fall through to nextMBB
+  newMBB->addSuccessor(nextMBB);
+  newMBB->addSuccessor(newMBB);
+
+  DebugLoc dl = mInstr->getDebugLoc();
+  // Insert instructions into newMBB based on incoming instruction
+  assert(mInstr->getNumOperands() < X86AddrNumOperands + 4 &&
+         "unexpected number of operands");
+  MachineOperand& destOper = mInstr->getOperand(0);
+  MachineOperand* argOpers[2 + X86AddrNumOperands];
+  int numArgs = mInstr->getNumOperands() - 1;
+  for (int i=0; i < numArgs; ++i)
+    argOpers[i] = &mInstr->getOperand(i+1);
+
+  // x86 address has 4 operands: base, index, scale, and displacement
+  int lastAddrIndx = X86AddrNumOperands - 1; // [0,3]
+  int valArgIndx = lastAddrIndx + 1;
+
+  unsigned t1 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+
+  // We only support register and immediate values
+  assert((argOpers[valArgIndx]->isReg() ||
+          argOpers[valArgIndx]->isImm()) &&
+         "invalid operand");
+
+  unsigned t2 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  if (argOpers[valArgIndx]->isReg())
+    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
+  else
+    MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
+  (*MIB).addOperand(*argOpers[valArgIndx]);
+
+  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), X86::EAX);
+  MIB.addReg(t1);
+
+  MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
+  MIB.addReg(t1);
+  MIB.addReg(t2);
+
+  // Generate movc
+  unsigned t3 = F->getRegInfo().createVirtualRegister(X86::GR32RegisterClass);
+  MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
+  MIB.addReg(t2);
+  MIB.addReg(t1);
+
+  // Cmp and exchange if none has modified the memory location
+  MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
+  for (int i=0; i <= lastAddrIndx; ++i)
+    (*MIB).addOperand(*argOpers[i]);
+  MIB.addReg(t3);
+  assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
+  (*MIB).setMemRefs(mInstr->memoperands_begin(),
+                    mInstr->memoperands_end());
+
+  MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), destOper.getReg());
+  MIB.addReg(X86::EAX);
+
+  // insert branch
+  BuildMI(newMBB, dl, TII->get(X86::JNE)).addMBB(newMBB);
+
+  F->DeleteMachineInstr(mInstr);   // The pseudo instruction is gone now.
+  return nextMBB;
+}
+
+// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
+// all of this code can be replaced with that in the .td file.
+MachineBasicBlock *
+X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
+                            unsigned numArgs, bool memArg) const {
+
+  MachineFunction *F = BB->getParent();
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+  unsigned Opc;
+  if (memArg)
+    Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
+  else
+    Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
+
+  MachineInstrBuilder MIB = BuildMI(BB, dl, TII->get(Opc));
+
+  for (unsigned i = 0; i < numArgs; ++i) {
+    MachineOperand &Op = MI->getOperand(i+1);
+
+    if (!(Op.isReg() && Op.isImplicit()))
+      MIB.addOperand(Op);
+  }
+
+  BuildMI(BB, dl, TII->get(X86::MOVAPSrr), MI->getOperand(0).getReg())
+    .addReg(X86::XMM0);
+
+  F->DeleteMachineInstr(MI);
+
+  return BB;
+}
+
MachineBasicBlock *
X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
                                                 MachineInstr *MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = MBB;
  ++MBBIter;
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Set up the CFG.
  // Move any original successors of MBB to the end block.
  EndMBB->transferSuccessors(MBB);
  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // Pseudo operands: 0 = register holding the count (%al), 1 = the frame
  // index of the register-save area, 2 = the FP-register offset within it;
  // operands 3.. are the XMM registers to store.
  unsigned CountReg = MI->getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI->getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI->getOperand(2).getImm();

  if (!Subtarget->isTargetWin64()) {
    // If %al is 0, branch around the XMM save block.
    // (On Win64 the stores are emitted unconditionally — presumably the
    // count check is unnecessary under that ABI; confirm against the ABI.)
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // In the XMM save block, save all the XMM argument registers.
  // Each store is 16 bytes wide and placed at 16-byte intervals starting at
  // VarArgsFPOffset within the register-save frame slot.
  for (int i = 3, e = MI->getNumOperands(); i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO =
      F->getMachineMemOperand(
        PseudoSourceValue::getFixedStack(RegSaveFrameIndex),
        MachineMemOperand::MOStore, Offset,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(X86::MOVAPSmr))
      .addFrameIndex(RegSaveFrameIndex)
      .addImm(/*Scale=*/1)
      .addReg(/*IndexReg=*/0)
      .addImm(/*Disp=*/Offset)
      .addReg(/*Segment=*/0)
      .addReg(MI->getOperand(i).getReg())
      .addMemOperand(MMO);
  }

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.

  return EndMBB;
}
+
/// EmitLoweredSelect - Expand a CMOV_* pseudo instruction (used for value
/// types with no native cmov) into an explicit branch-and-PHI diamond.
/// MI's operands: (0) destination vreg, (1) and (2) the two select inputs,
/// (3) an X86::CondCode immediate.  EM records the mapping from BB's
/// original successors to the new sink block so SelectionDAG ISel can
/// retarget PHI nodes it already emitted.  The pseudo MI is deleted; the
/// returned block (sinkMBB) is where insertion should continue.
MachineBasicBlock *
X86TargetLowering::EmitLoweredSelect(MachineInstr *MI,
                                     MachineBasicBlock *BB,
                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
  DebugLoc DL = MI->getDebugLoc();

  // To "insert" a SELECT_CC instruction, we actually have to insert the
  // diamond control-flow pattern.  The incoming instruction knows the
  // destination vreg to set, the condition code register to branch on, the
  // true/false values to select between, and a branch opcode to use.
  const BasicBlock *LLVM_BB = BB->getBasicBlock();
  MachineFunction::iterator It = BB;
  ++It;

  //  thisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC copy1MBB
  //   fallthrough --> copy0MBB
  MachineBasicBlock *thisMBB = BB;
  MachineFunction *F = BB->getParent();
  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
  // Map operand 3's condition code onto the matching conditional branch
  // opcode, and branch straight to the sink when the condition holds.
  unsigned Opc =
    X86::GetCondBranchFromCond((X86::CondCode)MI->getOperand(3).getImm());
  BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
  F->insert(It, copy0MBB);
  F->insert(It, sinkMBB);
  // Update machine-CFG edges by first adding all successors of the current
  // block to the new block which will contain the Phi node for the select.
  // Also inform sdisel of the edge changes.
  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(),
         E = BB->succ_end(); I != E; ++I) {
    EM->insert(std::make_pair(*I, sinkMBB));
    sinkMBB->addSuccessor(*I);
  }
  // Next, remove all successors of the current block, and add the true
  // and fallthrough blocks as its successors.
  while (!BB->succ_empty())
    BB->removeSuccessor(BB->succ_begin());
  // Add the true and fallthrough blocks as its successors.
  BB->addSuccessor(copy0MBB);
  BB->addSuccessor(sinkMBB);

  //  copy0MBB:
  //   %FalseValue = ...
  //   # fallthrough to sinkMBB
  BB = copy0MBB;

  // Update machine-CFG edges
  BB->addSuccessor(sinkMBB);

  //  sinkMBB:
  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
  //  ...
  // PHI incoming order: operand 1 arrives via copy0MBB, operand 2 via the
  // original block (thisMBB).
  BB = sinkMBB;
  BuildMI(BB, DL, TII->get(X86::PHI), MI->getOperand(0).getReg())
    .addReg(MI->getOperand(1).getReg()).addMBB(copy0MBB)
    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB);

  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
  return BB;
}
+
+
+MachineBasicBlock *
+X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                               MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  switch (MI->getOpcode()) {
+  default: assert(false && "Unexpected instr type to insert");
+  case X86::CMOV_GR8:
+  case X86::CMOV_V1I64:
+  case X86::CMOV_FR32:
+  case X86::CMOV_FR64:
+  case X86::CMOV_V4F32:
+  case X86::CMOV_V2F64:
+  case X86::CMOV_V2I64:
+    return EmitLoweredSelect(MI, BB, EM);
+
+  case X86::FP32_TO_INT16_IN_MEM:
+  case X86::FP32_TO_INT32_IN_MEM:
+  case X86::FP32_TO_INT64_IN_MEM:
+  case X86::FP64_TO_INT16_IN_MEM:
+  case X86::FP64_TO_INT32_IN_MEM:
+  case X86::FP64_TO_INT64_IN_MEM:
+  case X86::FP80_TO_INT16_IN_MEM:
+  case X86::FP80_TO_INT32_IN_MEM:
+  case X86::FP80_TO_INT64_IN_MEM: {
+    const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+    DebugLoc DL = MI->getDebugLoc();
+
+    // Change the floating point control register to use "round towards zero"
+    // mode when truncating to an integer value.
+    MachineFunction *F = BB->getParent();
+    int CWFrameIdx = F->getFrameInfo()->CreateStackObject(2, 2, false);
+    addFrameReference(BuildMI(BB, DL, TII->get(X86::FNSTCW16m)), CWFrameIdx);
+
+    // Load the old value of the high byte of the control word...
+    unsigned OldCW =
+      F->getRegInfo().createVirtualRegister(X86::GR16RegisterClass);
+    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16rm), OldCW),
+                      CWFrameIdx);
+
+    // Set the high part to be round to zero...
+    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
+      .addImm(0xC7F);
+
+    // Reload the modified control word now...
+    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    // Restore the memory image of control word to original value
+    addFrameReference(BuildMI(BB, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
+      .addReg(OldCW);
+
+    // Get the X86 opcode to use.
+    unsigned Opc;
+    switch (MI->getOpcode()) {
+    default: llvm_unreachable("illegal opcode!");
+    case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
+    case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
+    case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
+    case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
+    case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
+    case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
+    case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
+    case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
+    case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
+    }
+
+    X86AddressMode AM;
+    MachineOperand &Op = MI->getOperand(0);
+    if (Op.isReg()) {
+      AM.BaseType = X86AddressMode::RegBase;
+      AM.Base.Reg = Op.getReg();
+    } else {
+      AM.BaseType = X86AddressMode::FrameIndexBase;
+      AM.Base.FrameIndex = Op.getIndex();
+    }
+    Op = MI->getOperand(1);
+    if (Op.isImm())
+      AM.Scale = Op.getImm();
+    Op = MI->getOperand(2);
+    if (Op.isImm())
+      AM.IndexReg = Op.getImm();
+    Op = MI->getOperand(3);
+    if (Op.isGlobal()) {
+      AM.GV = Op.getGlobal();
+    } else {
+      AM.Disp = Op.getImm();
+    }
+    addFullAddress(BuildMI(BB, DL, TII->get(Opc)), AM)
+                      .addReg(MI->getOperand(X86AddrNumOperands).getReg());
+
+    // Reload the original control word now.
+    addFrameReference(BuildMI(BB, DL, TII->get(X86::FLDCW16m)), CWFrameIdx);
+
+    F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+    return BB;
+  }
+    // String/text processing lowering.
+  case X86::PCMPISTRM128REG:
+    return EmitPCMP(MI, BB, 3, false /* in-mem */);
+  case X86::PCMPISTRM128MEM:
+    return EmitPCMP(MI, BB, 3, true /* in-mem */);
+  case X86::PCMPESTRM128REG:
+    return EmitPCMP(MI, BB, 5, false /* in mem */);
+  case X86::PCMPESTRM128MEM:
+    return EmitPCMP(MI, BB, 5, true /* in mem */);
+
+    // Atomic Lowering.
+  case X86::ATOMAND32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+                                               X86::AND32ri, X86::MOV32rm,
+                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::NOT32r, X86::EAX,
+                                               X86::GR32RegisterClass);
+  case X86::ATOMOR32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR32rr,
+                                               X86::OR32ri, X86::MOV32rm,
+                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::NOT32r, X86::EAX,
+                                               X86::GR32RegisterClass);
+  case X86::ATOMXOR32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR32rr,
+                                               X86::XOR32ri, X86::MOV32rm,
+                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::NOT32r, X86::EAX,
+                                               X86::GR32RegisterClass);
+  case X86::ATOMNAND32:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND32rr,
+                                               X86::AND32ri, X86::MOV32rm,
+                                               X86::LCMPXCHG32, X86::MOV32rr,
+                                               X86::NOT32r, X86::EAX,
+                                               X86::GR32RegisterClass, true);
+  case X86::ATOMMIN32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL32rr);
+  case X86::ATOMMAX32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG32rr);
+  case X86::ATOMUMIN32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB32rr);
+  case X86::ATOMUMAX32:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA32rr);
+
+  case X86::ATOMAND16:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+                                               X86::AND16ri, X86::MOV16rm,
+                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::NOT16r, X86::AX,
+                                               X86::GR16RegisterClass);
+  case X86::ATOMOR16:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR16rr,
+                                               X86::OR16ri, X86::MOV16rm,
+                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::NOT16r, X86::AX,
+                                               X86::GR16RegisterClass);
+  case X86::ATOMXOR16:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR16rr,
+                                               X86::XOR16ri, X86::MOV16rm,
+                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::NOT16r, X86::AX,
+                                               X86::GR16RegisterClass);
+  case X86::ATOMNAND16:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND16rr,
+                                               X86::AND16ri, X86::MOV16rm,
+                                               X86::LCMPXCHG16, X86::MOV16rr,
+                                               X86::NOT16r, X86::AX,
+                                               X86::GR16RegisterClass, true);
+  case X86::ATOMMIN16:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL16rr);
+  case X86::ATOMMAX16:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG16rr);
+  case X86::ATOMUMIN16:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB16rr);
+  case X86::ATOMUMAX16:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA16rr);
+
+  case X86::ATOMAND8:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+                                               X86::AND8ri, X86::MOV8rm,
+                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::NOT8r, X86::AL,
+                                               X86::GR8RegisterClass);
+  case X86::ATOMOR8:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR8rr,
+                                               X86::OR8ri, X86::MOV8rm,
+                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::NOT8r, X86::AL,
+                                               X86::GR8RegisterClass);
+  case X86::ATOMXOR8:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR8rr,
+                                               X86::XOR8ri, X86::MOV8rm,
+                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::NOT8r, X86::AL,
+                                               X86::GR8RegisterClass);
+  case X86::ATOMNAND8:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND8rr,
+                                               X86::AND8ri, X86::MOV8rm,
+                                               X86::LCMPXCHG8, X86::MOV8rr,
+                                               X86::NOT8r, X86::AL,
+                                               X86::GR8RegisterClass, true);
+  // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
+  // This group is for 64-bit host.
+  case X86::ATOMAND64:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+                                               X86::AND64ri32, X86::MOV64rm,
+                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::NOT64r, X86::RAX,
+                                               X86::GR64RegisterClass);
+  case X86::ATOMOR64:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::OR64rr,
+                                               X86::OR64ri32, X86::MOV64rm,
+                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::NOT64r, X86::RAX,
+                                               X86::GR64RegisterClass);
+  case X86::ATOMXOR64:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::XOR64rr,
+                                               X86::XOR64ri32, X86::MOV64rm,
+                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::NOT64r, X86::RAX,
+                                               X86::GR64RegisterClass);
+  case X86::ATOMNAND64:
+    return EmitAtomicBitwiseWithCustomInserter(MI, BB, X86::AND64rr,
+                                               X86::AND64ri32, X86::MOV64rm,
+                                               X86::LCMPXCHG64, X86::MOV64rr,
+                                               X86::NOT64r, X86::RAX,
+                                               X86::GR64RegisterClass, true);
+  case X86::ATOMMIN64:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVL64rr);
+  case X86::ATOMMAX64:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVG64rr);
+  case X86::ATOMUMIN64:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVB64rr);
+  case X86::ATOMUMAX64:
+    return EmitAtomicMinMaxWithCustomInserter(MI, BB, X86::CMOVA64rr);
+
+  // This group does 64-bit operations on a 32-bit host.
+  case X86::ATOMAND6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB,
+                                               X86::AND32rr, X86::AND32rr,
+                                               X86::AND32ri, X86::AND32ri,
+                                               false);
+  case X86::ATOMOR6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB,
+                                               X86::OR32rr, X86::OR32rr,
+                                               X86::OR32ri, X86::OR32ri,
+                                               false);
+  case X86::ATOMXOR6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB,
+                                               X86::XOR32rr, X86::XOR32rr,
+                                               X86::XOR32ri, X86::XOR32ri,
+                                               false);
+  case X86::ATOMNAND6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB,
+                                               X86::AND32rr, X86::AND32rr,
+                                               X86::AND32ri, X86::AND32ri,
+                                               true);
+  case X86::ATOMADD6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB,
+                                               X86::ADD32rr, X86::ADC32rr,
+                                               X86::ADD32ri, X86::ADC32ri,
+                                               false);
+  case X86::ATOMSUB6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB,
+                                               X86::SUB32rr, X86::SBB32rr,
+                                               X86::SUB32ri, X86::SBB32ri,
+                                               false);
+  case X86::ATOMSWAP6432:
+    return EmitAtomicBit6432WithCustomInserter(MI, BB,
+                                               X86::MOV32rr, X86::MOV32rr,
+                                               X86::MOV32ri, X86::MOV32ri,
+                                               false);
+  case X86::VASTART_SAVE_XMM_REGS:
+    return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
+  }
+}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
+                                                       const APInt &Mask,
+                                                       APInt &KnownZero,
+                                                       APInt &KnownOne,
+                                                       const SelectionDAG &DAG,
+                                                       unsigned Depth) const {
+  unsigned Opc = Op.getOpcode();
+  assert((Opc >= ISD::BUILTIN_OP_END ||
+          Opc == ISD::INTRINSIC_WO_CHAIN ||
+          Opc == ISD::INTRINSIC_W_CHAIN ||
+          Opc == ISD::INTRINSIC_VOID) &&
+         "Should use MaskedValueIsZero if you don't know whether Op"
+         " is a target node!");
+
+  KnownZero = KnownOne = APInt(Mask.getBitWidth(), 0);   // Don't know anything.
+  switch (Opc) {
+  default: break;
+  case X86ISD::ADD:
+  case X86ISD::SUB:
+  case X86ISD::SMUL:
+  case X86ISD::UMUL:
+  case X86ISD::INC:
+  case X86ISD::DEC:
+  case X86ISD::OR:
+  case X86ISD::XOR:
+  case X86ISD::AND:
+    // These nodes' second result is a boolean.
+    if (Op.getResNo() == 0)
+      break;
+    // Fallthrough
+  case X86ISD::SETCC:
+    KnownZero |= APInt::getHighBitsSet(Mask.getBitWidth(),
+                                       Mask.getBitWidth() - 1);
+    break;
+  }
+}
+
+/// isGAPlusOffset - Returns true (and the GlobalValue and the offset) if the
+/// node is a GlobalAddress + offset.
+bool X86TargetLowering::isGAPlusOffset(SDNode *N,
+                                       GlobalValue* &GA, int64_t &Offset) const{
+  if (N->getOpcode() == X86ISD::Wrapper) {
+    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
+      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
+      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
+      return true;
+    }
+  }
+  return TargetLowering::isGAPlusOffset(N, GA, Offset);
+}
+
/// EltsFromConsecutiveLoads - Check whether every element selected by the
/// shuffle node N is fed by a run of consecutive, non-extending loads.
/// On success, LDBase is set to the load feeding the first element and
/// LastLoadedElt to the highest element index actually backed by a load
/// (undef elements after the first load are tolerated and skipped).
/// Returns false as soon as the pattern is broken.  The first element must
/// be a load (neither undef nor an unselected mask slot).  MFI and TLI are
/// currently unused by this routine.
static bool EltsFromConsecutiveLoads(ShuffleVectorSDNode *N, unsigned NumElems,
                                     EVT EltVT, LoadSDNode *&LDBase,
                                     unsigned &LastLoadedElt,
                                     SelectionDAG &DAG, MachineFrameInfo *MFI,
                                     const TargetLowering &TLI) {
  LDBase = NULL;
  // Sentinel: no element loaded yet.
  LastLoadedElt = -1U;
  for (unsigned i = 0; i < NumElems; ++i) {
    // A negative mask entry means this element is unselected; that is only
    // acceptable once a base load has been established.
    if (N->getMaskElt(i) < 0) {
      if (!LDBase)
        return false;
      continue;
    }

    // The element must be either undef or a plain (non-extending) load.
    SDValue Elt = DAG.getShuffleScalarElt(N, i);
    if (!Elt.getNode() ||
        (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
      return false;
    // First concrete element: it must be a real load and becomes the base.
    if (!LDBase) {
      if (Elt.getNode()->getOpcode() == ISD::UNDEF)
        return false;
      LDBase = cast<LoadSDNode>(Elt.getNode());
      LastLoadedElt = i;
      continue;
    }
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;

    // Subsequent load: must sit exactly i element-widths past the base.
    LoadSDNode *LD = cast<LoadSDNode>(Elt);
    if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
      return false;
    LastLoadedElt = i;
  }
  return true;
}
+
+/// PerformShuffleCombine - Combine a vector_shuffle that is equal to
+/// build_vector load1, load2, load3, load4, <0, 1, 2, 3> into a 128-bit load
+/// if the load addresses are consecutive, non-overlapping, and in the right
+/// order.  In the case of v2i64, it will see if it can rewrite the
+/// shuffle to be an appropriate build vector so it can take advantage of
+// performBuildVectorCombine.
+static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
+                                     const TargetLowering &TLI) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+  unsigned NumElems = VT.getVectorNumElements();
+
+  if (VT.getSizeInBits() != 128)
+    return SDValue();
+
+  // Try to combine a vector_shuffle into a 128-bit load.
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  LoadSDNode *LD = NULL;
+  unsigned LastLoadedElt;
+  if (!EltsFromConsecutiveLoads(SVN, NumElems, EltVT, LD, LastLoadedElt, DAG,
+                                MFI, TLI))
+    return SDValue();
+
+  if (LastLoadedElt == NumElems - 1) {
+    if (DAG.InferPtrAlignment(LD->getBasePtr()) >= 16)
+      return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+                         LD->getSrcValue(), LD->getSrcValueOffset(),
+                         LD->isVolatile());
+    return DAG.getLoad(VT, dl, LD->getChain(), LD->getBasePtr(),
+                       LD->getSrcValue(), LD->getSrcValueOffset(),
+                       LD->isVolatile(), LD->getAlignment());
+  } else if (NumElems == 4 && LastLoadedElt == 1) {
+    SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+    SDValue Ops[] = { LD->getChain(), LD->getBasePtr() };
+    SDValue ResNode = DAG.getNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, 2);
+    return DAG.getNode(ISD::BIT_CONVERT, dl, VT, ResNode);
+  }
+  return SDValue();
+}
+
/// PerformSELECTCombine - Do target-specific dag combines on SELECT nodes.
/// Two families of rewrites are attempted:
///  1. SELECT(SETCC(x,y), x, y) and its reversed-arm form on f32/f64 with
///     SSE2 become X86ISD::FMIN/FMAX, respecting the SSE rule that a NaN
///     operand selects the RHS.
///  2. SELECT between two integer constants becomes shift/add/LEA-style
///     arithmetic on the zero-extended condition.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget *Subtarget) {
  DebugLoc DL = N->getDebugLoc();
  SDValue Cond = N->getOperand(0);
  // Get the LHS/RHS of the select.
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);

  // If we have SSE[12] support, try to form min/max nodes. SSE min/max
  // instructions have the peculiarity that if either operand is a NaN,
  // they chose what we call the RHS operand (and as such are not symmetric).
  // It happens that this matches the semantics of the common C idiom
  // x<y?x:y and related forms, so we can recognize these cases.
  if (Subtarget->hasSSE2() &&
      (LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64) &&
      Cond.getOpcode() == ISD::SETCC) {
    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();

    unsigned Opcode = 0;
    // Check for x CC y ? x : y.
    if (LHS == Cond.getOperand(0) && RHS == Cond.getOperand(1)) {
      switch (CC) {
      default: break;
      case ISD::SETULT:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETOLE:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETULE:
        // This can be a min, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
        // FALLTHROUGH -- intentionally shares the FMIN assignment below.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETOGE:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGT:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETUGE:
        // This can be a max, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
        // FALLTHROUGH -- intentionally shares the FMAX assignment below.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMAX;
        break;
      }
    // Check for x CC y ? y : x -- a min/max with reversed arms.
    } else if (LHS == Cond.getOperand(1) && RHS == Cond.getOperand(0)) {
      switch (CC) {
      default: break;
      case ISD::SETOGE:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGT:
        // This can be a min if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMIN;
        break;
      case ISD::SETUGE:
        // This can be a min, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
        // FALLTHROUGH -- intentionally shares the FMIN assignment below.
      case ISD::SETOGT:
      case ISD::SETGT:
      case ISD::SETGE:
        Opcode = X86ISD::FMIN;
        break;

      case ISD::SETULT:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(LHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(RHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETOLE:
        // This can be a max if we can prove that at least one of the operands
        // is not a nan.
        if (!FiniteOnlyFPMath()) {
          if (DAG.isKnownNeverNaN(RHS)) {
            // Put the potential NaN in the RHS so that SSE will preserve it.
            std::swap(LHS, RHS);
          } else if (!DAG.isKnownNeverNaN(LHS))
            break;
        }
        Opcode = X86ISD::FMAX;
        break;
      case ISD::SETULE:
        // This can be a max, but if either operand is a NaN we need it to
        // preserve the original LHS.
        std::swap(LHS, RHS);
        // FALLTHROUGH -- intentionally shares the FMAX assignment below.
      case ISD::SETOLT:
      case ISD::SETLT:
      case ISD::SETLE:
        Opcode = X86ISD::FMAX;
        break;
      }
    }

    if (Opcode)
      return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
  }

  // If this is a select between two integer constants, try to do some
  // optimizations.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(LHS)) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(RHS))
      // Don't do this for crazy integer types.
      if (DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) {
        // If this is efficiently invertible, canonicalize the LHSC/RHSC values
        // so that TrueC (the true value) is larger than FalseC.
        bool NeedsCondInvert = false;

        if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
            // Efficiently invertible.
            (Cond.getOpcode() == ISD::SETCC ||  // setcc -> invertible.
             (Cond.getOpcode() == ISD::XOR &&   // xor(X, C) -> invertible.
              isa<ConstantSDNode>(Cond.getOperand(1))))) {
          NeedsCondInvert = true;
          std::swap(TrueC, FalseC);
        }

        // Optimize C ? 8 : 0 -> zext(C) << 3.  Likewise for any pow2/0.
        if (FalseC->getAPIntValue() == 0 &&
            TrueC->getAPIntValue().isPowerOf2()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);

          unsigned ShAmt = TrueC->getAPIntValue().logBase2();
          return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
                             DAG.getConstant(ShAmt, MVT::i8));
        }

        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
        if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
          if (NeedsCondInvert) // Invert the condition if needed.
            Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(1, Cond.getValueType()));

          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                             FalseC->getValueType(0), Cond);
          return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                             SDValue(FalseC, 0));
        }

        // Optimize cases that will turn into an LEA instruction.  This requires
        // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
        if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
          uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
          if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

          bool isFastMultiplier = false;
          if (Diff < 10) {
            switch ((unsigned char)Diff) {
              default: break;
              case 1:  // result = add base, cond
              case 2:  // result = lea base(    , cond*2)
              case 3:  // result = lea base(cond, cond*2)
              case 4:  // result = lea base(    , cond*4)
              case 5:  // result = lea base(cond, cond*4)
              case 8:  // result = lea base(    , cond*8)
              case 9:  // result = lea base(cond, cond*8)
                isFastMultiplier = true;
                break;
            }
          }

          if (isFastMultiplier) {
            // NOTE(review): this APInt Diff intentionally shadows the
            // uint64_t Diff above; it is recomputed at full bit width here.
            APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
            if (NeedsCondInvert) // Invert the condition if needed.
              Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(1, Cond.getValueType()));

            // Zero extend the condition if needed.
            Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                               Cond);
            // Scale the condition by the difference.
            if (Diff != 1)
              Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                                 DAG.getConstant(Diff, Cond.getValueType()));

            // Add the base if non-zero.
            if (FalseC->getAPIntValue() != 0)
              Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                                 SDValue(FalseC, 0));
            return Cond;
          }
        }
      }
  }

  return SDValue();
}
+
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
/// when both select arms are integer constants, replacing the conditional
/// move with cheaper setcc/shift/add/lea sequences.
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI) {
  DebugLoc DL = N->getDebugLoc();

  // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();

  // If this is a select between two integer constants, try to do some
  // optimizations.  Note that the operands are ordered the opposite of SELECT
  // operands.
  if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
    if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
      // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
      // larger than FalseC (the false value).
      X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);

      if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
        CC = X86::GetOppositeBranchCondition(CC);
        std::swap(TrueC, FalseC);
      }

      // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3.  Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
      if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);

        // Shift the zero-extended setcc result into the power-of-two position.
        unsigned ShAmt = TrueC->getAPIntValue().logBase2();
        Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
                           DAG.getConstant(ShAmt, MVT::i8));
        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst.  This is efficient
      // for any integer data type, including i8/i16.
      if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
        SDValue Cond = N->getOperand(3);
        Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                           DAG.getConstant(CC, MVT::i8), Cond);

        // Zero extend the condition if needed.
        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
                           FalseC->getValueType(0), Cond);
        Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                           SDValue(FalseC, 0));

        if (N->getNumValues() == 2)  // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }

      // Optimize cases that will turn into an LEA instruction.  This requires
      // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
      if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
        uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
        // For i32, only the low 32 bits of the difference matter.
        if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;

        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }

        if (isFastMultiplier) {
          // NOTE(review): this APInt Diff shadows the uint64_t Diff above; it
          // holds the same difference, but at the value type's full bit width
          // so it can be used directly as a constant operand.
          APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          SDValue Cond = N->getOperand(3);
          Cond = DAG.getNode(X86ISD::SETCC, DL, MVT::i8,
                             DAG.getConstant(CC, MVT::i8), Cond);
          // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, Cond.getValueType()));

          // Add the base if non-zero.
          if (FalseC->getAPIntValue() != 0)
            Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
                               SDValue(FalseC, 0));
          if (N->getNumValues() == 2)  // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
  return SDValue();
}
+
+
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
static SDValue PerformMulCombine(SDNode *N, SelectionDAG &DAG,
                                 TargetLowering::DAGCombinerInfo &DCI) {
  // Splitting one multiply into two instructions costs code size; skip when
  // optimizing for size.
  if (DAG.getMachineFunction().
      getFunction()->hasFnAttr(Attribute::OptimizeForSize))
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  // Only i64 multiplies by constant are handled here.
  EVT VT = N->getValueType(0);
  if (VT != MVT::i64)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  // Amounts that already select to a single shift or a single LEA need no
  // splitting.
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  // Try to factor the amount as (3 | 5 | 9) * MulAmt2, preferring the larger
  // LEA-friendly factor.
  uint64_t MulAmt1 = 0;
  uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }
  // The cofactor must itself be cheap (a shift or another LEA multiplier).
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
    DebugLoc DL = N->getDebugLoc();

    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If second multiplier is pow2, issue it first. We want the multiply by
      // 3, 5, or 9 to be folded into the addressing mode unless the lone use
      // is an add.
      std::swap(MulAmt1, MulAmt2);

    // Emit each factor as either a shift (power of two) or a MUL_IMM that
    // instruction selection can turn into an LEA.
    SDValue NewMul;
    if (isPowerOf2_64(MulAmt1))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                           DAG.getConstant(Log2_64(MulAmt1), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
                           DAG.getConstant(MulAmt1, VT));

    if (isPowerOf2_64(MulAmt2))
      NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
                           DAG.getConstant(Log2_64(MulAmt2), MVT::i8));
    else
      NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                           DAG.getConstant(MulAmt2, VT));

    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
  }
  // SDValue() is returned even after a successful CombineTo; the replacement
  // has already been recorded through DCI.
  return SDValue();
}
+
+static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  EVT VT = N0.getValueType();
+
+  // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
+  // since the result of setcc_c is all zero's or all ones.
+  if (N1C && N0.getOpcode() == ISD::AND &&
+      N0.getOperand(1).getOpcode() == ISD::Constant) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() == X86ISD::SETCC_CARRY ||
+        ((N00.getOpcode() == ISD::ANY_EXTEND ||
+          N00.getOpcode() == ISD::ZERO_EXTEND) &&
+         N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY)) {
+      APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+      APInt ShAmt = N1C->getAPIntValue();
+      Mask = Mask.shl(ShAmt);
+      if (Mask != 0)
+        return DAG.getNode(ISD::AND, N->getDebugLoc(), VT,
+                           N00, DAG.getConstant(Mask, VT));
+    }
+  }
+
+  return SDValue();
+}
+
/// PerformShiftCombine - Transforms vector shift nodes to use vector shifts
///                       when possible.
static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  EVT VT = N->getValueType(0);
  // Scalar SHL has its own combine (SETCC_CARRY mask shifting); handle it
  // first and bail out of the vector logic below.
  if (!VT.isVector() && VT.isInteger() &&
      N->getOpcode() == ISD::SHL)
    return PerformSHLCombine(N, DAG);

  // On X86 with SSE2 support, we can transform this to a vector shift if
  // all elements are shifted by the same amount.  We can't do this in legalize
  // because the a constant vector is typically transformed to a constant pool
  // so we have no knowledge of the shift amount.
  if (!Subtarget->hasSSE2())
    return SDValue();

  // Only vector types with a matching psll/psra/psrl immediate form.
  if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16)
    return SDValue();

  // Find a single scalar shift amount (BaseShAmt) shared by every lane.
  SDValue ShAmtOp = N->getOperand(1);
  EVT EltVT = VT.getVectorElementType();
  DebugLoc DL = N->getDebugLoc();
  SDValue BaseShAmt = SDValue();
  if (ShAmtOp.getOpcode() == ISD::BUILD_VECTOR) {
    // Every non-undef element must be identical to the first one found.
    unsigned NumElts = VT.getVectorNumElements();
    unsigned i = 0;
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      BaseShAmt = Arg;
      break;
    }
    for (; i != NumElts; ++i) {
      SDValue Arg = ShAmtOp.getOperand(i);
      if (Arg.getOpcode() == ISD::UNDEF) continue;
      if (Arg != BaseShAmt) {
        return SDValue();
      }
    }
    // NOTE(review): if every element is undef, BaseShAmt stays null on this
    // path (unlike the shuffle path below, which has an extract fallback) —
    // presumably an all-undef BUILD_VECTOR is simplified away before this
    // combine runs; verify.
  } else if (ShAmtOp.getOpcode() == ISD::VECTOR_SHUFFLE &&
             cast<ShuffleVectorSDNode>(ShAmtOp)->isSplat()) {
    // A splat shuffle: dig through its input for the scalar being splatted.
    SDValue InVec = ShAmtOp.getOperand(0);
    if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
      unsigned NumElts = InVec.getValueType().getVectorNumElements();
      unsigned i = 0;
      for (; i != NumElts; ++i) {
        SDValue Arg = InVec.getOperand(i);
        if (Arg.getOpcode() == ISD::UNDEF) continue;
        BaseShAmt = Arg;
        break;
      }
    } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
       // The inserted scalar is the splat value only if it was inserted at
       // the lane the shuffle splats from.
       if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
         unsigned SplatIdx = cast<ShuffleVectorSDNode>(ShAmtOp)->getSplatIndex();
         if (C->getZExtValue() == SplatIdx)
           BaseShAmt = InVec.getOperand(1);
       }
    }
    // Fallback: extract lane 0 of the splat vector.
    if (BaseShAmt.getNode() == 0)
      BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, ShAmtOp,
                              DAG.getIntPtrConstant(0));
  } else
    return SDValue();

  // The shift amount is an i32.
  if (EltVT.bitsGT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, BaseShAmt);
  else if (EltVT.bitsLT(MVT::i32))
    BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, BaseShAmt);

  // The shift amount is identical so we can do a vector shift.
  SDValue  ValOp = N->getOperand(0);
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Unknown shift opcode!");
    break;
  case ISD::SHL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_pslli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRA:
    // Note: no v2i64 case here — SSE2 has no psrai.q instruction.
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrai_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  case ISD::SRL:
    if (VT == MVT::v2i64)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_q, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT == MVT::v4i32)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_d, MVT::i32),
                         ValOp, BaseShAmt);
    if (VT ==  MVT::v8i16)
      return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
                         DAG.getConstant(Intrinsic::x86_sse2_psrli_w, MVT::i32),
                         ValOp, BaseShAmt);
    break;
  }
  return SDValue();
}
+
+static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
+                                const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i64 || !Subtarget->is64Bit())
+    return SDValue();
+
+  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+    std::swap(N0, N1);
+  if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+    return SDValue();
+
+  SDValue ShAmt0 = N0.getOperand(1);
+  if (ShAmt0.getValueType() != MVT::i8)
+    return SDValue();
+  SDValue ShAmt1 = N1.getOperand(1);
+  if (ShAmt1.getValueType() != MVT::i8)
+    return SDValue();
+  if (ShAmt0.getOpcode() == ISD::TRUNCATE)
+    ShAmt0 = ShAmt0.getOperand(0);
+  if (ShAmt1.getOpcode() == ISD::TRUNCATE)
+    ShAmt1 = ShAmt1.getOperand(0);
+
+  DebugLoc DL = N->getDebugLoc();
+  unsigned Opc = X86ISD::SHLD;
+  SDValue Op0 = N0.getOperand(0);
+  SDValue Op1 = N1.getOperand(0);
+  if (ShAmt0.getOpcode() == ISD::SUB) {
+    Opc = X86ISD::SHRD;
+    std::swap(Op0, Op1);
+    std::swap(ShAmt0, ShAmt1);
+  }
+
+  if (ShAmt1.getOpcode() == ISD::SUB) {
+    SDValue Sum = ShAmt1.getOperand(0);
+    if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
+      if (SumC->getSExtValue() == 64 &&
+          ShAmt1.getOperand(1) == ShAmt0)
+        return DAG.getNode(Opc, DL, VT,
+                           Op0, Op1,
+                           DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+    }
+  } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
+    ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
+    if (ShAmt0C &&
+        ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue() == 64)
+      return DAG.getNode(Opc, DL, VT,
+                         N0.getOperand(0), N1.getOperand(0),
+                         DAG.getNode(ISD::TRUNCATE, DL,
+                                       MVT::i8, ShAmt0));
+  }
+
+  return SDValue();
+}
+
/// PerformSTORECombine - Do target-specific dag combines on STORE nodes.
static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
                                   const X86Subtarget *Subtarget) {
  // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
  // the FP state in cases where an emms may be missing.
  // A preferable solution to the general problem is to figure out the right
  // places to insert EMMS.  This qualifies as a quick hack.

  // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
  StoreSDNode *St = cast<StoreSDNode>(N);
  EVT VT = St->getValue().getValueType();
  if (VT.getSizeInBits() != 64)
    return SDValue();

  // f64 load/store is only usable with SSE2, outside soft-float mode, and
  // when the function does not forbid implicit FP register use.
  const Function *F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F->hasFnAttr(Attribute::NoImplicitFloat);
  bool F64IsLegal = !UseSoftFloat && !NoImplicitFloatOps
    && Subtarget->hasSSE2();
  if ((VT.isVector() ||
       (VT == MVT::i64 && F64IsLegal && !Subtarget->is64Bit())) &&
      isa<LoadSDNode>(St->getValue()) &&
      !cast<LoadSDNode>(St->getValue())->isVolatile() &&
      St->getChain().hasOneUse() && !St->isVolatile()) {
    SDNode* LdVal = St->getValue().getNode();
    LoadSDNode *Ld = 0;
    int TokenFactorIndex = -1;
    SmallVector<SDValue, 8> Ops;
    SDNode* ChainVal = St->getChain().getNode();
    // Must be a store of a load.  We currently handle two cases:  the load
    // is a direct child, and it's under an intervening TokenFactor.  It is
    // possible to dig deeper under nested TokenFactors.
    if (ChainVal == LdVal)
      Ld = cast<LoadSDNode>(St->getChain());
    else if (St->getValue().hasOneUse() &&
             ChainVal->getOpcode() == ISD::TokenFactor) {
      // Record which TokenFactor operand is the load; the remaining chain
      // operands are saved in Ops so they can be rejoined with the new
      // load's chain below.
      for (unsigned i=0, e = ChainVal->getNumOperands(); i != e; ++i) {
        if (ChainVal->getOperand(i).getNode() == LdVal) {
          TokenFactorIndex = i;
          Ld = cast<LoadSDNode>(St->getValue());
        } else
          Ops.push_back(ChainVal->getOperand(i));
      }
    }

    if (!Ld || !ISD::isNormalLoad(Ld))
      return SDValue();

    // If this is not the MMX case, i.e. we are just turning i64 load/store
    // into f64 load/store, avoid the transformation if there are multiple
    // uses of the loaded value.
    if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
      return SDValue();

    DebugLoc LdDL = Ld->getDebugLoc();
    DebugLoc StDL = N->getDebugLoc();
    // If we are a 64-bit capable x86, lower to a single movq load/store pair.
    // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
    // pair instead.
    if (Subtarget->is64Bit() || F64IsLegal) {
      EVT LdVT = Subtarget->is64Bit() ? MVT::i64 : MVT::f64;
      SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(),
                                  Ld->getBasePtr(), Ld->getSrcValue(),
                                  Ld->getSrcValueOffset(), Ld->isVolatile(),
                                  Ld->getAlignment());
      SDValue NewChain = NewLd.getValue(1);
      if (TokenFactorIndex != -1) {
        // Merge the new load's chain with the other TokenFactor operands.
        Ops.push_back(NewChain);
        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                               Ops.size());
      }
      return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                          St->getSrcValue(), St->getSrcValueOffset(),
                          St->isVolatile(), St->getAlignment());
    }

    // Otherwise, lower to two pairs of 32-bit loads / stores.
    SDValue LoAddr = Ld->getBasePtr();
    SDValue HiAddr = DAG.getNode(ISD::ADD, LdDL, MVT::i32, LoAddr,
                                 DAG.getConstant(4, MVT::i32));

    SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset(),
                               Ld->isVolatile(), Ld->getAlignment());
    SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
                               Ld->getSrcValue(), Ld->getSrcValueOffset()+4,
                               Ld->isVolatile(),
                               MinAlign(Ld->getAlignment(), 4));

    SDValue NewChain = LoLd.getValue(1);
    if (TokenFactorIndex != -1) {
      // Rejoin the two half-loads with the rest of the original TokenFactor.
      Ops.push_back(LoLd);
      Ops.push_back(HiLd);
      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
                             Ops.size());
    }

    LoAddr = St->getBasePtr();
    HiAddr = DAG.getNode(ISD::ADD, StDL, MVT::i32, LoAddr,
                         DAG.getConstant(4, MVT::i32));

    SDValue LoSt = DAG.getStore(NewChain, StDL, LoLd, LoAddr,
                                St->getSrcValue(), St->getSrcValueOffset(),
                                St->isVolatile(), St->getAlignment());
    SDValue HiSt = DAG.getStore(NewChain, StDL, HiLd, HiAddr,
                                St->getSrcValue(),
                                St->getSrcValueOffset() + 4,
                                St->isVolatile(),
                                MinAlign(St->getAlignment(), 4));
    // Tie both half-stores together as the replacement chain.
    return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
  }
  return SDValue();
}
+
+/// PerformFORCombine - Do target-specific dag combines on X86ISD::FOR and
+/// X86ISD::FXOR nodes.
+static SDValue PerformFORCombine(SDNode *N, SelectionDAG &DAG) {
+  assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
+  // F[X]OR(0.0, x) -> x
+  // F[X]OR(x, 0.0) -> x
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(0);
+  return SDValue();
+}
+
+/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
+static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
+  // FAND(0.0, x) -> 0.0
+  // FAND(x, 0.0) -> 0.0
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(0)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(0);
+  if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N->getOperand(1)))
+    if (C->getValueAPF().isPosZero())
+      return N->getOperand(1);
+  return SDValue();
+}
+
+static SDValue PerformBTCombine(SDNode *N,
+                                SelectionDAG &DAG,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  // BT ignores high bits in the bit index operand.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.hasOneUse()) {
+    unsigned BitWidth = Op1.getValueSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG);
+    TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
+        TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
+      DCI.CommitTargetLoweringOpt(TLO);
+  }
+  return SDValue();
+}
+
+static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
+  SDValue Op = N->getOperand(0);
+  if (Op.getOpcode() == ISD::BIT_CONVERT)
+    Op = Op.getOperand(0);
+  EVT VT = N->getValueType(0), OpVT = Op.getValueType();
+  if (Op.getOpcode() == X86ISD::VZEXT_LOAD &&
+      VT.getVectorElementType().getSizeInBits() ==
+      OpVT.getVectorElementType().getSizeInBits()) {
+    return DAG.getNode(ISD::BIT_CONVERT, N->getDebugLoc(), VT, Op);
+  }
+  return SDValue();
+}
+
// On X86 and X86-64, atomic operations are lowered to locked instructions.
// Locked instructions, in turn, have implicit fence semantics (all memory
// operations are flushed before issuing the locked instruction, and they
// are not buffered), so we can fold away the common pattern of
// fence-atomic-fence.
static SDValue PerformMEMBARRIERCombine(SDNode* N, SelectionDAG &DAG) {
  // N is the trailing MEMBARRIER; its chain operand must be one of the
  // atomic operations this combine understands.
  SDValue atomic = N->getOperand(0);
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      break;
    default:
      return SDValue();
  }

  // The atomic must itself be chained directly to a leading fence.
  SDValue fence = atomic.getOperand(0);
  if (fence.getOpcode() != ISD::MEMBARRIER)
    return SDValue();

  // Splice out the leading fence by re-pointing the atomic's chain at the
  // fence's own chain operand.  CMP_SWAP carries one more value operand than
  // the other atomics, hence the two UpdateNodeOperands arities.
  switch (atomic.getOpcode()) {
    case ISD::ATOMIC_CMP_SWAP:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2),
                                    atomic.getOperand(3));
    case ISD::ATOMIC_SWAP:
    case ISD::ATOMIC_LOAD_ADD:
    case ISD::ATOMIC_LOAD_SUB:
    case ISD::ATOMIC_LOAD_AND:
    case ISD::ATOMIC_LOAD_OR:
    case ISD::ATOMIC_LOAD_XOR:
    case ISD::ATOMIC_LOAD_NAND:
    case ISD::ATOMIC_LOAD_MIN:
    case ISD::ATOMIC_LOAD_MAX:
    case ISD::ATOMIC_LOAD_UMIN:
    case ISD::ATOMIC_LOAD_UMAX:
      return DAG.UpdateNodeOperands(atomic, fence.getOperand(0),
                                    atomic.getOperand(1), atomic.getOperand(2));
    default:
      return SDValue();
  }
}
+
+static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG) {
+  // (i32 zext (and (i8  x86isd::setcc_carry), 1)) ->
+  //           (and (i32 x86isd::setcc_carry), 1)
+  // This eliminates the zext. This transformation is necessary because
+  // ISD::SETCC is always legalized to i8.
+  DebugLoc dl = N->getDebugLoc();
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+  if (N0.getOpcode() == ISD::AND &&
+      N0.hasOneUse() &&
+      N0.getOperand(0).hasOneUse()) {
+    SDValue N00 = N0.getOperand(0);
+    if (N00.getOpcode() != X86ISD::SETCC_CARRY)
+      return SDValue();
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+    if (!C || C->getZExtValue() != 1)
+      return SDValue();
+    return DAG.getNode(ISD::AND, dl, VT,
+                       DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+                                   N00.getOperand(0), N00.getOperand(1)),
+                       DAG.getConstant(1, VT));
+  }
+
+  return SDValue();
+}
+
/// PerformDAGCombine - Central dispatch for X86 target-specific DAG combines:
/// routes each supported node opcode to its Perform*Combine helper above.
/// Unhandled opcodes fall through and return SDValue() (no change).
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, *this);
  case ISD::SELECT:         return PerformSELECTCombine(N, DAG, Subtarget);
  case X86ISD::CMOV:        return PerformCMOVCombine(N, DAG, DCI);
  case ISD::MUL:            return PerformMulCombine(N, DAG, DCI);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return PerformShiftCombine(N, DAG, Subtarget);
  case ISD::OR:             return PerformOrCombine(N, DAG, Subtarget);
  case ISD::STORE:          return PerformSTORECombine(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return PerformFORCombine(N, DAG);
  case X86ISD::FAND:        return PerformFANDCombine(N, DAG);
  case X86ISD::BT:          return PerformBTCombine(N, DAG, DCI);
  case X86ISD::VZEXT_MOVL:  return PerformVZEXT_MOVLCombine(N, DAG);
  case ISD::MEMBARRIER:     return PerformMEMBARRIERCombine(N, DAG);
  case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG);
  }

  return SDValue();
}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+static bool LowerToBSwap(CallInst *CI) {
+  // FIXME: this should verify that we are targetting a 486 or better.  If not,
+  // we will turn this bswap into something that will be lowered to logical ops
+  // instead of emitting the bswap asm.  For now, we don't support 486 or lower
+  // so don't worry about this.
+
+  // Verify this is a simple bswap.
+  if (CI->getNumOperands() != 2 ||
+      CI->getType() != CI->getOperand(1)->getType() ||
+      !CI->getType()->isInteger())
+    return false;
+
+  const IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+  if (!Ty || Ty->getBitWidth() % 16 != 0)
+    return false;
+
+  // Okay, we can do this xform, do so now.
+  const Type *Tys[] = { Ty };
+  Module *M = CI->getParent()->getParent()->getParent();
+  Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 1);
+
+  Value *Op = CI->getOperand(1);
+  Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+  CI->replaceAllUsesWith(Op);
+  CI->eraseFromParent();
+  return true;
+}
+
/// ExpandInlineAsm - Recognize inline-asm idioms that implement a byte swap
/// and replace the asm call with an llvm.bswap intrinsic call (via
/// LowerToBSwap).  Returns true if the call was replaced.
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
  std::vector<InlineAsm::ConstraintInfo> Constraints = IA->ParseConstraints();

  std::string AsmStr = IA->getAsmString();

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, "\n");  // ; as separator?

  // Dispatch on the number of asm "lines"; only one- and three-instruction
  // patterns are recognized.
  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    AsmStr = AsmPieces[0];
    AsmPieces.clear();
    SplitString(AsmStr, AsmPieces, " \t");  // Split with whitespace.

    // bswap $0
    if (AsmPieces.size() == 2 &&
        (AsmPieces[0] == "bswap" ||
         AsmPieces[0] == "bswapq" ||
         AsmPieces[0] == "bswapl") &&
        (AsmPieces[1] == "$0" ||
         AsmPieces[1] == "${0:q}")) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return LowerToBSwap(CI);
    }
    // rorw $$8, ${0:w}  -->  llvm.bswap.i16
    if (CI->getType()->isInteger(16) &&
        AsmPieces.size() == 3 &&
        AsmPieces[0] == "rorw" &&
        AsmPieces[1] == "$$8," &&
        AsmPieces[2] == "${0:w}" &&
        IA->getConstraintString() == "=r,0,~{dirflag},~{fpsr},~{flags},~{cc}") {
      return LowerToBSwap(CI);
    }
    break;
  case 3:
    // i64 bswap in 32-bit mode, written via the "A" (register-pair)
    // constraint as three instructions.
    if (CI->getType()->isInteger(64) &&
        Constraints.size() >= 2 &&
        Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
        Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
      // bswap %eax / bswap %edx / xchgl %eax, %edx  -> llvm.bswap.i64
      SmallVector<StringRef, 4> Words;
      SplitString(AsmPieces[0], Words, " \t");
      if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%eax") {
        Words.clear();
        SplitString(AsmPieces[1], Words, " \t");
        if (Words.size() == 2 && Words[0] == "bswap" && Words[1] == "%edx") {
          Words.clear();
          SplitString(AsmPieces[2], Words, " \t,");
          if (Words.size() == 3 && Words[0] == "xchgl" && Words[1] == "%eax" &&
              Words[2] == "%edx") {
            return LowerToBSwap(CI);
          }
        }
      }
    }
    break;
  }
  return false;
}
+
+
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+X86TargetLowering::ConstraintType
+X86TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'A':
+      return C_Register;
+    case 'f':
+    case 'r':
+    case 'R':
+    case 'l':
+    case 'q':
+    case 'Q':
+    case 'x':
+    case 'y':
+    case 'Y':
+      return C_RegisterClass;
+    case 'e':
+    case 'Z':
+      return C_Other;
+    default:
+      break;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// LowerXConstraint - try to replace an X constraint, which matches anything,
+/// with another that has more specific requirements based on the type of the
+/// corresponding operand.
+const char *X86TargetLowering::
+LowerXConstraint(EVT ConstraintVT) const {
+  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
+  // 'f' like normal targets.
+  if (ConstraintVT.isFloatingPoint()) {
+    if (Subtarget->hasSSE2())
+      return "Y";
+    if (Subtarget->hasSSE1())
+      return "x";
+  }
+
+  return TargetLowering::LowerXConstraint(ConstraintVT);
+}
+
/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
/// vector.  If it is invalid, don't add anything to Ops.  Handles the x86
/// immediate-range constraints ('I', 'J', 'K', 'N', 'e', 'Z') and the
/// generic 'i' constraint; anything it cannot resolve is passed to the
/// target-independent implementation.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     char Constraint,
                                                     bool hasMemory,
                                                     std::vector<SDValue>&Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result(0, 0);

  switch (Constraint) {
  default: break;
  case 'I':
    // 'I': constant in [0, 31].
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // Constraint matched but value out of range: reject without falling
    // through to the generic handling.
    return;
  case 'J':
    // 'J': constant in [0, 63].
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    // 'K': signed value that fits in 8 bits (round-trips through int8_t).
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    // 'N': constant in [0, 255].
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), MVT::i64);
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      const ConstantInt *CI = C->getConstantIntValue();
      if (CI->isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                  C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), MVT::i64);
      break;
    }

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = 0;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.  Peels constant add/sub
    // layers off Op, accumulating the net displacement into Offset, until a
    // GlobalAddress is found or the expression fails to match.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget->ClassifyGlobalReference(GV,
                                                        getTargetMachine())))
      return;

    if (hasMemory)
      Op = LowerGlobalAddress(GV, Op.getDebugLoc(), Offset, DAG);
    else
      Op = DAG.getTargetGlobalAddress(GV, GA->getValueType(0), Offset);
    Result = Op;
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  // Unhandled constraint letters fall back to the generic lowering.
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, hasMemory,
                                                      Ops, DAG);
}
+
+std::vector<unsigned> X86TargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  EVT VT) const {
+  if (Constraint.size() == 1) {
+    // FIXME: not handling fp-stack yet!
+    switch (Constraint[0]) {      // GCC X86 Constraint Letters
+    default: break;  // Unknown constraint letter
+    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
+      if (Subtarget->is64Bit()) {
+        if (VT == MVT::i32)
+          return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX,
+                                       X86::ESI, X86::EDI, X86::R8D, X86::R9D,
+                                       X86::R10D,X86::R11D,X86::R12D,
+                                       X86::R13D,X86::R14D,X86::R15D,
+                                       X86::EBP, X86::ESP, 0);
+        else if (VT == MVT::i16)
+          return make_vector<unsigned>(X86::AX,  X86::DX,  X86::CX, X86::BX,
+                                       X86::SI,  X86::DI,  X86::R8W,X86::R9W,
+                                       X86::R10W,X86::R11W,X86::R12W,
+                                       X86::R13W,X86::R14W,X86::R15W,
+                                       X86::BP,  X86::SP, 0);
+        else if (VT == MVT::i8)
+          return make_vector<unsigned>(X86::AL,  X86::DL,  X86::CL, X86::BL,
+                                       X86::SIL, X86::DIL, X86::R8B,X86::R9B,
+                                       X86::R10B,X86::R11B,X86::R12B,
+                                       X86::R13B,X86::R14B,X86::R15B,
+                                       X86::BPL, X86::SPL, 0);
+
+        else if (VT == MVT::i64)
+          return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX,
+                                       X86::RSI, X86::RDI, X86::R8,  X86::R9,
+                                       X86::R10, X86::R11, X86::R12,
+                                       X86::R13, X86::R14, X86::R15,
+                                       X86::RBP, X86::RSP, 0);
+
+        break;
+      }
+      // 32-bit fallthrough
+    case 'Q':   // Q_REGS
+      if (VT == MVT::i32)
+        return make_vector<unsigned>(X86::EAX, X86::EDX, X86::ECX, X86::EBX, 0);
+      else if (VT == MVT::i16)
+        return make_vector<unsigned>(X86::AX, X86::DX, X86::CX, X86::BX, 0);
+      else if (VT == MVT::i8)
+        return make_vector<unsigned>(X86::AL, X86::DL, X86::CL, X86::BL, 0);
+      else if (VT == MVT::i64)
+        return make_vector<unsigned>(X86::RAX, X86::RDX, X86::RCX, X86::RBX, 0);
+      break;
+    }
+  }
+
+  return std::vector<unsigned>();
+}
+
/// getRegForInlineAsmConstraint - Map an inline-asm register constraint to a
/// (physical register, register class) pair.  Single-letter class constraints
/// are handled directly; explicit register names are resolved by the generic
/// TargetLowering and then corrected here for x86 quirks (FP stack names,
/// EFLAGS, the 'A' pair, and 16-bit GCC register names used at other widths).
std::pair<unsigned, const TargetRegisterClass*>
X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
                                                EVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8RegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16RegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32RegisterClass);
      return std::make_pair(0U, X86::GR64RegisterClass);
    case 'R':   // LEGACY_REGS
      // Same as 'r'/'l' but restricted to the _NOREX classes.
      if (VT == MVT::i8)
        return std::make_pair(0U, X86::GR8_NOREXRegisterClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, X86::GR16_NOREXRegisterClass);
      if (VT == MVT::i32 || !Subtarget->is64Bit())
        return std::make_pair(0U, X86::GR32_NOREXRegisterClass);
      return std::make_pair(0U, X86::GR64_NOREXRegisterClass);
    case 'f':  // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP32RegisterClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, X86::RFP64RegisterClass);
      return std::make_pair(0U, X86::RFP80RegisterClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget->hasMMX()) break;
      return std::make_pair(0U, X86::VR64RegisterClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget->hasSSE2()) break;
      // FALL THROUGH.
    case 'x':   // SSE_REGS if SSE1 allowed
      if (!Subtarget->hasSSE1()) break;

      switch (VT.getSimpleVT().SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        return std::make_pair(0U, X86::FR32RegisterClass);
      case MVT::f64:
      case MVT::i64:
        return std::make_pair(0U, X86::FR64RegisterClass);
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        return std::make_pair(0U, X86::VR128RegisterClass);
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);

  // Not found as a standard register?
  if (Res.second == 0) {
    // Map st(0) -> st(7) -> ST0
    // The generic mapper doesn't know the "{st(N)}" spelling; match it
    // character by character (case-insensitive on the letters).
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {

      Res.first = X86::ST0+Constraint[4]-'0';
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::ST0;
      Res.second = X86::RFP80RegisterClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = X86::CCRRegisterClass;
      return Res;
    }

    // 'A' means EAX + EDX.
    if (Constraint == "A") {
      Res.first = X86::EAX;
      Res.second = X86::GR32_ADRegisterClass;
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type.  For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  if (Res.second->hasType(VT))
    return Res;   // Correct type already, nothing to do.

  // All of the single-register GCC register classes map their values onto
  // 16-bit register pieces "ax","dx","cx","bx","si","di","bp","sp".  If we
  // really want an 8-bit or 32-bit register, map to the appropriate register
  // class and return the appropriate register.
  if (Res.second == X86::GR16RegisterClass) {
    if (VT == MVT::i8) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::AL; break;
      case X86::DX: DestReg = X86::DL; break;
      case X86::CX: DestReg = X86::CL; break;
      case X86::BX: DestReg = X86::BL; break;
      }
      // DestReg == 0 means there is no 8-bit alias (e.g. SI); leave the
      // original mapping in place.
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR8RegisterClass;
      }
    } else if (VT == MVT::i32) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::EAX; break;
      case X86::DX: DestReg = X86::EDX; break;
      case X86::CX: DestReg = X86::ECX; break;
      case X86::BX: DestReg = X86::EBX; break;
      case X86::SI: DestReg = X86::ESI; break;
      case X86::DI: DestReg = X86::EDI; break;
      case X86::BP: DestReg = X86::EBP; break;
      case X86::SP: DestReg = X86::ESP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR32RegisterClass;
      }
    } else if (VT == MVT::i64) {
      unsigned DestReg = 0;
      switch (Res.first) {
      default: break;
      case X86::AX: DestReg = X86::RAX; break;
      case X86::DX: DestReg = X86::RDX; break;
      case X86::CX: DestReg = X86::RCX; break;
      case X86::BX: DestReg = X86::RBX; break;
      case X86::SI: DestReg = X86::RSI; break;
      case X86::DI: DestReg = X86::RDI; break;
      case X86::BP: DestReg = X86::RBP; break;
      case X86::SP: DestReg = X86::RSP; break;
      }
      if (DestReg) {
        Res.first = DestReg;
        Res.second = X86::GR64RegisterClass;
      }
    }
  } else if (Res.second == X86::FR32RegisterClass ||
             Res.second == X86::FR64RegisterClass ||
             Res.second == X86::VR128RegisterClass) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class.  This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.
    if (VT == MVT::f32)
      Res.second = X86::FR32RegisterClass;
    else if (VT == MVT::f64)
      Res.second = X86::FR64RegisterClass;
    else if (X86::VR128RegisterClass->hasType(VT))
      Res.second = X86::VR128RegisterClass;
  }

  return Res;
}
+
+//===----------------------------------------------------------------------===//
+//                           X86 Widen vector type
+//===----------------------------------------------------------------------===//
+
+/// getWidenVectorType: given a vector type, returns the type to widen
+/// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
+/// If there is no vector type that we want to widen to, returns MVT::Other
+/// When and where to widen is target dependent based on the cost of
+/// scalarizing vs using the wider vector type.
+
+EVT X86TargetLowering::getWidenVectorType(EVT VT) const {
+  assert(VT.isVector());
+  if (isTypeLegal(VT))
+    return VT;
+
+  // TODO: In computeRegisterProperty, we can compute the list of legal vector
+  //       type based on element type.  This would speed up our search (though
+  //       it may not be worth it since the size of the list is relatively
+  //       small).
+  EVT EltVT = VT.getVectorElementType();
+  unsigned NElts = VT.getVectorNumElements();
+
+  // On X86, it make sense to widen any vector wider than 1
+  if (NElts <= 1)
+    return MVT::Other;
+
+  for (unsigned nVT = MVT::FIRST_VECTOR_VALUETYPE;
+       nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+    EVT SVT = (MVT::SimpleValueType)nVT;
+
+    if (isTypeLegal(SVT) &&
+        SVT.getVectorElementType() == EltVT &&
+        SVT.getVectorNumElements() > NElts)
+      return SVT;
+  }
+  return MVT::Other;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
new file mode 100644
index 0000000..193ef05
--- /dev/null
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -0,0 +1,817 @@
+//===-- X86ISelLowering.h - X86 DAG Lowering Interface ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that X86 uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86ISELLOWERING_H
+#define X86ISELLOWERING_H
+
+#include "X86Subtarget.h"
+#include "X86RegisterInfo.h"
+#include "X86MachineFunctionInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+
+namespace llvm {
  namespace X86ISD {
    // X86 Specific DAG Nodes
    enum NodeType {
      // Start the numbering where the builtin ops leave off.
      FIRST_NUMBER = ISD::BUILTIN_OP_END,

      /// BSF - Bit scan forward.
      /// BSR - Bit scan reverse.
      BSF,
      BSR,

      /// SHLD, SHRD - Double shift instructions. These correspond to
      /// X86::SHLDxx and X86::SHRDxx instructions.
      SHLD,
      SHRD,

      /// FAND - Bitwise logical AND of floating point values. This corresponds
      /// to X86::ANDPS or X86::ANDPD.
      FAND,

      /// FOR - Bitwise logical OR of floating point values. This corresponds
      /// to X86::ORPS or X86::ORPD.
      FOR,

      /// FXOR - Bitwise logical XOR of floating point values. This corresponds
      /// to X86::XORPS or X86::XORPD.
      FXOR,

      /// FSRL - Bitwise logical right shift of floating point values. These
      /// corresponds to X86::PSRLDQ.
      FSRL,

      /// FILD, FILD_FLAG - This instruction implements SINT_TO_FP with the
      /// integer source in memory and FP reg result.  This corresponds to the
      /// X86::FILD*m instructions. It has three inputs (token chain, address,
      /// and source type) and two outputs (FP value and token chain). FILD_FLAG
      /// also produces a flag.
      FILD,
      FILD_FLAG,

      /// FP_TO_INT*_IN_MEM - This instruction implements FP_TO_SINT with the
      /// integer destination in memory and a FP reg source.  This corresponds
      /// to the X86::FIST*m instructions and the rounding mode change stuff. It
      /// has two inputs (token chain and address) and two outputs (int value
      /// and token chain).
      FP_TO_INT16_IN_MEM,
      FP_TO_INT32_IN_MEM,
      FP_TO_INT64_IN_MEM,

      /// FLD - This instruction implements an extending load to FP stack slots.
      /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
      /// operand, ptr to load from, and a ValueType node indicating the type
      /// to load to.
      FLD,

      /// FST - This instruction implements a truncating store to FP stack
      /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
      /// chain operand, value to store, address, and a ValueType to store it
      /// as.
      FST,

      /// CALL - These operations represent an abstract X86 call
      /// instruction, which includes a bunch of information.  In particular the
      /// operands of these node are:
      ///
      ///     #0 - The incoming token chain
      ///     #1 - The callee
      ///     #2 - The number of arg bytes the caller pushes on the stack.
      ///     #3 - The number of arg bytes the callee pops off the stack.
      ///     #4 - The value to pass in AL/AX/EAX (optional)
      ///     #5 - The value to pass in DL/DX/EDX (optional)
      ///
      /// The result values of these nodes are:
      ///
      ///     #0 - The outgoing token chain
      ///     #1 - The first register result value (optional)
      ///     #2 - The second register result value (optional)
      ///
      CALL,

      /// RDTSC_DAG - This operation implements the lowering for
      /// readcyclecounter
      RDTSC_DAG,

      /// X86 compare and logical compare instructions.
      CMP, COMI, UCOMI,

      /// X86 bit-test instructions.
      BT,

      /// X86 SetCC. Operand 0 is condition code, and operand 1 is the flag
      /// operand produced by a CMP instruction.
      SETCC,

      // Same as SETCC except it's materialized with a sbb and the value is all
      // one's or all zero's.
      SETCC_CARRY,

      /// X86 conditional moves. Operand 0 and operand 1 are the two values
      /// to select from. Operand 2 is the condition code, and operand 3 is the
      /// flag operand produced by a CMP or TEST instruction. It also writes a
      /// flag result.
      CMOV,

      /// X86 conditional branches. Operand 0 is the chain operand, operand 1
      /// is the block to branch if condition is true, operand 2 is the
      /// condition code, and operand 3 is the flag operand produced by a CMP
      /// or TEST instruction.
      BRCOND,

      /// Return with a flag operand. Operand 0 is the chain operand, operand
      /// 1 is the number of bytes of stack to pop.
      RET_FLAG,

      /// REP_STOS - Repeat fill, corresponds to X86::REP_STOSx.
      REP_STOS,

      /// REP_MOVS - Repeat move, corresponds to X86::REP_MOVSx.
      REP_MOVS,

      /// GlobalBaseReg - On Darwin, this node represents the result of the popl
      /// at function entry, used for PIC code.
      GlobalBaseReg,

      /// Wrapper - A wrapper node for TargetConstantPool,
      /// TargetExternalSymbol, and TargetGlobalAddress.
      Wrapper,

      /// WrapperRIP - Special wrapper used under X86-64 PIC mode for RIP
      /// relative displacements.
      WrapperRIP,

      /// MOVQ2DQ - Copies a 64-bit value from a vector to another vector.
      /// Can be used to move a vector value from a MMX register to a XMM
      /// register.
      MOVQ2DQ,

      /// PEXTRB - Extract an 8-bit value from a vector and zero extend it to
      /// i32, corresponds to X86::PEXTRB.
      PEXTRB,

      /// PEXTRW - Extract a 16-bit value from a vector and zero extend it to
      /// i32, corresponds to X86::PEXTRW.
      PEXTRW,

      /// INSERTPS - Insert any element of a 4 x float vector into any element
      /// of a destination 4 x float vector.
      INSERTPS,

      /// PINSRB - Insert the lower 8-bits of a 32-bit value to a vector,
      /// corresponds to X86::PINSRB.
      PINSRB,

      /// PINSRW - Insert the lower 16-bits of a 32-bit value to a vector,
      /// corresponds to X86::PINSRW.
      PINSRW,

      /// PSHUFB - Shuffle 16 8-bit values within a vector.
      PSHUFB,

      /// FMAX, FMIN - Floating point max and min.
      ///
      FMAX, FMIN,

      /// FRSQRT, FRCP - Floating point reciprocal-sqrt and reciprocal
      /// approximation.  Note that these typically require refinement
      /// in order to obtain suitable precision.
      FRSQRT, FRCP,

      // TLSADDR - Thread Local Storage.
      TLSADDR,

      // SegmentBaseAddress - The address segment:0
      SegmentBaseAddress,

      // EH_RETURN - Exception Handling helpers.
      EH_RETURN,

      /// TC_RETURN - Tail call return.
      ///   operand #0 chain
      ///   operand #1 callee (register or absolute)
      ///   operand #2 stack adjustment
      ///   operand #3 optional in flag
      TC_RETURN,

      // LCMPXCHG_DAG, LCMPXCHG8_DAG - Compare and swap.
      LCMPXCHG_DAG,
      LCMPXCHG8_DAG,

      // FNSTCW16m - Store FP control word into i16 memory.
      FNSTCW16m,

      // VZEXT_MOVL - Vector move low and zero extend.
      VZEXT_MOVL,

      // VZEXT_LOAD - Load, scalar_to_vector, and zero extend.
      VZEXT_LOAD,

      // VSHL, VSRL - Vector logical left / right shift.
      VSHL, VSRL,

      // CMPPD, CMPPS - Vector double/float comparison.
      CMPPD, CMPPS,

      // PCMP* - Vector integer comparisons.
      PCMPEQB, PCMPEQW, PCMPEQD, PCMPEQQ,
      PCMPGTB, PCMPGTW, PCMPGTD, PCMPGTQ,

      // ADD, SUB, SMUL, UMUL, etc. - Arithmetic operations with FLAGS results.
      ADD, SUB, SMUL, UMUL,
      INC, DEC, OR, XOR, AND,

      // MUL_IMM - X86 specific multiply by immediate.
      MUL_IMM,

      // PTEST - Vector bitwise comparisons
      PTEST,

      // VASTART_SAVE_XMM_REGS - Save xmm argument registers to the stack,
      // according to %al. An operator is needed so that this can be expanded
      // with control flow.
      VASTART_SAVE_XMM_REGS,

      // ATOMADD64_DAG, ATOMSUB64_DAG, ATOMOR64_DAG, ATOMAND64_DAG,
      // ATOMXOR64_DAG, ATOMNAND64_DAG, ATOMSWAP64_DAG -
      // Atomic 64-bit binary operations.  These start at
      // FIRST_TARGET_MEMORY_OPCODE so they are recognized as memory nodes.
      ATOMADD64_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
      ATOMSUB64_DAG,
      ATOMOR64_DAG,
      ATOMXOR64_DAG,
      ATOMAND64_DAG,
      ATOMNAND64_DAG,
      ATOMSWAP64_DAG
    };
  }
+
  /// Define some predicates that are used for node matching.
  namespace X86 {
    /// isPSHUFDMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to PSHUFD.
    bool isPSHUFDMask(ShuffleVectorSDNode *N);

    /// isPSHUFHWMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to PSHUFHW.
    bool isPSHUFHWMask(ShuffleVectorSDNode *N);

    /// isPSHUFLWMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to PSHUFLW.
    bool isPSHUFLWMask(ShuffleVectorSDNode *N);

    /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to SHUFP*.
    bool isSHUFPMask(ShuffleVectorSDNode *N);

    /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
    bool isMOVHLPSMask(ShuffleVectorSDNode *N);

    /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
    /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
    /// <2, 3, 2, 3>
    bool isMOVHLPS_v_undef_Mask(ShuffleVectorSDNode *N);

    /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for MOVLP{S|D}.
    bool isMOVLPMask(ShuffleVectorSDNode *N);

    /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for MOVHP{S|D}
    /// as well as MOVLHPS.
    bool isMOVLHPSMask(ShuffleVectorSDNode *N);

    /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to UNPCKL.
    bool isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);

    /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to UNPCKH.
    bool isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat = false);

    /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
    /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
    /// <0, 0, 1, 1>
    bool isUNPCKL_v_undef_Mask(ShuffleVectorSDNode *N);

    /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
    /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
    /// <2, 2, 3, 3>
    bool isUNPCKH_v_undef_Mask(ShuffleVectorSDNode *N);

    /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to MOVSS,
    /// MOVSD, and MOVD, i.e. setting the lowest element.
    bool isMOVLMask(ShuffleVectorSDNode *N);

    /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
    bool isMOVSHDUPMask(ShuffleVectorSDNode *N);

    /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
    bool isMOVSLDUPMask(ShuffleVectorSDNode *N);

    /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to MOVDDUP.
    bool isMOVDDUPMask(ShuffleVectorSDNode *N);

    /// isPALIGNRMask - Return true if the specified VECTOR_SHUFFLE operand
    /// specifies a shuffle of elements that is suitable for input to PALIGNR.
    bool isPALIGNRMask(ShuffleVectorSDNode *N);

    /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
    /// the specified isShuffleMask VECTOR_SHUFFLE mask with PSHUF* and SHUFP*
    /// instructions.
    unsigned getShuffleSHUFImmediate(SDNode *N);

    /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
    /// the specified VECTOR_SHUFFLE mask with PSHUFHW instruction.
    unsigned getShufflePSHUFHWImmediate(SDNode *N);

    /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
    /// the specified VECTOR_SHUFFLE mask with PSHUFLW instruction.
    unsigned getShufflePSHUFLWImmediate(SDNode *N);

    /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
    /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
    unsigned getShufflePALIGNRImmediate(SDNode *N);

    /// isZeroNode - Returns true if Elt is a constant zero or a floating point
    /// constant +0.0.
    bool isZeroNode(SDValue Elt);

    /// isOffsetSuitableForCodeModel - Returns true if the given offset can
    /// fit into the displacement field of the instruction.
    bool isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
                                      bool hasSymbolicDisplacement = true);
  }
+
+  //===--------------------------------------------------------------------===//
+  //  X86TargetLowering - X86 Implementation of the TargetLowering interface
+  class X86TargetLowering : public TargetLowering {
+    int VarArgsFrameIndex;            // FrameIndex for start of varargs area.
+    int RegSaveFrameIndex;            // X86-64 vararg func register save area.
+    unsigned VarArgsGPOffset;         // X86-64 vararg func int reg offset.
+    unsigned VarArgsFPOffset;         // X86-64 vararg func fp reg offset.
+    int BytesToPopOnReturn;           // Number of arg bytes ret should pop.
+
+  public:
+    explicit X86TargetLowering(X86TargetMachine &TM);
+
+    /// getPICBaseSymbol - Return the X86-32 PIC base.
+    MCSymbol *getPICBaseSymbol(const MachineFunction *MF, MCContext &Ctx) const;
+    
+    virtual unsigned getJumpTableEncoding() const;
+
+    virtual const MCExpr *
+    LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
+                              const MachineBasicBlock *MBB, unsigned uid,
+                              MCContext &Ctx) const;
+    
+    /// getPICJumpTableRelocBase - Returns relocation base for the given PIC
+    /// jumptable.
+    virtual SDValue getPICJumpTableRelocBase(SDValue Table,
+                                             SelectionDAG &DAG) const;
+    virtual const MCExpr *
+    getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+                                 unsigned JTI, MCContext &Ctx) const;
+    
+    // Return the number of bytes that a function should pop when it returns (in
+    // addition to the space used by the return address).
+    //
+    unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+
+    /// getStackPtrReg - Return the stack pointer register we are using: either
+    /// ESP or RSP.
+    unsigned getStackPtrReg() const { return X86StackPtr; }
+
+    /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+    /// function arguments in the caller parameter area. For X86, certain
+    /// aggregates are placed at 16-byte boundaries while the rest are at
+    /// 4-byte boundaries.
+    virtual unsigned getByValTypeAlignment(const Type *Ty) const;
+
+    /// getOptimalMemOpType - Returns the target specific optimal type for load
+    /// and store operations as a result of memset, memcpy, and memmove
+    /// lowering. It returns EVT::iAny if SelectionDAG should be responsible for
+    /// determining it.
+    virtual EVT getOptimalMemOpType(uint64_t Size, unsigned Align,
+                                    bool isSrcConst, bool isSrcStr,
+                                    SelectionDAG &DAG) const;
+
+    /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+    /// unaligned memory accesses of the specified type.
+    virtual bool allowsUnalignedMemoryAccesses(EVT VT) const {
+      return true;
+    }
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    ///
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG);
+
+    
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                         MachineBasicBlock *MBB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+ 
+    /// getTargetNodeName - This method returns the name of a target specific
+    /// DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+
+    /// getSetCCResultType - Return the ISD::SETCC ValueType
+    virtual MVT::SimpleValueType getSetCCResultType(EVT VT) const;
+
+    /// computeMaskedBitsForTargetNode - Determine which of the bits specified 
+    /// in Mask are known to be either zero or one and return them in the 
+    /// KnownZero/KnownOne bitsets.
+    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
+                                                const APInt &Mask,
+                                                APInt &KnownZero, 
+                                                APInt &KnownOne,
+                                                const SelectionDAG &DAG,
+                                                unsigned Depth = 0) const;
+
+    virtual bool
+    isGAPlusOffset(SDNode *N, GlobalValue* &GA, int64_t &Offset) const;
+    
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
+
+    virtual bool ExpandInlineAsm(CallInst *CI) const;
+    
+    ConstraintType getConstraintType(const std::string &Constraint) const;
+     
+    std::vector<unsigned> 
+      getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                        EVT VT) const;
+
+    virtual const char *LowerXConstraint(EVT ConstraintVT) const;
+
+    /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+    /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
+    /// true it means one of the asm constraints of the inline asm instruction
+    /// being processed is 'm'.
+    virtual void LowerAsmOperandForConstraint(SDValue Op,
+                                              char ConstraintLetter,
+                                              bool hasMemory,
+                                              std::vector<SDValue> &Ops,
+                                              SelectionDAG &DAG) const;
+    
+    /// getRegForInlineAsmConstraint - Given a physical register constraint
+    /// (e.g. {edx}), return the register number and the register class for the
+    /// register.  This should only be used for C_Register constraints.  On
+    /// error, this returns a register number of 0.
+    std::pair<unsigned, const TargetRegisterClass*> 
+      getRegForInlineAsmConstraint(const std::string &Constraint,
+                                   EVT VT) const;
+    
+    /// isLegalAddressingMode - Return true if the addressing mode represented
+    /// by AM is legal for this target, for a load/store of the specified type.
+    virtual bool isLegalAddressingMode(const AddrMode &AM, const Type *Ty)const;
+
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On x86 it's free to truncate an i32 value in
+    /// register EAX to i16 by referencing its sub-register AX.
+    virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+
+    /// isZExtFree - Return true if any actual instruction that defines a
+    /// value of type Ty1 implicitly zero-extends the value to Ty2 in the
+    /// result register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicitly zero-extend the
+    /// result out to 64 bits.
+    virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isZExtFree(EVT VT1, EVT VT2) const;
+
+    /// isNarrowingProfitable - Return true if it's profitable to narrow
+    /// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
+    /// from i32 to i8 but not from i32 to i16.
+    virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const;
+
+    /// isFPImmLegal - Returns true if the target can instruction select the
+    /// specified FP immediate natively. If false, the legalizer will
+    /// materialize the FP immediate as a load from a constant pool.
+    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+
+    /// isShuffleMaskLegal - Targets can use this to indicate that they only
+    /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
+    /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
+    /// values are assumed to be legal.
+    virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                                    EVT VT) const;
+
+    /// isVectorClearMaskLegal - Similar to isShuffleMaskLegal. Targets can
+    /// use this to indicate if there is a suitable
+    /// VECTOR_SHUFFLE that can be used to replace a VAND with a constant
+    /// pool entry.
+    virtual bool isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
+                                        EVT VT) const;
+
+    /// ShouldShrinkFPConstant - If true, then instruction selection should
+    /// seek to shrink the FP constant of the specified type to a smaller type
+    /// in order to save space and / or reduce runtime.
+    virtual bool ShouldShrinkFPConstant(EVT VT) const {
+      // Don't shrink FP constant pool if SSE2 is available since cvtss2sd is
+      // more expensive than a straight movsd. On the other hand, it's important
+      // to shrink long double fp constant since fldt is very slow.
+      return !X86ScalarSSEf64 || VT == MVT::f80;
+    }
+    
+    virtual const X86Subtarget* getSubtarget() {
+      return Subtarget;
+    }
+
+    /// isScalarFPTypeInSSEReg - Return true if the specified scalar FP type is
+    /// computed in an SSE register, not on the X87 floating point stack.
+    bool isScalarFPTypeInSSEReg(EVT VT) const {
+      return (VT == MVT::f64 && X86ScalarSSEf64) || // f64 if SSE2 is available
+      (VT == MVT::f32 && X86ScalarSSEf32);   // f32 if SSE1 is available
+    }
+
+    /// getWidenVectorType: given a vector type, returns the type to widen
+    /// to (e.g., v7i8 to v8i8). If the vector type is legal, it returns itself.
+    /// If there is no vector type that we want to widen to, returns EVT::Other.
+    /// When and where to widen is target dependent based on the cost of
+    /// scalarizing vs using the wider vector type.
+    virtual EVT getWidenVectorType(EVT VT) const;
+
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    virtual FastISel *
+    createFastISel(MachineFunction &mf,
+                   MachineModuleInfo *mmi, DwarfWriter *dw,
+                   DenseMap<const Value *, unsigned> &,
+                   DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+                   DenseMap<const AllocaInst *, int> &
+#ifndef NDEBUG
+                   , SmallSet<Instruction*, 8> &
+#endif
+                   );
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+
+  private:
+    /// Subtarget - Keep a pointer to the X86Subtarget around so that we can
+    /// make the right decision when generating code for different targets.
+    const X86Subtarget *Subtarget;
+    const X86RegisterInfo *RegInfo;
+    const TargetData *TD;
+
+    /// X86StackPtr - X86 physical register used as stack ptr.
+    unsigned X86StackPtr;
+   
+    /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 
+    /// floating point ops.
+    /// When SSE is available, use it for f32 operations.
+    /// When SSE2 is available, use it for f64 operations.
+    bool X86ScalarSSEf32;
+    bool X86ScalarSSEf64;
+
+    /// LegalFPImmediates - A list of legal fp immediates.
+    std::vector<APFloat> LegalFPImmediates;
+
+    /// addLegalFPImmediate - Indicate that this x86 target can instruction
+    /// select the specified FP immediate natively.
+    void addLegalFPImmediate(const APFloat& Imm) {
+      LegalFPImmediates.push_back(Imm);
+    }
+
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+    SDValue LowerMemArgument(SDValue Chain,
+                             CallingConv::ID CallConv,
+                             const SmallVectorImpl<ISD::InputArg> &ArgInfo,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,  MachineFrameInfo *MFI,
+                              unsigned i);
+    SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
+                             DebugLoc dl, SelectionDAG &DAG,
+                             const CCValAssign &VA,
+                             ISD::ArgFlagsTy Flags);
+
+    // Call lowering helpers.
+
+    /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+    /// for tail call optimization. Targets which want to do tail call
+    /// optimization should implement this function.
+    bool IsEligibleForTailCallOptimization(SDValue Callee,
+                                           CallingConv::ID CalleeCC,
+                                           bool isVarArg,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                           SelectionDAG& DAG) const;
+    bool IsCalleePop(bool isVarArg, CallingConv::ID CallConv);
+    SDValue EmitTailCallLoadRetAddr(SelectionDAG &DAG, SDValue &OutRetAddr,
+                                SDValue Chain, bool IsTailCall, bool Is64Bit,
+                                int FPDiff, DebugLoc dl);
+
+    CCAssignFn *CCAssignFnForNode(CallingConv::ID CallConv) const;
+    NameDecorationStyle NameDecorationForCallConv(CallingConv::ID CallConv);
+    unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG &DAG);
+
+    std::pair<SDValue,SDValue> FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+                                               bool isSigned);
+
+    SDValue LowerAsSplatVectorLoad(SDValue SrcOp, EVT VT, DebugLoc dl,
+                                   SelectionDAG &DAG);
+    SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
+                               int64_t Offset, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerShift(SDValue Op, SelectionDAG &DAG);
+    SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
+                      SelectionDAG &DAG);
+    SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFABS(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerTRAMPOLINE(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerMUL_V2I64(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG);
+
+    SDValue LowerCMP_SWAP(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG);
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg, bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    virtual bool
+      CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+                     const SmallVectorImpl<EVT> &OutTys,
+                     const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
+                     SelectionDAG &DAG);
+
+    void ReplaceATOMIC_BINARY_64(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                                 SelectionDAG &DAG, unsigned NewOp);
+
+    SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl,
+                                    SDValue Chain,
+                                    SDValue Dst, SDValue Src,
+                                    SDValue Size, unsigned Align,
+                                    const Value *DstSV, uint64_t DstSVOff);
+    SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
+                                    SDValue Chain,
+                                    SDValue Dst, SDValue Src,
+                                    SDValue Size, unsigned Align,
+                                    bool AlwaysInline,
+                                    const Value *DstSV, uint64_t DstSVOff,
+                                    const Value *SrcSV, uint64_t SrcSVOff);
+    
+    /// Utility function to emit string processing sse4.2 instructions
+    /// that return in xmm0.
+    /// This takes the instruction to expand, the associated machine basic
+    /// block, the number of args, and whether or not the second arg is
+    /// in memory or not.
+    MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB,
+				unsigned argNum, bool inMem) const;
+
+    /// Utility function to emit atomic bitwise operations (and, or, xor).
+    /// It takes the bitwise instruction to expand, the associated machine basic
+    /// block, and the associated X86 opcodes for reg/reg and reg/imm.
+    MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
+                                                    MachineInstr *BInstr,
+                                                    MachineBasicBlock *BB,
+                                                    unsigned regOpc,
+                                                    unsigned immOpc,
+                                                    unsigned loadOpc,
+                                                    unsigned cxchgOpc,
+                                                    unsigned copyOpc,
+                                                    unsigned notOpc,
+                                                    unsigned EAXreg,
+                                                    TargetRegisterClass *RC,
+                                                    bool invSrc = false) const;
+
+    MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
+                                                    MachineInstr *BInstr,
+                                                    MachineBasicBlock *BB,
+                                                    unsigned regOpcL,
+                                                    unsigned regOpcH,
+                                                    unsigned immOpcL,
+                                                    unsigned immOpcH,
+                                                    bool invSrc = false) const;
+    
+    /// Utility function to emit atomic min and max.  It takes the min/max
+    /// instruction to expand, the associated basic block, and the associated
+    /// cmov opcode for moving the min or max value.
+    MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
+                                                          MachineBasicBlock *BB,
+                                                        unsigned cmovOpc) const;
+
+    /// Utility function to emit the xmm reg save portion of va_start.
+    MachineBasicBlock *EmitVAStartSaveXMMRegsWithCustomInserter(
+                                                   MachineInstr *BInstr,
+                                                   MachineBasicBlock *BB) const;
+
+    MachineBasicBlock *EmitLoweredSelect(MachineInstr *I,
+                                         MachineBasicBlock *BB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+    
+    /// Emit nodes that will be selected as "test Op0,Op0", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG);
+
+    /// Emit nodes that will be selected as "cmp Op0,Op1", or something
+    /// equivalent, for use with the given x86 condition code.
+    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+                    SelectionDAG &DAG);
+  };
+
+  namespace X86 { // Free-standing FastISel factory; same signature as X86TargetLowering::createFastISel.
+    FastISel *createFastISel(MachineFunction &mf,
+                           MachineModuleInfo *mmi, DwarfWriter *dw,
+                           DenseMap<const Value *, unsigned> &,
+                           DenseMap<const BasicBlock *, MachineBasicBlock *> &,
+                           DenseMap<const AllocaInst *, int> &
+#ifndef NDEBUG
+                           , SmallSet<Instruction*, 8> &
+#endif
+                           );
+  }
+}
+
+#endif    // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
new file mode 100644
index 0000000..468dd67
--- /dev/null
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -0,0 +1,2480 @@
+//====- X86Instr64bit.td - Describe X86-64 Instructions ----*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86-64 instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// 64-bits but only 32 bits are significant.
+def i64i32imm  : Operand<i64>;
+
+// 64-bits but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_pcrel : Operand<i64> {
+  let PrintMethod = "print_pcrel_imm";
+}
+
+
+// 64-bits but only 8 bits are significant.
+def i64i8imm   : Operand<i64> {
+  let ParserMatchClass = ImmSExt8AsmOperand;
+}
+
+def lea64mem : Operand<i64> { // 64-bit LEA address: base, scale, index, disp.
+  let PrintMethod = "printlea64mem";
+  let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm); // index class excludes RSP
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+def lea64_32mem : Operand<i32> { // 32-bit LEA result formed in 64-bit mode.
+  let PrintMethod = "printlea64_32mem";
+  let AsmOperandLowerMethod = "lower_lea64_32mem";
+  let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm); // index class excludes ESP
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// Complex Pattern Definitions.
+//
+def lea64addr : ComplexPattern<i64, 4, "SelectLEAAddr",
+                        [add, sub, mul, X86mul_imm, shl, or, frameindex,
+                         X86WrapperRIP], []>;
+
+def tls64addr : ComplexPattern<i64, 4, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments.
+//
+
+def i64immSExt8  : PatLeaf<(i64 imm), [{
+  // i64immSExt8 predicate - True if the 64-bit immediate fits in an 8-bit
+  // sign extended field.
+  return (int64_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+def i64immSExt32  : PatLeaf<(i64 imm), [{
+  // i64immSExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // sign extended field.
+  return (int64_t)N->getZExtValue() == (int32_t)N->getZExtValue();
+}]>;
+
+def i64immZExt32  : PatLeaf<(i64 imm), [{
+  // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
+  // zero extended field.
+  return (uint64_t)N->getZExtValue() == (uint32_t)N->getZExtValue();
+}]>;
+
+def sextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (sextloadi8 node:$ptr))>;
+def sextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (sextloadi16 node:$ptr))>;
+def sextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (sextloadi32 node:$ptr))>;
+
+def zextloadi64i1  : PatFrag<(ops node:$ptr), (i64 (zextloadi1 node:$ptr))>;
+def zextloadi64i8  : PatFrag<(ops node:$ptr), (i64 (zextloadi8 node:$ptr))>;
+def zextloadi64i16 : PatFrag<(ops node:$ptr), (i64 (zextloadi16 node:$ptr))>;
+def zextloadi64i32 : PatFrag<(ops node:$ptr), (i64 (zextloadi32 node:$ptr))>;
+
+def extloadi64i1   : PatFrag<(ops node:$ptr), (i64 (extloadi1 node:$ptr))>;
+def extloadi64i8   : PatFrag<(ops node:$ptr), (i64 (extloadi8 node:$ptr))>;
+def extloadi64i16  : PatFrag<(ops node:$ptr), (i64 (extloadi16 node:$ptr))>;
+def extloadi64i32  : PatFrag<(ops node:$ptr), (i64 (extloadi32 node:$ptr))>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[In64BitMode]>;
+def ADJCALLSTACKUP64   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[In64BitMode]>;
+}
+
+// Interrupt Instructions
+def IRET64 : RI<0xcf, RawFrm, (outs), (ins), "iret{q}", []>; // 64-bit interrupt return
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. RSP is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+              FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+              XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+      Uses = [RSP] in {
+      
+    // NOTE: this pattern doesn't match "X86call imm", because we do not know
+    // that the offset between an arbitrary immediate and the call will fit in
+    // the 32-bit pcrel field that we have.
+    def CALL64pcrel32 : Ii32<0xE8, RawFrm,
+                          (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+                          "call{q}\t$dst", []>,
+                        Requires<[In64BitMode, NotWin64]>;
+    def CALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+                          "call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
+                        Requires<[NotWin64]>;
+    def CALL64m       : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
+                          "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
+                        Requires<[NotWin64]>;
+                        
+    def FARCALL64   : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
+                         "lcall{q}\t{*}$dst", []>;
+  }
+
+  // FIXME: We need to teach codegen about single list of call-clobbered 
+  // registers.
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. RSP is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Defs = [RAX, RCX, RDX, R8, R9, R10, R11,
+              FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, EFLAGS],
+      Uses = [RSP] in {
+    def WINCALL64pcrel32 : I<0xE8, RawFrm,
+                             (outs), (ins i64i32imm_pcrel:$dst, variable_ops),
+                             "call\t$dst", []>,
+                           Requires<[IsWin64]>;
+    def WINCALL64r       : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
+                             "call\t{*}$dst",
+                             [(X86call GR64:$dst)]>, Requires<[IsWin64]>;
+    def WINCALL64m       : I<0xFF, MRM2m, (outs), 
+                             (ins i64mem:$dst, variable_ops), "call\t{*}$dst",
+                             [(X86call (loadi64 addr:$dst))]>, 
+                           Requires<[IsWin64]>;
+  }
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi64 : I<0, Pseudo, (outs), (ins i64imm:$dst, i32imm:$offset, // tail-call pseudo: immediate target
+                                         variable_ops),
+                 "#TC_RETURN $dst $offset",
+                 []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNri64 : I<0, Pseudo, (outs), (ins GR64:$dst, i32imm:$offset, // tail-call pseudo: register target
+                                         variable_ops),
+                 "#TC_RETURN $dst $offset",
+                 []>;
+
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPr64 : I<0xFF, MRM4r, (outs), (ins GR64:$dst, variable_ops),
+                   "jmp{q}\t{*}$dst  # TAILCALL",
+                   []>;     
+
+// Branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst), // NOTE(review): direct jump, yet inherits isIndirectBranch=1 from this block — confirm intended
+                       "jmp{q}\t$dst", []>;
+  def JMP64r     : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst",
+                     [(brind GR64:$dst)]>;
+  def JMP64m     : I<0xFF, MRM4m, (outs), (ins i64mem:$dst), "jmp{q}\t{*}$dst",
+                     [(brind (loadi64 addr:$dst))]>;
+  def FARJMP64   : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
+                      "ljmp{q}\t{*}$dst", []>;
+}
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN64   : I<0xC3, RawFrm, (outs), (ins GR64:$addr),
+                     "ret\t#eh_return, addr: $addr",
+                     [(X86ehret GR64:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+
+def POPCNT64rr : RI<0xB8, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                    "popcnt{q}\t{$src, $dst|$dst, $src}", []>, XS;
+def POPCNT64rm : RI<0xB8, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                    "popcnt{q}\t{$src, $dst|$dst, $src}", []>, XS;
+
+let Defs = [RBP,RSP], Uses = [RBP,RSP], mayLoad = 1, neverHasSideEffects = 1 in
+def LEAVE64  : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", []>;
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects=1 in {
+let mayLoad = 1 in {
+def POP64r   : I<0x58, AddRegFrm,
+                 (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
+def POP64rmr: I<0x8F, MRM0r, (outs GR64:$reg), (ins), "pop{q}\t$reg", []>;
+def POP64rmm: I<0x8F, MRM0m, (outs i64mem:$dst), (ins), "pop{q}\t$dst", []>;
+}
+let mayStore = 1 in {
+def PUSH64r  : I<0x50, AddRegFrm,
+                 (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+def PUSH64rmr: I<0xFF, MRM6r, (outs), (ins GR64:$reg), "push{q}\t$reg", []>;
+def PUSH64rmm: I<0xFF, MRM6m, (outs), (ins i64mem:$src), "push{q}\t$src", []>;
+}
+}
+
+let Defs = [RSP], Uses = [RSP], neverHasSideEffects = 1, mayStore = 1 in {
+def PUSH64i8   : Ii8<0x6a, RawFrm, (outs), (ins i8imm:$imm), 
+                     "push{q}\t$imm", []>;
+def PUSH64i16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), 
+                      "push{q}\t$imm", []>;
+def PUSH64i32  : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), 
+                      "push{q}\t$imm", []>;
+}
+
+let Defs = [RSP, EFLAGS], Uses = [RSP], mayLoad = 1 in
+def POPFQ    : I<0x9D, RawFrm, (outs), (ins), "popf{q}", []>, REX_W;
+let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1 in
+def PUSHFQ64   : I<0x9C, RawFrm, (outs), (ins), "pushf{q}", []>;
+
+// LEA computing a 64-bit-mode address but producing a 32-bit result
+// (lea{l}); only valid in 64-bit mode.
+def LEA64_32r : I<0x8D, MRMSrcMem,
+                  (outs GR32:$dst), (ins lea64_32mem:$src),
+                  "lea{l}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR32:$dst, lea32addr:$src)]>, Requires<[In64BitMode]>;
+
+// 64-bit LEA; rematerializable since it has no side effects and reads no
+// memory.
+let isReMaterializable = 1 in
+def LEA64r   : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
+                  "lea{q}\t{$src|$dst}, {$dst|$src}",
+                  [(set GR64:$dst, lea64addr:$src)]>;
+
+// Byte-swap; a two-address instruction ($src is tied to $dst).
+let isTwoAddress = 1 in
+def BSWAP64r : RI<0xC8, AddRegFrm, (outs GR64:$dst), (ins GR64:$src),
+                  "bswap{q}\t$dst", 
+                  [(set GR64:$dst, (bswap GR64:$src))]>, TB;
+
+// Bit scan instructions.
+// Bit-scan-forward/-reverse; each pattern also marks EFLAGS as implicitly
+// defined.
+let Defs = [EFLAGS] in {
+def BSF64rr  : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, (X86bsf GR64:$src)), (implicit EFLAGS)]>, TB;
+def BSF64rm  : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsf{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, (X86bsf (loadi64 addr:$src))),
+                   (implicit EFLAGS)]>, TB;
+
+def BSR64rr  : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, (X86bsr GR64:$src)), (implicit EFLAGS)]>, TB;
+def BSR64rm  : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                  "bsr{q}\t{$src, $dst|$dst, $src}",
+                  [(set GR64:$dst, (X86bsr (loadi64 addr:$src))),
+                   (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Repeat string ops
+// rep movsq: memcpy-style block move; selected from X86rep_movs.  The
+// implicit register operands (count in RCX, pointers in RDI/RSI) are modeled
+// via Defs/Uses.
+let Defs = [RCX,RDI,RSI], Uses = [RCX,RDI,RSI] in
+def REP_MOVSQ : RI<0xA5, RawFrm, (outs), (ins), "{rep;movsq|rep movsq}",
+                   [(X86rep_movs i64)]>, REP;
+// rep stosq: memset-style block store of RAX; selected from X86rep_stos.
+let Defs = [RCX,RDI], Uses = [RAX,RCX,RDI] in
+def REP_STOSQ : RI<0xAB, RawFrm, (outs), (ins), "{rep;stosq|rep stosq}",
+                   [(X86rep_stos i64)]>, REP;
+
+// Assembler-only string-scan/compare forms (no patterns, no implicit
+// operands modeled here).
+def SCAS64 : RI<0xAF, RawFrm, (outs), (ins), "scas{q}", []>;
+
+def CMPS64 : RI<0xA7, RawFrm, (outs), (ins), "cmps{q}", []>;
+
+// Fast system-call instructions
+def SYSEXIT64 : RI<0x35, RawFrm,
+                   (outs), (ins), "sysexit", []>, TB;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions...
+//
+
+let neverHasSideEffects = 1 in
+def MOV64rr : RI<0x89, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+let isReMaterializable = 1, isAsCheapAsAMove = 1  in {
+def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
+                    "movabs{q}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, imm:$src)]>;
+def MOV64ri32 : RIi32<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
+                      "mov{q}\t{$src, $dst|$dst, $src}",
+                      [(set GR64:$dst, i64immSExt32:$src)]>;
+}
+
+def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                     "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOV64rm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(set GR64:$dst, (load addr:$src))]>;
+
+def MOV64mr : RI<0x89, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}",
+                 [(store GR64:$src, addr:$dst)]>;
+def MOV64mi32 : RIi32<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
+                      "mov{q}\t{$src, $dst|$dst, $src}",
+                      [(store i64immSExt32:$src, addr:$dst)]>;
+
+def MOV64o8a : RIi8<0xA0, RawFrm, (outs), (ins offset8:$src),
+                      "mov{q}\t{$src, %rax|%rax, $src}", []>;
+def MOV64o64a : RIi32<0xA1, RawFrm, (outs), (ins offset64:$src),
+                       "mov{q}\t{$src, %rax|%rax, $src}", []>;
+def MOV64ao8 : RIi8<0xA2, RawFrm, (outs offset8:$dst), (ins),
+                       "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
+                       "mov{q}\t{%rax, $dst|$dst, %rax}", []>;
+
+// Moves to and from segment registers
+// (8C/8E forms; assembler/disassembler only, no patterns.)
+def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", []>;
+def MOV64ms : RI<0x8C, MRMDestMem, (outs i64mem:$dst), (ins SEGMENT_REG:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", []>;
+def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", []>;
+def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
+                 "mov{q}\t{$src, $dst|$dst, $src}", []>;
+
+// Moves to and from debug registers
+def MOV64rd : I<0x21, MRMDestReg, (outs GR64:$dst), (ins DEBUG_REG:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV64dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Moves to and from control registers
+def MOV64rc : I<0x20, MRMDestReg, (outs GR64:$dst), (ins CONTROL_REG_64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG_64:$dst), (ins GR64:$src),
+                "mov{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Sign/Zero extenders
+
+// MOVSX64rr8 always has a REX prefix and it has an 8-bit register
+// operand, which makes it a rare instruction with an 8-bit register
+// operand that can never access an h register. If support for h registers
+// were generalized, this would require a special register class.
+def MOVSX64rr8 : RI<0xBE, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR8:$src))]>, TB;
+def MOVSX64rm8 : RI<0xBE, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                    "movs{bq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i8 addr:$src))]>, TB;
+// 16-bit to 64-bit sign extension (register and load forms).
+def MOVSX64rr16: RI<0xBF, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR16:$src))]>, TB;
+def MOVSX64rm16: RI<0xBF, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                    "movs{wq|x}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i16 addr:$src))]>, TB;
+// 32-bit to 64-bit sign extension (movslq, opcode 63).
+def MOVSX64rr32: RI<0x63, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sext GR32:$src))]>;
+def MOVSX64rm32: RI<0x63, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+                    "movs{lq|xd}\t{$src, $dst|$dst, $src}",
+                    [(set GR64:$dst, (sextloadi64i32 addr:$src))]>;
+
+// movzbq and movzwq encodings for the disassembler
+// (no patterns; codegen prefers the 32-bit forms below).
+def MOVZX64rr8_Q : RI<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8:$src),
+                       "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOVZX64rm8_Q : RI<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem:$src),
+                       "movz{bq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOVZX64rr16_Q : RI<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOVZX64rm16_Q : RI<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                       "movz{wq|x}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Use movzbl instead of movzbq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+// (The empty asm string means these are printed via the 32-bit aliases.)
+def MOVZX64rr8 : I<0xB6, MRMSrcReg, (outs GR64:$dst), (ins GR8 :$src),
+                   "", [(set GR64:$dst, (zext GR8:$src))]>, TB;
+def MOVZX64rm8 : I<0xB6, MRMSrcMem, (outs GR64:$dst), (ins i8mem :$src),
+                   "", [(set GR64:$dst, (zextloadi64i8 addr:$src))]>, TB;
+// Use movzwl instead of movzwq when the destination is a register; it's
+// equivalent due to implicit zero-extending, and it has a smaller encoding.
+def MOVZX64rr16: I<0xB7, MRMSrcReg, (outs GR64:$dst), (ins GR16:$src),
+                   "", [(set GR64:$dst, (zext GR16:$src))]>, TB;
+def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
+                   "", [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
+
+// There's no movzlq instruction, but movl can be used for this purpose, using
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension, however this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
+def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
+                    "", [(set GR64:$dst, (zext GR32:$src))]>;
+def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
+                    "", [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. And x86's cmov doesn't do anything if the
+// condition is false. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// def32 matches a GR32 value whose defining node is none of the four
+// exceptions above, i.e. one whose high 32 bits are known zero.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg &&
+         N->getOpcode() != X86ISD::CMOV;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, x86_subreg_32bit)>;
+
+// Sign-extension of the accumulator; implicit operands modeled via Defs/Uses.
+let neverHasSideEffects = 1 in {
+  let Defs = [RAX], Uses = [EAX] in
+  def CDQE : RI<0x98, RawFrm, (outs), (ins),
+               "{cltq|cdqe}", []>;     // RAX = signext(EAX)
+
+  let Defs = [RAX,RDX], Uses = [RAX] in
+  def CQO  : RI<0x99, RawFrm, (outs), (ins),
+                "{cqto|cqo}", []>; // RDX:RAX = signext(RAX)
+}
+
+//===----------------------------------------------------------------------===//
+//  Arithmetic Instructions...
+//
+
+let Defs = [EFLAGS] in {
+
+// %rax-form add with a sign-extended 32-bit immediate (opcode 05); no pattern.
+def ADD64i32 : RI<0x05, RawFrm, (outs), (ins i32imm:$src),
+                  "add{q}\t{$src, %rax|%rax, $src}", []>;
+
+let isTwoAddress = 1 in {
+// These forms can be converted to LEA when the tied-operand constraint
+// cannot be satisfied.
+let isConvertibleToThreeAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Addition
+def ADD64rr    : RI<0x01, MRMDestReg, (outs GR64:$dst), 
+                    (ins GR64:$src1, GR64:$src2),
+                    "add{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (add GR64:$src1, GR64:$src2)),
+                     (implicit EFLAGS)]>;
+
+// Register-Integer Addition
+// (83 /0 with sign-extended imm8, 81 /0 with sign-extended imm32.)
+def ADD64ri8  : RIi8<0x83, MRM0r, (outs GR64:$dst), 
+                     (ins GR64:$src1, i64i8imm:$src2),
+                     "add{q}\t{$src2, $dst|$dst, $src2}",
+                     [(set GR64:$dst, (add GR64:$src1, i64immSExt8:$src2)),
+                      (implicit EFLAGS)]>;
+def ADD64ri32 : RIi32<0x81, MRM0r, (outs GR64:$dst), 
+                      (ins GR64:$src1, i64i32imm:$src2),
+                      "add{q}\t{$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (add GR64:$src1, i64immSExt32:$src2)),
+                       (implicit EFLAGS)]>;
+} // isConvertibleToThreeAddress
+
+// Register-Memory Addition
+def ADD64rm     : RI<0x03, MRMSrcMem, (outs GR64:$dst), 
+                     (ins GR64:$src1, i64mem:$src2),
+                     "add{q}\t{$src2, $dst|$dst, $src2}",
+                     [(set GR64:$dst, (add GR64:$src1, (load addr:$src2))),
+                      (implicit EFLAGS)]>;
+
+// Register-Register Addition - Equivalent to the normal rr form (ADD64rr), but
+//   differently encoded (03 /r instead of 01 /r); no pattern, used by the
+//   disassembler/assembler only.
+// Fix: this is a 64-bit add (RI form, GR64 operands), so the mnemonic suffix
+// must be {q}, not {l}, to match every other 64-bit ADD in this file.
+def ADD64mrmrr  : RI<0x03, MRMSrcReg, (outs GR64:$dst), 
+                     (ins GR64:$src1, GR64:$src2),
+                     "add{q}\t{$src2, $dst|$dst, $src2}", []>;
+
+} // isTwoAddress
+
+// Memory-Register Addition
+// (read-modify-write forms: load, add, store back to the same address).
+def ADD64mr  : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                  "add{q}\t{$src2, $dst|$dst, $src2}",
+                  [(store (add (load addr:$dst), GR64:$src2), addr:$dst),
+                   (implicit EFLAGS)]>;
+def ADD64mi8 : RIi8<0x83, MRM0m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+                    "add{q}\t{$src2, $dst|$dst, $src2}",
+                [(store (add (load addr:$dst), i64immSExt8:$src2), addr:$dst),
+                 (implicit EFLAGS)]>;
+def ADD64mi32 : RIi32<0x81, MRM0m, (outs), (ins i64mem:$dst, i64i32imm :$src2),
+                      "add{q}\t{$src2, $dst|$dst, $src2}",
+               [(store (add (load addr:$dst), i64immSExt32:$src2), addr:$dst),
+                (implicit EFLAGS)]>;
+
+let Uses = [EFLAGS] in {
+
+// %rax-form add-with-carry (opcode 15); no pattern.
+def ADC64i32 : RI<0x15, RawFrm, (outs), (ins i32imm:$src),
+                  "adc{q}\t{$src, %rax|%rax, $src}", []>;
+
+let isTwoAddress = 1 in {
+// Add-with-carry, register-register; selected from the adde node (which
+// consumes the carry flag -- note the enclosing Uses = [EFLAGS]).
+let isCommutable = 1 in
+def ADC64rr  : RI<0x11, MRMDestReg, (outs GR64:$dst), 
+                  (ins GR64:$src1, GR64:$src2),
+                  "adc{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (adde GR64:$src1, GR64:$src2))]>;
+
+// Reversed-encoding (13 /r) register-register ADC, for the disassembler.
+// Fix: the destination must be GR64, not GR32 -- both sources are GR64 and
+// the asm string uses the {q} suffix; a 32-bit destination on this 64-bit
+// adc was a typo.
+def ADC64rr_REV : RI<0x13, MRMSrcReg , (outs GR64:$dst), 
+                     (ins GR64:$src1, GR64:$src2),
+                    "adc{q}\t{$src2, $dst|$dst, $src2}", []>;
+
+// Add-with-carry: memory, imm8 and imm32 source forms (adde patterns).
+def ADC64rm  : RI<0x13, MRMSrcMem , (outs GR64:$dst), 
+                  (ins GR64:$src1, i64mem:$src2),
+                  "adc{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (adde GR64:$src1, (load addr:$src2)))]>;
+
+def ADC64ri8 : RIi8<0x83, MRM2r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i64i8imm:$src2),
+                    "adc{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (adde GR64:$src1, i64immSExt8:$src2))]>;
+def ADC64ri32 : RIi32<0x81, MRM2r, (outs GR64:$dst), 
+                      (ins GR64:$src1, i64i32imm:$src2),
+                      "adc{q}\t{$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (adde GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+// Add-with-carry, read-modify-write memory forms.
+def ADC64mr  : RI<0x11, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                  "adc{q}\t{$src2, $dst|$dst, $src2}",
+                  [(store (adde (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def ADC64mi8 : RIi8<0x83, MRM2m, (outs), (ins i64mem:$dst, i64i8imm :$src2),
+                    "adc{q}\t{$src2, $dst|$dst, $src2}",
+                 [(store (adde (load addr:$dst), i64immSExt8:$src2), 
+                  addr:$dst)]>;
+// Fix: the pattern must use i64immSExt32 to match this 81 /2 imm32 form and
+// its i64i32imm operand; the original copy-pasted i64immSExt8 from the imm8
+// form above, which mis-selected 32-bit immediates.
+def ADC64mi32 : RIi32<0x81, MRM2m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+                      "adc{q}\t{$src2, $dst|$dst, $src2}",
+                 [(store (adde (load addr:$dst), i64immSExt32:$src2), 
+                  addr:$dst)]>;
+} // Uses = [EFLAGS]
+
+// Subtraction, tied-operand forms ($src1 tied to $dst).
+let isTwoAddress = 1 in {
+// Register-Register Subtraction
+def SUB64rr  : RI<0x29, MRMDestReg, (outs GR64:$dst), 
+                  (ins GR64:$src1, GR64:$src2),
+                  "sub{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, GR64:$src2)),
+                   (implicit EFLAGS)]>;
+
+// Reversed encoding (2B /r), for the disassembler; no pattern.
+def SUB64rr_REV : RI<0x2B, MRMSrcReg, (outs GR64:$dst), 
+                     (ins GR64:$src1, GR64:$src2),
+                     "sub{q}\t{$src2, $dst|$dst, $src2}", []>;
+
+// Register-Memory Subtraction
+def SUB64rm  : RI<0x2B, MRMSrcMem, (outs GR64:$dst), 
+                  (ins GR64:$src1, i64mem:$src2),
+                  "sub{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sub GR64:$src1, (load addr:$src2))),
+                   (implicit EFLAGS)]>;
+
+// Register-Integer Subtraction
+def SUB64ri8 : RIi8<0x83, MRM5r, (outs GR64:$dst),
+                                 (ins GR64:$src1, i64i8imm:$src2),
+                    "sub{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sub GR64:$src1, i64immSExt8:$src2)),
+                     (implicit EFLAGS)]>;
+def SUB64ri32 : RIi32<0x81, MRM5r, (outs GR64:$dst),
+                                   (ins GR64:$src1, i64i32imm:$src2),
+                      "sub{q}\t{$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (sub GR64:$src1, i64immSExt32:$src2)),
+                       (implicit EFLAGS)]>;
+} // isTwoAddress
+
+// %rax-form subtract (opcode 2D); no pattern.
+def SUB64i32 : RI<0x2D, RawFrm, (outs), (ins i32imm:$src),
+                  "sub{q}\t{$src, %rax|%rax, $src}", []>;
+
+// Memory-Register Subtraction
+def SUB64mr  : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), 
+                  "sub{q}\t{$src2, $dst|$dst, $src2}",
+                  [(store (sub (load addr:$dst), GR64:$src2), addr:$dst),
+                   (implicit EFLAGS)]>;
+
+// Memory-Integer Subtraction
+def SUB64mi8 : RIi8<0x83, MRM5m, (outs), (ins i64mem:$dst, i64i8imm :$src2), 
+                    "sub{q}\t{$src2, $dst|$dst, $src2}",
+                    [(store (sub (load addr:$dst), i64immSExt8:$src2),
+                            addr:$dst),
+                     (implicit EFLAGS)]>;
+def SUB64mi32 : RIi32<0x81, MRM5m, (outs), (ins i64mem:$dst, i64i32imm:$src2),
+                      "sub{q}\t{$src2, $dst|$dst, $src2}",
+                      [(store (sub (load addr:$dst), i64immSExt32:$src2),
+                              addr:$dst),
+                       (implicit EFLAGS)]>;
+
+// Subtract-with-borrow; all forms consume the carry flag (sube patterns).
+let Uses = [EFLAGS] in {
+let isTwoAddress = 1 in {
+def SBB64rr    : RI<0x19, MRMDestReg, (outs GR64:$dst), 
+                    (ins GR64:$src1, GR64:$src2),
+                    "sbb{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sube GR64:$src1, GR64:$src2))]>;
+
+// Reversed encoding (1B /r), for the disassembler; no pattern.
+def SBB64rr_REV : RI<0x1B, MRMSrcReg, (outs GR64:$dst), 
+                     (ins GR64:$src1, GR64:$src2),
+                     "sbb{q}\t{$src2, $dst|$dst, $src2}", []>;
+                     
+def SBB64rm  : RI<0x1B, MRMSrcMem, (outs GR64:$dst), 
+                  (ins GR64:$src1, i64mem:$src2),
+                  "sbb{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (sube GR64:$src1, (load addr:$src2)))]>;
+
+def SBB64ri8 : RIi8<0x83, MRM3r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i64i8imm:$src2),
+                    "sbb{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sube GR64:$src1, i64immSExt8:$src2))]>;
+def SBB64ri32 : RIi32<0x81, MRM3r, (outs GR64:$dst), 
+                      (ins GR64:$src1, i64i32imm:$src2),
+                      "sbb{q}\t{$src2, $dst|$dst, $src2}",
+                      [(set GR64:$dst, (sube GR64:$src1, i64immSExt32:$src2))]>;
+} // isTwoAddress
+
+// %rax-form (opcode 1D); no pattern.
+def SBB64i32 : RI<0x1D, RawFrm, (outs), (ins i32imm:$src),
+                  "sbb{q}\t{$src, %rax|%rax, $src}", []>;
+
+// Read-modify-write memory forms.
+def SBB64mr  : RI<0x19, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), 
+                  "sbb{q}\t{$src2, $dst|$dst, $src2}",
+                  [(store (sube (load addr:$dst), GR64:$src2), addr:$dst)]>;
+def SBB64mi8 : RIi8<0x83, MRM3m, (outs), (ins i64mem:$dst, i64i8imm :$src2), 
+                    "sbb{q}\t{$src2, $dst|$dst, $src2}",
+               [(store (sube (load addr:$dst), i64immSExt8:$src2), addr:$dst)]>;
+def SBB64mi32 : RIi32<0x81, MRM3m, (outs), (ins i64mem:$dst, i64i32imm:$src2), 
+                      "sbb{q}\t{$src2, $dst|$dst, $src2}",
+              [(store (sube (load addr:$dst), i64immSExt32:$src2), addr:$dst)]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+// Unsigned multiplication
+// One-operand multiplies: implicit RAX input, RDX:RAX output (via Defs/Uses).
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX], neverHasSideEffects = 1 in {
+def MUL64r : RI<0xF7, MRM4r, (outs), (ins GR64:$src),
+                "mul{q}\t$src", []>;         // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
+                "mul{q}\t$src", []>;         // RAX,RDX = RAX*[mem64]
+
+// Signed multiplication
+def IMUL64r : RI<0xF7, MRM5r, (outs), (ins GR64:$src),
+                 "imul{q}\t$src", []>;         // RAX,RDX = RAX*GR64
+let mayLoad = 1 in
+def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
+                 "imul{q}\t$src", []>;         // RAX,RDX = RAX*[mem64]
+}
+
+// Two- and three-operand signed multiplies; all clobber EFLAGS.
+let Defs = [EFLAGS] in {
+let isTwoAddress = 1 in {
+let isCommutable = 1 in
+// Register-Register Signed Integer Multiplication
+def IMUL64rr : RI<0xAF, MRMSrcReg, (outs GR64:$dst),
+                                   (ins GR64:$src1, GR64:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (mul GR64:$src1, GR64:$src2)),
+                   (implicit EFLAGS)]>, TB;
+
+// Register-Memory Signed Integer Multiplication
+def IMUL64rm : RI<0xAF, MRMSrcMem, (outs GR64:$dst),
+                                   (ins GR64:$src1, i64mem:$src2),
+                  "imul{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (mul GR64:$src1, (load addr:$src2))),
+                   (implicit EFLAGS)]>, TB;
+} // isTwoAddress
+
+// Surprisingly enough, these are not two address instructions!
+// (imul r64, r/m64, imm has a distinct destination register.)
+
+// Register-Integer Signed Integer Multiplication
+def IMUL64rri8 : RIi8<0x6B, MRMSrcReg,                      // GR64 = GR64*I8
+                      (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, (mul GR64:$src1, i64immSExt8:$src2)),
+                       (implicit EFLAGS)]>;
+def IMUL64rri32 : RIi32<0x69, MRMSrcReg,                    // GR64 = GR64*I32
+                        (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
+                        "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                       [(set GR64:$dst, (mul GR64:$src1, i64immSExt32:$src2)),
+                        (implicit EFLAGS)]>;
+
+// Memory-Integer Signed Integer Multiplication
+def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem,                      // GR64 = [mem64]*I8
+                      (outs GR64:$dst), (ins i64mem:$src1, i64i8imm: $src2),
+                      "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR64:$dst, (mul (load addr:$src1),
+                                            i64immSExt8:$src2)),
+                       (implicit EFLAGS)]>;
+def IMUL64rmi32 : RIi32<0x69, MRMSrcMem,                   // GR64 = [mem64]*I32
+                        (outs GR64:$dst), (ins i64mem:$src1, i64i32imm:$src2),
+                        "imul{q}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                        [(set GR64:$dst, (mul (load addr:$src1),
+                                              i64immSExt32:$src2)),
+                         (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Unsigned division / remainder
+// Dividend is the implicit RDX:RAX pair; quotient goes to RAX, remainder to
+// RDX (modeled via Defs/Uses).
+let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in {
+// RDX:RAX/r64 = RAX,RDX
+def DIV64r : RI<0xF7, MRM6r, (outs), (ins GR64:$src),
+                "div{q}\t$src", []>;
+// Signed division / remainder
+// RDX:RAX/r64 = RAX,RDX
+def IDIV64r: RI<0xF7, MRM7r, (outs), (ins GR64:$src),
+                "idiv{q}\t$src", []>;
+let mayLoad = 1 in {
+// RDX:RAX/[mem64] = RAX,RDX
+def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
+                "div{q}\t$src", []>;
+// RDX:RAX/[mem64] = RAX,RDX
+def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
+                "idiv{q}\t$src", []>;
+}
+}
+
+// Unary instructions
+// Negate, increment and decrement; register forms are tied-operand, memory
+// forms are read-modify-write.  All clobber EFLAGS.
+let Defs = [EFLAGS], CodeSize = 2 in {
+let isTwoAddress = 1 in
+def NEG64r : RI<0xF7, MRM3r, (outs GR64:$dst), (ins GR64:$src), "neg{q}\t$dst",
+                [(set GR64:$dst, (ineg GR64:$src)),
+                 (implicit EFLAGS)]>;
+def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
+                [(store (ineg (loadi64 addr:$dst)), addr:$dst),
+                 (implicit EFLAGS)]>;
+
+// INC/DEC select from add +1 / add -1; the register forms can be turned
+// into LEA (isConvertibleToThreeAddress).
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src), "inc{q}\t$dst",
+                [(set GR64:$dst, (add GR64:$src, 1)),
+                 (implicit EFLAGS)]>;
+def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
+                [(store (add (loadi64 addr:$dst), 1), addr:$dst),
+                 (implicit EFLAGS)]>;
+
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in
+def DEC64r : RI<0xFF, MRM1r, (outs GR64:$dst), (ins GR64:$src), "dec{q}\t$dst",
+                [(set GR64:$dst, (add GR64:$src, -1)),
+                 (implicit EFLAGS)]>;
+def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+                [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+                 (implicit EFLAGS)]>;
+
+// In 64-bit mode, single byte INC and DEC cannot be encoded.
+// (The 40-4F short forms are REX prefixes there, so use the FF /0-/1
+// encodings; these defs are the 16/32-bit variants restricted to 64-bit
+// mode.)
+let isTwoAddress = 1, isConvertibleToThreeAddress = 1 in {
+// Can transform into LEA.
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src), 
+                  "inc{w}\t$dst",
+                  [(set GR16:$dst, (add GR16:$src, 1)),
+                   (implicit EFLAGS)]>,
+                OpSize, Requires<[In64BitMode]>;
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src), 
+                  "inc{l}\t$dst",
+                  [(set GR32:$dst, (add GR32:$src, 1)),
+                   (implicit EFLAGS)]>,
+                Requires<[In64BitMode]>;
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src), 
+                  "dec{w}\t$dst",
+                  [(set GR16:$dst, (add GR16:$src, -1)),
+                   (implicit EFLAGS)]>,
+                OpSize, Requires<[In64BitMode]>;
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src), 
+                  "dec{l}\t$dst",
+                  [(set GR32:$dst, (add GR32:$src, -1)),
+                   (implicit EFLAGS)]>,
+                Requires<[In64BitMode]>;
+} // isConvertibleToThreeAddress
+
+// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
+// how to unfold them.
+let isTwoAddress = 0, CodeSize = 2 in {
+  def INC64_16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+                    [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                     (implicit EFLAGS)]>,
+                  OpSize, Requires<[In64BitMode]>;
+  def INC64_32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+                    [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                     (implicit EFLAGS)]>,
+                  Requires<[In64BitMode]>;
+  def DEC64_16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+                    [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                     (implicit EFLAGS)]>,
+                  OpSize, Requires<[In64BitMode]>;
+  def DEC64_32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+                    [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                     (implicit EFLAGS)]>,
+                  Requires<[In64BitMode]>;
+}
+} // Defs = [EFLAGS], CodeSize
+
+
+let Defs = [EFLAGS] in {
+// Shift instructions
+// Shift-left: CL-count, immediate-count, and shift-by-one forms.
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHL64rCL : RI<0xD3, MRM4r, (outs GR64:$dst), (ins GR64:$src),
+                  "shl{q}\t{%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (shl GR64:$src, CL))]>;
+let isConvertibleToThreeAddress = 1 in   // Can transform into LEA.
+def SHL64ri  : RIi8<0xC1, MRM4r, (outs GR64:$dst), 
+                    (ins GR64:$src1, i8imm:$src2),
+                    "shl{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (shl GR64:$src1, (i8 imm:$src2)))]>;
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper.
+def SHL64r1  : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shl{q}\t$dst", []>;
+} // isTwoAddress
+
+// Memory (read-modify-write) shift-left forms.
+let Uses = [CL] in
+def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t{%cl, $dst|$dst, %CL}",
+                  [(store (shl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, i8imm:$src),
+                  "shl{q}\t{$src, $dst|$dst, $src}",
+                 [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
+                  "shl{q}\t$dst",
+                 [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+// Logical shift-right: CL-count, immediate-count, and shift-by-one forms.
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SHR64rCL : RI<0xD3, MRM5r, (outs GR64:$dst), (ins GR64:$src),
+                  "shr{q}\t{%cl, $dst|$dst, %CL}",
+                  [(set GR64:$dst, (srl GR64:$src, CL))]>;
+def SHR64ri : RIi8<0xC1, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i8imm:$src2),
+                  "shr{q}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR64:$dst, (srl GR64:$src1, (i8 imm:$src2)))]>;
+def SHR64r1  : RI<0xD1, MRM5r, (outs GR64:$dst), (ins GR64:$src1),
+                 "shr{q}\t$dst",
+                 [(set GR64:$dst, (srl GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
+// Memory (read-modify-write) shift-right forms.
+let Uses = [CL] in
+def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t{%cl, $dst|$dst, %CL}",
+                  [(store (srl (loadi64 addr:$dst), CL), addr:$dst)]>;
+def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, i8imm:$src),
+                  "shr{q}\t{$src, $dst|$dst, $src}",
+                 [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
+                  "shr{q}\t$dst",
+                 [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
+let isTwoAddress = 1 in {
+let Uses = [CL] in
+def SAR64rCL : RI<0xD3, MRM7r, (outs GR64:$dst), (ins GR64:$src),
+                 "sar{q}\t{%cl, $dst|$dst, %CL}",
+                 [(set GR64:$dst, (sra GR64:$src, CL))]>;
+def SAR64ri  : RIi8<0xC1, MRM7r, (outs GR64:$dst),
+                    (ins GR64:$src1, i8imm:$src2),
+                    "sar{q}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR64:$dst, (sra GR64:$src1, (i8 imm:$src2)))]>;
+def SAR64r1  : RI<0xD1, MRM7r, (outs GR64:$dst), (ins GR64:$src1),
+                 "sar{q}\t$dst",
+                 [(set GR64:$dst, (sra GR64:$src1, (i8 1)))]>;
+} // isTwoAddress
+
// Arithmetic shift right on a memory operand, in place.
let Uses = [CL] in
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), 
                 "sar{q}\t{%cl, $dst|$dst, %CL}",
                 [(store (sra (loadi64 addr:$dst), CL), addr:$dst)]>;
def SAR64mi  : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, i8imm:$src),
                    "sar{q}\t{$src, $dst|$dst, $src}",
                 [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
                  "sar{q}\t$dst",
                 [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
// Rotate instructions

// Rotate through carry (RCL/RCR). No selection patterns: the rotate amount
// includes CF, for which there is no DAG node; these exist for the
// assembler/disassembler. Only the register forms are two-address; the
// memory forms rotate a single memory operand in place, and a memory
// reference is never an instruction def, so they take (outs) empty —
// matching every other in-place memory instruction in this file.
let isTwoAddress = 1 in {
def RCL64r1 : RI<0xD1, MRM2r, (outs GR64:$dst), (ins GR64:$src),
                 "rcl{q}\t{1, $dst|$dst, 1}", []>;
let Uses = [CL] in
def RCL64rCL : RI<0xD3, MRM2r, (outs GR64:$dst), (ins GR64:$src),
                  "rcl{q}\t{%cl, $dst|$dst, %CL}", []>;
def RCL64ri : RIi8<0xC1, MRM2r, (outs GR64:$dst), (ins GR64:$src, i8imm:$cnt),
                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;

def RCR64r1 : RI<0xD1, MRM3r, (outs GR64:$dst), (ins GR64:$src),
                 "rcr{q}\t{1, $dst|$dst, 1}", []>;
let Uses = [CL] in
def RCR64rCL : RI<0xD3, MRM3r, (outs GR64:$dst), (ins GR64:$src),
                  "rcr{q}\t{%cl, $dst|$dst, %CL}", []>;
def RCR64ri : RIi8<0xC1, MRM3r, (outs GR64:$dst), (ins GR64:$src, i8imm:$cnt),
                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
} // isTwoAddress

def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
                 "rcl{q}\t{1, $dst|$dst, 1}", []>;
let Uses = [CL] in
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
                  "rcl{q}\t{%cl, $dst|$dst, %CL}", []>;
def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, i8imm:$cnt),
                   "rcl{q}\t{$cnt, $dst|$dst, $cnt}", []>;

def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
                 "rcr{q}\t{1, $dst|$dst, 1}", []>;
let Uses = [CL] in
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
                  "rcr{q}\t{%cl, $dst|$dst, %CL}", []>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, i8imm:$cnt),
                   "rcr{q}\t{$cnt, $dst|$dst, $cnt}", []>;
+
// Rotate left, register forms (two-address).
let isTwoAddress = 1 in {
let Uses = [CL] in
def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src),
                  "rol{q}\t{%cl, $dst|$dst, %CL}",
                  [(set GR64:$dst, (rotl GR64:$src, CL))]>;
def ROL64ri  : RIi8<0xC1, MRM0r, (outs GR64:$dst), 
                    (ins GR64:$src1, i8imm:$src2),
                    "rol{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
def ROL64r1  : RI<0xD1, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
                  "rol{q}\t$dst",
                  [(set GR64:$dst, (rotl GR64:$src1, (i8 1)))]>;
} // isTwoAddress
+
// Rotate left on a memory operand, in place.
let Uses = [CL] in
def ROL64mCL :  RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
                   "rol{q}\t{%cl, $dst|$dst, %CL}",
                   [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)]>;
def ROL64mi  : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, i8imm:$src),
                    "rol{q}\t{$src, $dst|$dst, $src}",
                [(store (rotl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def ROL64m1  : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
                 "rol{q}\t$dst",
               [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
// Rotate right, register forms (two-address).
let isTwoAddress = 1 in {
let Uses = [CL] in
def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src),
                  "ror{q}\t{%cl, $dst|$dst, %CL}",
                  [(set GR64:$dst, (rotr GR64:$src, CL))]>;
def ROR64ri  : RIi8<0xC1, MRM1r, (outs GR64:$dst), 
                    (ins GR64:$src1, i8imm:$src2),
                    "ror{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
def ROR64r1  : RI<0xD1, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
                  "ror{q}\t$dst",
                  [(set GR64:$dst, (rotr GR64:$src1, (i8 1)))]>;
} // isTwoAddress
+
// Rotate right on a memory operand, in place.
let Uses = [CL] in
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), 
                  "ror{q}\t{%cl, $dst|$dst, %CL}",
                  [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)]>;
def ROR64mi  : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, i8imm:$src),
                    "ror{q}\t{$src, $dst|$dst, $src}",
                [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
def ROR64m1  : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
                 "ror{q}\t$dst",
               [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)]>;
+
// Double shift instructions (generalizations of rotate)
// SHLD/SHRD, selected via the X86shld/X86shrd DAG nodes; count is CL or an
// imm8 (TB-prefixed 0xA4/0xA5/0xAC/0xAD). The rri8 forms are flagged
// commutable but the FIXME below notes commuteInstruction still needs the
// matching update.
let isTwoAddress = 1 in {
let Uses = [CL] in {
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst), 
                    (ins GR64:$src1, GR64:$src2),
                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
                    [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>, 
                    TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst), 
                    (ins GR64:$src1, GR64:$src2),
                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
                    [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>, 
                    TB;
}

let isCommutable = 1 in {  // FIXME: Update X86InstrInfo::commuteInstruction
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
                      (outs GR64:$dst), 
                      (ins GR64:$src1, GR64:$src2, i8imm:$src3),
                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
                                       (i8 imm:$src3)))]>,
                 TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
                      (outs GR64:$dst), 
                      (ins GR64:$src1, GR64:$src2, i8imm:$src3),
                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
                                       (i8 imm:$src3)))]>,
                 TB;
} // isCommutable
} // isTwoAddress
+
// Double shifts with a memory destination, in place.
let Uses = [CL] in {
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                    "shld{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
                    [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
                      addr:$dst)]>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
                    "shrd{q}\t{%cl, $src2, $dst|$dst, $src2, %CL}",
                    [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
                      addr:$dst)]>, TB;
}
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
                      (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
                      "shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
                                       (i8 imm:$src3)), addr:$dst)]>,
                 TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem, 
                      (outs), (ins i64mem:$dst, GR64:$src2, i8imm:$src3),
                      "shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                      [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
                                       (i8 imm:$src3)), addr:$dst)]>,
                 TB;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+//  Logical Instructions...
+//
+
+let isTwoAddress = 1 , AddedComplexity = 15 in
+def NOT64r : RI<0xF7, MRM2r, (outs GR64:$dst), (ins GR64:$src), "not{q}\t$dst",
+                [(set GR64:$dst, (not GR64:$src))]>;
+def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
+                [(store (not (loadi64 addr:$dst)), addr:$dst)]>;
+
let Defs = [EFLAGS] in {
// AND of RAX with a sign-extended imm32 (short RAX form, 0x25).
// Declared RIi32 — not RI — so the 32-bit immediate field is actually
// emitted, matching the sibling OR64i32/XOR64i32 defs below.
def AND64i32 : RIi32<0x25, RawFrm, (outs), (ins i32imm:$src),
                     "and{q}\t{$src, %rax|%rax, $src}", []>;
+
// AND, register-destination forms (two-address). Each pattern also lists
// the implicit EFLAGS result so flag consumers can match it.
let isTwoAddress = 1 in {
let isCommutable = 1 in
def AND64rr  : RI<0x21, MRMDestReg, 
                  (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                  "and{q}\t{$src2, $dst|$dst, $src2}",
                  [(set GR64:$dst, (and GR64:$src1, GR64:$src2)),
                   (implicit EFLAGS)]>;
// Alternate 0x23 encoding of and reg,reg (operands swapped in ModRM);
// no selection pattern.
def AND64rr_REV : RI<0x23, MRMSrcReg, (outs GR64:$dst), 
                     (ins GR64:$src1, GR64:$src2),
                     "and{q}\t{$src2, $dst|$dst, $src2}", []>;
def AND64rm  : RI<0x23, MRMSrcMem,
                  (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                  "and{q}\t{$src2, $dst|$dst, $src2}",
                  [(set GR64:$dst, (and GR64:$src1, (load addr:$src2))),
                   (implicit EFLAGS)]>;
def AND64ri8 : RIi8<0x83, MRM4r, 
                    (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
                    "and{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (and GR64:$src1, i64immSExt8:$src2)),
                     (implicit EFLAGS)]>;
def AND64ri32  : RIi32<0x81, MRM4r, 
                       (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2),
                       "and{q}\t{$src2, $dst|$dst, $src2}",
                       [(set GR64:$dst, (and GR64:$src1, i64immSExt32:$src2)),
                        (implicit EFLAGS)]>;
} // isTwoAddress
+
// AND with a memory destination, in place.
def AND64mr  : RI<0x21, MRMDestMem,
                  (outs), (ins i64mem:$dst, GR64:$src),
                  "and{q}\t{$src, $dst|$dst, $src}",
                  [(store (and (load addr:$dst), GR64:$src), addr:$dst),
                   (implicit EFLAGS)]>;
def AND64mi8 : RIi8<0x83, MRM4m,
                    (outs), (ins i64mem:$dst, i64i8imm :$src),
                    "and{q}\t{$src, $dst|$dst, $src}",
                 [(store (and (load addr:$dst), i64immSExt8:$src), addr:$dst),
                  (implicit EFLAGS)]>;
def AND64mi32  : RIi32<0x81, MRM4m,
                       (outs), (ins i64mem:$dst, i64i32imm:$src),
                       "and{q}\t{$src, $dst|$dst, $src}",
             [(store (and (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
              (implicit EFLAGS)]>;
+
// OR, register-destination forms (two-address).
let isTwoAddress = 1 in {
let isCommutable = 1 in
def OR64rr   : RI<0x09, MRMDestReg, (outs GR64:$dst), 
                  (ins GR64:$src1, GR64:$src2),
                  "or{q}\t{$src2, $dst|$dst, $src2}",
                  [(set GR64:$dst, (or GR64:$src1, GR64:$src2)),
                   (implicit EFLAGS)]>;
// Alternate 0x0B encoding of or reg,reg; no selection pattern.
def OR64rr_REV : RI<0x0B, MRMSrcReg, (outs GR64:$dst), 
                    (ins GR64:$src1, GR64:$src2),
                    "or{q}\t{$src2, $dst|$dst, $src2}", []>;
def OR64rm   : RI<0x0B, MRMSrcMem , (outs GR64:$dst),
                  (ins GR64:$src1, i64mem:$src2),
                  "or{q}\t{$src2, $dst|$dst, $src2}",
                  [(set GR64:$dst, (or GR64:$src1, (load addr:$src2))),
                   (implicit EFLAGS)]>;
def OR64ri8  : RIi8<0x83, MRM1r, (outs GR64:$dst),
                    (ins GR64:$src1, i64i8imm:$src2),
                    "or{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (or GR64:$src1, i64immSExt8:$src2)),
                    (implicit EFLAGS)]>;
def OR64ri32 : RIi32<0x81, MRM1r, (outs GR64:$dst),
                     (ins GR64:$src1, i64i32imm:$src2),
                     "or{q}\t{$src2, $dst|$dst, $src2}",
                  [(set GR64:$dst, (or GR64:$src1, i64immSExt32:$src2)),
                    (implicit EFLAGS)]>;
} // isTwoAddress
+
// OR with a memory destination, in place.
def OR64mr : RI<0x09, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                "or{q}\t{$src, $dst|$dst, $src}",
                [(store (or (load addr:$dst), GR64:$src), addr:$dst),
                 (implicit EFLAGS)]>;
def OR64mi8  : RIi8<0x83, MRM1m, (outs), (ins i64mem:$dst, i64i8imm:$src),
                    "or{q}\t{$src, $dst|$dst, $src}",
                  [(store (or (load addr:$dst), i64immSExt8:$src), addr:$dst),
                   (implicit EFLAGS)]>;
def OR64mi32 : RIi32<0x81, MRM1m, (outs), (ins i64mem:$dst, i64i32imm:$src),
                     "or{q}\t{$src, $dst|$dst, $src}",
              [(store (or (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
               (implicit EFLAGS)]>;

// Short RAX form: OR of RAX with a sign-extended imm32.
def OR64i32 : RIi32<0x0D, RawFrm, (outs), (ins i32imm:$src),
                    "or{q}\t{$src, %rax|%rax, $src}", []>;
+
// XOR, register-destination forms (two-address).
let isTwoAddress = 1 in {
let isCommutable = 1 in
def XOR64rr  : RI<0x31, MRMDestReg,  (outs GR64:$dst), 
                  (ins GR64:$src1, GR64:$src2), 
                  "xor{q}\t{$src2, $dst|$dst, $src2}",
                  [(set GR64:$dst, (xor GR64:$src1, GR64:$src2)),
                   (implicit EFLAGS)]>;
// Alternate 0x33 encoding of xor reg,reg; no selection pattern.
def XOR64rr_REV : RI<0x33, MRMSrcReg, (outs GR64:$dst), 
                     (ins GR64:$src1, GR64:$src2),
                    "xor{q}\t{$src2, $dst|$dst, $src2}", []>;
def XOR64rm  : RI<0x33, MRMSrcMem, (outs GR64:$dst), 
                  (ins GR64:$src1, i64mem:$src2), 
                  "xor{q}\t{$src2, $dst|$dst, $src2}",
                  [(set GR64:$dst, (xor GR64:$src1, (load addr:$src2))),
                   (implicit EFLAGS)]>;
def XOR64ri8 : RIi8<0x83, MRM6r,  (outs GR64:$dst), 
                    (ins GR64:$src1, i64i8imm:$src2),
                    "xor{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (xor GR64:$src1, i64immSExt8:$src2)),
                     (implicit EFLAGS)]>;
def XOR64ri32 : RIi32<0x81, MRM6r, 
                      (outs GR64:$dst), (ins GR64:$src1, i64i32imm:$src2), 
                      "xor{q}\t{$src2, $dst|$dst, $src2}",
                      [(set GR64:$dst, (xor GR64:$src1, i64immSExt32:$src2)),
                       (implicit EFLAGS)]>;
} // isTwoAddress
+
// XOR with a memory destination, in place.
def XOR64mr  : RI<0x31, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
                  "xor{q}\t{$src, $dst|$dst, $src}",
                  [(store (xor (load addr:$dst), GR64:$src), addr:$dst),
                   (implicit EFLAGS)]>;
def XOR64mi8 : RIi8<0x83, MRM6m, (outs), (ins i64mem:$dst, i64i8imm :$src),
                    "xor{q}\t{$src, $dst|$dst, $src}",
                 [(store (xor (load addr:$dst), i64immSExt8:$src), addr:$dst),
                  (implicit EFLAGS)]>;
def XOR64mi32 : RIi32<0x81, MRM6m, (outs), (ins i64mem:$dst, i64i32imm:$src),
                      "xor{q}\t{$src, $dst|$dst, $src}",
             [(store (xor (loadi64 addr:$dst), i64immSExt32:$src), addr:$dst),
              (implicit EFLAGS)]>;
              
// Short RAX form: XOR of RAX with a sign-extended imm32.
def XOR64i32 : RIi32<0x35, RawFrm, (outs), (ins i32imm:$src),
                     "xor{q}\t{$src, %rax|%rax, $src}", []>;

} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+//  Comparison Instructions...
+//
+
+// Integer comparison
let Defs = [EFLAGS] in {
// TEST of RAX against a sign-extended imm32 (short RAX form, 0xA9).
// Declared RIi32 — not RI — so the 32-bit immediate field is actually
// emitted, matching the other short-RAX imm32 forms in this file.
def TEST64i32 : RIi32<0xa9, RawFrm, (outs), (ins i32imm:$src),
                      "test{q}\t{$src, %rax|%rax, $src}", []>;
// TEST: compute (and src1, src2) for EFLAGS only; no register result.
let isCommutable = 1 in
def TEST64rr : RI<0x85, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
                  "test{q}\t{$src2, $src1|$src1, $src2}",
                  [(X86cmp (and GR64:$src1, GR64:$src2), 0),
                   (implicit EFLAGS)]>;
def TEST64rm : RI<0x85, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
                  "test{q}\t{$src2, $src1|$src1, $src2}",
                  [(X86cmp (and GR64:$src1, (loadi64 addr:$src2)), 0),
                   (implicit EFLAGS)]>;
def TEST64ri32 : RIi32<0xF7, MRM0r, (outs),
                                        (ins GR64:$src1, i64i32imm:$src2),
                       "test{q}\t{$src2, $src1|$src1, $src2}",
                     [(X86cmp (and GR64:$src1, i64immSExt32:$src2), 0),
                      (implicit EFLAGS)]>;
def TEST64mi32 : RIi32<0xF7, MRM0m, (outs),
                                        (ins i64mem:$src1, i64i32imm:$src2),
                       "test{q}\t{$src2, $src1|$src1, $src2}",
                [(X86cmp (and (loadi64 addr:$src1), i64immSExt32:$src2), 0),
                 (implicit EFLAGS)]>;
+
+
// CMP of RAX against a sign-extended imm32 (short RAX form, 0x3D).
// Declared RIi32 — not RI — so the 32-bit immediate field is actually
// emitted, matching the other short-RAX imm32 forms in this file.
def CMP64i32 : RIi32<0x3D, RawFrm, (outs), (ins i32imm:$src),
                     "cmp{q}\t{$src, %rax|%rax, $src}", []>;
// CMP: subtract for EFLAGS only; no register result.
def CMP64rr : RI<0x39, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
                 "cmp{q}\t{$src2, $src1|$src1, $src2}",
                 [(X86cmp GR64:$src1, GR64:$src2),
                  (implicit EFLAGS)]>;
// NOTE(review): alternate 0x3B encoding of cmp reg,reg; no selection
// pattern. The "mrmrr" name looks like a typo for a _REV-style suffix
// (cf. AND64rr_REV) — confirm before renaming, since instruction names
// are referenced from C++ tables.
def CMP64mrmrr : RI<0x3B, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
                    "cmp{q}\t{$src2, $src1|$src1, $src2}", []>;
def CMP64mr : RI<0x39, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
                 "cmp{q}\t{$src2, $src1|$src1, $src2}",
                 [(X86cmp (loadi64 addr:$src1), GR64:$src2),
                   (implicit EFLAGS)]>;
def CMP64rm : RI<0x3B, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
                 "cmp{q}\t{$src2, $src1|$src1, $src2}",
                 [(X86cmp GR64:$src1, (loadi64 addr:$src2)),
                  (implicit EFLAGS)]>;
def CMP64ri8 : RIi8<0x83, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
                    "cmp{q}\t{$src2, $src1|$src1, $src2}",
                    [(X86cmp GR64:$src1, i64immSExt8:$src2),
                     (implicit EFLAGS)]>;
def CMP64ri32 : RIi32<0x81, MRM7r, (outs), (ins GR64:$src1, i64i32imm:$src2),
                      "cmp{q}\t{$src2, $src1|$src1, $src2}",
                      [(X86cmp GR64:$src1, i64immSExt32:$src2),
                       (implicit EFLAGS)]>;
def CMP64mi8 : RIi8<0x83, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
                    "cmp{q}\t{$src2, $src1|$src1, $src2}",
                    [(X86cmp (loadi64 addr:$src1), i64immSExt8:$src2),
                     (implicit EFLAGS)]>;
def CMP64mi32 : RIi32<0x81, MRM7m, (outs),
                                       (ins i64mem:$src1, i64i32imm:$src2),
                      "cmp{q}\t{$src2, $src1|$src1, $src2}",
                      [(X86cmp (loadi64 addr:$src1), i64immSExt32:$src2),
                       (implicit EFLAGS)]>;
} // Defs = [EFLAGS]
+
// Bit tests.
// TODO: BTC, BTR, and BTS
// (Note: BTC/BTR/BTS defs exist further below, but still without
// selection patterns.)
let Defs = [EFLAGS] in {
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
               "bt{q}\t{$src2, $src1|$src1, $src2}",
               [(X86bt GR64:$src1, GR64:$src2),
                (implicit EFLAGS)]>, TB;

// Unlike with the register+register form, the memory+register form of the
// bt instruction does not ignore the high bits of the index. From ISel's
// perspective, this is pretty bizarre. Disable these instructions for now.
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
               "bt{q}\t{$src2, $src1|$src1, $src2}",
//               [(X86bt (loadi64 addr:$src1), GR64:$src2),
//                (implicit EFLAGS)]
                []
                >, TB;
+
// RIi8, not Ii8: the 64-bit bt needs the REX.W prefix; without it the
// instruction encodes as a 32-bit bt and cannot address bit positions
// 32-63. Matches the R-prefixed classes used by every other 64-bit def
// in this file.
def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
                "bt{q}\t{$src2, $src1|$src1, $src2}",
                [(X86bt GR64:$src1, i64immSExt8:$src2),
                 (implicit EFLAGS)]>, TB;
// Note that these instructions don't need FastBTMem because that
// only applies when the other operand is in a register. When it's
// an immediate, bt is still fast.
// RIi8, not Ii8: the REX.W prefix is required for the 64-bit operation
// (see BT64ri8 rationale).
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
                "bt{q}\t{$src2, $src1|$src1, $src2}",
                [(X86bt (loadi64 addr:$src1), i64immSExt8:$src2),
                 (implicit EFLAGS)]>, TB;
+
// Bit test-and-complement / -reset / -set. Assembler/disassembler support
// only for now: no selection patterns are provided.
def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
                 "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
                 "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
                    "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
                    "btc{q}\t{$src2, $src1|$src1, $src2}", []>, TB;

def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
                 "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
                 "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
                    "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
                    "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;

def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
                 "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
                 "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
                    "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
                    "bts{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
} // Defs = [EFLAGS]
+
// Conditional moves
// CMOVcc: $dst = $src2 if condition cc holds, else $dst keeps the tied
// $src1 value (two-address). Flagged commutable; commuting presumably
// works by inverting the condition code — confirm in X86InstrInfo.
let Uses = [EFLAGS], isTwoAddress = 1 in {
let isCommutable = 1 in {
def CMOVB64rr : RI<0x42, MRMSrcReg,       // if <u, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovb{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                     X86_COND_B, EFLAGS))]>, TB;
def CMOVAE64rr: RI<0x43, MRMSrcReg,       // if >=u, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovae{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                     X86_COND_AE, EFLAGS))]>, TB;
def CMOVE64rr : RI<0x44, MRMSrcReg,       // if ==, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmove{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                     X86_COND_E, EFLAGS))]>, TB;
def CMOVNE64rr: RI<0x45, MRMSrcReg,       // if !=, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovne{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_NE, EFLAGS))]>, TB;
def CMOVBE64rr: RI<0x46, MRMSrcReg,       // if <=u, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovbe{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_BE, EFLAGS))]>, TB;
def CMOVA64rr : RI<0x47, MRMSrcReg,       // if >u, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmova{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_A, EFLAGS))]>, TB;
def CMOVL64rr : RI<0x4C, MRMSrcReg,       // if <s, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovl{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_L, EFLAGS))]>, TB;
def CMOVGE64rr: RI<0x4D, MRMSrcReg,       // if >=s, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovge{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_GE, EFLAGS))]>, TB;
def CMOVLE64rr: RI<0x4E, MRMSrcReg,       // if <=s, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovle{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_LE, EFLAGS))]>, TB;
def CMOVG64rr : RI<0x4F, MRMSrcReg,       // if >s, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovg{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_G, EFLAGS))]>, TB;
def CMOVS64rr : RI<0x48, MRMSrcReg,       // if signed, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovs{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_S, EFLAGS))]>, TB;
def CMOVNS64rr: RI<0x49, MRMSrcReg,       // if !signed, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovns{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_NS, EFLAGS))]>, TB;
def CMOVP64rr : RI<0x4A, MRMSrcReg,       // if parity, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovp{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_P, EFLAGS))]>, TB;
def CMOVNP64rr : RI<0x4B, MRMSrcReg,       // if !parity, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovnp{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                     X86_COND_NP, EFLAGS))]>, TB;
def CMOVO64rr : RI<0x40, MRMSrcReg,       // if overflow, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovo{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                    X86_COND_O, EFLAGS))]>, TB;
def CMOVNO64rr : RI<0x41, MRMSrcReg,       // if !overflow, GR64 = GR64
                   (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
                   "cmovno{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (X86cmov GR64:$src1, GR64:$src2,
                                     X86_COND_NO, EFLAGS))]>, TB;
} // isCommutable = 1
+
// CMOVcc with a memory source: $dst = [mem] if cc holds, else the tied
// $src1 value. Not commutable (the load is one-sided).
def CMOVB64rm : RI<0x42, MRMSrcMem,       // if <u, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovb{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                     X86_COND_B, EFLAGS))]>, TB;
def CMOVAE64rm: RI<0x43, MRMSrcMem,       // if >=u, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovae{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                     X86_COND_AE, EFLAGS))]>, TB;
def CMOVE64rm : RI<0x44, MRMSrcMem,       // if ==, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmove{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                     X86_COND_E, EFLAGS))]>, TB;
def CMOVNE64rm: RI<0x45, MRMSrcMem,       // if !=, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovne{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_NE, EFLAGS))]>, TB;
def CMOVBE64rm: RI<0x46, MRMSrcMem,       // if <=u, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovbe{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_BE, EFLAGS))]>, TB;
def CMOVA64rm : RI<0x47, MRMSrcMem,       // if >u, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmova{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_A, EFLAGS))]>, TB;
def CMOVL64rm : RI<0x4C, MRMSrcMem,       // if <s, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovl{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_L, EFLAGS))]>, TB;
def CMOVGE64rm: RI<0x4D, MRMSrcMem,       // if >=s, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovge{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_GE, EFLAGS))]>, TB;
def CMOVLE64rm: RI<0x4E, MRMSrcMem,       // if <=s, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovle{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_LE, EFLAGS))]>, TB;
def CMOVG64rm : RI<0x4F, MRMSrcMem,       // if >s, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovg{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_G, EFLAGS))]>, TB;
def CMOVS64rm : RI<0x48, MRMSrcMem,       // if signed, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovs{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_S, EFLAGS))]>, TB;
def CMOVNS64rm: RI<0x49, MRMSrcMem,       // if !signed, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovns{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_NS, EFLAGS))]>, TB;
def CMOVP64rm : RI<0x4A, MRMSrcMem,       // if parity, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovp{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_P, EFLAGS))]>, TB;
def CMOVNP64rm : RI<0x4B, MRMSrcMem,       // if !parity, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovnp{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                     X86_COND_NP, EFLAGS))]>, TB;
def CMOVO64rm : RI<0x40, MRMSrcMem,       // if overflow, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovo{q}\t{$src2, $dst|$dst, $src2}",
                   [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                    X86_COND_O, EFLAGS))]>, TB;
def CMOVNO64rm : RI<0x41, MRMSrcMem,       // if !overflow, GR64 = [mem64]
                   (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
                   "cmovno{q}\t{$src2, $dst|$dst, $src2}",
                    [(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
                                     X86_COND_NO, EFLAGS))]>, TB;
} // isTwoAddress
+
+// Use sbb to materialize carry flag into a GPR.
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+// FIXME: Change this to have encoding Pseudo when X86MCCodeEmitter replaces
+// X86CodeEmitter.
+// SETB_C64r is SBB (opcode 0x19) of a register with itself: the result is 0
+// when CF is clear and all-ones when CF is set — hence it both Uses and Defs
+// EFLAGS.
+let Defs = [EFLAGS], Uses = [EFLAGS], isCodeGenOnly = 1 in
+def SETB_C64r : RI<0x19, MRMInitReg, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+
+// An i8 setcc-of-carry that is immediately any-extended to i64 can use the
+// 64-bit materialization directly.
+def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C64r)>;
+
+//===----------------------------------------------------------------------===//
+//  Conversion Instructions...
+//
+// The plain rr/rm forms operate on scalar FP registers (FR32/FR64) and are
+// selected via fp_to_sint / sint_to_fp; the Int_* forms operate on VR128 and
+// are selected via the SSE cvt* intrinsics.  Forms with an empty pattern list
+// exist for the assembler/disassembler only.
+
+// f64 -> signed i64
+def CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+                       "cvtsd2si{q}\t{$src, $dst|$dst, $src}", []>;
+def CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src),
+                       "cvtsd2si{q}\t{$src, $dst|$dst, $src}", []>;
+def Int_CVTSD2SI64rr: RSDI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+                           "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst,
+                             (int_x86_sse2_cvtsd2si64 VR128:$src))]>;
+def Int_CVTSD2SI64rm: RSDI<0x2D, MRMSrcMem, (outs GR64:$dst), 
+                           (ins f128mem:$src),
+                           "cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst, (int_x86_sse2_cvtsd2si64
+                                             (load addr:$src)))]>;
+// Truncating (round-toward-zero) variants: opcode 0x2C instead of 0x2D.
+def CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR64:$src),
+                        "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f64mem:$src),
+                        "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def Int_CVTTSD2SI64rr: RSDI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+                            "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                            [(set GR64:$dst,
+                              (int_x86_sse2_cvttsd2si64 VR128:$src))]>;
+def Int_CVTTSD2SI64rm: RSDI<0x2C, MRMSrcMem, (outs GR64:$dst), 
+                            (ins f128mem:$src),
+                            "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+                            [(set GR64:$dst,
+                              (int_x86_sse2_cvttsd2si64
+                               (load addr:$src)))]>;
+
+// Signed i64 -> f64
+def CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                       "cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+// Intrinsic forms are two-address: $src1 supplies the untouched upper lanes.
+let isTwoAddress = 1 in {
+def Int_CVTSI2SD64rr: RSDI<0x2A, MRMSrcReg,
+                           (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+                           "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst,
+                             (int_x86_sse2_cvtsi642sd VR128:$src1,
+                              GR64:$src2))]>;
+def Int_CVTSI2SD64rm: RSDI<0x2A, MRMSrcMem,
+                           (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+                           "cvtsi2sd{q}\t{$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst,
+                             (int_x86_sse2_cvtsi642sd VR128:$src1,
+                              (loadi64 addr:$src2)))]>;
+} // isTwoAddress
+
+// Signed i64 -> f32
+def CVTSI2SS64rr: RSSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR64:$src),
+                       "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (sint_to_fp GR64:$src))]>;
+def CVTSI2SS64rm: RSSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i64mem:$src),
+                       "cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
+                       [(set FR32:$dst, (sint_to_fp (loadi64 addr:$src)))]>;
+
+let isTwoAddress = 1 in {
+  def Int_CVTSI2SS64rr : RSSI<0x2A, MRMSrcReg,
+                              (outs VR128:$dst), (ins VR128:$src1, GR64:$src2),
+                              "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+                              [(set VR128:$dst,
+                                (int_x86_sse_cvtsi642ss VR128:$src1,
+                                 GR64:$src2))]>;
+  def Int_CVTSI2SS64rm : RSSI<0x2A, MRMSrcMem,
+                              (outs VR128:$dst), 
+                              (ins VR128:$src1, i64mem:$src2),
+                              "cvtsi2ss{q}\t{$src2, $dst|$dst, $src2}",
+                              [(set VR128:$dst,
+                                (int_x86_sse_cvtsi642ss VR128:$src1,
+                                 (loadi64 addr:$src2)))]>;
+}
+
+// f32 -> signed i64
+def CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+                       "cvtss2si{q}\t{$src, $dst|$dst, $src}", []>;
+def CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+                       "cvtss2si{q}\t{$src, $dst|$dst, $src}", []>;
+def Int_CVTSS2SI64rr: RSSI<0x2D, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+                           "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst,
+                             (int_x86_sse_cvtss2si64 VR128:$src))]>;
+def Int_CVTSS2SI64rm: RSSI<0x2D, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+                           "cvtss2si{q}\t{$src, $dst|$dst, $src}",
+                           [(set GR64:$dst, (int_x86_sse_cvtss2si64
+                                             (load addr:$src)))]>;
+def CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins FR32:$src),
+                        "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst), (ins f32mem:$src),
+                        "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def Int_CVTTSS2SI64rr: RSSI<0x2C, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
+                            "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                            [(set GR64:$dst,
+                              (int_x86_sse_cvttss2si64 VR128:$src))]>;
+def Int_CVTTSS2SI64rm: RSSI<0x2C, MRMSrcMem, (outs GR64:$dst),
+                            (ins f32mem:$src),
+                            "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+                            [(set GR64:$dst,
+                              (int_x86_sse_cvttss2si64 (load addr:$src)))]>;
+                              
+// Descriptor-table support instructions
+
+// LLDT is not interpreted specially in 64-bit mode because there is no sign
+//   extension.
+// NOTE(review): the comment above says LLDT but the defs below are SLDT
+// (store local descriptor table register, 0F 00 /0) — confirm which
+// instruction the remark was meant for.
+def SLDT64r : RI<0x00, MRM0r, (outs GR64:$dst), (ins),
+                 "sldt{q}\t$dst", []>, TB;
+def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
+                 "sldt{q}\t$dst", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// We want to rewrite MOV64r0 in terms of MOV32r0, because it's sometimes a
+// smaller encoding, but doing so at isel time interferes with rematerialization
+// in the current register allocator. For now, this is rewritten when the
+// instruction is lowered to an MCInst.
+// FIXME: AddedComplexity gives this a higher priority than MOV64ri32. Remove
+// when we have a better way to specify isel priority.
+// Encoded as XOR (opcode 0x31) of the register with itself, which is why
+// EFLAGS is listed as a def.
+let Defs = [EFLAGS],
+    AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64r0   : I<0x31, MRMInitReg, (outs GR64:$dst), (ins), "",
+                 [(set GR64:$dst, 0)]>;
+
+// Materialize i64 constant where top 32-bits are zero. This could theoretically
+// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
+// that would make it more difficult to rematerialize.
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOV64ri64i32 : Ii32<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64i32imm:$src),
+                        "", [(set GR64:$dst, i64immZExt32:$src)]>;
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//===----------------------------------------------------------------------===//
+
+// All calls clobber the non-callee saved registers. RSP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+// TLS_addr64 is a pseudo that emits the fixed, linker-recognized
+// __tls_get_addr call sequence (the padding bytes keep the sequence at the
+// size the linker expects for TLS relaxation).
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+            FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    Uses = [RSP] in
+def TLS_addr64 : I<0, Pseudo, (outs), (ins lea64mem:$sym),
+                   ".byte\t0x66; "
+                   "leaq\t$sym(%rip), %rdi; "
+                   ".word\t0x6666; "
+                   "rex64; "
+                   "call\t__tls_get_addr@PLT",
+                  [(X86tlsaddr tls64addr:$sym)]>,
+                  Requires<[In64BitMode]>;
+
+// Segment-override loads: movq through %gs / %fs.  AddedComplexity makes
+// these win over the plain MOV64rm patterns for gsload/fsload addresses.
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def MOV64GSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "movq\t%gs:$src, $dst",
+                 [(set GR64:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def MOV64FSrm : RI<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "movq\t%fs:$src, $dst",
+                 [(set GR64:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// Atomic Instructions
+//===----------------------------------------------------------------------===//
+
+// lock cmpxchgq: RAX is the implicit compare value and receives the old
+// memory value; the X86cas node's final operand (8) is the access width in
+// bytes.
+let Defs = [RAX, EFLAGS], Uses = [RAX] in {
+def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
+               "lock\n\t"
+               "cmpxchgq\t$swap,$ptr",
+               [(X86cas addr:$ptr, GR64:$swap, 8)]>, TB, LOCK;
+}
+
+// $val is tied to $dst: the exchanged/old value comes back in the same
+// register that supplied the operand.
+let Constraints = "$val = $dst" in {
+let Defs = [EFLAGS] in
+def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr),
+               "lock\n\t"
+               "xadd\t$val, $ptr",
+               [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))]>,
+                TB, LOCK;
+
+// XCHG with a memory operand is implicitly locked, so no LOCK prefix here.
+def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst), 
+                  (ins GR64:$val,i64mem:$ptr),
+                  "xchg{q}\t{$val, $ptr|$ptr, $val}", 
+                  [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))]>;
+
+def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
+                  "xchg{q}\t{$val, $src|$src, $val}", []>;
+}
+
+// Assembler-only forms (no selection patterns).
+def XADD64rr  : RI<0xC1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def XADD64rm  : RI<0xC1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                   "xadd{q}\t{$src, $dst|$dst, $src}", []>, TB;
+                   
+def CMPXCHG64rr  : RI<0xB1, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def CMPXCHG64rm  : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+                      "cmpxchg{q}\t{$src, $dst|$dst, $src}", []>, TB;
+                      
+// cmpxchg16b (0F C7 /1) implicitly uses RDX:RAX as compare and RCX:RBX as
+// the replacement pair.
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
+def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
+                    "cmpxchg16b\t$dst", []>, TB;
+
+def XCHG64ar : RI<0x90, AddRegFrm, (outs), (ins GR64:$src),
+                  "xchg{q}\t{$src, %rax|%rax, $src}", []>;
+
+// Optimized codegen when the non-memory output is not used.
+// These are assembler/emission-only forms (no selection patterns); the
+// memory-destination RMW is what makes the LOCK prefix meaningful.
+let Defs = [EFLAGS] in {
+// FIXME: Use normal add / sub instructions and add lock prefix dynamically.
+// The reg->mem direction of ADD is opcode 0x01 (ADD r/m64, r64); 0x03 is the
+// mem->reg direction and cannot be used with MRMDestMem.  This matches
+// LOCK_SUB64mr below, which correctly uses SUB's 0x29 store form.
+def LOCK_ADD64mr : RI<0x01, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+                      "lock\n\t"
+                      "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+// Immediate forms: 0x83 /0 (sign-extended imm8) and 0x81 /0 (imm32).
+def LOCK_ADD64mi8 : RIi8<0x83, MRM0m, (outs),
+                                      (ins i64mem:$dst, i64i8imm :$src2),
+                    "lock\n\t"
+                    "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD64mi32 : RIi32<0x81, MRM0m, (outs),
+                                        (ins i64mem:$dst, i64i32imm :$src2),
+                      "lock\n\t"
+                      "add{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mr : RI<0x29, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2), 
+                      "lock\n\t"
+                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mi8 : RIi8<0x83, MRM5m, (outs),
+                                      (ins i64mem:$dst, i64i8imm :$src2), 
+                      "lock\n\t"
+                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB64mi32 : RIi32<0x81, MRM5m, (outs),
+                                        (ins i64mem:$dst, i64i32imm:$src2),
+                      "lock\n\t"
+                      "sub{q}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+// inc/dec m64: 0xFF /0 and /1.
+def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
+                     "lock\n\t"
+                     "inc{q}\t$dst", []>, LOCK;
+def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
+                      "lock\n\t"
+                      "dec{q}\t$dst", []>, LOCK;
+}
+// Atomic exchange, and, or, xor
+// Pseudo instructions expanded by the custom inserter
+// (usesCustomInserter = 1) into compare-exchange loops; $val is tied to
+// $dst and EFLAGS is clobbered by the expansion.
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomInserter = 1 in {
+def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+               "#ATOMAND64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>;
+def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+               "#ATOMOR64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>;
+def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+               "#ATOMXOR64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>;
+def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+               "#ATOMNAND64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>;
+def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val),
+               "#ATOMMIN64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>;
+def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+               "#ATOMMAX64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+               "#ATOMUMIN64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>;
+def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
+               "#ATOMUMAX64 PSEUDO!", 
+               [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
+}
+
+// Segmentation support instructions
+
+// i16mem operand in LAR64rm and GR32 operand in LAR64rr is not a typo.
+def LAR64rm : RI<0x02, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src), 
+                 "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LAR64rr : RI<0x02, MRMSrcReg, (outs GR64:$dst), (ins GR32:$src),
+                 "lar{q}\t{$src, $dst|$dst, $src}", []>, TB;
+                 
+// NOTE(review): LSL's memory operand is architecturally a 16-bit selector
+// (compare LAR64rm's i16mem above); i64mem here looks suspicious — confirm.
+def LSL64rm : RI<0x03, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB; 
+def LSL64rr : RI<0x03, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+                 "lsl{q}\t{$src, $dst|$dst, $src}", []>, TB;
+// Swap GS base with the KernelGSbase MSR.  The architectural mnemonic is
+// "swapgs" (Intel SDM); the previous asm string "swpgs" was a typo that no
+// assembler accepts.  NOTE(review): the encoding is 0F 01 /7 — confirm the
+// RawFrm form here emits the required ModRM byte.
+def SWPGS : I<0x01, RawFrm, (outs), (ins), "swapgs", []>, TB;
+
+// push/pop of the FS/GS segment registers (0F A0/A1 and 0F A8/A9).
+def PUSHFS64 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{q}\t%fs", []>, TB;
+def PUSHGS64 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{q}\t%gs", []>, TB;
+
+def POPFS64 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{q}\t%fs", []>, TB;
+def POPGS64 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{q}\t%gs", []>, TB;
+                 
+// Far-pointer loads: opaque80mem covers the 64-bit offset + 16-bit selector.
+def LSS64rm : RI<0xb2, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lss{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LFS64rm : RI<0xb4, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lfs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def LGS64rm : RI<0xb5, MRMSrcMem, (outs GR64:$dst), (ins opaque80mem:$src),
+                 "lgs{q}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Specialized register support
+
+// no m form encodable; use SMSW16m
+def SMSW64r : RI<0x01, MRM4r, (outs GR64:$dst), (ins), 
+                 "smsw{q}\t$dst", []>, TB;
+
+// String manipulation instructions
+
+def LODSQ : RI<0xAD, RawFrm, (outs), (ins), "lodsq", []>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable when not in small
+// code model mode, should use 'movabs'.  FIXME: This is really a hack, the
+//  'movabs' predicate should handle this sort of thing.
+// FarData => full 64-bit immediate (MOV64ri).
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri tconstpool  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri tjumptable  :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri tglobaladdr :$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri texternalsym:$dst)>, Requires<[FarData]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri tblockaddress:$dst)>, Requires<[FarData]>;
+
+// In static codegen with small code model, we can get the address of a label
+// into a register with 'movl'.  FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri64i32 should accept these.
+// SmallCode => zero-extended 32-bit immediate.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri64i32 tconstpool  :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri64i32 tjumptable  :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri64i32 tglobaladdr :$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri64i32 texternalsym:$dst)>, Requires<[SmallCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri64i32 tblockaddress:$dst)>, Requires<[SmallCode]>;
+
+// In kernel code model, we can get the address of a label
+// into a register with 'movq'.  FIXME: This is a hack, the 'imm' predicate of
+// the MOV64ri32 should accept these.
+// KernelCode => sign-extended 32-bit immediate.
+def : Pat<(i64 (X86Wrapper tconstpool  :$dst)),
+          (MOV64ri32 tconstpool  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tjumptable  :$dst)),
+          (MOV64ri32 tjumptable  :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tglobaladdr :$dst)),
+          (MOV64ri32 tglobaladdr :$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper texternalsym:$dst)),
+          (MOV64ri32 texternalsym:$dst)>, Requires<[KernelCode]>;
+def : Pat<(i64 (X86Wrapper tblockaddress:$dst)),
+          (MOV64ri32 tblockaddress:$dst)>, Requires<[KernelCode]>;
+
+// If we have small model and -static mode, it is safe to store global addresses
+// directly as immediates.  FIXME: This is really a hack, the 'imm' predicate
+// for MOV64mi32 should handle this sort of thing.
+def : Pat<(store (i64 (X86Wrapper tconstpool:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tconstpool:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tjumptable:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tjumptable:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tglobaladdr:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, texternalsym:$src)>,
+          Requires<[NearData, IsStatic]>;
+def : Pat<(store (i64 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV64mi32 addr:$dst, tblockaddress:$src)>,
+          Requires<[NearData, IsStatic]>;
+
+// Calls
+// Direct PC relative function call for small code model. 32-bit displacement
+// sign extended to 64-bit.  Win64 selects the WINCALL variant instead.
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+          (CALL64pcrel32 tglobaladdr:$dst)>, Requires<[NotWin64]>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+          (CALL64pcrel32 texternalsym:$dst)>, Requires<[NotWin64]>;
+
+def : Pat<(X86call (i64 tglobaladdr:$dst)),
+          (WINCALL64pcrel32 tglobaladdr:$dst)>, Requires<[IsWin64]>;
+def : Pat<(X86call (i64 texternalsym:$dst)),
+          (WINCALL64pcrel32 texternalsym:$dst)>, Requires<[IsWin64]>;
+
+// tailcall stuff
+def : Pat<(X86tcret GR64:$dst, imm:$off),
+          (TCRETURNri64 GR64:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>;
+
+def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off),
+          (TCRETURNdi64 texternalsym:$dst, imm:$off)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(parallel (X86cmp GR64:$src1, 0), (implicit EFLAGS)),
+          (TEST64rr GR64:$src1, GR64:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.  Each pattern matches (cmov load, reg, cc) and selects the
+// CMOV*64rm form for the inverted condition so the load stays the second
+// (memory) operand.
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_B, EFLAGS),
+          (CMOVAE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_AE, EFLAGS),
+          (CMOVB64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_E, EFLAGS),
+          (CMOVNE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NE, EFLAGS),
+          (CMOVE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_BE, EFLAGS),
+          (CMOVA64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_A, EFLAGS),
+          (CMOVBE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_L, EFLAGS),
+          (CMOVGE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_GE, EFLAGS),
+          (CMOVL64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_LE, EFLAGS),
+          (CMOVG64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_G, EFLAGS),
+          (CMOVLE64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_P, EFLAGS),
+          (CMOVNP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NP, EFLAGS),
+          (CMOVP64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_S, EFLAGS),
+          (CMOVNS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NS, EFLAGS),
+          (CMOVS64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_O, EFLAGS),
+          (CMOVNO64rm GR64:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NO, EFLAGS),
+          (CMOVO64rm GR64:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
+
+// extload
+// When extloading from 16-bit and smaller memory locations into 64-bit 
+// registers, use zero-extending loads so that the entire 64-bit register is 
+// defined, avoiding partial-register updates.
+def : Pat<(extloadi64i1 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i8 addr:$src),  (MOVZX64rm8  addr:$src)>;
+def : Pat<(extloadi64i16 addr:$src), (MOVZX64rm16 addr:$src)>;
+// For other extloads, use subregs, since the high contents of the register are
+// defined after an extload.
+def : Pat<(extloadi64i32 addr:$src),
+          (SUBREG_TO_REG (i64 0), (MOV32rm addr:$src),
+                         x86_subreg_32bit)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i64 (anyext GR8 :$src)), (MOVZX64rr8  GR8  :$src)>;
+def : Pat<(i64 (anyext GR16:$src)), (MOVZX64rr16 GR16 :$src)>;
+def : Pat<(i64 (anyext GR32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, x86_subreg_32bit)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR64:$src1, 128),
+          (SUB64ri8 GR64:$src1, -128)>;
+def : Pat<(store (add (loadi64 addr:$dst), 128), addr:$dst),
+          (SUB64mi8 addr:$dst, -128)>;
+
+// The same trick applies for 32-bit immediate fields in 64-bit
+// instructions: +2^31 doesn't fit in a sign-extended 32-bit immediate, but
+// -2^31 (0xffffffff80000000) does.
+def : Pat<(add GR64:$src1, 0x0000000080000000),
+          (SUB64ri32 GR64:$src1, 0xffffffff80000000)>;
+// The constant here must be exactly 2^31 (0x0000000080000000, 16 hex digits)
+// to mirror the register form above; the previous 17-digit literal
+// 0x00000000800000000 (= 2^35) made this pattern match the wrong add while
+// still emitting a subtract of -2^31.
+def : Pat<(store (add (loadi64 addr:$dst), 0x0000000080000000), addr:$dst),
+          (SUB64mi32 addr:$dst, 0xffffffff80000000)>;
+
+// Use a 32-bit and with implicit zero-extension instead of a 64-bit and if it
+// has an immediate with at least 32 bits of leading zeros, to avoid needing to
+// materialize that immediate in a register first.
+// (32-bit ops implicitly zero the upper 32 bits, which SUBREG_TO_REG records.)
+def : Pat<(and GR64:$src, i64immZExt32:$imm),
+          (SUBREG_TO_REG
+            (i64 0),
+            (AND32ri
+              (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit),
+              imm:$imm),
+            x86_subreg_32bit)>;
+
+// r & (2^32-1) ==> movz
+def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
+          (MOVZX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
+// r & (2^16-1) ==> movz
+def : Pat<(and GR64:$src, 0xffff),
+          (MOVZX64rr16 (i16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR64:$src, 0xff),
+          (MOVZX64rr8 (i8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)))>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR32:$src1, 0xff),
+           (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, x86_subreg_8bit))>,
+      Requires<[In64BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+           (MOVZX16rr8 (i8 (EXTRACT_SUBREG GR16:$src1, x86_subreg_8bit)))>,
+      Requires<[In64BitMode]>;
+
+// sext_inreg patterns: extract the low subreg, then movsx it back up.
+def : Pat<(sext_inreg GR64:$src, i32),
+          (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit))>;
+def : Pat<(sext_inreg GR64:$src, i16),
+          (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
+def : Pat<(sext_inreg GR64:$src, i8),
+          (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit))>,
+      Requires<[In64BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+          (MOVSX16rr8 (i8 (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)))>,
+      Requires<[In64BitMode]>;
+
+// trunc patterns: truncation is just a subregister extraction.
+def : Pat<(i32 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, x86_subreg_32bit)>;
+def : Pat<(i16 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit)>;
+def : Pat<(i8 (trunc GR64:$src)),
+          (EXTRACT_SUBREG GR64:$src, x86_subreg_8bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, x86_subreg_8bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG GR16:$src, x86_subreg_8bit)>,
+      Requires<[In64BitMode]>;
+
+// h-register tricks.
+// For now, be conservative on x86-64 and use an h-register extract only if the
+// value is immediately zero-extended or stored, which are somewhat common
+// cases. This uses a bunch of code to prevent a register requiring a REX prefix
+// from being allocated in the same instruction as the h register, as there's
+// currently no way to describe this requirement to the register allocator.
+// The COPY_TO_REGCLASS into *_ABCD classes restricts allocation to A/B/C/D,
+// the only registers with addressable high-byte subregs; the _NOREX opcodes
+// guarantee no REX prefix is emitted alongside the h-register access.
+
+// h-register extract and zero-extend.
+def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                              x86_subreg_8bit_hi)),
+            x86_subreg_32bit)>;
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            x86_subreg_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              x86_subreg_8bit_hi)),
+            x86_subreg_16bit)>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            x86_subreg_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32_NOREXrr8
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            x86_subreg_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              x86_subreg_8bit_hi)),
+            x86_subreg_32bit)>;
+def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
+          (SUBREG_TO_REG
+            (i64 0),
+            (MOVZX32_NOREXrr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              x86_subreg_8bit_hi)),
+            x86_subreg_32bit)>;
+
+// h-register extract and store.
+def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
+                            x86_subreg_8bit_hi))>;
+def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                            x86_subreg_8bit_hi))>,
+      Requires<[In64BitMode]>;
+def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
+          (MOV8mr_NOREX
+            addr:$dst,
+            (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                            x86_subreg_8bit_hi))>,
+      Requires<[In64BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+// ADD is preferred over SHL-by-1: same result, and the ADD form can be
+// selected into more flexible instructions.
+def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
+
+// (shl x (and y, 63)) ==> (shl x, y)
+// The hardware already masks the 64-bit shift count to 6 bits, so an
+// explicit (and amt, 63) is redundant and can be dropped.  CL:$amt only
+// matches when the shift amount has been placed in CL, which is the
+// register the rCL/mCL instruction forms implicitly read.
+def : Pat<(shl GR64:$src1, (and CL:$amt, 63)),
+          (SHL64rCL GR64:$src1)>;
+def : Pat<(store (shl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+          (SHL64mCL addr:$dst)>;
+
+// Same count-masking simplification for logical right shift...
+def : Pat<(srl GR64:$src1, (and CL:$amt, 63)),
+          (SHR64rCL GR64:$src1)>;
+def : Pat<(store (srl (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+          (SHR64mCL addr:$dst)>;
+
+// ...and for arithmetic right shift.
+def : Pat<(sra GR64:$src1, (and CL:$amt, 63)),
+          (SAR64rCL GR64:$src1)>;
+def : Pat<(store (sra (loadi64 addr:$dst), (and CL:$amt, 63)), addr:$dst),
+          (SAR64mCL addr:$dst)>;
+
+// Double shift patterns
+// Select the target-specific shrd/shld nodes to SHRD64/SHLD64 with an
+// immediate count.  Note only $amt1 appears in the output instruction;
+// $amt2 is matched but unused here -- presumably the node carries a
+// second (complementary) count that the immediate forms don't need.
+// TODO(review): confirm the shrd/shld node operand semantics.
+def : Pat<(shrd GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+          (SHRD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi64 addr:$dst), (i8 imm:$amt1),
+                       GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+          (SHRD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(shld GR64:$src1, (i8 imm:$amt1), GR64:$src2, (i8 imm:$amt2)),
+          (SHLD64rri8 GR64:$src1, GR64:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi64 addr:$dst), (i8 imm:$amt1),
+                       GR64:$src2, (i8 imm:$amt2)), addr:$dst),
+          (SHLD64mri8 addr:$dst, GR64:$src2, (i8 imm:$amt1))>;
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// or_is_add only matches when the operands have disjoint known bits, in
+// which case OR and ADD compute the same value; the ADD forms also define
+// EFLAGS, satisfying the (implicit EFLAGS) half of the parallel node.
+let AddedComplexity = 5 in {  // Try this before the selecting to OR
+def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (or_is_add GR64:$src1, i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+def : Pat<(parallel (or_is_add GR64:$src1, GR64:$src2),
+                    (implicit EFLAGS)),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+} // AddedComplexity
+
+// X86 specific add which produces a flag.
+// addc/subc produce a carry that a following adde/sube consumes; map them
+// onto the flag-setting ADD64/SUB64 instructions.  The immediate forms are
+// constrained to i64immSExt8/i64immSExt32 in both the source pattern and
+// the output so they only match immediates the ri8/ri32 encodings can
+// actually represent as sign-extended fields.
+def : Pat<(addc GR64:$src1, GR64:$src2),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(addc GR64:$src1, (load addr:$src2)),
+          (ADD64rm GR64:$src1, addr:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt8:$src2),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(addc GR64:$src1, i64immSExt32:$src2),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+def : Pat<(subc GR64:$src1, GR64:$src2),
+          (SUB64rr GR64:$src1, GR64:$src2)>;
+def : Pat<(subc GR64:$src1, (load addr:$src2)),
+          (SUB64rm GR64:$src1, addr:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt8:$src2),
+          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(subc GR64:$src1, i64immSExt32:$src2),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+// Each pattern matches a (parallel (X86op_flag ...), (implicit EFLAGS))
+// dag: the X86*_flag node computes the arithmetic value and the selected
+// instruction produces the flags as an implicit def of EFLAGS.
+
+// Register-Register Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, GR64:$src2),
+                    (implicit EFLAGS)),
+          (ADD64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (ADD64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86add_flag GR64:$src1, i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (ADD64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR64:$src1, (loadi64 addr:$src2)),
+                    (implicit EFLAGS)),
+          (ADD64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Addition with EFLAGS result
+// These fold a load-op-store on the same address into the mr/mi forms.
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), GR64:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi64 addr:$dst), 
+                                        i64immSExt32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, GR64:$src2),
+                    (implicit EFLAGS)),
+          (SUB64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Memory Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, (loadi64 addr:$src2)),
+                    (implicit EFLAGS)),
+          (SUB64rm GR64:$src1, addr:$src2)>;
+
+// Register-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (SUB64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR64:$src1, i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (SUB64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Memory-Register Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), GR64:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB64mr addr:$dst, GR64:$src2)>;
+
+// Memory-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst), 
+                                        i64immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi64 addr:$dst),
+                                        i64immSExt32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, GR64:$src2),
+                    (implicit EFLAGS)),
+          (IMUL64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Memory Signed Integer Multiplication with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR64:$src1, (loadi64 addr:$src2)),
+                    (implicit EFLAGS)),
+          (IMUL64rm GR64:$src1, addr:$src2)>;
+
+// Register-Integer Signed Integer Multiplication with EFLAGS result
+// Three-operand IMUL forms: dst = $src1 * imm.
+def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (IMUL64rri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag GR64:$src1, i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (IMUL64rri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Memory-Integer Signed Integer Multiplication with EFLAGS result
+// dst = load($src1) * imm; the load is folded into the instruction.
+def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (IMUL64rmi8 addr:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi64 addr:$src1), i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (IMUL64rmi32 addr:$src1, i64immSExt32:$src2)>;
+
+// INC and DEC with EFLAGS result. Note that these do not set CF.
+// The INC64_*/DEC64_* instruction forms are the variants selectable in
+// 64-bit mode (hence the Requires<[In64BitMode]> on the 16/32-bit ones);
+// the plain 64-bit INC64r/DEC64r forms need no mode predicate here.
+def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)),
+          (INC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i16 (X86inc_flag (loadi16 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (INC64_16m addr:$dst)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)),
+          (DEC64_16r GR16:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i16 (X86dec_flag (loadi16 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (DEC64_16m addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)),
+          (INC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i32 (X86inc_flag (loadi32 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (INC64_32m addr:$dst)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)),
+          (DEC64_32r GR32:$src)>, Requires<[In64BitMode]>;
+def : Pat<(parallel (store (i32 (X86dec_flag (loadi32 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (DEC64_32m addr:$dst)>, Requires<[In64BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR64:$src), (implicit EFLAGS)),
+          (INC64r GR64:$src)>;
+def : Pat<(parallel (store (i64 (X86inc_flag (loadi64 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (INC64m addr:$dst)>;
+def : Pat<(parallel (X86dec_flag GR64:$src), (implicit EFLAGS)),
+          (DEC64r GR64:$src)>;
+def : Pat<(parallel (store (i64 (X86dec_flag (loadi64 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (DEC64m addr:$dst)>;
+
+// Register-Register Logical Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR64:$src1, GR64:$src2),
+                    (implicit EFLAGS)),
+          (OR64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Logical Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (OR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86or_flag GR64:$src1, i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (OR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Logical Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR64:$src1, (loadi64 addr:$src2)),
+                    (implicit EFLAGS)),
+          (OR64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Logical Or with EFLAGS result
+// Load-or-store on the same address folds into the mr/mi forms.
+def : Pat<(parallel (store (X86or_flag (loadi64 addr:$dst), GR64:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi64 addr:$dst), i64immSExt32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Logical XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR64:$src1, GR64:$src2),
+                    (implicit EFLAGS)),
+          (XOR64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Logical XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (XOR64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86xor_flag GR64:$src1, i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (XOR64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Logical XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR64:$src1, (loadi64 addr:$src2)),
+                    (implicit EFLAGS)),
+          (XOR64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Logical XOr with EFLAGS result
+// Load-xor-store on the same address folds into the mr/mi forms.
+def : Pat<(parallel (store (X86xor_flag (loadi64 addr:$dst), GR64:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi64 addr:$dst), 
+                                        i64immSExt32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+// Register-Register Logical And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR64:$src1, GR64:$src2),
+                    (implicit EFLAGS)),
+          (AND64rr GR64:$src1, GR64:$src2)>;
+
+// Register-Integer Logical And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (AND64ri8 GR64:$src1, i64immSExt8:$src2)>;
+def : Pat<(parallel (X86and_flag GR64:$src1, i64immSExt32:$src2),
+                    (implicit EFLAGS)),
+          (AND64ri32 GR64:$src1, i64immSExt32:$src2)>;
+
+// Register-Memory Logical And with EFLAGS result
+def : Pat<(parallel (X86and_flag GR64:$src1, (loadi64 addr:$src2)),
+                    (implicit EFLAGS)),
+          (AND64rm GR64:$src1, addr:$src2)>;
+
+// Memory-Register Logical And with EFLAGS result
+// Load-and-store on the same address folds into the mr/mi forms.
+def : Pat<(parallel (store (X86and_flag (loadi64 addr:$dst), GR64:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND64mr addr:$dst, GR64:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi64 addr:$dst), i64immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND64mi8 addr:$dst, i64immSExt8:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi64 addr:$dst), 
+                                        i64immSExt32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND64mi32 addr:$dst, i64immSExt32:$src2)>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE Instructions
+//===----------------------------------------------------------------------===//
+
+// Move instructions...
+// The "mov{d|q}" asm string selects the mnemonic variant per dialect
+// (AT&T vs Intel); all of these are REX.W-prefixed 66-0F encodings (RPDI).
+
+// GR64 -> low element of an XMM register (upper element zeroed by
+// scalar_to_vector semantics).
+def MOV64toPQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst,
+                          (v2i64 (scalar_to_vector GR64:$src)))]>;
+// Element 0 of a v2i64 -> GR64.
+def MOVPQIto64rr  : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
+                         "mov{d|q}\t{$src, $dst|$dst, $src}",
+                         [(set GR64:$dst, (vector_extract (v2i64 VR128:$src),
+                                           (iPTR 0)))]>;
+
+// Bit-preserving i64 <-> f64 moves between GR64 and scalar FP registers.
+def MOV64toSDrr : RPDI<0x6E, MRMSrcReg, (outs FR64:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert GR64:$src))]>;
+def MOV64toSDrm : RPDI<0x6E, MRMSrcMem, (outs FR64:$dst), (ins i64mem:$src),
+                       "movq\t{$src, $dst|$dst, $src}",
+                       [(set FR64:$dst, (bitconvert (loadi64 addr:$src)))]>;
+
+def MOVSDto64rr  : RPDI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64:$src),
+                        "mov{d|q}\t{$src, $dst|$dst, $src}",
+                        [(set GR64:$dst, (bitconvert FR64:$src))]>;
+def MOVSDto64mr  : RPDI<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, FR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(store (i64 (bitconvert FR64:$src)), addr:$dst)]>;
+
+//===----------------------------------------------------------------------===//
+// X86-64 SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS41I_extract64 - SSE 4.1 extract 64 bits to int reg or memory destination
+/// (pextrq: pulls element $src2 of a v2i64 out of an XMM register; REX.W).
+multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
+  // Register destination form.
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR64:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr, 
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR64:$dst,
+                  (extractelt (v2i64 VR128:$src1), imm:$src2))]>, OpSize, REX_W;
+  // Memory destination form: the extracted element is stored directly.
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i64mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr, 
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
+                          addr:$dst)]>, OpSize, REX_W;
+}
+
+defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
+
+// SS41I_insert64 - SSE 4.1 insert 64 bits from an int reg or memory into a
+// vector element (pinsrq).  Two-address: $dst is also the first source.
+let isTwoAddress = 1 in {
+  multiclass SS41I_insert64<bits<8> opc, string OpcodeStr> {
+    // Register source form.
+    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, GR64:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr, 
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, 
+                     (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
+                   OpSize, REX_W;
+    // Memory source form: the inserted i64 is loaded from $src2.
+    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, i64mem:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst, 
+                     (v2i64 (insertelt VR128:$src1, (loadi64 addr:$src2),
+                                       imm:$src3)))]>, OpSize, REX_W;
+  }
+}
+
+defm PINSRQ      : SS41I_insert64<0x22, "pinsrq">;
+
+// -disable-16bit support.
+// When 16-bit operations are disabled, i16 loads/stores still appear as
+// truncstore/extload nodes; select them directly to 16-bit moves and to
+// extending 64-bit loads.  The extload case uses the zero-extending form
+// (any extension is acceptable for extload).
+def : Pat<(truncstorei16 (i64 imm:$src), addr:$dst),
+          (MOV16mi addr:$dst, imm:$src)>;
+def : Pat<(truncstorei16 GR64:$src, addr:$dst),
+          (MOV16mr addr:$dst, (EXTRACT_SUBREG GR64:$src, x86_subreg_16bit))>;
+def : Pat<(i64 (sextloadi16 addr:$dst)),
+          (MOVSX64rm16 addr:$dst)>;
+def : Pat<(i64 (zextloadi16 addr:$dst)),
+          (MOVZX64rm16 addr:$dst)>;
+def : Pat<(i64 (extloadi16 addr:$dst)),
+          (MOVZX64rm16 addr:$dst)>;
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
new file mode 100644
index 0000000..c475b56
--- /dev/null
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -0,0 +1,172 @@
+//===-- X86InstrBuilder.h - Functions to aid building x86 insts -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes functions that may be used with BuildMI from the
+// MachineInstrBuilder.h file to handle X86'isms in a clean way.
+//
+// The BuildMem function may be used with the BuildMI function to add entire
+// memory references in a single, typed, function call.  X86 memory references
+// can be very complex expressions (described in the README), so wrapping them
+// up behind an easier to use interface makes sense.  Descriptions of the
+// functions are included below.
+//
+// For reference, the order of operands for memory references is:
+// (Operand), Base, Scale, Index, Displacement.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRBUILDER_H
+#define X86INSTRBUILDER_H
+
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+
+namespace llvm {
+
+/// X86AddressMode - This struct holds a generalized full x86 address mode.
+/// The base register can be a frame index, which will eventually be replaced
+/// with BP or SP and Disp being offsetted accordingly.  The displacement may
+/// also include the offset of a global value.
+struct X86AddressMode {
+  enum {
+    RegBase,        // Base is a machine register (0 meaning no base).
+    FrameIndexBase  // Base is an abstract stack-frame index.
+  } BaseType;
+
+  union {
+    unsigned Reg;   // Valid when BaseType == RegBase.
+    int FrameIndex; // Valid when BaseType == FrameIndexBase.
+  } Base;
+
+  unsigned Scale;     // Index multiplier; must be 1, 2, 4 or 8.
+  unsigned IndexReg;  // Index register, or 0 for none.
+  int Disp;           // Constant displacement.
+  GlobalValue *GV;    // If set, the address is GV+Disp (Disp becomes the
+                      // global's offset rather than a separate operand).
+  unsigned GVOpFlags; // Target operand flags applied to the GV operand.
+
+  // Default-construct as a register base of 0 with scale 1 and no
+  // index, displacement, or global.
+  X86AddressMode()
+    : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
+    Base.Reg = 0;
+  }
+};
+
+/// addDirectMem - This function is used to add a direct memory reference to
+/// the current instruction -- that is, a dereference of an address in a
+/// register, with no scale, index or displacement. An example is:
+/// DWORD PTR [EAX].
+///
+static inline const MachineInstrBuilder &
+addDirectMem(const MachineInstrBuilder &MIB, unsigned Reg) {
+  // A memory reference is always four values: base, scale, index, disp.
+  // Only the base register is meaningful here; the rest are trivial.
+  return MIB.addReg(Reg)   // base
+            .addImm(1)     // scale
+            .addReg(0)     // no index register
+            .addImm(0);    // no displacement
+}
+
+// addLeaOffset - Append the scale/index/displacement operands for an
+// LEA-style address of the form [base + Offset]: scale 1, no index register.
+static inline const MachineInstrBuilder &
+addLeaOffset(const MachineInstrBuilder &MIB, int Offset) {
+  const MachineInstrBuilder &WithScale = MIB.addImm(1);
+  const MachineInstrBuilder &WithIndex = WithScale.addReg(0);
+  return WithIndex.addImm(Offset);
+}
+
+// addOffset - As addLeaOffset, plus the trailing zero register operand that
+// full (non-LEA) memory references carry (see addFullAddress).
+static inline const MachineInstrBuilder &
+addOffset(const MachineInstrBuilder &MIB, int Offset) {
+  return addLeaOffset(MIB, Offset).addReg(0);
+}
+
+/// addRegOffset - This function is used to add a memory reference of the form
+/// [Reg + Offset], i.e., one with no scale or index, but with a
+/// displacement. An example is: DWORD PTR [EAX + 4].
+///
+static inline const MachineInstrBuilder &
+addRegOffset(const MachineInstrBuilder &MIB,
+             unsigned Reg, bool isKill, int Offset) {
+  // Base register (with kill marker) first, then the remaining operands.
+  const MachineInstrBuilder &WithBase =
+      MIB.addReg(Reg, getKillRegState(isKill));
+  return addOffset(WithBase, Offset);
+}
+
+/// addLeaRegOffset - Same as addRegOffset, but without the trailing operand,
+/// for use with LEA.
+static inline const MachineInstrBuilder &
+addLeaRegOffset(const MachineInstrBuilder &MIB,
+                unsigned Reg, bool isKill, int Offset) {
+  const MachineInstrBuilder &WithBase =
+      MIB.addReg(Reg, getKillRegState(isKill));
+  return addLeaOffset(WithBase, Offset);
+}
+
+/// addRegReg - This function is used to add a memory reference of the form:
+/// [Reg + Reg], i.e. base plus index with scale 1 and no displacement.
+static inline const MachineInstrBuilder &
+addRegReg(const MachineInstrBuilder &MIB,
+          unsigned Reg1, bool isKill1,
+          unsigned Reg2, bool isKill2) {
+  MIB.addReg(Reg1, getKillRegState(isKill1)); // base
+  MIB.addImm(1);                              // scale
+  MIB.addReg(Reg2, getKillRegState(isKill2)); // index
+  return MIB.addImm(0);                       // displacement
+}
+
+/// addLeaAddress - Append the Base, Scale, Index and Displacement operands
+/// described by AM, in that order, without the trailing extra operand that
+/// full memory references carry (see addFullAddress).  When AM names a
+/// global, the displacement is folded into the global-address operand.
+static inline const MachineInstrBuilder &
+addLeaAddress(const MachineInstrBuilder &MIB, const X86AddressMode &AM) {
+  assert((AM.Scale == 1 || AM.Scale == 2 || AM.Scale == 4 || AM.Scale == 8) &&
+         "Invalid x86 address scale!");
+
+  if (AM.BaseType == X86AddressMode::RegBase)
+    MIB.addReg(AM.Base.Reg);
+  else if (AM.BaseType == X86AddressMode::FrameIndexBase)
+    MIB.addFrameIndex(AM.Base.FrameIndex);
+  else
+    assert(0 && "Unknown base type!");
+  MIB.addImm(AM.Scale).addReg(AM.IndexReg);
+  if (AM.GV)
+    return MIB.addGlobalAddress(AM.GV, AM.Disp, AM.GVOpFlags);
+  else
+    return MIB.addImm(AM.Disp);
+}
+
+// addFullAddress - Emit AM's operands as a full memory reference: the four
+// LEA-style operands followed by the trailing zero register operand.
+static inline const MachineInstrBuilder &
+addFullAddress(const MachineInstrBuilder &MIB,
+               const X86AddressMode &AM) {
+  const MachineInstrBuilder &Lea = addLeaAddress(MIB, AM);
+  return Lea.addReg(0);
+}
+
+/// addFrameReference - This function is used to add a reference to the base of
+/// an abstract object on the stack frame of the current function.  This
+/// reference has base register as the FrameIndex offset until it is resolved.
+/// This allows a constant offset to be specified as well...
+///
+static inline const MachineInstrBuilder &
+addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
+  // MachineInstrBuilder converts implicitly to its underlying MachineInstr*.
+  MachineInstr *MI = MIB;
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  const TargetInstrDesc &TID = MI->getDesc();
+  // Derive the memory-operand flags from the instruction description so
+  // callers don't have to say whether this is a load or a store.
+  unsigned Flags = 0;
+  if (TID.mayLoad())
+    Flags |= MachineMemOperand::MOLoad;
+  if (TID.mayStore())
+    Flags |= MachineMemOperand::MOStore;
+  // Attach a memory operand describing the fixed stack slot (its size and
+  // alignment come from the frame info) so later passes can reason about
+  // what this instruction accesses.
+  MachineMemOperand *MMO =
+    MF.getMachineMemOperand(PseudoSourceValue::getFixedStack(FI),
+                            Flags, Offset,
+                            MFI.getObjectSize(FI),
+                            MFI.getObjectAlignment(FI));
+  return addOffset(MIB.addFrameIndex(FI), Offset)
+            .addMemOperand(MMO);
+}
+
+/// addConstantPoolReference - This function is used to add a reference to the
+/// base of a constant value spilled to the per-function constant pool.  The
+/// reference uses the abstract ConstantPoolIndex which is retained until
+/// either machine code emission or assembly output. In PIC mode on x86-32,
+/// the GlobalBaseReg parameter can be used to make this a
+/// GlobalBaseReg-relative reference.
+///
+static inline const MachineInstrBuilder &
+addConstantPoolReference(const MachineInstrBuilder &MIB, unsigned CPI,
+                         unsigned GlobalBaseReg, unsigned char OpFlags) {
+  //FIXME: factor this
+  MIB.addReg(GlobalBaseReg);                 // base
+  MIB.addImm(1).addReg(0);                   // scale, no index
+  MIB.addConstantPoolIndex(CPI, 0, OpFlags); // pool index as displacement
+  return MIB.addReg(0);                      // trailing extra operand
+}
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
new file mode 100644
index 0000000..e22a903
--- /dev/null
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -0,0 +1,691 @@
+//==- X86InstrFPStack.td - Describe the X86 Instruction Set --*- tablegen -*-=//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 x87 FPU instruction set, defining the
+// instructions, and properties of the instructions which are needed for code
+// generation, machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// FPStack specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+
+// SDTypeProfile<#results, #operands, constraints>.
+def SDTX86FpGet2    : SDTypeProfile<2, 0, [SDTCisVT<0, f80>, 
+                                           SDTCisVT<1, f80>]>;
+// Load: one FP result from (pointer, memory-VT) operands.
+def SDTX86Fld       : SDTypeProfile<1, 2, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+// Store: no results; (FP value, pointer, memory-VT) operands.
+def SDTX86Fst       : SDTypeProfile<0, 3, [SDTCisFP<0>,
+                                           SDTCisPtrTy<1>, 
+                                           SDTCisVT<2, OtherVT>]>;
+// Integer load-and-convert: FP result from (pointer, integer memory VT).
+def SDTX86Fild      : SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisPtrTy<1>,
+                                           SDTCisVT<2, OtherVT>]>;
+def SDTX86FpToIMem  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
+
+def SDTX86CwdStore  : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def X86fld          : SDNode<"X86ISD::FLD", SDTX86Fld,
+                             [SDNPHasChain, SDNPMayLoad]>;
+def X86fst          : SDNode<"X86ISD::FST", SDTX86Fst,
+                             [SDNPHasChain, SDNPInFlag, SDNPMayStore]>;
+def X86fild         : SDNode<"X86ISD::FILD", SDTX86Fild,
+                             [SDNPHasChain, SDNPMayLoad]>;
+def X86fildflag     : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
+                             [SDNPHasChain, SDNPOutFlag, SDNPMayLoad]>;
+def X86fp_to_i16mem : SDNode<"X86ISD::FP_TO_INT16_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i32mem : SDNode<"X86ISD::FP_TO_INT32_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore]>;
+def X86fp_to_i64mem : SDNode<"X86ISD::FP_TO_INT64_IN_MEM", SDTX86FpToIMem,
+                             [SDNPHasChain, SDNPMayStore]>;
+def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m",          SDTX86CwdStore,
+                             [SDNPHasChain, SDNPMayStore, SDNPSideEffect]>;
+
+//===----------------------------------------------------------------------===//
+// FPStack pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Leaves matching the exact FP immediates +/-0.0 and +/-1.0, which the x87
+// can materialize with fldz/fld1 (optionally followed by fchs).
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+def fpimmneg0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-0.0);
+}]>;
+
+def fpimm1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+1.0);
+}]>;
+
+def fpimmneg1 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(-1.0);
+}]>;
+
+// Some 'special' instructions
+let usesCustomInserter = 1 in {  // Expanded after instruction selection.
+  // FP<sz>_TO_INT<n>_IN_MEM: store-to-memory FP->int conversions with
+  // truncation semantics; expanded by the custom inserter, which must
+  // temporarily switch the x87 rounding mode to round-toward-zero.
+  def FP32_TO_INT16_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i16mem:$dst, RFP32:$src),
+                              "##FP32_TO_INT16_IN_MEM PSEUDO!",
+                              [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT32_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i32mem:$dst, RFP32:$src),
+                              "##FP32_TO_INT32_IN_MEM PSEUDO!",
+                              [(X86fp_to_i32mem RFP32:$src, addr:$dst)]>;
+  def FP32_TO_INT64_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i64mem:$dst, RFP32:$src),
+                              "##FP32_TO_INT64_IN_MEM PSEUDO!",
+                              [(X86fp_to_i64mem RFP32:$src, addr:$dst)]>;
+  def FP64_TO_INT16_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i16mem:$dst, RFP64:$src),
+                              "##FP64_TO_INT16_IN_MEM PSEUDO!",
+                              [(X86fp_to_i16mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT32_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i32mem:$dst, RFP64:$src),
+                              "##FP64_TO_INT32_IN_MEM PSEUDO!",
+                              [(X86fp_to_i32mem RFP64:$src, addr:$dst)]>;
+  def FP64_TO_INT64_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i64mem:$dst, RFP64:$src),
+                              "##FP64_TO_INT64_IN_MEM PSEUDO!",
+                              [(X86fp_to_i64mem RFP64:$src, addr:$dst)]>;
+  def FP80_TO_INT16_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i16mem:$dst, RFP80:$src),
+                              "##FP80_TO_INT16_IN_MEM PSEUDO!",
+                              [(X86fp_to_i16mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT32_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i32mem:$dst, RFP80:$src),
+                              "##FP80_TO_INT32_IN_MEM PSEUDO!",
+                              [(X86fp_to_i32mem RFP80:$src, addr:$dst)]>;
+  def FP80_TO_INT64_IN_MEM : I<0, Pseudo,
+                              (outs), (ins i64mem:$dst, RFP80:$src),
+                              "##FP80_TO_INT64_IN_MEM PSEUDO!",
+                              [(X86fp_to_i64mem RFP80:$src, addr:$dst)]>;
+}
+
+// Marker clobbering all seven x87 stack slots (FP0-FP6).
+let isTerminator = 1 in
+  let Defs = [FP0, FP1, FP2, FP3, FP4, FP5, FP6] in
+    def FP_REG_KILL  : I<0, Pseudo, (outs), (ins), "##FP_REG_KILL", []>;
+
+// All FP Stack operations are represented with four instructions here.  The
+// first three instructions, generated by the instruction selector, use "RFP32"
+// "RFP64" or "RFP80" registers: traditional register files to reference 32-bit,
+// 64-bit or 80-bit floating point values.  These sizes apply to the values, 
+// not the registers, which are always 80 bits; RFP32, RFP64 and RFP80 can be
+// copied to each other without losing information.  These instructions are all
+// pseudo instructions and use the "_Fp" suffix.
+// In some cases there are additional variants with a mixture of different
+// register sizes.
+// The fourth instruction is defined with FPI, which is the actual instruction
+// emitted by the assembler.  These use "RST" registers, although frequently
+// the actual register(s) used are implicit.  These are always 80 bits.
+// The FP stackifier pass converts one to the other after register allocation 
+// occurs.
+//
+// Note that the FpI instruction should have instruction selection info (e.g.
+// a pattern) and the FPI instruction should have emission info (e.g. opcode
+// encoding and asm printing info).
+
+// Pseudo Instructions for FP stack return values.
+// Move the FP return value out of the fixed stack registers into virtual
+// RFP registers; one variant per value width (32/64/80 bits).
+def FpGET_ST0_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+def FpGET_ST0_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(0)
+
+// FpGET_ST1* should only be issued *after* an FpGET_ST0* has been issued when
+// there are two values live out on the stack from a call or inlineasm.  This
+// magic is handled by the stackifier.  It is not valid to emit FpGET_ST1* and
+// then FpGET_ST0*.  In addition, it is invalid for any FP-using operations to
+// occur between them.
+def FpGET_ST1_32 : FpI_<(outs RFP32:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_64 : FpI_<(outs RFP64:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+def FpGET_ST1_80 : FpI_<(outs RFP80:$dst), (ins), SpecialFP, []>; // FPR = ST(1)
+
+// Inverse direction: place a virtual RFP value into ST(0)/ST(1), e.g. to
+// set up FP return values.
+let Defs = [ST0] in {
+def FpSET_ST0_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(0) = FPR
+def FpSET_ST0_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(0) = FPR
+}
+
+let Defs = [ST1] in {
+def FpSET_ST1_32 : FpI_<(outs), (ins RFP32:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_64 : FpI_<(outs), (ins RFP64:$src), SpecialFP, []>; // ST(1) = FPR
+def FpSET_ST1_80 : FpI_<(outs), (ins RFP80:$src), SpecialFP, []>; // ST(1) = FPR
+}
+
+// FpIf32, FpIf64 - Floating Point Pseudo Instruction template.
+// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
+// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
+// f80 instructions cannot use SSE and use neither of these.
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
+  FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+
+// Register copies.  Just copies, the shortening ones do not truncate.
+// Naming scheme is MOV_Fp<srcwidth><dstwidth>.
+let neverHasSideEffects = 1 in {
+  def MOV_Fp3232 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), SpecialFP, []>; 
+  def MOV_Fp3264 : FpIf32<(outs RFP64:$dst), (ins RFP32:$src), SpecialFP, []>; 
+  def MOV_Fp6432 : FpIf32<(outs RFP32:$dst), (ins RFP64:$src), SpecialFP, []>; 
+  def MOV_Fp6464 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), SpecialFP, []>; 
+  def MOV_Fp8032 : FpIf32<(outs RFP32:$dst), (ins RFP80:$src), SpecialFP, []>; 
+  def MOV_Fp3280 : FpIf32<(outs RFP80:$dst), (ins RFP32:$src), SpecialFP, []>; 
+  def MOV_Fp8064 : FpIf64<(outs RFP64:$dst), (ins RFP80:$src), SpecialFP, []>; 
+  def MOV_Fp6480 : FpIf64<(outs RFP80:$dst), (ins RFP64:$src), SpecialFP, []>; 
+  def MOV_Fp8080 : FpI_  <(outs RFP80:$dst), (ins RFP80:$src), SpecialFP, []>; 
+}
+
+// Factoring for arithmetic.
+// Register-register pseudo forms of a binary FP operation, one per value
+// width.  Instantiated once per arithmetic SDNode (fadd/fsub/fmul/fdiv).
+multiclass FPBinary_rr<SDNode OpNode> {
+// Register op register -> register
+// These are separated out because they have no reversed form.
+def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), TwoArgFP,
+                [(set RFP32:$dst, (OpNode RFP32:$src1, RFP32:$src2))]>;
+def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), TwoArgFP,
+                [(set RFP64:$dst, (OpNode RFP64:$src1, RFP64:$src2))]>;
+def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), TwoArgFP,
+                [(set RFP80:$dst, (OpNode RFP80:$src1, RFP80:$src2))]>;
+}
+// The FopST0 series are not included here because of the irregularities
+// in where the 'r' goes in assembly output.
+// These instructions cannot address 80-bit memory.
+// Register-memory pseudo forms plus the real FPI memory instructions for a
+// binary FP operation.  `fp` selects the ModRM /reg opcode extension and
+// `asmstring` the mnemonic stem ("add", "sub", ...).
+multiclass FPBinary<SDNode OpNode, Format fp, string asmstring> {
+// ST(0) = ST(0) + [mem]
+def _Fp32m  : FpIf32<(outs RFP32:$dst), 
+                     (ins RFP32:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP32:$dst, 
+                    (OpNode RFP32:$src1, (loadf32 addr:$src2)))]>;
+def _Fp64m  : FpIf64<(outs RFP64:$dst), 
+                     (ins RFP64:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (loadf64 addr:$src2)))]>;
+// Mixed widths: the narrower memory operand is extended before the op.
+def _Fp64m32: FpIf64<(outs RFP64:$dst), 
+                     (ins RFP64:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP64:$dst, 
+                    (OpNode RFP64:$src1, (f64 (extloadf32 addr:$src2))))]>;
+def _Fp80m32: FpI_<(outs RFP80:$dst), 
+                   (ins RFP80:$src1, f32mem:$src2), OneArgFPRW,
+                  [(set RFP80:$dst, 
+                    (OpNode RFP80:$src1, (f80 (extloadf32 addr:$src2))))]>;
+def _Fp80m64: FpI_<(outs RFP80:$dst), 
+                   (ins RFP80:$src1, f64mem:$src2), OneArgFPRW,
+                  [(set RFP80:$dst, 
+                    (OpNode RFP80:$src1, (f80 (extloadf64 addr:$src2))))]>;
+// Real instructions; AT&T size suffix: {s} = 32-bit, {l} = 64-bit memory.
+def _F32m  : FPI<0xD8, fp, (outs), (ins f32mem:$src), 
+                 !strconcat("f", !strconcat(asmstring, "{s}\t$src"))> { 
+  let mayLoad = 1; 
+}
+def _F64m  : FPI<0xDC, fp, (outs), (ins f64mem:$src), 
+                 !strconcat("f", !strconcat(asmstring, "{l}\t$src"))> { 
+  let mayLoad = 1; 
+}
+// ST(0) = ST(0) + [memint]
+def _FpI16m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP32:$dst, (OpNode RFP32:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP64:$dst, (OpNode RFP64:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+def _FpI16m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i16mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP80:$dst, (OpNode RFP80:$src1,
+                                       (X86fild addr:$src2, i16)))]>;
+def _FpI32m80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, i32mem:$src2), 
+                       OneArgFPRW,
+                    [(set RFP80:$dst, (OpNode RFP80:$src1,
+                                       (X86fild addr:$src2, i32)))]>;
+// Integer-memory forms; suffix {s} = 16-bit, {l} = 32-bit integer operand.
+def _FI16m  : FPI<0xDE, fp, (outs), (ins i16mem:$src), 
+                  !strconcat("fi", !strconcat(asmstring, "{s}\t$src"))> { 
+  let mayLoad = 1; 
+}
+def _FI32m  : FPI<0xDA, fp, (outs), (ins i32mem:$src), 
+                  !strconcat("fi", !strconcat(asmstring, "{l}\t$src"))> { 
+  let mayLoad = 1; 
+}
+}
+
+// Instantiate the register-register and register-memory families.  The
+// Format argument (MRM0m..MRM7m) selects the ModRM /reg opcode extension.
+defm ADD : FPBinary_rr<fadd>;
+defm SUB : FPBinary_rr<fsub>;
+defm MUL : FPBinary_rr<fmul>;
+defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary<fadd, MRM0m, "add">;
+defm SUB : FPBinary<fsub, MRM4m, "sub">;
+// NOTE(review): SUBR/DIVR reuse the non-reversed fsub/fdiv nodes, so their
+// _Fp* patterns duplicate SUB/DIV's — presumably only the FPI encodings are
+// wanted from these expansions; confirm operand order is handled elsewhere.
+defm SUBR: FPBinary<fsub ,MRM5m, "subr">;
+defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm DIV : FPBinary<fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<fdiv, MRM7m, "divr">;
+
+// ST(0) = ST(0) op ST(i)  (D8 opcode space).
+class FPST0rInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, D8;
+// ST(i) = ST(i) op ST(0)  (DC opcode space).
+class FPrST0Inst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DC;
+// Same as FPrST0Inst but pops the stack afterwards (DE opcode space).
+class FPrST0PInst<bits<8> o, string asm>
+  : FPI<o, AddRegFrm, (outs), (ins RST:$op), asm>, DE;
+
+// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
+// of some of the 'reverse' forms of the fsub and fdiv instructions.  As such,
+// we have to put some 'r's in and take them out of weird places.
+def ADD_FST0r   : FPST0rInst <0xC0, "fadd\t$op">;
+def ADD_FrST0   : FPrST0Inst <0xC0, "fadd\t{%st(0), $op|$op, %ST(0)}">;
+def ADD_FPrST0  : FPrST0PInst<0xC0, "faddp\t$op">;
+def SUBR_FST0r  : FPST0rInst <0xE8, "fsubr\t$op">;
+def SUB_FrST0   : FPrST0Inst <0xE8, "fsub{r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUB_FPrST0  : FPrST0PInst<0xE8, "fsub{r}p\t$op">;
+def SUB_FST0r   : FPST0rInst <0xE0, "fsub\t$op">;
+def SUBR_FrST0  : FPrST0Inst <0xE0, "fsub{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def SUBR_FPrST0 : FPrST0PInst<0xE0, "fsub{|r}p\t$op">;
+def MUL_FST0r   : FPST0rInst <0xC8, "fmul\t$op">;
+def MUL_FrST0   : FPrST0Inst <0xC8, "fmul\t{%st(0), $op|$op, %ST(0)}">;
+def MUL_FPrST0  : FPrST0PInst<0xC8, "fmulp\t$op">;
+def DIVR_FST0r  : FPST0rInst <0xF8, "fdivr\t$op">;
+def DIV_FrST0   : FPrST0Inst <0xF8, "fdiv{r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIV_FPrST0  : FPrST0PInst<0xF8, "fdiv{r}p\t$op">;
+def DIV_FST0r   : FPST0rInst <0xF0, "fdiv\t$op">;
+def DIVR_FrST0  : FPrST0Inst <0xF0, "fdiv{|r}\t{%st(0), $op|$op, %ST(0)}">;
+def DIVR_FPrST0 : FPrST0PInst<0xF0, "fdiv{|r}p\t$op">;
+
+def COM_FST0r   : FPST0rInst <0xD0, "fcom\t$op">;
+def COMP_FST0r  : FPST0rInst <0xD8, "fcomp\t$op">;
+
+// Unary operations.
+// Unary FP operation: three pseudo forms (one per width) with patterns,
+// plus the real operandless D9-space instruction that acts on ST(0).
+multiclass FPUnary<SDNode OpNode, bits<8> opcode, string asmstring> {
+def _Fp32  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
+                 [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+def _Fp64  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
+                 [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
+                 [(set RFP80:$dst, (OpNode RFP80:$src))]>;
+def _F     : FPI<opcode, RawFrm, (outs), (ins), asmstring>, D9;
+}
+
+defm CHS : FPUnary<fneg, 0xE0, "fchs">;
+defm ABS : FPUnary<fabs, 0xE1, "fabs">;
+defm SQRT: FPUnary<fsqrt,0xFA, "fsqrt">;
+defm SIN : FPUnary<fsin, 0xFE, "fsin">;
+defm COS : FPUnary<fcos, 0xFF, "fcos">;
+
+// ftst: compare ST(0) against 0.0, result in FPSW only.
+let neverHasSideEffects = 1 in {
+def TST_Fp32  : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
+def TST_Fp64  : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
+def TST_Fp80  : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
+}
+def TST_F  : FPI<0xE4, RawFrm, (outs), (ins), "ftst">, D9;
+
+// Versions of FP instructions that take a single memory operand.  Added for the
+//   disassembler; remove as they are included with patterns elsewhere.
+// AT&T size suffixes follow the convention used by FPBinary above:
+// {s} = 32-bit FP / 16-bit int, {l} = 64-bit FP / 32-bit int memory operand.
+def FCOM32m  : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
+def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
+
+// NOTE(review): the environment/state instructions below reuse f32mem even
+// though they touch a larger memory region; confirm a dedicated memory
+// operand type is not needed.
+def FLDENVm  : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm  : FPI<0xD9, MRM6m, (outs f32mem:$dst), (ins), "fnstenv\t$dst">;
+
+def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
+def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+
+def FCOM64m  : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FRSTORm  : FPI<0xDD, MRM4m, (outs f32mem:$dst), (ins), "frstor\t$dst">;
+def FSAVEm   : FPI<0xDD, MRM6m, (outs f32mem:$dst), (ins), "fnsave\t$dst">;
+def FNSTSWm  : FPI<0xDD, MRM7m, (outs f32mem:$dst), (ins), "fnstsw\t$dst">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
+
+def FBLDm    : FPI<0xDF, MRM4m, (outs), (ins f32mem:$src), "fbld\t$src">;
+def FBSTPm   : FPI<0xDF, MRM6m, (outs f32mem:$dst), (ins), "fbstp\t$dst">;
+
+// Floating point cmovs.
+// Conditional-move pseudos selected from X86cmov on an FP value; one per
+// value width, all predicated on EFLAGS being live in.
+multiclass FPCMov<PatLeaf cc> {
+  def _Fp32  : FpIf32<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
+                       CondMovFP,
+                     [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp64  : FpIf64<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
+                       CondMovFP,
+                     [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
+                                        cc, EFLAGS))]>;
+  def _Fp80  : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
+                     CondMovFP,
+                     [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
+                                        cc, EFLAGS))]>;
+}
+// fcmov only supports the eight unsigned/equality conditions below.
+let Uses = [EFLAGS], isTwoAddress = 1 in {
+defm CMOVB  : FPCMov<X86_COND_B>;
+defm CMOVBE : FPCMov<X86_COND_BE>;
+defm CMOVE  : FPCMov<X86_COND_E>;
+defm CMOVP  : FPCMov<X86_COND_P>;
+defm CMOVNB : FPCMov<X86_COND_AE>;
+defm CMOVNBE: FPCMov<X86_COND_A>;
+defm CMOVNE : FPCMov<X86_COND_NE>;
+defm CMOVNP : FPCMov<X86_COND_NP>;
+}
+
+// These are not factored because there's no clean way to pass DA/DB.
+// These are not factored because there's no clean way to pass DA/DB.
+// ST(0) = ST(i) if the condition holds; DA holds the "taken" conditions,
+// DB their negations.
+def CMOVB_F  : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovb\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVBE_F : FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovbe\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVE_F  : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmove\t{$op, %st(0)|%ST(0), $op}">, DA;
+// Fixed: stray space after \t made this asm string inconsistent with the
+// other seven fcmov definitions and would emit "fcmovu<tab> <operands>".
+def CMOVP_F  : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovu\t{$op, %st(0)|%ST(0), $op}">, DA;
+def CMOVNB_F : FPI<0xC0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovnb\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNBE_F: FPI<0xD0, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovnbe\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNE_F : FPI<0xC8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovne\t{$op, %st(0)|%ST(0), $op}">, DB;
+def CMOVNP_F : FPI<0xD8, AddRegFrm, (outs RST:$op), (ins),
+                  "fcmovnu\t{$op, %st(0)|%ST(0), $op}">, DB;
+
+// Floating point loads & stores.
+let canFoldAsLoad = 1 in {
+def LD_Fp32m   : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (loadf32 addr:$src))]>;
+let isReMaterializable = 1, mayHaveSideEffects = 1 in
+  def LD_Fp64m : FpIf64<(outs RFP64:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (loadf64 addr:$src))]>;
+def LD_Fp80m   : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (loadf80 addr:$src))]>;
+}
+// Extending loads: narrower memory value widened to the destination width.
+def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
+def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
+def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+// Integer loads (fild): ILD_Fp<intwidth>m<fpwidth>.
+def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m32: FpIf32<(outs RFP32:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP32:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m64: FpIf64<(outs RFP64:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m64: FpIf64<(outs RFP64:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m64: FpIf64<(outs RFP64:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP64:$dst, (X86fild addr:$src, i64))]>;
+def ILD_Fp16m80: FpI_<(outs RFP80:$dst), (ins i16mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i16))]>;
+def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i32))]>;
+def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
+                  [(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+
+// Non-popping stores; mixed-width variants truncate to the memory width.
+def ST_Fp32m   : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
+                  [(store RFP32:$src, addr:$op)]>;
+def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
+                  [(truncstoref32 RFP64:$src, addr:$op)]>;
+def ST_Fp64m   : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP,
+                  [(store RFP64:$src, addr:$op)]>;
+def ST_Fp80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref32 RFP80:$src, addr:$op)]>;
+def ST_Fp80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP,
+                  [(truncstoref64 RFP80:$src, addr:$op)]>;
+// FST does not support 80-bit memory target; FSTP must be used.
+
+// Popping stores (no selection patterns; emitted by the stackifier).
+let mayStore = 1, neverHasSideEffects = 1 in {
+def ST_FpP32m    : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, []>;
+def ST_FpP64m32  : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP64m    : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
+def ST_FpP80m32  : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
+def ST_FpP80m64  : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+// The only 80-bit store is the popping form, so it carries the pattern.
+def ST_FpP80m    : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
+                    [(store RFP80:$src, addr:$op)]>;
+// Popping integer stores (fistp).
+let mayStore = 1, neverHasSideEffects = 1 in {
+def IST_Fp16m32  : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32  : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp64m32  : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp16m64  : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64  : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp64m64  : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp16m80  : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80  : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp64m80  : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+}
+
+// Real load/store encodings used by the stackifier and assembler.
+let mayLoad = 1 in {
+def LD_F32m   : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
+def LD_F64m   : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
+def LD_F80m   : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+def ILD_F16m  : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
+def ILD_F32m  : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
+def ILD_F64m  : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
+}
+let mayStore = 1 in {
+def ST_F32m   : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
+def ST_F64m   : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
+def ST_FP32m  : FPI<0xD9, MRM3m, (outs), (ins f32mem:$dst), "fstp{s}\t$dst">;
+def ST_FP64m  : FPI<0xDD, MRM3m, (outs), (ins f64mem:$dst), "fstp{l}\t$dst">;
+def ST_FP80m  : FPI<0xDB, MRM7m, (outs), (ins f80mem:$dst), "fstp{t}\t$dst">;
+def IST_F16m  : FPI<0xDF, MRM2m, (outs), (ins i16mem:$dst), "fist{s}\t$dst">;
+def IST_F32m  : FPI<0xDB, MRM2m, (outs), (ins i32mem:$dst), "fist{l}\t$dst">;
+def IST_FP16m : FPI<0xDF, MRM3m, (outs), (ins i16mem:$dst), "fistp{s}\t$dst">;
+def IST_FP32m : FPI<0xDB, MRM3m, (outs), (ins i32mem:$dst), "fistp{l}\t$dst">;
+def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst">;
+}
+
+// FISTTP requires SSE3 even though it's a FPStack op.
+// fisttp truncates toward zero regardless of the rounding mode, so it can
+// implement fp_to_sint directly — but only when SSE3 is available.
+def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp64m32 : FpI_<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP32:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp16m64 : FpI_<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp32m64 : FpI_<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp64m64 : FpI_<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP64:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i16mem RFP80:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i32mem RFP80:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+def ISTT_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+                    [(X86fp_to_i64mem RFP80:$src, addr:$op)]>,
+                    Requires<[HasSSE3]>;
+
+let mayStore = 1 in {
+def ISTT_FP16m : FPI<0xDF, MRM1m, (outs), (ins i16mem:$dst), "fisttp{s}\t$dst">;
+def ISTT_FP32m : FPI<0xDB, MRM1m, (outs), (ins i32mem:$dst), "fisttp{l}\t$dst">;
+def ISTT_FP64m : FPI<0xDD, MRM1m, (outs), (ins i64mem:$dst), 
+  "fisttp{ll}\t$dst">;
+}
+
+// FP Stack manipulation instructions.
+// FP Stack manipulation instructions.
+def LD_Frr   : FPI<0xC0, AddRegFrm, (outs), (ins RST:$op), "fld\t$op">, D9;
+def ST_Frr   : FPI<0xD0, AddRegFrm, (outs), (ins RST:$op), "fst\t$op">, DD;
+def ST_FPrr  : FPI<0xD8, AddRegFrm, (outs), (ins RST:$op), "fstp\t$op">, DD;
+def XCH_F    : FPI<0xC8, AddRegFrm, (outs), (ins RST:$op), "fxch\t$op">, D9;
+
+// Floating point constant loads.
+// Pseudos matching the fpimm0/fpimm1 leaves; lowered to fldz/fld1.
+let isReMaterializable = 1 in {
+def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm0)]>;
+def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
+                [(set RFP32:$dst, fpimm1)]>;
+def LD_Fp064 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm0)]>;
+def LD_Fp164 : FpIf64<(outs RFP64:$dst), (ins), ZeroArgFP,
+                [(set RFP64:$dst, fpimm1)]>;
+def LD_Fp080 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm0)]>;
+def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
+                [(set RFP80:$dst, fpimm1)]>;
+}
+
+def LD_F0 : FPI<0xEE, RawFrm, (outs), (ins), "fldz">, D9;
+def LD_F1 : FPI<0xE8, RawFrm, (outs), (ins), "fld1">, D9;
+
+
+// Floating point compares.
+let Defs = [EFLAGS] in {
+// UCOM_Fpr*: pattern-less compares whose result lands only in FPSW.
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                        []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                        []>;  // FPSW = cmp ST(0) with ST(i)
+def UCOM_Fpr80 : FpI_  <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                        []>;  // FPSW = cmp ST(0) with ST(i)
+                        
+// UCOM_FpIr*: selected from X86cmp; set EFLAGS directly (fucomi-style).
+def UCOM_FpIr32: FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+                  [(X86cmp RFP32:$lhs, RFP32:$rhs),
+                   (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+def UCOM_FpIr64: FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+                  [(X86cmp RFP64:$lhs, RFP64:$rhs),
+                   (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+                  [(X86cmp RFP80:$lhs, RFP80:$rhs),
+                   (implicit EFLAGS)]>; // CC = ST(0) cmp ST(i)
+}
+
+let Defs = [EFLAGS], Uses = [ST0] in {
+def UCOM_Fr    : FPI<0xE0, AddRegFrm,    // FPSW = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg),
+                    "fucom\t$reg">, DD;
+def UCOM_FPr   : FPI<0xE8, AddRegFrm,    // FPSW = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg),
+                    "fucomp\t$reg">, DD;
+def UCOM_FPPr  : FPI<0xE9, RawFrm,       // cmp ST(0) with ST(1), pop, pop
+                    (outs), (ins),
+                    "fucompp">, DA;
+
+def UCOM_FIr   : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i)
+                    (outs), (ins RST:$reg),
+                    "fucomi\t{$reg, %st(0)|%ST(0), $reg}">, DB;
+def UCOM_FIPr  : FPI<0xE8, AddRegFrm,     // CC = cmp ST(0) with ST(i), pop
+                    (outs), (ins RST:$reg),
+                    "fucomip\t{$reg, %st(0)|%ST(0), $reg}">, DF;
+}
+
+// Ordered compares (raise invalid-operand on QNaN, unlike fucomi).
+def COM_FIr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                  "fcomi\t{$reg, %st(0)|%ST(0), $reg}">, DB;
+def COM_FIPr : FPI<0xF0, AddRegFrm, (outs), (ins RST:$reg),
+                   "fcomip\t{$reg, %st(0)|%ST(0), $reg}">, DF;
+
+// Floating point flag ops.
+let Defs = [AX] in
+def FNSTSW8r  : I<0xE0, RawFrm,                  // AX = fp flags
+                  (outs), (ins), "fnstsw %ax", []>, DF;
+
+def FNSTCW16m : I<0xD9, MRM7m,                   // [mem16] = X87 control word
+                  (outs), (ins i16mem:$dst), "fnstcw\t$dst",
+                  [(X86fp_cwd_get16 addr:$dst)]>;
+                  
+let mayLoad = 1 in
+def FLDCW16m  : I<0xD9, MRM5m,                   // X87 control word = [mem16]
+                  (outs), (ins i16mem:$dst), "fldcw\t$dst", []>;
+
+// Register free
+
+def FFREE : FPI<0xC0, AddRegFrm, (outs), (ins RST:$reg),
+                "ffree\t$reg">, DD;
+
+// Clear exceptions
+
+def FNCLEX : I<0xE2, RawFrm, (outs), (ins), "fnclex", []>, DB;
+
+// Operandless floating-point instructions for the disassembler
+
+def FNOP : I<0xD0, RawFrm, (outs), (ins), "fnop", []>, D9;
+def FXAM : I<0xE5, RawFrm, (outs), (ins), "fxam", []>, D9;
+def FLDL2T : I<0xE9, RawFrm, (outs), (ins), "fldl2t", []>, D9;
+def FLDL2E : I<0xEA, RawFrm, (outs), (ins), "fldl2e", []>, D9;
+def FLDPI : I<0xEB, RawFrm, (outs), (ins), "fldpi", []>, D9;
+def FLDLG2 : I<0xEC, RawFrm, (outs), (ins), "fldlg2", []>, D9;
+def FLDLN2 : I<0xED, RawFrm, (outs), (ins), "fldln2", []>, D9;
+def F2XM1 : I<0xF0, RawFrm, (outs), (ins), "f2xm1", []>, D9;
+def FYL2X : I<0xF1, RawFrm, (outs), (ins), "fyl2x", []>, D9;
+def FPTAN : I<0xF2, RawFrm, (outs), (ins), "fptan", []>, D9;
+def FPATAN : I<0xF3, RawFrm, (outs), (ins), "fpatan", []>, D9;
+def FXTRACT : I<0xF4, RawFrm, (outs), (ins), "fxtract", []>, D9;
+def FPREM1 : I<0xF5, RawFrm, (outs), (ins), "fprem1", []>, D9;
+def FDECSTP : I<0xF6, RawFrm, (outs), (ins), "fdecstp", []>, D9;
+def FINCSTP : I<0xF7, RawFrm, (outs), (ins), "fincstp", []>, D9;
+def FPREM : I<0xF8, RawFrm, (outs), (ins), "fprem", []>, D9;
+def FYL2XP1 : I<0xF9, RawFrm, (outs), (ins), "fyl2xp1", []>, D9;
+def FSINCOS : I<0xFB, RawFrm, (outs), (ins), "fsincos", []>, D9;
+def FRNDINT : I<0xFC, RawFrm, (outs), (ins), "frndint", []>, D9;
+def FSCALE : I<0xFD, RawFrm, (outs), (ins), "fscale", []>, D9;
+def FCOMPP : I<0xD9, RawFrm, (outs), (ins), "fcompp", []>, DE;
+
+def FXSAVE : I<0xAE, MRM0m, (outs opaque512mem:$dst), (ins),
+               "fxsave\t$dst", []>, TB;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+                "fxrstor\t$src", []>, TB;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Required for RET of f32 / f64 / f80 values.
+def : Pat<(X86fld addr:$src, f32), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fld addr:$src, f64), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fld addr:$src, f80), (LD_Fp80m addr:$src)>;
+
+// Required for CALL which return f32 / f64 / f80 values.
+def : Pat<(X86fst RFP32:$src, addr:$op, f32), (ST_Fp32m addr:$op, RFP32:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f32), (ST_Fp64m32 addr:$op, 
+                                                          RFP64:$src)>;
+def : Pat<(X86fst RFP64:$src, addr:$op, f64), (ST_Fp64m addr:$op, RFP64:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f32), (ST_Fp80m32 addr:$op, 
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f64), (ST_Fp80m64 addr:$op, 
+                                                          RFP80:$src)>;
+def : Pat<(X86fst RFP80:$src, addr:$op, f80), (ST_FpP80m addr:$op,
+                                                         RFP80:$src)>;
+
+// Floating point constant -0.0 and -1.0
+def : Pat<(f32 fpimmneg0), (CHS_Fp32 (LD_Fp032))>, Requires<[FPStackf32]>;
+def : Pat<(f32 fpimmneg1), (CHS_Fp32 (LD_Fp132))>, Requires<[FPStackf32]>;
+def : Pat<(f64 fpimmneg0), (CHS_Fp64 (LD_Fp064))>, Requires<[FPStackf64]>;
+def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
+def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
+def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
+
+// Used to conv. i64 to f64 since there isn't a SSE version.
+def : Pat<(X86fildflag addr:$src, i64), (ILD_Fp64m64 addr:$src)>;
+
+// FP extensions map onto simple pseudo-value conversions if they are to/from
+// the FP stack.
+def : Pat<(f64 (fextend RFP32:$src)), (MOV_Fp3264 RFP32:$src)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP32:$src)), (MOV_Fp3280 RFP32:$src)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f80 (fextend RFP64:$src)), (MOV_Fp6480 RFP64:$src)>,
+           Requires<[FPStackf64]>;
+
+// FP truncations map onto simple pseudo-value conversions if they are to/from
+// the FP stack.  We have validated that only value-preserving truncations make
+// it through isel.
+def : Pat<(f32 (fround RFP64:$src)), (MOV_Fp6432 RFP64:$src)>,
+          Requires<[FPStackf32]>;
+def : Pat<(f32 (fround RFP80:$src)), (MOV_Fp8032 RFP80:$src)>,
+           Requires<[FPStackf32]>;
+def : Pat<(f64 (fround RFP80:$src)), (MOV_Fp8064 RFP80:$src)>,
+           Requires<[FPStackf64]>;
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
new file mode 100644
index 0000000..a799f16
--- /dev/null
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -0,0 +1,326 @@
+//===- X86InstrFormats.td - X86 Instruction Formats --------*- tablegen -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<6> val> {
+  bits<6> Value = val;
+}
+
+def Pseudo     : Format<0>; def RawFrm     : Format<1>;
+def AddRegFrm  : Format<2>; def MRMDestReg : Format<3>;
+def MRMDestMem : Format<4>; def MRMSrcReg  : Format<5>;
+def MRMSrcMem  : Format<6>;
+def MRM0r  : Format<16>; def MRM1r  : Format<17>; def MRM2r  : Format<18>;
+def MRM3r  : Format<19>; def MRM4r  : Format<20>; def MRM5r  : Format<21>;
+def MRM6r  : Format<22>; def MRM7r  : Format<23>;
+def MRM0m  : Format<24>; def MRM1m  : Format<25>; def MRM2m  : Format<26>;
+def MRM3m  : Format<27>; def MRM4m  : Format<28>; def MRM5m  : Format<29>;
+def MRM6m  : Format<30>; def MRM7m  : Format<31>;
+def MRMInitReg : Format<32>;
+
+
+// ImmType - This specifies the immediate type used by an instruction. This is
+// part of the ad-hoc solution used to emit machine instruction encodings by our
+// machine code emitter.
+class ImmType<bits<3> val> {
+  bits<3> Value = val;
+}
+def NoImm  : ImmType<0>;
+def Imm8   : ImmType<1>;
+def Imm16  : ImmType<2>;
+def Imm32  : ImmType<3>;
+def Imm64  : ImmType<4>;
+
+// FPFormat - This specifies what form this FP instruction has.  This is used by
+// the Floating-Point stackifier pass.
+class FPFormat<bits<3> val> {
+  bits<3> Value = val;
+}
+def NotFP      : FPFormat<0>;
+def ZeroArgFP  : FPFormat<1>;
+def OneArgFP   : FPFormat<2>;
+def OneArgFPRW : FPFormat<3>;
+def TwoArgFP   : FPFormat<4>;
+def CompareFP  : FPFormat<5>;
+def CondMovFP  : FPFormat<6>;
+def SpecialFP  : FPFormat<7>;
+
+// Prefix byte classes which are used to indicate to the ad-hoc machine code
+// emitter that various prefix bytes are required.
+class OpSize { bit hasOpSizePrefix = 1; }
+class AdSize { bit hasAdSizePrefix = 1; }
+class REX_W  { bit hasREX_WPrefix = 1; }
+class LOCK   { bit hasLockPrefix = 1; }
+class SegFS  { bits<2> SegOvrBits = 1; }
+class SegGS  { bits<2> SegOvrBits = 2; }
+class TB     { bits<4> Prefix = 1; }
+class REP    { bits<4> Prefix = 2; }
+class D8     { bits<4> Prefix = 3; }
+class D9     { bits<4> Prefix = 4; }
+class DA     { bits<4> Prefix = 5; }
+class DB     { bits<4> Prefix = 6; }
+class DC     { bits<4> Prefix = 7; }
+class DD     { bits<4> Prefix = 8; }
+class DE     { bits<4> Prefix = 9; }
+class DF     { bits<4> Prefix = 10; }
+class XD     { bits<4> Prefix = 11; }
+class XS     { bits<4> Prefix = 12; }
+class T8     { bits<4> Prefix = 13; }
+class TA     { bits<4> Prefix = 14; }
+class TF     { bits<4> Prefix = 15; }
+
+class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
+              string AsmStr>
+  : Instruction {
+  let Namespace = "X86";
+
+  bits<8> Opcode = opcod;
+  Format Form = f;
+  bits<6> FormBits = Form.Value;
+  ImmType ImmT = i;
+  bits<3> ImmTypeBits = ImmT.Value;
+
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  string AsmString = AsmStr;
+
+  //
+  // Attributes specific to X86 instructions...
+  //
+  bit hasOpSizePrefix = 0;  // Does this inst have a 0x66 prefix?
+  bit hasAdSizePrefix = 0;  // Does this inst have a 0x67 prefix?
+
+  bits<4> Prefix = 0;       // Which prefix byte does this inst have?
+  bit hasREX_WPrefix  = 0;  // Does this inst requires the REX.W prefix?
+  FPFormat FPForm;          // What flavor of FP instruction is this?
+  bits<3> FPFormBits = 0;
+  bit hasLockPrefix = 0;    // Does this inst have a 0xF0 prefix?
+  bits<2> SegOvrBits = 0;   // Segment override prefix.
+}
+
+class I<bits<8> o, Format f, dag outs, dag ins, string asm, list<dag> pattern>
+  : X86Inst<o, f, NoImm, outs, ins, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii8 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern>
+  : X86Inst<o, f, Imm8 , outs, ins, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii16<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern>
+  : X86Inst<o, f, Imm16, outs, ins, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+class Ii32<bits<8> o, Format f, dag outs, dag ins, string asm, 
+           list<dag> pattern>
+  : X86Inst<o, f, Imm32, outs, ins, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+// FPStack Instruction Templates:
+// FPI - Floating Point Instruction template.
+class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
+  : I<o, F, outs, ins, asm, []> {}
+
+// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
+class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
+  : X86Inst<0, Pseudo, NoImm, outs, ins, ""> {
+  let FPForm = fp; let FPFormBits = FPForm.Value;
+  let Pattern = pattern;
+}
+
+// Templates for instructions that use a 16- or 32-bit segmented address as
+//  their only operand: lcall (FAR CALL) and ljmp (FAR JMP)
+//
+//   Iseg16 - 16-bit segment selector, 16-bit offset
+//   Iseg32 - 16-bit segment selector, 32-bit offset
+
+class Iseg16 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+              list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class Iseg32 <bits<8> o, Format f, dag outs, dag ins, string asm, 
+              list<dag> pattern> : X86Inst<o, f, NoImm, outs, ins, asm> {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+// SSE1 Instruction Templates:
+// 
+//   SSI   - SSE1 instructions with XS prefix.
+//   PSI   - SSE1 instructions with TB prefix.
+//   PSIi8 - SSE1 instructions with ImmT == Imm8 and TB prefix.
+
+class SSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class SSIi8<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE1]>;
+class PSI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>;
+class PSIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasSSE1]>;
+
+// SSE2 Instruction Templates:
+// 
+//   SDI    - SSE2 instructions with XD prefix.
+//   SDIi8  - SSE2 instructions with ImmT == Imm8 and XD prefix.
+//   SSDIi8 - SSE2 instructions with ImmT == Imm8 and XS prefix.
+//   PDI    - SSE2 instructions with TB and OpSize prefixes.
+//   PDIi8  - SSE2 instructions with ImmT == Imm8 and TB and OpSize prefixes.
+
+class SDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE2]>;
+class SSDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE2]>;
+class PDI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+class PDIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE2]>;
+
+// SSE3 Instruction Templates:
+// 
+//   S3I   - SSE3 instructions with TB and OpSize prefixes.
+//   S3SI  - SSE3 instructions with XS prefix.
+//   S3DI  - SSE3 instructions with XD prefix.
+
+class S3SI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, XS, Requires<[HasSSE3]>;
+class S3DI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, XD, Requires<[HasSSE3]>;
+class S3I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasSSE3]>;
+
+
+// SSSE3 Instruction Templates:
+// 
+//   SS38I - SSSE3 instructions with T8 prefix.
+//   SS3AI - SSSE3 instructions with TA prefix.
+//
+// Note: SSSE3 instructions have 64-bit and 128-bit versions. The 64-bit version
+// uses the MMX registers. We put those instructions here because they better
+// fit into the SSSE3 instruction category rather than the MMX category.
+
+class SS38I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSSE3]>;
+class SS3AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSSE3]>;
+
+// SSE4.1 Instruction Templates:
+// 
+//   SS48I - SSE 4.1 instructions with T8 prefix.
+//   SS41AIi8 - SSE 4.1 instructions with TA prefix and ImmT == Imm8.
+//
+class SS48I<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE41]>;
+class SS4AIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE41]>;
+
+// SSE4.2 Instruction Templates:
+// 
+//   SS428I - SSE 4.2 instructions with T8 prefix.
+class SS428I<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, T8, Requires<[HasSSE42]>;
+
+//   SS42FI - SSE 4.2 instructions with TF prefix.
+class SS42FI<bits<8> o, Format F, dag outs, dag ins, string asm,
+              list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TF, Requires<[HasSSE42]>;
+      
+//   SS42AI - SSE 4.2 instructions with TA prefix
+class SS42AI<bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TA, Requires<[HasSSE42]>;
+
+// X86-64 Instruction templates...
+//
+
+class RI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi8 <bits<8> o, Format F, dag outs, dag ins, string asm,
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, REX_W;
+class RIi32 <bits<8> o, Format F, dag outs, dag ins, string asm,
+             list<dag> pattern>
+      : Ii32<o, F, outs, ins, asm, pattern>, REX_W;
+
+class RIi64<bits<8> o, Format f, dag outs, dag ins, string asm,
+            list<dag> pattern>
+  : X86Inst<o, f, Imm64, outs, ins, asm>, REX_W {
+  let Pattern = pattern;
+  let CodeSize = 3;
+}
+
+class RSSI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern>
+      : SSI<o, F, outs, ins, asm, pattern>, REX_W;
+class RSDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern>
+      : SDI<o, F, outs, ins, asm, pattern>, REX_W;
+class RPDI<bits<8> o, Format F, dag outs, dag ins, string asm,
+           list<dag> pattern>
+      : PDI<o, F, outs, ins, asm, pattern>, REX_W;
+
+// MMX Instruction templates
+//
+
+// MMXI   - MMX instructions with TB prefix.
+// MMXI64 - MMX instructions with TB prefix valid only in 64 bit mode.
+// MMX2I  - MMX / SSE2 instructions with TB and OpSize prefixes.
+// MMXIi8 - MMX instructions with ImmT == Imm8 and TB prefix.
+// MMXRI  - MMX instructions with TB prefix and REX.W.
+// MMXID  - MMX instructions with XD prefix.
+// MMXIS  - MMX instructions with XS prefix.
+class MMXI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+           list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXI64<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX,In64BitMode]>;
+class MMXRI<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TB, REX_W, Requires<[HasMMX]>;
+class MMX2I<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern>
+      : I<o, F, outs, ins, asm, pattern>, TB, OpSize, Requires<[HasMMX]>;
+class MMXIi8<bits<8> o, Format F, dag outs, dag ins, string asm, 
+             list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, TB, Requires<[HasMMX]>;
+class MMXID<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, XD, Requires<[HasMMX]>;
+class MMXIS<bits<8> o, Format F, dag outs, dag ins, string asm, 
+            list<dag> pattern>
+      : Ii8<o, F, outs, ins, asm, pattern>, XS, Requires<[HasMMX]>;
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
new file mode 100644
index 0000000..6b9478d
--- /dev/null
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -0,0 +1,62 @@
+//======- X86InstrFragmentsSIMD.td - x86 ISA -------------*- tablegen -*-=====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file provides pattern fragments useful for SIMD instructions.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Pattern Fragments
+//===----------------------------------------------------------------------===//
+
+def load_mmx : PatFrag<(ops node:$ptr), (v1i64 (load node:$ptr))>;
+
+def bc_v8i8  : PatFrag<(ops node:$in), (v8i8  (bitconvert node:$in))>;
+def bc_v4i16 : PatFrag<(ops node:$in), (v4i16 (bitconvert node:$in))>;
+def bc_v2i32 : PatFrag<(ops node:$in), (v2i32 (bitconvert node:$in))>;
+def bc_v1i64 : PatFrag<(ops node:$in), (v1i64 (bitconvert node:$in))>;
+
+//===----------------------------------------------------------------------===//
+// MMX Masks
+//===----------------------------------------------------------------------===//
+
+// MMX_SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to
+// PSHUFW imm.
+def MMX_SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+  return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <2, 6, 3, 7, ...>
+def mmx_unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+                         (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, v2, <0, 4, 2, 5, ...>
+def mmx_unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+                         (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+def mmx_unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Patterns for: vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+def mmx_unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+                               (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def mmx_pshufw : PatFrag<(ops node:$lhs, node:$rhs),
+                         (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], MMX_SHUFFLE_get_shuf_imm>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
new file mode 100644
index 0000000..8d13c0f
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -0,0 +1,3630 @@
+//===- X86InstrInfo.cpp - X86 Instruction Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrInfo.h"
+#include "X86.h"
+#include "X86GenInstrInfo.inc"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCAsmInfo.h"
+
+#include <limits>
+
+using namespace llvm;
+
+static cl::opt<bool>
+NoFusing("disable-spill-fusing",
+         cl::desc("Disable fusing of spill code into instructions"));
+static cl::opt<bool>
+PrintFailedFusing("print-failed-fuse-candidates",
+                  cl::desc("Print instructions that the allocator wants to"
+                           " fuse, but the X86 backend currently can't"),
+                  cl::Hidden);
+static cl::opt<bool>
+ReMatPICStubLoad("remat-pic-stub-load",
+                 cl::desc("Re-materialize load from stub in PIC mode"),
+                 cl::init(false), cl::Hidden);
+
+X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
+  : TargetInstrInfoImpl(X86Insts, array_lengthof(X86Insts)),
+    TM(tm), RI(tm, *this) {
+  SmallVector<unsigned,16> AmbEntries;
+  static const unsigned OpTbl2Addr[][2] = {
+    { X86::ADC32ri,     X86::ADC32mi },
+    { X86::ADC32ri8,    X86::ADC32mi8 },
+    { X86::ADC32rr,     X86::ADC32mr },
+    { X86::ADC64ri32,   X86::ADC64mi32 },
+    { X86::ADC64ri8,    X86::ADC64mi8 },
+    { X86::ADC64rr,     X86::ADC64mr },
+    { X86::ADD16ri,     X86::ADD16mi },
+    { X86::ADD16ri8,    X86::ADD16mi8 },
+    { X86::ADD16rr,     X86::ADD16mr },
+    { X86::ADD32ri,     X86::ADD32mi },
+    { X86::ADD32ri8,    X86::ADD32mi8 },
+    { X86::ADD32rr,     X86::ADD32mr },
+    { X86::ADD64ri32,   X86::ADD64mi32 },
+    { X86::ADD64ri8,    X86::ADD64mi8 },
+    { X86::ADD64rr,     X86::ADD64mr },
+    { X86::ADD8ri,      X86::ADD8mi },
+    { X86::ADD8rr,      X86::ADD8mr },
+    { X86::AND16ri,     X86::AND16mi },
+    { X86::AND16ri8,    X86::AND16mi8 },
+    { X86::AND16rr,     X86::AND16mr },
+    { X86::AND32ri,     X86::AND32mi },
+    { X86::AND32ri8,    X86::AND32mi8 },
+    { X86::AND32rr,     X86::AND32mr },
+    { X86::AND64ri32,   X86::AND64mi32 },
+    { X86::AND64ri8,    X86::AND64mi8 },
+    { X86::AND64rr,     X86::AND64mr },
+    { X86::AND8ri,      X86::AND8mi },
+    { X86::AND8rr,      X86::AND8mr },
+    { X86::DEC16r,      X86::DEC16m },
+    { X86::DEC32r,      X86::DEC32m },
+    { X86::DEC64_16r,   X86::DEC64_16m },
+    { X86::DEC64_32r,   X86::DEC64_32m },
+    { X86::DEC64r,      X86::DEC64m },
+    { X86::DEC8r,       X86::DEC8m },
+    { X86::INC16r,      X86::INC16m },
+    { X86::INC32r,      X86::INC32m },
+    { X86::INC64_16r,   X86::INC64_16m },
+    { X86::INC64_32r,   X86::INC64_32m },
+    { X86::INC64r,      X86::INC64m },
+    { X86::INC8r,       X86::INC8m },
+    { X86::NEG16r,      X86::NEG16m },
+    { X86::NEG32r,      X86::NEG32m },
+    { X86::NEG64r,      X86::NEG64m },
+    { X86::NEG8r,       X86::NEG8m },
+    { X86::NOT16r,      X86::NOT16m },
+    { X86::NOT32r,      X86::NOT32m },
+    { X86::NOT64r,      X86::NOT64m },
+    { X86::NOT8r,       X86::NOT8m },
+    { X86::OR16ri,      X86::OR16mi },
+    { X86::OR16ri8,     X86::OR16mi8 },
+    { X86::OR16rr,      X86::OR16mr },
+    { X86::OR32ri,      X86::OR32mi },
+    { X86::OR32ri8,     X86::OR32mi8 },
+    { X86::OR32rr,      X86::OR32mr },
+    { X86::OR64ri32,    X86::OR64mi32 },
+    { X86::OR64ri8,     X86::OR64mi8 },
+    { X86::OR64rr,      X86::OR64mr },
+    { X86::OR8ri,       X86::OR8mi },
+    { X86::OR8rr,       X86::OR8mr },
+    { X86::ROL16r1,     X86::ROL16m1 },
+    { X86::ROL16rCL,    X86::ROL16mCL },
+    { X86::ROL16ri,     X86::ROL16mi },
+    { X86::ROL32r1,     X86::ROL32m1 },
+    { X86::ROL32rCL,    X86::ROL32mCL },
+    { X86::ROL32ri,     X86::ROL32mi },
+    { X86::ROL64r1,     X86::ROL64m1 },
+    { X86::ROL64rCL,    X86::ROL64mCL },
+    { X86::ROL64ri,     X86::ROL64mi },
+    { X86::ROL8r1,      X86::ROL8m1 },
+    { X86::ROL8rCL,     X86::ROL8mCL },
+    { X86::ROL8ri,      X86::ROL8mi },
+    { X86::ROR16r1,     X86::ROR16m1 },
+    { X86::ROR16rCL,    X86::ROR16mCL },
+    { X86::ROR16ri,     X86::ROR16mi },
+    { X86::ROR32r1,     X86::ROR32m1 },
+    { X86::ROR32rCL,    X86::ROR32mCL },
+    { X86::ROR32ri,     X86::ROR32mi },
+    { X86::ROR64r1,     X86::ROR64m1 },
+    { X86::ROR64rCL,    X86::ROR64mCL },
+    { X86::ROR64ri,     X86::ROR64mi },
+    { X86::ROR8r1,      X86::ROR8m1 },
+    { X86::ROR8rCL,     X86::ROR8mCL },
+    { X86::ROR8ri,      X86::ROR8mi },
+    { X86::SAR16r1,     X86::SAR16m1 },
+    { X86::SAR16rCL,    X86::SAR16mCL },
+    { X86::SAR16ri,     X86::SAR16mi },
+    { X86::SAR32r1,     X86::SAR32m1 },
+    { X86::SAR32rCL,    X86::SAR32mCL },
+    { X86::SAR32ri,     X86::SAR32mi },
+    { X86::SAR64r1,     X86::SAR64m1 },
+    { X86::SAR64rCL,    X86::SAR64mCL },
+    { X86::SAR64ri,     X86::SAR64mi },
+    { X86::SAR8r1,      X86::SAR8m1 },
+    { X86::SAR8rCL,     X86::SAR8mCL },
+    { X86::SAR8ri,      X86::SAR8mi },
+    { X86::SBB32ri,     X86::SBB32mi },
+    { X86::SBB32ri8,    X86::SBB32mi8 },
+    { X86::SBB32rr,     X86::SBB32mr },
+    { X86::SBB64ri32,   X86::SBB64mi32 },
+    { X86::SBB64ri8,    X86::SBB64mi8 },
+    { X86::SBB64rr,     X86::SBB64mr },
+    { X86::SHL16rCL,    X86::SHL16mCL },
+    { X86::SHL16ri,     X86::SHL16mi },
+    { X86::SHL32rCL,    X86::SHL32mCL },
+    { X86::SHL32ri,     X86::SHL32mi },
+    { X86::SHL64rCL,    X86::SHL64mCL },
+    { X86::SHL64ri,     X86::SHL64mi },
+    { X86::SHL8rCL,     X86::SHL8mCL },
+    { X86::SHL8ri,      X86::SHL8mi },
+    { X86::SHLD16rrCL,  X86::SHLD16mrCL },
+    { X86::SHLD16rri8,  X86::SHLD16mri8 },
+    { X86::SHLD32rrCL,  X86::SHLD32mrCL },
+    { X86::SHLD32rri8,  X86::SHLD32mri8 },
+    { X86::SHLD64rrCL,  X86::SHLD64mrCL },
+    { X86::SHLD64rri8,  X86::SHLD64mri8 },
+    { X86::SHR16r1,     X86::SHR16m1 },
+    { X86::SHR16rCL,    X86::SHR16mCL },
+    { X86::SHR16ri,     X86::SHR16mi },
+    { X86::SHR32r1,     X86::SHR32m1 },
+    { X86::SHR32rCL,    X86::SHR32mCL },
+    { X86::SHR32ri,     X86::SHR32mi },
+    { X86::SHR64r1,     X86::SHR64m1 },
+    { X86::SHR64rCL,    X86::SHR64mCL },
+    { X86::SHR64ri,     X86::SHR64mi },
+    { X86::SHR8r1,      X86::SHR8m1 },
+    { X86::SHR8rCL,     X86::SHR8mCL },
+    { X86::SHR8ri,      X86::SHR8mi },
+    { X86::SHRD16rrCL,  X86::SHRD16mrCL },
+    { X86::SHRD16rri8,  X86::SHRD16mri8 },
+    { X86::SHRD32rrCL,  X86::SHRD32mrCL },
+    { X86::SHRD32rri8,  X86::SHRD32mri8 },
+    { X86::SHRD64rrCL,  X86::SHRD64mrCL },
+    { X86::SHRD64rri8,  X86::SHRD64mri8 },
+    { X86::SUB16ri,     X86::SUB16mi },
+    { X86::SUB16ri8,    X86::SUB16mi8 },
+    { X86::SUB16rr,     X86::SUB16mr },
+    { X86::SUB32ri,     X86::SUB32mi },
+    { X86::SUB32ri8,    X86::SUB32mi8 },
+    { X86::SUB32rr,     X86::SUB32mr },
+    { X86::SUB64ri32,   X86::SUB64mi32 },
+    { X86::SUB64ri8,    X86::SUB64mi8 },
+    { X86::SUB64rr,     X86::SUB64mr },
+    { X86::SUB8ri,      X86::SUB8mi },
+    { X86::SUB8rr,      X86::SUB8mr },
+    { X86::XOR16ri,     X86::XOR16mi },
+    { X86::XOR16ri8,    X86::XOR16mi8 },
+    { X86::XOR16rr,     X86::XOR16mr },
+    { X86::XOR32ri,     X86::XOR32mi },
+    { X86::XOR32ri8,    X86::XOR32mi8 },
+    { X86::XOR32rr,     X86::XOR32mr },
+    { X86::XOR64ri32,   X86::XOR64mi32 },
+    { X86::XOR64ri8,    X86::XOR64mi8 },
+    { X86::XOR64rr,     X86::XOR64mr },
+    { X86::XOR8ri,      X86::XOR8mi },
+    { X86::XOR8rr,      X86::XOR8mr }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl2Addr); i != e; ++i) {
+    unsigned RegOp = OpTbl2Addr[i][0];
+    unsigned MemOp = OpTbl2Addr[i][1];
+    if (!RegOp2MemOpTable2Addr.insert(std::make_pair((unsigned*)RegOp,
+                                               std::make_pair(MemOp,0))).second)
+      assert(false && "Duplicated entries?");
+    // Index 0, folded load and store, no alignment requirement.
+    unsigned AuxInfo = 0 | (1 << 4) | (1 << 5);
+    if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+                                                std::make_pair(RegOp,
+                                                              AuxInfo))).second)
+      AmbEntries.push_back(MemOp);
+  }
+
+  // If the third value is 1, then it's folding either a load or a store.
+  static const unsigned OpTbl0[][4] = {
+    { X86::BT16ri8,     X86::BT16mi8, 1, 0 },
+    { X86::BT32ri8,     X86::BT32mi8, 1, 0 },
+    { X86::BT64ri8,     X86::BT64mi8, 1, 0 },
+    { X86::CALL32r,     X86::CALL32m, 1, 0 },
+    { X86::CALL64r,     X86::CALL64m, 1, 0 },
+    { X86::CMP16ri,     X86::CMP16mi, 1, 0 },
+    { X86::CMP16ri8,    X86::CMP16mi8, 1, 0 },
+    { X86::CMP16rr,     X86::CMP16mr, 1, 0 },
+    { X86::CMP32ri,     X86::CMP32mi, 1, 0 },
+    { X86::CMP32ri8,    X86::CMP32mi8, 1, 0 },
+    { X86::CMP32rr,     X86::CMP32mr, 1, 0 },
+    { X86::CMP64ri32,   X86::CMP64mi32, 1, 0 },
+    { X86::CMP64ri8,    X86::CMP64mi8, 1, 0 },
+    { X86::CMP64rr,     X86::CMP64mr, 1, 0 },
+    { X86::CMP8ri,      X86::CMP8mi, 1, 0 },
+    { X86::CMP8rr,      X86::CMP8mr, 1, 0 },
+    { X86::DIV16r,      X86::DIV16m, 1, 0 },
+    { X86::DIV32r,      X86::DIV32m, 1, 0 },
+    { X86::DIV64r,      X86::DIV64m, 1, 0 },
+    { X86::DIV8r,       X86::DIV8m, 1, 0 },
+    { X86::EXTRACTPSrr, X86::EXTRACTPSmr, 0, 16 },
+    { X86::FsMOVAPDrr,  X86::MOVSDmr, 0, 0 },
+    { X86::FsMOVAPSrr,  X86::MOVSSmr, 0, 0 },
+    { X86::IDIV16r,     X86::IDIV16m, 1, 0 },
+    { X86::IDIV32r,     X86::IDIV32m, 1, 0 },
+    { X86::IDIV64r,     X86::IDIV64m, 1, 0 },
+    { X86::IDIV8r,      X86::IDIV8m, 1, 0 },
+    { X86::IMUL16r,     X86::IMUL16m, 1, 0 },
+    { X86::IMUL32r,     X86::IMUL32m, 1, 0 },
+    { X86::IMUL64r,     X86::IMUL64m, 1, 0 },
+    { X86::IMUL8r,      X86::IMUL8m, 1, 0 },
+    { X86::JMP32r,      X86::JMP32m, 1, 0 },
+    { X86::JMP64r,      X86::JMP64m, 1, 0 },
+    { X86::MOV16ri,     X86::MOV16mi, 0, 0 },
+    { X86::MOV16rr,     X86::MOV16mr, 0, 0 },
+    { X86::MOV32ri,     X86::MOV32mi, 0, 0 },
+    { X86::MOV32rr,     X86::MOV32mr, 0, 0 },
+    { X86::MOV64ri32,   X86::MOV64mi32, 0, 0 },
+    { X86::MOV64rr,     X86::MOV64mr, 0, 0 },
+    { X86::MOV8ri,      X86::MOV8mi, 0, 0 },
+    { X86::MOV8rr,      X86::MOV8mr, 0, 0 },
+    { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, 0, 0 },
+    { X86::MOVAPDrr,    X86::MOVAPDmr, 0, 16 },
+    { X86::MOVAPSrr,    X86::MOVAPSmr, 0, 16 },
+    { X86::MOVDQArr,    X86::MOVDQAmr, 0, 16 },
+    { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, 0, 0 },
+    { X86::MOVPQIto64rr,X86::MOVPQI2QImr, 0, 0 },
+    { X86::MOVPS2SSrr,  X86::MOVPS2SSmr, 0, 0 },
+    { X86::MOVSDrr,     X86::MOVSDmr, 0, 0 },
+    { X86::MOVSDto64rr, X86::MOVSDto64mr, 0, 0 },
+    { X86::MOVSS2DIrr,  X86::MOVSS2DImr, 0, 0 },
+    { X86::MOVSSrr,     X86::MOVSSmr, 0, 0 },
+    { X86::MOVUPDrr,    X86::MOVUPDmr, 0, 0 },
+    { X86::MOVUPSrr,    X86::MOVUPSmr, 0, 0 },
+    { X86::MUL16r,      X86::MUL16m, 1, 0 },
+    { X86::MUL32r,      X86::MUL32m, 1, 0 },
+    { X86::MUL64r,      X86::MUL64m, 1, 0 },
+    { X86::MUL8r,       X86::MUL8m, 1, 0 },
+    { X86::SETAEr,      X86::SETAEm, 0, 0 },
+    { X86::SETAr,       X86::SETAm, 0, 0 },
+    { X86::SETBEr,      X86::SETBEm, 0, 0 },
+    { X86::SETBr,       X86::SETBm, 0, 0 },
+    { X86::SETEr,       X86::SETEm, 0, 0 },
+    { X86::SETGEr,      X86::SETGEm, 0, 0 },
+    { X86::SETGr,       X86::SETGm, 0, 0 },
+    { X86::SETLEr,      X86::SETLEm, 0, 0 },
+    { X86::SETLr,       X86::SETLm, 0, 0 },
+    { X86::SETNEr,      X86::SETNEm, 0, 0 },
+    { X86::SETNOr,      X86::SETNOm, 0, 0 },
+    { X86::SETNPr,      X86::SETNPm, 0, 0 },
+    { X86::SETNSr,      X86::SETNSm, 0, 0 },
+    { X86::SETOr,       X86::SETOm, 0, 0 },
+    { X86::SETPr,       X86::SETPm, 0, 0 },
+    { X86::SETSr,       X86::SETSm, 0, 0 },
+    { X86::TAILJMPr,    X86::TAILJMPm, 1, 0 },
+    { X86::TEST16ri,    X86::TEST16mi, 1, 0 },
+    { X86::TEST32ri,    X86::TEST32mi, 1, 0 },
+    { X86::TEST64ri32,  X86::TEST64mi32, 1, 0 },
+    { X86::TEST8ri,     X86::TEST8mi, 1, 0 }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl0); i != e; ++i) {
+    unsigned RegOp = OpTbl0[i][0];
+    unsigned MemOp = OpTbl0[i][1];
+    unsigned Align = OpTbl0[i][3];
+    if (!RegOp2MemOpTable0.insert(std::make_pair((unsigned*)RegOp,
+                                           std::make_pair(MemOp,Align))).second)
+      assert(false && "Duplicated entries?");
+    unsigned FoldedLoad = OpTbl0[i][2];
+    // Index 0, folded load or store.
+    unsigned AuxInfo = 0 | (FoldedLoad << 4) | ((FoldedLoad^1) << 5);
+    if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+      if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+                                     std::make_pair(RegOp, AuxInfo))).second)
+        AmbEntries.push_back(MemOp);
+  }
+
+  static const unsigned OpTbl1[][3] = {
+    { X86::CMP16rr,         X86::CMP16rm, 0 },
+    { X86::CMP32rr,         X86::CMP32rm, 0 },
+    { X86::CMP64rr,         X86::CMP64rm, 0 },
+    { X86::CMP8rr,          X86::CMP8rm, 0 },
+    { X86::CVTSD2SSrr,      X86::CVTSD2SSrm, 0 },
+    { X86::CVTSI2SD64rr,    X86::CVTSI2SD64rm, 0 },
+    { X86::CVTSI2SDrr,      X86::CVTSI2SDrm, 0 },
+    { X86::CVTSI2SS64rr,    X86::CVTSI2SS64rm, 0 },
+    { X86::CVTSI2SSrr,      X86::CVTSI2SSrm, 0 },
+    { X86::CVTSS2SDrr,      X86::CVTSS2SDrm, 0 },
+    { X86::CVTTSD2SI64rr,   X86::CVTTSD2SI64rm, 0 },
+    { X86::CVTTSD2SIrr,     X86::CVTTSD2SIrm, 0 },
+    { X86::CVTTSS2SI64rr,   X86::CVTTSS2SI64rm, 0 },
+    { X86::CVTTSS2SIrr,     X86::CVTTSS2SIrm, 0 },
+    { X86::FsMOVAPDrr,      X86::MOVSDrm, 0 },
+    { X86::FsMOVAPSrr,      X86::MOVSSrm, 0 },
+    { X86::IMUL16rri,       X86::IMUL16rmi, 0 },
+    { X86::IMUL16rri8,      X86::IMUL16rmi8, 0 },
+    { X86::IMUL32rri,       X86::IMUL32rmi, 0 },
+    { X86::IMUL32rri8,      X86::IMUL32rmi8, 0 },
+    { X86::IMUL64rri32,     X86::IMUL64rmi32, 0 },
+    { X86::IMUL64rri8,      X86::IMUL64rmi8, 0 },
+    { X86::Int_CMPSDrr,     X86::Int_CMPSDrm, 0 },
+    { X86::Int_CMPSSrr,     X86::Int_CMPSSrm, 0 },
+    { X86::Int_COMISDrr,    X86::Int_COMISDrm, 0 },
+    { X86::Int_COMISSrr,    X86::Int_COMISSrm, 0 },
+    { X86::Int_CVTDQ2PDrr,  X86::Int_CVTDQ2PDrm, 16 },
+    { X86::Int_CVTDQ2PSrr,  X86::Int_CVTDQ2PSrm, 16 },
+    { X86::Int_CVTPD2DQrr,  X86::Int_CVTPD2DQrm, 16 },
+    { X86::Int_CVTPD2PSrr,  X86::Int_CVTPD2PSrm, 16 },
+    { X86::Int_CVTPS2DQrr,  X86::Int_CVTPS2DQrm, 16 },
+    { X86::Int_CVTPS2PDrr,  X86::Int_CVTPS2PDrm, 0 },
+    { X86::Int_CVTSD2SI64rr,X86::Int_CVTSD2SI64rm, 0 },
+    { X86::Int_CVTSD2SIrr,  X86::Int_CVTSD2SIrm, 0 },
+    { X86::Int_CVTSD2SSrr,  X86::Int_CVTSD2SSrm, 0 },
+    { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
+    { X86::Int_CVTSI2SDrr,  X86::Int_CVTSI2SDrm, 0 },
+    { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
+    { X86::Int_CVTSI2SSrr,  X86::Int_CVTSI2SSrm, 0 },
+    { X86::Int_CVTSS2SDrr,  X86::Int_CVTSS2SDrm, 0 },
+    { X86::Int_CVTSS2SI64rr,X86::Int_CVTSS2SI64rm, 0 },
+    { X86::Int_CVTSS2SIrr,  X86::Int_CVTSS2SIrm, 0 },
+    { X86::Int_CVTTPD2DQrr, X86::Int_CVTTPD2DQrm, 16 },
+    { X86::Int_CVTTPS2DQrr, X86::Int_CVTTPS2DQrm, 16 },
+    { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, 0 },
+    { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, 0 },
+    { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, 0 },
+    { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, 0 },
+    { X86::Int_UCOMISDrr,   X86::Int_UCOMISDrm, 0 },
+    { X86::Int_UCOMISSrr,   X86::Int_UCOMISSrm, 0 },
+    { X86::MOV16rr,         X86::MOV16rm, 0 },
+    { X86::MOV32rr,         X86::MOV32rm, 0 },
+    { X86::MOV64rr,         X86::MOV64rm, 0 },
+    { X86::MOV64toPQIrr,    X86::MOVQI2PQIrm, 0 },
+    { X86::MOV64toSDrr,     X86::MOV64toSDrm, 0 },
+    { X86::MOV8rr,          X86::MOV8rm, 0 },
+    { X86::MOVAPDrr,        X86::MOVAPDrm, 16 },
+    { X86::MOVAPSrr,        X86::MOVAPSrm, 16 },
+    { X86::MOVDDUPrr,       X86::MOVDDUPrm, 0 },
+    { X86::MOVDI2PDIrr,     X86::MOVDI2PDIrm, 0 },
+    { X86::MOVDI2SSrr,      X86::MOVDI2SSrm, 0 },
+    { X86::MOVDQArr,        X86::MOVDQArm, 16 },
+    { X86::MOVSD2PDrr,      X86::MOVSD2PDrm, 0 },
+    { X86::MOVSDrr,         X86::MOVSDrm, 0 },
+    { X86::MOVSHDUPrr,      X86::MOVSHDUPrm, 16 },
+    { X86::MOVSLDUPrr,      X86::MOVSLDUPrm, 16 },
+    { X86::MOVSS2PSrr,      X86::MOVSS2PSrm, 0 },
+    { X86::MOVSSrr,         X86::MOVSSrm, 0 },
+    { X86::MOVSX16rr8,      X86::MOVSX16rm8, 0 },
+    { X86::MOVSX32rr16,     X86::MOVSX32rm16, 0 },
+    { X86::MOVSX32rr8,      X86::MOVSX32rm8, 0 },
+    { X86::MOVSX64rr16,     X86::MOVSX64rm16, 0 },
+    { X86::MOVSX64rr32,     X86::MOVSX64rm32, 0 },
+    { X86::MOVSX64rr8,      X86::MOVSX64rm8, 0 },
+    { X86::MOVUPDrr,        X86::MOVUPDrm, 16 },
+    { X86::MOVUPSrr,        X86::MOVUPSrm, 0 },
+    { X86::MOVZDI2PDIrr,    X86::MOVZDI2PDIrm, 0 },
+    { X86::MOVZQI2PQIrr,    X86::MOVZQI2PQIrm, 0 },
+    { X86::MOVZPQILo2PQIrr, X86::MOVZPQILo2PQIrm, 16 },
+    { X86::MOVZX16rr8,      X86::MOVZX16rm8, 0 },
+    { X86::MOVZX32rr16,     X86::MOVZX32rm16, 0 },
+    { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 },
+    { X86::MOVZX32rr8,      X86::MOVZX32rm8, 0 },
+    { X86::MOVZX64rr16,     X86::MOVZX64rm16, 0 },
+    { X86::MOVZX64rr32,     X86::MOVZX64rm32, 0 },
+    { X86::MOVZX64rr8,      X86::MOVZX64rm8, 0 },
+    { X86::PSHUFDri,        X86::PSHUFDmi, 16 },
+    { X86::PSHUFHWri,       X86::PSHUFHWmi, 16 },
+    { X86::PSHUFLWri,       X86::PSHUFLWmi, 16 },
+    { X86::RCPPSr,          X86::RCPPSm, 16 },
+    { X86::RCPPSr_Int,      X86::RCPPSm_Int, 16 },
+    { X86::RSQRTPSr,        X86::RSQRTPSm, 16 },
+    { X86::RSQRTPSr_Int,    X86::RSQRTPSm_Int, 16 },
+    { X86::RSQRTSSr,        X86::RSQRTSSm, 0 },
+    { X86::RSQRTSSr_Int,    X86::RSQRTSSm_Int, 0 },
+    { X86::SQRTPDr,         X86::SQRTPDm, 16 },
+    { X86::SQRTPDr_Int,     X86::SQRTPDm_Int, 16 },
+    { X86::SQRTPSr,         X86::SQRTPSm, 16 },
+    { X86::SQRTPSr_Int,     X86::SQRTPSm_Int, 16 },
+    { X86::SQRTSDr,         X86::SQRTSDm, 0 },
+    { X86::SQRTSDr_Int,     X86::SQRTSDm_Int, 0 },
+    { X86::SQRTSSr,         X86::SQRTSSm, 0 },
+    { X86::SQRTSSr_Int,     X86::SQRTSSm_Int, 0 },
+    { X86::TEST16rr,        X86::TEST16rm, 0 },
+    { X86::TEST32rr,        X86::TEST32rm, 0 },
+    { X86::TEST64rr,        X86::TEST64rm, 0 },
+    { X86::TEST8rr,         X86::TEST8rm, 0 },
+    // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
+    { X86::UCOMISDrr,       X86::UCOMISDrm, 0 },
+    { X86::UCOMISSrr,       X86::UCOMISSrm, 0 }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
+    unsigned RegOp = OpTbl1[i][0];
+    unsigned MemOp = OpTbl1[i][1];
+    unsigned Align = OpTbl1[i][2];
+    if (!RegOp2MemOpTable1.insert(std::make_pair((unsigned*)RegOp,
+                                           std::make_pair(MemOp,Align))).second)
+      assert(false && "Duplicated entries?");
+    // Index 1, folded load
+    unsigned AuxInfo = 1 | (1 << 4);
+    if (RegOp != X86::FsMOVAPDrr && RegOp != X86::FsMOVAPSrr)
+      if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+                                     std::make_pair(RegOp, AuxInfo))).second)
+        AmbEntries.push_back(MemOp);
+  }
+
+  static const unsigned OpTbl2[][3] = {
+    { X86::ADC32rr,         X86::ADC32rm, 0 },
+    { X86::ADC64rr,         X86::ADC64rm, 0 },
+    { X86::ADD16rr,         X86::ADD16rm, 0 },
+    { X86::ADD32rr,         X86::ADD32rm, 0 },
+    { X86::ADD64rr,         X86::ADD64rm, 0 },
+    { X86::ADD8rr,          X86::ADD8rm, 0 },
+    { X86::ADDPDrr,         X86::ADDPDrm, 16 },
+    { X86::ADDPSrr,         X86::ADDPSrm, 16 },
+    { X86::ADDSDrr,         X86::ADDSDrm, 0 },
+    { X86::ADDSSrr,         X86::ADDSSrm, 0 },
+    { X86::ADDSUBPDrr,      X86::ADDSUBPDrm, 16 },
+    { X86::ADDSUBPSrr,      X86::ADDSUBPSrm, 16 },
+    { X86::AND16rr,         X86::AND16rm, 0 },
+    { X86::AND32rr,         X86::AND32rm, 0 },
+    { X86::AND64rr,         X86::AND64rm, 0 },
+    { X86::AND8rr,          X86::AND8rm, 0 },
+    { X86::ANDNPDrr,        X86::ANDNPDrm, 16 },
+    { X86::ANDNPSrr,        X86::ANDNPSrm, 16 },
+    { X86::ANDPDrr,         X86::ANDPDrm, 16 },
+    { X86::ANDPSrr,         X86::ANDPSrm, 16 },
+    { X86::CMOVA16rr,       X86::CMOVA16rm, 0 },
+    { X86::CMOVA32rr,       X86::CMOVA32rm, 0 },
+    { X86::CMOVA64rr,       X86::CMOVA64rm, 0 },
+    { X86::CMOVAE16rr,      X86::CMOVAE16rm, 0 },
+    { X86::CMOVAE32rr,      X86::CMOVAE32rm, 0 },
+    { X86::CMOVAE64rr,      X86::CMOVAE64rm, 0 },
+    { X86::CMOVB16rr,       X86::CMOVB16rm, 0 },
+    { X86::CMOVB32rr,       X86::CMOVB32rm, 0 },
+    { X86::CMOVB64rr,       X86::CMOVB64rm, 0 },
+    { X86::CMOVBE16rr,      X86::CMOVBE16rm, 0 },
+    { X86::CMOVBE32rr,      X86::CMOVBE32rm, 0 },
+    { X86::CMOVBE64rr,      X86::CMOVBE64rm, 0 },
+    { X86::CMOVE16rr,       X86::CMOVE16rm, 0 },
+    { X86::CMOVE32rr,       X86::CMOVE32rm, 0 },
+    { X86::CMOVE64rr,       X86::CMOVE64rm, 0 },
+    { X86::CMOVG16rr,       X86::CMOVG16rm, 0 },
+    { X86::CMOVG32rr,       X86::CMOVG32rm, 0 },
+    { X86::CMOVG64rr,       X86::CMOVG64rm, 0 },
+    { X86::CMOVGE16rr,      X86::CMOVGE16rm, 0 },
+    { X86::CMOVGE32rr,      X86::CMOVGE32rm, 0 },
+    { X86::CMOVGE64rr,      X86::CMOVGE64rm, 0 },
+    { X86::CMOVL16rr,       X86::CMOVL16rm, 0 },
+    { X86::CMOVL32rr,       X86::CMOVL32rm, 0 },
+    { X86::CMOVL64rr,       X86::CMOVL64rm, 0 },
+    { X86::CMOVLE16rr,      X86::CMOVLE16rm, 0 },
+    { X86::CMOVLE32rr,      X86::CMOVLE32rm, 0 },
+    { X86::CMOVLE64rr,      X86::CMOVLE64rm, 0 },
+    { X86::CMOVNE16rr,      X86::CMOVNE16rm, 0 },
+    { X86::CMOVNE32rr,      X86::CMOVNE32rm, 0 },
+    { X86::CMOVNE64rr,      X86::CMOVNE64rm, 0 },
+    { X86::CMOVNO16rr,      X86::CMOVNO16rm, 0 },
+    { X86::CMOVNO32rr,      X86::CMOVNO32rm, 0 },
+    { X86::CMOVNO64rr,      X86::CMOVNO64rm, 0 },
+    { X86::CMOVNP16rr,      X86::CMOVNP16rm, 0 },
+    { X86::CMOVNP32rr,      X86::CMOVNP32rm, 0 },
+    { X86::CMOVNP64rr,      X86::CMOVNP64rm, 0 },
+    { X86::CMOVNS16rr,      X86::CMOVNS16rm, 0 },
+    { X86::CMOVNS32rr,      X86::CMOVNS32rm, 0 },
+    { X86::CMOVNS64rr,      X86::CMOVNS64rm, 0 },
+    { X86::CMOVO16rr,       X86::CMOVO16rm, 0 },
+    { X86::CMOVO32rr,       X86::CMOVO32rm, 0 },
+    { X86::CMOVO64rr,       X86::CMOVO64rm, 0 },
+    { X86::CMOVP16rr,       X86::CMOVP16rm, 0 },
+    { X86::CMOVP32rr,       X86::CMOVP32rm, 0 },
+    { X86::CMOVP64rr,       X86::CMOVP64rm, 0 },
+    { X86::CMOVS16rr,       X86::CMOVS16rm, 0 },
+    { X86::CMOVS32rr,       X86::CMOVS32rm, 0 },
+    { X86::CMOVS64rr,       X86::CMOVS64rm, 0 },
+    { X86::CMPPDrri,        X86::CMPPDrmi, 16 },
+    { X86::CMPPSrri,        X86::CMPPSrmi, 16 },
+    { X86::CMPSDrr,         X86::CMPSDrm, 0 },
+    { X86::CMPSSrr,         X86::CMPSSrm, 0 },
+    { X86::DIVPDrr,         X86::DIVPDrm, 16 },
+    { X86::DIVPSrr,         X86::DIVPSrm, 16 },
+    { X86::DIVSDrr,         X86::DIVSDrm, 0 },
+    { X86::DIVSSrr,         X86::DIVSSrm, 0 },
+    { X86::FsANDNPDrr,      X86::FsANDNPDrm, 16 },
+    { X86::FsANDNPSrr,      X86::FsANDNPSrm, 16 },
+    { X86::FsANDPDrr,       X86::FsANDPDrm, 16 },
+    { X86::FsANDPSrr,       X86::FsANDPSrm, 16 },
+    { X86::FsORPDrr,        X86::FsORPDrm, 16 },
+    { X86::FsORPSrr,        X86::FsORPSrm, 16 },
+    { X86::FsXORPDrr,       X86::FsXORPDrm, 16 },
+    { X86::FsXORPSrr,       X86::FsXORPSrm, 16 },
+    { X86::HADDPDrr,        X86::HADDPDrm, 16 },
+    { X86::HADDPSrr,        X86::HADDPSrm, 16 },
+    { X86::HSUBPDrr,        X86::HSUBPDrm, 16 },
+    { X86::HSUBPSrr,        X86::HSUBPSrm, 16 },
+    { X86::IMUL16rr,        X86::IMUL16rm, 0 },
+    { X86::IMUL32rr,        X86::IMUL32rm, 0 },
+    { X86::IMUL64rr,        X86::IMUL64rm, 0 },
+    { X86::MAXPDrr,         X86::MAXPDrm, 16 },
+    { X86::MAXPDrr_Int,     X86::MAXPDrm_Int, 16 },
+    { X86::MAXPSrr,         X86::MAXPSrm, 16 },
+    { X86::MAXPSrr_Int,     X86::MAXPSrm_Int, 16 },
+    { X86::MAXSDrr,         X86::MAXSDrm, 0 },
+    { X86::MAXSDrr_Int,     X86::MAXSDrm_Int, 0 },
+    { X86::MAXSSrr,         X86::MAXSSrm, 0 },
+    { X86::MAXSSrr_Int,     X86::MAXSSrm_Int, 0 },
+    { X86::MINPDrr,         X86::MINPDrm, 16 },
+    { X86::MINPDrr_Int,     X86::MINPDrm_Int, 16 },
+    { X86::MINPSrr,         X86::MINPSrm, 16 },
+    { X86::MINPSrr_Int,     X86::MINPSrm_Int, 16 },
+    { X86::MINSDrr,         X86::MINSDrm, 0 },
+    { X86::MINSDrr_Int,     X86::MINSDrm_Int, 0 },
+    { X86::MINSSrr,         X86::MINSSrm, 0 },
+    { X86::MINSSrr_Int,     X86::MINSSrm_Int, 0 },
+    { X86::MULPDrr,         X86::MULPDrm, 16 },
+    { X86::MULPSrr,         X86::MULPSrm, 16 },
+    { X86::MULSDrr,         X86::MULSDrm, 0 },
+    { X86::MULSSrr,         X86::MULSSrm, 0 },
+    { X86::OR16rr,          X86::OR16rm, 0 },
+    { X86::OR32rr,          X86::OR32rm, 0 },
+    { X86::OR64rr,          X86::OR64rm, 0 },
+    { X86::OR8rr,           X86::OR8rm, 0 },
+    { X86::ORPDrr,          X86::ORPDrm, 16 },
+    { X86::ORPSrr,          X86::ORPSrm, 16 },
+    { X86::PACKSSDWrr,      X86::PACKSSDWrm, 16 },
+    { X86::PACKSSWBrr,      X86::PACKSSWBrm, 16 },
+    { X86::PACKUSWBrr,      X86::PACKUSWBrm, 16 },
+    { X86::PADDBrr,         X86::PADDBrm, 16 },
+    { X86::PADDDrr,         X86::PADDDrm, 16 },
+    { X86::PADDQrr,         X86::PADDQrm, 16 },
+    { X86::PADDSBrr,        X86::PADDSBrm, 16 },
+    { X86::PADDSWrr,        X86::PADDSWrm, 16 },
+    { X86::PADDWrr,         X86::PADDWrm, 16 },
+    { X86::PANDNrr,         X86::PANDNrm, 16 },
+    { X86::PANDrr,          X86::PANDrm, 16 },
+    { X86::PAVGBrr,         X86::PAVGBrm, 16 },
+    { X86::PAVGWrr,         X86::PAVGWrm, 16 },
+    { X86::PCMPEQBrr,       X86::PCMPEQBrm, 16 },
+    { X86::PCMPEQDrr,       X86::PCMPEQDrm, 16 },
+    { X86::PCMPEQWrr,       X86::PCMPEQWrm, 16 },
+    { X86::PCMPGTBrr,       X86::PCMPGTBrm, 16 },
+    { X86::PCMPGTDrr,       X86::PCMPGTDrm, 16 },
+    { X86::PCMPGTWrr,       X86::PCMPGTWrm, 16 },
+    { X86::PINSRWrri,       X86::PINSRWrmi, 16 },
+    { X86::PMADDWDrr,       X86::PMADDWDrm, 16 },
+    { X86::PMAXSWrr,        X86::PMAXSWrm, 16 },
+    { X86::PMAXUBrr,        X86::PMAXUBrm, 16 },
+    { X86::PMINSWrr,        X86::PMINSWrm, 16 },
+    { X86::PMINUBrr,        X86::PMINUBrm, 16 },
+    { X86::PMULDQrr,        X86::PMULDQrm, 16 },
+    { X86::PMULHUWrr,       X86::PMULHUWrm, 16 },
+    { X86::PMULHWrr,        X86::PMULHWrm, 16 },
+    { X86::PMULLDrr,        X86::PMULLDrm, 16 },
+    { X86::PMULLDrr_int,    X86::PMULLDrm_int, 16 },
+    { X86::PMULLWrr,        X86::PMULLWrm, 16 },
+    { X86::PMULUDQrr,       X86::PMULUDQrm, 16 },
+    { X86::PORrr,           X86::PORrm, 16 },
+    { X86::PSADBWrr,        X86::PSADBWrm, 16 },
+    { X86::PSLLDrr,         X86::PSLLDrm, 16 },
+    { X86::PSLLQrr,         X86::PSLLQrm, 16 },
+    { X86::PSLLWrr,         X86::PSLLWrm, 16 },
+    { X86::PSRADrr,         X86::PSRADrm, 16 },
+    { X86::PSRAWrr,         X86::PSRAWrm, 16 },
+    { X86::PSRLDrr,         X86::PSRLDrm, 16 },
+    { X86::PSRLQrr,         X86::PSRLQrm, 16 },
+    { X86::PSRLWrr,         X86::PSRLWrm, 16 },
+    { X86::PSUBBrr,         X86::PSUBBrm, 16 },
+    { X86::PSUBDrr,         X86::PSUBDrm, 16 },
+    { X86::PSUBSBrr,        X86::PSUBSBrm, 16 },
+    { X86::PSUBSWrr,        X86::PSUBSWrm, 16 },
+    { X86::PSUBWrr,         X86::PSUBWrm, 16 },
+    { X86::PUNPCKHBWrr,     X86::PUNPCKHBWrm, 16 },
+    { X86::PUNPCKHDQrr,     X86::PUNPCKHDQrm, 16 },
+    { X86::PUNPCKHQDQrr,    X86::PUNPCKHQDQrm, 16 },
+    { X86::PUNPCKHWDrr,     X86::PUNPCKHWDrm, 16 },
+    { X86::PUNPCKLBWrr,     X86::PUNPCKLBWrm, 16 },
+    { X86::PUNPCKLDQrr,     X86::PUNPCKLDQrm, 16 },
+    { X86::PUNPCKLQDQrr,    X86::PUNPCKLQDQrm, 16 },
+    { X86::PUNPCKLWDrr,     X86::PUNPCKLWDrm, 16 },
+    { X86::PXORrr,          X86::PXORrm, 16 },
+    { X86::SBB32rr,         X86::SBB32rm, 0 },
+    { X86::SBB64rr,         X86::SBB64rm, 0 },
+    { X86::SHUFPDrri,       X86::SHUFPDrmi, 16 },
+    { X86::SHUFPSrri,       X86::SHUFPSrmi, 16 },
+    { X86::SUB16rr,         X86::SUB16rm, 0 },
+    { X86::SUB32rr,         X86::SUB32rm, 0 },
+    { X86::SUB64rr,         X86::SUB64rm, 0 },
+    { X86::SUB8rr,          X86::SUB8rm, 0 },
+    { X86::SUBPDrr,         X86::SUBPDrm, 16 },
+    { X86::SUBPSrr,         X86::SUBPSrm, 16 },
+    { X86::SUBSDrr,         X86::SUBSDrm, 0 },
+    { X86::SUBSSrr,         X86::SUBSSrm, 0 },
+    // FIXME: TEST*rr -> swapped operand of TEST*mr.
+    { X86::UNPCKHPDrr,      X86::UNPCKHPDrm, 16 },
+    { X86::UNPCKHPSrr,      X86::UNPCKHPSrm, 16 },
+    { X86::UNPCKLPDrr,      X86::UNPCKLPDrm, 16 },
+    { X86::UNPCKLPSrr,      X86::UNPCKLPSrm, 16 },
+    { X86::XOR16rr,         X86::XOR16rm, 0 },
+    { X86::XOR32rr,         X86::XOR32rm, 0 },
+    { X86::XOR64rr,         X86::XOR64rm, 0 },
+    { X86::XOR8rr,          X86::XOR8rm, 0 },
+    { X86::XORPDrr,         X86::XORPDrm, 16 },
+    { X86::XORPSrr,         X86::XORPSrm, 16 }
+  };
+
+  for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
+    unsigned RegOp = OpTbl2[i][0];
+    unsigned MemOp = OpTbl2[i][1];
+    unsigned Align = OpTbl2[i][2];
+    if (!RegOp2MemOpTable2.insert(std::make_pair((unsigned*)RegOp,
+                                           std::make_pair(MemOp,Align))).second)
+      assert(false && "Duplicated entries?");
+    // Index 2, folded load
+    unsigned AuxInfo = 2 | (1 << 4);
+    if (!MemOp2RegOpTable.insert(std::make_pair((unsigned*)MemOp,
+                                   std::make_pair(RegOp, AuxInfo))).second)
+      AmbEntries.push_back(MemOp);
+  }
+
+  // Remove ambiguous entries.
+  assert(AmbEntries.empty() && "Duplicated entries in unfolding maps?");
+}
+
/// isMoveInstr - Return true if MI is a plain register-to-register move and
/// report its source/destination registers and subregister indices through
/// the reference parameters. Operand 0 is the destination and operand 1 the
/// source for every opcode handled here.
bool X86InstrInfo::isMoveInstr(const MachineInstr& MI,
                               unsigned &SrcReg, unsigned &DstReg,
                               unsigned &SrcSubIdx, unsigned &DstSubIdx) const {
  switch (MI.getOpcode()) {
  default:
    return false;
  // Integer register-to-register moves.
  case X86::MOV8rr:
  case X86::MOV8rr_NOREX:
  case X86::MOV16rr:
  case X86::MOV32rr: 
  case X86::MOV64rr:
  // Scalar SSE moves.
  case X86::MOVSSrr:
  case X86::MOVSDrr:

  // FP Stack register class copies
  case X86::MOV_Fp3232: case X86::MOV_Fp6464: case X86::MOV_Fp8080:
  case X86::MOV_Fp3264: case X86::MOV_Fp3280:
  case X86::MOV_Fp6432: case X86::MOV_Fp8032:
      
  // Vector and cross-register-class copies. All cases above fall through to
  // the common operand-extraction code below.
  case X86::FsMOVAPSrr:
  case X86::FsMOVAPDrr:
  case X86::MOVAPSrr:
  case X86::MOVAPDrr:
  case X86::MOVDQArr:
  case X86::MOVSS2PSrr:
  case X86::MOVSD2PDrr:
  case X86::MOVPS2SSrr:
  case X86::MOVPD2SDrr:
  case X86::MMX_MOVQ64rr:
    assert(MI.getNumOperands() >= 2 &&
           MI.getOperand(0).isReg() &&
           MI.getOperand(1).isReg() &&
           "invalid register-register move instruction");
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    SrcSubIdx = MI.getOperand(1).getSubReg();
    DstSubIdx = MI.getOperand(0).getSubReg();
    return true;
  }
}
+
/// isCoalescableExtInstr - Return true if MI is a sign- or zero-extension
/// whose source register can be coalesced into a subregister of the
/// destination. On success, SrcReg/DstReg are the extension's source and
/// destination registers and SubIdx is the subregister index of DstReg that
/// SrcReg corresponds to.
bool
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                    unsigned &SrcReg, unsigned &DstReg,
                                    unsigned &SubIdx) const {
  switch (MI.getOpcode()) {
  default: break;
  case X86::MOVSX16rr8:
  case X86::MOVZX16rr8:
  case X86::MOVSX32rr8:
  case X86::MOVZX32rr8:
  case X86::MOVSX64rr8:
  case X86::MOVZX64rr8:
    if (!TM.getSubtarget<X86Subtarget>().is64Bit())
      // It's not always legal to reference the low 8-bit of the larger
      // register in 32-bit mode.
      return false;
    // In 64-bit mode the 8-bit cases deliberately fall through to the
    // common handling below.
  case X86::MOVSX32rr16:
  case X86::MOVZX32rr16:
  case X86::MOVSX64rr16:
  case X86::MOVZX64rr16:
  case X86::MOVSX64rr32:
  case X86::MOVZX64rr32: {
    if (MI.getOperand(0).getSubReg() || MI.getOperand(1).getSubReg())
      // Be conservative.
      return false;
    SrcReg = MI.getOperand(1).getReg();
    DstReg = MI.getOperand(0).getReg();
    // Map the source width to a hard-coded subregister index (1/3/4 for
    // the 8-/16-/32-bit low parts — presumably matching the X86 register
    // info's subreg numbering; TODO(review): confirm against
    // X86RegisterInfo).
    switch (MI.getOpcode()) {
    default:
      llvm_unreachable(0);
      break;
    case X86::MOVSX16rr8:
    case X86::MOVZX16rr8:
    case X86::MOVSX32rr8:
    case X86::MOVZX32rr8:
    case X86::MOVSX64rr8:
    case X86::MOVZX64rr8:
      SubIdx = 1;
      break;
    case X86::MOVSX32rr16:
    case X86::MOVZX32rr16:
    case X86::MOVSX64rr16:
    case X86::MOVZX64rr16:
      SubIdx = 3;
      break;
    case X86::MOVSX64rr32:
    case X86::MOVZX64rr32:
      SubIdx = 4;
      break;
    }
    return true;
  }
  }
  return false;
}
+
+/// isFrameOperand - Return true and the FrameIndex if the specified
+/// operand and follow operands form a reference to the stack frame.
+bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
+                                  int &FrameIndex) const {
+  if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() &&
+      MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() &&
+      MI->getOperand(Op+1).getImm() == 1 &&
+      MI->getOperand(Op+2).getReg() == 0 &&
+      MI->getOperand(Op+3).getImm() == 0) {
+    FrameIndex = MI->getOperand(Op).getIndex();
+    return true;
+  }
+  return false;
+}
+
+static bool isFrameLoadOpcode(int Opcode) {
+  switch (Opcode) {
+  default: break;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp64m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+    return true;
+    break;
+  }
+  return false;
+}
+
+static bool isFrameStoreOpcode(int Opcode) {
+  switch (Opcode) {
+  default: break;
+  case X86::MOV8mr:
+  case X86::MOV16mr:
+  case X86::MOV32mr:
+  case X86::MOV64mr:
+  case X86::ST_FpP64m:
+  case X86::MOVSSmr:
+  case X86::MOVSDmr:
+  case X86::MOVAPSmr:
+  case X86::MOVAPDmr:
+  case X86::MOVDQAmr:
+  case X86::MMX_MOVD64mr:
+  case X86::MMX_MOVQ64mr:
+  case X86::MMX_MOVNTQmr:
+    return true;
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlot(const MachineInstr *MI, 
+                                           int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode()))
+    if (isFrameOperand(MI, 1, FrameIndex))
+      return MI->getOperand(0).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, 
+                                                 int &FrameIndex) const {
+  if (isFrameLoadOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isLoadFromStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasLoadFromStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+bool X86InstrInfo::hasLoadFromStackSlot(const MachineInstr *MI,
+                                        const MachineMemOperand *&MMO,
+                                        int &FrameIndex) const {
+  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+         oe = MI->memoperands_end();
+       o != oe;
+       ++o) {
+    if ((*o)->isLoad() && (*o)->getValue())
+      if (const FixedStackPseudoSourceValue *Value =
+          dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+        FrameIndex = Value->getFrameIndex();
+        MMO = *o;
+        return true;
+      }
+  }
+  return false;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                          int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode()))
+    if (isFrameOperand(MI, 0, FrameIndex))
+      return MI->getOperand(X86AddrNumOperands).getReg();
+  return 0;
+}
+
+unsigned X86InstrInfo::isStoreToStackSlotPostFE(const MachineInstr *MI,
+                                                int &FrameIndex) const {
+  if (isFrameStoreOpcode(MI->getOpcode())) {
+    unsigned Reg;
+    if ((Reg = isStoreToStackSlot(MI, FrameIndex)))
+      return Reg;
+    // Check for post-frame index elimination operations
+    const MachineMemOperand *Dummy;
+    return hasStoreToStackSlot(MI, Dummy, FrameIndex);
+  }
+  return 0;
+}
+
+bool X86InstrInfo::hasStoreToStackSlot(const MachineInstr *MI,
+                                       const MachineMemOperand *&MMO,
+                                       int &FrameIndex) const {
+  for (MachineInstr::mmo_iterator o = MI->memoperands_begin(),
+         oe = MI->memoperands_end();
+       o != oe;
+       ++o) {
+    if ((*o)->isStore() && (*o)->getValue())
+      if (const FixedStackPseudoSourceValue *Value =
+          dyn_cast<const FixedStackPseudoSourceValue>((*o)->getValue())) {
+        FrameIndex = Value->getFrameIndex();
+        MMO = *o;
+        return true;
+      }
+  }
+  return false;
+}
+
+/// regIsPICBase - Return true if register is PIC base (i.e.g defined by
+/// X86::MOVPC32r.
+static bool regIsPICBase(unsigned BaseReg, const MachineRegisterInfo &MRI) {
+  bool isPICBase = false;
+  for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+         E = MRI.def_end(); I != E; ++I) {
+    MachineInstr *DefMI = I.getOperand().getParent();
+    if (DefMI->getOpcode() != X86::MOVPC32r)
+      return false;
+    assert(!isPICBase && "More than one PIC base?");
+    isPICBase = true;
+  }
+  return isPICBase;
+}
+
+bool
+X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+                                                AliasAnalysis *AA) const {
+  switch (MI->getOpcode()) {
+  default: break;
+    case X86::MOV8rm:
+    case X86::MOV16rm:
+    case X86::MOV32rm:
+    case X86::MOV64rm:
+    case X86::LD_Fp64m:
+    case X86::MOVSSrm:
+    case X86::MOVSDrm:
+    case X86::MOVAPSrm:
+    case X86::MOVUPSrm:
+    case X86::MOVUPSrm_Int:
+    case X86::MOVAPDrm:
+    case X86::MOVDQArm:
+    case X86::MMX_MOVD64rm:
+    case X86::MMX_MOVQ64rm:
+    case X86::FsMOVAPSrm:
+    case X86::FsMOVAPDrm: {
+      // Loads from constant pools are trivially rematerializable.
+      if (MI->getOperand(1).isReg() &&
+          MI->getOperand(2).isImm() &&
+          MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+          MI->isInvariantLoad(AA)) {
+        unsigned BaseReg = MI->getOperand(1).getReg();
+        if (BaseReg == 0 || BaseReg == X86::RIP)
+          return true;
+        // Allow re-materialization of PIC load.
+        if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+          return false;
+        const MachineFunction &MF = *MI->getParent()->getParent();
+        const MachineRegisterInfo &MRI = MF.getRegInfo();
+        bool isPICBase = false;
+        for (MachineRegisterInfo::def_iterator I = MRI.def_begin(BaseReg),
+               E = MRI.def_end(); I != E; ++I) {
+          MachineInstr *DefMI = I.getOperand().getParent();
+          if (DefMI->getOpcode() != X86::MOVPC32r)
+            return false;
+          assert(!isPICBase && "More than one PIC base?");
+          isPICBase = true;
+        }
+        return isPICBase;
+      } 
+      return false;
+    }
+ 
+     case X86::LEA32r:
+     case X86::LEA64r: {
+       if (MI->getOperand(2).isImm() &&
+           MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+           !MI->getOperand(4).isReg()) {
+         // lea fi#, lea GV, etc. are all rematerializable.
+         if (!MI->getOperand(1).isReg())
+           return true;
+         unsigned BaseReg = MI->getOperand(1).getReg();
+         if (BaseReg == 0)
+           return true;
+         // Allow re-materialization of lea PICBase + x.
+         const MachineFunction &MF = *MI->getParent()->getParent();
+         const MachineRegisterInfo &MRI = MF.getRegInfo();
+         return regIsPICBase(BaseReg, MRI);
+       }
+       return false;
+     }
+  }
+
+  // All other instructions marked M_REMATERIALIZABLE are always trivially
+  // rematerializable.
+  return true;
+}
+
/// isSafeToClobberEFLAGS - Return true if it's safe to insert an instruction
/// that would clobber the EFLAGS condition register. Note the result may be
/// conservative. If it cannot definitely determine the safety after visiting
/// a few instructions in each direction it assumes it's not safe.
static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator I) {
  // It's always safe to clobber EFLAGS at the end of a block.
  if (I == MBB.end())
    return true;

  // For compile time consideration, if we are not able to determine the
  // safety after visiting 4 instructions in each direction, we will assume
  // it's not safe.
  //
  // Forward scan from I: look for the first use or def of EFLAGS.
  MachineBasicBlock::iterator Iter = I;
  for (unsigned i = 0; i < 4; ++i) {
    bool SeenDef = false;
    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
      MachineOperand &MO = Iter->getOperand(j);
      if (!MO.isReg())
        continue;
      if (MO.getReg() == X86::EFLAGS) {
        // A use before any redefinition means the current EFLAGS value is
        // still needed, so clobbering it here is unsafe.
        if (MO.isUse())
          return false;
        SeenDef = true;
      }
    }

    if (SeenDef)
      // This instruction defines EFLAGS, no need to look any further.
      return true;
    ++Iter;

    // If we make it to the end of the block, it's safe to clobber EFLAGS.
    if (Iter == MBB.end())
      return true;
  }

  // Backward scan from I: determine whether EFLAGS can even be live here.
  Iter = I;
  for (unsigned i = 0; i < 4; ++i) {
    // If we make it to the beginning of the block, it's safe to clobber
    // EFLAGS iff EFLAGS is not live-in.
    if (Iter == MBB.begin())
      return !MBB.isLiveIn(X86::EFLAGS);

    --Iter;
    bool SawKill = false;
    for (unsigned j = 0, e = Iter->getNumOperands(); j != e; ++j) {
      MachineOperand &MO = Iter->getOperand(j);
      if (MO.isReg() && MO.getReg() == X86::EFLAGS) {
        // A dead def means nothing after it reads EFLAGS (safe); a live def
        // means something does (unsafe).
        if (MO.isDef()) return MO.isDead();
        if (MO.isKill()) SawKill = true;
      }
    }

    if (SawKill)
      // This instruction kills EFLAGS and doesn't redefine it, so
      // there's no need to look further.
      return true;
  }

  // Conservative answer.
  return false;
}
+
/// reMaterialize - Re-emit a copy of Orig before I, targeting DestReg
/// (possibly narrowed through sub-register index SubIdx).
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator I,
                                 unsigned DestReg, unsigned SubIdx,
                                 const MachineInstr *Orig,
                                 const TargetRegisterInfo *TRI) const {
  DebugLoc DL = MBB.findDebugLoc(I);

  // For a physical destination we can resolve the sub-register index right
  // away and define the concrete sub-register instead.
  if (SubIdx && TargetRegisterInfo::isPhysicalRegister(DestReg)) {
    DestReg = TRI->getSubReg(DestReg, SubIdx);
    SubIdx = 0;
  }

  // MOV32r0 etc. are implemented with xor which clobbers condition code.
  // Re-materialize them as movri instructions to avoid side effects.
  bool Clone = true;
  unsigned Opc = Orig->getOpcode();
  switch (Opc) {
  default: break;
  case X86::MOV8r0:
  case X86::MOV16r0:
  case X86::MOV32r0:
  case X86::MOV64r0: {
    // Only switch to the mov-immediate form when the xor's EFLAGS clobber
    // would actually be unsafe at this insertion point.
    if (!isSafeToClobberEFLAGS(MBB, I)) {
      switch (Opc) {
      default: break;
      case X86::MOV8r0:  Opc = X86::MOV8ri;  break;
      case X86::MOV16r0: Opc = X86::MOV16ri; break;
      case X86::MOV32r0: Opc = X86::MOV32ri; break;
      // NOTE(review): MOV64ri carries a 64-bit immediate; a 32-bit-immediate
      // zero form may encode smaller — confirm against the instruction defs.
      case X86::MOV64r0: Opc = X86::MOV64ri; break;
      }
      Clone = false;
    }
    break;
  }
  }

  if (Clone) {
    // Default path: clone the original instruction and retarget its def.
    MachineInstr *MI = MBB.getParent()->CloneMachineInstr(Orig);
    MI->getOperand(0).setReg(DestReg);
    MBB.insert(I, MI);
  } else {
    // Emit "mov DestReg, 0" instead of the flag-clobbering xor form.
    BuildMI(MBB, I, DL, get(Opc), DestReg).addImm(0);
  }

  // Either way the just-inserted instruction is immediately before I;
  // propagate any remaining (virtual-register) sub-register index to its def.
  MachineInstr *NewMI = prior(I);
  NewMI->getOperand(0).setSubReg(SubIdx);
}
+
+/// hasLiveCondCodeDef - True if MI has a condition code def, e.g. EFLAGS, that
+/// is not marked dead.
+static bool hasLiveCondCodeDef(MachineInstr *MI) {
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    if (MO.isReg() && MO.isDef() &&
+        MO.getReg() == X86::EFLAGS && !MO.isDead()) {
+      return true;
+    }
+  }
+  return false;
+}
+
/// convertToThreeAddressWithLEA - Helper for convertToThreeAddress when
/// 16-bit LEA is disabled, use 32-bit LEA to form 3-address code by promoting
/// to a 32-bit superregister and then truncating back down to a 16-bit
/// subregister.
MachineInstr *
X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
                                           MachineFunction::iterator &MFI,
                                           MachineBasicBlock::iterator &MBBI,
                                           LiveVariables *LV) const {
  MachineInstr *MI = MBBI;
  unsigned Dest = MI->getOperand(0).getReg();
  unsigned Src = MI->getOperand(1).getReg();
  bool isDead = MI->getOperand(0).isDead();
  bool isKill = MI->getOperand(1).isKill();

  // In 64-bit mode use LEA64_32r (64-bit address computation writing a 32-bit
  // result); otherwise a plain 32-bit LEA.
  unsigned Opc = TM.getSubtarget<X86Subtarget>().is64Bit()
    ? X86::LEA64_32r : X86::LEA32r;
  MachineRegisterInfo &RegInfo = MFI->getParent()->getRegInfo();
  unsigned leaInReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);
  unsigned leaOutReg = RegInfo.createVirtualRegister(&X86::GR32RegClass);

  // Build and insert into an implicit UNDEF value. This is OK because
  // well be shifting and then extracting the lower 16-bits.
  // This has the potential to cause partial register stall. e.g.
  //   movw    (%rbp,%rcx,2), %dx
  //   leal    -65(%rdx), %esi
  // But testing has shown this *does* help performance in 64-bit mode (at
  // least on modern x86 machines).
  BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg);
  MachineInstr *InsMI =
    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg)
    .addReg(leaInReg)
    .addReg(Src, getKillRegState(isKill))
    .addImm(X86::SUBREG_16BIT);

  // Emit the 32-bit LEA that performs the actual arithmetic on leaInReg.
  MachineInstrBuilder MIB = BuildMI(*MFI, MBBI, MI->getDebugLoc(),
                                    get(Opc), leaOutReg);
  switch (MIOpc) {
  default:
    llvm_unreachable(0);
    break;
  case X86::SHL16ri: {
    // shl %r, imm  ->  lea (, %r, 1 << imm), %out  (scale encodes the shift).
    unsigned ShAmt = MI->getOperand(2).getImm();
    MIB.addReg(0).addImm(1 << ShAmt)
       .addReg(leaInReg, RegState::Kill).addImm(0);
    break;
  }
  case X86::INC16r:
  case X86::INC64_16r:
    // inc -> lea 1(%r), %out
    addLeaRegOffset(MIB, leaInReg, true, 1);
    break;
  case X86::DEC16r:
  case X86::DEC64_16r:
    // dec -> lea -1(%r), %out
    addLeaRegOffset(MIB, leaInReg, true, -1);
    break;
  case X86::ADD16ri:
  case X86::ADD16ri8:
    // add %r, imm -> lea imm(%r), %out
    addLeaRegOffset(MIB, leaInReg, true, MI->getOperand(2).getImm());
    break;
  case X86::ADD16rr: {
    // Register-register add: the second source must be promoted to 32 bits
    // too, unless both sources are the same register.
    unsigned Src2 = MI->getOperand(2).getReg();
    bool isKill2 = MI->getOperand(2).isKill();
    unsigned leaInReg2 = 0;
    MachineInstr *InsMI2 = 0;
    if (Src == Src2) {
      // ADD16rr %reg1028<kill>, %reg1028
      // just a single insert_subreg.
      addRegReg(MIB, leaInReg, true, leaInReg, false);
    } else {
      leaInReg2 = RegInfo.createVirtualRegister(&X86::GR32RegClass);
      // Build and insert into an implicit UNDEF value. This is OK because
      // well be shifting and then extracting the lower 16-bits.
      BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::IMPLICIT_DEF), leaInReg2);
      InsMI2 =
        BuildMI(*MFI, MIB, MI->getDebugLoc(), get(X86::INSERT_SUBREG),leaInReg2)
        .addReg(leaInReg2)
        .addReg(Src2, getKillRegState(isKill2))
        .addImm(X86::SUBREG_16BIT);
      addRegReg(MIB, leaInReg, true, leaInReg2, true);
    }
    if (LV && isKill2 && InsMI2)
      LV->replaceKillInstruction(Src2, MI, InsMI2);
    break;
  }
  }

  // Truncate the 32-bit LEA result back down into the original 16-bit dest.
  MachineInstr *NewMI = MIB;
  MachineInstr *ExtMI =
    BuildMI(*MFI, MBBI, MI->getDebugLoc(), get(X86::EXTRACT_SUBREG))
    .addReg(Dest, RegState::Define | getDeadRegState(isDead))
    .addReg(leaOutReg, RegState::Kill)
    .addImm(X86::SUBREG_16BIT);

  if (LV) {
    // Update live variables: the new virtual registers die at the LEA and the
    // extract respectively, and kill/dead markers move off the original MI.
    LV->getVarInfo(leaInReg).Kills.push_back(NewMI);
    LV->getVarInfo(leaOutReg).Kills.push_back(ExtMI);
    if (isKill)
      LV->replaceKillInstruction(Src, MI, InsMI);
    if (isDead)
      LV->replaceKillInstruction(Dest, MI, ExtMI);
  }

  return ExtMI;
}
+
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
/// may be able to convert a two-address instruction into a true
/// three-address instruction on demand.  This allows the X86 target (for
/// example) to convert ADD and SHL instructions into LEA instructions if they
/// would require register copies due to two-addressness.
///
/// This method returns a null pointer if the transformation cannot be
/// performed, otherwise it returns the new instruction.
///
MachineInstr *
X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                    MachineBasicBlock::iterator &MBBI,
                                    LiveVariables *LV) const {
  MachineInstr *MI = MBBI;
  MachineFunction &MF = *MI->getParent()->getParent();
  // All instructions input are two-addr instructions.  Get the known operands.
  unsigned Dest = MI->getOperand(0).getReg();
  unsigned Src = MI->getOperand(1).getReg();
  bool isDead = MI->getOperand(0).isDead();
  bool isKill = MI->getOperand(1).isKill();

  MachineInstr *NewMI = NULL;
  // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
  // we have better subtarget support, enable the 16-bit LEA generation here.
  // 16-bit LEA is also slow on Core2.
  bool DisableLEA16 = true;
  bool is64Bit = TM.getSubtarget<X86Subtarget>().is64Bit();

  unsigned MIOpc = MI->getOpcode();
  switch (MIOpc) {
  case X86::SHUFPSrri: {
    // shufps with both sources equal can be rewritten as a pshufd of the
    // single source (requires SSE2 for pshufd).
    assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;

    unsigned B = MI->getOperand(1).getReg();
    unsigned C = MI->getOperand(2).getReg();
    if (B != C) return 0;
    unsigned A = MI->getOperand(0).getReg();
    unsigned M = MI->getOperand(3).getImm();
    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
      .addReg(A, RegState::Define | getDeadRegState(isDead))
      .addReg(B, getKillRegState(isKill)).addImm(M);
    break;
  }
  case X86::SHL64ri: {
    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
    // the flags produced by a shift yet, so this is safe.
    // Only shift amounts 1-3 map onto an LEA scale of 2, 4 or 8.
    unsigned ShAmt = MI->getOperand(2).getImm();
    if (ShAmt == 0 || ShAmt >= 4) return 0;

    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
      .addReg(0).addImm(1 << ShAmt)
      .addReg(Src, getKillRegState(isKill))
      .addImm(0);
    break;
  }
  case X86::SHL32ri: {
    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
    // the flags produced by a shift yet, so this is safe.
    unsigned ShAmt = MI->getOperand(2).getImm();
    if (ShAmt == 0 || ShAmt >= 4) return 0;

    unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
    NewMI = BuildMI(MF, MI->getDebugLoc(), get(Opc))
      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
      .addReg(0).addImm(1 << ShAmt)
      .addReg(Src, getKillRegState(isKill)).addImm(0);
    break;
  }
  case X86::SHL16ri: {
    assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
    // NOTE: LEA doesn't produce flags like shift does, but LLVM never uses
    // the flags produced by a shift yet, so this is safe.
    unsigned ShAmt = MI->getOperand(2).getImm();
    if (ShAmt == 0 || ShAmt >= 4) return 0;

    // 16-bit LEA is disabled; in 64-bit mode fall back to the 32-bit
    // promote/truncate helper, otherwise give up.
    if (DisableLEA16)
      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
    NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
      .addReg(Dest, RegState::Define | getDeadRegState(isDead))
      .addReg(0).addImm(1 << ShAmt)
      .addReg(Src, getKillRegState(isKill))
      .addImm(0);
    break;
  }
  default: {
    // The following opcodes also sets the condition code register(s). Only
    // convert them to equivalent lea if the condition code register def's
    // are dead!
    if (hasLiveCondCodeDef(MI))
      return 0;

    switch (MIOpc) {
    default: return 0;
    case X86::INC64r:
    case X86::INC32r:
    case X86::INC64_32r: {
      // inc -> lea 1(%src), %dst
      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
      unsigned Opc = MIOpc == X86::INC64r ? X86::LEA64r
        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
                              .addReg(Dest, RegState::Define |
                                      getDeadRegState(isDead)),
                              Src, isKill, 1);
      break;
    }
    case X86::INC16r:
    case X86::INC64_16r:
      if (DisableLEA16)
        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
      assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                           .addReg(Dest, RegState::Define |
                                   getDeadRegState(isDead)),
                           Src, isKill, 1);
      break;
    case X86::DEC64r:
    case X86::DEC32r:
    case X86::DEC64_32r: {
      // dec -> lea -1(%src), %dst
      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
      unsigned Opc = MIOpc == X86::DEC64r ? X86::LEA64r
        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
                              .addReg(Dest, RegState::Define |
                                      getDeadRegState(isDead)),
                              Src, isKill, -1);
      break;
    }
    case X86::DEC16r:
    case X86::DEC64_16r:
      if (DisableLEA16)
        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
      assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
      NewMI = addRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                           .addReg(Dest, RegState::Define |
                                   getDeadRegState(isDead)),
                           Src, isKill, -1);
      break;
    case X86::ADD64rr:
    case X86::ADD32rr: {
      // add %src, %src2 -> lea (%src,%src2), %dst
      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
      unsigned Opc = MIOpc == X86::ADD64rr ? X86::LEA64r
        : (is64Bit ? X86::LEA64_32r : X86::LEA32r);
      unsigned Src2 = MI->getOperand(2).getReg();
      bool isKill2 = MI->getOperand(2).isKill();
      NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(Opc))
                        .addReg(Dest, RegState::Define |
                                getDeadRegState(isDead)),
                        Src, isKill, Src2, isKill2);
      // If Src2 was killed here, the kill now belongs to the new LEA.
      if (LV && isKill2)
        LV->replaceKillInstruction(Src2, MI, NewMI);
      break;
    }
    case X86::ADD16rr: {
      if (DisableLEA16)
        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
      unsigned Src2 = MI->getOperand(2).getReg();
      bool isKill2 = MI->getOperand(2).isKill();
      NewMI = addRegReg(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                        .addReg(Dest, RegState::Define |
                                getDeadRegState(isDead)),
                        Src, isKill, Src2, isKill2);
      if (LV && isKill2)
        LV->replaceKillInstruction(Src2, MI, NewMI);
      break;
    }
    case X86::ADD64ri32:
    case X86::ADD64ri8:
      // add %src, imm -> lea imm(%src), %dst
      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
                              .addReg(Dest, RegState::Define |
                                      getDeadRegState(isDead)),
                              Src, isKill, MI->getOperand(2).getImm());
      break;
    case X86::ADD32ri:
    case X86::ADD32ri8: {
      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
      unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(Opc))
                              .addReg(Dest, RegState::Define |
                                      getDeadRegState(isDead)),
                                Src, isKill, MI->getOperand(2).getImm());
      break;
    }
    case X86::ADD16ri:
    case X86::ADD16ri8:
      if (DisableLEA16)
        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
      assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
      NewMI = addLeaRegOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                              .addReg(Dest, RegState::Define |
                                      getDeadRegState(isDead)),
                              Src, isKill, MI->getOperand(2).getImm());
      break;
    }
  }
  }

  if (!NewMI) return 0;

  if (LV) {  // Update live variables
    if (isKill)
      LV->replaceKillInstruction(Src, MI, NewMI);
    if (isDead)
      LV->replaceKillInstruction(Dest, MI, NewMI);
  }

  MFI->insert(MBBI, NewMI);          // Insert the new inst
  return NewMI;
}
+
/// commuteInstruction - We have a few instructions that must be hacked on to
/// commute them.
///
MachineInstr *
X86InstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
  switch (MI->getOpcode()) {
  case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I)
  case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I)
  case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I)
  case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I)
  case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I)
  case X86::SHLD64rri8:{// A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, (64-I)
    // Double-shifts commute by swapping SHLD<->SHRD and replacing the shift
    // amount I with (Size - I); the register swap itself is done by the
    // base-class commute below.
    unsigned Opc;
    unsigned Size;
    switch (MI->getOpcode()) {
    default: llvm_unreachable("Unreachable!");
    case X86::SHRD16rri8: Size = 16; Opc = X86::SHLD16rri8; break;
    case X86::SHLD16rri8: Size = 16; Opc = X86::SHRD16rri8; break;
    case X86::SHRD32rri8: Size = 32; Opc = X86::SHLD32rri8; break;
    case X86::SHLD32rri8: Size = 32; Opc = X86::SHRD32rri8; break;
    case X86::SHRD64rri8: Size = 64; Opc = X86::SHLD64rri8; break;
    case X86::SHLD64rri8: Size = 64; Opc = X86::SHRD64rri8; break;
    }
    unsigned Amt = MI->getOperand(3).getImm();
    if (NewMI) {
      // Caller asked for a fresh instruction: clone first, then mutate the
      // clone in place (so pass NewMI=false to the base implementation).
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
    MI->setDesc(get(Opc));
    MI->getOperand(3).setImm(Size-Amt);
    return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
  }
  case X86::CMOVB16rr:
  case X86::CMOVB32rr:
  case X86::CMOVB64rr:
  case X86::CMOVAE16rr:
  case X86::CMOVAE32rr:
  case X86::CMOVAE64rr:
  case X86::CMOVE16rr:
  case X86::CMOVE32rr:
  case X86::CMOVE64rr:
  case X86::CMOVNE16rr:
  case X86::CMOVNE32rr:
  case X86::CMOVNE64rr:
  case X86::CMOVBE16rr:
  case X86::CMOVBE32rr:
  case X86::CMOVBE64rr:
  case X86::CMOVA16rr:
  case X86::CMOVA32rr:
  case X86::CMOVA64rr:
  case X86::CMOVL16rr:
  case X86::CMOVL32rr:
  case X86::CMOVL64rr:
  case X86::CMOVGE16rr:
  case X86::CMOVGE32rr:
  case X86::CMOVGE64rr:
  case X86::CMOVLE16rr:
  case X86::CMOVLE32rr:
  case X86::CMOVLE64rr:
  case X86::CMOVG16rr:
  case X86::CMOVG32rr:
  case X86::CMOVG64rr:
  case X86::CMOVS16rr:
  case X86::CMOVS32rr:
  case X86::CMOVS64rr:
  case X86::CMOVNS16rr:
  case X86::CMOVNS32rr:
  case X86::CMOVNS64rr:
  case X86::CMOVP16rr:
  case X86::CMOVP32rr:
  case X86::CMOVP64rr:
  case X86::CMOVNP16rr:
  case X86::CMOVNP32rr:
  case X86::CMOVNP64rr:
  case X86::CMOVO16rr:
  case X86::CMOVO32rr:
  case X86::CMOVO64rr:
  case X86::CMOVNO16rr:
  case X86::CMOVNO32rr:
  case X86::CMOVNO64rr: {
    // CMOVcc commutes by inverting the condition; the operand swap happens in
    // the default case we fall through to.
    unsigned Opc = 0;
    switch (MI->getOpcode()) {
    default: break; // unreachable: the outer switch lists every opcode here
    case X86::CMOVB16rr:  Opc = X86::CMOVAE16rr; break;
    case X86::CMOVB32rr:  Opc = X86::CMOVAE32rr; break;
    case X86::CMOVB64rr:  Opc = X86::CMOVAE64rr; break;
    case X86::CMOVAE16rr: Opc = X86::CMOVB16rr; break;
    case X86::CMOVAE32rr: Opc = X86::CMOVB32rr; break;
    case X86::CMOVAE64rr: Opc = X86::CMOVB64rr; break;
    case X86::CMOVE16rr:  Opc = X86::CMOVNE16rr; break;
    case X86::CMOVE32rr:  Opc = X86::CMOVNE32rr; break;
    case X86::CMOVE64rr:  Opc = X86::CMOVNE64rr; break;
    case X86::CMOVNE16rr: Opc = X86::CMOVE16rr; break;
    case X86::CMOVNE32rr: Opc = X86::CMOVE32rr; break;
    case X86::CMOVNE64rr: Opc = X86::CMOVE64rr; break;
    case X86::CMOVBE16rr: Opc = X86::CMOVA16rr; break;
    case X86::CMOVBE32rr: Opc = X86::CMOVA32rr; break;
    case X86::CMOVBE64rr: Opc = X86::CMOVA64rr; break;
    case X86::CMOVA16rr:  Opc = X86::CMOVBE16rr; break;
    case X86::CMOVA32rr:  Opc = X86::CMOVBE32rr; break;
    case X86::CMOVA64rr:  Opc = X86::CMOVBE64rr; break;
    case X86::CMOVL16rr:  Opc = X86::CMOVGE16rr; break;
    case X86::CMOVL32rr:  Opc = X86::CMOVGE32rr; break;
    case X86::CMOVL64rr:  Opc = X86::CMOVGE64rr; break;
    case X86::CMOVGE16rr: Opc = X86::CMOVL16rr; break;
    case X86::CMOVGE32rr: Opc = X86::CMOVL32rr; break;
    case X86::CMOVGE64rr: Opc = X86::CMOVL64rr; break;
    case X86::CMOVLE16rr: Opc = X86::CMOVG16rr; break;
    case X86::CMOVLE32rr: Opc = X86::CMOVG32rr; break;
    case X86::CMOVLE64rr: Opc = X86::CMOVG64rr; break;
    case X86::CMOVG16rr:  Opc = X86::CMOVLE16rr; break;
    case X86::CMOVG32rr:  Opc = X86::CMOVLE32rr; break;
    case X86::CMOVG64rr:  Opc = X86::CMOVLE64rr; break;
    case X86::CMOVS16rr:  Opc = X86::CMOVNS16rr; break;
    case X86::CMOVS32rr:  Opc = X86::CMOVNS32rr; break;
    case X86::CMOVS64rr:  Opc = X86::CMOVNS64rr; break;
    case X86::CMOVNS16rr: Opc = X86::CMOVS16rr; break;
    case X86::CMOVNS32rr: Opc = X86::CMOVS32rr; break;
    case X86::CMOVNS64rr: Opc = X86::CMOVS64rr; break;
    case X86::CMOVP16rr:  Opc = X86::CMOVNP16rr; break;
    case X86::CMOVP32rr:  Opc = X86::CMOVNP32rr; break;
    case X86::CMOVP64rr:  Opc = X86::CMOVNP64rr; break;
    case X86::CMOVNP16rr: Opc = X86::CMOVP16rr; break;
    case X86::CMOVNP32rr: Opc = X86::CMOVP32rr; break;
    case X86::CMOVNP64rr: Opc = X86::CMOVP64rr; break;
    case X86::CMOVO16rr:  Opc = X86::CMOVNO16rr; break;
    case X86::CMOVO32rr:  Opc = X86::CMOVNO32rr; break;
    case X86::CMOVO64rr:  Opc = X86::CMOVNO64rr; break;
    case X86::CMOVNO16rr: Opc = X86::CMOVO16rr; break;
    case X86::CMOVNO32rr: Opc = X86::CMOVO32rr; break;
    case X86::CMOVNO64rr: Opc = X86::CMOVO64rr; break;
    }
    if (NewMI) {
      // Clone first so the condition flip and the operand swap below both
      // apply to the copy, not the original.
      MachineFunction &MF = *MI->getParent()->getParent();
      MI = MF.CloneMachineInstr(MI);
      NewMI = false;
    }
    MI->setDesc(get(Opc));
    // Fallthrough intended.
  }
  default:
    // Generic case: let the base implementation swap the commutable operands.
    return TargetInstrInfoImpl::commuteInstruction(MI, NewMI);
  }
}
+
+static X86::CondCode GetCondFromBranchOpc(unsigned BrOpc) {
+  switch (BrOpc) {
+  default: return X86::COND_INVALID;
+  case X86::JE:  return X86::COND_E;
+  case X86::JNE: return X86::COND_NE;
+  case X86::JL:  return X86::COND_L;
+  case X86::JLE: return X86::COND_LE;
+  case X86::JG:  return X86::COND_G;
+  case X86::JGE: return X86::COND_GE;
+  case X86::JB:  return X86::COND_B;
+  case X86::JBE: return X86::COND_BE;
+  case X86::JA:  return X86::COND_A;
+  case X86::JAE: return X86::COND_AE;
+  case X86::JS:  return X86::COND_S;
+  case X86::JNS: return X86::COND_NS;
+  case X86::JP:  return X86::COND_P;
+  case X86::JNP: return X86::COND_NP;
+  case X86::JO:  return X86::COND_O;
+  case X86::JNO: return X86::COND_NO;
+  }
+}
+
+unsigned X86::GetCondBranchFromCond(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::JE;
+  case X86::COND_NE: return X86::JNE;
+  case X86::COND_L:  return X86::JL;
+  case X86::COND_LE: return X86::JLE;
+  case X86::COND_G:  return X86::JG;
+  case X86::COND_GE: return X86::JGE;
+  case X86::COND_B:  return X86::JB;
+  case X86::COND_BE: return X86::JBE;
+  case X86::COND_A:  return X86::JA;
+  case X86::COND_AE: return X86::JAE;
+  case X86::COND_S:  return X86::JS;
+  case X86::COND_NS: return X86::JNS;
+  case X86::COND_P:  return X86::JP;
+  case X86::COND_NP: return X86::JNP;
+  case X86::COND_O:  return X86::JO;
+  case X86::COND_NO: return X86::JNO;
+  }
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified condition,
+/// e.g. turning COND_E to COND_NE.
+X86::CondCode X86::GetOppositeBranchCondition(X86::CondCode CC) {
+  switch (CC) {
+  default: llvm_unreachable("Illegal condition code!");
+  case X86::COND_E:  return X86::COND_NE;
+  case X86::COND_NE: return X86::COND_E;
+  case X86::COND_L:  return X86::COND_GE;
+  case X86::COND_LE: return X86::COND_G;
+  case X86::COND_G:  return X86::COND_LE;
+  case X86::COND_GE: return X86::COND_L;
+  case X86::COND_B:  return X86::COND_AE;
+  case X86::COND_BE: return X86::COND_A;
+  case X86::COND_A:  return X86::COND_BE;
+  case X86::COND_AE: return X86::COND_B;
+  case X86::COND_S:  return X86::COND_NS;
+  case X86::COND_NS: return X86::COND_S;
+  case X86::COND_P:  return X86::COND_NP;
+  case X86::COND_NP: return X86::COND_P;
+  case X86::COND_O:  return X86::COND_NO;
+  case X86::COND_NO: return X86::COND_O;
+  }
+}
+
+bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr *MI) const {
+  const TargetInstrDesc &TID = MI->getDesc();
+  if (!TID.isTerminator()) return false;
+  
+  // Conditional branch is a special case.
+  if (TID.isBranch() && !TID.isBarrier())
+    return true;
+  if (!TID.isPredicable())
+    return true;
+  return !isPredicated(MI);
+}
+
+// For purposes of branch analysis do not count FP_REG_KILL as a terminator.
+static bool isBrAnalysisUnpredicatedTerminator(const MachineInstr *MI,
+                                               const X86InstrInfo &TII) {
+  if (MI->getOpcode() == X86::FP_REG_KILL)
+    return false;
+  return TII.isUnpredicatedTerminator(MI);
+}
+
/// AnalyzeBranch - Analyze the terminators of MBB. Returns false on success,
/// filling in TBB/FBB/Cond; returns true when the branching cannot be
/// understood by this analysis. With AllowModify set, redundant branches may
/// be deleted along the way.
bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, 
                                 MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
  // Start from the bottom of the block and work up, examining the
  // terminator instructions.
  MachineBasicBlock::iterator I = MBB.end();
  while (I != MBB.begin()) {
    --I;

    // Working from the bottom, when we see a non-terminator instruction, we're
    // done.
    if (!isBrAnalysisUnpredicatedTerminator(I, *this))
      break;

    // A terminator that isn't a branch can't easily be handled by this
    // analysis.
    if (!I->getDesc().isBranch())
      return true;

    // Handle unconditional branches.
    if (I->getOpcode() == X86::JMP) {
      if (!AllowModify) {
        // Not allowed to modify: just record the destination and keep looking
        // for a conditional branch above it.
        TBB = I->getOperand(0).getMBB();
        continue;
      }

      // If the block has any instructions after a JMP, delete them.
      while (llvm::next(I) != MBB.end())
        llvm::next(I)->eraseFromParent();

      // An unconditional JMP supersedes anything recorded so far.
      Cond.clear();
      FBB = 0;

      // Delete the JMP if it's equivalent to a fall-through.
      if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
        TBB = 0;
        I->eraseFromParent();
        // Restart the scan from the (new) bottom of the block.
        I = MBB.end();
        continue;
      }

      // TBB is used to indicate the unconditional destination.
      TBB = I->getOperand(0).getMBB();
      continue;
    }

    // Handle conditional branches.
    X86::CondCode BranchCode = GetCondFromBranchOpc(I->getOpcode());
    if (BranchCode == X86::COND_INVALID)
      return true;  // Can't handle indirect branch.

    // Working from the bottom, handle the first conditional branch.
    if (Cond.empty()) {
      // Any JMP seen below this conditional branch becomes the false block.
      FBB = TBB;
      TBB = I->getOperand(0).getMBB();
      Cond.push_back(MachineOperand::CreateImm(BranchCode));
      continue;
    }

    // Handle subsequent conditional branches. Only handle the case where all
    // conditional branches branch to the same destination and their condition
    // opcodes fit one of the special multi-branch idioms.
    assert(Cond.size() == 1);
    assert(TBB);

    // Only handle the case where all conditional branches branch to the same
    // destination.
    if (TBB != I->getOperand(0).getMBB())
      return true;

    // If the conditions are the same, we can leave them alone.
    X86::CondCode OldBranchCode = (X86::CondCode)Cond[0].getImm();
    if (OldBranchCode == BranchCode)
      continue;

    // If they differ, see if they fit one of the known patterns. Theoretically,
    // we could handle more patterns here, but we shouldn't expect to see them
    // if instruction selection has done a reasonable job.
    if ((OldBranchCode == X86::COND_NP &&
         BranchCode == X86::COND_E) ||
        (OldBranchCode == X86::COND_E &&
         BranchCode == X86::COND_NP))
      BranchCode = X86::COND_NP_OR_E;
    else if ((OldBranchCode == X86::COND_P &&
              BranchCode == X86::COND_NE) ||
             (OldBranchCode == X86::COND_NE &&
              BranchCode == X86::COND_P))
      BranchCode = X86::COND_NE_OR_P;
    else
      return true;

    // Update the MachineOperand.
    Cond[0].setImm(BranchCode);
  }

  return false;
}
+
+unsigned X86InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  unsigned Count = 0;
+
+  while (I != MBB.begin()) {
+    --I;
+    if (I->getOpcode() != X86::JMP &&
+        GetCondFromBranchOpc(I->getOpcode()) == X86::COND_INVALID)
+      break;
+    // Remove the branch.
+    I->eraseFromParent();
+    I = MBB.end();
+    ++Count;
+  }
+  
+  return Count;
+}
+
/// InsertBranch - Insert branch code at the end of MBB implementing the given
/// condition, and return the number of branch instructions added.
unsigned
X86InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                           MachineBasicBlock *FBB,
                           const SmallVectorImpl<MachineOperand> &Cond) const {
  // FIXME this should probably have a DebugLoc operand
  DebugLoc dl = DebugLoc::getUnknownLoc();
  // Shouldn't be a fall through.
  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
  assert((Cond.size() == 1 || Cond.size() == 0) &&
         "X86 branch conditions have one component!");

  if (Cond.empty()) {
    // Unconditional branch?
    assert(!FBB && "Unconditional branch with multiple successors!");
    BuildMI(&MBB, dl, get(X86::JMP)).addMBB(TBB);
    return 1;
  }

  // Conditional branch.
  unsigned Count = 0;
  X86::CondCode CC = (X86::CondCode)Cond[0].getImm();
  switch (CC) {
  case X86::COND_NP_OR_E:
    // Synthesize NP_OR_E with two branches to the same destination.
    BuildMI(&MBB, dl, get(X86::JNP)).addMBB(TBB);
    ++Count;
    BuildMI(&MBB, dl, get(X86::JE)).addMBB(TBB);
    ++Count;
    break;
  case X86::COND_NE_OR_P:
    // Synthesize NE_OR_P with two branches to the same destination.
    BuildMI(&MBB, dl, get(X86::JNE)).addMBB(TBB);
    ++Count;
    BuildMI(&MBB, dl, get(X86::JP)).addMBB(TBB);
    ++Count;
    break;
  default: {
    // Simple condition: a single Jcc.
    unsigned Opc = GetCondBranchFromCond(CC);
    BuildMI(&MBB, dl, get(Opc)).addMBB(TBB);
    ++Count;
  }
  }
  if (FBB) {
    // Two-way Conditional branch. Insert the second branch.
    BuildMI(&MBB, dl, get(X86::JMP)).addMBB(FBB);
    ++Count;
  }
  return Count;
}
+
+/// isHReg - Test if the given register is a physical h register.
+static bool isHReg(unsigned Reg) {
+  return X86::GR8_ABCD_HRegClass.contains(Reg);
+}
+
+// copyRegToReg - Emit a register-to-register copy from SrcReg (of class
+// SrcRC) to DestReg (of class DestRC), inserting before MI. Returns true if
+// a copy was emitted, false if this class combination is not handled.
+bool X86InstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MI,
+                                unsigned DestReg, unsigned SrcReg,
+                                const TargetRegisterClass *DestRC,
+                                const TargetRegisterClass *SrcRC) const {
+  DebugLoc DL = MBB.findDebugLoc(MI);
+
+  // Determine if DestRC and SrcRC have a common superclass.
+  const TargetRegisterClass *CommonRC = DestRC;
+  if (DestRC == SrcRC)
+    /* Source and destination have the same register class. */;
+  else if (CommonRC->hasSuperClass(SrcRC))
+    CommonRC = SrcRC;
+  else if (!DestRC->hasSubClass(SrcRC)) {
+    // Neither of GR64_NOREX or GR64_NOSP is a superclass of the other,
+    // but we want to copy then as GR64. Similarly, for GR32_NOREX and
+    // GR32_NOSP, copy as GR32.
+    if (SrcRC->hasSuperClass(&X86::GR64RegClass) &&
+        DestRC->hasSuperClass(&X86::GR64RegClass))
+      CommonRC = &X86::GR64RegClass;
+    else if (SrcRC->hasSuperClass(&X86::GR32RegClass) &&
+             DestRC->hasSuperClass(&X86::GR32RegClass))
+      CommonRC = &X86::GR32RegClass;
+    else
+      CommonRC = 0;
+  }
+
+  if (CommonRC) {
+    // Select a plain move opcode for the common register class.
+    unsigned Opc;
+    if (CommonRC == &X86::GR64RegClass || CommonRC == &X86::GR64_NOSPRegClass) {
+      Opc = X86::MOV64rr;
+    } else if (CommonRC == &X86::GR32RegClass ||
+               CommonRC == &X86::GR32_NOSPRegClass) {
+      Opc = X86::MOV32rr;
+    } else if (CommonRC == &X86::GR16RegClass) {
+      Opc = X86::MOV16rr;
+    } else if (CommonRC == &X86::GR8RegClass) {
+      // Copying to or from a physical H register on x86-64 requires a NOREX
+      // move.  Otherwise use a normal move.
+      if ((isHReg(DestReg) || isHReg(SrcReg)) &&
+          TM.getSubtarget<X86Subtarget>().is64Bit())
+        Opc = X86::MOV8rr_NOREX;
+      else
+        Opc = X86::MOV8rr;
+    } else if (CommonRC == &X86::GR64_ABCDRegClass) {
+      Opc = X86::MOV64rr;
+    } else if (CommonRC == &X86::GR32_ABCDRegClass) {
+      Opc = X86::MOV32rr;
+    } else if (CommonRC == &X86::GR16_ABCDRegClass) {
+      Opc = X86::MOV16rr;
+    } else if (CommonRC == &X86::GR8_ABCD_LRegClass) {
+      Opc = X86::MOV8rr;
+    } else if (CommonRC == &X86::GR8_ABCD_HRegClass) {
+      // H registers always need NOREX moves in 64-bit mode.
+      if (TM.getSubtarget<X86Subtarget>().is64Bit())
+        Opc = X86::MOV8rr_NOREX;
+      else
+        Opc = X86::MOV8rr;
+    } else if (CommonRC == &X86::GR64_NOREXRegClass ||
+               CommonRC == &X86::GR64_NOREX_NOSPRegClass) {
+      Opc = X86::MOV64rr;
+    } else if (CommonRC == &X86::GR32_NOREXRegClass) {
+      Opc = X86::MOV32rr;
+    } else if (CommonRC == &X86::GR16_NOREXRegClass) {
+      Opc = X86::MOV16rr;
+    } else if (CommonRC == &X86::GR8_NOREXRegClass) {
+      Opc = X86::MOV8rr;
+    } else if (CommonRC == &X86::RFP32RegClass) {
+      Opc = X86::MOV_Fp3232;
+    } else if (CommonRC == &X86::RFP64RegClass || CommonRC == &X86::RSTRegClass) {
+      Opc = X86::MOV_Fp6464;
+    } else if (CommonRC == &X86::RFP80RegClass) {
+      Opc = X86::MOV_Fp8080;
+    } else if (CommonRC == &X86::FR32RegClass) {
+      Opc = X86::FsMOVAPSrr;
+    } else if (CommonRC == &X86::FR64RegClass) {
+      Opc = X86::FsMOVAPDrr;
+    } else if (CommonRC == &X86::VR128RegClass) {
+      Opc = X86::MOVAPSrr;
+    } else if (CommonRC == &X86::VR64RegClass) {
+      Opc = X86::MMX_MOVQ64rr;
+    } else {
+      return false;
+    }
+    BuildMI(MBB, MI, DL, get(Opc), DestReg).addReg(SrcReg);
+    return true;
+  }
+
+  // Moving EFLAGS to / from another register requires a push and a pop.
+  if (SrcRC == &X86::CCRRegClass) {
+    if (SrcReg != X86::EFLAGS)
+      return false;
+    if (DestRC == &X86::GR64RegClass || DestRC == &X86::GR64_NOSPRegClass) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHFQ64));
+      BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);
+      return true;
+    } else if (DestRC == &X86::GR32RegClass ||
+               DestRC == &X86::GR32_NOSPRegClass) {
+      BuildMI(MBB, MI, DL, get(X86::PUSHFD));
+      BuildMI(MBB, MI, DL, get(X86::POP32r), DestReg);
+      return true;
+    }
+  } else if (DestRC == &X86::CCRRegClass) {
+    if (DestReg != X86::EFLAGS)
+      return false;
+    // Note: these conditions test SrcRC; DestRC is known to be CCRRegClass
+    // here, so testing DestRC for the NOSP classes (as before) could never
+    // match and GR64_NOSP/GR32_NOSP sources failed to copy into EFLAGS.
+    if (SrcRC == &X86::GR64RegClass || SrcRC == &X86::GR64_NOSPRegClass) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH64r)).addReg(SrcReg);
+      BuildMI(MBB, MI, DL, get(X86::POPFQ));
+      return true;
+    } else if (SrcRC == &X86::GR32RegClass ||
+               SrcRC == &X86::GR32_NOSPRegClass) {
+      BuildMI(MBB, MI, DL, get(X86::PUSH32r)).addReg(SrcReg);
+      BuildMI(MBB, MI, DL, get(X86::POPFD));
+      return true;
+    }
+  }
+
+  // Moving from ST(0) turns into FpGET_ST0_32 etc.
+  if (SrcRC == &X86::RSTRegClass) {
+    // Copying from ST(0)/ST(1).
+    if (SrcReg != X86::ST0 && SrcReg != X86::ST1)
+      // Can only copy from ST(0)/ST(1) right now
+      return false;
+    bool isST0 = SrcReg == X86::ST0;
+    unsigned Opc;
+    if (DestRC == &X86::RFP32RegClass)
+      Opc = isST0 ? X86::FpGET_ST0_32 : X86::FpGET_ST1_32;
+    else if (DestRC == &X86::RFP64RegClass)
+      Opc = isST0 ? X86::FpGET_ST0_64 : X86::FpGET_ST1_64;
+    else {
+      if (DestRC != &X86::RFP80RegClass)
+        return false;
+      Opc = isST0 ? X86::FpGET_ST0_80 : X86::FpGET_ST1_80;
+    }
+    BuildMI(MBB, MI, DL, get(Opc), DestReg);
+    return true;
+  }
+
+  // Moving to ST(0) turns into FpSET_ST0_32 etc.
+  if (DestRC == &X86::RSTRegClass) {
+    // Copying to ST(0) / ST(1).
+    if (DestReg != X86::ST0 && DestReg != X86::ST1)
+      // Can only copy to TOS right now
+      return false;
+    bool isST0 = DestReg == X86::ST0;
+    unsigned Opc;
+    if (SrcRC == &X86::RFP32RegClass)
+      Opc = isST0 ? X86::FpSET_ST0_32 : X86::FpSET_ST1_32;
+    else if (SrcRC == &X86::RFP64RegClass)
+      Opc = isST0 ? X86::FpSET_ST0_64 : X86::FpSET_ST1_64;
+    else {
+      if (SrcRC != &X86::RFP80RegClass)
+        return false;
+      Opc = isST0 ? X86::FpSET_ST0_80 : X86::FpSET_ST1_80;
+    }
+    BuildMI(MBB, MI, DL, get(Opc)).addReg(SrcReg);
+    return true;
+  }
+  
+  // Not yet supported!
+  return false;
+}
+
+// getStoreRegOpcode - Select the machine opcode used to store a register of
+// class RC to memory. isStackAligned permits 16-byte-aligned SSE stores; TM
+// supplies the subtarget for the 64-bit H-register NOREX special case.
+static unsigned getStoreRegOpcode(unsigned SrcReg,
+                                  const TargetRegisterClass *RC,
+                                  bool isStackAligned,
+                                  TargetMachine &TM) {
+  if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass)
+    return X86::MOV64mr;
+  if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass)
+    return X86::MOV32mr;
+  if (RC == &X86::GR16RegClass)
+    return X86::MOV16mr;
+  if (RC == &X86::GR8RegClass) {
+    // Copying to or from a physical H register on x86-64 requires a NOREX
+    // move.  Otherwise use a normal move.
+    if (isHReg(SrcReg) && TM.getSubtarget<X86Subtarget>().is64Bit())
+      return X86::MOV8mr_NOREX;
+    return X86::MOV8mr;
+  }
+  if (RC == &X86::GR64_ABCDRegClass)
+    return X86::MOV64mr;
+  if (RC == &X86::GR32_ABCDRegClass)
+    return X86::MOV32mr;
+  if (RC == &X86::GR16_ABCDRegClass)
+    return X86::MOV16mr;
+  if (RC == &X86::GR8_ABCD_LRegClass)
+    return X86::MOV8mr;
+  if (RC == &X86::GR8_ABCD_HRegClass) {
+    // H registers always need NOREX stores in 64-bit mode.
+    if (TM.getSubtarget<X86Subtarget>().is64Bit())
+      return X86::MOV8mr_NOREX;
+    return X86::MOV8mr;
+  }
+  if (RC == &X86::GR64_NOREXRegClass || RC == &X86::GR64_NOREX_NOSPRegClass)
+    return X86::MOV64mr;
+  if (RC == &X86::GR32_NOREXRegClass)
+    return X86::MOV32mr;
+  if (RC == &X86::GR16_NOREXRegClass)
+    return X86::MOV16mr;
+  if (RC == &X86::GR8_NOREXRegClass)
+    return X86::MOV8mr;
+  if (RC == &X86::RFP80RegClass)
+    return X86::ST_FpP80m;   // pops
+  if (RC == &X86::RFP64RegClass)
+    return X86::ST_Fp64m;
+  if (RC == &X86::RFP32RegClass)
+    return X86::ST_Fp32m;
+  if (RC == &X86::FR32RegClass)
+    return X86::MOVSSmr;
+  if (RC == &X86::FR64RegClass)
+    return X86::MOVSDmr;
+  if (RC == &X86::VR128RegClass)
+    // If stack is realigned we can use aligned stores.
+    return isStackAligned ? X86::MOVAPSmr : X86::MOVUPSmr;
+  if (RC == &X86::VR64RegClass)
+    return X86::MMX_MOVQ64mr;
+
+  llvm_unreachable("Unknown regclass");
+  return 0;
+}
+
+// storeRegToStackSlot - Spill SrcReg to stack slot FrameIdx, inserting the
+// store before MI. isKill marks SrcReg killed by the store.
+void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       unsigned SrcReg, bool isKill, int FrameIdx,
+                                       const TargetRegisterClass *RC) const {
+  // Aligned SSE stores are legal when the stack keeps (or can be realigned
+  // to) a 16-byte boundary.
+  const MachineFunction &Fn = *MBB.getParent();
+  bool UseAligned = (RI.getStackAlignment() >= 16) || RI.canRealignStack(Fn);
+  unsigned StoreOpc = getStoreRegOpcode(SrcReg, RC, UseAligned, TM);
+  DebugLoc Loc = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, Loc, get(StoreOpc)), FrameIdx)
+    .addReg(SrcReg, getKillRegState(isKill));
+}
+
+// storeRegToAddr - Build (but do not insert) a store of SrcReg to the
+// address described by Addr, appending the new instruction to NewMIs and
+// attaching the memory operands [MMOBegin, MMOEnd).
+void X86InstrInfo::storeRegToAddr(MachineFunction &MF, unsigned SrcReg,
+                                  bool isKill,
+                                  SmallVectorImpl<MachineOperand> &Addr,
+                                  const TargetRegisterClass *RC,
+                                  MachineInstr::mmo_iterator MMOBegin,
+                                  MachineInstr::mmo_iterator MMOEnd,
+                                  SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  // The first memory operand's alignment decides whether an aligned SSE
+  // store may be used.
+  bool UseAligned = (*MMOBegin)->getAlignment() >= 16;
+  unsigned StoreOpc = getStoreRegOpcode(SrcReg, RC, UseAligned, TM);
+  DebugLoc Loc = DebugLoc::getUnknownLoc();
+  MachineInstrBuilder Builder = BuildMI(MF, Loc, get(StoreOpc));
+  for (unsigned Idx = 0, End = Addr.size(); Idx != End; ++Idx)
+    Builder.addOperand(Addr[Idx]);
+  Builder.addReg(SrcReg, getKillRegState(isKill));
+  (*Builder).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(Builder);
+}
+
+// getLoadRegOpcode - Select the machine opcode used to load a register of
+// class RC from memory. Mirrors getStoreRegOpcode: isStackAligned permits
+// aligned SSE loads, and 64-bit H-register loads need the NOREX form.
+static unsigned getLoadRegOpcode(unsigned DestReg,
+                                 const TargetRegisterClass *RC,
+                                 bool isStackAligned,
+                                 const TargetMachine &TM) {
+  if (RC == &X86::GR64RegClass || RC == &X86::GR64_NOSPRegClass)
+    return X86::MOV64rm;
+  if (RC == &X86::GR32RegClass || RC == &X86::GR32_NOSPRegClass)
+    return X86::MOV32rm;
+  if (RC == &X86::GR16RegClass)
+    return X86::MOV16rm;
+  if (RC == &X86::GR8RegClass) {
+    // Copying to or from a physical H register on x86-64 requires a NOREX
+    // move.  Otherwise use a normal move.
+    if (isHReg(DestReg) && TM.getSubtarget<X86Subtarget>().is64Bit())
+      return X86::MOV8rm_NOREX;
+    return X86::MOV8rm;
+  }
+  if (RC == &X86::GR64_ABCDRegClass)
+    return X86::MOV64rm;
+  if (RC == &X86::GR32_ABCDRegClass)
+    return X86::MOV32rm;
+  if (RC == &X86::GR16_ABCDRegClass)
+    return X86::MOV16rm;
+  if (RC == &X86::GR8_ABCD_LRegClass)
+    return X86::MOV8rm;
+  if (RC == &X86::GR8_ABCD_HRegClass) {
+    // H registers always need NOREX loads in 64-bit mode.
+    if (TM.getSubtarget<X86Subtarget>().is64Bit())
+      return X86::MOV8rm_NOREX;
+    return X86::MOV8rm;
+  }
+  if (RC == &X86::GR64_NOREXRegClass || RC == &X86::GR64_NOREX_NOSPRegClass)
+    return X86::MOV64rm;
+  if (RC == &X86::GR32_NOREXRegClass)
+    return X86::MOV32rm;
+  if (RC == &X86::GR16_NOREXRegClass)
+    return X86::MOV16rm;
+  if (RC == &X86::GR8_NOREXRegClass)
+    return X86::MOV8rm;
+  if (RC == &X86::RFP80RegClass)
+    return X86::LD_Fp80m;
+  if (RC == &X86::RFP64RegClass)
+    return X86::LD_Fp64m;
+  if (RC == &X86::RFP32RegClass)
+    return X86::LD_Fp32m;
+  if (RC == &X86::FR32RegClass)
+    return X86::MOVSSrm;
+  if (RC == &X86::FR64RegClass)
+    return X86::MOVSDrm;
+  if (RC == &X86::VR128RegClass)
+    // If stack is realigned we can use aligned loads.
+    return isStackAligned ? X86::MOVAPSrm : X86::MOVUPSrm;
+  if (RC == &X86::VR64RegClass)
+    return X86::MMX_MOVQ64rm;
+
+  llvm_unreachable("Unknown regclass");
+  return 0;
+}
+
+// loadRegFromStackSlot - Reload DestReg from stack slot FrameIdx, inserting
+// the load before MI.
+void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                        unsigned DestReg, int FrameIdx,
+                                        const TargetRegisterClass *RC) const{
+  // Aligned SSE loads are legal when the stack keeps (or can be realigned
+  // to) a 16-byte boundary.
+  const MachineFunction &Fn = *MBB.getParent();
+  bool UseAligned = (RI.getStackAlignment() >= 16) || RI.canRealignStack(Fn);
+  unsigned LoadOpc = getLoadRegOpcode(DestReg, RC, UseAligned, TM);
+  DebugLoc Loc = MBB.findDebugLoc(MI);
+  addFrameReference(BuildMI(MBB, MI, Loc, get(LoadOpc), DestReg), FrameIdx);
+}
+
+// loadRegFromAddr - Build (but do not insert) a load of DestReg from the
+// address described by Addr, appending the new instruction to NewMIs and
+// attaching the memory operands [MMOBegin, MMOEnd).
+void X86InstrInfo::loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                                 SmallVectorImpl<MachineOperand> &Addr,
+                                 const TargetRegisterClass *RC,
+                                 MachineInstr::mmo_iterator MMOBegin,
+                                 MachineInstr::mmo_iterator MMOEnd,
+                                 SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  // The first memory operand's alignment decides whether an aligned SSE
+  // load may be used.
+  bool UseAligned = (*MMOBegin)->getAlignment() >= 16;
+  unsigned LoadOpc = getLoadRegOpcode(DestReg, RC, UseAligned, TM);
+  DebugLoc Loc = DebugLoc::getUnknownLoc();
+  MachineInstrBuilder Builder = BuildMI(MF, Loc, get(LoadOpc), DestReg);
+  for (unsigned Idx = 0, End = Addr.size(); Idx != End; ++Idx)
+    Builder.addOperand(Addr[Idx]);
+  (*Builder).setMemRefs(MMOBegin, MMOEnd);
+  NewMIs.push_back(Builder);
+}
+
+// spillCalleeSavedRegisters - Save the registers in CSI before MI, using
+// pushes for GPRs (except on Win64) and stack-slot stores otherwise. The
+// total pushed size is recorded in X86MachineFunctionInfo. Returns false
+// only when CSI is empty.
+bool X86InstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc Loc = MBB.findDebugLoc(MI);
+
+  const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+  bool is64Bit = Subtarget.is64Bit();
+  bool isWin64 = Subtarget.isTargetWin64();
+  unsigned SlotSize = is64Bit ? 8 : 4;
+
+  MachineFunction &MF = *MBB.getParent();
+  unsigned FPReg = RI.getFrameRegister(MF);
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  unsigned CalleeFrameSize = 0;
+
+  unsigned PushOpc = is64Bit ? X86::PUSH64r : X86::PUSH32r;
+  // Process the CSI list back to front (same order as the original loop).
+  for (unsigned Idx = CSI.size(); Idx-- > 0; ) {
+    unsigned Reg = CSI[Idx].getReg();
+    const TargetRegisterClass *RegClass = CSI[Idx].getRegClass();
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(Reg);
+    if (Reg == FPReg)
+      // X86RegisterInfo::emitPrologue will handle spilling of frame register.
+      continue;
+    if (RegClass != &X86::VR128RegClass && !isWin64) {
+      // Plain push; account for the slot it consumes.
+      CalleeFrameSize += SlotSize;
+      BuildMI(MBB, MI, Loc, get(PushOpc)).addReg(Reg, RegState::Kill);
+    } else {
+      // Vector registers (and all registers on Win64) go to stack slots.
+      storeRegToStackSlot(MBB, MI, Reg, true, CSI[Idx].getFrameIdx(), RegClass);
+    }
+  }
+
+  X86FI->setCalleeSavedFrameSize(CalleeFrameSize);
+  return true;
+}
+
+// restoreCalleeSavedRegisters - Restore the registers in CSI before MI,
+// inverting spillCalleeSavedRegisters: pops for GPRs (except on Win64) and
+// stack-slot loads otherwise. Returns false only when CSI is empty.
+bool X86InstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const {
+  if (CSI.empty())
+    return false;
+
+  DebugLoc Loc = MBB.findDebugLoc(MI);
+
+  MachineFunction &MF = *MBB.getParent();
+  unsigned FPReg = RI.getFrameRegister(MF);
+  const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+  bool is64Bit = Subtarget.is64Bit();
+  bool isWin64 = Subtarget.isTargetWin64();
+  unsigned PopOpc = is64Bit ? X86::POP64r : X86::POP32r;
+  for (unsigned Idx = 0, Count = CSI.size(); Idx != Count; ++Idx) {
+    unsigned Reg = CSI[Idx].getReg();
+    if (Reg == FPReg)
+      // X86RegisterInfo::emitEpilogue will handle restoring of frame register.
+      continue;
+    const TargetRegisterClass *RegClass = CSI[Idx].getRegClass();
+    if (RegClass != &X86::VR128RegClass && !isWin64) {
+      BuildMI(MBB, MI, Loc, get(PopOpc), Reg);
+    } else {
+      loadRegFromStackSlot(MBB, MI, Reg, CSI[Idx].getFrameIdx(), RegClass);
+    }
+  }
+  return true;
+}
+
+// FuseTwoAddrInst - Build a new instruction with opcode Opcode in which the
+// tied def/use register pair of the two-address instruction MI is replaced
+// by the memory address described by MOs. The address operands come first;
+// the remaining operands of MI (starting at operand 2) are appended after.
+static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
+                                     const SmallVectorImpl<MachineOperand> &MOs,
+                                     MachineInstr *MI,
+                                     const TargetInstrInfo &TII) {
+  // Create the base instruction with the memory operand as the first part.
+  MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+                                              MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(NewMI);
+  unsigned NumAddrOps = MOs.size();
+  for (unsigned i = 0; i != NumAddrOps; ++i)
+    MIB.addOperand(MOs[i]);
+  if (NumAddrOps < 4)  // FrameIndex only
+    addOffset(MIB, 0);
+  
+  // Loop over the rest of the ri operands, converting them over.
+  unsigned NumOps = MI->getDesc().getNumOperands()-2;
+  for (unsigned i = 0; i != NumOps; ++i) {
+    MachineOperand &MO = MI->getOperand(i+2);
+    MIB.addOperand(MO);
+  }
+  // Copy through any operands beyond those declared in the descriptor
+  // (presumably implicit operands -- verify against the instruction defs).
+  for (unsigned i = NumOps+2, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &MO = MI->getOperand(i);
+    MIB.addOperand(MO);
+  }
+  return MIB;
+}
+
+// FuseInst - Build a new instruction with opcode Opcode, copying MI's
+// operands through except that operand OpNo is replaced by the memory
+// address described by MOs.
+static MachineInstr *FuseInst(MachineFunction &MF,
+                              unsigned Opcode, unsigned OpNo,
+                              const SmallVectorImpl<MachineOperand> &MOs,
+                              MachineInstr *MI, const TargetInstrInfo &TII) {
+  MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
+                                              MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(NewMI);
+
+  for (unsigned OpIdx = 0, OpEnd = MI->getNumOperands(); OpIdx != OpEnd;
+       ++OpIdx) {
+    MachineOperand &Op = MI->getOperand(OpIdx);
+    if (OpIdx != OpNo) {
+      // Not the folded operand: copy it through unchanged.
+      MIB.addOperand(Op);
+      continue;
+    }
+    // Substitute the address operands for the register being folded.
+    assert(Op.isReg() && "Expected to fold into reg operand!");
+    unsigned NumAddrOps = MOs.size();
+    for (unsigned AddrIdx = 0; AddrIdx != NumAddrOps; ++AddrIdx)
+      MIB.addOperand(MOs[AddrIdx]);
+    if (NumAddrOps < 4)  // FrameIndex only
+      addOffset(MIB, 0);
+  }
+  return MIB;
+}
+
+// MakeM0Inst - Build "Opcode <addr>, 0": the address operands from MOs
+// followed by an immediate zero (used to turn MOVr0 pseudos into stores of
+// zero).
+static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
+                                const SmallVectorImpl<MachineOperand> &MOs,
+                                MachineInstr *MI) {
+  MachineFunction &MF = *MI->getParent()->getParent();
+  MachineInstrBuilder Builder = BuildMI(MF, MI->getDebugLoc(), TII.get(Opcode));
+
+  unsigned AddrOpCount = MOs.size();
+  for (unsigned Idx = 0; Idx != AddrOpCount; ++Idx)
+    Builder.addOperand(MOs[Idx]);
+  if (AddrOpCount < 4)  // FrameIndex only
+    addOffset(Builder, 0);
+  return Builder.addImm(0);
+}
+
+// foldMemoryOperandImpl - Attempt to fold the memory address described by
+// the operand list MOs into operand i of MI, returning the new memory-form
+// instruction or NULL if no fold is possible. Size is the size in bytes of
+// the memory being folded (0 when unknown) and Align its alignment.
+MachineInstr*
+X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                    MachineInstr *MI, unsigned i,
+                                    const SmallVectorImpl<MachineOperand> &MOs,
+                                    unsigned Size, unsigned Align) const {
+  const DenseMap<unsigned*, std::pair<unsigned,unsigned> > *OpcodeTablePtr=NULL;
+  bool isTwoAddrFold = false;
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+  MachineInstr *NewMI = NULL;
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different than folding it other places.  It requires
+  // replacing the *two* registers with the memory location.
+  if (isTwoAddr && NumOps >= 2 && i < 2 &&
+      MI->getOperand(0).isReg() &&
+      MI->getOperand(1).isReg() &&
+      MI->getOperand(0).getReg() == MI->getOperand(1).getReg()) { 
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+    isTwoAddrFold = true;
+  } else if (i == 0) { // If operand 0
+    // MOVr0 pseudos fold directly into a store of immediate zero; no table
+    // lookup is needed for them.
+    if (MI->getOpcode() == X86::MOV64r0)
+      NewMI = MakeM0Inst(*this, X86::MOV64mi32, MOs, MI);
+    else if (MI->getOpcode() == X86::MOV32r0)
+      NewMI = MakeM0Inst(*this, X86::MOV32mi, MOs, MI);
+    else if (MI->getOpcode() == X86::MOV16r0)
+      NewMI = MakeM0Inst(*this, X86::MOV16mi, MOs, MI);
+    else if (MI->getOpcode() == X86::MOV8r0)
+      NewMI = MakeM0Inst(*this, X86::MOV8mi, MOs, MI);
+    if (NewMI)
+      return NewMI;
+    
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (i == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (i == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  }
+  
+  // If table selected...
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+      OpcodeTablePtr->find((unsigned*)MI->getOpcode());
+    if (I != OpcodeTablePtr->end()) {
+      unsigned Opcode = I->second.first;
+      unsigned MinAlign = I->second.second;
+      // The table records the minimum alignment the memory form requires;
+      // refuse the fold if the actual memory is less aligned.
+      if (Align < MinAlign)
+        return NULL;
+      bool NarrowToMOV32rm = false;
+      if (Size) {
+        unsigned RCSize =  MI->getDesc().OpInfo[i].getRegClass(&RI)->getSize();
+        if (Size < RCSize) {
+          // Check if it's safe to fold the load. If the size of the object is
+          // narrower than the load width, then it's not.
+          if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
+            return NULL;
+          // If this is a 64-bit load, but the spill slot is 32, then we can do
+          // a 32-bit load which is implicitly zero-extended. This likely is due
+          // to liveintervalanalysis remat'ing a load from stack slot.
+          if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
+            return NULL;
+          Opcode = X86::MOV32rm;
+          NarrowToMOV32rm = true;
+        }
+      }
+
+      if (isTwoAddrFold)
+        NewMI = FuseTwoAddrInst(MF, Opcode, MOs, MI, *this);
+      else
+        NewMI = FuseInst(MF, Opcode, i, MOs, MI, *this);
+
+      if (NarrowToMOV32rm) {
+        // If this is the special case where we use a MOV32rm to load a 32-bit
+        // value and zero-extend the top bits. Change the destination register
+        // to a 32-bit one.
+        unsigned DstReg = NewMI->getOperand(0).getReg();
+        if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+          NewMI->getOperand(0).setReg(RI.getSubReg(DstReg,
+                                                   4/*x86_subreg_32bit*/));
+        else
+          NewMI->getOperand(0).setSubReg(4/*x86_subreg_32bit*/);
+      }
+      return NewMI;
+    }
+  }
+  
+  // No fusion 
+  if (PrintFailedFusing)
+    dbgs() << "We failed to fuse operand " << i << " in " << *MI;
+  return NULL;
+}
+
+
+// foldMemoryOperandImpl - Fold a reference to stack slot FrameIndex into the
+// operands of MI listed in Ops. Returns the new instruction or NULL.
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                                  int FrameIndex) const {
+  // Check switch flag 
+  if (NoFusing) return NULL;
+
+  // Unless optimizing for size, don't fold these scalar FP ops: their memory
+  // forms are excluded from folding here.
+  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+    switch (MI->getOpcode()) {
+    case X86::CVTSD2SSrr:
+    case X86::Int_CVTSD2SSrr:
+    case X86::CVTSS2SDrr:
+    case X86::Int_CVTSS2SDrr:
+    case X86::RCPSSr:
+    case X86::RCPSSr_Int:
+    case X86::ROUNDSDr_Int:
+    case X86::ROUNDSSr_Int:
+    case X86::RSQRTSSr:
+    case X86::RSQRTSSr_Int:
+    case X86::SQRTSSr:
+    case X86::SQRTSSr_Int:
+      return 0;
+    }
+
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned Size = MFI->getObjectSize(FrameIndex);
+  unsigned Alignment = MFI->getObjectAlignment(FrameIndex);
+  // Folding both operands of TESTrr: rewrite as CMPri reg, 0 and fold the
+  // remaining register operand below.
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    unsigned RCSize = 0;
+    switch (MI->getOpcode()) {
+    default: return NULL;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; RCSize = 1; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri; RCSize = 2; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri; RCSize = 4; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri32; RCSize = 8; break;
+    }
+    // Check if it's safe to fold the load. If the size of the object is
+    // narrower than the load width, then it's not.
+    if (Size < RCSize)
+      return NULL;
+    // Change to CMPXXri r, 0 first.
+    MI->setDesc(get(NewOpc));
+    MI->getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return NULL;
+
+  // Delegate to the address-operand form with a lone FrameIndex operand.
+  SmallVector<MachineOperand,4> MOs;
+  MOs.push_back(MachineOperand::CreateFI(FrameIndex));
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, Size, Alignment);
+}
+
+// foldMemoryOperandImpl - Fold the load performed by LoadMI into the
+// operands of MI listed in Ops. Handles both real loads (address operands
+// are copied through) and the V_SET0/V_SETALLONES/FsFLD0SS/FsFLD0SD pseudos
+// (turned into constant-pool loads). Returns the new instruction or NULL.
+MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
+                                                  MachineInstr *MI,
+                                           const SmallVectorImpl<unsigned> &Ops,
+                                                  MachineInstr *LoadMI) const {
+  // Check switch flag 
+  if (NoFusing) return NULL;
+
+  // Unless optimizing for size, don't fold these scalar FP ops (same list
+  // as the FrameIndex variant above).
+  if (!MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
+    switch (MI->getOpcode()) {
+    case X86::CVTSD2SSrr:
+    case X86::Int_CVTSD2SSrr:
+    case X86::CVTSS2SDrr:
+    case X86::Int_CVTSS2SDrr:
+    case X86::RCPSSr:
+    case X86::RCPSSr_Int:
+    case X86::ROUNDSDr_Int:
+    case X86::ROUNDSSr_Int:
+    case X86::RSQRTSSr:
+    case X86::RSQRTSSr_Int:
+    case X86::SQRTSSr:
+    case X86::SQRTSSr_Int:
+      return 0;
+    }
+
+  // Determine the alignment of the load.
+  unsigned Alignment = 0;
+  if (LoadMI->hasOneMemOperand())
+    Alignment = (*LoadMI->memoperands_begin())->getAlignment();
+  else
+    // The zero/all-ones pseudos have no memory operand; use the natural
+    // alignment of the value they materialize.
+    switch (LoadMI->getOpcode()) {
+    case X86::V_SET0:
+    case X86::V_SETALLONES:
+      Alignment = 16;
+      break;
+    case X86::FsFLD0SD:
+      Alignment = 8;
+      break;
+    case X86::FsFLD0SS:
+      Alignment = 4;
+      break;
+    default:
+      llvm_unreachable("Don't know how to fold this instruction!");
+    }
+  // Folding both operands of TESTrr: rewrite as CMPri reg, 0 first.
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    unsigned NewOpc = 0;
+    switch (MI->getOpcode()) {
+    default: return NULL;
+    case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
+    case X86::TEST16rr: NewOpc = X86::CMP16ri; break;
+    case X86::TEST32rr: NewOpc = X86::CMP32ri; break;
+    case X86::TEST64rr: NewOpc = X86::CMP64ri32; break;
+    }
+    // Change to CMPXXri r, 0 first.
+    MI->setDesc(get(NewOpc));
+    MI->getOperand(1).ChangeToImmediate(0);
+  } else if (Ops.size() != 1)
+    return NULL;
+
+  SmallVector<MachineOperand,X86AddrNumOperands> MOs;
+  switch (LoadMI->getOpcode()) {
+  case X86::V_SET0:
+  case X86::V_SETALLONES:
+  case X86::FsFLD0SD:
+  case X86::FsFLD0SS: {
+    // Folding a V_SET0 or V_SETALLONES as a load, to ease register pressure.
+    // Create a constant-pool entry and operands to load from it.
+
+    // x86-32 PIC requires a PIC base register for constant pools.
+    unsigned PICBase = 0;
+    if (TM.getRelocationModel() == Reloc::PIC_) {
+      if (TM.getSubtarget<X86Subtarget>().is64Bit())
+        PICBase = X86::RIP;
+      else
+        // FIXME: PICBase = TM.getInstrInfo()->getGlobalBaseReg(&MF);
+        // This doesn't work for several reasons.
+        // 1. GlobalBaseReg may have been spilled.
+        // 2. It may not be live at MI.
+        return NULL;
+    }
+
+    // Create a constant-pool entry.
+    MachineConstantPool &MCP = *MF.getConstantPool();
+    const Type *Ty;
+    if (LoadMI->getOpcode() == X86::FsFLD0SS)
+      Ty = Type::getFloatTy(MF.getFunction()->getContext());
+    else if (LoadMI->getOpcode() == X86::FsFLD0SD)
+      Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+    else
+      Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+    Constant *C = LoadMI->getOpcode() == X86::V_SETALLONES ?
+                    Constant::getAllOnesValue(Ty) :
+                    Constant::getNullValue(Ty);
+    unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
+
+    // Create operands to load from the constant pool entry.
+    MOs.push_back(MachineOperand::CreateReg(PICBase, false));
+    MOs.push_back(MachineOperand::CreateImm(1));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    MOs.push_back(MachineOperand::CreateCPI(CPI, 0));
+    MOs.push_back(MachineOperand::CreateReg(0, false));
+    break;
+  }
+  default: {
+    // Folding a normal load. Just copy the load's address operands.
+    unsigned NumOps = LoadMI->getDesc().getNumOperands();
+    for (unsigned i = NumOps - X86AddrNumOperands; i != NumOps; ++i)
+      MOs.push_back(LoadMI->getOperand(i));
+    break;
+  }
+  }
+  // Size 0: the fold's width check is skipped; only alignment is enforced.
+  return foldMemoryOperandImpl(MF, MI, Ops[0], MOs, 0, Alignment);
+}
+
+
+// canFoldMemoryOperand - Return true if foldMemoryOperand could fold a
+// memory reference into the operands of MI listed in Ops, by consulting the
+// same opcode tables as foldMemoryOperandImpl without building anything.
+bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
+                                  const SmallVectorImpl<unsigned> &Ops) const {
+  // Check switch flag 
+  if (NoFusing) return false;  // was 'return 0'; use bool for consistency
+
+  // Folding both operands is only supported for the TESTrr family (they get
+  // rewritten to CMPri reg, 0 at fold time).
+  if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
+    switch (MI->getOpcode()) {
+    default: return false;
+    case X86::TEST8rr: 
+    case X86::TEST16rr:
+    case X86::TEST32rr:
+    case X86::TEST64rr:
+      return true;
+    }
+  }
+
+  if (Ops.size() != 1)
+    return false;
+
+  unsigned OpNum = Ops[0];
+  unsigned Opc = MI->getOpcode();
+  unsigned NumOps = MI->getDesc().getNumOperands();
+  bool isTwoAddr = NumOps > 1 &&
+    MI->getDesc().getOperandConstraint(1, TOI::TIED_TO) != -1;
+
+  // Folding a memory location into the two-address part of a two-address
+  // instruction is different than folding it other places.  It requires
+  // replacing the *two* registers with the memory location.
+  const DenseMap<unsigned*, std::pair<unsigned,unsigned> > *OpcodeTablePtr=NULL;
+  if (isTwoAddr && NumOps >= 2 && OpNum < 2) { 
+    OpcodeTablePtr = &RegOp2MemOpTable2Addr;
+  } else if (OpNum == 0) { // If operand 0
+    // MOVr0 pseudos are always foldable (into a store of immediate zero).
+    switch (Opc) {
+    case X86::MOV8r0:
+    case X86::MOV16r0:
+    case X86::MOV32r0:
+    case X86::MOV64r0:
+      return true;
+    default: break;
+    }
+    OpcodeTablePtr = &RegOp2MemOpTable0;
+  } else if (OpNum == 1) {
+    OpcodeTablePtr = &RegOp2MemOpTable1;
+  } else if (OpNum == 2) {
+    OpcodeTablePtr = &RegOp2MemOpTable2;
+  }
+  
+  if (OpcodeTablePtr) {
+    // Find the Opcode to fuse
+    DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+      OpcodeTablePtr->find((unsigned*)Opc);
+    if (I != OpcodeTablePtr->end())
+      return true;
+  }
+  return false;
+}
+
+// unfoldMemoryOperand - Split the memory-form instruction MI back into a
+// register-form instruction plus (as requested) a separate load and/or
+// store through register Reg. New instructions are appended to NewMIs in
+// execution order: [load,] data op [, store]. Returns false if MI is not in
+// the unfold table or the requested unfold is not available.
+bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
+                                unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                                SmallVectorImpl<MachineInstr*> &NewMIs) const {
+  DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find((unsigned*)MI->getOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  // Table value packs: reg-form opcode; operand index in the low 4 bits;
+  // bit 4 = the fold included a load; bit 5 = it included a store.
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & 0xf;
+  bool FoldedLoad = I->second.second & (1 << 4);
+  bool FoldedStore = I->second.second & (1 << 5);
+  if (UnfoldLoad && !FoldedLoad)
+    return false;
+  UnfoldLoad &= FoldedLoad;
+  if (UnfoldStore && !FoldedStore)
+    return false;
+  UnfoldStore &= FoldedStore;
+
+  const TargetInstrDesc &TID = get(Opc);
+  const TargetOperandInfo &TOI = TID.OpInfo[Index];
+  const TargetRegisterClass *RC = TOI.getRegClass(&RI);
+  // Partition MI's operands: the X86AddrNumOperands address operands at
+  // Index, implicit register operands, and the rest before/after Index.
+  SmallVector<MachineOperand, X86AddrNumOperands> AddrOps;
+  SmallVector<MachineOperand,2> BeforeOps;
+  SmallVector<MachineOperand,2> AfterOps;
+  SmallVector<MachineOperand,4> ImpOps;
+  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    MachineOperand &Op = MI->getOperand(i);
+    if (i >= Index && i < Index + X86AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (Op.isReg() && Op.isImplicit())
+      ImpOps.push_back(Op);
+    else if (i < Index)
+      BeforeOps.push_back(Op);
+    else if (i > Index)
+      AfterOps.push_back(Op);
+  }
+
+  // Emit the load instruction.
+  if (UnfoldLoad) {
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(MI->memoperands_begin(),
+                            MI->memoperands_end());
+    loadRegFromAddr(MF, Reg, AddrOps, RC, MMOs.first, MMOs.second, NewMIs);
+    if (UnfoldStore) {
+      // Address operands cannot be marked isKill.
+      for (unsigned i = 1; i != 1 + X86AddrNumOperands; ++i) {
+        MachineOperand &MO = NewMIs[0]->getOperand(i);
+        if (MO.isReg())
+          MO.setIsKill(false);
+      }
+    }
+  }
+
+  // Emit the data processing instruction.
+  MachineInstr *DataMI = MF.CreateMachineInstr(TID, MI->getDebugLoc(), true);
+  MachineInstrBuilder MIB(DataMI);
+  
+  if (FoldedStore)
+    MIB.addReg(Reg, RegState::Define);
+  for (unsigned i = 0, e = BeforeOps.size(); i != e; ++i)
+    MIB.addOperand(BeforeOps[i]);
+  if (FoldedLoad)
+    MIB.addReg(Reg);
+  for (unsigned i = 0, e = AfterOps.size(); i != e; ++i)
+    MIB.addOperand(AfterOps[i]);
+  // Re-attach the implicit operands, preserving their flags.
+  for (unsigned i = 0, e = ImpOps.size(); i != e; ++i) {
+    MachineOperand &MO = ImpOps[i];
+    MIB.addReg(MO.getReg(),
+               getDefRegState(MO.isDef()) |
+               RegState::Implicit |
+               getKillRegState(MO.isKill()) |
+               getDeadRegState(MO.isDead()) |
+               getUndefRegState(MO.isUndef()));
+  }
+  // Change CMP32ri r, 0 back to TEST32rr r, r, etc.
+  unsigned NewOpc = 0;
+  switch (DataMI->getOpcode()) {
+  default: break;
+  case X86::CMP64ri32:
+  case X86::CMP32ri:
+  case X86::CMP16ri:
+  case X86::CMP8ri: {
+    MachineOperand &MO0 = DataMI->getOperand(0);
+    MachineOperand &MO1 = DataMI->getOperand(1);
+    if (MO1.getImm() == 0) {
+      switch (DataMI->getOpcode()) {
+      default: break;
+      case X86::CMP64ri32: NewOpc = X86::TEST64rr; break;
+      case X86::CMP32ri:   NewOpc = X86::TEST32rr; break;
+      case X86::CMP16ri:   NewOpc = X86::TEST16rr; break;
+      case X86::CMP8ri:    NewOpc = X86::TEST8rr; break;
+      }
+      DataMI->setDesc(get(NewOpc));
+      MO1.ChangeToRegister(MO0.getReg(), false);
+    }
+  }
+  }
+  NewMIs.push_back(DataMI);
+
+  // Emit the store instruction.
+  if (UnfoldStore) {
+    const TargetRegisterClass *DstRC = TID.OpInfo[0].getRegClass(&RI);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(MI->memoperands_begin(),
+                             MI->memoperands_end());
+    storeRegToAddr(MF, Reg, true, AddrOps, DstRC, MMOs.first, MMOs.second, NewMIs);
+  }
+
+  return true;
+}
+
+bool
+X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
+                                  SmallVectorImpl<SDNode*> &NewNodes) const {
+  if (!N->isMachineOpcode())
+    return false;
+
+  DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find((unsigned*)N->getMachineOpcode());
+  if (I == MemOp2RegOpTable.end())
+    return false;
+  unsigned Opc = I->second.first;
+  unsigned Index = I->second.second & 0xf;
+  bool FoldedLoad = I->second.second & (1 << 4);
+  bool FoldedStore = I->second.second & (1 << 5);
+  const TargetInstrDesc &TID = get(Opc);
+  const TargetRegisterClass *RC = TID.OpInfo[Index].getRegClass(&RI);
+  unsigned NumDefs = TID.NumDefs;
+  std::vector<SDValue> AddrOps;
+  std::vector<SDValue> BeforeOps;
+  std::vector<SDValue> AfterOps;
+  DebugLoc dl = N->getDebugLoc();
+  unsigned NumOps = N->getNumOperands();
+  for (unsigned i = 0; i != NumOps-1; ++i) {
+    SDValue Op = N->getOperand(i);
+    if (i >= Index-NumDefs && i < Index-NumDefs + X86AddrNumOperands)
+      AddrOps.push_back(Op);
+    else if (i < Index-NumDefs)
+      BeforeOps.push_back(Op);
+    else if (i > Index-NumDefs)
+      AfterOps.push_back(Op);
+  }
+  SDValue Chain = N->getOperand(NumOps-1);
+  AddrOps.push_back(Chain);
+
+  // Emit the load instruction.
+  SDNode *Load = 0;
+  MachineFunction &MF = DAG.getMachineFunction();
+  if (FoldedLoad) {
+    EVT VT = *RC->vt_begin();
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractLoadMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                            cast<MachineSDNode>(N)->memoperands_end());
+    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    Load = DAG.getMachineNode(getLoadRegOpcode(0, RC, isAligned, TM), dl,
+                              VT, MVT::Other, &AddrOps[0], AddrOps.size());
+    NewNodes.push_back(Load);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  // Emit the data processing instruction.
+  std::vector<EVT> VTs;
+  const TargetRegisterClass *DstRC = 0;
+  if (TID.getNumDefs() > 0) {
+    DstRC = TID.OpInfo[0].getRegClass(&RI);
+    VTs.push_back(*DstRC->vt_begin());
+  }
+  for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+    EVT VT = N->getValueType(i);
+    if (VT != MVT::Other && i >= (unsigned)TID.getNumDefs())
+      VTs.push_back(VT);
+  }
+  if (Load)
+    BeforeOps.push_back(SDValue(Load, 0));
+  std::copy(AfterOps.begin(), AfterOps.end(), std::back_inserter(BeforeOps));
+  SDNode *NewNode= DAG.getMachineNode(Opc, dl, VTs, &BeforeOps[0],
+                                      BeforeOps.size());
+  NewNodes.push_back(NewNode);
+
+  // Emit the store instruction.
+  if (FoldedStore) {
+    AddrOps.pop_back();
+    AddrOps.push_back(SDValue(NewNode, 0));
+    AddrOps.push_back(Chain);
+    std::pair<MachineInstr::mmo_iterator,
+              MachineInstr::mmo_iterator> MMOs =
+      MF.extractStoreMemRefs(cast<MachineSDNode>(N)->memoperands_begin(),
+                             cast<MachineSDNode>(N)->memoperands_end());
+    bool isAligned = (*MMOs.first)->getAlignment() >= 16;
+    SDNode *Store = DAG.getMachineNode(getStoreRegOpcode(0, DstRC,
+                                                         isAligned, TM),
+                                       dl, MVT::Other,
+                                       &AddrOps[0], AddrOps.size());
+    NewNodes.push_back(Store);
+
+    // Preserve memory reference information.
+    cast<MachineSDNode>(Load)->setMemRefs(MMOs.first, MMOs.second);
+  }
+
+  return true;
+}
+
+unsigned X86InstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc,
+                                      bool UnfoldLoad, bool UnfoldStore,
+                                      unsigned *LoadRegIndex) const {
+  DenseMap<unsigned*, std::pair<unsigned,unsigned> >::const_iterator I =
+    MemOp2RegOpTable.find((unsigned*)Opc);
+  if (I == MemOp2RegOpTable.end())
+    return 0;
+  bool FoldedLoad = I->second.second & (1 << 4);
+  bool FoldedStore = I->second.second & (1 << 5);
+  if (UnfoldLoad && !FoldedLoad)
+    return 0;
+  if (UnfoldStore && !FoldedStore)
+    return 0;
+  if (LoadRegIndex)
+    *LoadRegIndex = I->second.second & 0xf;
+  return I->second.first;
+}
+
+bool
+X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                     int64_t &Offset1, int64_t &Offset2) const {
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode())
+    return false;
+  unsigned Opc1 = Load1->getMachineOpcode();
+  unsigned Opc2 = Load2->getMachineOpcode();
+  switch (Opc1) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVUPSrm_Int:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::MOVDQUrm_Int:
+    break;
+  }
+  switch (Opc2) {
+  default: return false;
+  case X86::MOV8rm:
+  case X86::MOV16rm:
+  case X86::MOV32rm:
+  case X86::MOV64rm:
+  case X86::LD_Fp32m:
+  case X86::LD_Fp64m:
+  case X86::LD_Fp80m:
+  case X86::MOVSSrm:
+  case X86::MOVSDrm:
+  case X86::MMX_MOVD64rm:
+  case X86::MMX_MOVQ64rm:
+  case X86::FsMOVAPSrm:
+  case X86::FsMOVAPDrm:
+  case X86::MOVAPSrm:
+  case X86::MOVUPSrm:
+  case X86::MOVUPSrm_Int:
+  case X86::MOVAPDrm:
+  case X86::MOVDQArm:
+  case X86::MOVDQUrm:
+  case X86::MOVDQUrm_Int:
+    break;
+  }
+
+  // Check if chain operands and base addresses match.
+  if (Load1->getOperand(0) != Load2->getOperand(0) ||
+      Load1->getOperand(5) != Load2->getOperand(5))
+    return false;
+  // Segment operands should match as well.
+  if (Load1->getOperand(4) != Load2->getOperand(4))
+    return false;
+  // Scale should be 1, Index should be Reg0.
+  if (Load1->getOperand(1) == Load2->getOperand(1) &&
+      Load1->getOperand(2) == Load2->getOperand(2)) {
+    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
+      return false;
+    SDValue Op2 = Load1->getOperand(2);
+    if (!isa<RegisterSDNode>(Op2) ||
+        cast<RegisterSDNode>(Op2)->getReg() != 0)
+      return 0;
+
+    // Now let's examine the displacements.
+    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
+        isa<ConstantSDNode>(Load2->getOperand(3))) {
+      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
+      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
+      return true;
+    }
+  }
+  return false;
+}
+
bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                           int64_t Offset1, int64_t Offset2,
                                           unsigned NumLoads) const {
  // Caller is expected to pass the smaller offset first.
  assert(Offset2 > Offset1);
  // Reject loads that are too far apart (more than 64*8 bytes, after the
  // truncating division) to be worth clustering.
  if ((Offset2 - Offset1) / 8 > 64)
    return false;

  unsigned Opc1 = Load1->getMachineOpcode();
  unsigned Opc2 = Load2->getMachineOpcode();
  if (Opc1 != Opc2)
    return false;  // FIXME: overly conservative?

  // x87 and MMX loads are never clustered.
  switch (Opc1) {
  default: break;
  case X86::LD_Fp32m:
  case X86::LD_Fp64m:
  case X86::LD_Fp80m:
  case X86::MMX_MOVD64rm:
  case X86::MMX_MOVQ64rm:
    return false;
  }

  // Limit the number of loads scheduled near each other based on register
  // pressure for the value type being loaded.
  EVT VT = Load1->getValueType(0);
  switch (VT.getSimpleVT().SimpleTy) {
  default: {
    // XMM registers. In 64-bit mode we can be a bit more aggressive since we
    // have 16 of them to play with.
    if (TM.getSubtargetImpl()->is64Bit()) {
      if (NumLoads >= 3)
        return false;
    } else if (NumLoads)
      return false;
    break;
  }
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
  case MVT::i64:
  case MVT::f32:
  case MVT::f64:
    // Integer and scalar FP loads: allow only one neighbor.
    if (NumLoads)
      return false;
  }

  return true;
}
+
+
+bool X86InstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1 && "Invalid X86 branch condition!");
+  X86::CondCode CC = static_cast<X86::CondCode>(Cond[0].getImm());
+  if (CC == X86::COND_NE_OR_P || CC == X86::COND_NP_OR_E)
+    return true;
+  Cond[0].setImm(GetOppositeBranchCondition(CC));
+  return false;
+}
+
+bool X86InstrInfo::
+isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  // FIXME: Return false for x87 stack register classes for now. We can't
+  // allow any loads of these registers before FpGet_ST0_80.
+  return !(RC == &X86::CCRRegClass || RC == &X86::RFP32RegClass ||
+           RC == &X86::RFP64RegClass || RC == &X86::RFP80RegClass);
+}
+
+
+/// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or higher)
+/// register?  e.g. r8, xmm8, xmm13, etc.
+bool X86InstrInfo::isX86_64ExtendedReg(unsigned RegNo) {
+  switch (RegNo) {
+  default: break;
+  case X86::R8:    case X86::R9:    case X86::R10:   case X86::R11:
+  case X86::R12:   case X86::R13:   case X86::R14:   case X86::R15:
+  case X86::R8D:   case X86::R9D:   case X86::R10D:  case X86::R11D:
+  case X86::R12D:  case X86::R13D:  case X86::R14D:  case X86::R15D:
+  case X86::R8W:   case X86::R9W:   case X86::R10W:  case X86::R11W:
+  case X86::R12W:  case X86::R13W:  case X86::R14W:  case X86::R15W:
+  case X86::R8B:   case X86::R9B:   case X86::R10B:  case X86::R11B:
+  case X86::R12B:  case X86::R13B:  case X86::R14B:  case X86::R15B:
+  case X86::XMM8:  case X86::XMM9:  case X86::XMM10: case X86::XMM11:
+  case X86::XMM12: case X86::XMM13: case X86::XMM14: case X86::XMM15:
+    return true;
+  }
+  return false;
+}
+
+
/// determineREX - Determine if the MachineInstr has to be encoded with a X86-64
/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
/// size, and 3) use of X86-64 extended registers.
///
/// The returned value is the low nibble of the REX byte: bit 3 = REX.W
/// (64-bit operand size), bit 2 = REX.R (extends the ModRM reg field),
/// bit 1 = REX.X (extends the SIB index field), bit 0 = REX.B (extends the
/// ModRM r/m, SIB base, or opcode reg field).  A return of 0 means no REX
/// prefix is required.
unsigned X86InstrInfo::determineREX(const MachineInstr &MI) {
  unsigned REX = 0;
  const TargetInstrDesc &Desc = MI.getDesc();

  // Pseudo instructions do not need REX prefix byte.
  if ((Desc.TSFlags & X86II::FormMask) == X86II::Pseudo)
    return 0;
  if (Desc.TSFlags & X86II::REX_W)
    REX |= 1 << 3;

  unsigned NumOps = Desc.getNumOperands();
  if (NumOps) {
    // Two-address instructions repeat the tied register; skip the duplicate
    // when scanning operands.
    bool isTwoAddr = NumOps > 1 &&
      Desc.getOperandConstraint(1, TOI::TIED_TO) != -1;

    // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
    unsigned i = isTwoAddr ? 1 : 0;
    for (unsigned e = NumOps; i != e; ++i) {
      const MachineOperand& MO = MI.getOperand(i);
      if (MO.isReg()) {
        unsigned Reg = MO.getReg();
        if (isX86_64NonExtLowByteReg(Reg))
          REX |= 0x40;
      }
    }

    // Which REX extension bit an extended register sets depends on the
    // instruction format, i.e. where each operand lands in the encoding.
    switch (Desc.TSFlags & X86II::FormMask) {
    case X86II::MRMInitReg:
      // Same register in both reg and r/m fields: set both REX.B and REX.R.
      if (isX86_64ExtendedReg(MI.getOperand(0)))
        REX |= (1 << 0) | (1 << 2);
      break;
    case X86II::MRMSrcReg: {
      if (isX86_64ExtendedReg(MI.getOperand(0)))
        REX |= 1 << 2;
      i = isTwoAddr ? 2 : 1;
      for (unsigned e = NumOps; i != e; ++i) {
        const MachineOperand& MO = MI.getOperand(i);
        if (isX86_64ExtendedReg(MO))
          REX |= 1 << 0;
      }
      break;
    }
    case X86II::MRMSrcMem: {
      if (isX86_64ExtendedReg(MI.getOperand(0)))
        REX |= 1 << 2;
      unsigned Bit = 0;
      i = isTwoAddr ? 2 : 1;
      // Memory operand registers (base, then index) map to REX.B then REX.X.
      for (; i != NumOps; ++i) {
        const MachineOperand& MO = MI.getOperand(i);
        if (MO.isReg()) {
          if (isX86_64ExtendedReg(MO))
            REX |= 1 << Bit;
          Bit++;
        }
      }
      break;
    }
    case X86II::MRM0m: case X86II::MRM1m:
    case X86II::MRM2m: case X86II::MRM3m:
    case X86II::MRM4m: case X86II::MRM5m:
    case X86II::MRM6m: case X86II::MRM7m:
    case X86II::MRMDestMem: {
      // The memory operand comes first; a trailing register operand (if any)
      // goes in the ModRM reg field (REX.R).
      unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
      i = isTwoAddr ? 1 : 0;
      if (NumOps > e && isX86_64ExtendedReg(MI.getOperand(e)))
        REX |= 1 << 2;
      unsigned Bit = 0;
      for (; i != e; ++i) {
        const MachineOperand& MO = MI.getOperand(i);
        if (MO.isReg()) {
          if (isX86_64ExtendedReg(MO))
            REX |= 1 << Bit;
          Bit++;
        }
      }
      break;
    }
    default: {
      // Remaining forms: operand 0 is in the r/m field (REX.B), trailing
      // register operands are in the reg field (REX.R).
      if (isX86_64ExtendedReg(MI.getOperand(0)))
        REX |= 1 << 0;
      i = isTwoAddr ? 2 : 1;
      for (unsigned e = NumOps; i != e; ++i) {
        const MachineOperand& MO = MI.getOperand(i);
        if (isX86_64ExtendedReg(MO))
          REX |= 1 << 2;
      }
      break;
    }
    }
  }
  return REX;
}
+
/// sizePCRelativeBlockAddress - This method returns the size of a PC
/// relative block address instruction
///
/// A PC-relative basic block reference is always emitted as a 4-byte field.
static unsigned sizePCRelativeBlockAddress() {
  return 4;
}
+
/// sizeGlobalAddress - Give the size in bytes of the emission of this global
/// address: 8 bytes for a 64-bit (dword-pair) reference, 4 otherwise.
static unsigned sizeGlobalAddress(bool dword) {
  if (dword)
    return 8;
  return 4;
}
+
/// sizeConstPoolAddress - Give the size in bytes of the emission of this
/// constant pool address: 8 for a 64-bit reference, 4 otherwise.
static unsigned sizeConstPoolAddress(bool dword) {
  return dword ? 8u : 4u;
}
+
/// sizeExternalSymbolAddress - Give the size in bytes of the emission of this
/// external symbol reference: 8 for a 64-bit reference, 4 otherwise.
static unsigned sizeExternalSymbolAddress(bool dword) {
  if (dword)
    return 8;
  else
    return 4;
}
+
/// sizeJumpTableAddress - Give the size in bytes of the emission of this jump
/// table address: 8 for a 64-bit reference, 4 otherwise.
static unsigned sizeJumpTableAddress(bool dword) {
  unsigned Size = 4;
  if (dword)
    Size = 8;
  return Size;
}
+
/// sizeConstant - An immediate constant is emitted literally; its encoded
/// size equals its byte size.
static unsigned sizeConstant(unsigned Size) {
  return Size;
}
+
/// sizeRegModRMByte - A register-form ModR/M byte is always one byte.
static unsigned sizeRegModRMByte(){
  return 1;
}
+
/// sizeSIBByte - A SIB (scale-index-base) byte is always one byte.
static unsigned sizeSIBByte(){
  return 1;
}
+
+static unsigned getDisplacementFieldSize(const MachineOperand *RelocOp) {
+  unsigned FinalSize = 0;
+  // If this is a simple integer displacement that doesn't require a relocation.
+  if (!RelocOp) {
+    FinalSize += sizeConstant(4);
+    return FinalSize;
+  }
+  
+  // Otherwise, this is something that requires a relocation.
+  if (RelocOp->isGlobal()) {
+    FinalSize += sizeGlobalAddress(false);
+  } else if (RelocOp->isCPI()) {
+    FinalSize += sizeConstPoolAddress(false);
+  } else if (RelocOp->isJTI()) {
+    FinalSize += sizeJumpTableAddress(false);
+  } else {
+    llvm_unreachable("Unknown value to relocate!");
+  }
+  return FinalSize;
+}
+
/// getMemModRMByteSize - Return the number of bytes needed to encode the
/// ModR/M byte, optional SIB byte, and displacement for the memory operand
/// beginning at operand index Op of MI.  Mirrors the encoding choices made
/// by the code emitter, but only counts bytes.
static unsigned getMemModRMByteSize(const MachineInstr &MI, unsigned Op,
                                    bool IsPIC, bool Is64BitMode) {
  // Operand layout: base(Op), scale(Op+1), index(Op+2), disp(Op+3),
  // segment(Op+4).  Op3 is the displacement operand.
  const MachineOperand &Op3 = MI.getOperand(Op+3);
  int DispVal = 0;
  const MachineOperand *DispForReloc = 0;
  unsigned FinalSize = 0;
  
  // Figure out what sort of displacement we have to handle here.
  // DispVal = 1 pessimistically marks "some non-zero displacement" so the
  // size estimate below assumes a disp32 field.
  if (Op3.isGlobal()) {
    DispForReloc = &Op3;
  } else if (Op3.isCPI()) {
    if (Is64BitMode || IsPIC) {
      DispForReloc = &Op3;
    } else {
      DispVal = 1;
    }
  } else if (Op3.isJTI()) {
    if (Is64BitMode || IsPIC) {
      DispForReloc = &Op3;
    } else {
      DispVal = 1; 
    }
  } else {
    DispVal = 1;
  }

  const MachineOperand &Base     = MI.getOperand(Op);
  const MachineOperand &IndexReg = MI.getOperand(Op+2);

  unsigned BaseReg = Base.getReg();

  // Is a SIB byte needed?  No SIB is possible only when there is no index
  // register and the base is not ESP (whose encoding is reused as the SIB
  // escape).
  if ((!Is64BitMode || DispForReloc || BaseReg != 0) &&
      IndexReg.getReg() == 0 &&
      (BaseReg == 0 || X86RegisterInfo::getX86RegNum(BaseReg) != N86::ESP)) {      
    if (BaseReg == 0) {  // Just a displacement?
      // Emit special case [disp32] encoding
      ++FinalSize; 
      FinalSize += getDisplacementFieldSize(DispForReloc);
    } else {
      unsigned BaseRegNo = X86RegisterInfo::getX86RegNum(BaseReg);
      if (!DispForReloc && DispVal == 0 && BaseRegNo != N86::EBP) {
        // Emit simple indirect register encoding... [EAX] f.e.
        ++FinalSize;
      // Be pessimistic and assume it's a disp32, not a disp8
      } else {
        // Emit the most general non-SIB encoding: [REG+disp32]
        ++FinalSize;
        FinalSize += getDisplacementFieldSize(DispForReloc);
      }
    }

  } else {  // We need a SIB byte, so start by outputting the ModR/M byte first
    assert(IndexReg.getReg() != X86::ESP &&
           IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");

    bool ForceDisp32 = false;
    if (BaseReg == 0 || DispForReloc) {
      // Emit the normal disp32 encoding.
      ++FinalSize;
      ForceDisp32 = true;
    } else {
      ++FinalSize;
    }

    FinalSize += sizeSIBByte();

    // Do we need to output a displacement?
    if (DispVal != 0 || ForceDisp32) {
      FinalSize += getDisplacementFieldSize(DispForReloc);
    }
  }
  return FinalSize;
}
+
+
/// GetInstSizeWithDesc - Compute the number of bytes MI would occupy when
/// encoded using the given instruction descriptor.  This mirrors, prefix by
/// prefix and operand by operand, the layout the X86 machine code emitter
/// produces, without emitting anything.
static unsigned GetInstSizeWithDesc(const MachineInstr &MI,
                                    const TargetInstrDesc *Desc,
                                    bool IsPIC, bool Is64BitMode) {
  
  unsigned Opcode = Desc->Opcode;
  unsigned FinalSize = 0;

  // Emit the lock opcode prefix as needed.
  if (Desc->TSFlags & X86II::LOCK) ++FinalSize;

  // Emit segment override opcode prefix as needed.
  switch (Desc->TSFlags & X86II::SegOvrMask) {
  case X86II::FS:
  case X86II::GS:
   ++FinalSize;
   break;
  default: llvm_unreachable("Invalid segment!");
  case 0: break;  // No segment override!
  }

  // Emit the repeat opcode prefix as needed.
  if ((Desc->TSFlags & X86II::Op0Mask) == X86II::REP) ++FinalSize;

  // Emit the operand size opcode prefix as needed.
  if (Desc->TSFlags & X86II::OpSize) ++FinalSize;

  // Emit the address size opcode prefix as needed.
  if (Desc->TSFlags & X86II::AdSize) ++FinalSize;

  // Account for mandatory opcode prefixes (F2/F3 etc.) and remember whether
  // a 0x0F escape byte must precede the opcode.
  bool Need0FPrefix = false;
  switch (Desc->TSFlags & X86II::Op0Mask) {
  case X86II::TB:  // Two-byte opcode prefix
  case X86II::T8:  // 0F 38
  case X86II::TA:  // 0F 3A
    Need0FPrefix = true;
    break;
  case X86II::TF: // F2 0F 38
    ++FinalSize;
    Need0FPrefix = true;
    break;
  case X86II::REP: break; // already handled.
  case X86II::XS:   // F3 0F
    ++FinalSize;
    Need0FPrefix = true;
    break;
  case X86II::XD:   // F2 0F
    ++FinalSize;
    Need0FPrefix = true;
    break;
  case X86II::D8: case X86II::D9: case X86II::DA: case X86II::DB:
  case X86II::DC: case X86II::DD: case X86II::DE: case X86II::DF:
    ++FinalSize;
    break; // Two-byte opcode prefix
  default: llvm_unreachable("Invalid prefix!");
  case 0: break;  // No prefix!
  }

  if (Is64BitMode) {
    // REX prefix
    unsigned REX = X86InstrInfo::determineREX(MI);
    if (REX)
      ++FinalSize;
  }

  // 0x0F escape code must be emitted just before the opcode.
  if (Need0FPrefix)
    ++FinalSize;

  // Second escape byte for three-byte opcodes (0F 38 / 0F 3A).
  switch (Desc->TSFlags & X86II::Op0Mask) {
  case X86II::T8:  // 0F 38
    ++FinalSize;
    break;
  case X86II::TA:  // 0F 3A
    ++FinalSize;
    break;
  case X86II::TF: // F2 0F 38
    ++FinalSize;
    break;
  }

  // If this is a two-address instruction, skip one of the register operands.
  unsigned NumOps = Desc->getNumOperands();
  unsigned CurOp = 0;
  if (NumOps > 1 && Desc->getOperandConstraint(1, TOI::TIED_TO) != -1)
    CurOp++;
  else if (NumOps > 2 && Desc->getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
    // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
    --NumOps;

  // Size the opcode and operands according to the instruction format.
  switch (Desc->TSFlags & X86II::FormMask) {
  default: llvm_unreachable("Unknown FormMask value in X86 MachineCodeEmitter!");
  case X86II::Pseudo:
    // Remember the current PC offset, this is the PIC relocation
    // base address.
    switch (Opcode) {
    default: 
      break;
    case TargetOpcode::INLINEASM: {
      // Inline asm size is estimated from the asm string itself.
      const MachineFunction *MF = MI.getParent()->getParent();
      const TargetInstrInfo &TII = *MF->getTarget().getInstrInfo();
      FinalSize += TII.getInlineAsmLength(MI.getOperand(0).getSymbolName(),
                                          *MF->getTarget().getMCAsmInfo());
      break;
    }
    case TargetOpcode::DBG_LABEL:
    case TargetOpcode::EH_LABEL:
      break;
    case TargetOpcode::IMPLICIT_DEF:
    case TargetOpcode::KILL:
    case X86::FP_REG_KILL:
      break;
    case X86::MOVPC32r: {
      // This emits the "call" portion of this pseudo instruction.
      ++FinalSize;
      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
      break;
    }
    }
    CurOp = NumOps;
    break;
  case X86II::RawFrm:
    ++FinalSize;

    if (CurOp != NumOps) {
      const MachineOperand &MO = MI.getOperand(CurOp++);
      if (MO.isMBB()) {
        FinalSize += sizePCRelativeBlockAddress();
      } else if (MO.isGlobal()) {
        FinalSize += sizeGlobalAddress(false);
      } else if (MO.isSymbol()) {
        FinalSize += sizeExternalSymbolAddress(false);
      } else if (MO.isImm()) {
        FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
      } else {
        llvm_unreachable("Unknown RawFrm operand!");
      }
    }
    break;

  case X86II::AddRegFrm:
    // Register is encoded in the opcode byte itself; no ModR/M byte.
    ++FinalSize;
    ++CurOp;
    
    if (CurOp != NumOps) {
      const MachineOperand &MO1 = MI.getOperand(CurOp++);
      unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
      if (MO1.isImm())
        FinalSize += sizeConstant(Size);
      else {
        // Only MOV64ri takes a full 64-bit (dword-pair) immediate address.
        bool dword = false;
        if (Opcode == X86::MOV64ri)
          dword = true; 
        if (MO1.isGlobal()) {
          FinalSize += sizeGlobalAddress(dword);
        } else if (MO1.isSymbol())
          FinalSize += sizeExternalSymbolAddress(dword);
        else if (MO1.isCPI())
          FinalSize += sizeConstPoolAddress(dword);
        else if (MO1.isJTI())
          FinalSize += sizeJumpTableAddress(dword);
      }
    }
    break;

  case X86II::MRMDestReg: {
    ++FinalSize; 
    FinalSize += sizeRegModRMByte();
    CurOp += 2;
    if (CurOp != NumOps) {
      ++CurOp;
      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
    }
    break;
  }
  case X86II::MRMDestMem: {
    ++FinalSize;
    FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
    CurOp +=  X86AddrNumOperands + 1;
    if (CurOp != NumOps) {
      ++CurOp;
      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
    }
    break;
  }

  case X86II::MRMSrcReg:
    ++FinalSize;
    FinalSize += sizeRegModRMByte();
    CurOp += 2;
    if (CurOp != NumOps) {
      ++CurOp;
      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
    }
    break;

  case X86II::MRMSrcMem: {
    int AddrOperands;
    if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
        Opcode == X86::LEA16r || Opcode == X86::LEA32r)
      AddrOperands = X86AddrNumOperands - 1; // No segment register
    else
      AddrOperands = X86AddrNumOperands;

    ++FinalSize;
    FinalSize += getMemModRMByteSize(MI, CurOp+1, IsPIC, Is64BitMode);
    CurOp += AddrOperands + 1;
    if (CurOp != NumOps) {
      ++CurOp;
      FinalSize += sizeConstant(X86II::getSizeOfImm(Desc->TSFlags));
    }
    break;
  }

  case X86II::MRM0r: case X86II::MRM1r:
  case X86II::MRM2r: case X86II::MRM3r:
  case X86II::MRM4r: case X86II::MRM5r:
  case X86II::MRM6r: case X86II::MRM7r:
    ++FinalSize;
    if (Desc->getOpcode() == X86::LFENCE ||
        Desc->getOpcode() == X86::MFENCE) {
      // Special handling of lfence and mfence;
      FinalSize += sizeRegModRMByte();
    } else if (Desc->getOpcode() == X86::MONITOR ||
               Desc->getOpcode() == X86::MWAIT) {
      // Special handling of monitor and mwait.
      FinalSize += sizeRegModRMByte() + 1; // +1 for the opcode.
    } else {
      ++CurOp;
      FinalSize += sizeRegModRMByte();
    }

    if (CurOp != NumOps) {
      const MachineOperand &MO1 = MI.getOperand(CurOp++);
      unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
      if (MO1.isImm())
        FinalSize += sizeConstant(Size);
      else {
        bool dword = false;
        if (Opcode == X86::MOV64ri32)
          dword = true;
        if (MO1.isGlobal()) {
          FinalSize += sizeGlobalAddress(dword);
        } else if (MO1.isSymbol())
          FinalSize += sizeExternalSymbolAddress(dword);
        else if (MO1.isCPI())
          FinalSize += sizeConstPoolAddress(dword);
        else if (MO1.isJTI())
          FinalSize += sizeJumpTableAddress(dword);
      }
    }
    break;

  case X86II::MRM0m: case X86II::MRM1m:
  case X86II::MRM2m: case X86II::MRM3m:
  case X86II::MRM4m: case X86II::MRM5m:
  case X86II::MRM6m: case X86II::MRM7m: {
    
    ++FinalSize;
    FinalSize += getMemModRMByteSize(MI, CurOp, IsPIC, Is64BitMode);
    CurOp += X86AddrNumOperands;

    if (CurOp != NumOps) {
      const MachineOperand &MO = MI.getOperand(CurOp++);
      unsigned Size = X86II::getSizeOfImm(Desc->TSFlags);
      if (MO.isImm())
        FinalSize += sizeConstant(Size);
      else {
        bool dword = false;
        if (Opcode == X86::MOV64mi32)
          dword = true;
        if (MO.isGlobal()) {
          FinalSize += sizeGlobalAddress(dword);
        } else if (MO.isSymbol())
          FinalSize += sizeExternalSymbolAddress(dword);
        else if (MO.isCPI())
          FinalSize += sizeConstPoolAddress(dword);
        else if (MO.isJTI())
          FinalSize += sizeJumpTableAddress(dword);
      }
    }
    break;
  }

  case X86II::MRMInitReg:
    ++FinalSize;
    // Duplicate register, used by things like MOV8r0 (aka xor reg,reg).
    FinalSize += sizeRegModRMByte();
    ++CurOp;
    break;
  }

  // If any operands were left unaccounted for, the tables above are out of
  // sync with this instruction; fail loudly rather than return a bad size.
  if (!Desc->isVariadic() && CurOp != NumOps) {
    std::string msg;
    raw_string_ostream Msg(msg);
    Msg << "Cannot determine size: " << MI;
    llvm_report_error(Msg.str());
  }
  

  return FinalSize;
}
+
+
+unsigned X86InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const TargetInstrDesc &Desc = MI->getDesc();
+  bool IsPIC = TM.getRelocationModel() == Reloc::PIC_;
+  bool Is64BitMode = TM.getSubtargetImpl()->is64Bit();
+  unsigned Size = GetInstSizeWithDesc(*MI, &Desc, IsPIC, Is64BitMode);
+  if (Desc.getOpcode() == X86::MOVPC32r)
+    Size += GetInstSizeWithDesc(*MI, &get(X86::POP32r), IsPIC, Is64BitMode);
+  return Size;
+}
+
+/// getGlobalBaseReg - Return a virtual register initialized with the
+/// the global base register value. Output instructions required to
+/// initialize the register in the function entry block, if necessary.
+///
+unsigned X86InstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+  assert(!TM.getSubtarget<X86Subtarget>().is64Bit() &&
+         "X86-64 PIC uses RIP relative addressing");
+
+  X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
+  unsigned GlobalBaseReg = X86FI->getGlobalBaseReg();
+  if (GlobalBaseReg != 0)
+    return GlobalBaseReg;
+
+  // Insert the set of GlobalBaseReg into the first MBB of the function
+  MachineBasicBlock &FirstMBB = MF->front();
+  MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+  DebugLoc DL = FirstMBB.findDebugLoc(MBBI);
+  MachineRegisterInfo &RegInfo = MF->getRegInfo();
+  unsigned PC = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+  
+  const TargetInstrInfo *TII = TM.getInstrInfo();
+  // Operand of MovePCtoStack is completely ignored by asm printer. It's
+  // only used in JIT code emission as displacement to pc.
+  BuildMI(FirstMBB, MBBI, DL, TII->get(X86::MOVPC32r), PC).addImm(0);
+  
+  // If we're using vanilla 'GOT' PIC style, we should use relative addressing
+  // not to pc, but to _GLOBAL_OFFSET_TABLE_ external.
+  if (TM.getSubtarget<X86Subtarget>().isPICStyleGOT()) {
+    GlobalBaseReg = RegInfo.createVirtualRegister(X86::GR32RegisterClass);
+    // Generate addl $__GLOBAL_OFFSET_TABLE_ + [.-piclabel], %some_register
+    BuildMI(FirstMBB, MBBI, DL, TII->get(X86::ADD32ri), GlobalBaseReg)
+      .addReg(PC).addExternalSymbol("_GLOBAL_OFFSET_TABLE_",
+                                    X86II::MO_GOT_ABSOLUTE_ADDRESS);
+  } else {
+    GlobalBaseReg = PC;
+  }
+
+  X86FI->setGlobalBaseReg(GlobalBaseReg);
+  return GlobalBaseReg;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
new file mode 100644
index 0000000..a6b3863
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -0,0 +1,703 @@
+//===- X86InstrInfo.h - X86 Instruction Information ------------*- C++ -*- ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86INSTRUCTIONINFO_H
+#define X86INSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+  class X86RegisterInfo;
+  class X86TargetMachine;
+
namespace X86 {
  // X86 specific condition code. These correspond to X86_*_COND in
  // X86InstrInfo.td. They must be kept in sync. The suffixes follow the
  // standard x86 condition mnemonics (COND_A <-> 'a', COND_NE <-> 'ne', ...).
  enum CondCode {
    COND_A  = 0,
    COND_AE = 1,
    COND_B  = 2,
    COND_BE = 3,
    COND_E  = 4,
    COND_G  = 5,
    COND_GE = 6,
    COND_L  = 7,
    COND_LE = 8,
    COND_NE = 9,
    COND_NO = 10,
    COND_NP = 11,
    COND_NS = 12,
    COND_O  = 13,
    COND_P  = 14,
    COND_S  = 15,

    // Artificial condition codes. These are used by AnalyzeBranch
    // to indicate a block terminated with two conditional branches to
    // the same location. This occurs in code using FCMP_OEQ or FCMP_UNE,
    // which can't be represented on x86 with a single condition. These
    // are never used in MachineInstrs.
    COND_NE_OR_P,
    COND_NP_OR_E,

    COND_INVALID
  };
    
  // Turn condition code into conditional branch opcode.
  unsigned GetCondBranchFromCond(CondCode CC);
  
  /// GetOppositeBranchCondition - Return the inverse of the specified cond,
  /// e.g. turning COND_E to COND_NE.
  CondCode GetOppositeBranchCondition(X86::CondCode CC);

}
+  
/// X86II - This namespace holds all of the target specific flags that
/// instruction info tracks. The TOF values below are stored in the
/// target-flag field of machine operands (see e.g. isGlobalStubReference).
///
namespace X86II {
  /// Target Operand Flag enum.
  enum TOF {
    //===------------------------------------------------------------------===//
    // X86 Specific MachineOperand flags.
    
    /// MO_NO_FLAG - No flag; a plain, direct reference to the symbol.
    MO_NO_FLAG,
    
    /// MO_GOT_ABSOLUTE_ADDRESS - On a symbol operand, this represents a
    /// relocation of:
    ///    SYMBOL_LABEL + [. - PICBASELABEL]
    MO_GOT_ABSOLUTE_ADDRESS,
    
    /// MO_PIC_BASE_OFFSET - On a symbol operand this indicates that the
    /// immediate should get the value of the symbol minus the PIC base label:
    ///    SYMBOL_LABEL - PICBASELABEL
    MO_PIC_BASE_OFFSET,

    /// MO_GOT - On a symbol operand this indicates that the immediate is the
    /// offset to the GOT entry for the symbol name from the base of the GOT.
    ///
    /// See the X86-64 ELF ABI supplement for more details.
    ///    SYMBOL_LABEL @GOT
    MO_GOT,
    
    /// MO_GOTOFF - On a symbol operand this indicates that the immediate is
    /// the offset to the location of the symbol name from the base of the GOT.
    ///
    /// See the X86-64 ELF ABI supplement for more details.
    ///    SYMBOL_LABEL @GOTOFF
    MO_GOTOFF,
    
    /// MO_GOTPCREL - On a symbol operand this indicates that the immediate is
    /// offset to the GOT entry for the symbol name from the current code
    /// location.
    ///
    /// See the X86-64 ELF ABI supplement for more details.
    ///    SYMBOL_LABEL @GOTPCREL
    MO_GOTPCREL,
    
    /// MO_PLT - On a symbol operand this indicates that the immediate is
    /// offset to the PLT entry of symbol name from the current code location.
    ///
    /// See the X86-64 ELF ABI supplement for more details.
    ///    SYMBOL_LABEL @PLT
    MO_PLT,
    
    /// MO_TLSGD - On a symbol operand this indicates that the immediate is
    /// some TLS offset.
    ///
    /// See 'ELF Handling for Thread-Local Storage' for more details.
    ///    SYMBOL_LABEL @TLSGD
    MO_TLSGD,
    
    /// MO_GOTTPOFF - On a symbol operand this indicates that the immediate is
    /// some TLS offset.
    ///
    /// See 'ELF Handling for Thread-Local Storage' for more details.
    ///    SYMBOL_LABEL @GOTTPOFF
    MO_GOTTPOFF,
   
    /// MO_INDNTPOFF - On a symbol operand this indicates that the immediate is
    /// some TLS offset.
    ///
    /// See 'ELF Handling for Thread-Local Storage' for more details.
    ///    SYMBOL_LABEL @INDNTPOFF
    MO_INDNTPOFF,
    
    /// MO_TPOFF - On a symbol operand this indicates that the immediate is
    /// some TLS offset.
    ///
    /// See 'ELF Handling for Thread-Local Storage' for more details.
    ///    SYMBOL_LABEL @TPOFF
    MO_TPOFF,
    
    /// MO_NTPOFF - On a symbol operand this indicates that the immediate is
    /// some TLS offset.
    ///
    /// See 'ELF Handling for Thread-Local Storage' for more details.
    ///    SYMBOL_LABEL @NTPOFF
    MO_NTPOFF,
    
    /// MO_DLLIMPORT - On a symbol operand "FOO", this indicates that the
    /// reference is actually to the "__imp_FOO" symbol.  This is used for
    /// dllimport linkage on windows.
    MO_DLLIMPORT,
    
    /// MO_DARWIN_STUB - On a symbol operand "FOO", this indicates that the
    /// reference is actually to the "FOO$stub" symbol.  This is used for calls
    /// and jumps to external functions on Tiger and before.
    MO_DARWIN_STUB,
    
    /// MO_DARWIN_NONLAZY - On a symbol operand "FOO", this indicates that the
    /// reference is actually to the "FOO$non_lazy_ptr" symbol, which is a
    /// non-PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
    MO_DARWIN_NONLAZY,

    /// MO_DARWIN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this indicates
    /// that the reference is actually to "FOO$non_lazy_ptr - PICBASE", which is
    /// a PIC-base-relative reference to a non-hidden dyld lazy pointer stub.
    MO_DARWIN_NONLAZY_PIC_BASE,
    
    /// MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE - On a symbol operand "FOO", this
    /// indicates that the reference is actually to "FOO$non_lazy_ptr -PICBASE",
    /// which is a PIC-base-relative reference to a hidden dyld lazy pointer
    /// stub.
    MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE
  };
}
+
+/// isGlobalStubReference - Return true if the specified TargetFlag operand is
+/// a reference to a stub for a global, not the global itself.
+inline static bool isGlobalStubReference(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_DLLIMPORT: // dllimport stub.
+  case X86II::MO_GOTPCREL:  // rip-relative GOT reference.
+  case X86II::MO_GOT:       // normal GOT reference.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_NONLAZY:                 // Normal $non_lazy_ptr ref.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Hidden $non_lazy_ptr ref.
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// isGlobalRelativeToPICBase - Return true if the specified global value
+/// reference is relative to a 32-bit PIC base (X86ISD::GlobalBaseReg).  If this
+/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
+inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
+  switch (TargetFlag) {
+  case X86II::MO_GOTOFF:                         // isPICStyleGOT: local global.
+  case X86II::MO_GOT:                            // isPICStyleGOT: other global.
+  case X86II::MO_PIC_BASE_OFFSET:                // Darwin local global.
+  case X86II::MO_DARWIN_NONLAZY_PIC_BASE:        // Darwin/32 external global.
+  case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: // Darwin/32 hidden global.
+    return true;
+  default:
+    return false;
+  }
+}
+ 
/// X86II - This second block of the X86II namespace holds the TSFlags bit
/// definitions used to encode X86 instruction forms, prefixes, FP
/// classification, and immediate sizes for each instruction.
///
namespace X86II {
  enum {
    //===------------------------------------------------------------------===//
    // Instruction encodings.  These are the standard/most common forms for X86
    // instructions.
    //

    // PseudoFrm - This represents an instruction that is a pseudo instruction
    // or one that has not been implemented yet.  It is illegal to code generate
    // it, but tolerated for intermediate implementation stages.
    Pseudo         = 0,

    /// Raw - This form is for instructions that don't have any operands, so
    /// they are just a fixed opcode value, like 'leave'.
    RawFrm         = 1,

    /// AddRegFrm - This form is used for instructions like 'push r32' that have
    /// their one register operand added to their opcode.
    AddRegFrm      = 2,

    /// MRMDestReg - This form is used for instructions that use the Mod/RM byte
    /// to specify a destination, which in this case is a register.
    ///
    MRMDestReg     = 3,

    /// MRMDestMem - This form is used for instructions that use the Mod/RM byte
    /// to specify a destination, which in this case is memory.
    ///
    MRMDestMem     = 4,

    /// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
    /// to specify a source, which in this case is a register.
    ///
    MRMSrcReg      = 5,

    /// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
    /// to specify a source, which in this case is memory.
    ///
    MRMSrcMem      = 6,

    /// MRM[0-7][rm] - These forms are used to represent instructions that use
    /// a Mod/RM byte, and use the middle field to hold extended opcode
    /// information.  In the intel manual these are represented as /0, /1, ...
    ///

    // First, instructions that operate on a register r/m operand...
    MRM0r = 16,  MRM1r = 17,  MRM2r = 18,  MRM3r = 19, // Format /0 /1 /2 /3
    MRM4r = 20,  MRM5r = 21,  MRM6r = 22,  MRM7r = 23, // Format /4 /5 /6 /7

    // Next, instructions that operate on a memory r/m operand...
    MRM0m = 24,  MRM1m = 25,  MRM2m = 26,  MRM3m = 27, // Format /0 /1 /2 /3
    MRM4m = 28,  MRM5m = 29,  MRM6m = 30,  MRM7m = 31, // Format /4 /5 /6 /7

    // MRMInitReg - This form is used for instructions whose source and
    // destinations are the same register.
    MRMInitReg = 32,

    // FormMask - Mask covering all of the form values above (bits 0-5).
    FormMask       = 63,

    //===------------------------------------------------------------------===//
    // Actual flags...

    // OpSize - Set if this instruction requires an operand size prefix (0x66),
    // which most often indicates that the instruction operates on 16 bit data
    // instead of 32 bit data.
    OpSize      = 1 << 6,

    // AdSize - Set if this instruction requires an address size prefix (0x67),
    // which most often indicates that the instruction addresses 16 bit
    // addresses instead of 32 bit addresses (or 32 bit addresses in 64 bit
    // mode).
    AdSize      = 1 << 7,

    //===------------------------------------------------------------------===//
    // Op0Mask - There are several prefix bytes that are used to form two byte
    // opcodes.  These are currently 0x0F, 0xF3, and 0xD8-0xDF.  This mask is
    // used to obtain the setting of this field.  If no bits in this field are
    // set, there is no prefix byte for obtaining a multibyte opcode.
    //
    Op0Shift    = 8,
    Op0Mask     = 0xF << Op0Shift,

    // TB - TwoByte - Set if this instruction has a two byte opcode, which
    // starts with a 0x0F byte before the real opcode.
    TB          = 1 << Op0Shift,

    // REP - The 0xF3 prefix byte indicating repetition of the following
    // instruction.
    REP         = 2 << Op0Shift,

    // D8-DF - These escape opcodes are used by the floating point unit.  These
    // values must remain sequential.
    D8 = 3 << Op0Shift,   D9 = 4 << Op0Shift,
    DA = 5 << Op0Shift,   DB = 6 << Op0Shift,
    DC = 7 << Op0Shift,   DD = 8 << Op0Shift,
    DE = 9 << Op0Shift,   DF = 10 << Op0Shift,

    // XS, XD - These prefix codes are for single and double precision scalar
    // floating point operations performed in the SSE registers.
    XD = 11 << Op0Shift,  XS = 12 << Op0Shift,

    // T8, TA - Prefix after the 0x0F prefix.
    T8 = 13 << Op0Shift,  TA = 14 << Op0Shift,
    
    // TF - Prefix before and after 0x0F
    TF = 15 << Op0Shift,

    //===------------------------------------------------------------------===//
    // REX_W - REX prefixes are instruction prefixes used in 64-bit mode.
    // They are used to specify GPRs and SSE registers, 64-bit operand size,
    // etc. We only care about REX.W and REX.R bits and only the former is
    // statically determined.
    //
    REXShift    = 12,
    REX_W       = 1 << REXShift,

    //===------------------------------------------------------------------===//
    // This three-bit field describes the size of an immediate operand.  Zero is
    // unused so that we can tell if we forgot to set a value.
    ImmShift = 13,
    ImmMask  = 7 << ImmShift,
    Imm8     = 1 << ImmShift,
    Imm16    = 2 << ImmShift,
    Imm32    = 3 << ImmShift,
    Imm64    = 4 << ImmShift,

    //===------------------------------------------------------------------===//
    // FP Instruction Classification...  Zero is non-fp instruction.

    // FPTypeMask - Mask for all of the FP types...
    FPTypeShift = 16,
    FPTypeMask  = 7 << FPTypeShift,

    // NotFP - The default, set for instructions that do not use FP registers.
    NotFP      = 0 << FPTypeShift,

    // ZeroArgFP - 0 arg FP instruction which implicitly pushes ST(0), f.e. fld0
    ZeroArgFP  = 1 << FPTypeShift,

    // OneArgFP - 1 arg FP instructions which implicitly read ST(0), such as fst
    OneArgFP   = 2 << FPTypeShift,

    // OneArgFPRW - 1 arg FP instruction which implicitly read ST(0) and write a
    // result back to ST(0).  For example, fcos, fsqrt, etc.
    //
    OneArgFPRW = 3 << FPTypeShift,

    // TwoArgFP - 2 arg FP instructions which implicitly read ST(0), and an
    // explicit argument, storing the result to either ST(0) or the implicit
    // argument.  For example: fadd, fsub, fmul, etc...
    TwoArgFP   = 4 << FPTypeShift,

    // CompareFP - 2 arg FP instructions which implicitly read ST(0) and an
    // explicit argument, but have no destination.  Example: fucom, fucomi, ...
    CompareFP  = 5 << FPTypeShift,

    // CondMovFP - "2 operand" floating point conditional move instructions.
    CondMovFP  = 6 << FPTypeShift,

    // SpecialFP - Special instruction forms.  Dispatch by opcode explicitly.
    SpecialFP  = 7 << FPTypeShift,

    // Lock prefix
    LOCKShift = 19,
    LOCK = 1 << LOCKShift,

    // Segment override prefixes. Currently we just need ability to address
    // stuff in gs and fs segments.
    SegOvrShift = 20,
    SegOvrMask  = 3 << SegOvrShift,
    FS          = 1 << SegOvrShift,
    GS          = 2 << SegOvrShift,

    // Bits 22 -> 23 are unused
    OpcodeShift   = 24,
    OpcodeMask    = 0xFF << OpcodeShift
  };
  
  // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
  // specified machine instruction, extracted from the top byte of TSFlags.
  //
  static inline unsigned char getBaseOpcodeFor(unsigned TSFlags) {
    return TSFlags >> X86II::OpcodeShift;
  }
  
  /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
  /// of the specified instruction, returning the size in bytes.
  static inline unsigned getSizeOfImm(unsigned TSFlags) {
    switch (TSFlags & X86II::ImmMask) {
    default: assert(0 && "Unknown immediate size");
    case X86II::Imm8:   return 1;
    case X86II::Imm16:  return 2;
    case X86II::Imm32:  return 4;
    case X86II::Imm64:  return 8;
    }
  }
}
+
// X86AddrNumOperands - The number of MachineOperands that together form an X86
// memory reference: base register, scale, index register, displacement, and
// segment register (see isLeaMem/isMem below).
const int X86AddrNumOperands = 5;
+
+inline static bool isScale(const MachineOperand &MO) {
+  return MO.isImm() &&
+    (MO.getImm() == 1 || MO.getImm() == 2 ||
+     MO.getImm() == 4 || MO.getImm() == 8);
+}
+
+inline static bool isLeaMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+4 <= MI->getNumOperands() &&
+    MI->getOperand(Op  ).isReg() && isScale(MI->getOperand(Op+1)) &&
+    MI->getOperand(Op+2).isReg() &&
+    (MI->getOperand(Op+3).isImm() ||
+     MI->getOperand(Op+3).isGlobal() ||
+     MI->getOperand(Op+3).isCPI() ||
+     MI->getOperand(Op+3).isJTI());
+}
+
+inline static bool isMem(const MachineInstr *MI, unsigned Op) {
+  if (MI->getOperand(Op).isFI()) return true;
+  return Op+5 <= MI->getNumOperands() &&
+    MI->getOperand(Op+4).isReg() &&
+    isLeaMem(MI, Op);
+}
+
/// X86InstrInfo - The X86 implementation of TargetInstrInfo, providing
/// instruction analysis, branch handling, spill/reload emission, and
/// load/store folding for the X86 backend.
class X86InstrInfo : public TargetInstrInfoImpl {
  X86TargetMachine &TM;
  const X86RegisterInfo RI;
  
  /// RegOp2MemOpTable2Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
  /// RegOp2MemOpTable2 - Load / store folding opcode maps.
  ///
  DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable2Addr;
  DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable0;
  DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable1;
  DenseMap<unsigned*, std::pair<unsigned,unsigned> > RegOp2MemOpTable2;
  
  /// MemOp2RegOpTable - Load / store unfolding opcode map.
  ///
  DenseMap<unsigned*, std::pair<unsigned, unsigned> > MemOp2RegOpTable;
  
public:
  explicit X86InstrInfo(X86TargetMachine &tm);

  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
  /// such, whenever a client has an instance of instruction info, it should
  /// always be able to get register info as well (through this method).
  ///
  virtual const X86RegisterInfo &getRegisterInfo() const { return RI; }

  /// Return true if the instruction is a register to register move and return
  /// the source and dest operands and their sub-register indices by reference.
  virtual bool isMoveInstr(const MachineInstr &MI,
                           unsigned &SrcReg, unsigned &DstReg,
                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;

  /// isCoalescableExtInstr - Return true if the instruction is a "coalescable"
  /// extension instruction. That is, it's like a copy where it's legal for the
  /// source to overlap the destination. e.g. X86::MOVSX64rr32. If this returns
  /// true, then it's expected the pre-extension value is available as a subreg
  /// of the result register. This also returns the sub-register index in
  /// SubIdx.
  virtual bool isCoalescableExtInstr(const MachineInstr &MI,
                                     unsigned &SrcReg, unsigned &DstReg,
                                     unsigned &SubIdx) const;

  unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
  /// isLoadFromStackSlotPostFE - Check for post-frame ptr elimination
  /// stack locations as well.  This uses a heuristic so it isn't
  /// reliable for correctness.
  unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
                                     int &FrameIndex) const;

  /// hasLoadFromStackSlot - If the specified machine instruction has
  /// a load from a stack slot, return true along with the FrameIndex
  /// of the loaded stack slot and the machine mem operand containing
  /// the reference.  If not, return false.  Unlike
  /// isLoadFromStackSlot, this returns true for any instructions that
  /// load from the stack.  This is a hint only and may not catch all
  /// cases.
  bool hasLoadFromStackSlot(const MachineInstr *MI,
                            const MachineMemOperand *&MMO,
                            int &FrameIndex) const;

  unsigned isStoreToStackSlot(const MachineInstr *MI, int &FrameIndex) const;
  /// isStoreToStackSlotPostFE - Check for post-frame ptr elimination
  /// stack locations as well.  This uses a heuristic so it isn't
  /// reliable for correctness.
  unsigned isStoreToStackSlotPostFE(const MachineInstr *MI,
                                    int &FrameIndex) const;

  /// hasStoreToStackSlot - If the specified machine instruction has a
  /// store to a stack slot, return true along with the FrameIndex of
  /// the stored stack slot and the machine mem operand containing the
  /// reference.  If not, return false.  Unlike isStoreToStackSlot,
  /// this returns true for any instructions that store to the
  /// stack.  This is a hint only and may not catch all cases.
  bool hasStoreToStackSlot(const MachineInstr *MI,
                           const MachineMemOperand *&MMO,
                           int &FrameIndex) const;

  bool isReallyTriviallyReMaterializable(const MachineInstr *MI,
                                         AliasAnalysis *AA) const;
  void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
                     unsigned DestReg, unsigned SubIdx,
                     const MachineInstr *Orig,
                     const TargetRegisterInfo *TRI) const;

  /// convertToThreeAddress - This method must be implemented by targets that
  /// set the M_CONVERTIBLE_TO_3_ADDR flag.  When this flag is set, the target
  /// may be able to convert a two-address instruction into a true
  /// three-address instruction on demand.  This allows the X86 target (for
  /// example) to convert ADD and SHL instructions into LEA instructions if they
  /// would require register copies due to two-addressness.
  ///
  /// This method returns a null pointer if the transformation cannot be
  /// performed, otherwise it returns the new instruction.
  ///
  virtual MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
                                              MachineBasicBlock::iterator &MBBI,
                                              LiveVariables *LV) const;

  /// commuteInstruction - We have a few instructions that must be hacked on to
  /// commute them.
  ///
  virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;

  // Branch analysis.
  virtual bool isUnpredicatedTerminator(const MachineInstr* MI) const;
  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                             MachineBasicBlock *&FBB,
                             SmallVectorImpl<MachineOperand> &Cond,
                             bool AllowModify) const;
  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                                MachineBasicBlock *FBB,
                            const SmallVectorImpl<MachineOperand> &Cond) const;
  virtual bool copyRegToReg(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
                            unsigned DestReg, unsigned SrcReg,
                            const TargetRegisterClass *DestRC,
                            const TargetRegisterClass *SrcRC) const;
  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator MI,
                                   unsigned SrcReg, bool isKill, int FrameIndex,
                                   const TargetRegisterClass *RC) const;

  virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
                              SmallVectorImpl<MachineOperand> &Addr,
                              const TargetRegisterClass *RC,
                              MachineInstr::mmo_iterator MMOBegin,
                              MachineInstr::mmo_iterator MMOEnd,
                              SmallVectorImpl<MachineInstr*> &NewMIs) const;

  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator MI,
                                    unsigned DestReg, int FrameIndex,
                                    const TargetRegisterClass *RC) const;

  virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
                               SmallVectorImpl<MachineOperand> &Addr,
                               const TargetRegisterClass *RC,
                               MachineInstr::mmo_iterator MMOBegin,
                               MachineInstr::mmo_iterator MMOEnd,
                               SmallVectorImpl<MachineInstr*> &NewMIs) const;
  
  virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator MI,
                                 const std::vector<CalleeSavedInfo> &CSI) const;

  virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                 const std::vector<CalleeSavedInfo> &CSI) const;
  
  /// foldMemoryOperand - If this target supports it, fold a load or store of
  /// the specified stack slot into the specified machine instruction for the
  /// specified operand(s).  If this is possible, the target should perform the
  /// folding and return true, otherwise it should return false.  If it folds
  /// the instruction, it is likely that the MachineInstruction the iterator
  /// references has been changed.
  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                              MachineInstr* MI,
                                           const SmallVectorImpl<unsigned> &Ops,
                                              int FrameIndex) const;

  /// foldMemoryOperand - Same as the previous version except it allows folding
  /// of any load and store from / to any address, not just from a specific
  /// stack slot.
  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                              MachineInstr* MI,
                                           const SmallVectorImpl<unsigned> &Ops,
                                              MachineInstr* LoadMI) const;

  /// canFoldMemoryOperand - Returns true if the specified load / store
  /// folding is possible.
  virtual bool canFoldMemoryOperand(const MachineInstr*,
                                    const SmallVectorImpl<unsigned> &) const;

  /// unfoldMemoryOperand - Separate a single instruction which folded a load or
  /// a store or a load and a store into two or more instruction. If this is
  /// possible, returns true as well as the new instructions by reference.
  virtual bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
                           unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
                           SmallVectorImpl<MachineInstr*> &NewMIs) const;

  virtual bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
                           SmallVectorImpl<SDNode*> &NewNodes) const;

  /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
  /// instruction after load / store are unfolded from an instruction of the
  /// specified opcode. It returns zero if the specified unfolding is not
  /// possible. If LoadRegIndex is non-null, it is filled in with the operand
  /// index of the operand which will hold the register holding the loaded
  /// value.
  virtual unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
                                      bool UnfoldLoad, bool UnfoldStore,
                                      unsigned *LoadRegIndex = 0) const;
  
  /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
  /// to determine if two loads are loading from the same base address. It
  /// should only return true if the base pointers are the same and the
  /// only differences between the two addresses are the offset. It also returns
  /// the offsets by reference.
  virtual bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
                                       int64_t &Offset1, int64_t &Offset2) const;

  /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
  /// be scheduled together. On some targets if two loads are loading from
  /// addresses in the same cache line, it's better if they are scheduled
  /// together. This function takes two integers that represent the load offsets
  /// from the common base address. It returns true if it decides it's desirable
  /// to schedule the two loads together. "NumLoads" is the number of loads that
  /// have already been scheduled after Load1.
  virtual bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                       int64_t Offset1, int64_t Offset2,
                                       unsigned NumLoads) const;

  virtual
  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;

  /// isSafeToMoveRegClassDefs - Return true if it's safe to move a machine
  /// instruction that defines the specified register class.
  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;

  /// isX86_64NonExtLowByteReg - Return true if the register is one of the
  /// low-byte registers only addressable with a REX prefix (spl/bpl/sil/dil).
  static bool isX86_64NonExtLowByteReg(unsigned reg) {
    return (reg == X86::SPL || reg == X86::BPL ||
          reg == X86::SIL || reg == X86::DIL);
  }
  
  static bool isX86_64ExtendedReg(const MachineOperand &MO) {
    if (!MO.isReg()) return false;
    return isX86_64ExtendedReg(MO.getReg());
  }
  static unsigned determineREX(const MachineInstr &MI);

  /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
  /// higher) register?  e.g. r8, xmm8, xmm13, etc.
  static bool isX86_64ExtendedReg(unsigned RegNo);

  /// GetInstSize - Returns the size of the specified MachineInstr.
  ///
  virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;

  /// getGlobalBaseReg - Return a virtual register initialized with the
  /// global base register value. Output instructions required to
  /// initialize the register in the function entry block, if necessary.
  ///
  unsigned getGlobalBaseReg(MachineFunction *MF) const;

private:
  MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
                                              MachineFunction::iterator &MFI,
                                              MachineBasicBlock::iterator &MBBI,
                                              LiveVariables *LV) const;

  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
                                     MachineInstr* MI,
                                     unsigned OpNum,
                                     const SmallVectorImpl<MachineOperand> &MOs,
                                     unsigned Size, unsigned Alignment) const;

  /// isFrameOperand - Return true and the FrameIndex if the specified
  /// operand and following operands form a reference to the stack frame.
  bool isFrameOperand(const MachineInstr *MI, unsigned int Op,
                      int &FrameIndex) const;
};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
new file mode 100644
index 0000000..f0b4239
--- /dev/null
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -0,0 +1,5242 @@
+//===- X86InstrInfo.td - Main X86 Instruction Definition ---*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 instruction set, defining the instructions, and
+// properties of the instructions which are needed for code generation, machine
+// code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// X86 specific DAG Nodes.
+//
+
+def SDTIntShiftDOp: SDTypeProfile<1, 3,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                   SDTCisInt<0>, SDTCisInt<3>]>;
+
+def SDTX86CmpTest : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+
+def SDTX86Cmov    : SDTypeProfile<1, 4,
+                                  [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                   SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+
+// Unary and binary operator instructions that set EFLAGS as a side-effect.
+def SDTUnaryArithWithFlags  : SDTypeProfile<1, 1,
+                                            [SDTCisInt<0>]>;
+def SDTBinaryArithWithFlags : SDTypeProfile<1, 2,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisInt<0>]>;
+def SDTX86BrCond  : SDTypeProfile<0, 3,
+                                  [SDTCisVT<0, OtherVT>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86SetCC   : SDTypeProfile<1, 2,
+                                  [SDTCisVT<0, i8>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+def SDTX86SetCC_C : SDTypeProfile<1, 2,
+                                  [SDTCisInt<0>,
+                                   SDTCisVT<1, i8>, SDTCisVT<2, i32>]>;
+
+def SDTX86cas : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisInt<1>, 
+                                     SDTCisVT<2, i8>]>;
+def SDTX86cas8 : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+
+def SDTX86atomicBinary : SDTypeProfile<2, 3, [SDTCisInt<0>, SDTCisInt<1>,
+                                SDTCisPtrTy<2>, SDTCisInt<3>,SDTCisInt<4>]>;
+def SDTX86Ret     : SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>;
+
+def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>;
+def SDT_X86CallSeqEnd   : SDCallSeqEnd<[SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32>]>;
+
+def SDT_X86Call   : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
+
+def SDT_X86VASTART_SAVE_XMM_REGS : SDTypeProfile<0, -1, [SDTCisVT<0, i8>,
+                                                         SDTCisVT<1, iPTR>,
+                                                         SDTCisVT<2, iPTR>]>;
+
+def SDTX86RepStr  : SDTypeProfile<0, 1, [SDTCisVT<0, OtherVT>]>;
+
+def SDTX86RdTsc   : SDTypeProfile<0, 0, []>;
+
+def SDTX86Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+def SDT_X86TLSADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86SegmentBaseAddress : SDTypeProfile<1, 1, [SDTCisPtrTy<0>]>;
+
+def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
+
+def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
+
+def X86bsf     : SDNode<"X86ISD::BSF",      SDTIntUnaryOp>;
+def X86bsr     : SDNode<"X86ISD::BSR",      SDTIntUnaryOp>;
+def X86shld    : SDNode<"X86ISD::SHLD",     SDTIntShiftDOp>;
+def X86shrd    : SDNode<"X86ISD::SHRD",     SDTIntShiftDOp>;
+
+def X86cmp     : SDNode<"X86ISD::CMP" ,     SDTX86CmpTest>;
+
+def X86bt      : SDNode<"X86ISD::BT",       SDTX86CmpTest>;
+
+def X86cmov    : SDNode<"X86ISD::CMOV",     SDTX86Cmov>;
+def X86brcond  : SDNode<"X86ISD::BRCOND",   SDTX86BrCond,
+                        [SDNPHasChain]>;
+def X86setcc   : SDNode<"X86ISD::SETCC",    SDTX86SetCC>;
+def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
+
+def X86cas : SDNode<"X86ISD::LCMPXCHG_DAG", SDTX86cas,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+                         SDNPMayLoad]>;
+def X86cas8 : SDNode<"X86ISD::LCMPXCHG8_DAG", SDTX86cas8,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+                         SDNPMayLoad]>;
+def X86AtomAdd64 : SDNode<"X86ISD::ATOMADD64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore, 
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSub64 : SDNode<"X86ISD::ATOMSUB64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore, 
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomOr64 : SDNode<"X86ISD::ATOMOR64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore, 
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomXor64 : SDNode<"X86ISD::ATOMXOR64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore, 
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomAnd64 : SDNode<"X86ISD::ATOMAND64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore, 
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomNand64 : SDNode<"X86ISD::ATOMNAND64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore, 
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86AtomSwap64 : SDNode<"X86ISD::ATOMSWAP64_DAG", SDTX86atomicBinary,
+                        [SDNPHasChain, SDNPMayStore, 
+                         SDNPMayLoad, SDNPMemOperand]>;
+def X86retflag : SDNode<"X86ISD::RET_FLAG", SDTX86Ret,
+                        [SDNPHasChain, SDNPOptInFlag]>;
+
+def X86vastart_save_xmm_regs :
+                 SDNode<"X86ISD::VASTART_SAVE_XMM_REGS",
+                        SDT_X86VASTART_SAVE_XMM_REGS,
+                        [SDNPHasChain]>;
+
+def X86callseq_start :
+                 SDNode<"ISD::CALLSEQ_START", SDT_X86CallSeqStart,
+                        [SDNPHasChain, SDNPOutFlag]>;
+def X86callseq_end :
+                 SDNode<"ISD::CALLSEQ_END",   SDT_X86CallSeqEnd,
+                        [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;       
+
+def X86call    : SDNode<"X86ISD::CALL",     SDT_X86Call,
+                        [SDNPHasChain, SDNPOutFlag, SDNPOptInFlag]>;
+
+def X86rep_stos: SDNode<"X86ISD::REP_STOS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore]>;
+def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
+                        [SDNPHasChain, SDNPInFlag, SDNPOutFlag, SDNPMayStore,
+                         SDNPMayLoad]>;
+
+def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG",SDTX86RdTsc,
+                        [SDNPHasChain, SDNPOutFlag, SDNPSideEffect]>;
+
+def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
+def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
+
+def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR,
+                        [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+def X86SegmentBaseAddress : SDNode<"X86ISD::SegmentBaseAddress",
+                                 SDT_X86SegmentBaseAddress, []>;
+
+def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET,
+                        [SDNPHasChain]>;
+
+def X86tcret : SDNode<"X86ISD::TC_RETURN", SDT_X86TCRET, 
+                        [SDNPHasChain,  SDNPOptInFlag]>;
+
+def X86add_flag  : SDNode<"X86ISD::ADD",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86sub_flag  : SDNode<"X86ISD::SUB",  SDTBinaryArithWithFlags>;
+def X86smul_flag : SDNode<"X86ISD::SMUL", SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86umul_flag : SDNode<"X86ISD::UMUL", SDTUnaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86inc_flag  : SDNode<"X86ISD::INC",  SDTUnaryArithWithFlags>;
+def X86dec_flag  : SDNode<"X86ISD::DEC",  SDTUnaryArithWithFlags>;
+def X86or_flag   : SDNode<"X86ISD::OR",   SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
+                          [SDNPCommutative]>;
+
+def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
+
+//===----------------------------------------------------------------------===//
+// X86 Operand Definitions.
+//
+
+// A version of ptr_rc which excludes SP, ESP, and RSP. This is used for
+// the index operand of an address, to conform to x86 encoding restrictions.
+def ptr_rc_nosp : PointerLikeRegClass<1>;
+
+// *mem - Operand definitions for the funky X86 addressing mode operands.
+//
+def X86MemAsmOperand : AsmOperandClass {
+  let Name = "Mem";
+  let SuperClass = ?;
+}
+def X86AbsMemAsmOperand : AsmOperandClass {
+  let Name = "AbsMem";
+  let SuperClass = X86MemAsmOperand;
+}
+def X86NoSegMemAsmOperand : AsmOperandClass {
+  let Name = "NoSegMem";
+  let SuperClass = X86MemAsmOperand;
+}
+class X86MemOperand<string printMethod> : Operand<iPTR> {
+  let PrintMethod = printMethod;
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+def opaque32mem : X86MemOperand<"printopaquemem">;
+def opaque48mem : X86MemOperand<"printopaquemem">;
+def opaque80mem : X86MemOperand<"printopaquemem">;
+def opaque512mem : X86MemOperand<"printopaquemem">;
+
+def i8mem   : X86MemOperand<"printi8mem">;
+def i16mem  : X86MemOperand<"printi16mem">;
+def i32mem  : X86MemOperand<"printi32mem">;
+def i64mem  : X86MemOperand<"printi64mem">;
+def i128mem : X86MemOperand<"printi128mem">;
+//def i256mem : X86MemOperand<"printi256mem">;
+def f32mem  : X86MemOperand<"printf32mem">;
+def f64mem  : X86MemOperand<"printf64mem">;
+def f80mem  : X86MemOperand<"printf80mem">;
+def f128mem : X86MemOperand<"printf128mem">;
+//def f256mem : X86MemOperand<"printf256mem">;
+
+// A version of i8mem for use on x86-64 that uses GR64_NOREX instead of
+// plain GR64, so that it doesn't potentially require a REX prefix.
+def i8mem_NOREX : Operand<i64> {
+  let PrintMethod = "printi8mem";
+  let MIOperandInfo = (ops GR64_NOREX, i8imm, GR64_NOREX_NOSP, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+def lea32mem : Operand<i32> {
+  let PrintMethod = "printlea32mem";
+  let MIOperandInfo = (ops GR32, i8imm, GR32_NOSP, i32imm);
+  let ParserMatchClass = X86NoSegMemAsmOperand;
+}
+
+let ParserMatchClass = X86AbsMemAsmOperand,
+    PrintMethod = "print_pcrel_imm" in {
+def i32imm_pcrel : Operand<i32>;
+
+def offset8 : Operand<i64>;
+def offset16 : Operand<i64>;
+def offset32 : Operand<i64>;
+def offset64 : Operand<i64>;
+
+// Branch targets have OtherVT type and print as pc-relative values.
+def brtarget : Operand<OtherVT>;
+def brtarget8 : Operand<OtherVT>;
+
+}
+
+def SSECC : Operand<i8> {
+  let PrintMethod = "printSSECC";
+}
+
+def ImmSExt8AsmOperand : AsmOperandClass {
+  let Name = "ImmSExt8";
+  let SuperClass = ImmAsmOperand;
+}
+
+// A couple of more descriptive operand definitions.
+// 16-bits but only 8 bits are significant.
+def i16i8imm  : Operand<i16> {
+  let ParserMatchClass = ImmSExt8AsmOperand;
+}
+// 32-bits but only 8 bits are significant.
+def i32i8imm  : Operand<i32> {
+  let ParserMatchClass = ImmSExt8AsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// X86 Complex Pattern Definitions.
+//
+
+// Define X86 specific addressing mode.
+def addr      : ComplexPattern<iPTR, 5, "SelectAddr", [], []>;
+def lea32addr : ComplexPattern<i32, 4, "SelectLEAAddr",
+                               [add, sub, mul, X86mul_imm, shl, or, frameindex],
+                               []>;
+def tls32addr : ComplexPattern<i32, 4, "SelectTLSADDRAddr",
+                               [tglobaltlsaddr], []>;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Predicate Definitions.
+//
+// Each Predicate wraps a C++ expression, evaluated during instruction
+// selection, that gates which instruction definitions may be matched.
+// They query either subtarget features (SSE level, 64-bit mode, ...) or
+// target-machine settings (code model, relocation model, OptForSize).
+def HasMMX       : Predicate<"Subtarget->hasMMX()">;
+def HasSSE1      : Predicate<"Subtarget->hasSSE1()">;
+def HasSSE2      : Predicate<"Subtarget->hasSSE2()">;
+def HasSSE3      : Predicate<"Subtarget->hasSSE3()">;
+def HasSSSE3     : Predicate<"Subtarget->hasSSSE3()">;
+def HasSSE41     : Predicate<"Subtarget->hasSSE41()">;
+def HasSSE42     : Predicate<"Subtarget->hasSSE42()">;
+def HasSSE4A     : Predicate<"Subtarget->hasSSE4A()">;
+def HasAVX       : Predicate<"Subtarget->hasAVX()">;
+def HasFMA3      : Predicate<"Subtarget->hasFMA3()">;
+def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
+def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
+def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
+def In32BitMode  : Predicate<"!Subtarget->is64Bit()">;
+def In64BitMode  : Predicate<"Subtarget->is64Bit()">;
+def IsWin64      : Predicate<"Subtarget->isTargetWin64()">;
+def NotWin64     : Predicate<"!Subtarget->isTargetWin64()">;
+def SmallCode    : Predicate<"TM.getCodeModel() == CodeModel::Small">;
+def KernelCode   : Predicate<"TM.getCodeModel() == CodeModel::Kernel">;
+// The two-string predicates below rely on C++ adjacent string-literal
+// concatenation; the halves join into a single expression.
+def FarData      : Predicate<"TM.getCodeModel() != CodeModel::Small &&"
+                             "TM.getCodeModel() != CodeModel::Kernel">;
+def NearData     : Predicate<"TM.getCodeModel() == CodeModel::Small ||"
+                             "TM.getCodeModel() == CodeModel::Kernel">;
+def IsStatic     : Predicate<"TM.getRelocationModel() == Reloc::Static">;
+def OptForSize   : Predicate<"OptForSize">;
+def OptForSpeed  : Predicate<"!OptForSize">;
+def FastBTMem    : Predicate<"!Subtarget->isBTMemSlow()">;
+def CallImmAddr  : Predicate<"Subtarget->IsLegalToCallImmediateAddr(TM)">;
+
+//===----------------------------------------------------------------------===//
+// X86 Instruction Format Definitions.
+//
+
+include "X86InstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// Pattern fragments...
+//
+
+// X86 specific condition code. These correspond to CondCode in
+// X86InstrInfo.h. They must be kept in synch.
+// NOTE(review): the i8 immediates below are assumed to be the numeric
+// values of the X86::CondCode enumerators -- confirm against X86InstrInfo.h
+// before renumbering either side.
+def X86_COND_A   : PatLeaf<(i8 0)>;  // alt. COND_NBE
+def X86_COND_AE  : PatLeaf<(i8 1)>;  // alt. COND_NC
+def X86_COND_B   : PatLeaf<(i8 2)>;  // alt. COND_C
+def X86_COND_BE  : PatLeaf<(i8 3)>;  // alt. COND_NA
+def X86_COND_E   : PatLeaf<(i8 4)>;  // alt. COND_Z
+def X86_COND_G   : PatLeaf<(i8 5)>;  // alt. COND_NLE
+def X86_COND_GE  : PatLeaf<(i8 6)>;  // alt. COND_NL
+def X86_COND_L   : PatLeaf<(i8 7)>;  // alt. COND_NGE
+def X86_COND_LE  : PatLeaf<(i8 8)>;  // alt. COND_NG
+def X86_COND_NE  : PatLeaf<(i8 9)>;  // alt. COND_NZ
+def X86_COND_NO  : PatLeaf<(i8 10)>;
+def X86_COND_NP  : PatLeaf<(i8 11)>; // alt. COND_PO
+def X86_COND_NS  : PatLeaf<(i8 12)>;
+def X86_COND_O   : PatLeaf<(i8 13)>;
+def X86_COND_P   : PatLeaf<(i8 14)>; // alt. COND_PE
+def X86_COND_S   : PatLeaf<(i8 15)>;
+
+def i16immSExt8  : PatLeaf<(i16 imm), [{
+  // i16immSExt8 predicate - True if the 16-bit immediate fits in a 8-bit
+  // sign extended field.
+  return (int16_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+def i32immSExt8  : PatLeaf<(i32 imm), [{
+  // i32immSExt8 predicate - True if the 32-bit immediate fits in a 8-bit
+  // sign extended field.
+  return (int32_t)N->getZExtValue() == (int8_t)N->getZExtValue();
+}]>;
+
+// Helper fragments for loads.
+// It's always safe to treat a anyext i16 load as a i32 load if the i16 is
+// known to be 32-bit aligned or better. Ditto for i8 to i16.
+def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (const Value *Src = LD->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi16_anyext : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)),
+[{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (const Value *Src = LD->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 2 && !LD->isVolatile();
+  return false;
+}]>;
+
+def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (const Value *Src = LD->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 4 && !LD->isVolatile();
+  return false;
+}]>;
+
+def nvloadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (const Value *Src = LD->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  if (LD->isVolatile())
+    return false;
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  if (ExtType == ISD::NON_EXTLOAD)
+    return true;
+  if (ExtType == ISD::EXTLOAD)
+    return LD->getAlignment() >= 4;
+  return false;
+}]>;
+
+def gsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      return PT->getAddressSpace() == 256;
+  return false;
+}]>;
+
+def fsload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      return PT->getAddressSpace() == 257;
+  return false;
+}]>;
+
+def loadi8  : PatFrag<(ops node:$ptr), (i8  (load node:$ptr)), [{
+  if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  return true;
+}]>;
+def loadi64 : PatFrag<(ops node:$ptr), (i64 (load node:$ptr)), [{
+  if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  return true;
+}]>;
+
+def loadf32 : PatFrag<(ops node:$ptr), (f32 (load node:$ptr)), [{
+  if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  return true;
+}]>;
+def loadf64 : PatFrag<(ops node:$ptr), (f64 (load node:$ptr)), [{
+  if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  return true;
+}]>;
+def loadf80 : PatFrag<(ops node:$ptr), (f80 (load node:$ptr)), [{
+  if (const Value *Src = cast<LoadSDNode>(N)->getSrcValue())
+    if (const PointerType *PT = dyn_cast<PointerType>(Src->getType()))
+      if (PT->getAddressSpace() > 255)
+        return false;
+  return true;
+}]>;
+
+def sextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (sextloadi8 node:$ptr))>;
+def sextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (sextloadi8 node:$ptr))>;
+def sextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (sextloadi16 node:$ptr))>;
+
+def zextloadi8i1   : PatFrag<(ops node:$ptr), (i8  (zextloadi1 node:$ptr))>;
+def zextloadi16i1  : PatFrag<(ops node:$ptr), (i16 (zextloadi1 node:$ptr))>;
+def zextloadi32i1  : PatFrag<(ops node:$ptr), (i32 (zextloadi1 node:$ptr))>;
+def zextloadi16i8  : PatFrag<(ops node:$ptr), (i16 (zextloadi8 node:$ptr))>;
+def zextloadi32i8  : PatFrag<(ops node:$ptr), (i32 (zextloadi8 node:$ptr))>;
+def zextloadi32i16 : PatFrag<(ops node:$ptr), (i32 (zextloadi16 node:$ptr))>;
+
+def extloadi8i1    : PatFrag<(ops node:$ptr), (i8  (extloadi1 node:$ptr))>;
+def extloadi16i1   : PatFrag<(ops node:$ptr), (i16 (extloadi1 node:$ptr))>;
+def extloadi32i1   : PatFrag<(ops node:$ptr), (i32 (extloadi1 node:$ptr))>;
+def extloadi16i8   : PatFrag<(ops node:$ptr), (i16 (extloadi8 node:$ptr))>;
+def extloadi32i8   : PatFrag<(ops node:$ptr), (i32 (extloadi8 node:$ptr))>;
+def extloadi32i16  : PatFrag<(ops node:$ptr), (i32 (extloadi16 node:$ptr))>;
+
+
+// An 'and' node with a single use.
+def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'srl' node with a single use.
+def srl_su : PatFrag<(ops node:$lhs, node:$rhs), (srl node:$lhs, node:$rhs), [{
+  return N->hasOneUse();
+}]>;
+// An 'trunc' node with a single use.
+def trunc_su : PatFrag<(ops node:$src), (trunc node:$src), [{
+  return N->hasOneUse();
+}]>;
+
+// Treat an 'or' node as an 'add' if the or'ed bits are known to be zero:
+// when no bit position can be 1 in both inputs there is no carry, so
+// 'or' and 'add' produce identical results.
+def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
+  // Common case: RHS is a constant.  Ask the DAG whether every set bit of
+  // the constant is known to be zero in LHS.
+  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+    return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
+  else {
+    // General case: compute known bits of both operands over the full width.
+    unsigned BitWidth = N->getValueType(0).getScalarType().getSizeInBits();
+    APInt Mask = APInt::getAllOnesValue(BitWidth);
+    APInt KnownZero0, KnownOne0;
+    CurDAG->ComputeMaskedBits(N->getOperand(0), Mask, KnownZero0, KnownOne0, 0);
+    APInt KnownZero1, KnownOne1;
+    CurDAG->ComputeMaskedBits(N->getOperand(1), Mask, KnownZero1, KnownOne1, 0);
+    // ~KnownZero is the set of bits that *may* be 1; require the two
+    // may-be-one sets to be disjoint.
+    return (~KnownZero0 & ~KnownZero1) == 0;
+  }
+}]>;
+
+// 'shld' and 'shrd' instruction patterns. Note that even though these have
+// the srl and shl in their patterns, the C++ code must still check for them,
+// because predicates are tested before children nodes are explored.
+
+// shrd: matches (or (srl src1, amt1), (shl src2, amt2)) where both shift
+// amounts are constants that sum to the value's bit width -- the
+// double-shift computed by the SHRD instruction.
+def shrd : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+                   (or (srl node:$src1, node:$amt1),
+                       (shl node:$src2, node:$amt2)), [{
+  assert(N->getOpcode() == ISD::OR);
+  // Re-verify the child opcodes explicitly (see the note above: predicates
+  // run before children are matched), then require amt1 == BitWidth - amt2.
+  return N->getOperand(0).getOpcode() == ISD::SRL &&
+         N->getOperand(1).getOpcode() == ISD::SHL &&
+         isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+         isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+         N->getOperand(0).getConstantOperandVal(1) ==
+         N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
+
+// shld: mirror image of shrd -- (or (shl src1, amt1), (srl src2, amt2)) with
+// complementary constant amounts; the double-shift computed by SHLD.
+def shld : PatFrag<(ops node:$src1, node:$amt1, node:$src2, node:$amt2),
+                   (or (shl node:$src1, node:$amt1),
+                       (srl node:$src2, node:$amt2)), [{
+  assert(N->getOpcode() == ISD::OR);
+  return N->getOperand(0).getOpcode() == ISD::SHL &&
+         N->getOperand(1).getOpcode() == ISD::SRL &&
+         isa<ConstantSDNode>(N->getOperand(0).getOperand(1)) &&
+         isa<ConstantSDNode>(N->getOperand(1).getOperand(1)) &&
+         N->getOperand(0).getConstantOperandVal(1) ==
+         N->getValueSizeInBits(0) - N->getOperand(1).getConstantOperandVal(1);
+}]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction list...
+//
+
+// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
+// a stack adjustment and the codegen must know that they may modify the stack
+// pointer before prolog-epilog rewriting occurs.
+// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
+// sub / add which can clobber EFLAGS.
+let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt),
+                           "#ADJCALLSTACKDOWN",
+                           [(X86callseq_start timm:$amt)]>,
+                          Requires<[In32BitMode]>;
+def ADJCALLSTACKUP32   : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+                           "#ADJCALLSTACKUP",
+                           [(X86callseq_end timm:$amt1, timm:$amt2)]>,
+                          Requires<[In32BitMode]>;
+}
+
+// x86-64 va_start lowering magic.
+let usesCustomInserter = 1 in
+def VASTART_SAVE_XMM_REGS : I<0, Pseudo,
+                              (outs),
+                              (ins GR8:$al,
+                                   i64imm:$regsavefi, i64imm:$offset,
+                                   variable_ops),
+                              "#VASTART_SAVE_XMM_REGS $al, $regsavefi, $offset",
+                              [(X86vastart_save_xmm_regs GR8:$al,
+                                                         imm:$regsavefi,
+                                                         imm:$offset)]>;
+
+// Nop
+let neverHasSideEffects = 1 in {
+  def NOOP : I<0x90, RawFrm, (outs), (ins), "nop", []>;
+  def NOOPW : I<0x1f, MRM0m, (outs), (ins i16mem:$zero),
+                "nop{w}\t$zero", []>, TB, OpSize;
+  def NOOPL : I<0x1f, MRM0m, (outs), (ins i32mem:$zero),
+                "nop{l}\t$zero", []>, TB;
+}
+
+// Trap
+def INT3 : I<0xcc, RawFrm, (outs), (ins), "int\t3", []>;
+def INT : I<0xcd, RawFrm, (outs), (ins i8imm:$trap), "int\t$trap", []>;
+def IRET16 : I<0xcf, RawFrm, (outs), (ins), "iret{w}", []>, OpSize;
+def IRET32 : I<0xcf, RawFrm, (outs), (ins), "iret{l}", []>;
+
+// PIC base construction.  This expands to code that looks like this:
+//     call  $next_inst
+//     popl %destreg"
+let neverHasSideEffects = 1, isNotDuplicable = 1, Uses = [ESP] in
+  def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
+                      "", []>;
+
+//===----------------------------------------------------------------------===//
+//  Control Flow Instructions...
+//
+
+// Return instructions.
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, FPForm = SpecialFP, FPFormBits = SpecialFP.Value in {
+  def RET    : I   <0xC3, RawFrm, (outs), (ins variable_ops),
+                    "ret",
+                    [(X86retflag 0)]>;
+  def RETI   : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
+                    "ret\t$amt",
+                    [(X86retflag timm:$amt)]>;
+  def LRET   : I   <0xCB, RawFrm, (outs), (ins),
+                    "lret", []>;
+  def LRETI  : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
+                    "lret\t$amt", []>;
+}
+
+// All branches are RawFrm, Void, Branch, and Terminators
+let isBranch = 1, isTerminator = 1 in
+  class IBr<bits<8> opcode, dag ins, string asm, list<dag> pattern> :
+        I<opcode, RawFrm, (outs), ins, asm, pattern>;
+
+let isBranch = 1, isBarrier = 1 in {
+  def JMP : IBr<0xE9, (ins brtarget:$dst), "jmp\t$dst", [(br bb:$dst)]>;
+  def JMP8 : IBr<0xEB, (ins brtarget8:$dst), "jmp\t$dst", []>;
+}
+
+// Indirect branches
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+  def JMP32r     : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst",
+                     [(brind GR32:$dst)]>;
+  def JMP32m     : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst",
+                     [(brind (loadi32 addr:$dst))]>;
+                     
+  def FARJMP16i  : Iseg16<0xEA, RawFrm, (outs), 
+                          (ins i16imm:$seg, i16imm:$off),
+                          "ljmp{w}\t$seg, $off", []>, OpSize;
+  def FARJMP32i  : Iseg32<0xEA, RawFrm, (outs),
+                          (ins i16imm:$seg, i32imm:$off),
+                          "ljmp{l}\t$seg, $off", []>;                     
+
+  def FARJMP16m  : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), 
+                     "ljmp{w}\t{*}$dst", []>, OpSize;
+  def FARJMP32m  : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
+                     "ljmp{l}\t{*}$dst", []>;
+}
+
+// Conditional branches
+let Uses = [EFLAGS] in {
+// Short conditional jumps
+def JO8   : IBr<0x70, (ins brtarget8:$dst), "jo\t$dst", []>;
+def JNO8  : IBr<0x71, (ins brtarget8:$dst), "jno\t$dst", []>;
+def JB8   : IBr<0x72, (ins brtarget8:$dst), "jb\t$dst", []>;
+def JAE8  : IBr<0x73, (ins brtarget8:$dst), "jae\t$dst", []>;
+def JE8   : IBr<0x74, (ins brtarget8:$dst), "je\t$dst", []>;
+def JNE8  : IBr<0x75, (ins brtarget8:$dst), "jne\t$dst", []>;
+def JBE8  : IBr<0x76, (ins brtarget8:$dst), "jbe\t$dst", []>;
+def JA8   : IBr<0x77, (ins brtarget8:$dst), "ja\t$dst", []>;
+def JS8   : IBr<0x78, (ins brtarget8:$dst), "js\t$dst", []>;
+def JNS8  : IBr<0x79, (ins brtarget8:$dst), "jns\t$dst", []>;
+def JP8   : IBr<0x7A, (ins brtarget8:$dst), "jp\t$dst", []>;
+def JNP8  : IBr<0x7B, (ins brtarget8:$dst), "jnp\t$dst", []>;
+def JL8   : IBr<0x7C, (ins brtarget8:$dst), "jl\t$dst", []>;
+def JGE8  : IBr<0x7D, (ins brtarget8:$dst), "jge\t$dst", []>;
+def JLE8  : IBr<0x7E, (ins brtarget8:$dst), "jle\t$dst", []>;
+def JG8   : IBr<0x7F, (ins brtarget8:$dst), "jg\t$dst", []>;
+
+def JCXZ8 : IBr<0xE3, (ins brtarget8:$dst), "jcxz\t$dst", []>;
+
+def JE  : IBr<0x84, (ins brtarget:$dst), "je\t$dst",
+              [(X86brcond bb:$dst, X86_COND_E, EFLAGS)]>, TB;
+def JNE : IBr<0x85, (ins brtarget:$dst), "jne\t$dst",
+              [(X86brcond bb:$dst, X86_COND_NE, EFLAGS)]>, TB;
+def JL  : IBr<0x8C, (ins brtarget:$dst), "jl\t$dst",
+              [(X86brcond bb:$dst, X86_COND_L, EFLAGS)]>, TB;
+def JLE : IBr<0x8E, (ins brtarget:$dst), "jle\t$dst",
+              [(X86brcond bb:$dst, X86_COND_LE, EFLAGS)]>, TB;
+def JG  : IBr<0x8F, (ins brtarget:$dst), "jg\t$dst",
+              [(X86brcond bb:$dst, X86_COND_G, EFLAGS)]>, TB;
+def JGE : IBr<0x8D, (ins brtarget:$dst), "jge\t$dst",
+              [(X86brcond bb:$dst, X86_COND_GE, EFLAGS)]>, TB;
+
+def JB  : IBr<0x82, (ins brtarget:$dst), "jb\t$dst",
+              [(X86brcond bb:$dst, X86_COND_B, EFLAGS)]>, TB;
+def JBE : IBr<0x86, (ins brtarget:$dst), "jbe\t$dst",
+              [(X86brcond bb:$dst, X86_COND_BE, EFLAGS)]>, TB;
+def JA  : IBr<0x87, (ins brtarget:$dst), "ja\t$dst",
+              [(X86brcond bb:$dst, X86_COND_A, EFLAGS)]>, TB;
+def JAE : IBr<0x83, (ins brtarget:$dst), "jae\t$dst",
+              [(X86brcond bb:$dst, X86_COND_AE, EFLAGS)]>, TB;
+
+def JS  : IBr<0x88, (ins brtarget:$dst), "js\t$dst",
+              [(X86brcond bb:$dst, X86_COND_S, EFLAGS)]>, TB;
+def JNS : IBr<0x89, (ins brtarget:$dst), "jns\t$dst",
+              [(X86brcond bb:$dst, X86_COND_NS, EFLAGS)]>, TB;
+def JP  : IBr<0x8A, (ins brtarget:$dst), "jp\t$dst",
+              [(X86brcond bb:$dst, X86_COND_P, EFLAGS)]>, TB;
+def JNP : IBr<0x8B, (ins brtarget:$dst), "jnp\t$dst",
+              [(X86brcond bb:$dst, X86_COND_NP, EFLAGS)]>, TB;
+def JO  : IBr<0x80, (ins brtarget:$dst), "jo\t$dst",
+              [(X86brcond bb:$dst, X86_COND_O, EFLAGS)]>, TB;
+def JNO : IBr<0x81, (ins brtarget:$dst), "jno\t$dst",
+              [(X86brcond bb:$dst, X86_COND_NO, EFLAGS)]>, TB;
+} // Uses = [EFLAGS]
+
+// Loop instructions (opcodes E0-E2).  The branch target is the single
+// *input* operand; these define no results, so the outs list is empty.
+//
+// BUG FIX: the original definitions passed (ins ...) where the I<> class
+// expects the outs dag and (outs) where it expects ins -- every other
+// instruction in this file uses I<opcode, Form, (outs ...), (ins ...), ...>.
+// With the operands swapped, the brtarget8 destination was declared as an
+// output operand.  Swap them to the conventional order.
+
+def LOOP   : I<0xE2, RawFrm, (outs), (ins brtarget8:$dst), "loop\t$dst", []>;
+def LOOPE  : I<0xE1, RawFrm, (outs), (ins brtarget8:$dst), "loope\t$dst", []>;
+def LOOPNE : I<0xE0, RawFrm, (outs), (ins brtarget8:$dst), "loopne\t$dst", []>;
+
+//===----------------------------------------------------------------------===//
+//  Call Instructions...
+//
+let isCall = 1 in
+  // All calls clobber the non-callee saved registers. ESP is marked as
+  // a use to prevent stack-pointer assignments that appear immediately
+  // before calls from potentially appearing dead. Uses for argument
+  // registers are added manually.
+  let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+              MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+              XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+              XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+      Uses = [ESP] in {
+    // Direct pcrel call (0xE8) and indirect calls through a register or
+    // memory operand (0xFF /2).  variable_ops carries the argument regs.
+    def CALLpcrel32 : Ii32<0xE8, RawFrm,
+                           (outs), (ins i32imm_pcrel:$dst,variable_ops),
+                           "call\t$dst", []>;
+    def CALL32r     : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
+                        "call\t{*}$dst", [(X86call GR32:$dst)]>;
+    def CALL32m     : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
+                        "call\t{*}$dst", [(X86call (loadi32 addr:$dst))]>;
+  
+    // Far calls: 0x9A takes an explicit seg:off immediate pair; the
+    // 0xFF /3 forms load the seg:off pair from memory (opaque 32/48-bit
+    // operands for 16- and 32-bit offsets respectively).
+    def FARCALL16i  : Iseg16<0x9A, RawFrm, (outs), 
+                             (ins i16imm:$seg, i16imm:$off),
+                             "lcall{w}\t$seg, $off", []>, OpSize;
+    def FARCALL32i  : Iseg32<0x9A, RawFrm, (outs),
+                             (ins i16imm:$seg, i32imm:$off),
+                             "lcall{l}\t$seg, $off", []>;
+                             
+    def FARCALL16m  : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
+                        "lcall{w}\t{*}$dst", []>, OpSize;
+    def FARCALL32m  : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
+                        "lcall{l}\t{*}$dst", []>;
+  }
+
+// Constructing a stack frame.
+
+// ENTER $len, $lvl: sets up a stack frame with $len bytes of locals and
+// $lvl nesting levels.  No pattern — assembler/disassembler use only.
+def ENTER : I<0xC8, RawFrm, (outs), (ins i16imm:$len, i8imm:$lvl),
+              "enter\t$len, $lvl", []>;
+
+// Tail call stuff.
+
+// TCRETURNdi/TCRETURNri are pseudos carrying the callee and a stack
+// adjustment offset; TAILJMPd/r/m are the real jumps that tail calls are
+// ultimately emitted as.  All are marked isCall+isTerminator+isReturn+
+// isBarrier: they transfer control away and nothing may follow them.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNdi : I<0, Pseudo, (outs), 
+                   (ins i32imm:$dst, i32imm:$offset, variable_ops),
+                 "#TC_RETURN $dst $offset",
+                 []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+def TCRETURNri : I<0, Pseudo, (outs), 
+                   (ins GR32:$dst, i32imm:$offset, variable_ops),
+                 "#TC_RETURN $dst $offset",
+                 []>;
+
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPd : IBr<0xE9, (ins i32imm_pcrel:$dst, variable_ops),
+                 "jmp\t$dst  # TAILCALL",
+                 []>;
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32:$dst, variable_ops), 
+                   "jmp{l}\t{*}$dst  # TAILCALL",
+                 []>;     
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in
+  def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem:$dst, variable_ops),
+                   "jmp\t{*}$dst  # TAILCALL", []>;
+
+//===----------------------------------------------------------------------===//
+//  Miscellaneous Instructions...
+//
+// LEAVE tears down the current stack frame (reads and writes both EBP and
+// ESP, and loads the saved frame pointer from the stack — hence mayLoad).
+let Defs = [EBP, ESP], Uses = [EBP, ESP], mayLoad = 1, neverHasSideEffects=1 in
+def LEAVE    : I<0xC9, RawFrm,
+                 (outs), (ins), "leave", []>;
+
+// Population count (F3 0F B8): writes the number of set bits in the source
+// into $dst.  popcnt also writes EFLAGS (ZF reflects a zero source; the
+// other arithmetic flags are cleared), so EFLAGS must be listed as a def —
+// consistent with the other flag-writing defs in this file.
+let Defs = [EFLAGS] in {
+def POPCNT16rr : I<0xB8, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                   "popcnt{w}\t{$src, $dst|$dst, $src}", []>, OpSize, XS;
+def POPCNT16rm : I<0xB8, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                   "popcnt{w}\t{$src, $dst|$dst, $src}", []>, OpSize, XS;
+def POPCNT32rr : I<0xB8, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                   "popcnt{l}\t{$src, $dst|$dst, $src}", []>, XS;
+def POPCNT32rm : I<0xB8, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                   "popcnt{l}\t{$src, $dst|$dst, $src}", []>, XS;
+} // Defs = [EFLAGS]
+
+// Stack push/pop.  All forms implicitly adjust ESP, so it is both a def and
+// a use.  The short 0x58/0x50+r encodings are the AddRegFrm forms; the
+// 0x8F /0 and 0xFF /6 encodings are the longer ModRM forms.
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects=1 in {
+let mayLoad = 1 in {
+def POP16r  : I<0x58, AddRegFrm, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+  OpSize;
+def POP32r  : I<0x58, AddRegFrm, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+def POP16rmr: I<0x8F, MRM0r, (outs GR16:$reg), (ins), "pop{w}\t$reg", []>,
+  OpSize;
+def POP16rmm: I<0x8F, MRM0m, (outs i16mem:$dst), (ins), "pop{w}\t$dst", []>,
+  OpSize;
+def POP32rmr: I<0x8F, MRM0r, (outs GR32:$reg), (ins), "pop{l}\t$reg", []>;
+def POP32rmm: I<0x8F, MRM0m, (outs i32mem:$dst), (ins), "pop{l}\t$dst", []>;
+}
+
+let mayStore = 1 in {
+def PUSH16r  : I<0x50, AddRegFrm, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+  OpSize;
+def PUSH32r  : I<0x50, AddRegFrm, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+def PUSH16rmr: I<0xFF, MRM6r, (outs), (ins GR16:$reg), "push{w}\t$reg",[]>,
+  OpSize;
+def PUSH16rmm: I<0xFF, MRM6m, (outs), (ins i16mem:$src), "push{w}\t$src",[]>,
+  OpSize;
+def PUSH32rmr: I<0xFF, MRM6r, (outs), (ins GR32:$reg), "push{l}\t$reg",[]>;
+def PUSH32rmm: I<0xFF, MRM6m, (outs), (ins i32mem:$src), "push{l}\t$src",[]>;
+}
+}
+
+// Push-immediate forms: 0x6A takes a sign-extended 8-bit immediate,
+// 0x68 a 16- or 32-bit one.
+let Defs = [ESP], Uses = [ESP], neverHasSideEffects = 1, mayStore = 1 in {
+def PUSH32i8   : Ii8<0x6a, RawFrm, (outs), (ins i8imm:$imm), 
+                     "push{l}\t$imm", []>;
+def PUSH32i16  : Ii16<0x68, RawFrm, (outs), (ins i16imm:$imm), 
+                      "push{l}\t$imm", []>;
+def PUSH32i32  : Ii32<0x68, RawFrm, (outs), (ins i32imm:$imm), 
+                      "push{l}\t$imm", []>;
+}
+
+// Flag register push/pop.  POPF additionally writes EFLAGS; PUSHF reads it.
+let Defs = [ESP, EFLAGS], Uses = [ESP], mayLoad = 1, neverHasSideEffects=1 in {
+def POPF     : I<0x9D, RawFrm, (outs), (ins), "popf{w}", []>, OpSize;
+def POPFD    : I<0x9D, RawFrm, (outs), (ins), "popf{l}", []>;
+}
+let Defs = [ESP], Uses = [ESP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in {
+def PUSHF    : I<0x9C, RawFrm, (outs), (ins), "pushf{w}", []>, OpSize;
+def PUSHFD   : I<0x9C, RawFrm, (outs), (ins), "pushf{l}", []>;
+}
+
+// Byte-swap: reverses the byte order of a GR32 in place (the 0x0F 0xC8+r
+// encoding names a single register, hence the two-address constraint tying
+// $dst to $src).
+let isTwoAddress = 1 in                               // GR32 = bswap GR32
+  def BSWAP32r : I<0xC8, AddRegFrm,
+                   (outs GR32:$dst), (ins GR32:$src),
+                   "bswap{l}\t$dst", 
+                   [(set GR32:$dst, (bswap GR32:$src))]>, TB;
+
+
+// Bit scan instructions.
+let Defs = [EFLAGS] in {
+def BSF16rr  : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, (X86bsf GR16:$src)), (implicit EFLAGS)]>, TB;
+def BSF16rm  : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsf{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, (X86bsf (loadi16 addr:$src))),
+                  (implicit EFLAGS)]>, TB;
+def BSF32rr  : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, (X86bsf GR32:$src)), (implicit EFLAGS)]>, TB;
+def BSF32rm  : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsf{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, (X86bsf (loadi32 addr:$src))),
+                  (implicit EFLAGS)]>, TB;
+
+def BSR16rr  : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, (X86bsr GR16:$src)), (implicit EFLAGS)]>, TB;
+def BSR16rm  : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                 "bsr{w}\t{$src, $dst|$dst, $src}",
+                 [(set GR16:$dst, (X86bsr (loadi16 addr:$src))),
+                  (implicit EFLAGS)]>, TB;
+def BSR32rr  : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, (X86bsr GR32:$src)), (implicit EFLAGS)]>, TB;
+def BSR32rm  : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                 "bsr{l}\t{$src, $dst|$dst, $src}",
+                 [(set GR32:$dst, (X86bsr (loadi32 addr:$src))),
+                  (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+
+// Load effective address.  The "{$src|$dst}, {$dst|$src}" asm string prints
+// "$src, $dst" in AT&T mode and "$dst, $src" in Intel mode — equivalent to
+// the usual "{$src, $dst|$dst, $src}" form used elsewhere in this file.
+// NOTE(review): LEA16r takes a lea32mem operand — presumably because the
+// address computation uses 32-bit addressing even for a 16-bit destination;
+// confirm against the operand definitions.
+let neverHasSideEffects = 1 in
+def LEA16r   : I<0x8D, MRMSrcMem,
+                 (outs GR16:$dst), (ins lea32mem:$src),
+                 "lea{w}\t{$src|$dst}, {$dst|$src}", []>, OpSize;
+// LEA32r is rematerializable: it reads only its address operand.
+let isReMaterializable = 1 in
+def LEA32r   : I<0x8D, MRMSrcMem,
+                 (outs GR32:$dst), (ins lea32mem:$src),
+                 "lea{l}\t{$src|$dst}, {$dst|$src}",
+                 [(set GR32:$dst, lea32addr:$src)]>, Requires<[In32BitMode]>;
+
+// REP-prefixed string moves used for lowering memcpy-like operations
+// (the X86rep_movs patterns).  ECX is the count, ESI/EDI the pointers;
+// all three are consumed and updated.
+let Defs = [ECX,EDI,ESI], Uses = [ECX,EDI,ESI], isCodeGenOnly = 1 in {
+def REP_MOVSB : I<0xA4, RawFrm, (outs), (ins), "{rep;movsb|rep movsb}",
+                  [(X86rep_movs i8)]>, REP;
+def REP_MOVSW : I<0xA5, RawFrm, (outs), (ins), "{rep;movsw|rep movsw}",
+                  [(X86rep_movs i16)]>, REP, OpSize;
+def REP_MOVSD : I<0xA5, RawFrm, (outs), (ins), "{rep;movsl|rep movsd}",
+                  [(X86rep_movs i32)]>, REP;
+}
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI,ESI], Uses = [EDI,ESI,EFLAGS] in {
+def MOVSB : I<0xA4, RawFrm, (outs), (ins), "{movsb}", []>;
+def MOVSW : I<0xA5, RawFrm, (outs), (ins), "{movsw}", []>, OpSize;
+def MOVSD : I<0xA5, RawFrm, (outs), (ins), "{movsl|movsd}", []>;
+}
+
+// REP-prefixed stores used for lowering memset-like operations
+// (X86rep_stos).  The store value comes from AL/AX/EAX depending on width.
+let Defs = [ECX,EDI], Uses = [AL,ECX,EDI], isCodeGenOnly = 1 in
+def REP_STOSB : I<0xAA, RawFrm, (outs), (ins), "{rep;stosb|rep stosb}",
+                  [(X86rep_stos i8)]>, REP;
+let Defs = [ECX,EDI], Uses = [AX,ECX,EDI], isCodeGenOnly = 1 in
+def REP_STOSW : I<0xAB, RawFrm, (outs), (ins), "{rep;stosw|rep stosw}",
+                  [(X86rep_stos i16)]>, REP, OpSize;
+let Defs = [ECX,EDI], Uses = [EAX,ECX,EDI], isCodeGenOnly = 1 in
+def REP_STOSD : I<0xAB, RawFrm, (outs), (ins), "{rep;stosl|rep stosd}",
+                  [(X86rep_stos i32)]>, REP;
+
+// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
+let Defs = [EDI], Uses = [AL,EDI,EFLAGS] in
+def STOSB : I<0xAA, RawFrm, (outs), (ins), "{stosb}", []>;
+let Defs = [EDI], Uses = [AX,EDI,EFLAGS] in
+def STOSW : I<0xAB, RawFrm, (outs), (ins), "{stosw}", []>, OpSize;
+let Defs = [EDI], Uses = [EAX,EDI,EFLAGS] in
+def STOSD : I<0xAB, RawFrm, (outs), (ins), "{stosl|stosd}", []>;
+
+// String scan/compare — assembler-only forms, no implicit operand info or
+// patterns here.
+def SCAS8 : I<0xAE, RawFrm, (outs), (ins), "scas{b}", []>;
+def SCAS16 : I<0xAF, RawFrm, (outs), (ins), "scas{w}", []>, OpSize;
+def SCAS32 : I<0xAF, RawFrm, (outs), (ins), "scas{l}", []>;
+
+def CMPS8 : I<0xA6, RawFrm, (outs), (ins), "cmps{b}", []>;
+def CMPS16 : I<0xA7, RawFrm, (outs), (ins), "cmps{w}", []>, OpSize;
+def CMPS32 : I<0xA7, RawFrm, (outs), (ins), "cmps{l}", []>;
+
+// Read time-stamp counter.
+// NOTE(review): the defs list names the 64-bit RAX/RDX registers in a file
+// whose other defs use 32-bit registers — presumably so the same def also
+// covers 64-bit mode; confirm this is intentional.
+let Defs = [RAX, RDX] in
+def RDTSC : I<0x31, RawFrm, (outs), (ins), "rdtsc", [(X86rdtsc)]>,
+            TB;
+
+// UD2 — guaranteed-invalid opcode used to lower the trap intrinsic;
+// control never continues past it (isBarrier/hasCtrlDep).
+let isBarrier = 1, hasCtrlDep = 1 in {
+def TRAP    : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
+}
+
+// Fast system-call entry/exit — assembler-only, no patterns.
+def SYSCALL  : I<0x05, RawFrm,
+                 (outs), (ins), "syscall", []>, TB;
+def SYSRET   : I<0x07, RawFrm,
+                 (outs), (ins), "sysret", []>, TB;
+def SYSENTER : I<0x34, RawFrm,
+                 (outs), (ins), "sysenter", []>, TB;
+def SYSEXIT  : I<0x35, RawFrm,
+                 (outs), (ins), "sysexit", []>, TB;
+
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
+
+//===----------------------------------------------------------------------===//
+//  Input/Output Instructions...
+//
+// Port input: reads from the port in DX into AL/AX/EAX.
+let Defs = [AL], Uses = [DX] in
+def IN8rr  : I<0xEC, RawFrm, (outs), (ins),
+               "in{b}\t{%dx, %al|%AL, %DX}", []>;
+let Defs = [AX], Uses = [DX] in
+def IN16rr : I<0xED, RawFrm, (outs), (ins),
+               "in{w}\t{%dx, %ax|%AX, %DX}", []>,  OpSize;
+let Defs = [EAX], Uses = [DX] in
+def IN32rr : I<0xED, RawFrm, (outs), (ins),
+               "in{l}\t{%dx, %eax|%EAX, %DX}", []>;
+
+// Port input with an 8-bit immediate port number.
+let Defs = [AL] in
+def IN8ri  : Ii8<0xE4, RawFrm, (outs), (ins i16i8imm:$port),
+                  "in{b}\t{$port, %al|%AL, $port}", []>;
+let Defs = [AX] in
+def IN16ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+                  "in{w}\t{$port, %ax|%AX, $port}", []>, OpSize;
+let Defs = [EAX] in
+def IN32ri : Ii8<0xE5, RawFrm, (outs), (ins i16i8imm:$port),
+                  "in{l}\t{$port, %eax|%EAX, $port}", []>;
+
+// Port output: writes AL/AX/EAX to the port in DX.
+let Uses = [DX, AL] in
+def OUT8rr  : I<0xEE, RawFrm, (outs), (ins),
+                "out{b}\t{%al, %dx|%DX, %AL}", []>;
+let Uses = [DX, AX] in
+def OUT16rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{w}\t{%ax, %dx|%DX, %AX}", []>, OpSize;
+let Uses = [DX, EAX] in
+def OUT32rr : I<0xEF, RawFrm, (outs), (ins),
+                "out{l}\t{%eax, %dx|%DX, %EAX}", []>;
+
+// Port output with an 8-bit immediate port number.
+let Uses = [AL] in
+def OUT8ir  : Ii8<0xE6, RawFrm, (outs), (ins i16i8imm:$port),
+                   "out{b}\t{%al, $port|$port, %AL}", []>;
+let Uses = [AX] in
+def OUT16ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+                   "out{w}\t{%ax, $port|$port, %AX}", []>, OpSize;
+let Uses = [EAX] in
+def OUT32ir : Ii8<0xE7, RawFrm, (outs), (ins i16i8imm:$port),
+                   "out{l}\t{%eax, $port|$port, %EAX}", []>;
+
+// NOTE(review): despite the IN* names, these mnemonics are the string
+// input instructions "ins{b,w,l}" (port-to-memory), not register "in".
+def IN8  : I<0x6C, RawFrm, (outs), (ins),
+             "ins{b}", []>;
+def IN16 : I<0x6D, RawFrm, (outs), (ins),
+             "ins{w}", []>,  OpSize;
+def IN32 : I<0x6D, RawFrm, (outs), (ins),
+             "ins{l}", []>;
+
+//===----------------------------------------------------------------------===//
+//  Move Instructions...
+//
+// Register-to-register moves.  No patterns: register copies are produced by
+// the register allocator / copy lowering, not by ISel.
+let neverHasSideEffects = 1 in {
+def MOV8rr  : I<0x88, MRMDestReg, (outs GR8 :$dst), (ins GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr : I<0x89, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr : I<0x89, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", []>;
+}
+// Immediate moves: pure constant materialization, so they can be
+// rematerialized anywhere instead of spilled.
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+def MOV8ri  : Ii8 <0xB0, AddRegFrm, (outs GR8 :$dst), (ins i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(set GR8:$dst, imm:$src)]>;
+def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(set GR16:$dst, imm:$src)]>, OpSize;
+def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, imm:$src)]>;
+}
+
+// Store-immediate-to-memory forms (0xC6/0xC7 /0).
+def MOV8mi  : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}",
+                   [(store (i8 imm:$src), addr:$dst)]>;
+def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
+                   "mov{w}\t{$src, $dst|$dst, $src}",
+                   [(store (i16 imm:$src), addr:$dst)]>, OpSize;
+def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
+                   "mov{l}\t{$src, $dst|$dst, $src}",
+                   [(store (i32 imm:$src), addr:$dst)]>;
+
+// Moffs forms (0xA0-0xA3): move between the accumulator and an absolute
+// memory offset.  Assembler-only here.
+def MOV8o8a : Ii8 <0xA0, RawFrm, (outs), (ins offset8:$src),
+                   "mov{b}\t{$src, %al|%al, $src}", []>;
+def MOV16o16a : Ii16 <0xA1, RawFrm, (outs), (ins offset16:$src),
+                      "mov{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def MOV32o32a : Ii32 <0xA1, RawFrm, (outs), (ins offset32:$src),
+                      "mov{l}\t{$src, %eax|%eax, $src}", []>;
+
+def MOV8ao8 : Ii8 <0xA2, RawFrm, (outs offset8:$dst), (ins),
+                   "mov{b}\t{%al, $dst|$dst, %al}", []>;
+def MOV16ao16 : Ii16 <0xA3, RawFrm, (outs offset16:$dst), (ins),
+                      "mov{w}\t{%ax, $dst|$dst, %ax}", []>, OpSize;
+def MOV32ao32 : Ii32 <0xA3, RawFrm, (outs offset32:$dst), (ins),
+                      "mov{l}\t{%eax, $dst|$dst, %eax}", []>;
+
+// Moves to and from segment registers
+def MOV16rs : I<0x8C, MRMDestReg, (outs GR16:$dst), (ins SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV16ms : I<0x8C, MRMDestMem, (outs i16mem:$dst), (ins SEGMENT_REG:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", []>;
+def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}", []>;
+
+// Reversed-encoding register moves (0x8A/0x8B direction bit) — same
+// semantics as MOVrr, distinct encodings for the disassembler/assembler.
+def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
+                   "mov{b}\t{$src, $dst|$dst, $src}", []>;
+def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                    "mov{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                    "mov{l}\t{$src, $dst|$dst, $src}", []>;
+
+// Loads.  canFoldAsLoad lets these fold into users' memory operands.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in {
+def MOV8rm  : I<0x8A, MRMSrcMem, (outs GR8 :$dst), (ins i8mem :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(set GR8:$dst, (loadi8 addr:$src))]>;
+def MOV16rm : I<0x8B, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(set GR16:$dst, (loadi16 addr:$src))]>, OpSize;
+def MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(set GR32:$dst, (loadi32 addr:$src))]>;
+}
+
+// Stores.
+def MOV8mr  : I<0x88, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src),
+                "mov{b}\t{$src, $dst|$dst, $src}",
+                [(store GR8:$src, addr:$dst)]>;
+def MOV16mr : I<0x89, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                "mov{w}\t{$src, $dst|$dst, $src}",
+                [(store GR16:$src, addr:$dst)]>, OpSize;
+def MOV32mr : I<0x89, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}",
+                [(store GR32:$src, addr:$dst)]>;
+
+// Versions of MOV8rr, MOV8mr, and MOV8rm that use i8mem_NOREX and GR8_NOREX so
+// that they can be used for copying and storing h registers, which can't be
+// encoded when a REX prefix is present.
+let neverHasSideEffects = 1 in
+def MOV8rr_NOREX : I<0x88, MRMDestReg,
+                     (outs GR8_NOREX:$dst), (ins GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", []>;
+let mayStore = 1 in
+def MOV8mr_NOREX : I<0x88, MRMDestMem,
+                     (outs), (ins i8mem_NOREX:$dst, GR8_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", []>;
+let mayLoad = 1,
+    canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
+                     (outs GR8_NOREX:$dst), (ins i8mem_NOREX:$src),
+                     "mov{b}\t{$src, $dst|$dst, $src}  # NOREX", []>;
+
+// Moves to and from debug registers (0x0F 0x21 / 0x0F 0x23).
+// Assembler/disassembler only — no patterns.
+def MOV32rd : I<0x21, MRMDestReg, (outs GR32:$dst), (ins DEBUG_REG:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV32dr : I<0x23, MRMSrcReg, (outs DEBUG_REG:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+                
+// Moves to and from control registers (0x0F 0x20 / 0x0F 0x22).
+// These are the 32-bit forms (GR32 on one side, CONTROL_REG_32 on the
+// other), so the AT&T mnemonic suffix is {l}; the {q} suffix used
+// previously is only valid for the 64-bit register forms.
+def MOV32rc : I<0x20, MRMDestReg, (outs GR32:$dst), (ins CONTROL_REG_32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def MOV32cr : I<0x22, MRMSrcReg, (outs CONTROL_REG_32:$dst), (ins GR32:$src),
+                "mov{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+//===----------------------------------------------------------------------===//
+//  Fixed-Register Multiplication and Division Instructions...
+//
+
+// Extra precision multiplication
+// Unsigned multiply with the accumulator: the full product lands in the
+// fixed register pairs listed as Defs (AL/AH, AX/DX, EAX/EDX).
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def MUL8r  : I<0xF6, MRM4r, (outs),  (ins GR8:$src), "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, GR8:$src)),
+                (implicit EFLAGS)]>;     // AL,AH = AL*GR8
+
+let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
+def MUL16r : I<0xF7, MRM4r, (outs),  (ins GR16:$src),
+               "mul{w}\t$src", 
+               []>, OpSize;    // AX,DX = AX*GR16
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
+def MUL32r : I<0xF7, MRM4r, (outs),  (ins GR32:$src),
+               "mul{l}\t$src",
+               []>; // EAX,EDX = EAX*GR32
+
+// Memory-operand forms of the same multiplies.
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def MUL8m  : I<0xF6, MRM4m, (outs), (ins i8mem :$src),
+               "mul{b}\t$src",
+               // FIXME: Used for 8-bit mul, ignore result upper 8 bits.
+               // This probably ought to be moved to a def : Pat<> if the
+               // syntax can be accepted.
+               [(set AL, (mul AL, (loadi8 addr:$src))),
+                (implicit EFLAGS)]>;   // AL,AH = AL*[mem8]
+
+let mayLoad = 1, neverHasSideEffects = 1 in {
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def MUL16m : I<0xF7, MRM4m, (outs), (ins i16mem:$src),
+               "mul{w}\t$src",
+               []>, OpSize; // AX,DX = AX*[mem16]
+
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
+              "mul{l}\t$src",
+              []>;          // EAX,EDX = EAX*[mem32]
+}
+
+// Signed multiply with the accumulator (0xF6/0xF7 /5).  No patterns —
+// kept for the assembler only at this point.
+let neverHasSideEffects = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def IMUL8r  : I<0xF6, MRM5r, (outs),  (ins GR8:$src), "imul{b}\t$src", []>;
+              // AL,AH = AL*GR8
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16r : I<0xF7, MRM5r, (outs),  (ins GR16:$src), "imul{w}\t$src", []>,
+              OpSize;    // AX,DX = AX*GR16
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32r : I<0xF7, MRM5r, (outs),  (ins GR32:$src), "imul{l}\t$src", []>;
+              // EAX,EDX = EAX*GR32
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AL] in
+def IMUL8m  : I<0xF6, MRM5m, (outs), (ins i8mem :$src),
+                "imul{b}\t$src", []>;    // AL,AH = AL*[mem8]
+let Defs = [AX,DX,EFLAGS], Uses = [AX] in
+def IMUL16m : I<0xF7, MRM5m, (outs), (ins i16mem:$src),
+                "imul{w}\t$src", []>, OpSize; // AX,DX = AX*[mem16]
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX] in
+def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
+                "imul{l}\t$src", []>;  // EAX,EDX = EAX*[mem32]
+}
+} // neverHasSideEffects
+
+// unsigned division/remainder
+// The dividend lives in the fixed register pair(s) listed as Uses; the
+// quotient and remainder come back in the registers listed as Defs
+// (documented per-def in the trailing comments).
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8r  : I<0xF6, MRM6r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16r : I<0xF7, MRM6r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def DIV32r : I<0xF7, MRM6r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "div{l}\t$src", []>;
+// Memory-operand divisor forms.
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def DIV8m  : I<0xF6, MRM6m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "div{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def DIV16m : I<0xF7, MRM6m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "div{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+                                                    // EDX:EAX/[mem32] = EAX,EDX
+def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
+               "div{l}\t$src", []>;
+}
+
+// Signed division/remainder.
+// Same fixed-register convention as the unsigned forms above: the dividend
+// is in the Uses registers, the quotient/remainder come back in the Defs.
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8r : I<0xF6, MRM7r, (outs),  (ins GR8:$src),    // AX/r8 = AL,AH
+               "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16r: I<0xF7, MRM7r, (outs),  (ins GR16:$src),   // DX:AX/r16 = AX,DX
+               "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32r: I<0xF7, MRM7r, (outs),  (ins GR32:$src),   // EDX:EAX/r32 = EAX,EDX
+               "idiv{l}\t$src", []>;
+// Memory-operand divisor forms.  (The original "mayLoad = 1, mayLoad = 1"
+// assigned the same bit twice; once is enough.)
+let mayLoad = 1 in {
+let Defs = [AL,AH,EFLAGS], Uses = [AX] in
+def IDIV8m : I<0xF6, MRM7m, (outs), (ins i8mem:$src),   // AX/[mem8] = AL,AH
+               "idiv{b}\t$src", []>;
+let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
+def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src),  // DX:AX/[mem16] = AX,DX
+               "idiv{w}\t$src", []>, OpSize;
+let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), 
+                                                    // EDX:EAX/[mem32] = EAX,EDX
+               "idiv{l}\t$src", []>;
+}
+
+//===----------------------------------------------------------------------===//
+//  Two address Instructions.
+//
+let isTwoAddress = 1 in {
+
+// Conditional moves
+let Uses = [EFLAGS] in {
+
+// X86 doesn't have 8-bit conditional moves. Use a customInserter to
+// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
+// however that requires promoting the operands, and can induce additional
+// i8 register pressure. Note that CMOV_GR8 is conservatively considered to
+// clobber EFLAGS, because if one of the operands is zero, the expansion
+// could involve an xor.
+// (Pseudo: opcode 0, expanded by the custom inserter; the condition code
+// is carried as an i8 immediate operand.)
+let usesCustomInserter = 1, isTwoAddress = 0, Defs = [EFLAGS] in
+def CMOV_GR8 : I<0, Pseudo,
+                 (outs GR8:$dst), (ins GR8:$src1, GR8:$src2, i8imm:$cond),
+                 "#CMOV_GR8 PSEUDO!",
+                 [(set GR8:$dst, (X86cmov GR8:$src1, GR8:$src2,
+                                          imm:$cond, EFLAGS))]>;
+
+// cmovCC (0x0F 0x4x): $dst = EFLAGS-condition CC ? $src2 : $src1, with $dst
+// tied to $src1 by the surrounding isTwoAddress region.  Marked commutable:
+// swapping the operands is legal provided the condition is inverted (the
+// per-def comment gives each condition in C-operator form).
+let isCommutable = 1 in {
+def CMOVB16rr : I<0x42, MRMSrcReg,       // if <u, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovb{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_B, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVB32rr : I<0x42, MRMSrcReg,       // if <u, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovb{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_B, EFLAGS))]>,
+                   TB;
+def CMOVAE16rr: I<0x43, MRMSrcReg,       // if >=u, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovae{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_AE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVAE32rr: I<0x43, MRMSrcReg,       // if >=u, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovae{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_AE, EFLAGS))]>,
+                   TB;
+def CMOVE16rr : I<0x44, MRMSrcReg,       // if ==, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmove{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_E, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVE32rr : I<0x44, MRMSrcReg,       // if ==, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmove{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_E, EFLAGS))]>,
+                   TB;
+def CMOVNE16rr: I<0x45, MRMSrcReg,       // if !=, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovne{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVNE32rr: I<0x45, MRMSrcReg,       // if !=, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovne{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NE, EFLAGS))]>,
+                   TB;
+def CMOVBE16rr: I<0x46, MRMSrcReg,       // if <=u, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovbe{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_BE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVBE32rr: I<0x46, MRMSrcReg,       // if <=u, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovbe{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_BE, EFLAGS))]>,
+                   TB;
+def CMOVA16rr : I<0x47, MRMSrcReg,       // if >u, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmova{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_A, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVA32rr : I<0x47, MRMSrcReg,       // if >u, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmova{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_A, EFLAGS))]>,
+                   TB;
+def CMOVL16rr : I<0x4C, MRMSrcReg,       // if <s, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovl{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_L, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVL32rr : I<0x4C, MRMSrcReg,       // if <s, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovl{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_L, EFLAGS))]>,
+                   TB;
+def CMOVGE16rr: I<0x4D, MRMSrcReg,       // if >=s, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovge{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_GE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVGE32rr: I<0x4D, MRMSrcReg,       // if >=s, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovge{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_GE, EFLAGS))]>,
+                   TB;
+def CMOVLE16rr: I<0x4E, MRMSrcReg,       // if <=s, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovle{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_LE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVLE32rr: I<0x4E, MRMSrcReg,       // if <=s, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovle{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_LE, EFLAGS))]>,
+                   TB;
+def CMOVG16rr : I<0x4F, MRMSrcReg,       // if >s, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovg{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_G, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVG32rr : I<0x4F, MRMSrcReg,       // if >s, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovg{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_G, EFLAGS))]>,
+                   TB;
+def CMOVS16rr : I<0x48, MRMSrcReg,       // if signed, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovs{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_S, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVS32rr : I<0x48, MRMSrcReg,       // if signed, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovs{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_S, EFLAGS))]>,
+                  TB;
+def CMOVNS16rr: I<0x49, MRMSrcReg,       // if !signed, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovns{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_NS, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVNS32rr: I<0x49, MRMSrcReg,       // if !signed, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovns{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_NS, EFLAGS))]>,
+                  TB;
+def CMOVP16rr : I<0x4A, MRMSrcReg,       // if parity, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovp{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_P, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVP32rr : I<0x4A, MRMSrcReg,       // if parity, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovp{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_P, EFLAGS))]>,
+                  TB;
+def CMOVNP16rr : I<0x4B, MRMSrcReg,       // if !parity, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovnp{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                    X86_COND_NP, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVNP32rr : I<0x4B, MRMSrcReg,       // if !parity, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovnp{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                    X86_COND_NP, EFLAGS))]>,
+                  TB;
+def CMOVO16rr : I<0x40, MRMSrcReg,       // if overflow, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovo{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                   X86_COND_O, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVO32rr : I<0x40, MRMSrcReg,       // if overflow, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovo{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                   X86_COND_O, EFLAGS))]>,
+                  TB;
+def CMOVNO16rr : I<0x41, MRMSrcReg,       // if !overflow, GR16 = GR16
+                  (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                  "cmovno{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, GR16:$src2,
+                                    X86_COND_NO, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVNO32rr : I<0x41, MRMSrcReg,       // if !overflow, GR32 = GR32
+                  (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                  "cmovno{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, GR32:$src2,
+                                    X86_COND_NO, EFLAGS))]>,
+                  TB;
+} // isCommutable = 1
+
+// Conditional move, register-memory forms: same opcodes and X86_COND_*
+// predicates as the rr variants above, but $src2 is a memory operand
+// folded into the pattern via loadi16/loadi32.  These still read EFLAGS.
+def CMOVB16rm : I<0x42, MRMSrcMem,       // if <u, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovb{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_B, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVB32rm : I<0x42, MRMSrcMem,       // if <u, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovb{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_B, EFLAGS))]>,
+                   TB;
+def CMOVAE16rm: I<0x43, MRMSrcMem,       // if >=u, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovae{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_AE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVAE32rm: I<0x43, MRMSrcMem,       // if >=u, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovae{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_AE, EFLAGS))]>,
+                   TB;
+def CMOVE16rm : I<0x44, MRMSrcMem,       // if ==, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmove{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_E, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVE32rm : I<0x44, MRMSrcMem,       // if ==, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmove{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_E, EFLAGS))]>,
+                   TB;
+def CMOVNE16rm: I<0x45, MRMSrcMem,       // if !=, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovne{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVNE32rm: I<0x45, MRMSrcMem,       // if !=, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovne{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NE, EFLAGS))]>,
+                   TB;
+def CMOVBE16rm: I<0x46, MRMSrcMem,       // if <=u, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovbe{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_BE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVBE32rm: I<0x46, MRMSrcMem,       // if <=u, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovbe{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_BE, EFLAGS))]>,
+                   TB;
+def CMOVA16rm : I<0x47, MRMSrcMem,       // if >u, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmova{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_A, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVA32rm : I<0x47, MRMSrcMem,       // if >u, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmova{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_A, EFLAGS))]>,
+                   TB;
+def CMOVL16rm : I<0x4C, MRMSrcMem,       // if <s, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovl{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_L, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVL32rm : I<0x4C, MRMSrcMem,       // if <s, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovl{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_L, EFLAGS))]>,
+                   TB;
+def CMOVGE16rm: I<0x4D, MRMSrcMem,       // if >=s, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovge{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_GE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVGE32rm: I<0x4D, MRMSrcMem,       // if >=s, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovge{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_GE, EFLAGS))]>,
+                   TB;
+def CMOVLE16rm: I<0x4E, MRMSrcMem,       // if <=s, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovle{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_LE, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVLE32rm: I<0x4E, MRMSrcMem,       // if <=s, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovle{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_LE, EFLAGS))]>,
+                   TB;
+def CMOVG16rm : I<0x4F, MRMSrcMem,       // if >s, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovg{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_G, EFLAGS))]>,
+                   TB, OpSize;
+def CMOVG32rm : I<0x4F, MRMSrcMem,       // if >s, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovg{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_G, EFLAGS))]>,
+                   TB;
+def CMOVS16rm : I<0x48, MRMSrcMem,       // if signed, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovs{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_S, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVS32rm : I<0x48, MRMSrcMem,       // if signed, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovs{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_S, EFLAGS))]>,
+                  TB;
+def CMOVNS16rm: I<0x49, MRMSrcMem,       // if !signed, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovns{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_NS, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVNS32rm: I<0x49, MRMSrcMem,       // if !signed, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovns{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_NS, EFLAGS))]>,
+                  TB;
+def CMOVP16rm : I<0x4A, MRMSrcMem,       // if parity, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovp{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_P, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVP32rm : I<0x4A, MRMSrcMem,       // if parity, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovp{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_P, EFLAGS))]>,
+                  TB;
+def CMOVNP16rm : I<0x4B, MRMSrcMem,       // if !parity, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovnp{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                    X86_COND_NP, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVNP32rm : I<0x4B, MRMSrcMem,       // if !parity, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovnp{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                    X86_COND_NP, EFLAGS))]>,
+                  TB;
+def CMOVO16rm : I<0x40, MRMSrcMem,       // if overflow, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovo{w}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                   X86_COND_O, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVO32rm : I<0x40, MRMSrcMem,       // if overflow, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovo{l}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                   X86_COND_O, EFLAGS))]>,
+                  TB;
+def CMOVNO16rm : I<0x41, MRMSrcMem,       // if !overflow, GR16 = [mem16]
+                  (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                  "cmovno{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
+                                    X86_COND_NO, EFLAGS))]>,
+                  TB, OpSize;
+def CMOVNO32rm : I<0x41, MRMSrcMem,       // if !overflow, GR32 = [mem32]
+                  (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                  "cmovno{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
+                                    X86_COND_NO, EFLAGS))]>,
+                  TB;
+} // Uses = [EFLAGS]
+
+
+// Unary instructions: NEG (two's-complement negate) and NOT (bitwise
+// complement, matched from xor-with--1).  NEG is inside Defs = [EFLAGS]
+// and its patterns mark the flags as implicitly defined; NOT sits outside
+// that scope and its patterns do not touch EFLAGS.
+let CodeSize = 2 in {
+let Defs = [EFLAGS] in {
+def NEG8r  : I<0xF6, MRM3r, (outs GR8 :$dst), (ins GR8 :$src), "neg{b}\t$dst",
+               [(set GR8:$dst, (ineg GR8:$src)),
+                (implicit EFLAGS)]>;
+def NEG16r : I<0xF7, MRM3r, (outs GR16:$dst), (ins GR16:$src), "neg{w}\t$dst",
+               [(set GR16:$dst, (ineg GR16:$src)),
+                (implicit EFLAGS)]>, OpSize;
+def NEG32r : I<0xF7, MRM3r, (outs GR32:$dst), (ins GR32:$src), "neg{l}\t$dst",
+               [(set GR32:$dst, (ineg GR32:$src)),
+                (implicit EFLAGS)]>;
+// Memory-destination forms: load, negate, store back to the same address.
+let isTwoAddress = 0 in {
+  def NEG8m  : I<0xF6, MRM3m, (outs), (ins i8mem :$dst), "neg{b}\t$dst",
+                 [(store (ineg (loadi8 addr:$dst)), addr:$dst),
+                  (implicit EFLAGS)]>;
+  def NEG16m : I<0xF7, MRM3m, (outs), (ins i16mem:$dst), "neg{w}\t$dst",
+                 [(store (ineg (loadi16 addr:$dst)), addr:$dst),
+                  (implicit EFLAGS)]>, OpSize;
+  def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), "neg{l}\t$dst",
+                 [(store (ineg (loadi32 addr:$dst)), addr:$dst),
+                  (implicit EFLAGS)]>;
+}
+} // Defs = [EFLAGS]
+
+// Match xor -1 to not. Favors these over a move imm + xor to save code size.
+let AddedComplexity = 15 in {
+def NOT8r  : I<0xF6, MRM2r, (outs GR8 :$dst), (ins GR8 :$src), "not{b}\t$dst",
+               [(set GR8:$dst, (not GR8:$src))]>;
+def NOT16r : I<0xF7, MRM2r, (outs GR16:$dst), (ins GR16:$src), "not{w}\t$dst",
+               [(set GR16:$dst, (not GR16:$src))]>, OpSize;
+def NOT32r : I<0xF7, MRM2r, (outs GR32:$dst), (ins GR32:$src), "not{l}\t$dst",
+               [(set GR32:$dst, (not GR32:$src))]>;
+}
+let isTwoAddress = 0 in {
+  def NOT8m  : I<0xF6, MRM2m, (outs), (ins i8mem :$dst), "not{b}\t$dst",
+                 [(store (not (loadi8 addr:$dst)), addr:$dst)]>;
+  def NOT16m : I<0xF7, MRM2m, (outs), (ins i16mem:$dst), "not{w}\t$dst",
+                 [(store (not (loadi16 addr:$dst)), addr:$dst)]>, OpSize;
+  def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), "not{l}\t$dst",
+                 [(store (not (loadi32 addr:$dst)), addr:$dst)]>;
+}
+} // CodeSize
+
+// TODO: inc/dec is slow for P4, but fast for Pentium-M.
+// INC/DEC: matched from add reg, +/-1.  All forms define EFLAGS.  The
+// 16/32-bit register forms use the short one-byte 0x40+r / 0x48+r
+// encodings and are therefore Requires<[In32BitMode]> (those encodings
+// are not available outside 32-bit mode); they are also marked
+// isConvertibleToThreeAddress so they can be turned into LEA.
+let Defs = [EFLAGS] in {
+let CodeSize = 2 in
+def INC8r  : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src), "inc{b}\t$dst",
+               [(set GR8:$dst, (add GR8:$src, 1)),
+                (implicit EFLAGS)]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {  // Can xform into LEA.
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), 
+               "inc{w}\t$dst",
+               [(set GR16:$dst, (add GR16:$src, 1)),
+                (implicit EFLAGS)]>,
+             OpSize, Requires<[In32BitMode]>;
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), 
+               "inc{l}\t$dst",
+               [(set GR32:$dst, (add GR32:$src, 1)),
+                (implicit EFLAGS)]>, Requires<[In32BitMode]>;
+}
+// Memory-destination INC: load, add 1, store back.
+let isTwoAddress = 0, CodeSize = 2 in {
+  def INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)]>;
+  def INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst), "inc{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)]>,
+               OpSize, Requires<[In32BitMode]>;
+  def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), 1), addr:$dst),
+                (implicit EFLAGS)]>,
+               Requires<[In32BitMode]>;
+}
+
+let CodeSize = 2 in
+def DEC8r  : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src), "dec{b}\t$dst",
+               [(set GR8:$dst, (add GR8:$src, -1)),
+                (implicit EFLAGS)]>;
+let isConvertibleToThreeAddress = 1, CodeSize = 1 in {   // Can xform into LEA.
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src), 
+               "dec{w}\t$dst",
+               [(set GR16:$dst, (add GR16:$src, -1)),
+                (implicit EFLAGS)]>,
+             OpSize, Requires<[In32BitMode]>;
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src), 
+               "dec{l}\t$dst",
+               [(set GR32:$dst, (add GR32:$src, -1)),
+                (implicit EFLAGS)]>, Requires<[In32BitMode]>;
+}
+
+// Memory-destination DEC: load, add -1, store back.
+let isTwoAddress = 0, CodeSize = 2 in {
+  def DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
+               [(store (add (loadi8 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)]>;
+  def DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst), "dec{w}\t$dst",
+               [(store (add (loadi16 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)]>,
+               OpSize, Requires<[In32BitMode]>;
+  def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
+               [(store (add (loadi32 addr:$dst), -1), addr:$dst),
+                (implicit EFLAGS)]>,
+               Requires<[In32BitMode]>;
+}
+} // Defs = [EFLAGS]
+
+// Logical operators...
+let Defs = [EFLAGS] in {
+// AND register-register forms (MRMDestReg encoding); commutable so isel
+// may swap the operands.  Patterns mark EFLAGS as implicitly defined.
+let isCommutable = 1 in {   // X = AND Y, Z   --> X = AND Z, Y
+def AND8rr   : I<0x20, MRMDestReg,
+                (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+                "and{b}\t{$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (and GR8:$src1, GR8:$src2)),
+                 (implicit EFLAGS)]>;
+def AND16rr  : I<0x21, MRMDestReg,
+                 (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
+                 "and{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (and GR16:$src1, GR16:$src2)),
+                  (implicit EFLAGS)]>, OpSize;
+def AND32rr  : I<0x21, MRMDestReg, 
+                 (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
+                 "and{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (and GR32:$src1, GR32:$src2)),
+                  (implicit EFLAGS)]>;
+}
+
+// AND instructions with the destination register in REG and the source register
+//   in R/M.  Included for the disassembler.
+// The _REV forms (0x22/0x23, MRMSrcReg) have empty patterns: they are
+// alternate encodings of the rr forms above and exist only so the
+// disassembler can decode them.
+def AND8rr_REV : I<0x22, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  "and{b}\t{$src2, $dst|$dst, $src2}", []>;
+def AND16rr_REV : I<0x23, MRMSrcReg, (outs GR16:$dst), 
+                    (ins GR16:$src1, GR16:$src2),
+                   "and{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def AND32rr_REV : I<0x23, MRMSrcReg, (outs GR32:$dst), 
+                    (ins GR32:$src1, GR32:$src2),
+                   "and{l}\t{$src2, $dst|$dst, $src2}", []>;
+
+// Register-memory forms: $src2 loaded from memory and folded into the and.
+def AND8rm   : I<0x22, MRMSrcMem, 
+                 (outs GR8 :$dst), (ins GR8 :$src1, i8mem :$src2),
+                 "and{b}\t{$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (and GR8:$src1, (loadi8 addr:$src2))),
+                 (implicit EFLAGS)]>;
+def AND16rm  : I<0x23, MRMSrcMem, 
+                 (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
+                 "and{w}\t{$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (and GR16:$src1, (loadi16 addr:$src2))),
+                 (implicit EFLAGS)]>, OpSize;
+def AND32rm  : I<0x23, MRMSrcMem,
+                 (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
+                 "and{l}\t{$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (and GR32:$src1, (loadi32 addr:$src2))),
+                 (implicit EFLAGS)]>;
+
+// Register-immediate forms.  The ri8 variants (0x83) take a sign-extended
+// 8-bit immediate (i16immSExt8/i32immSExt8) for a shorter encoding.
+def AND8ri   : Ii8<0x80, MRM4r, 
+                   (outs GR8 :$dst), (ins GR8 :$src1, i8imm :$src2),
+                   "and{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (and GR8:$src1, imm:$src2)),
+                    (implicit EFLAGS)]>;
+def AND16ri  : Ii16<0x81, MRM4r, 
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                    "and{w}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (and GR16:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>, OpSize;
+def AND32ri  : Ii32<0x81, MRM4r, 
+                    (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                    "and{l}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (and GR32:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+def AND16ri8 : Ii8<0x83, MRM4r, 
+                   (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                   "and{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (and GR16:$src1, i16immSExt8:$src2)),
+                    (implicit EFLAGS)]>,
+                   OpSize;
+def AND32ri8 : Ii8<0x83, MRM4r, 
+                   (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                   "and{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (and GR32:$src1, i32immSExt8:$src2)),
+                    (implicit EFLAGS)]>;
+
+// Memory-destination AND forms (read-modify-write on $dst), plus the
+// short accumulator forms (0x24/0x25) that operate on AL/AX/EAX with an
+// immediate; the accumulator forms carry no patterns (assembler/
+// disassembler only).
+let isTwoAddress = 0 in {
+  def AND8mr   : I<0x20, MRMDestMem,
+                   (outs), (ins i8mem :$dst, GR8 :$src),
+                   "and{b}\t{$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def AND16mr  : I<0x21, MRMDestMem,
+                   (outs), (ins i16mem:$dst, GR16:$src),
+                   "and{w}\t{$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR16:$src), addr:$dst),
+                    (implicit EFLAGS)]>,
+                   OpSize;
+  def AND32mr  : I<0x21, MRMDestMem,
+                   (outs), (ins i32mem:$dst, GR32:$src),
+                   "and{l}\t{$src, $dst|$dst, $src}",
+                   [(store (and (load addr:$dst), GR32:$src), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def AND8mi   : Ii8<0x80, MRM4m,
+                     (outs), (ins i8mem :$dst, i8imm :$src),
+                     "and{b}\t{$src, $dst|$dst, $src}",
+                      [(store (and (loadi8 addr:$dst), imm:$src), addr:$dst),
+                       (implicit EFLAGS)]>;
+  def AND16mi  : Ii16<0x81, MRM4m,
+                      (outs), (ins i16mem:$dst, i16imm:$src),
+                      "and{w}\t{$src, $dst|$dst, $src}",
+                      [(store (and (loadi16 addr:$dst), imm:$src), addr:$dst),
+                       (implicit EFLAGS)]>,
+                      OpSize;
+  def AND32mi  : Ii32<0x81, MRM4m,
+                      (outs), (ins i32mem:$dst, i32imm:$src),
+                      "and{l}\t{$src, $dst|$dst, $src}",
+                      [(store (and (loadi32 addr:$dst), imm:$src), addr:$dst),
+                       (implicit EFLAGS)]>;
+  def AND16mi8 : Ii8<0x83, MRM4m,
+                     (outs), (ins i16mem:$dst, i16i8imm :$src),
+                     "and{w}\t{$src, $dst|$dst, $src}",
+                [(store (and (load addr:$dst), i16immSExt8:$src), addr:$dst),
+                 (implicit EFLAGS)]>,
+                     OpSize;
+  def AND32mi8 : Ii8<0x83, MRM4m,
+                     (outs), (ins i32mem:$dst, i32i8imm :$src),
+                     "and{l}\t{$src, $dst|$dst, $src}",
+                [(store (and (load addr:$dst), i32immSExt8:$src), addr:$dst),
+                 (implicit EFLAGS)]>;
+
+  def AND8i8 : Ii8<0x24, RawFrm, (outs), (ins i8imm:$src),
+                   "and{b}\t{$src, %al|%al, $src}", []>;
+  def AND16i16 : Ii16<0x25, RawFrm, (outs), (ins i16imm:$src),
+                      "and{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+  def AND32i32 : Ii32<0x25, RawFrm, (outs), (ins i32imm:$src),
+                      "and{l}\t{$src, %eax|%eax, $src}", []>;
+
+}
+
+
+// OR register-register forms (MRMDestReg encoding); commutable, and the
+// patterns mark EFLAGS as implicitly defined.
+let isCommutable = 1 in {   // X = OR Y, Z   --> X = OR Z, Y
+def OR8rr    : I<0x08, MRMDestReg, (outs GR8 :$dst), 
+                 (ins GR8 :$src1, GR8 :$src2),
+                 "or{b}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (or GR8:$src1, GR8:$src2)),
+                  (implicit EFLAGS)]>;
+def OR16rr   : I<0x09, MRMDestReg, (outs GR16:$dst), 
+                 (ins GR16:$src1, GR16:$src2),
+                 "or{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (or GR16:$src1, GR16:$src2)),
+                  (implicit EFLAGS)]>, OpSize;
+def OR32rr   : I<0x09, MRMDestReg, (outs GR32:$dst), 
+                 (ins GR32:$src1, GR32:$src2),
+                 "or{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (or GR32:$src1, GR32:$src2)),
+                  (implicit EFLAGS)]>;
+}
+
+// OR instructions with the destination register in REG and the source register
+//   in R/M.  Included for the disassembler.
+// As with AND, the _REV forms carry empty patterns: alternate encodings
+// decoded by the disassembler only.
+def OR8rr_REV : I<0x0A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  "or{b}\t{$src2, $dst|$dst, $src2}", []>;
+def OR16rr_REV : I<0x0B, MRMSrcReg, (outs GR16:$dst),
+                   (ins GR16:$src1, GR16:$src2),
+                   "or{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def OR32rr_REV : I<0x0B, MRMSrcReg, (outs GR32:$dst), 
+                   (ins GR32:$src1, GR32:$src2),
+                   "or{l}\t{$src2, $dst|$dst, $src2}", []>;
+                  
+// Register-memory forms: $src2 loaded and folded into the or.
+def OR8rm    : I<0x0A, MRMSrcMem , (outs GR8 :$dst), 
+                 (ins GR8 :$src1, i8mem :$src2),
+                 "or{b}\t{$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (or GR8:$src1, (load addr:$src2))),
+                 (implicit EFLAGS)]>;
+def OR16rm   : I<0x0B, MRMSrcMem , (outs GR16:$dst), 
+                 (ins GR16:$src1, i16mem:$src2),
+                 "or{w}\t{$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (or GR16:$src1, (load addr:$src2))),
+                 (implicit EFLAGS)]>, OpSize;
+def OR32rm   : I<0x0B, MRMSrcMem , (outs GR32:$dst), 
+                 (ins GR32:$src1, i32mem:$src2),
+                 "or{l}\t{$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (or GR32:$src1, (load addr:$src2))),
+                 (implicit EFLAGS)]>;
+
+// Register-immediate forms; the ri8 variants (0x83) take a sign-extended
+// 8-bit immediate for a shorter encoding.
+def OR8ri    : Ii8 <0x80, MRM1r, (outs GR8 :$dst), 
+                    (ins GR8 :$src1, i8imm:$src2),
+                    "or{b}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (or GR8:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+def OR16ri   : Ii16<0x81, MRM1r, (outs GR16:$dst), 
+                    (ins GR16:$src1, i16imm:$src2),
+                    "or{w}\t{$src2, $dst|$dst, $src2}", 
+                    [(set GR16:$dst, (or GR16:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>, OpSize;
+def OR32ri   : Ii32<0x81, MRM1r, (outs GR32:$dst), 
+                    (ins GR32:$src1, i32imm:$src2),
+                    "or{l}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (or GR32:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+
+def OR16ri8  : Ii8<0x83, MRM1r, (outs GR16:$dst), 
+                   (ins GR16:$src1, i16i8imm:$src2),
+                   "or{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (or GR16:$src1, i16immSExt8:$src2)),
+                    (implicit EFLAGS)]>, OpSize;
+def OR32ri8  : Ii8<0x83, MRM1r, (outs GR32:$dst), 
+                   (ins GR32:$src1, i32i8imm:$src2),
+                   "or{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (or GR32:$src1, i32immSExt8:$src2)),
+                    (implicit EFLAGS)]>;
+// Memory-destination OR forms (read-modify-write on $dst), plus the short
+// accumulator forms (0x0C/0x0D) on AL/AX/EAX; the accumulator forms have
+// no patterns (assembler/disassembler only).
+let isTwoAddress = 0 in {
+  def OR8mr  : I<0x08, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                 "or{b}\t{$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR8:$src), addr:$dst),
+                  (implicit EFLAGS)]>;
+  def OR16mr : I<0x09, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "or{w}\t{$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR16:$src), addr:$dst),
+                  (implicit EFLAGS)]>, OpSize;
+  def OR32mr : I<0x09, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "or{l}\t{$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), GR32:$src), addr:$dst),
+                  (implicit EFLAGS)]>;
+  def OR8mi    : Ii8<0x80, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+                 "or{b}\t{$src, $dst|$dst, $src}",
+                 [(store (or (loadi8 addr:$dst), imm:$src), addr:$dst),
+                  (implicit EFLAGS)]>;
+  def OR16mi   : Ii16<0x81, MRM1m, (outs), (ins i16mem:$dst, i16imm:$src),
+                 "or{w}\t{$src, $dst|$dst, $src}",
+                 [(store (or (loadi16 addr:$dst), imm:$src), addr:$dst),
+                  (implicit EFLAGS)]>,
+                 OpSize;
+  def OR32mi   : Ii32<0x81, MRM1m, (outs), (ins i32mem:$dst, i32imm:$src),
+                 "or{l}\t{$src, $dst|$dst, $src}",
+                 [(store (or (loadi32 addr:$dst), imm:$src), addr:$dst),
+                  (implicit EFLAGS)]>;
+  def OR16mi8  : Ii8<0x83, MRM1m, (outs), (ins i16mem:$dst, i16i8imm:$src),
+                 "or{w}\t{$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), i16immSExt8:$src), addr:$dst),
+                  (implicit EFLAGS)]>,
+                     OpSize;
+  def OR32mi8  : Ii8<0x83, MRM1m, (outs), (ins i32mem:$dst, i32i8imm:$src),
+                 "or{l}\t{$src, $dst|$dst, $src}",
+                 [(store (or (load addr:$dst), i32immSExt8:$src), addr:$dst),
+                  (implicit EFLAGS)]>;
+                  
+  def OR8i8 : Ii8 <0x0C, RawFrm, (outs), (ins i8imm:$src),
+                   "or{b}\t{$src, %al|%al, $src}", []>;
+  def OR16i16 : Ii16 <0x0D, RawFrm, (outs), (ins i16imm:$src),
+                      "or{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+  def OR32i32 : Ii32 <0x0D, RawFrm, (outs), (ins i32imm:$src),
+                      "or{l}\t{$src, %eax|%eax, $src}", []>;
+} // isTwoAddress = 0
+
+
+// Register-register XOR (opcodes 0x30/0x31, MRMDestReg encoding).  Marked
+// commutable so the two-address pass may swap operands to avoid a copy.
+let isCommutable = 1 in { // X = XOR Y, Z --> X = XOR Z, Y
+  def XOR8rr   : I<0x30, MRMDestReg,
+                   (outs GR8 :$dst), (ins GR8 :$src1, GR8 :$src2),
+                   "xor{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (xor GR8:$src1, GR8:$src2)),
+                    (implicit EFLAGS)]>;
+  def XOR16rr  : I<0x31, MRMDestReg, 
+                   (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), 
+                   "xor{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (xor GR16:$src1, GR16:$src2)),
+                    (implicit EFLAGS)]>, OpSize;
+  def XOR32rr  : I<0x31, MRMDestReg, 
+                   (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), 
+                   "xor{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, GR32:$src2)),
+                    (implicit EFLAGS)]>;
+} // isCommutable = 1
+
+// XOR instructions with the destination register in REG and the source register
+//   in R/M.  Included for the disassembler.
+// (Opcodes 0x32/0x33 mirror 0x30/0x31 with the operand roles swapped; no
+// patterns, since codegen always selects the MRMDestReg forms above.)
+def XOR8rr_REV : I<0x32, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                  "xor{b}\t{$src2, $dst|$dst, $src2}", []>;
+def XOR16rr_REV : I<0x33, MRMSrcReg, (outs GR16:$dst), 
+                    (ins GR16:$src1, GR16:$src2),
+                   "xor{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def XOR32rr_REV : I<0x33, MRMSrcReg, (outs GR32:$dst), 
+                    (ins GR32:$src1, GR32:$src2),
+                   "xor{l}\t{$src2, $dst|$dst, $src2}", []>;
+
+// Register ^= memory forms (load folded into the XOR).
+def XOR8rm   : I<0x32, MRMSrcMem , 
+                 (outs GR8 :$dst), (ins GR8:$src1, i8mem :$src2), 
+                 "xor{b}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (xor GR8:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>;
+def XOR16rm  : I<0x33, MRMSrcMem , 
+                 (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2), 
+                 "xor{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (xor GR16:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>,
+                 OpSize;
+def XOR32rm  : I<0x33, MRMSrcMem , 
+                 (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2), 
+                 "xor{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (xor GR32:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>;
+
+// Register ^= immediate: 0x80/0x81 with ModRM /6 (MRM6r) for full-width
+// immediates, 0x83 /6 for the sign-extended 8-bit short forms.
+def XOR8ri   : Ii8<0x80, MRM6r, 
+                   (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2), 
+                   "xor{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (xor GR8:$src1, imm:$src2)),
+                    (implicit EFLAGS)]>;
+def XOR16ri  : Ii16<0x81, MRM6r, 
+                    (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2), 
+                    "xor{w}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (xor GR16:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>, OpSize;
+def XOR32ri  : Ii32<0x81, MRM6r, 
+                    (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2), 
+                    "xor{l}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (xor GR32:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+def XOR16ri8 : Ii8<0x83, MRM6r, 
+                   (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                   "xor{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (xor GR16:$src1, i16immSExt8:$src2)),
+                    (implicit EFLAGS)]>,
+                   OpSize;
+def XOR32ri8 : Ii8<0x83, MRM6r, 
+                   (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                   "xor{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (xor GR32:$src1, i32immSExt8:$src2)),
+                    (implicit EFLAGS)]>;
+
+// Memory-destination (read-modify-write) XOR forms; destination is an address
+// operand, so these are not two-address.
+let isTwoAddress = 0 in {
+  def XOR8mr   : I<0x30, MRMDestMem,
+                   (outs), (ins i8mem :$dst, GR8 :$src),
+                   "xor{b}\t{$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR8:$src), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def XOR16mr  : I<0x31, MRMDestMem,
+                   (outs), (ins i16mem:$dst, GR16:$src),
+                   "xor{w}\t{$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR16:$src), addr:$dst),
+                    (implicit EFLAGS)]>,
+                   OpSize;
+  def XOR32mr  : I<0x31, MRMDestMem,
+                   (outs), (ins i32mem:$dst, GR32:$src),
+                   "xor{l}\t{$src, $dst|$dst, $src}",
+                   [(store (xor (load addr:$dst), GR32:$src), addr:$dst),
+                    (implicit EFLAGS)]>;
+  // Memory ^= immediate (0x80/0x81 /6) and the 0x83 /6 sign-extended-imm8
+  // short forms.
+  def XOR8mi   : Ii8<0x80, MRM6m,
+                     (outs), (ins i8mem :$dst, i8imm :$src),
+                     "xor{b}\t{$src, $dst|$dst, $src}",
+                    [(store (xor (loadi8 addr:$dst), imm:$src), addr:$dst),
+                     (implicit EFLAGS)]>;
+  def XOR16mi  : Ii16<0x81, MRM6m,
+                      (outs), (ins i16mem:$dst, i16imm:$src),
+                      "xor{w}\t{$src, $dst|$dst, $src}",
+                   [(store (xor (loadi16 addr:$dst), imm:$src), addr:$dst),
+                    (implicit EFLAGS)]>,
+                      OpSize;
+  def XOR32mi  : Ii32<0x81, MRM6m,
+                      (outs), (ins i32mem:$dst, i32imm:$src),
+                      "xor{l}\t{$src, $dst|$dst, $src}",
+                   [(store (xor (loadi32 addr:$dst), imm:$src), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def XOR16mi8 : Ii8<0x83, MRM6m,
+                     (outs), (ins i16mem:$dst, i16i8imm :$src),
+                     "xor{w}\t{$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i16immSExt8:$src), addr:$dst),
+                  (implicit EFLAGS)]>,
+                     OpSize;
+  def XOR32mi8 : Ii8<0x83, MRM6m,
+                     (outs), (ins i32mem:$dst, i32i8imm :$src),
+                     "xor{l}\t{$src, $dst|$dst, $src}",
+                 [(store (xor (load addr:$dst), i32immSExt8:$src), addr:$dst),
+                  (implicit EFLAGS)]>;
+                  
+  // Accumulator short forms (xor AL/AX/EAX, imm); assembler/disassembler
+  // only -- no patterns, implicit accumulator operands not modeled.
+  def XOR8i8 : Ii8 <0x34, RawFrm, (outs), (ins i8imm:$src),
+                   "xor{b}\t{$src, %al|%al, $src}", []>;
+  def XOR16i16 : Ii16 <0x35, RawFrm, (outs), (ins i16imm:$src),
+                        "xor{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+  def XOR32i32 : Ii32 <0x35, RawFrm, (outs), (ins i32imm:$src),
+                        "xor{l}\t{$src, %eax|%eax, $src}", []>;
+} // isTwoAddress = 0
+} // Defs = [EFLAGS]
+
+// Shift instructions
+// Encoding scheme shared by all shift/rotate groups below:
+//   0xD2/0xD3 = shift by CL, 0xC0/0xC1 = shift by imm8, 0xD0/0xD1 = shift
+//   by 1; the ModRM reg field (MRM4r/MRM4m here) selects the operation
+//   (/4 = SHL).  All of them clobber EFLAGS via the enclosing let.
+let Defs = [EFLAGS] in {
+let Uses = [CL] in {
+def SHL8rCL  : I<0xD2, MRM4r, (outs GR8 :$dst), (ins GR8 :$src),
+                 "shl{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (shl GR8:$src, CL))]>;
+def SHL16rCL : I<0xD3, MRM4r, (outs GR16:$dst), (ins GR16:$src),
+                 "shl{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (shl GR16:$src, CL))]>, OpSize;
+def SHL32rCL : I<0xD3, MRM4r, (outs GR32:$dst), (ins GR32:$src),
+                 "shl{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (shl GR32:$src, CL))]>;
+} // Uses = [CL]
+
+def SHL8ri   : Ii8<0xC0, MRM4r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "shl{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (shl GR8:$src1, (i8 imm:$src2)))]>;
+// 16/32-bit shl-by-imm can be rewritten as LEA by the two-address pass.
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+def SHL16ri  : Ii8<0xC1, MRM4r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "shl{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (shl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHL32ri  : Ii8<0xC1, MRM4r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "shl{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (shl GR32:$src1, (i8 imm:$src2)))]>;
+
+// NOTE: We don't include patterns for shifts of a register by one, because
+// 'add reg,reg' is cheaper.
+
+// NOTE(review): the pattern-less shift-by-1 register forms sit inside the
+// isConvertibleToThreeAddress block above -- confirm that is intended.
+def SHL8r1   : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shl{b}\t$dst", []>;
+def SHL16r1  : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shl{w}\t$dst", []>, OpSize;
+def SHL32r1  : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shl{l}\t$dst", []>;
+
+} // isConvertibleToThreeAddress = 1
+
+// Memory-destination shift-left forms (read-modify-write).
+let isTwoAddress = 0 in {
+  let Uses = [CL] in {
+  def SHL8mCL  : I<0xD2, MRM4m, (outs), (ins i8mem :$dst),
+                   "shl{b}\t{%cl, $dst|$dst, CL}",
+                   [(store (shl (loadi8 addr:$dst), CL), addr:$dst)]>;
+  def SHL16mCL : I<0xD3, MRM4m, (outs), (ins i16mem:$dst),
+                   "shl{w}\t{%cl, $dst|$dst, CL}",
+                   [(store (shl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+  def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
+                   "shl{l}\t{%cl, $dst|$dst, CL}",
+                   [(store (shl (loadi32 addr:$dst), CL), addr:$dst)]>;
+  }
+  def SHL8mi   : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, i8imm:$src),
+                     "shl{b}\t{$src, $dst|$dst, $src}",
+                  [(store (shl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SHL16mi  : Ii8<0xC1, MRM4m, (outs), (ins i16mem:$dst, i8imm:$src),
+                     "shl{w}\t{$src, $dst|$dst, $src}",
+                 [(store (shl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SHL32mi  : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, i8imm:$src),
+                     "shl{l}\t{$src, $dst|$dst, $src}",
+                 [(store (shl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SHL8m1   : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
+                   "shl{b}\t$dst",
+                  [(store (shl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SHL16m1  : I<0xD1, MRM4m, (outs), (ins i16mem:$dst),
+                   "shl{w}\t$dst",
+                 [(store (shl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def SHL32m1  : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
+                   "shl{l}\t$dst",
+                 [(store (shl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Logical shift right (srl): same encoding scheme as SHL but ModRM /5
+// (MRM5r/MRM5m).  Register-by-CL forms first.
+let Uses = [CL] in {
+def SHR8rCL  : I<0xD2, MRM5r, (outs GR8 :$dst), (ins GR8 :$src),
+                 "shr{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (srl GR8:$src, CL))]>;
+def SHR16rCL : I<0xD3, MRM5r, (outs GR16:$dst), (ins GR16:$src),
+                 "shr{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (srl GR16:$src, CL))]>, OpSize;
+def SHR32rCL : I<0xD3, MRM5r, (outs GR32:$dst), (ins GR32:$src),
+                 "shr{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (srl GR32:$src, CL))]>;
+}
+
+// Register shift-right by imm8.
+def SHR8ri   : Ii8<0xC0, MRM5r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                   "shr{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (srl GR8:$src1, (i8 imm:$src2)))]>;
+def SHR16ri  : Ii8<0xC1, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "shr{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (srl GR16:$src1, (i8 imm:$src2)))]>, OpSize;
+def SHR32ri  : Ii8<0xC1, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "shr{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (srl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+// (Unlike SHL, these carry patterns: there is no cheaper equivalent.)
+def SHR8r1   : I<0xD0, MRM5r, (outs GR8:$dst), (ins GR8:$src1),
+                 "shr{b}\t$dst",
+                 [(set GR8:$dst, (srl GR8:$src1, (i8 1)))]>;
+def SHR16r1  : I<0xD1, MRM5r, (outs GR16:$dst), (ins GR16:$src1),
+                 "shr{w}\t$dst",
+                 [(set GR16:$dst, (srl GR16:$src1, (i8 1)))]>, OpSize;
+def SHR32r1  : I<0xD1, MRM5r, (outs GR32:$dst), (ins GR32:$src1),
+                 "shr{l}\t$dst",
+                 [(set GR32:$dst, (srl GR32:$src1, (i8 1)))]>;
+
+// Memory-destination shift-right forms (read-modify-write).
+let isTwoAddress = 0 in {
+  let Uses = [CL] in {
+  def SHR8mCL  : I<0xD2, MRM5m, (outs), (ins i8mem :$dst),
+                   "shr{b}\t{%cl, $dst|$dst, CL}",
+                   [(store (srl (loadi8 addr:$dst), CL), addr:$dst)]>;
+  def SHR16mCL : I<0xD3, MRM5m, (outs), (ins i16mem:$dst),
+                   "shr{w}\t{%cl, $dst|$dst, CL}",
+                   [(store (srl (loadi16 addr:$dst), CL), addr:$dst)]>,
+                   OpSize;
+  def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
+                   "shr{l}\t{%cl, $dst|$dst, CL}",
+                   [(store (srl (loadi32 addr:$dst), CL), addr:$dst)]>;
+  }
+  def SHR8mi   : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src),
+                     "shr{b}\t{$src, $dst|$dst, $src}",
+                  [(store (srl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SHR16mi  : Ii8<0xC1, MRM5m, (outs), (ins i16mem:$dst, i8imm:$src),
+                     "shr{w}\t{$src, $dst|$dst, $src}",
+                 [(store (srl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SHR32mi  : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, i8imm:$src),
+                     "shr{l}\t{$src, $dst|$dst, $src}",
+                 [(store (srl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SHR8m1   : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
+                   "shr{b}\t$dst",
+                  [(store (srl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SHR16m1  : I<0xD1, MRM5m, (outs), (ins i16mem:$dst),
+                   "shr{w}\t$dst",
+                 [(store (srl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,OpSize;
+  def SHR32m1  : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
+                   "shr{l}\t$dst",
+                 [(store (srl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Arithmetic shift right (sra): same encoding scheme, ModRM /7
+// (MRM7r/MRM7m).  Register-by-CL forms first.
+let Uses = [CL] in {
+def SAR8rCL  : I<0xD2, MRM7r, (outs GR8 :$dst), (ins GR8 :$src),
+                 "sar{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (sra GR8:$src, CL))]>;
+def SAR16rCL : I<0xD3, MRM7r, (outs GR16:$dst), (ins GR16:$src),
+                 "sar{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (sra GR16:$src, CL))]>, OpSize;
+def SAR32rCL : I<0xD3, MRM7r, (outs GR32:$dst), (ins GR32:$src),
+                 "sar{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (sra GR32:$src, CL))]>;
+}
+
+// Register arithmetic shift-right by imm8.
+def SAR8ri   : Ii8<0xC0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "sar{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (sra GR8:$src1, (i8 imm:$src2)))]>;
+def SAR16ri  : Ii8<0xC1, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "sar{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sra GR16:$src1, (i8 imm:$src2)))]>,
+                   OpSize;
+def SAR32ri  : Ii8<0xC1, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "sar{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sra GR32:$src1, (i8 imm:$src2)))]>;
+
+// Shift by 1
+def SAR8r1   : I<0xD0, MRM7r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "sar{b}\t$dst",
+                 [(set GR8:$dst, (sra GR8:$src1, (i8 1)))]>;
+def SAR16r1  : I<0xD1, MRM7r, (outs GR16:$dst), (ins GR16:$src1),
+                 "sar{w}\t$dst",
+                 [(set GR16:$dst, (sra GR16:$src1, (i8 1)))]>, OpSize;
+def SAR32r1  : I<0xD1, MRM7r, (outs GR32:$dst), (ins GR32:$src1),
+                 "sar{l}\t$dst",
+                 [(set GR32:$dst, (sra GR32:$src1, (i8 1)))]>;
+
+// Memory-destination arithmetic shift-right forms (read-modify-write).
+let isTwoAddress = 0 in {
+  let Uses = [CL] in {
+  def SAR8mCL  : I<0xD2, MRM7m, (outs), (ins i8mem :$dst),
+                   "sar{b}\t{%cl, $dst|$dst, CL}",
+                   [(store (sra (loadi8 addr:$dst), CL), addr:$dst)]>;
+  def SAR16mCL : I<0xD3, MRM7m, (outs), (ins i16mem:$dst),
+                   "sar{w}\t{%cl, $dst|$dst, CL}",
+                   [(store (sra (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+  def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), 
+                   "sar{l}\t{%cl, $dst|$dst, CL}",
+                   [(store (sra (loadi32 addr:$dst), CL), addr:$dst)]>;
+  }
+  def SAR8mi   : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, i8imm:$src),
+                     "sar{b}\t{$src, $dst|$dst, $src}",
+                  [(store (sra (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def SAR16mi  : Ii8<0xC1, MRM7m, (outs), (ins i16mem:$dst, i8imm:$src),
+                     "sar{w}\t{$src, $dst|$dst, $src}",
+                 [(store (sra (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def SAR32mi  : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, i8imm:$src),
+                     "sar{l}\t{$src, $dst|$dst, $src}",
+                 [(store (sra (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Shift by 1
+  def SAR8m1   : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
+                   "sar{b}\t$dst",
+                  [(store (sra (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def SAR16m1  : I<0xD1, MRM7m, (outs), (ins i16mem:$dst),
+                   "sar{w}\t$dst",
+                 [(store (sra (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def SAR32m1  : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
+                   "sar{l}\t$dst",
+                 [(store (sra (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate instructions
+
+// Rotate-through-carry left, ModRM /2 (MRM2r/MRM2m).  All forms are
+// pattern-less (assembler/disassembler only).
+// NOTE(review): the memory forms put the memory operand in (outs) --
+// e.g. (outs i8mem:$dst), (ins i8mem:$src) -- unlike every other memory
+// form in this file, which uses (outs), (ins mem:$dst).  Verify this is
+// intentional; it looks like a copy/paste inconsistency.
+def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+               "rcl{b}\t{1, $dst|$dst, 1}", []>;
+def RCL8m1 : I<0xD0, MRM2m, (outs i8mem:$dst), (ins i8mem:$src),
+               "rcl{b}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCL8rCL : I<0xD2, MRM2r, (outs GR8:$dst), (ins GR8:$src),
+                "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+def RCL8mCL : I<0xD2, MRM2m, (outs i8mem:$dst), (ins i8mem:$src),
+                "rcl{b}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCL8ri : Ii8<0xC0, MRM2r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL8mi : Ii8<0xC0, MRM2m, (outs i8mem:$dst), (ins i8mem:$src, i8imm:$cnt),
+                 "rcl{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+  
+def RCL16r1 : I<0xD1, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+                "rcl{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+def RCL16m1 : I<0xD1, MRM2m, (outs i16mem:$dst), (ins i16mem:$src),
+                "rcl{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+let Uses = [CL] in {
+def RCL16rCL : I<0xD3, MRM2r, (outs GR16:$dst), (ins GR16:$src),
+                 "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+def RCL16mCL : I<0xD3, MRM2m, (outs i16mem:$dst), (ins i16mem:$src),
+                 "rcl{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+}
+def RCL16ri : Ii8<0xC1, MRM2r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+def RCL16mi : Ii8<0xC1, MRM2m, (outs i16mem:$dst), 
+                  (ins i16mem:$src, i8imm:$cnt),
+                  "rcl{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+
+def RCL32r1 : I<0xD1, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+                "rcl{l}\t{1, $dst|$dst, 1}", []>;
+def RCL32m1 : I<0xD1, MRM2m, (outs i32mem:$dst), (ins i32mem:$src),
+                "rcl{l}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCL32rCL : I<0xD3, MRM2r, (outs GR32:$dst), (ins GR32:$src),
+                 "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+def RCL32mCL : I<0xD3, MRM2m, (outs i32mem:$dst), (ins i32mem:$src),
+                 "rcl{l}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCL32ri : Ii8<0xC1, MRM2r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCL32mi : Ii8<0xC1, MRM2m, (outs i32mem:$dst), 
+                  (ins i32mem:$src, i8imm:$cnt),
+                  "rcl{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+                  
+// Rotate-through-carry right, ModRM /3 (MRM3r/MRM3m).  Pattern-less,
+// assembler/disassembler only.  NOTE(review): same (outs mem) oddity as the
+// RCL memory forms -- the memory operand appears in (outs); verify.
+def RCR8r1 : I<0xD0, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+               "rcr{b}\t{1, $dst|$dst, 1}", []>;
+def RCR8m1 : I<0xD0, MRM3m, (outs i8mem:$dst), (ins i8mem:$src),
+               "rcr{b}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCR8rCL : I<0xD2, MRM3r, (outs GR8:$dst), (ins GR8:$src),
+                "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+def RCR8mCL : I<0xD2, MRM3m, (outs i8mem:$dst), (ins i8mem:$src),
+                "rcr{b}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCR8ri : Ii8<0xC0, MRM3r, (outs GR8:$dst), (ins GR8:$src, i8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR8mi : Ii8<0xC0, MRM3m, (outs i8mem:$dst), (ins i8mem:$src, i8imm:$cnt),
+                 "rcr{b}\t{$cnt, $dst|$dst, $cnt}", []>;
+  
+def RCR16r1 : I<0xD1, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+                "rcr{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+def RCR16m1 : I<0xD1, MRM3m, (outs i16mem:$dst), (ins i16mem:$src),
+                "rcr{w}\t{1, $dst|$dst, 1}", []>, OpSize;
+let Uses = [CL] in {
+def RCR16rCL : I<0xD3, MRM3r, (outs GR16:$dst), (ins GR16:$src),
+                 "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+def RCR16mCL : I<0xD3, MRM3m, (outs i16mem:$dst), (ins i16mem:$src),
+                 "rcr{w}\t{%cl, $dst|$dst, CL}", []>, OpSize;
+}
+def RCR16ri : Ii8<0xC1, MRM3r, (outs GR16:$dst), (ins GR16:$src, i8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+def RCR16mi : Ii8<0xC1, MRM3m, (outs i16mem:$dst), 
+                  (ins i16mem:$src, i8imm:$cnt),
+                  "rcr{w}\t{$cnt, $dst|$dst, $cnt}", []>, OpSize;
+
+def RCR32r1 : I<0xD1, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+                "rcr{l}\t{1, $dst|$dst, 1}", []>;
+def RCR32m1 : I<0xD1, MRM3m, (outs i32mem:$dst), (ins i32mem:$src),
+                "rcr{l}\t{1, $dst|$dst, 1}", []>;
+let Uses = [CL] in {
+def RCR32rCL : I<0xD3, MRM3r, (outs GR32:$dst), (ins GR32:$src),
+                 "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+def RCR32mCL : I<0xD3, MRM3m, (outs i32mem:$dst), (ins i32mem:$src),
+                 "rcr{l}\t{%cl, $dst|$dst, CL}", []>;
+}
+def RCR32ri : Ii8<0xC1, MRM3r, (outs GR32:$dst), (ins GR32:$src, i8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+def RCR32mi : Ii8<0xC1, MRM3m, (outs i32mem:$dst), 
+                  (ins i32mem:$src, i8imm:$cnt),
+                  "rcr{l}\t{$cnt, $dst|$dst, $cnt}", []>;
+
+// Rotate left (rotl), ModRM /0 (MRM0r/MRM0m).  These carry patterns,
+// unlike RCL/RCR above.
+// FIXME: provide shorter instructions when imm8 == 1
+let Uses = [CL] in {
+def ROL8rCL  : I<0xD2, MRM0r, (outs GR8 :$dst), (ins GR8 :$src),
+                 "rol{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (rotl GR8:$src, CL))]>;
+def ROL16rCL : I<0xD3, MRM0r, (outs GR16:$dst), (ins GR16:$src),
+                 "rol{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (rotl GR16:$src, CL))]>, OpSize;
+def ROL32rCL : I<0xD3, MRM0r, (outs GR32:$dst), (ins GR32:$src),
+                 "rol{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (rotl GR32:$src, CL))]>;
+}
+
+// Register rotate-left by imm8.
+def ROL8ri   : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "rol{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
+def ROL16ri  : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "rol{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>, 
+                   OpSize;
+def ROL32ri  : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "rol{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROL8r1   : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "rol{b}\t$dst",
+                 [(set GR8:$dst, (rotl GR8:$src1, (i8 1)))]>;
+def ROL16r1  : I<0xD1, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+                 "rol{w}\t$dst",
+                 [(set GR16:$dst, (rotl GR16:$src1, (i8 1)))]>, OpSize;
+def ROL32r1  : I<0xD1, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+                 "rol{l}\t$dst",
+                 [(set GR32:$dst, (rotl GR32:$src1, (i8 1)))]>;
+
+// Memory-destination rotate-left forms (read-modify-write).
+let isTwoAddress = 0 in {
+  let Uses = [CL] in {
+  def ROL8mCL  : I<0xD2, MRM0m, (outs), (ins i8mem :$dst),
+                   "rol{b}\t{%cl, $dst|$dst, CL}",
+                   [(store (rotl (loadi8 addr:$dst), CL), addr:$dst)]>;
+  def ROL16mCL : I<0xD3, MRM0m, (outs), (ins i16mem:$dst),
+                   "rol{w}\t{%cl, $dst|$dst, CL}",
+                   [(store (rotl (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+  def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
+                   "rol{l}\t{%cl, $dst|$dst, CL}",
+                   [(store (rotl (loadi32 addr:$dst), CL), addr:$dst)]>;
+  }
+  def ROL8mi   : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, i8imm:$src),
+                     "rol{b}\t{$src, $dst|$dst, $src}",
+                 [(store (rotl (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def ROL16mi  : Ii8<0xC1, MRM0m, (outs), (ins i16mem:$dst, i8imm:$src),
+                     "rol{w}\t{$src, $dst|$dst, $src}",
+                [(store (rotl (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def ROL32mi  : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, i8imm:$src),
+                     "rol{l}\t{$src, $dst|$dst, $src}",
+                [(store (rotl (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Rotate by 1
+  def ROL8m1   : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
+                   "rol{b}\t$dst",
+                 [(store (rotl (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def ROL16m1  : I<0xD1, MRM0m, (outs), (ins i16mem:$dst),
+                   "rol{w}\t$dst",
+                [(store (rotl (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def ROL32m1  : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
+                   "rol{l}\t$dst",
+                [(store (rotl (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+// Rotate right (rotr), ModRM /1 (MRM1r/MRM1m).  Mirrors the ROL group.
+let Uses = [CL] in {
+def ROR8rCL  : I<0xD2, MRM1r, (outs GR8 :$dst), (ins GR8 :$src),
+                 "ror{b}\t{%cl, $dst|$dst, CL}",
+                 [(set GR8:$dst, (rotr GR8:$src, CL))]>;
+def ROR16rCL : I<0xD3, MRM1r, (outs GR16:$dst), (ins GR16:$src),
+                 "ror{w}\t{%cl, $dst|$dst, CL}",
+                 [(set GR16:$dst, (rotr GR16:$src, CL))]>, OpSize;
+def ROR32rCL : I<0xD3, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+                 "ror{l}\t{%cl, $dst|$dst, CL}",
+                 [(set GR32:$dst, (rotr GR32:$src, CL))]>;
+}
+
+// Register rotate-right by imm8.
+def ROR8ri   : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, i8imm:$src2),
+                   "ror{b}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
+def ROR16ri  : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, i8imm:$src2),
+                   "ror{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>, 
+                   OpSize;
+def ROR32ri  : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, i8imm:$src2),
+                   "ror{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>;
+
+// Rotate by 1
+def ROR8r1   : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
+                 "ror{b}\t$dst",
+                 [(set GR8:$dst, (rotr GR8:$src1, (i8 1)))]>;
+def ROR16r1  : I<0xD1, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+                 "ror{w}\t$dst",
+                 [(set GR16:$dst, (rotr GR16:$src1, (i8 1)))]>, OpSize;
+def ROR32r1  : I<0xD1, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+                 "ror{l}\t$dst",
+                 [(set GR32:$dst, (rotr GR32:$src1, (i8 1)))]>;
+
+// Memory-destination rotate-right forms (read-modify-write).
+let isTwoAddress = 0 in {
+  let Uses = [CL] in {
+  def ROR8mCL  : I<0xD2, MRM1m, (outs), (ins i8mem :$dst),
+                   "ror{b}\t{%cl, $dst|$dst, CL}",
+                   [(store (rotr (loadi8 addr:$dst), CL), addr:$dst)]>;
+  def ROR16mCL : I<0xD3, MRM1m, (outs), (ins i16mem:$dst),
+                   "ror{w}\t{%cl, $dst|$dst, CL}",
+                   [(store (rotr (loadi16 addr:$dst), CL), addr:$dst)]>, OpSize;
+  def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), 
+                   "ror{l}\t{%cl, $dst|$dst, CL}",
+                   [(store (rotr (loadi32 addr:$dst), CL), addr:$dst)]>;
+  }
+  def ROR8mi   : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, i8imm:$src),
+                     "ror{b}\t{$src, $dst|$dst, $src}",
+                 [(store (rotr (loadi8 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+  def ROR16mi  : Ii8<0xC1, MRM1m, (outs), (ins i16mem:$dst, i8imm:$src),
+                     "ror{w}\t{$src, $dst|$dst, $src}",
+                [(store (rotr (loadi16 addr:$dst), (i8 imm:$src)), addr:$dst)]>,
+                     OpSize;
+  def ROR32mi  : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, i8imm:$src),
+                     "ror{l}\t{$src, $dst|$dst, $src}",
+                [(store (rotr (loadi32 addr:$dst), (i8 imm:$src)), addr:$dst)]>;
+
+  // Rotate by 1
+  def ROR8m1   : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
+                   "ror{b}\t$dst",
+                 [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)]>;
+  def ROR16m1  : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
+                   "ror{w}\t$dst",
+                [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)]>,
+                     OpSize;
+  def ROR32m1  : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
+                   "ror{l}\t$dst",
+                [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)]>;
+}
+
+
+
+// Double shift instructions (generalizations of rotate)
+// SHLD/SHRD register-register forms with the shift count taken implicitly
+// from CL (hence 'Uses = [CL]'; CL does not appear in the ins list).
+// Selected from the target-specific X86shld/X86shrd DAG nodes.
+let Uses = [CL] in {
+def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst), 
+                   (ins GR32:$src1, GR32:$src2),
+                   "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
+                   (ins GR32:$src1, GR32:$src2),
+                   "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>, TB;
+def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst), 
+                   (ins GR16:$src1, GR16:$src2),
+                   "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+                   TB, OpSize;
+def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst), 
+                   (ins GR16:$src1, GR16:$src2),
+                   "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                   [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+                   TB, OpSize;
+}
+
+// SHLD/SHRD register-register forms with an imm8 shift count ($src3).
+let isCommutable = 1 in {  // These instructions commute to each other.
+def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR32:$dst), 
+                     (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))]>,
+                 TB;
+def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR32:$dst), 
+                     (ins GR32:$src1, GR32:$src2, i8imm:$src3),
+                     "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+                                      (i8 imm:$src3)))]>,
+                 TB;
+def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
+                     (outs GR16:$dst), 
+                     (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))]>,
+                     TB, OpSize;
+def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
+                     (outs GR16:$dst), 
+                     (ins GR16:$src1, GR16:$src2, i8imm:$src3),
+                     "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+                                      (i8 imm:$src3)))]>,
+                     TB, OpSize;
+}
+
+// Memory-destination SHLD/SHRD: load, double-shift, store back to $dst.
+// Not two-address because the destination is a memory operand, not a
+// tied register.
+let isTwoAddress = 0 in {
+  let Uses = [CL] in {
+  def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                     "shld{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                     [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+                       addr:$dst)]>, TB;
+  def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                    "shrd{l}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
+                      addr:$dst)]>, TB;
+  }
+  def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
+                      (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+                      "shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                      TB;
+  def SHRD32mri8 : Ii8<0xAC, MRMDestMem, 
+                       (outs), (ins i32mem:$dst, GR32:$src2, i8imm:$src3),
+                       "shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
+                                         (i8 imm:$src3)), addr:$dst)]>,
+                       TB;
+
+  let Uses = [CL] in {
+  def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                     "shld{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                     [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
+                       addr:$dst)]>, TB, OpSize;
+  def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                    "shrd{w}\t{%cl, $src2, $dst|$dst, $src2, CL}",
+                    [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
+                      addr:$dst)]>, TB, OpSize;
+  }
+  def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
+                      (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+                      "shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                      TB, OpSize;
+  def SHRD16mri8 : Ii8<0xAC, MRMDestMem, 
+                       (outs), (ins i16mem:$dst, GR16:$src2, i8imm:$src3),
+                       "shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
+                                        (i8 imm:$src3)), addr:$dst)]>,
+                       TB, OpSize;
+}
+} // Defs = [EFLAGS]
+
+
+// Arithmetic.
+// Every def in this region both defs EFLAGS (outer 'let') and carries an
+// '(implicit EFLAGS)' result in its pattern so flag consumers can match.
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in {   // X = ADD Y, Z   --> X = ADD Z, Y
+// Register-Register Addition
+def ADD8rr    : I<0x00, MRMDestReg, (outs GR8 :$dst),
+                                    (ins GR8 :$src1, GR8 :$src2),
+                  "add{b}\t{$src2, $dst|$dst, $src2}",
+                  [(set GR8:$dst, (add GR8:$src1, GR8:$src2)),
+                   (implicit EFLAGS)]>;
+
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+// Register-Register Addition
+def ADD16rr  : I<0x01, MRMDestReg, (outs GR16:$dst),
+                                   (ins GR16:$src1, GR16:$src2),
+                 "add{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (add GR16:$src1, GR16:$src2)),
+                  (implicit EFLAGS)]>, OpSize;
+def ADD32rr  : I<0x01, MRMDestReg, (outs GR32:$dst),
+                                   (ins GR32:$src1, GR32:$src2),
+                 "add{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (add GR32:$src1, GR32:$src2)),
+                  (implicit EFLAGS)]>;
+} // end isConvertibleToThreeAddress
+} // end isCommutable
+
+// Register-Memory Addition
+def ADD8rm   : I<0x02, MRMSrcMem, (outs GR8 :$dst),
+                                  (ins GR8 :$src1, i8mem :$src2),
+                 "add{b}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (add GR8:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>;
+def ADD16rm  : I<0x03, MRMSrcMem, (outs GR16:$dst),
+                                  (ins GR16:$src1, i16mem:$src2),
+                 "add{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (add GR16:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>, OpSize;
+def ADD32rm  : I<0x03, MRMSrcMem, (outs GR32:$dst),
+                                  (ins GR32:$src1, i32mem:$src2),
+                 "add{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (add GR32:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>;
+                  
+// Register-Register Addition - Equivalent to the normal rr forms (ADD8rr, 
+//   ADD16rr, and ADD32rr), but differently encoded.
+// These use the 0x02/0x03 (reg, r/m) opcodes with a register r/m operand;
+// they have no selection patterns and exist for encoding/disassembly.
+def ADD8mrmrr: I<0x02, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                 "add{b}\t{$src2, $dst|$dst, $src2}", []>;
+def ADD16mrmrr: I<0x03, MRMSrcReg,(outs GR16:$dst),(ins GR16:$src1, GR16:$src2),
+                  "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+// Fixed: the 32-bit form must use GR32 operands; it previously used GR16
+// (copy-paste from ADD16mrmrr), which contradicted the "add{l}" mnemonic
+// and the 32-bit operand size implied by the missing OpSize prefix.
+def ADD32mrmrr: I<0x03, MRMSrcReg,(outs GR32:$dst),(ins GR32:$src1, GR32:$src2),
+                  "add{l}\t{$src2, $dst|$dst, $src2}", []>;
+
+// Register-Integer Addition
+def ADD8ri    : Ii8<0x80, MRM0r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                    "add{b}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (add GR8:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+
+let isConvertibleToThreeAddress = 1 in {   // Can transform into LEA.
+// Register-Integer Addition
+def ADD16ri  : Ii16<0x81, MRM0r, (outs GR16:$dst),
+                                 (ins GR16:$src1, i16imm:$src2),
+                    "add{w}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (add GR16:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>, OpSize;
+def ADD32ri  : Ii32<0x81, MRM0r, (outs GR32:$dst),
+                                 (ins GR32:$src1, i32imm:$src2),
+                    "add{l}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (add GR32:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+// 0x83 forms: shorter encoding with a sign-extended imm8 operand, matched
+// only when the immediate fits (i16immSExt8/i32immSExt8 predicates).
+def ADD16ri8 : Ii8<0x83, MRM0r, (outs GR16:$dst),
+                                (ins GR16:$src1, i16i8imm:$src2),
+                   "add{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (add GR16:$src1, i16immSExt8:$src2)),
+                    (implicit EFLAGS)]>, OpSize;
+def ADD32ri8 : Ii8<0x83, MRM0r, (outs GR32:$dst),
+                                (ins GR32:$src1, i32i8imm:$src2),
+                   "add{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (add GR32:$src1, i32immSExt8:$src2)),
+                    (implicit EFLAGS)]>;
+}
+
+// Memory-destination ADD forms: read-modify-write patterns (load, add,
+// store back to the same address).
+let isTwoAddress = 0 in {
+  // Memory-Register Addition
+  def ADD8mr   : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                   "add{b}\t{$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR8:$src2), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def ADD16mr  : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   "add{w}\t{$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR16:$src2), addr:$dst),
+                    (implicit EFLAGS)]>, OpSize;
+  def ADD32mr  : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   "add{l}\t{$src2, $dst|$dst, $src2}",
+                   [(store (add (load addr:$dst), GR32:$src2), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def ADD8mi   : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
+                     "add{b}\t{$src2, $dst|$dst, $src2}",
+                   [(store (add (loadi8 addr:$dst), imm:$src2), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def ADD16mi  : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
+                      "add{w}\t{$src2, $dst|$dst, $src2}",
+                  [(store (add (loadi16 addr:$dst), imm:$src2), addr:$dst),
+                   (implicit EFLAGS)]>, OpSize;
+  def ADD32mi  : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
+                      "add{l}\t{$src2, $dst|$dst, $src2}",
+                      [(store (add (loadi32 addr:$dst), imm:$src2), addr:$dst),
+                       (implicit EFLAGS)]>;
+  def ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+                     "add{w}\t{$src2, $dst|$dst, $src2}",
+                     [(store (add (load addr:$dst), i16immSExt8:$src2),
+                                  addr:$dst),
+                      (implicit EFLAGS)]>, OpSize;
+  def ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+                     "add{l}\t{$src2, $dst|$dst, $src2}",
+                  [(store (add (load addr:$dst), i32immSExt8:$src2),
+                               addr:$dst),
+                   (implicit EFLAGS)]>;
+
+  // addition to rAX
+  // Short RawFrm encodings with an implicit %al/%ax/%eax destination; no
+  // selection pattern (empty list), so these are assembler/disassembler-only.
+  def ADD8i8 : Ii8<0x04, RawFrm, (outs), (ins i8imm:$src),
+                   "add{b}\t{$src, %al|%al, $src}", []>;
+  def ADD16i16 : Ii16<0x05, RawFrm, (outs), (ins i16imm:$src),
+                      "add{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+  def ADD32i32 : Ii32<0x05, RawFrm, (outs), (ins i32imm:$src),
+                      "add{l}\t{$src, %eax|%eax, $src}", []>;
+}
+
+// ADC: add with carry-in.  The 'adde' node consumes the incoming carry,
+// hence the extra 'Uses = [EFLAGS]' on top of the enclosing Defs.
+let Uses = [EFLAGS] in {
+let isCommutable = 1 in {  // X = ADC Y, Z --> X = ADC Z, Y
+def ADC8rr   : I<0x10, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                 "adc{b}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (adde GR8:$src1, GR8:$src2))]>;
+def ADC16rr  : I<0x11, MRMDestReg, (outs GR16:$dst),
+                                   (ins GR16:$src1, GR16:$src2),
+                 "adc{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (adde GR16:$src1, GR16:$src2))]>, OpSize;
+def ADC32rr  : I<0x11, MRMDestReg, (outs GR32:$dst),
+                                   (ins GR32:$src1, GR32:$src2),
+                 "adc{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, GR32:$src2))]>;
+}
+
+// Reversed-encoding register forms (0x12/0x13 with a register r/m);
+// no selection patterns.
+def ADC8rr_REV : I<0x12, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                 "adc{b}\t{$src2, $dst|$dst, $src2}", []>;
+def ADC16rr_REV : I<0x13, MRMSrcReg, (outs GR16:$dst), 
+                    (ins GR16:$src1, GR16:$src2),
+                    "adc{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def ADC32rr_REV : I<0x13, MRMSrcReg, (outs GR32:$dst), 
+                    (ins GR32:$src1, GR32:$src2),
+                    "adc{l}\t{$src2, $dst|$dst, $src2}", []>;
+
+def ADC8rm   : I<0x12, MRMSrcMem , (outs GR8:$dst), 
+                                   (ins GR8:$src1, i8mem:$src2),
+                 "adc{b}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (adde GR8:$src1, (load addr:$src2)))]>;
+def ADC16rm  : I<0x13, MRMSrcMem , (outs GR16:$dst),
+                                   (ins GR16:$src1, i16mem:$src2),
+                 "adc{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (adde GR16:$src1, (load addr:$src2)))]>,
+                 OpSize;
+def ADC32rm  : I<0x13, MRMSrcMem , (outs GR32:$dst),
+                                   (ins GR32:$src1, i32mem:$src2),
+                 "adc{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, (load addr:$src2)))]>;
+def ADC8ri   : Ii8<0x80, MRM2r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                    "adc{b}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (adde GR8:$src1, imm:$src2))]>;
+def ADC16ri  : Ii16<0x81, MRM2r, (outs GR16:$dst),
+                                 (ins GR16:$src1, i16imm:$src2),
+                    "adc{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (adde GR16:$src1, imm:$src2))]>, OpSize;
+def ADC16ri8 : Ii8<0x83, MRM2r, (outs GR16:$dst),
+                                (ins GR16:$src1, i16i8imm:$src2),
+                   "adc{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (adde GR16:$src1, i16immSExt8:$src2))]>,
+                 OpSize;
+def ADC32ri  : Ii32<0x81, MRM2r, (outs GR32:$dst),
+                                 (ins GR32:$src1, i32imm:$src2),
+                    "adc{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, imm:$src2))]>;
+def ADC32ri8 : Ii8<0x83, MRM2r, (outs GR32:$dst),
+                                (ins GR32:$src1, i32i8imm:$src2),
+                   "adc{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (adde GR32:$src1, i32immSExt8:$src2))]>;
+
+// Memory-destination and rAX-implicit ADC forms.
+let isTwoAddress = 0 in {
+  def ADC8mr   : I<0x10, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                   "adc{b}\t{$src2, $dst|$dst, $src2}",
+                   [(store (adde (load addr:$dst), GR8:$src2), addr:$dst)]>;
+  def ADC16mr  : I<0x11, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   "adc{w}\t{$src2, $dst|$dst, $src2}",
+                   [(store (adde (load addr:$dst), GR16:$src2), addr:$dst)]>,
+                   OpSize;
+  def ADC32mr  : I<0x11, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                   "adc{l}\t{$src2, $dst|$dst, $src2}",
+                   [(store (adde (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def ADC8mi   : Ii8<0x80, MRM2m, (outs), (ins i8mem:$dst, i8imm:$src2),
+                      "adc{b}\t{$src2, $dst|$dst, $src2}",
+                  [(store (adde (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADC16mi  : Ii16<0x81, MRM2m, (outs), (ins i16mem:$dst, i16imm:$src2),
+                      "adc{w}\t{$src2, $dst|$dst, $src2}",
+                  [(store (adde (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+                  OpSize;
+  def ADC16mi8 : Ii8<0x83, MRM2m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+                     "adc{w}\t{$src2, $dst|$dst, $src2}",
+               [(store (adde (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+               OpSize;
+  def ADC32mi  : Ii32<0x81, MRM2m, (outs), (ins i32mem:$dst, i32imm:$src2),
+                      "adc{l}\t{$src2, $dst|$dst, $src2}",
+                  [(store (adde (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def ADC32mi8 : Ii8<0x83, MRM2m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+                     "adc{l}\t{$src2, $dst|$dst, $src2}",
+               [(store (adde (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+
+  def ADC8i8 : Ii8<0x14, RawFrm, (outs), (ins i8imm:$src),
+                   "adc{b}\t{$src, %al|%al, $src}", []>;
+  def ADC16i16 : Ii16<0x15, RawFrm, (outs), (ins i16imm:$src),
+                      "adc{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+  def ADC32i32 : Ii32<0x15, RawFrm, (outs), (ins i32imm:$src),
+                      "adc{l}\t{$src, %eax|%eax, $src}", []>;
+}
+} // Uses = [EFLAGS]
+
+// Register-Register Subtraction
+// SUB forms mirror the ADD forms above: same operand shapes, 0x28-0x2D /5
+// opcodes, each pattern also producing an implicit EFLAGS result.
+def SUB8rr  : I<0x28, MRMDestReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                "sub{b}\t{$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (sub GR8:$src1, GR8:$src2)),
+                 (implicit EFLAGS)]>;
+def SUB16rr : I<0x29, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+                "sub{w}\t{$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (sub GR16:$src1, GR16:$src2)),
+                 (implicit EFLAGS)]>, OpSize;
+def SUB32rr : I<0x29, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+                "sub{l}\t{$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (sub GR32:$src1, GR32:$src2)),
+                 (implicit EFLAGS)]>;
+
+// Reversed-encoding register forms (0x2A/0x2B); no selection patterns.
+def SUB8rr_REV : I<0x2A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "sub{b}\t{$src2, $dst|$dst, $src2}", []>;
+def SUB16rr_REV : I<0x2B, MRMSrcReg, (outs GR16:$dst), 
+                    (ins GR16:$src1, GR16:$src2),
+                    "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def SUB32rr_REV : I<0x2B, MRMSrcReg, (outs GR32:$dst), 
+                    (ins GR32:$src1, GR32:$src2),
+                    "sub{l}\t{$src2, $dst|$dst, $src2}", []>;
+
+// Register-Memory Subtraction
+def SUB8rm  : I<0x2A, MRMSrcMem, (outs GR8 :$dst),
+                                 (ins GR8 :$src1, i8mem :$src2),
+                "sub{b}\t{$src2, $dst|$dst, $src2}",
+                [(set GR8:$dst, (sub GR8:$src1, (load addr:$src2))),
+                 (implicit EFLAGS)]>;
+def SUB16rm : I<0x2B, MRMSrcMem, (outs GR16:$dst),
+                                 (ins GR16:$src1, i16mem:$src2),
+                "sub{w}\t{$src2, $dst|$dst, $src2}",
+                [(set GR16:$dst, (sub GR16:$src1, (load addr:$src2))),
+                 (implicit EFLAGS)]>, OpSize;
+def SUB32rm : I<0x2B, MRMSrcMem, (outs GR32:$dst),
+                                 (ins GR32:$src1, i32mem:$src2),
+                "sub{l}\t{$src2, $dst|$dst, $src2}",
+                [(set GR32:$dst, (sub GR32:$src1, (load addr:$src2))),
+                 (implicit EFLAGS)]>;
+
+// Register-Integer Subtraction
+def SUB8ri   : Ii8 <0x80, MRM5r, (outs GR8:$dst),
+                                 (ins GR8:$src1, i8imm:$src2),
+                    "sub{b}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (sub GR8:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+def SUB16ri  : Ii16<0x81, MRM5r, (outs GR16:$dst),
+                                 (ins GR16:$src1, i16imm:$src2),
+                    "sub{w}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (sub GR16:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>, OpSize;
+def SUB32ri  : Ii32<0x81, MRM5r, (outs GR32:$dst),
+                                 (ins GR32:$src1, i32imm:$src2),
+                    "sub{l}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sub GR32:$src1, imm:$src2)),
+                     (implicit EFLAGS)]>;
+// 0x83 forms: sign-extended imm8 short encodings.
+def SUB16ri8 : Ii8<0x83, MRM5r, (outs GR16:$dst),
+                                (ins GR16:$src1, i16i8imm:$src2),
+                   "sub{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sub GR16:$src1, i16immSExt8:$src2)),
+                    (implicit EFLAGS)]>, OpSize;
+def SUB32ri8 : Ii8<0x83, MRM5r, (outs GR32:$dst),
+                                (ins GR32:$src1, i32i8imm:$src2),
+                   "sub{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sub GR32:$src1, i32immSExt8:$src2)),
+                    (implicit EFLAGS)]>;
+
+// Memory-destination SUB forms: read-modify-write, plus the short
+// rAX-implicit encodings (pattern-less, assembler/disassembler only).
+let isTwoAddress = 0 in {
+  // Memory-Register Subtraction
+  def SUB8mr   : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
+                   "sub{b}\t{$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR8:$src2), addr:$dst),
+                    (implicit EFLAGS)]>;
+  def SUB16mr  : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                   "sub{w}\t{$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR16:$src2), addr:$dst),
+                    (implicit EFLAGS)]>, OpSize;
+  def SUB32mr  : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), 
+                   "sub{l}\t{$src2, $dst|$dst, $src2}",
+                   [(store (sub (load addr:$dst), GR32:$src2), addr:$dst),
+                    (implicit EFLAGS)]>;
+
+  // Memory-Integer Subtraction
+  def SUB8mi   : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2), 
+                     "sub{b}\t{$src2, $dst|$dst, $src2}",
+                     [(store (sub (loadi8 addr:$dst), imm:$src2), addr:$dst),
+                      (implicit EFLAGS)]>;
+  def SUB16mi  : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2), 
+                      "sub{w}\t{$src2, $dst|$dst, $src2}",
+                      [(store (sub (loadi16 addr:$dst), imm:$src2),addr:$dst),
+                       (implicit EFLAGS)]>, OpSize;
+  def SUB32mi  : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2), 
+                      "sub{l}\t{$src2, $dst|$dst, $src2}",
+                      [(store (sub (loadi32 addr:$dst), imm:$src2),addr:$dst),
+                       (implicit EFLAGS)]>;
+  def SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2), 
+                     "sub{w}\t{$src2, $dst|$dst, $src2}",
+                     [(store (sub (load addr:$dst), i16immSExt8:$src2),
+                             addr:$dst),
+                      (implicit EFLAGS)]>, OpSize;
+  def SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+                     "sub{l}\t{$src2, $dst|$dst, $src2}",
+                     [(store (sub (load addr:$dst), i32immSExt8:$src2),
+                             addr:$dst),
+                      (implicit EFLAGS)]>;
+                      
+  def SUB8i8 : Ii8<0x2C, RawFrm, (outs), (ins i8imm:$src),
+                   "sub{b}\t{$src, %al|%al, $src}", []>;
+  def SUB16i16 : Ii16<0x2D, RawFrm, (outs), (ins i16imm:$src),
+                      "sub{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+  def SUB32i32 : Ii32<0x2D, RawFrm, (outs), (ins i32imm:$src),
+                      "sub{l}\t{$src, %eax|%eax, $src}", []>;
+}
+
+// SBB: subtract with borrow-in.  The 'sube' node consumes the incoming
+// carry, hence 'Uses = [EFLAGS]' in addition to the enclosing Defs.
+let Uses = [EFLAGS] in {
+def SBB8rr     : I<0x18, MRMDestReg, (outs GR8:$dst),
+                                     (ins GR8:$src1, GR8:$src2),
+                  "sbb{b}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR8:$dst, (sube GR8:$src1, GR8:$src2))]>;
+def SBB16rr    : I<0x19, MRMDestReg, (outs GR16:$dst),
+                                     (ins GR16:$src1, GR16:$src2),
+                  "sbb{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (sube GR16:$src1, GR16:$src2))]>, OpSize;
+def SBB32rr    : I<0x19, MRMDestReg, (outs GR32:$dst),
+                                      (ins GR32:$src1, GR32:$src2),
+                  "sbb{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (sube GR32:$src1, GR32:$src2))]>;
+
+// Memory-destination and rAX-implicit SBB forms.
+let isTwoAddress = 0 in {
+  def SBB8mr   : I<0x18, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2), 
+                   "sbb{b}\t{$src2, $dst|$dst, $src2}",
+                   [(store (sube (load addr:$dst), GR8:$src2), addr:$dst)]>;
+  def SBB16mr  : I<0x19, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2), 
+                   "sbb{w}\t{$src2, $dst|$dst, $src2}",
+                   [(store (sube (load addr:$dst), GR16:$src2), addr:$dst)]>,
+                   OpSize;
+  def SBB32mr  : I<0x19, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), 
+                   "sbb{l}\t{$src2, $dst|$dst, $src2}",
+                   [(store (sube (load addr:$dst), GR32:$src2), addr:$dst)]>;
+  def SBB8mi  : Ii8<0x80, MRM3m, (outs), (ins i8mem:$dst, i8imm:$src2), 
+                    "sbb{b}\t{$src2, $dst|$dst, $src2}",
+                   [(store (sube (loadi8 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB16mi  : Ii16<0x81, MRM3m, (outs), (ins i16mem:$dst, i16imm:$src2), 
+                      "sbb{w}\t{$src2, $dst|$dst, $src2}",
+                  [(store (sube (loadi16 addr:$dst), imm:$src2), addr:$dst)]>,
+                  OpSize;
+  def SBB16mi8 : Ii8<0x83, MRM3m, (outs), (ins i16mem:$dst, i16i8imm :$src2), 
+                     "sbb{w}\t{$src2, $dst|$dst, $src2}",
+               [(store (sube (load addr:$dst), i16immSExt8:$src2), addr:$dst)]>,
+               OpSize;
+  def SBB32mi  : Ii32<0x81, MRM3m, (outs), (ins i32mem:$dst, i32imm:$src2), 
+                      "sbb{l}\t{$src2, $dst|$dst, $src2}",
+                  [(store (sube (loadi32 addr:$dst), imm:$src2), addr:$dst)]>;
+  def SBB32mi8 : Ii8<0x83, MRM3m, (outs), (ins i32mem:$dst, i32i8imm :$src2), 
+                     "sbb{l}\t{$src2, $dst|$dst, $src2}",
+               [(store (sube (load addr:$dst), i32immSExt8:$src2), addr:$dst)]>;
+               
+  def SBB8i8 : Ii8<0x1C, RawFrm, (outs), (ins i8imm:$src),
+                   "sbb{b}\t{$src, %al|%al, $src}", []>;
+  def SBB16i16 : Ii16<0x1D, RawFrm, (outs), (ins i16imm:$src),
+                      "sbb{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+  def SBB32i32 : Ii32<0x1D, RawFrm, (outs), (ins i32imm:$src),
+                      "sbb{l}\t{$src, %eax|%eax, $src}", []>;
+}
+
+// Reversed-encoding register forms (0x1A/0x1B); no selection patterns.
+def SBB8rr_REV : I<0x1A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src1, GR8:$src2),
+                   "sbb{b}\t{$src2, $dst|$dst, $src2}", []>;
+def SBB16rr_REV : I<0x1B, MRMSrcReg, (outs GR16:$dst), 
+                    (ins GR16:$src1, GR16:$src2),
+                    "sbb{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize;
+def SBB32rr_REV : I<0x1B, MRMSrcReg, (outs GR32:$dst), 
+                    (ins GR32:$src1, GR32:$src2),
+                    "sbb{l}\t{$src2, $dst|$dst, $src2}", []>;
+
+def SBB8rm   : I<0x1A, MRMSrcMem, (outs GR8:$dst), (ins GR8:$src1, i8mem:$src2),
+                    "sbb{b}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (sube GR8:$src1, (load addr:$src2)))]>;
+def SBB16rm  : I<0x1B, MRMSrcMem, (outs GR16:$dst),
+                                  (ins GR16:$src1, i16mem:$src2),
+                    "sbb{w}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (sube GR16:$src1, (load addr:$src2)))]>,
+                    OpSize;
+def SBB32rm  : I<0x1B, MRMSrcMem, (outs GR32:$dst),
+                                  (ins GR32:$src1, i32mem:$src2),
+                    "sbb{l}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, (load addr:$src2)))]>;
+def SBB8ri   : Ii8<0x80, MRM3r, (outs GR8:$dst), (ins GR8:$src1, i8imm:$src2),
+                    "sbb{b}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR8:$dst, (sube GR8:$src1, imm:$src2))]>;
+def SBB16ri  : Ii16<0x81, MRM3r, (outs GR16:$dst),
+                                 (ins GR16:$src1, i16imm:$src2),
+                    "sbb{w}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR16:$dst, (sube GR16:$src1, imm:$src2))]>, OpSize;
+def SBB16ri8 : Ii8<0x83, MRM3r, (outs GR16:$dst),
+                                (ins GR16:$src1, i16i8imm:$src2),
+                   "sbb{w}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR16:$dst, (sube GR16:$src1, i16immSExt8:$src2))]>,
+                   OpSize;
+def SBB32ri  : Ii32<0x81, MRM3r, (outs GR32:$dst), 
+                                 (ins GR32:$src1, i32imm:$src2),
+                    "sbb{l}\t{$src2, $dst|$dst, $src2}",
+                    [(set GR32:$dst, (sube GR32:$src1, imm:$src2))]>;
+def SBB32ri8 : Ii8<0x83, MRM3r, (outs GR32:$dst),
+                                (ins GR32:$src1, i32i8imm:$src2),
+                   "sbb{l}\t{$src2, $dst|$dst, $src2}",
+                   [(set GR32:$dst, (sube GR32:$src1, i32immSExt8:$src2))]>;
+} // Uses = [EFLAGS]
+} // Defs = [EFLAGS]
+
+// Two-operand IMUL (0x0F 0xAF): two-address register and register-memory
+// forms.  No 8-bit variant exists for this encoding.
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in {  // X = IMUL Y, Z --> X = IMUL Z, Y
+// Register-Register Signed Integer Multiply
+def IMUL16rr : I<0xAF, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1,GR16:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, GR16:$src2)),
+                  (implicit EFLAGS)]>, TB, OpSize;
+def IMUL32rr : I<0xAF, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1,GR32:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, GR32:$src2)),
+                  (implicit EFLAGS)]>, TB;
+}
+
+// Register-Memory Signed Integer Multiply
+def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
+                                  (ins GR16:$src1, i16mem:$src2),
+                 "imul{w}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR16:$dst, (mul GR16:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>, TB, OpSize;
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst), 
+                 (ins GR32:$src1, i32mem:$src2),
+                 "imul{l}\t{$src2, $dst|$dst, $src2}",
+                 [(set GR32:$dst, (mul GR32:$src1, (load addr:$src2))),
+                  (implicit EFLAGS)]>, TB;
+} // Defs = [EFLAGS]
+} // end Two Address instructions
+
+// Surprisingly enough, these are not two address instructions!
+// Three-operand IMUL (0x69 imm16/32, 0x6B sign-extended imm8): the
+// destination is distinct from the source, so these are not two-address.
+let Defs = [EFLAGS] in {
+// Register-Integer Signed Integer Multiply
+def IMUL16rri  : Ii16<0x69, MRMSrcReg,                      // GR16 = GR16*I16
+                      (outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, (mul GR16:$src1, imm:$src2)),
+                       (implicit EFLAGS)]>, OpSize;
+def IMUL32rri  : Ii32<0x69, MRMSrcReg,                      // GR32 = GR32*I32
+                      (outs GR32:$dst), (ins GR32:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, (mul GR32:$src1, imm:$src2)),
+                       (implicit EFLAGS)]>;
+def IMUL16rri8 : Ii8<0x6B, MRMSrcReg,                       // GR16 = GR16*I8
+                     (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, (mul GR16:$src1, i16immSExt8:$src2)),
+                      (implicit EFLAGS)]>, OpSize;
+def IMUL32rri8 : Ii8<0x6B, MRMSrcReg,                       // GR32 = GR32*I8
+                     (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, (mul GR32:$src1, i32immSExt8:$src2)),
+                      (implicit EFLAGS)]>;
+
+// Memory-Integer Signed Integer Multiply
+def IMUL16rmi  : Ii16<0x69, MRMSrcMem,                     // GR16 = [mem16]*I16
+                      (outs GR16:$dst), (ins i16mem:$src1, i16imm:$src2),
+                      "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR16:$dst, (mul (load addr:$src1), imm:$src2)),
+                       (implicit EFLAGS)]>, OpSize;
+def IMUL32rmi  : Ii32<0x69, MRMSrcMem,                     // GR32 = [mem32]*I32
+                      (outs GR32:$dst), (ins i32mem:$src1, i32imm:$src2),
+                      "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                      [(set GR32:$dst, (mul (load addr:$src1), imm:$src2)),
+                       (implicit EFLAGS)]>;
+def IMUL16rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR16 = [mem16]*I8
+                     (outs GR16:$dst), (ins i16mem:$src1, i16i8imm :$src2),
+                     "imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR16:$dst, (mul (load addr:$src1),
+                                       i16immSExt8:$src2)),
+                      (implicit EFLAGS)]>, OpSize;
+def IMUL32rmi8 : Ii8<0x6B, MRMSrcMem,                       // GR32 = [mem32]*I8
+                     (outs GR32:$dst), (ins i32mem:$src1, i32i8imm: $src2),
+                     "imul{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set GR32:$dst, (mul (load addr:$src1),
+                                           i32immSExt8:$src2)),
+                      (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+//===----------------------------------------------------------------------===//
+// Test instructions are just like AND, except they don't generate a result.
+//
+let Defs = [EFLAGS] in {
+let isCommutable = 1 in {   // TEST X, Y   --> TEST Y, X
+// NOTE(review): the register-register forms match `and_su` while the memory
+// forms below match plain `and` — presumably `and_su` restricts matching to
+// single-use AND nodes; confirm against the fragment's definition.
+def TEST8rr  : I<0x84, MRMDestReg, (outs),  (ins GR8:$src1, GR8:$src2),
+                     "test{b}\t{$src2, $src1|$src1, $src2}",
+                     [(X86cmp (and_su GR8:$src1, GR8:$src2), 0),
+                      (implicit EFLAGS)]>;
+def TEST16rr : I<0x85, MRMDestReg, (outs),  (ins GR16:$src1, GR16:$src2),
+                     "test{w}\t{$src2, $src1|$src1, $src2}",
+                     [(X86cmp (and_su GR16:$src1, GR16:$src2), 0),
+                      (implicit EFLAGS)]>,
+                 OpSize;
+def TEST32rr : I<0x85, MRMDestReg, (outs),  (ins GR32:$src1, GR32:$src2),
+                     "test{l}\t{$src2, $src1|$src1, $src2}",
+                     [(X86cmp (and_su GR32:$src1, GR32:$src2), 0),
+                      (implicit EFLAGS)]>;
+}
+
+// Accumulator forms (TEST AL/AX/EAX, imm); no patterns — assembler/disassembler
+// only.
+def TEST8i8  : Ii8<0xA8, RawFrm, (outs), (ins i8imm:$src),
+                   "test{b}\t{$src, %al|%al, $src}", []>;
+def TEST16i16 : Ii16<0xA9, RawFrm, (outs), (ins i16imm:$src),
+                     "test{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def TEST32i32 : Ii32<0xA9, RawFrm, (outs), (ins i32imm:$src),
+                     "test{l}\t{$src, %eax|%eax, $src}", []>;
+
+def TEST8rm  : I<0x84, MRMSrcMem, (outs),  (ins GR8 :$src1, i8mem :$src2),
+                     "test{b}\t{$src2, $src1|$src1, $src2}",
+                     [(X86cmp (and GR8:$src1, (loadi8 addr:$src2)), 0),
+                      (implicit EFLAGS)]>;
+def TEST16rm : I<0x85, MRMSrcMem, (outs),  (ins GR16:$src1, i16mem:$src2),
+                     "test{w}\t{$src2, $src1|$src1, $src2}",
+                     [(X86cmp (and GR16:$src1, (loadi16 addr:$src2)), 0),
+                      (implicit EFLAGS)]>, OpSize;
+def TEST32rm : I<0x85, MRMSrcMem, (outs),  (ins GR32:$src1, i32mem:$src2),
+                     "test{l}\t{$src2, $src1|$src1, $src2}",
+                     [(X86cmp (and GR32:$src1, (loadi32 addr:$src2)), 0),
+                      (implicit EFLAGS)]>;
+
+def TEST8ri  : Ii8 <0xF6, MRM0r,                     // flags = GR8  & imm8
+                    (outs),  (ins GR8:$src1, i8imm:$src2),
+                    "test{b}\t{$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and_su GR8:$src1, imm:$src2), 0),
+                     (implicit EFLAGS)]>;
+def TEST16ri : Ii16<0xF7, MRM0r,                     // flags = GR16 & imm16
+                    (outs),  (ins GR16:$src1, i16imm:$src2),
+                    "test{w}\t{$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and_su GR16:$src1, imm:$src2), 0),
+                     (implicit EFLAGS)]>, OpSize;
+def TEST32ri : Ii32<0xF7, MRM0r,                     // flags = GR32 & imm32
+                    (outs),  (ins GR32:$src1, i32imm:$src2),
+                    "test{l}\t{$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and_su GR32:$src1, imm:$src2), 0),
+                     (implicit EFLAGS)]>;
+
+def TEST8mi  : Ii8 <0xF6, MRM0m,                   // flags = [mem8]  & imm8
+                    (outs), (ins i8mem:$src1, i8imm:$src2),
+                    "test{b}\t{$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi8 addr:$src1), imm:$src2), 0),
+                     (implicit EFLAGS)]>;
+def TEST16mi : Ii16<0xF7, MRM0m,                   // flags = [mem16] & imm16
+                    (outs), (ins i16mem:$src1, i16imm:$src2),
+                    "test{w}\t{$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi16 addr:$src1), imm:$src2), 0),
+                     (implicit EFLAGS)]>, OpSize;
+def TEST32mi : Ii32<0xF7, MRM0m,                   // flags = [mem32] & imm32
+                    (outs), (ins i32mem:$src1, i32imm:$src2),
+                    "test{l}\t{$src2, $src1|$src1, $src2}",
+                    [(X86cmp (and (loadi32 addr:$src1), imm:$src2), 0),
+                     (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+
+// Condition code ops, incl. set if equal/not equal/...
+// NOTE(review): architecturally SAHF/LAHF only transfer SF/ZF/AF/PF/CF, not
+// the whole of EFLAGS — the Defs/Uses below model the full register; confirm
+// this coarse modeling is intended.
+let Defs = [EFLAGS], Uses = [AH], neverHasSideEffects = 1 in
+def SAHF     : I<0x9E, RawFrm, (outs),  (ins), "sahf", []>;  // flags = AH
+let Defs = [AH], Uses = [EFLAGS], neverHasSideEffects = 1 in
+def LAHF     : I<0x9F, RawFrm, (outs),  (ins), "lahf", []>;  // AH = flags
+
+let Uses = [EFLAGS] in {
+// Use sbb to materialize carry bit.
+let Defs = [EFLAGS], isCodeGenOnly = 1 in {
+// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
+// However, Pat<> can't replicate the destination reg into the inputs of the
+// result.
+// FIXME: Change these to have encoding Pseudo when X86MCCodeEmitter replaces
+// X86CodeEmitter.
+// These expand to "sbb $dst, $dst": 0 if carry is clear, all-ones if set.
+def SETB_C8r : I<0x18, MRMInitReg, (outs GR8:$dst), (ins), "",
+                 [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C16r : I<0x19, MRMInitReg, (outs GR16:$dst), (ins), "",
+                 [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>,
+                OpSize;
+def SETB_C32r : I<0x19, MRMInitReg, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+} // isCodeGenOnly
+
+// SETcc: set a byte to 0/1 on condition.  "r" forms write a GR8 register,
+// "m" forms store directly to an i8 memory operand.
+def SETEr    : I<0x94, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "sete\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_E, EFLAGS))]>,
+               TB;                        // GR8 = ==
+def SETEm    : I<0x94, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "sete\t$dst",
+                 [(store (X86setcc X86_COND_E, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = ==
+
+def SETNEr   : I<0x95, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setne\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NE, EFLAGS))]>,
+               TB;                        // GR8 = !=
+def SETNEm   : I<0x95, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setne\t$dst",
+                 [(store (X86setcc X86_COND_NE, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = !=
+
+def SETLr    : I<0x9C, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setl\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_L, EFLAGS))]>,
+               TB;                        // GR8 = <  signed
+def SETLm    : I<0x9C, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setl\t$dst",
+                 [(store (X86setcc X86_COND_L, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = <  signed
+
+def SETGEr   : I<0x9D, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setge\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_GE, EFLAGS))]>,
+               TB;                        // GR8 = >= signed
+def SETGEm   : I<0x9D, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setge\t$dst",
+                 [(store (X86setcc X86_COND_GE, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = >= signed
+
+def SETLEr   : I<0x9E, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setle\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_LE, EFLAGS))]>,
+               TB;                        // GR8 = <= signed
+def SETLEm   : I<0x9E, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setle\t$dst",
+                 [(store (X86setcc X86_COND_LE, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = <= signed
+
+def SETGr    : I<0x9F, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setg\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_G, EFLAGS))]>,
+               TB;                        // GR8 = >  signed
+def SETGm    : I<0x9F, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setg\t$dst",
+                 [(store (X86setcc X86_COND_G, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = >  signed
+
+def SETBr    : I<0x92, MRM0r,
+                 (outs GR8   :$dst), (ins),
+                 "setb\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_B, EFLAGS))]>,
+               TB;                        // GR8 = <  unsigned
+def SETBm    : I<0x92, MRM0m,
+                 (outs), (ins i8mem:$dst),
+                 "setb\t$dst",
+                 [(store (X86setcc X86_COND_B, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = <  unsigned
+
+def SETAEr   : I<0x93, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setae\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_AE, EFLAGS))]>,
+               TB;                        // GR8 = >= unsigned
+def SETAEm   : I<0x93, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setae\t$dst",
+                 [(store (X86setcc X86_COND_AE, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = >= unsigned
+
+def SETBEr   : I<0x96, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setbe\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_BE, EFLAGS))]>,
+               TB;                        // GR8 = <= unsigned
+def SETBEm   : I<0x96, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setbe\t$dst",
+                 [(store (X86setcc X86_COND_BE, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = <= unsigned
+
+// Note: SETA is "above" — an UNSIGNED comparison (X86_COND_A), not signed.
+def SETAr    : I<0x97, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "seta\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_A, EFLAGS))]>,
+               TB;                        // GR8 = >  unsigned
+def SETAm    : I<0x97, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "seta\t$dst",
+                 [(store (X86setcc X86_COND_A, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = >  unsigned
+
+def SETSr    : I<0x98, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "sets\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_S, EFLAGS))]>,
+               TB;                        // GR8 = <sign bit>
+def SETSm    : I<0x98, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "sets\t$dst",
+                 [(store (X86setcc X86_COND_S, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = <sign bit>
+def SETNSr   : I<0x99, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setns\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NS, EFLAGS))]>,
+               TB;                        // GR8 = !<sign bit>
+def SETNSm   : I<0x99, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setns\t$dst",
+                 [(store (X86setcc X86_COND_NS, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = !<sign bit>
+
+def SETPr    : I<0x9A, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setp\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_P, EFLAGS))]>,
+               TB;                        // GR8 = parity
+def SETPm    : I<0x9A, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setp\t$dst",
+                 [(store (X86setcc X86_COND_P, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = parity
+def SETNPr   : I<0x9B, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setnp\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NP, EFLAGS))]>,
+               TB;                        // GR8 = not parity
+def SETNPm   : I<0x9B, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setnp\t$dst",
+                 [(store (X86setcc X86_COND_NP, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = not parity
+
+def SETOr    : I<0x90, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "seto\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_O, EFLAGS))]>,
+               TB;                        // GR8 = overflow
+def SETOm    : I<0x90, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "seto\t$dst",
+                 [(store (X86setcc X86_COND_O, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = overflow
+def SETNOr   : I<0x91, MRM0r, 
+                 (outs GR8   :$dst), (ins),
+                 "setno\t$dst",
+                 [(set GR8:$dst, (X86setcc X86_COND_NO, EFLAGS))]>,
+               TB;                        // GR8 = not overflow
+def SETNOm   : I<0x91, MRM0m, 
+                 (outs), (ins i8mem:$dst),
+                 "setno\t$dst",
+                 [(store (X86setcc X86_COND_NO, EFLAGS), addr:$dst)]>,
+               TB;                        // [mem8] = not overflow
+} // Uses = [EFLAGS]
+
+
+// Integer comparisons
+let Defs = [EFLAGS] in {
+// Accumulator forms (CMP AL/AX/EAX, imm); no patterns — assembler/disassembler
+// only.
+def CMP8i8 : Ii8<0x3C, RawFrm, (outs), (ins i8imm:$src),
+                 "cmp{b}\t{$src, %al|%al, $src}", []>;
+def CMP16i16 : Ii16<0x3D, RawFrm, (outs), (ins i16imm:$src),
+                    "cmp{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def CMP32i32 : Ii32<0x3D, RawFrm, (outs), (ins i32imm:$src),
+                    "cmp{l}\t{$src, %eax|%eax, $src}", []>;
+
+def CMP8rr  : I<0x38, MRMDestReg,
+                (outs), (ins GR8 :$src1, GR8 :$src2),
+                "cmp{b}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp GR8:$src1, GR8:$src2), (implicit EFLAGS)]>;
+def CMP16rr : I<0x39, MRMDestReg,
+                (outs), (ins GR16:$src1, GR16:$src2),
+                "cmp{w}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp GR16:$src1, GR16:$src2), (implicit EFLAGS)]>, OpSize;
+def CMP32rr : I<0x39, MRMDestReg,
+                (outs), (ins GR32:$src1, GR32:$src2),
+                "cmp{l}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp GR32:$src1, GR32:$src2), (implicit EFLAGS)]>;
+def CMP8mr  : I<0x38, MRMDestMem,
+                (outs), (ins i8mem :$src1, GR8 :$src2),
+                "cmp{b}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi8 addr:$src1), GR8:$src2),
+                 (implicit EFLAGS)]>;
+def CMP16mr : I<0x39, MRMDestMem,
+                (outs), (ins i16mem:$src1, GR16:$src2),
+                "cmp{w}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi16 addr:$src1), GR16:$src2),
+                 (implicit EFLAGS)]>, OpSize;
+def CMP32mr : I<0x39, MRMDestMem,
+                (outs), (ins i32mem:$src1, GR32:$src2),
+                "cmp{l}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp (loadi32 addr:$src1), GR32:$src2),
+                 (implicit EFLAGS)]>;
+def CMP8rm  : I<0x3A, MRMSrcMem,
+                (outs), (ins GR8 :$src1, i8mem :$src2),
+                "cmp{b}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp GR8:$src1, (loadi8 addr:$src2)),
+                 (implicit EFLAGS)]>;
+def CMP16rm : I<0x3B, MRMSrcMem,
+                (outs), (ins GR16:$src1, i16mem:$src2),
+                "cmp{w}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp GR16:$src1, (loadi16 addr:$src2)),
+                 (implicit EFLAGS)]>, OpSize;
+def CMP32rm : I<0x3B, MRMSrcMem,
+                (outs), (ins GR32:$src1, i32mem:$src2),
+                "cmp{l}\t{$src2, $src1|$src1, $src2}",
+                [(X86cmp GR32:$src1, (loadi32 addr:$src2)),
+                 (implicit EFLAGS)]>;
+// Register-register forms of the r/m-source opcodes (0x3A/0x3B): alternate
+// encodings of CMPrr with no patterns, kept for the disassembler.
+def CMP8mrmrr : I<0x3A, MRMSrcReg, (outs), (ins GR8:$src1, GR8:$src2),
+                  "cmp{b}\t{$src2, $src1|$src1, $src2}", []>;
+def CMP16mrmrr : I<0x3B, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+                   "cmp{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize;
+def CMP32mrmrr : I<0x3B, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+                   "cmp{l}\t{$src2, $src1|$src1, $src2}", []>;
+def CMP8ri  : Ii8<0x80, MRM7r,
+                  (outs), (ins GR8:$src1, i8imm:$src2),
+                  "cmp{b}\t{$src2, $src1|$src1, $src2}",
+                  [(X86cmp GR8:$src1, imm:$src2), (implicit EFLAGS)]>;
+def CMP16ri : Ii16<0x81, MRM7r,
+                   (outs), (ins GR16:$src1, i16imm:$src2),
+                   "cmp{w}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)]>, OpSize;
+def CMP32ri : Ii32<0x81, MRM7r,
+                   (outs), (ins GR32:$src1, i32imm:$src2),
+                   "cmp{l}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR32:$src1, imm:$src2), (implicit EFLAGS)]>;
+def CMP8mi  : Ii8 <0x80, MRM7m,
+                   (outs), (ins i8mem :$src1, i8imm :$src2),
+                   "cmp{b}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi8 addr:$src1), imm:$src2),
+                    (implicit EFLAGS)]>;
+def CMP16mi : Ii16<0x81, MRM7m,
+                   (outs), (ins i16mem:$src1, i16imm:$src2),
+                   "cmp{w}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi16 addr:$src1), imm:$src2),
+                    (implicit EFLAGS)]>, OpSize;
+def CMP32mi : Ii32<0x81, MRM7m,
+                   (outs), (ins i32mem:$src1, i32imm:$src2),
+                   "cmp{l}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi32 addr:$src1), imm:$src2),
+                    (implicit EFLAGS)]>;
+// Short forms with a sign-extended 8-bit immediate (opcode 0x83).
+def CMP16ri8 : Ii8<0x83, MRM7r,
+                   (outs), (ins GR16:$src1, i16i8imm:$src2),
+                   "cmp{w}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)]>, OpSize;
+def CMP16mi8 : Ii8<0x83, MRM7m,
+                   (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                   "cmp{w}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi16 addr:$src1), i16immSExt8:$src2),
+                    (implicit EFLAGS)]>, OpSize;
+def CMP32mi8 : Ii8<0x83, MRM7m,
+                   (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                   "cmp{l}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp (loadi32 addr:$src1), i32immSExt8:$src2),
+                    (implicit EFLAGS)]>;
+def CMP32ri8 : Ii8<0x83, MRM7r,
+                   (outs), (ins GR32:$src1, i32i8imm:$src2),
+                   "cmp{l}\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Bit tests.
+// TODO: BTC, BTR, and BTS
+let Defs = [EFLAGS] in {
+def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+               "bt{w}\t{$src2, $src1|$src1, $src2}",
+               [(X86bt GR16:$src1, GR16:$src2),
+                (implicit EFLAGS)]>, OpSize, TB;
+def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+               "bt{l}\t{$src2, $src1|$src1, $src2}",
+               [(X86bt GR32:$src1, GR32:$src2),
+                (implicit EFLAGS)]>, TB;
+
+// Unlike with the register+register form, the memory+register form of the
+// bt instruction does not ignore the high bits of the index. From ISel's
+// perspective, this is pretty bizarre. Make these instructions disassembly
+// only for now.
+
+def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+               "bt{w}\t{$src2, $src1|$src1, $src2}", 
+//               [(X86bt (loadi16 addr:$src1), GR16:$src2),
+//                (implicit EFLAGS)]
+               []
+               >, OpSize, TB, Requires<[FastBTMem]>;
+def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+               "bt{l}\t{$src2, $src1|$src1, $src2}", 
+//               [(X86bt (loadi32 addr:$src1), GR32:$src2),
+//                (implicit EFLAGS)]
+               []
+               >, TB, Requires<[FastBTMem]>;
+
+def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(X86bt GR16:$src1, i16immSExt8:$src2),
+                 (implicit EFLAGS)]>, OpSize, TB;
+def BT32ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(X86bt GR32:$src1, i32immSExt8:$src2),
+                 (implicit EFLAGS)]>, TB;
+// Note that these instructions don't need FastBTMem because that
+// only applies when the other operand is in a register. When it's
+// an immediate, bt is still fast.
+def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                "bt{w}\t{$src2, $src1|$src1, $src2}",
+                [(X86bt (loadi16 addr:$src1), i16immSExt8:$src2),
+                 (implicit EFLAGS)]>, OpSize, TB;
+def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                "bt{l}\t{$src2, $src1|$src1, $src2}",
+                [(X86bt (loadi32 addr:$src1), i32immSExt8:$src2),
+                 (implicit EFLAGS)]>, TB;
+
+// BTC/BTR/BTS: per the TODO above, these carry no selection patterns yet —
+// they exist only so the assembler/disassembler knows the encodings.
+def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btc{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btc{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+
+def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "btr{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "btr{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+
+def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+                "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+                "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+                "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+                "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
+                    "bts{w}\t{$src2, $src1|$src1, $src2}", []>, OpSize, TB;
+def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
+                    "bts{l}\t{$src2, $src1|$src1, $src2}", []>, TB;
+} // Defs = [EFLAGS]
+
+// Sign/Zero extenders
+// Use movsbl instead of movsbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update.  Actual movsbw included for the disassembler.
+def MOVSX16rr8W : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                    "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVSX16rm8W : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                    "movs{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+// The empty-asm-string variants below are the ones actually selected from
+// patterns; they print as the 32-bit form when lowered (see comment above).
+def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+                   "", [(set GR16:$dst, (sext GR8:$src))]>, TB;
+def MOVSX16rm8 : I<0xBE, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+                   "", [(set GR16:$dst, (sextloadi16i8 addr:$src))]>, TB;
+def MOVSX32rr8 : I<0xBE, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR8:$src))]>, TB;
+def MOVSX32rm8 : I<0xBE, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movs{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i8 addr:$src))]>, TB;
+def MOVSX32rr16: I<0xBF, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sext GR16:$src))]>, TB;
+def MOVSX32rm16: I<0xBF, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movs{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (sextloadi32i16 addr:$src))]>, TB;
+
+// Use movzbl instead of movzbw; we don't care about the high 16 bits
+// of the register here. This has a smaller encoding and avoids a
+// partial-register update.  Actual movzbw included for the disassembler.
+def MOVZX16rr8W : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
+                    "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def MOVZX16rm8W : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem:$src),
+                    "movz{bw|x}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;  
+def MOVZX16rr8 : I<0xB6, MRMSrcReg, (outs GR16:$dst), (ins GR8 :$src),
+                   "", [(set GR16:$dst, (zext GR8:$src))]>, TB;
+def MOVZX16rm8 : I<0xB6, MRMSrcMem, (outs GR16:$dst), (ins i8mem :$src),
+                   "", [(set GR16:$dst, (zextloadi16i8 addr:$src))]>, TB;
+def MOVZX32rr8 : I<0xB6, MRMSrcReg, (outs GR32:$dst), (ins GR8 :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR8:$src))]>, TB;
+def MOVZX32rm8 : I<0xB6, MRMSrcMem, (outs GR32:$dst), (ins i8mem :$src),
+                   "movz{bl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i8 addr:$src))]>, TB;
+def MOVZX32rr16: I<0xB7, MRMSrcReg, (outs GR32:$dst), (ins GR16:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zext GR16:$src))]>, TB;
+def MOVZX32rm16: I<0xB7, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src),
+                   "movz{wl|x}\t{$src, $dst|$dst, $src}",
+                   [(set GR32:$dst, (zextloadi32i16 addr:$src))]>, TB;
+
+// These are the same as the regular MOVZX32rr8 and MOVZX32rm8
+// except that they use GR32_NOREX for the output operand register class
+// instead of GR32. This allows them to operate on h registers on x86-64.
+def MOVZX32_NOREXrr8 : I<0xB6, MRMSrcReg,
+                         (outs GR32_NOREX:$dst), (ins GR8:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         []>, TB;
+let mayLoad = 1 in
+def MOVZX32_NOREXrm8 : I<0xB6, MRMSrcMem,
+                         (outs GR32_NOREX:$dst), (ins i8mem:$src),
+                         "movz{bl|x}\t{$src, $dst|$dst, $src}  # NOREX",
+                         []>, TB;
+
+// Sign extension into the fixed accumulator registers; no selection patterns,
+// these are emitted explicitly (e.g. before divides).
+let neverHasSideEffects = 1 in {
+  let Defs = [AX], Uses = [AL] in
+  def CBW : I<0x98, RawFrm, (outs), (ins),
+              "{cbtw|cbw}", []>, OpSize;   // AX = signext(AL)
+  let Defs = [EAX], Uses = [AX] in
+  def CWDE : I<0x98, RawFrm, (outs), (ins),
+              "{cwtl|cwde}", []>;   // EAX = signext(AX)
+
+  let Defs = [AX,DX], Uses = [AX] in
+  def CWD : I<0x99, RawFrm, (outs), (ins),
+              "{cwtd|cwd}", []>, OpSize; // DX:AX = signext(AX)
+  let Defs = [EAX,EDX], Uses = [EAX] in
+  def CDQ : I<0x99, RawFrm, (outs), (ins),
+              "{cltd|cdq}", []>; // EDX:EAX = signext(EAX)
+}
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map movr0 to xor.
+// FIXME: remove when we can teach regalloc that xor reg, reg is ok.
+// FIXME: Set encoding to pseudo.
+// Note: opcodes 0x30/0x31 are the XOR r/m,r encodings — "mov reg, 0" is
+// emitted as "xor reg, reg".  The EFLAGS def below is why: xor clobbers flags.
+let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
+    isCodeGenOnly = 1 in {
+def MOV8r0   : I<0x30, MRMInitReg, (outs GR8 :$dst), (ins), "",
+                 [(set GR8:$dst, 0)]>;
+
+// We want to rewrite MOV16r0 in terms of MOV32r0, because it's a smaller
+// encoding and avoids a partial-register update sometimes, but doing so
+// at isel time interferes with rematerialization in the current register
+// allocator. For now, this is rewritten when the instruction is lowered
+// to an MCInst.
+def MOV16r0   : I<0x31, MRMInitReg, (outs GR16:$dst), (ins),
+                 "",
+                 [(set GR16:$dst, 0)]>, OpSize;
+                 
+// FIXME: Set encoding to pseudo.
+def MOV32r0  : I<0x31, MRMInitReg, (outs GR32:$dst), (ins), "",
+                 [(set GR32:$dst, 0)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Thread Local Storage Instructions
+//
+
+// All calls clobber the non-callee saved registers. ESP is marked as
+// a use to prevent stack-pointer assignments that appear immediately
+// before calls from potentially appearing dead.
+let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0,
+            MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
+            XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+            XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
+    Uses = [ESP] in
+def TLS_addr32 : I<0, Pseudo, (outs), (ins lea32mem:$sym),
+                  "leal\t$sym, %eax; "
+                  "call\t___tls_get_addr@PLT",
+                  [(X86tlsaddr tls32addr:$sym)]>,
+                  Requires<[In32BitMode]>;
+
+// Segment-override loads through %gs / %fs.  AddedComplexity biases isel
+// toward these patterns when gsload/fsload matches.
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def GS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                   "movl\t%gs:$src, $dst",
+                   [(set GR32:$dst, (gsload addr:$src))]>, SegGS;
+
+let AddedComplexity = 5, isCodeGenOnly = 1 in
+def FS_MOV32rm : I<0x8B, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                   "movl\t%fs:$src, $dst",
+                   [(set GR32:$dst, (fsload addr:$src))]>, SegFS;
+
+//===----------------------------------------------------------------------===//
+// EH Pseudo Instructions
+//
+let isTerminator = 1, isReturn = 1, isBarrier = 1,
+    hasCtrlDep = 1, isCodeGenOnly = 1 in {
+def EH_RETURN   : I<0xC3, RawFrm, (outs), (ins GR32:$addr),
+                    "ret\t#eh_return, addr: $addr",
+                    [(X86ehret GR32:$addr)]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Atomic support
+//
+
+// Atomic swap. These are just normal xchg instructions. But since a memory
+// operand is referenced, the atomicity is ensured.
+// The tied $val = $dst constraint models the in-place exchange.
+let Constraints = "$val = $dst" in {
+def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst), 
+                 (ins GR32:$val, i32mem:$ptr),
+               "xchg{l}\t{$val, $ptr|$ptr, $val}", 
+               [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))]>;
+def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst), 
+                 (ins GR16:$val, i16mem:$ptr),
+               "xchg{w}\t{$val, $ptr|$ptr, $val}", 
+               [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))]>, 
+                OpSize;
+def XCHG8rm  : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
+               "xchg{b}\t{$val, $ptr|$ptr, $val}", 
+               [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))]>;
+
+// Register-register exchanges: assembler support only, no isel patterns.
+def XCHG32rr : I<0x87, MRMSrcReg, (outs GR32:$dst), (ins GR32:$val, GR32:$src),
+                 "xchg{l}\t{$val, $src|$src, $val}", []>;
+def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
+                 "xchg{w}\t{$val, $src|$src, $val}", []>, OpSize;
+def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
+                "xchg{b}\t{$val, $src|$src, $val}", []>;
+}
+
+// Exchange with the accumulator: short 0x90+r (AddRegFrm) encodings.
+def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
+                  "xchg{w}\t{$src, %ax|%ax, $src}", []>, OpSize;
+def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
+                  "xchg{l}\t{$src, %eax|%eax, $src}", []>;
+
+// Atomic compare and swap.
+// The accumulator (EAX/AX/AL) implicitly holds the compare value and
+// receives the old memory value, hence the implicit Defs/Uses below.
+let Defs = [EAX, EFLAGS], Uses = [EAX] in {
+def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
+               "lock\n\t"
+               "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}",
+               [(X86cas addr:$ptr, GR32:$swap, 4)]>, TB, LOCK;
+}
+// cmpxchg8b works on EDX:EAX and (on success) stores ECX:EBX, so all four
+// registers are implicit uses.
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
+               "lock\n\t"
+               "cmpxchg8b\t$ptr",
+               [(X86cas8 addr:$ptr)]>, TB, LOCK;
+}
+
+let Defs = [AX, EFLAGS], Uses = [AX] in {
+def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap),
+               "lock\n\t"
+               "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}",
+               [(X86cas addr:$ptr, GR16:$swap, 2)]>, TB, OpSize, LOCK;
+}
+let Defs = [AL, EFLAGS], Uses = [AL] in {
+def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
+               "lock\n\t"
+               "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}",
+               [(X86cas addr:$ptr, GR8:$swap, 1)]>, TB, LOCK;
+}
+
+// Atomic exchange and add
+// lock xadd stores the sum to memory and returns the old memory value in
+// $dst (tied to $val via the constraint).
+let Constraints = "$val = $dst", Defs = [EFLAGS] in {
+def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
+               "lock\n\t"
+               "xadd{l}\t{$val, $ptr|$ptr, $val}",
+               [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))]>,
+                TB, LOCK;
+def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr),
+               "lock\n\t"
+               "xadd{w}\t{$val, $ptr|$ptr, $val}",
+               [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))]>,
+                TB, OpSize, LOCK;
+def LXADD8  : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
+               "lock\n\t"
+               "xadd{b}\t{$val, $ptr|$ptr, $val}",
+               [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))]>,
+                TB, LOCK;
+}
+
+// Non-locked xadd / cmpxchg forms: assembler support only, no patterns.
+def XADD8rr : I<0xC0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def XADD16rr : I<0xC1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def XADD32rr  : I<0xC1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def XADD8rm   : I<0xC0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                 "xadd{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def XADD16rm  : I<0xC1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                 "xadd{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def XADD32rm  : I<0xC1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                 "xadd{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def CMPXCHG8rr : I<0xB0, MRMDestReg, (outs GR8:$dst), (ins GR8:$src),
+                   "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def CMPXCHG16rr : I<0xB1, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
+                    "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def CMPXCHG32rr  : I<0xB1, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def CMPXCHG8rm   : I<0xB0, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src),
+                     "cmpxchg{b}\t{$src, $dst|$dst, $src}", []>, TB;
+def CMPXCHG16rm  : I<0xB1, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
+                     "cmpxchg{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def CMPXCHG32rm  : I<0xB1, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                     "cmpxchg{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Unlocked cmpxchg8b; register Defs/Uses as for LCMPXCHG8B above.
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in
+def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
+                  "cmpxchg8b\t$dst", []>, TB;
+
+// Optimized codegen when the non-memory output is not used.
+// FIXME: Use normal add / sub instructions and add lock prefix dynamically.
+// Every form below emits an explicit "lock" prefix line ahead of the ALU
+// op; none of them has an isel pattern here.
+let Defs = [EFLAGS] in {
+def LOCK_ADD8mr  : I<0x00, MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+                    "lock\n\t"
+                    "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mr  : I<0x01, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                    "lock\n\t"
+                    "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_ADD32mr  : I<0x01, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+                    "lock\n\t"
+                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD8mi   : Ii8<0x80, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src2),
+                    "lock\n\t"
+                    "add{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD16mi  : Ii16<0x81, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src2),
+                    "lock\n\t"
+                     "add{w}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_ADD32mi  : Ii32<0x81, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src2),
+                    "lock\n\t"
+                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+// 0x83 forms take a sign-extended 8-bit immediate.
+def LOCK_ADD16mi8 : Ii8<0x83, MRM0m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+                    "lock\n\t"
+                    "add{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_ADD32mi8 : Ii8<0x83, MRM0m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+                    "lock\n\t"
+                    "add{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+
+def LOCK_INC8m  : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
+                    "lock\n\t"
+                    "inc{b}\t$dst", []>, LOCK;
+def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
+                    "lock\n\t"
+                    "inc{w}\t$dst", []>, OpSize, LOCK;
+def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
+                    "lock\n\t"
+                    "inc{l}\t$dst", []>, LOCK;
+
+def LOCK_SUB8mr   : I<0x28, MRMDestMem, (outs), (ins i8mem :$dst, GR8 :$src2),
+                    "lock\n\t"
+                    "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mr  : I<0x29, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+                    "lock\n\t"
+                    "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mr  : I<0x29, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2), 
+                    "lock\n\t"
+                    "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB8mi   : Ii8<0x80, MRM5m, (outs), (ins i8mem :$dst, i8imm:$src2), 
+                    "lock\n\t"
+                    "sub{b}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mi  : Ii16<0x81, MRM5m, (outs), (ins i16mem:$dst, i16imm:$src2), 
+                    "lock\n\t"
+                    "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mi  : Ii32<0x81, MRM5m, (outs), (ins i32mem:$dst, i32imm:$src2), 
+                    "lock\n\t"
+                     "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+def LOCK_SUB16mi8 : Ii8<0x83, MRM5m, (outs), (ins i16mem:$dst, i16i8imm :$src2),
+                    "lock\n\t"
+                     "sub{w}\t{$src2, $dst|$dst, $src2}", []>, OpSize, LOCK;
+def LOCK_SUB32mi8 : Ii8<0x83, MRM5m, (outs), (ins i32mem:$dst, i32i8imm :$src2),
+                    "lock\n\t"
+                     "sub{l}\t{$src2, $dst|$dst, $src2}", []>, LOCK;
+
+def LOCK_DEC8m  : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
+                    "lock\n\t"
+                    "dec{b}\t$dst", []>, LOCK;
+def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
+                    "lock\n\t"
+                    "dec{w}\t$dst", []>, OpSize, LOCK;
+def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
+                    "lock\n\t"
+                    "dec{l}\t$dst", []>, LOCK;
+}
+
+// Atomic exchange, and, or, xor
+// Pseudos expanded via a custom inserter (usesCustomInserter = 1); the
+// operand value is tied to the result ($val = $dst).
+let Constraints = "$val = $dst", Defs = [EFLAGS],
+                  usesCustomInserter = 1 in {
+def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMAND32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>;
+def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMOR32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>;
+def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMXOR32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>;
+def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMNAND32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>;
+def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
+               "#ATOMMIN32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>;
+def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMMAX32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMUMIN32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>;
+def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
+               "#ATOMUMAX32 PSEUDO!", 
+               [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>;
+
+// 16-bit variants of the same pseudos.
+def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+               "#ATOMAND16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>;
+def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+               "#ATOMOR16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>;
+def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+               "#ATOMXOR16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>;
+def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+               "#ATOMNAND16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>;
+def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
+               "#ATOMMIN16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>;
+def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+               "#ATOMMAX16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+               "#ATOMUMIN16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>;
+def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
+               "#ATOMUMAX16 PSEUDO!", 
+               [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>;
+
+// 8-bit variants; note no min/max forms exist at this width.
+def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+               "#ATOMAND8 PSEUDO!", 
+               [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>;
+def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+               "#ATOMOR8 PSEUDO!", 
+               [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>;
+def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+               "#ATOMXOR8 PSEUDO!", 
+               [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>;
+def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
+               "#ATOMNAND8 PSEUDO!", 
+               [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>;
+}
+
+// 64-bit atomic RMW pseudos operating on an i64 in memory, with the value
+// split across two GR32 halves ($val1/$val2 tied to $dst1/$dst2); selected
+// and expanded by the custom inserter.
+let Constraints = "$val1 = $dst1, $val2 = $dst2", 
+                  Defs = [EFLAGS, EAX, EBX, ECX, EDX],
+                  Uses = [EAX, EBX, ECX, EDX],
+                  mayLoad = 1, mayStore = 1,
+                  usesCustomInserter = 1 in {
+def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+               "#ATOMAND6432 PSEUDO!", []>;
+def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+               "#ATOMOR6432 PSEUDO!", []>;
+def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+               "#ATOMXOR6432 PSEUDO!", []>;
+def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+               "#ATOMNAND6432 PSEUDO!", []>;
+def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+               "#ATOMADD6432 PSEUDO!", []>;
+def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+               "#ATOMSUB6432 PSEUDO!", []>;
+def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+                               (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+               "#ATOMSWAP6432 PSEUDO!", []>;
+}
+
+// Segmentation support instructions.
+// None of these have isel patterns; they exist for assembler support.
+
+def LAR16rm : I<0x02, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), 
+                "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LAR16rr : I<0x02, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lar{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+
+// i16mem operand in LAR32rm and GR32 operand in LAR32rr is not a typo.
+def LAR32rm : I<0x02, MRMSrcMem, (outs GR32:$dst), (ins i16mem:$src), 
+                "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LAR32rr : I<0x02, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lar{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+def LSL16rm : I<0x03, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize; 
+def LSL16rr : I<0x03, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
+                "lsl{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LSL32rm : I<0x03, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB; 
+def LSL32rr : I<0x03, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+                "lsl{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// NOTE(review): invlpg (0F 01 /7) normally takes a memory operand; this
+// RawFrm form encodes none -- confirm this is intentional.
+def INVLPG : I<0x01, RawFrm, (outs), (ins), "invlpg", []>, TB;
+
+// Store/load task register.
+def STRr : I<0x00, MRM1r, (outs GR16:$dst), (ins),
+             "str{w}\t{$dst}", []>, TB;
+def STRm : I<0x00, MRM1m, (outs i16mem:$dst), (ins),
+             "str{w}\t{$dst}", []>, TB;
+def LTRr : I<0x00, MRM3r, (outs), (ins GR16:$src),
+             "ltr{w}\t{$src}", []>, TB;
+def LTRm : I<0x00, MRM3m, (outs), (ins i16mem:$src),
+             "ltr{w}\t{$src}", []>, TB;
+
+// Push/pop of the %fs and %gs segment registers.
+def PUSHFS16 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{w}\t%fs", []>, OpSize, TB;
+def PUSHFS32 : I<0xa0, RawFrm, (outs), (ins),
+                 "push{l}\t%fs", []>, TB;
+def PUSHGS16 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{w}\t%gs", []>, OpSize, TB;
+def PUSHGS32 : I<0xa8, RawFrm, (outs), (ins),
+                 "push{l}\t%gs", []>, TB;
+
+def POPFS16 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{w}\t%fs", []>, OpSize, TB;
+def POPFS32 : I<0xa1, RawFrm, (outs), (ins),
+                "pop{l}\t%fs", []>, TB;
+def POPGS16 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{w}\t%gs", []>, OpSize, TB;
+def POPGS32 : I<0xa9, RawFrm, (outs), (ins),
+                "pop{l}\t%gs", []>, TB;
+
+// Far pointer loads: load a segment:offset pair from an opaque memory blob.
+def LDS16rm : I<0xc5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lds{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def LDS32rm : I<0xc5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lds{l}\t{$src, $dst|$dst, $src}", []>;
+def LSS16rm : I<0xb2, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lss{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LSS32rm : I<0xb2, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lss{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LES16rm : I<0xc4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "les{w}\t{$src, $dst|$dst, $src}", []>, OpSize;
+def LES32rm : I<0xc4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "les{l}\t{$src, $dst|$dst, $src}", []>;
+def LFS16rm : I<0xb4, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lfs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LFS32rm : I<0xb4, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lfs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def LGS16rm : I<0xb5, MRMSrcMem, (outs GR16:$dst), (ins opaque32mem:$src),
+                "lgs{w}\t{$src, $dst|$dst, $src}", []>, TB, OpSize;
+def LGS32rm : I<0xb5, MRMSrcMem, (outs GR32:$dst), (ins opaque48mem:$src),
+                "lgs{l}\t{$src, $dst|$dst, $src}", []>, TB;
+
+// Verify a segment for reading/writing.
+def VERRr : I<0x00, MRM4r, (outs), (ins GR16:$seg),
+              "verr\t$seg", []>, TB;
+def VERRm : I<0x00, MRM4m, (outs), (ins i16mem:$seg),
+              "verr\t$seg", []>, TB;
+def VERWr : I<0x00, MRM5r, (outs), (ins GR16:$seg),
+              "verw\t$seg", []>, TB;
+def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
+              "verw\t$seg", []>, TB;
+
+// Descriptor-table support instructions
+
+def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
+              "sgdt\t$dst", []>, TB;
+def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
+              "sidt\t$dst", []>, TB;
+def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
+                "sldt{w}\t$dst", []>, TB;
+def SLDT16m : I<0x00, MRM0m, (outs i16mem:$dst), (ins),
+                "sldt{w}\t$dst", []>, TB;
+def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
+              "lgdt\t$src", []>, TB;
+def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
+              "lidt\t$src", []>, TB;
+def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
+                "lldt{w}\t$src", []>, TB;
+def LLDT16m : I<0x00, MRM2m, (outs), (ins i16mem:$src),
+                "lldt{w}\t$src", []>, TB;
+
+// Lock instruction prefix
+def LOCK_PREFIX : I<0xF0, RawFrm, (outs),  (ins), "lock", []>;
+
+// Repeat string operation instruction prefixes.
+// These use ECX as the repeat count and read the DF flag in EFLAGS (the
+// direction flag governing the string operations).
+let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
+def REP_PREFIX : I<0xF3, RawFrm, (outs),  (ins), "rep", []>;
+// Repeat while not equal (used with CMPS and SCAS)
+def REPNE_PREFIX : I<0xF2, RawFrm, (outs),  (ins), "repne", []>;
+}
+
+// Segment override instruction prefixes
+def CS_PREFIX : I<0x2E, RawFrm, (outs),  (ins), "cs", []>;
+def SS_PREFIX : I<0x36, RawFrm, (outs),  (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs),  (ins), "ds", []>;
+def ES_PREFIX : I<0x26, RawFrm, (outs),  (ins), "es", []>;
+def FS_PREFIX : I<0x64, RawFrm, (outs),  (ins), "fs", []>;
+def GS_PREFIX : I<0x65, RawFrm, (outs),  (ins), "gs", []>;
+
+// String manipulation instructions
+
+def LODSB : I<0xAC, RawFrm, (outs), (ins), "lodsb", []>;
+def LODSW : I<0xAD, RawFrm, (outs), (ins), "lodsw", []>, OpSize;
+def LODSD : I<0xAD, RawFrm, (outs), (ins), "lods{l|d}", []>;
+
+def OUTSB : I<0x6E, RawFrm, (outs), (ins), "outsb", []>;
+def OUTSW : I<0x6F, RawFrm, (outs), (ins), "outsw", []>, OpSize;
+def OUTSD : I<0x6F, RawFrm, (outs), (ins), "outs{l|d}", []>;
+
+// CPU flow control instructions
+
+def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
+def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", []>, TB;
+
+// FPU control instructions
+
+def FNINIT : I<0xE3, RawFrm, (outs), (ins), "fninit", []>, DB;
+
+// Flag instructions
+
+def CLC : I<0xF8, RawFrm, (outs), (ins), "clc", []>;
+def STC : I<0xF9, RawFrm, (outs), (ins), "stc", []>;
+def CLI : I<0xFA, RawFrm, (outs), (ins), "cli", []>;
+def STI : I<0xFB, RawFrm, (outs), (ins), "sti", []>;
+def CLD : I<0xFC, RawFrm, (outs), (ins), "cld", []>;
+def STD : I<0xFD, RawFrm, (outs), (ins), "std", []>;
+def CMC : I<0xF5, RawFrm, (outs), (ins), "cmc", []>;
+
+def CLTS : I<0x06, RawFrm, (outs), (ins), "clts", []>, TB;
+
+// Table lookup instructions
+
+def XLAT : I<0xD7, RawFrm, (outs), (ins), "xlatb", []>;
+
+// Specialized register support
+
+def WRMSR : I<0x30, RawFrm, (outs), (ins), "wrmsr", []>, TB;
+def RDMSR : I<0x32, RawFrm, (outs), (ins), "rdmsr", []>, TB;
+def RDPMC : I<0x33, RawFrm, (outs), (ins), "rdpmc", []>, TB;
+
+// Store/load machine status word.
+def SMSW16r : I<0x01, MRM4r, (outs GR16:$dst), (ins), 
+                "smsw{w}\t$dst", []>, OpSize, TB;
+def SMSW32r : I<0x01, MRM4r, (outs GR32:$dst), (ins), 
+                "smsw{l}\t$dst", []>, TB;
+// For memory operands, there is only a 16-bit form
+def SMSW16m : I<0x01, MRM4m, (outs i16mem:$dst), (ins),
+                "smsw{w}\t$dst", []>, TB;
+
+def LMSW16r : I<0x01, MRM6r, (outs), (ins GR16:$src),
+                "lmsw{w}\t$src", []>, TB;
+def LMSW16m : I<0x01, MRM6m, (outs), (ins i16mem:$src),
+                "lmsw{w}\t$src", []>, TB;
+
+def CPUID : I<0xA2, RawFrm, (outs), (ins), "cpuid", []>, TB;
+
+// Cache instructions
+
+def INVD : I<0x08, RawFrm, (outs), (ins), "invd", []>, TB;
+def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", []>, TB;
+
+// VMX instructions
+// Assembler-only definitions (no isel patterns).
+
+// 66 0F 38 80
+// NOTE(review): the documented encoding uses a 0F 38 escape; RawFrm with
+// opcode 0x38 plus OpSize/TB cannot emit the final 0x80 byte -- verify.
+def INVEPT : I<0x38, RawFrm, (outs), (ins), "invept", []>, OpSize, TB;
+// 66 0F 38 81
+// NOTE(review): same escape-byte concern as INVEPT above.
+def INVVPID : I<0x38, RawFrm, (outs), (ins), "invvpid", []>, OpSize, TB;
+// 0F 01 C1
+def VMCALL : I<0x01, RawFrm, (outs), (ins), "vmcall", []>, TB;
+def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmclear\t$vmcs", []>, OpSize, TB;
+// 0F 01 C2
+def VMLAUNCH : I<0x01, RawFrm, (outs), (ins), "vmlaunch", []>, TB;
+// 0F 01 C3
+def VMRESUME : I<0x01, RawFrm, (outs), (ins), "vmresume", []>, TB;
+def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
+  "vmptrld\t$vmcs", []>, TB;
+def VMPTRSTm : I<0xC7, MRM7m, (outs i64mem:$vmcs), (ins),
+  "vmptrst\t$vmcs", []>, TB;
+def VMREAD64rm : I<0x78, MRMDestMem, (outs i64mem:$dst), (ins GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmread{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMREAD32rm : I<0x78, MRMDestMem, (outs i32mem:$dst), (ins GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmread{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+  "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB;
+def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+  "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, TB;
+// 0F 01 C4
+// vmxoff shares the 0F 01 opcode space with vmcall/vmlaunch/vmresume above
+// and therefore needs the TB (0F escape) modifier; the previous OpSize
+// would have emitted a 66 operand-size prefix instead of the 0F escape.
+def VMXOFF : I<0x01, RawFrm, (outs), (ins), "vmxoff", []>, TB;
+// NOTE(review): vmxon's documented encoding is F3 0F C7 /6; confirm that
+// XD (F2 prefix) rather than XS (F3 prefix) is really intended here.
+def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
+  "vmxon\t{$vmxon}", []>, XD;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// ConstantPool GlobalAddress, ExternalSymbol, and JumpTable
+// Wrapped target symbols lower directly to a 32-bit immediate move.
+def : Pat<(i32 (X86Wrapper tconstpool  :$dst)), (MOV32ri tconstpool  :$dst)>;
+def : Pat<(i32 (X86Wrapper tjumptable  :$dst)), (MOV32ri tjumptable  :$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaltlsaddr:$dst)),(MOV32ri tglobaltlsaddr:$dst)>;
+def : Pat<(i32 (X86Wrapper tglobaladdr :$dst)), (MOV32ri tglobaladdr :$dst)>;
+def : Pat<(i32 (X86Wrapper texternalsym:$dst)), (MOV32ri texternalsym:$dst)>;
+def : Pat<(i32 (X86Wrapper tblockaddress:$dst)), (MOV32ri tblockaddress:$dst)>;
+
+// Fold a wrapped symbol into the immediate operand of an add.
+def : Pat<(add GR32:$src1, (X86Wrapper tconstpool:$src2)),
+          (ADD32ri GR32:$src1, tconstpool:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tjumptable:$src2)),
+          (ADD32ri GR32:$src1, tjumptable:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tglobaladdr :$src2)),
+          (ADD32ri GR32:$src1, tglobaladdr:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper texternalsym:$src2)),
+          (ADD32ri GR32:$src1, texternalsym:$src2)>;
+def : Pat<(add GR32:$src1, (X86Wrapper tblockaddress:$src2)),
+          (ADD32ri GR32:$src1, tblockaddress:$src2)>;
+
+// Store a wrapped symbol directly as a 32-bit immediate.
+def : Pat<(store (i32 (X86Wrapper tglobaladdr:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tglobaladdr:$src)>;
+def : Pat<(store (i32 (X86Wrapper texternalsym:$src)), addr:$dst),
+          (MOV32mi addr:$dst, texternalsym:$src)>;
+def : Pat<(store (i32 (X86Wrapper tblockaddress:$src)), addr:$dst),
+          (MOV32mi addr:$dst, tblockaddress:$src)>;
+
+// Calls
+// tailcall stuff
+def : Pat<(X86tcret GR32:$dst, imm:$off),
+          (TCRETURNri GR32:$dst, imm:$off)>;
+
+// Tail call through a global address. The result must carry the matched
+// tglobaladdr operand; the previous pattern emitted texternalsym:$dst,
+// which dropped the matched global address and was inconsistent with the
+// tglobaladdr-based CALLpcrel32 pattern below.
+def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
+          (TCRETURNdi tglobaladdr:$dst, imm:$off)>;
+
+// Tail call through an external symbol.
+def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
+          (TCRETURNdi texternalsym:$dst, imm:$off)>;
+
+// Normal calls, with various flavors of addresses.
+def : Pat<(X86call (i32 tglobaladdr:$dst)),
+          (CALLpcrel32 tglobaladdr:$dst)>;
+def : Pat<(X86call (i32 texternalsym:$dst)),
+          (CALLpcrel32 texternalsym:$dst)>;
+// Direct call to an absolute immediate address, only when the target
+// allows it (CallImmAddr predicate).
+def : Pat<(X86call (i32 imm:$dst)),
+          (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>;
+
+// X86 specific add which produces a flag.
+// Flag-producing addc/subc map onto the ordinary ADD/SUB instructions,
+// which set EFLAGS as a side effect.
+def : Pat<(addc GR32:$src1, GR32:$src2),
+          (ADD32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(addc GR32:$src1, (load addr:$src2)),
+          (ADD32rm GR32:$src1, addr:$src2)>;
+def : Pat<(addc GR32:$src1, imm:$src2),
+          (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(addc GR32:$src1, i32immSExt8:$src2),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+def : Pat<(subc GR32:$src1, GR32:$src2),
+          (SUB32rr GR32:$src1, GR32:$src2)>;
+def : Pat<(subc GR32:$src1, (load addr:$src2)),
+          (SUB32rm GR32:$src1, addr:$src2)>;
+def : Pat<(subc GR32:$src1, imm:$src2),
+          (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(subc GR32:$src1, i32immSExt8:$src2),
+          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Comparisons.
+
+// TEST R,R is smaller than CMP R,0
+def : Pat<(parallel (X86cmp GR8:$src1, 0), (implicit EFLAGS)),
+          (TEST8rr GR8:$src1, GR8:$src1)>;
+def : Pat<(parallel (X86cmp GR16:$src1, 0), (implicit EFLAGS)),
+          (TEST16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(parallel (X86cmp GR32:$src1, 0), (implicit EFLAGS)),
+          (TEST32rr GR32:$src1, GR32:$src1)>;
+
+// Conditional moves with folded loads with operands swapped and conditions
+// inverted.
+// Each pair below maps (cmov (load m), r, CC) onto the opposite-condition
+// CMOV with the register operand first, so the load can still be folded.
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_B, EFLAGS),
+          (CMOVAE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_B, EFLAGS),
+          (CMOVAE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_AE, EFLAGS),
+          (CMOVB16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_AE, EFLAGS),
+          (CMOVB32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_E, EFLAGS),
+          (CMOVNE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_E, EFLAGS),
+          (CMOVNE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NE, EFLAGS),
+          (CMOVE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NE, EFLAGS),
+          (CMOVE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_BE, EFLAGS),
+          (CMOVA16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_BE, EFLAGS),
+          (CMOVA32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_A, EFLAGS),
+          (CMOVBE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_A, EFLAGS),
+          (CMOVBE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_L, EFLAGS),
+          (CMOVGE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_L, EFLAGS),
+          (CMOVGE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_GE, EFLAGS),
+          (CMOVL16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_GE, EFLAGS),
+          (CMOVL32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_LE, EFLAGS),
+          (CMOVG16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_LE, EFLAGS),
+          (CMOVG32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_G, EFLAGS),
+          (CMOVLE16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_G, EFLAGS),
+          (CMOVLE32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_P, EFLAGS),
+          (CMOVNP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_P, EFLAGS),
+          (CMOVNP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NP, EFLAGS),
+          (CMOVP16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NP, EFLAGS),
+          (CMOVP32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_S, EFLAGS),
+          (CMOVNS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_S, EFLAGS),
+          (CMOVNS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NS, EFLAGS),
+          (CMOVS16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NS, EFLAGS),
+          (CMOVS32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_O, EFLAGS),
+          (CMOVNO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_O, EFLAGS),
+          (CMOVNO32rm GR32:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, X86_COND_NO, EFLAGS),
+          (CMOVO16rm GR16:$src2, addr:$src1)>;
+def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, X86_COND_NO, EFLAGS),
+          (CMOVO32rm GR32:$src2, addr:$src1)>;
+
+// zextload bool -> zextload byte
+// An i1 in memory occupies a byte; a plain byte load (or MOVZX into the
+// wider register) gives the required zero-extended value.
+def : Pat<(zextloadi8i1  addr:$src), (MOV8rm     addr:$src)>;
+def : Pat<(zextloadi16i1 addr:$src), (MOVZX16rm8 addr:$src)>;
+def : Pat<(zextloadi32i1 addr:$src), (MOVZX32rm8 addr:$src)>;
+
+// extload bool -> extload byte
+// extload leaves the high bits unspecified, so zero-extending is a valid
+// (and partial-register-stall-free) implementation for i8/i16 sources too.
+def : Pat<(extloadi8i1 addr:$src),   (MOV8rm      addr:$src)>;
+def : Pat<(extloadi16i1 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i1 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi16i8 addr:$src),  (MOVZX16rm8  addr:$src)>;
+def : Pat<(extloadi32i8 addr:$src),  (MOVZX32rm8  addr:$src)>;
+def : Pat<(extloadi32i16 addr:$src), (MOVZX32rm16 addr:$src)>;
+
+// anyext. Define these to do an explicit zero-extend to
+// avoid partial-register updates.
+def : Pat<(i16 (anyext GR8 :$src)), (MOVZX16rr8  GR8 :$src)>;
+def : Pat<(i32 (anyext GR8 :$src)), (MOVZX32rr8  GR8 :$src)>;
+def : Pat<(i32 (anyext GR16:$src)), (MOVZX32rr16 GR16:$src)>;
+
+// (and (i32 load), 255) -> (zextload i8)
+// Mask-after-load becomes a single narrow zero-extending load.
+// nvloadi32 presumably restricts this to loads that are safe to narrow
+// (e.g. non-volatile) -- its definition is outside this chunk.
+def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 255))),
+          (MOVZX32rm8 addr:$src)>;
+def : Pat<(i32 (and (nvloadi32 addr:$src), (i32 65535))),
+          (MOVZX32rm16 addr:$src)>;
+
+//===----------------------------------------------------------------------===//
+// Some peepholes
+//===----------------------------------------------------------------------===//
+
+// Odd encoding trick: -128 fits into an 8-bit immediate field while
+// +128 doesn't, so in this special case use a sub instead of an add.
+def : Pat<(add GR16:$src1, 128),
+          (SUB16ri8 GR16:$src1, -128)>;
+def : Pat<(store (add (loadi16 addr:$dst), 128), addr:$dst),
+          (SUB16mi8 addr:$dst, -128)>;
+def : Pat<(add GR32:$src1, 128),
+          (SUB32ri8 GR32:$src1, -128)>;
+def : Pat<(store (add (loadi32 addr:$dst), 128), addr:$dst),
+          (SUB32mi8 addr:$dst, -128)>;
+
+// r & (2^16-1) ==> movz
+// Masking to the low 16 bits is a zero-extend of the 16-bit subregister.
+def : Pat<(and GR32:$src1, 0xffff),
+          (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, x86_subreg_16bit))>;
+// r & (2^8-1) ==> movz
+// The COPY_TO_REGCLASS to GR32_ABCD constrains the source to EAX/EBX/ECX/EDX,
+// the registers whose low byte is addressable here; In32BitMode because this
+// restriction only applies outside 64-bit mode -- TODO confirm the 64-bit
+// variants live elsewhere.
+def : Pat<(and GR32:$src1, 0xff),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, 
+                                                             GR32_ABCD)),
+                                      x86_subreg_8bit))>,
+      Requires<[In32BitMode]>;
+// r & (2^8-1) ==> movz
+def : Pat<(and GR16:$src1, 0xff),
+          (MOVZX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src1, 
+                                                             GR16_ABCD)),
+                                      x86_subreg_8bit))>,
+      Requires<[In32BitMode]>;
+
+// sext_inreg patterns
+// Sign-extend-in-register becomes a MOVSX from the appropriate subregister.
+def : Pat<(sext_inreg GR32:$src, i16),
+          (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
+def : Pat<(sext_inreg GR32:$src, i8),
+          (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, 
+                                                             GR32_ABCD)),
+                                      x86_subreg_8bit))>,
+      Requires<[In32BitMode]>;
+def : Pat<(sext_inreg GR16:$src, i8),
+          (MOVSX16rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, 
+                                                             GR16_ABCD)),
+                                      x86_subreg_8bit))>,
+      Requires<[In32BitMode]>;
+
+// trunc patterns
+// Truncation is free: just read the narrower subregister.
+def : Pat<(i16 (trunc GR32:$src)),
+          (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit)>;
+def : Pat<(i8 (trunc GR32:$src)),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          x86_subreg_8bit)>,
+      Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc GR16:$src)),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          x86_subreg_8bit)>,
+      Requires<[In32BitMode]>;
+
+// h-register tricks
+// (trunc (srl x, 8)) is exactly a read of the high-byte subregister
+// (AH/BH/CH/DH), so constrain the source to the ABCD class and extract
+// x86_subreg_8bit_hi instead of emitting a real shift.
+// NOTE: the value-type cast on COPY_TO_REGCLASS must match the target
+// register class (i16 for GR16_ABCD, i32 for GR32_ABCD) or TableGen's type
+// inference is inconsistent; the two casts below were previously swapped.
+// srl_su presumably requires the shift to have a single use -- its
+// definition is outside this chunk; confirm before relying on it.
+def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                          x86_subreg_8bit_hi)>,
+      Requires<[In32BitMode]>;
+def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
+                          x86_subreg_8bit_hi)>,
+      Requires<[In32BitMode]>;
+// More h-register tricks: zext/anyext/mask of (x >> 8) is a MOVZX of the
+// high-byte subregister, avoiding the shift entirely.
+def : Pat<(srl GR16:$src, (i8 8)),
+          (EXTRACT_SUBREG
+            (MOVZX32rr8
+              (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
+                              x86_subreg_8bit_hi)),
+            x86_subreg_16bit)>,
+      Requires<[In32BitMode]>;
+def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, 
+                                                             GR16_ABCD)),
+                                      x86_subreg_8bit_hi))>,
+      Requires<[In32BitMode]>;
+def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, 
+                                                             GR16_ABCD)),
+                                      x86_subreg_8bit_hi))>,
+      Requires<[In32BitMode]>;
+// (x >> 8) & 0xff is the same extraction spelled with a mask.
+def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, 
+                                                             GR32_ABCD)),
+                                      x86_subreg_8bit_hi))>,
+      Requires<[In32BitMode]>;
+
+// (shl x, 1) ==> (add x, x)
+// ADD is shorter to encode and usually at least as fast as SHL-by-1.
+def : Pat<(shl GR8 :$src1, (i8 1)), (ADD8rr  GR8 :$src1, GR8 :$src1)>;
+def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
+
+// (shl x (and y, 31)) ==> (shl x, y)
+// The hardware already masks the CL shift count to 5 bits for these widths,
+// so an explicit (and CL, 31) on the amount is redundant and can be dropped.
+def : Pat<(shl GR8:$src1, (and CL:$amt, 31)),
+          (SHL8rCL GR8:$src1)>;
+def : Pat<(shl GR16:$src1, (and CL:$amt, 31)),
+          (SHL16rCL GR16:$src1)>;
+def : Pat<(shl GR32:$src1, (and CL:$amt, 31)),
+          (SHL32rCL GR32:$src1)>;
+def : Pat<(store (shl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SHL8mCL addr:$dst)>;
+def : Pat<(store (shl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SHL16mCL addr:$dst)>;
+def : Pat<(store (shl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SHL32mCL addr:$dst)>;
+
+// Same for logical right shift.
+def : Pat<(srl GR8:$src1, (and CL:$amt, 31)),
+          (SHR8rCL GR8:$src1)>;
+def : Pat<(srl GR16:$src1, (and CL:$amt, 31)),
+          (SHR16rCL GR16:$src1)>;
+def : Pat<(srl GR32:$src1, (and CL:$amt, 31)),
+          (SHR32rCL GR32:$src1)>;
+def : Pat<(store (srl (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SHR8mCL addr:$dst)>;
+def : Pat<(store (srl (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SHR16mCL addr:$dst)>;
+def : Pat<(store (srl (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SHR32mCL addr:$dst)>;
+
+// Same for arithmetic right shift.
+def : Pat<(sra GR8:$src1, (and CL:$amt, 31)),
+          (SAR8rCL GR8:$src1)>;
+def : Pat<(sra GR16:$src1, (and CL:$amt, 31)),
+          (SAR16rCL GR16:$src1)>;
+def : Pat<(sra GR32:$src1, (and CL:$amt, 31)),
+          (SAR32rCL GR32:$src1)>;
+def : Pat<(store (sra (loadi8 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SAR8mCL addr:$dst)>;
+def : Pat<(store (sra (loadi16 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SAR16mCL addr:$dst)>;
+def : Pat<(store (sra (loadi32 addr:$dst), (and CL:$amt, 31)), addr:$dst),
+          (SAR32mCL addr:$dst)>;
+
+// Double-shift (funnel shift) patterns: an OR of complementary shifts of two
+// registers is a single SHRD/SHLD. The CL forms match both the direct CL
+// amount and the (i8 (trunc ECX)) form the legalizer may produce.
+// (or (x >> c) | (y << (32 - c))) ==> (shrd32 x, y, c)
+def : Pat<(or (srl GR32:$src1, CL:$amt),
+              (shl GR32:$src2, (sub 32, CL:$amt))),
+          (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), CL:$amt),
+                     (shl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+          (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(or (srl GR32:$src1, (i8 (trunc ECX:$amt))),
+              (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+          (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (srl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+                     (shl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+                 addr:$dst),
+          (SHRD32mrCL addr:$dst, GR32:$src2)>;
+
+// Immediate-count form; note only $amt1 is emitted -- the shrd node's
+// operands presumably guarantee $amt2 is its complement.
+def : Pat<(shrd GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+          (SHRD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi32 addr:$dst), (i8 imm:$amt1),
+                       GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+          (SHRD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (32 - c))) ==> (shld32 x, y, c)
+def : Pat<(or (shl GR32:$src1, CL:$amt),
+              (srl GR32:$src2, (sub 32, CL:$amt))),
+          (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), CL:$amt),
+                     (srl GR32:$src2, (sub 32, CL:$amt))), addr:$dst),
+          (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(or (shl GR32:$src1, (i8 (trunc ECX:$amt))),
+              (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+          (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+
+def : Pat<(store (or (shl (loadi32 addr:$dst), (i8 (trunc ECX:$amt))),
+                     (srl GR32:$src2, (i8 (trunc (sub 32, ECX:$amt))))),
+                 addr:$dst),
+          (SHLD32mrCL addr:$dst, GR32:$src2)>;
+
+def : Pat<(shld GR32:$src1, (i8 imm:$amt1), GR32:$src2, (i8 imm:$amt2)),
+          (SHLD32rri8 GR32:$src1, GR32:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi32 addr:$dst), (i8 imm:$amt1),
+                       GR32:$src2, (i8 imm:$amt2)), addr:$dst),
+          (SHLD32mri8 addr:$dst, GR32:$src2, (i8 imm:$amt1))>;
+
+// 16-bit versions of the same funnel-shift patterns (count in CX).
+// (or (x >> c) | (y << (16 - c))) ==> (shrd16 x, y, c)
+def : Pat<(or (srl GR16:$src1, CL:$amt),
+              (shl GR16:$src2, (sub 16, CL:$amt))),
+          (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), CL:$amt),
+                     (shl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+          (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(or (srl GR16:$src1, (i8 (trunc CX:$amt))),
+              (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+          (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (srl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+                     (shl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+                 addr:$dst),
+          (SHRD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shrd GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+          (SHRD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shrd (loadi16 addr:$dst), (i8 imm:$amt1),
+                       GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+          (SHRD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
+// (or (x << c) | (y >> (16 - c))) ==> (shld16 x, y, c)
+def : Pat<(or (shl GR16:$src1, CL:$amt),
+              (srl GR16:$src2, (sub 16, CL:$amt))),
+          (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), CL:$amt),
+                     (srl GR16:$src2, (sub 16, CL:$amt))), addr:$dst),
+          (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(or (shl GR16:$src1, (i8 (trunc CX:$amt))),
+              (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+          (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+
+def : Pat<(store (or (shl (loadi16 addr:$dst), (i8 (trunc CX:$amt))),
+                     (srl GR16:$src2, (i8 (trunc (sub 16, CX:$amt))))),
+                 addr:$dst),
+          (SHLD16mrCL addr:$dst, GR16:$src2)>;
+
+def : Pat<(shld GR16:$src1, (i8 imm:$amt1), GR16:$src2, (i8 imm:$amt2)),
+          (SHLD16rri8 GR16:$src1, GR16:$src2, (i8 imm:$amt1))>;
+
+def : Pat<(store (shld (loadi16 addr:$dst), (i8 imm:$amt1),
+                       GR16:$src2, (i8 imm:$amt2)), addr:$dst),
+          (SHLD16mri8 addr:$dst, GR16:$src2, (i8 imm:$amt1))>;
+
+// (anyext (setcc_carry)) -> (setcc_carry)
+// SETB_C*r presumably materializes -CF across the whole register (SBB-style),
+// so the wider variant already provides any extension anyext could need.
+def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C16r)>;
+def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
+          (SETB_C32r)>;
+
+// (or x1, x2) -> (add x1, x2) if two operands are known not to share bits.
+// When no bit position is set in both operands, OR and ADD produce the same
+// value; selecting ADD opens up LEA/three-address opportunities.
+let AddedComplexity = 5 in { // Try this before the selecting to OR
+def : Pat<(parallel (or_is_add GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (or_is_add GR32:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (or_is_add GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (or_is_add GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+def : Pat<(parallel (or_is_add GR16:$src1, GR16:$src2),
+                    (implicit EFLAGS)),
+          (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (or_is_add GR32:$src1, GR32:$src2),
+                    (implicit EFLAGS)),
+          (ADD32rr GR32:$src1, GR32:$src2)>;
+} // AddedComplexity
+
+//===----------------------------------------------------------------------===//
+// EFLAGS-defining Patterns
+//===----------------------------------------------------------------------===//
+// These match the flag-producing DAG nodes (X86add_flag etc.) paired with
+// (implicit EFLAGS), selecting the same ALU instructions that already define
+// EFLAGS as a side effect.
+
+// Register-Register Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, GR8:$src2),
+                    (implicit EFLAGS)),
+          (ADD8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, GR16:$src2),
+                    (implicit EFLAGS)),
+          (ADD16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, GR32:$src2),
+                    (implicit EFLAGS)),
+          (ADD32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Addition with EFLAGS result
+def : Pat<(parallel (X86add_flag GR8:$src1, (loadi8 addr:$src2)),
+                    (implicit EFLAGS)),
+          (ADD8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, (loadi16 addr:$src2)),
+                    (implicit EFLAGS)),
+          (ADD16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, (loadi32 addr:$src2)),
+                    (implicit EFLAGS)),
+          (ADD32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Addition with EFLAGS result
+// The *ri8 forms use the shorter sign-extended-imm8 encoding when possible.
+def : Pat<(parallel (X86add_flag GR8:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (ADD8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (ADD16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (ADD32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86add_flag GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (ADD16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86add_flag GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (ADD32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi8 addr:$dst), GR8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), GR16:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), GR32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Addition with EFLAGS result
+def : Pat<(parallel (store (X86add_flag (loadi8 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86add_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (ADD32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// Subtraction with EFLAGS result -- same structure as the addition patterns
+// above, selecting the flag-defining SUB forms.
+// Register-Register Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, GR8:$src2),
+                    (implicit EFLAGS)),
+          (SUB8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, GR16:$src2),
+                    (implicit EFLAGS)),
+          (SUB16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, GR32:$src2),
+                    (implicit EFLAGS)),
+          (SUB32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, (loadi8 addr:$src2)),
+                    (implicit EFLAGS)),
+          (SUB8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, (loadi16 addr:$src2)),
+                    (implicit EFLAGS)),
+          (SUB16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, (loadi32 addr:$src2)),
+                    (implicit EFLAGS)),
+          (SUB32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (X86sub_flag GR8:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (SUB8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (SUB16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (SUB32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86sub_flag GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (SUB16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86sub_flag GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (SUB32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi8 addr:$dst), GR8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), GR16:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), GR32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Subtraction with EFLAGS result
+def : Pat<(parallel (store (X86sub_flag (loadi8 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86sub_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (SUB32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+
+// Register-Register Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, GR16:$src2),
+                    (implicit EFLAGS)),
+          (IMUL16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, GR32:$src2),
+                    (implicit EFLAGS)),
+          (IMUL32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, (loadi16 addr:$src2)),
+                    (implicit EFLAGS)),
+          (IMUL16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, (loadi32 addr:$src2)),
+                    (implicit EFLAGS)),
+          (IMUL32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (IMUL16rri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (IMUL32rri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (IMUL16rri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (IMUL32rri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Integer Signed Integer Multiply with EFLAGS result
+def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), imm:$src2),
+                    (implicit EFLAGS)),
+          (IMUL16rmi addr:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), imm:$src2),
+                    (implicit EFLAGS)),
+          (IMUL32rmi addr:$src1, imm:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi16 addr:$src1), i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (IMUL16rmi8 addr:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86smul_flag (loadi32 addr:$src1), i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (IMUL32rmi8 addr:$src1, i32immSExt8:$src2)>;
+
+// Optimize multiply by 2 with EFLAGS result.
+let AddedComplexity = 2 in {
+def : Pat<(parallel (X86smul_flag GR16:$src1, 2),
+                    (implicit EFLAGS)),
+          (ADD16rr GR16:$src1, GR16:$src1)>;
+
+def : Pat<(parallel (X86smul_flag GR32:$src1, 2),
+                    (implicit EFLAGS)),
+          (ADD32rr GR32:$src1, GR32:$src1)>;
+}
+
+// INC and DEC with EFLAGS result. Note that these do not set CF.
+// The 16/32-bit forms are In32BitMode-only -- presumably because the short
+// one-byte INC/DEC r encodings are unavailable in 64-bit mode and a
+// separate set of patterns handles that case; confirm against the 64-bit
+// pattern file.
+def : Pat<(parallel (X86inc_flag GR8:$src), (implicit EFLAGS)),
+          (INC8r GR8:$src)>;
+def : Pat<(parallel (store (i8 (X86inc_flag (loadi8 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (INC8m addr:$dst)>;
+def : Pat<(parallel (X86dec_flag GR8:$src), (implicit EFLAGS)),
+          (DEC8r GR8:$src)>;
+def : Pat<(parallel (store (i8 (X86dec_flag (loadi8 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (DEC8m addr:$dst)>;
+
+def : Pat<(parallel (X86inc_flag GR16:$src), (implicit EFLAGS)),
+          (INC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i16 (X86inc_flag (loadi16 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (INC16m addr:$dst)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (X86dec_flag GR16:$src), (implicit EFLAGS)),
+          (DEC16r GR16:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i16 (X86dec_flag (loadi16 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (DEC16m addr:$dst)>, Requires<[In32BitMode]>;
+
+def : Pat<(parallel (X86inc_flag GR32:$src), (implicit EFLAGS)),
+          (INC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i32 (X86inc_flag (loadi32 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (INC32m addr:$dst)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (X86dec_flag GR32:$src), (implicit EFLAGS)),
+          (DEC32r GR32:$src)>, Requires<[In32BitMode]>;
+def : Pat<(parallel (store (i32 (X86dec_flag (loadi32 addr:$dst))), addr:$dst),
+                    (implicit EFLAGS)),
+          (DEC32m addr:$dst)>, Requires<[In32BitMode]>;
+
+// OR with EFLAGS result -- same structure as the ADD/SUB sections above.
+// Register-Register Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR8:$src1, GR8:$src2),
+                    (implicit EFLAGS)),
+          (OR8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, GR16:$src2),
+                    (implicit EFLAGS)),
+          (OR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, GR32:$src2),
+                    (implicit EFLAGS)),
+          (OR32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR8:$src1, (loadi8 addr:$src2)),
+                    (implicit EFLAGS)),
+          (OR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, (loadi16 addr:$src2)),
+                    (implicit EFLAGS)),
+          (OR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, (loadi32 addr:$src2)),
+                    (implicit EFLAGS)),
+          (OR32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer Or with EFLAGS result
+def : Pat<(parallel (X86or_flag GR8:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (OR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (OR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (OR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86or_flag GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (OR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86or_flag GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (OR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register Or with EFLAGS result
+def : Pat<(parallel (store (X86or_flag (loadi8 addr:$dst), GR8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi16 addr:$dst), GR16:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi32 addr:$dst), GR32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer Or with EFLAGS result
+def : Pat<(parallel (store (X86or_flag (loadi8 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi16 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi32 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86or_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (OR32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// XOR with EFLAGS result -- same structure as the OR section above.
+// Register-Register XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR8:$src1, GR8:$src2),
+                    (implicit EFLAGS)),
+          (XOR8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, GR16:$src2),
+                    (implicit EFLAGS)),
+          (XOR16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, GR32:$src2),
+                    (implicit EFLAGS)),
+          (XOR32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR8:$src1, (loadi8 addr:$src2)),
+                    (implicit EFLAGS)),
+          (XOR8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, (loadi16 addr:$src2)),
+                    (implicit EFLAGS)),
+          (XOR16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, (loadi32 addr:$src2)),
+                    (implicit EFLAGS)),
+          (XOR32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer XOr with EFLAGS result
+def : Pat<(parallel (X86xor_flag GR8:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (XOR8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (XOR16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (XOR32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86xor_flag GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (XOR16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86xor_flag GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (XOR32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register XOr with EFLAGS result
+def : Pat<(parallel (store (X86xor_flag (loadi8 addr:$dst), GR8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi16 addr:$dst), GR16:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi32 addr:$dst), GR32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer XOr with EFLAGS result
+def : Pat<(parallel (store (X86xor_flag (loadi8 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi16 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi32 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86xor_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (XOR32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// Register-Register AND with EFLAGS result.  Same structure as the XOR
+// patterns above, mapping X86and_flag onto the flag-setting AND forms.
+def : Pat<(parallel (X86and_flag GR8:$src1, GR8:$src2),
+                    (implicit EFLAGS)),
+          (AND8rr GR8:$src1, GR8:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, GR16:$src2),
+                    (implicit EFLAGS)),
+          (AND16rr GR16:$src1, GR16:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, GR32:$src2),
+                    (implicit EFLAGS)),
+          (AND32rr GR32:$src1, GR32:$src2)>;
+
+// Register-Memory AND with EFLAGS result.
+def : Pat<(parallel (X86and_flag GR8:$src1, (loadi8 addr:$src2)),
+                    (implicit EFLAGS)),
+          (AND8rm GR8:$src1, addr:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, (loadi16 addr:$src2)),
+                    (implicit EFLAGS)),
+          (AND16rm GR16:$src1, addr:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, (loadi32 addr:$src2)),
+                    (implicit EFLAGS)),
+          (AND32rm GR32:$src1, addr:$src2)>;
+
+// Register-Integer AND with EFLAGS result (*ri8 = sign-extended 8-bit imm).
+def : Pat<(parallel (X86and_flag GR8:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (AND8ri GR8:$src1, imm:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (AND16ri GR16:$src1, imm:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, imm:$src2),
+                    (implicit EFLAGS)),
+          (AND32ri GR32:$src1, imm:$src2)>;
+def : Pat<(parallel (X86and_flag GR16:$src1, i16immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (AND16ri8 GR16:$src1, i16immSExt8:$src2)>;
+def : Pat<(parallel (X86and_flag GR32:$src1, i32immSExt8:$src2),
+                    (implicit EFLAGS)),
+          (AND32ri8 GR32:$src1, i32immSExt8:$src2)>;
+
+// Memory-Register AND with EFLAGS result (read-modify-write: load and store
+// share addr:$dst).
+def : Pat<(parallel (store (X86and_flag (loadi8 addr:$dst), GR8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND8mr addr:$dst, GR8:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi16 addr:$dst), GR16:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND16mr addr:$dst, GR16:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi32 addr:$dst), GR32:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND32mr addr:$dst, GR32:$src2)>;
+
+// Memory-Integer AND with EFLAGS result (*mi8 = sign-extended 8-bit imm).
+def : Pat<(parallel (store (X86and_flag (loadi8 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND8mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi16 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND16mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi32 addr:$dst), imm:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND32mi addr:$dst, imm:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi16 addr:$dst), i16immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND16mi8 addr:$dst, i16immSExt8:$src2)>;
+def : Pat<(parallel (store (X86and_flag (loadi32 addr:$dst), i32immSExt8:$src2),
+                           addr:$dst),
+                    (implicit EFLAGS)),
+          (AND32mi8 addr:$dst, i32immSExt8:$src2)>;
+
+// -disable-16bit support.  When 16-bit operations are disabled, i16 values
+// live in 32-bit registers; these patterns lower i16 loads/stores via the
+// 32-bit register file.
+def : Pat<(truncstorei16 (i32 imm:$src), addr:$dst),
+          (MOV16mi addr:$dst, imm:$src)>;
+def : Pat<(truncstorei16 GR32:$src, addr:$dst),
+          (MOV16mr addr:$dst, (EXTRACT_SUBREG GR32:$src, x86_subreg_16bit))>;
+def : Pat<(i32 (sextloadi16 addr:$dst)),
+          (MOVSX32rm16 addr:$dst)>;
+def : Pat<(i32 (zextloadi16 addr:$dst)),
+          (MOVZX32rm16 addr:$dst)>;
+// An any-extending load is implemented as a zero-extending load: the upper
+// bits are unspecified, so movzx is a valid (and partial-stall-free) choice.
+def : Pat<(i32 (extloadi16 addr:$dst)),
+          (MOVZX32rm16 addr:$dst)>;
+
+//===----------------------------------------------------------------------===//
+// Floating Point Stack Support
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFPStack.td"
+
+//===----------------------------------------------------------------------===//
+// X86-64 Support
+//===----------------------------------------------------------------------===//
+
+include "X86Instr64bit.td"
+
+//===----------------------------------------------------------------------===//
+// SIMD support (SSE, MMX and AVX)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrFragmentsSIMD.td"
+
+//===----------------------------------------------------------------------===//
+// XMM Floating point support (requires SSE / SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrSSE.td"
+
+//===----------------------------------------------------------------------===//
+// MMX and XMM Packed Integer support (requires MMX, SSE, and SSE2)
+//===----------------------------------------------------------------------===//
+
+include "X86InstrMMX.td"
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
new file mode 100644
index 0000000..89f020c
--- /dev/null
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -0,0 +1,686 @@
+//====- X86InstrMMX.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 MMX instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// MMX Multiclasses
+//===----------------------------------------------------------------------===//
+
+// All MMX binary operations are two-address: the first source register is
+// also the destination.
+let Constraints = "$src1 = $dst" in {
+  // MMXI_binop_rm - Simple MMX binary operator.
+  // Instantiates a register-register (rr) and a register-memory (rm) form;
+  // the memory operand is loaded as an MMX value and bitconverted to OpVT.
+  multiclass MMXI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                           ValueType OpVT, bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                  (ins VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1, VR64:$src2)))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                  (ins VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (OpVT (OpNode VR64:$src1,
+                                         (bitconvert
+                                          (load_mmx addr:$src2)))))]>;
+  }
+
+  // MMXI_binop_rm_int - MMX binary operator matched via an intrinsic rather
+  // than a generic SDNode (for ops with no target-independent node).
+  multiclass MMXI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                               bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                 (ins VR64:$src1, VR64:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                 (ins VR64:$src1, i64mem:$src2),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                 [(set VR64:$dst, (IntId VR64:$src1,
+                                   (bitconvert (load_mmx addr:$src2))))]>;
+  }
+
+  // MMXI_binop_rm_v1i64 - Simple MMX binary operator whose type is v1i64.
+  //
+  // FIXME: we could eliminate this and use MMXI_binop_rm instead if tblgen knew
+  // to collapse (bitconvert VT to VT) into its operand.
+  //
+  multiclass MMXI_binop_rm_v1i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                                 bit Commutable = 0> {
+    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                                  (ins VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (v1i64 (OpNode VR64:$src1, VR64:$src2)))]> {
+      let isCommutable = Commutable;
+    }
+    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                                  (ins VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst,
+                    (OpNode VR64:$src1,(load_mmx addr:$src2)))]>;
+  }
+
+  // MMXI_binop_rmi_int - Binary operator with an additional immediate form
+  // (used by the shifts): rr/rm use the variable-count intrinsic IntId,
+  // ri uses the immediate-count intrinsic IntId2 with opcode opc2/ImmForm.
+  multiclass MMXI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                                string OpcodeStr, Intrinsic IntId,
+                                Intrinsic IntId2> {
+    def rr : MMXI<opc, MRMSrcReg, (outs VR64:$dst),
+                                  (ins VR64:$src1, VR64:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1, VR64:$src2))]>;
+    def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
+                                  (ins VR64:$src1, i64mem:$src2),
+                  !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                  [(set VR64:$dst, (IntId VR64:$src1,
+                                    (bitconvert (load_mmx addr:$src2))))]>;
+    def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
+                                   (ins VR64:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           [(set VR64:$dst, (IntId2 VR64:$src1, (i32 imm:$src2)))]>;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// MMX EMMS & FEMMS Instructions
+//===----------------------------------------------------------------------===//
+
+// emms/femms clear the MMX state (re-enabling x87 use); they take no
+// operands and are matched directly from their intrinsics.
+def MMX_EMMS  : MMXI<0x77, RawFrm, (outs), (ins), "emms",
+                     [(int_x86_mmx_emms)]>;
+def MMX_FEMMS : MMXI<0x0E, RawFrm, (outs), (ins), "femms",
+                     [(int_x86_mmx_femms)]>;
+
+//===----------------------------------------------------------------------===//
+// MMX Scalar Instructions
+//===----------------------------------------------------------------------===//
+
+// Data Transfer Instructions
+// movd GR32 -> MMX: the 32-bit value becomes element 0 of a v2i32.
+def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, 
+                         (v2i32 (scalar_to_vector GR32:$src)))]>;
+let canFoldAsLoad = 1, isReMaterializable = 1 in
+def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
+                        "movd\t{$src, $dst|$dst, $src}",
+              [(set VR64:$dst,
+               (v2i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+// Store/register-dest movd forms have no patterns; they are selected
+// manually or used by spilling/asm printing only.
+let mayStore = 1 in
+def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
+                        "movd\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs), (ins GR32:$dst, VR64:$src),
+                        "movd\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVQ64gmr : MMXRI<0x7E, MRMDestMem, (outs), 
+                         (ins i64mem:$dst, VR64:$src),
+                         "movq\t{$src, $dst|$dst, $src}", []>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+                             []>;
+
+let neverHasSideEffects = 1 in
+// These are 64 bit moves, but since the OS X assembler doesn't
+// recognize a register-register movq, we write them as
+// movd.
+def MMX_MOVD64from64rr : MMXRI<0x7E, MRMDestReg,
+                               (outs GR64:$dst), (ins VR64:$src),
+                               "movd\t{$src, $dst|$dst, $src}", []>;
+def MMX_MOVD64rrv164 : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src),
+                            "movd\t{$src, $dst|$dst, $src}",
+                            [(set VR64:$dst,
+                             (v1i64 (scalar_to_vector GR64:$src)))]>;
+
+let neverHasSideEffects = 1 in
+def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(set VR64:$dst, (load_mmx addr:$src))]>;
+def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                        [(store (v1i64 VR64:$src), addr:$dst)]>;
+
+// movdq2q: extract the low 64 bits of an XMM register into an MMX register.
+def MMX_MOVDQ2Qrr : SDIi8<0xD6, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                          "movdq2q\t{$src, $dst|$dst, $src}",
+                          [(set VR64:$dst,
+                            (v1i64 (bitconvert
+                            (i64 (vector_extract (v2i64 VR128:$src),
+                                  (iPTR 0))))))]>;
+
+// movq2dq: move an MMX register into the low half of an XMM register,
+// zeroing the upper half (the movl-with-zero-vector pattern).
+def MMX_MOVQ2DQrr : SSDIi8<0xD6, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+                           "movq2dq\t{$src, $dst|$dst, $src}",
+          [(set VR128:$dst,
+            (movl immAllZerosV,
+                  (v2i64 (scalar_to_vector (i64 (bitconvert VR64:$src))))))]>;
+
+let neverHasSideEffects = 1 in
+// NOTE(review): the asm string reuses "movq2dq" although the destination
+// here is FR64 rather than VR128 -- confirm the mnemonic is intentional.
+def MMX_MOVQ2FR64rr: SSDIi8<0xD6, MRMSrcReg, (outs FR64:$dst), (ins VR64:$src),
+                           "movq2dq\t{$src, $dst|$dst, $src}", []>;
+
+// Non-temporal store of an MMX register.
+def MMX_MOVNTQmr  : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
+                         "movntq\t{$src, $dst|$dst, $src}",
+                         [(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>;
+
+// AddedComplexity prioritizes these zero-extending-move patterns over the
+// plain movd patterns above when an X86vzmovl node is present.
+let AddedComplexity = 15 in
+// movd to MMX register zero-extends
+def MMX_MOVZDI2PDIrr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+              [(set VR64:$dst,
+                    (v2i32 (X86vzmovl (v2i32 (scalar_to_vector GR32:$src)))))]>;
+let AddedComplexity = 20 in
+def MMX_MOVZDI2PDIrm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst),
+                           (ins i32mem:$src),
+                             "movd\t{$src, $dst|$dst, $src}",
+          [(set VR64:$dst,
+                (v2i32 (X86vzmovl (v2i32
+                                   (scalar_to_vector (loadi32 addr:$src))))))]>;
+
+// Arithmetic Instructions
+// The trailing "1" argument marks the operation commutable (isCommutable).
+
+// -- Addition
+defm MMX_PADDB : MMXI_binop_rm<0xFC, "paddb", add, v8i8,  1>;
+defm MMX_PADDW : MMXI_binop_rm<0xFD, "paddw", add, v4i16, 1>;
+defm MMX_PADDD : MMXI_binop_rm<0xFE, "paddd", add, v2i32, 1>;
+defm MMX_PADDQ : MMXI_binop_rm<0xD4, "paddq", add, v1i64, 1>;
+
+// Saturating adds have no generic SDNode; they go through intrinsics.
+defm MMX_PADDSB  : MMXI_binop_rm_int<0xEC, "paddsb" , int_x86_mmx_padds_b, 1>;
+defm MMX_PADDSW  : MMXI_binop_rm_int<0xED, "paddsw" , int_x86_mmx_padds_w, 1>;
+
+defm MMX_PADDUSB : MMXI_binop_rm_int<0xDC, "paddusb", int_x86_mmx_paddus_b, 1>;
+defm MMX_PADDUSW : MMXI_binop_rm_int<0xDD, "paddusw", int_x86_mmx_paddus_w, 1>;
+
+// -- Subtraction (not commutable: no trailing 1)
+defm MMX_PSUBB : MMXI_binop_rm<0xF8, "psubb", sub, v8i8>;
+defm MMX_PSUBW : MMXI_binop_rm<0xF9, "psubw", sub, v4i16>;
+defm MMX_PSUBD : MMXI_binop_rm<0xFA, "psubd", sub, v2i32>;
+defm MMX_PSUBQ : MMXI_binop_rm<0xFB, "psubq", sub, v1i64>;
+
+defm MMX_PSUBSB  : MMXI_binop_rm_int<0xE8, "psubsb" , int_x86_mmx_psubs_b>;
+defm MMX_PSUBSW  : MMXI_binop_rm_int<0xE9, "psubsw" , int_x86_mmx_psubs_w>;
+
+defm MMX_PSUBUSB : MMXI_binop_rm_int<0xD8, "psubusb", int_x86_mmx_psubus_b>;
+defm MMX_PSUBUSW : MMXI_binop_rm_int<0xD9, "psubusw", int_x86_mmx_psubus_w>;
+
+// -- Multiplication
+defm MMX_PMULLW  : MMXI_binop_rm<0xD5, "pmullw", mul, v4i16, 1>;
+
+defm MMX_PMULHW  : MMXI_binop_rm_int<0xE5, "pmulhw",  int_x86_mmx_pmulh_w,  1>;
+defm MMX_PMULHUW : MMXI_binop_rm_int<0xE4, "pmulhuw", int_x86_mmx_pmulhu_w, 1>;
+defm MMX_PMULUDQ : MMXI_binop_rm_int<0xF4, "pmuludq", int_x86_mmx_pmulu_dq, 1>;
+
+// -- Miscellanea
+defm MMX_PMADDWD : MMXI_binop_rm_int<0xF5, "pmaddwd", int_x86_mmx_pmadd_wd, 1>;
+
+defm MMX_PAVGB   : MMXI_binop_rm_int<0xE0, "pavgb", int_x86_mmx_pavg_b, 1>;
+defm MMX_PAVGW   : MMXI_binop_rm_int<0xE3, "pavgw", int_x86_mmx_pavg_w, 1>;
+
+defm MMX_PMINUB  : MMXI_binop_rm_int<0xDA, "pminub", int_x86_mmx_pminu_b, 1>;
+defm MMX_PMINSW  : MMXI_binop_rm_int<0xEA, "pminsw", int_x86_mmx_pmins_w, 1>;
+
+defm MMX_PMAXUB  : MMXI_binop_rm_int<0xDE, "pmaxub", int_x86_mmx_pmaxu_b, 1>;
+defm MMX_PMAXSW  : MMXI_binop_rm_int<0xEE, "pmaxsw", int_x86_mmx_pmaxs_w, 1>;
+
+defm MMX_PSADBW  : MMXI_binop_rm_int<0xF6, "psadbw", int_x86_mmx_psad_bw, 1>;
+
+// Logical Instructions
+defm MMX_PAND : MMXI_binop_rm_v1i64<0xDB, "pand", and, 1>;
+defm MMX_POR  : MMXI_binop_rm_v1i64<0xEB, "por" , or,  1>;
+defm MMX_PXOR : MMXI_binop_rm_v1i64<0xEF, "pxor", xor, 1>;
+
+// pandn computes (~src1 & src2); src1 is both negated and overwritten, so
+// the operation is not commutable and is defined directly (no multiclass).
+let Constraints = "$src1 = $dst" in {
+  def MMX_PANDNrr : MMXI<0xDF, MRMSrcReg,
+                         (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+                         "pandn\t{$src2, $dst|$dst, $src2}",
+                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+                                                  VR64:$src2)))]>;
+  // NOTE(review): this rm form matches plain (load ...) where the other
+  // memory patterns in this file use (load_mmx ...) -- confirm intentional.
+  def MMX_PANDNrm : MMXI<0xDF, MRMSrcMem,
+                         (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+                         "pandn\t{$src2, $dst|$dst, $src2}",
+                         [(set VR64:$dst, (v1i64 (and (vnot VR64:$src1),
+                                                  (load addr:$src2))))]>;
+}
+
+// Shift Instructions
+// Each defm produces rr/rm (variable count from register/memory, first
+// intrinsic) and ri (immediate count, second intrinsic) forms.
+defm MMX_PSRLW : MMXI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                                    int_x86_mmx_psrl_w, int_x86_mmx_psrli_w>;
+defm MMX_PSRLD : MMXI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                                    int_x86_mmx_psrl_d, int_x86_mmx_psrli_d>;
+defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                                    int_x86_mmx_psrl_q, int_x86_mmx_psrli_q>;
+
+defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                                    int_x86_mmx_psll_w, int_x86_mmx_pslli_w>;
+defm MMX_PSLLD : MMXI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                                    int_x86_mmx_psll_d, int_x86_mmx_pslli_d>;
+defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                                    int_x86_mmx_psll_q, int_x86_mmx_pslli_q>;
+
+defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                                    int_x86_mmx_psra_w, int_x86_mmx_psrai_w>;
+defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                                    int_x86_mmx_psra_d, int_x86_mmx_psrai_d>;
+
+// Shift up / down and insert zero's.
+// Map generic v1i64 shift-by-immediate nodes onto the psllq/psrlq
+// immediate forms instantiated above (the "ri" members of the defms).
+def : Pat<(v1i64 (X86vshl     VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSLLQri VR64:$src, imm:$amt))>;
+def : Pat<(v1i64 (X86vshr     VR64:$src, (i8 imm:$amt))),
+          (v1i64 (MMX_PSRLQri VR64:$src, imm:$amt))>;
+
+// Comparison Instructions (element-wise; results are all-ones/all-zeros
+// masks per element, matched through their intrinsics)
+defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b>;
+defm MMX_PCMPEQW : MMXI_binop_rm_int<0x75, "pcmpeqw", int_x86_mmx_pcmpeq_w>;
+defm MMX_PCMPEQD : MMXI_binop_rm_int<0x76, "pcmpeqd", int_x86_mmx_pcmpeq_d>;
+
+defm MMX_PCMPGTB : MMXI_binop_rm_int<0x64, "pcmpgtb", int_x86_mmx_pcmpgt_b>;
+defm MMX_PCMPGTW : MMXI_binop_rm_int<0x65, "pcmpgtw", int_x86_mmx_pcmpgt_w>;
+defm MMX_PCMPGTD : MMXI_binop_rm_int<0x66, "pcmpgtd", int_x86_mmx_pcmpgt_d>;
+
+// Conversion Instructions
+
+// -- Unpack Instructions
+// mmx_unpckh/mmx_unpckl are shuffle pattern fragments; bc_v8i8/bc_v4i16/
+// bc_v2i32 are bitconvert fragments applied to the loaded i64 MMX value.
+let Constraints = "$src1 = $dst" in {
+  // Unpack High Packed Data Instructions
+  def MMX_PUNPCKHBWrr : MMXI<0x68, MRMSrcReg,
+                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+                             "punpckhbw\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+  def MMX_PUNPCKHBWrm : MMXI<0x68, MRMSrcMem,
+                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+                             "punpckhbw\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (mmx_unpckh VR64:$src1,
+                                      (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+  def MMX_PUNPCKHWDrr : MMXI<0x69, MRMSrcReg,
+                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+                             "punpckhwd\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+  def MMX_PUNPCKHWDrm : MMXI<0x69, MRMSrcMem,
+                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+                             "punpckhwd\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (mmx_unpckh VR64:$src1,
+                                       (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+  def MMX_PUNPCKHDQrr : MMXI<0x6A, MRMSrcReg,
+                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+                             "punpckhdq\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (mmx_unpckh VR64:$src1, VR64:$src2)))]>;
+  def MMX_PUNPCKHDQrm : MMXI<0x6A, MRMSrcMem,
+                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+                             "punpckhdq\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (mmx_unpckh VR64:$src1,
+                                       (bc_v2i32 (load_mmx addr:$src2)))))]>;
+
+  // Unpack Low Packed Data Instructions
+  def MMX_PUNPCKLBWrr : MMXI<0x60, MRMSrcReg,
+                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+                             "punpcklbw\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+  def MMX_PUNPCKLBWrm : MMXI<0x60, MRMSrcMem,
+                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+                             "punpcklbw\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v8i8 (mmx_unpckl VR64:$src1,
+                                      (bc_v8i8 (load_mmx addr:$src2)))))]>;
+
+  def MMX_PUNPCKLWDrr : MMXI<0x61, MRMSrcReg,
+                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+                             "punpcklwd\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+  def MMX_PUNPCKLWDrm : MMXI<0x61, MRMSrcMem,
+                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+                             "punpcklwd\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v4i16 (mmx_unpckl VR64:$src1,
+                                       (bc_v4i16 (load_mmx addr:$src2)))))]>;
+
+  def MMX_PUNPCKLDQrr : MMXI<0x62, MRMSrcReg,
+                             (outs VR64:$dst), (ins VR64:$src1, VR64:$src2),
+                             "punpckldq\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (mmx_unpckl VR64:$src1, VR64:$src2)))]>;
+  def MMX_PUNPCKLDQrm : MMXI<0x62, MRMSrcMem,
+                             (outs VR64:$dst), (ins VR64:$src1, i64mem:$src2),
+                             "punpckldq\t{$src2, $dst|$dst, $src2}",
+                             [(set VR64:$dst,
+                               (v2i32 (mmx_unpckl VR64:$src1,
+                                       (bc_v2i32 (load_mmx addr:$src2)))))]>;
+}
+
+// -- Pack Instructions (narrowing with signed/unsigned saturation)
+defm MMX_PACKSSWB : MMXI_binop_rm_int<0x63, "packsswb", int_x86_mmx_packsswb>;
+defm MMX_PACKSSDW : MMXI_binop_rm_int<0x6B, "packssdw", int_x86_mmx_packssdw>;
+defm MMX_PACKUSWB : MMXI_binop_rm_int<0x67, "packuswb", int_x86_mmx_packuswb>;
+
+// -- Shuffle Instructions
+// mmx_pshufw:$src2 binds the shuffle mask node to the $src2 immediate
+// operand; the (undef) second vector operand marks a one-input shuffle.
+def MMX_PSHUFWri : MMXIi8<0x70, MRMSrcReg,
+                          (outs VR64:$dst), (ins VR64:$src1, i8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                            (v4i16 (mmx_pshufw:$src2 VR64:$src1, (undef))))]>;
+def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
+                          (outs VR64:$dst), (ins i64mem:$src1, i8imm:$src2),
+                          "pshufw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set VR64:$dst,
+                            (mmx_pshufw:$src2 (bc_v4i16 (load_mmx addr:$src1)),
+                                              (undef)))]>;
+
+// -- Conversion Instructions
+// All conversion defs carry empty pattern lists ([]): they are not matched
+// from DAG nodes here.  Memory forms are marked mayLoad since there is no
+// pattern from which tblgen could infer it.
+let neverHasSideEffects = 1 in {
+def MMX_CVTPD2PIrr  : MMX2I<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                            "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPD2PIrm  : MMX2I<0x2D, MRMSrcMem, (outs VR64:$dst),
+                            (ins f128mem:$src),
+                            "cvtpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PDrr  : MMX2I<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+                            "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PDrm  : MMX2I<0x2A, MRMSrcMem, (outs VR128:$dst),
+                            (ins i64mem:$src),
+                            "cvtpi2pd\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPI2PSrr  : MMXI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+                           "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPI2PSrm  : MMXI<0x2A, MRMSrcMem, (outs VR128:$dst),
+                           (ins i64mem:$src),
+                           "cvtpi2ps\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTPS2PIrr  : MMXI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                           "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTPS2PIrm  : MMXI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+                           "cvtps2pi\t{$src, $dst|$dst, $src}", []>;
+
+// cvtt* variants truncate toward zero instead of using the rounding mode.
+def MMX_CVTTPD2PIrr : MMX2I<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                            "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPD2PIrm : MMX2I<0x2C, MRMSrcMem, (outs VR64:$dst),
+                            (ins f128mem:$src),
+                            "cvttpd2pi\t{$src, $dst|$dst, $src}", []>;
+
+def MMX_CVTTPS2PIrr : MMXI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                           "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+def MMX_CVTTPS2PIrm : MMXI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+                           "cvttps2pi\t{$src, $dst|$dst, $src}", []>;
+} // end neverHasSideEffects
+
+
+// Extract / Insert
+// Local SDNode aliases for the PEXTRW/PINSRW target nodes; the empty
+// SDTypeProfile constraint lists leave operand types unconstrained here.
+def MMX_X86pextrw : SDNode<"X86ISD::PEXTRW", SDTypeProfile<1, 2, []>, []>;
+def MMX_X86pinsrw : SDNode<"X86ISD::PINSRW", SDTypeProfile<1, 3, []>, []>;
+
+// pextrw: extract word $src2 of the v4i16 into a GR32.
+def MMX_PEXTRWri  : MMXIi8<0xC5, MRMSrcReg,
+                           (outs GR32:$dst), (ins VR64:$src1, i16i8imm:$src2),
+                           "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                           [(set GR32:$dst, (MMX_X86pextrw (v4i16 VR64:$src1),
+                                             (iPTR imm:$src2)))]>;
+// pinsrw: insert a word (from GR32 or memory) at index $src3; two-address.
+let Constraints = "$src1 = $dst" in {
+  def MMX_PINSRWrri : MMXIi8<0xC4, MRMSrcReg,
+                      (outs VR64:$dst), 
+                      (ins VR64:$src1, GR32:$src2,i16i8imm:$src3),
+                      "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                      [(set VR64:$dst, (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+                                               GR32:$src2,(iPTR imm:$src3))))]>;
+  def MMX_PINSRWrmi : MMXIi8<0xC4, MRMSrcMem,
+                     (outs VR64:$dst),
+                     (ins VR64:$src1, i16mem:$src2, i16i8imm:$src3),
+                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                     [(set VR64:$dst,
+                       (v4i16 (MMX_X86pinsrw (v4i16 VR64:$src1),
+                               (i32 (anyext (loadi16 addr:$src2))),
+                               (iPTR imm:$src3))))]>;
+}
+
+// MMX to XMM for vector types
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+                            [SDTCisVT<0, v2i64>, SDTCisVT<1, v1i64>]>>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (load_mmx addr:$src))),
+          (v2i64 (MOVQI2PQIrm addr:$src))>;
+
+def : Pat<(v2i64 (MMX_X86movq2dq (v1i64 (bitconvert
+                            (v2i32 (scalar_to_vector (loadi32 addr:$src))))))),
+          (v2i64 (MOVDI2PDIrm addr:$src))>;
+
+// Mask creation
+// pmovmskb: gather the sign bit of each byte into the low bits of a GR32.
+def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR64:$src),
+                          "pmovmskb\t{$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, (int_x86_mmx_pmovmskb VR64:$src))]>;
+
+// Misc.
+// maskmovq stores selected bytes of $src to [EDI]/[RDI]; the implicit
+// destination register is modeled via the Uses list.
+let Uses = [EDI] in
+def MMX_MASKMOVQ : MMXI<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                        "maskmovq\t{$mask, $src|$src, $mask}",
+                        [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, EDI)]>;
+let Uses = [RDI] in
+def MMX_MASKMOVQ64: MMXI64<0xF7, MRMSrcReg, (outs), (ins VR64:$src, VR64:$mask),
+                           "maskmovq\t{$mask, $src|$src, $mask}",
+                           [(int_x86_mmx_maskmovq VR64:$src, VR64:$mask, RDI)]>;
+
+//===----------------------------------------------------------------------===//
+// Alias Instructions
+//===----------------------------------------------------------------------===//
+
+// Alias instructions that map zero vector to pxor.
+// Rematerializable, code-gen-only defs: 0xEF is pxor (reg xor itself =>
+// zero).  NOTE(review): 0x76 is the pcmpeqd opcode, so SETALLONES is
+// pcmpeqd of a register with itself -- confirm encoding choice.
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+  // FIXME: Change encoding to pseudo.
+  def MMX_V_SET0       : MMXI<0xEF, MRMInitReg, (outs VR64:$dst), (ins), "",
+                              [(set VR64:$dst, (v2i32 immAllZerosV))]>;
+  def MMX_V_SETALLONES : MMXI<0x76, MRMInitReg, (outs VR64:$dst), (ins), "",
+                              [(set VR64:$dst, (v2i32 immAllOnesV))]>;
+}
+
+// Zero vectors of the other MMX types also select MMX_V_SET0 (the pattern
+// above covers v2i32 only).
+let Predicates = [HasMMX] in {
+  def : Pat<(v1i64 immAllZerosV), (MMX_V_SET0)>;
+  def : Pat<(v4i16 immAllZerosV), (MMX_V_SET0)>;
+  def : Pat<(v8i8  immAllZerosV), (MMX_V_SET0)>;
+}
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Store 64-bit integer vector values.
+// Every 64-bit MMX vector type stores through the single movq instruction.
+def : Pat<(store (v8i8  VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v4i16 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2i32 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v2f32 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+def : Pat<(store (v1i64 VR64:$src), addr:$dst),
+          (MMX_MOVQ64mr addr:$dst, VR64:$src)>;
+
+// Bit convert.
+// Bitcasts between the 64-bit MMX vector types are free: the value stays in
+// the same register, so each pattern selects to the source operand unchanged.
+def : Pat<(v8i8  (bitconvert (v1i64 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 VR64:$src))), (v8i8  VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1i64 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  VR64:$src))), (v4i16 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1i64 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v2f32 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  VR64:$src))), (v2i32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1i64 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v2i32 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  VR64:$src))), (v2f32 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2i32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 VR64:$src))), (v1i64 VR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  VR64:$src))), (v1i64 VR64:$src)>;
+
+// 64-bit bit convert.
+// Bitcasts that cross register classes (GR64 or the x87/SSE f64 class) need
+// an explicit cross-unit move instruction.
+def : Pat<(v1i64 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2i32 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v2f32 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v4i16 (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(v8i8  (bitconvert (i64 GR64:$src))),
+          (MMX_MOVD64to64rr GR64:$src)>;
+def : Pat<(i64 (bitconvert (v1i64 VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2i32 VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v2f32 VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64 (bitconvert (v4i16 VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(i64  (bitconvert (v8i8 VR64:$src))),
+          (MMX_MOVD64from64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v1i64 VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v2i32 VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v4i16 VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+def : Pat<(f64 (bitconvert (v8i8 VR64:$src))),
+          (MMX_MOVQ2FR64rr VR64:$src)>;
+
+// Zero-extending move of the low element loaded from memory; the raised
+// AddedComplexity makes this fire before the generic load+vzmovl lowering.
+let AddedComplexity = 20 in {
+  def : Pat<(v2i32 (X86vzmovl (bc_v2i32 (load_mmx addr:$src)))),
+            (MMX_MOVZDI2PDIrm addr:$src)>;
+}
+
+// Clear top half.
+// Done by unpacking the low dword against an all-zero register.
+let AddedComplexity = 15 in {
+  def : Pat<(v2i32 (X86vzmovl VR64:$src)),
+            (MMX_PUNPCKLDQrr VR64:$src, (MMX_V_SET0))>;
+}
+
+// Patterns to perform canonical versions of vector shuffling.
+// An unpck-low/high whose second operand is undef is selected as unpacking
+// the source register with itself.
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8  (mmx_unpckl_undef VR64:$src, (undef))),
+            (MMX_PUNPCKLBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (mmx_unpckl_undef VR64:$src, (undef))),
+            (MMX_PUNPCKLWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (mmx_unpckl_undef VR64:$src, (undef))),
+            (MMX_PUNPCKLDQrr VR64:$src, VR64:$src)>;
+}
+
+let AddedComplexity = 10 in {
+  def : Pat<(v8i8  (mmx_unpckh_undef VR64:$src, (undef))),
+            (MMX_PUNPCKHBWrr VR64:$src, VR64:$src)>;
+  def : Pat<(v4i16 (mmx_unpckh_undef VR64:$src, (undef))),
+            (MMX_PUNPCKHWDrr VR64:$src, VR64:$src)>;
+  def : Pat<(v2i32 (mmx_unpckh_undef VR64:$src, (undef))),
+            (MMX_PUNPCKHDQrr VR64:$src, VR64:$src)>;
+}
+
+// Patterns to perform vector shuffling with a zeroed out vector.
+let AddedComplexity = 20 in {
+  def : Pat<(bc_v2i32 (mmx_unpckl immAllZerosV,
+                       (v2i32 (scalar_to_vector (load_mmx addr:$src))))),
+            // NOTE(review): the result names VR64:$src twice even though the
+            // source pattern binds addr:$src, and the rm form of punpckldq
+            // presumably takes a register and a memory operand — verify this
+            // pattern against MMX_PUNPCKLDQrm's declared operand list.
+            (MMX_PUNPCKLDQrm VR64:$src, VR64:$src)>;
+}
+
+// Some special case PANDN patterns.
+// (and (xor x, all-ones), y) is "x andnot y", which pandn implements in one
+// instruction; a pattern is needed per element type because the all-ones
+// constant materializes at different vector types before being bitcast.
+// FIXME: Get rid of these.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV_bc))),
+                  VR64:$src2)),
+          (MMX_PANDNrr VR64:$src1, VR64:$src2)>;
+
+// Same, with the second operand folded from memory.
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v2i32 immAllOnesV))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v4i16 immAllOnesV_bc))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+def : Pat<(v1i64 (and (xor VR64:$src1, (bc_v1i64 (v8i8  immAllOnesV_bc))),
+                  (load addr:$src2))),
+          (MMX_PANDNrm VR64:$src1, addr:$src2)>;
+
+// Move MMX to lower 64-bit of XMM
+// A bitcast of any MMX vector type to i64 followed by scalar_to_vector into
+// a v2i64 is a single movq2dq.
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v8i8 VR64:$src))))),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v4i16 VR64:$src))))),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v2i32 VR64:$src))))),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (bitconvert (v1i64 VR64:$src))))),
+          (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
+
+// Move lower 64-bit of XMM to MMX.
+// Extracting element 0 of a v2i64 and bitcasting to an MMX type is movdq2q.
+def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
+def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
+                                                  (iPTR 0))))),
+          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
+
+// Patterns for vector comparisons
+// Map the target-specific equality-compare nodes onto the MMX pcmpeq
+// instructions; the rm forms fold a 64-bit load of the second operand.
+def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, VR64:$src2)),
+          (MMX_PCMPEQBrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v8i8 (X86pcmpeqb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+          (MMX_PCMPEQBrm VR64:$src1, addr:$src2)>;
+def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, VR64:$src2)),
+          (MMX_PCMPEQWrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v4i16 (X86pcmpeqw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+          (MMX_PCMPEQWrm VR64:$src1, addr:$src2)>;
+def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, VR64:$src2)),
+          (MMX_PCMPEQDrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v2i32 (X86pcmpeqd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+          (MMX_PCMPEQDrm VR64:$src1, addr:$src2)>;
+
+// Signed greater-than compares onto pcmpgt.
+def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, VR64:$src2)),
+          (MMX_PCMPGTBrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v8i8 (X86pcmpgtb VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+          (MMX_PCMPGTBrm VR64:$src1, addr:$src2)>;
+def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, VR64:$src2)),
+          (MMX_PCMPGTWrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v4i16 (X86pcmpgtw VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+          (MMX_PCMPGTWrm VR64:$src1, addr:$src2)>;
+def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, VR64:$src2)),
+          (MMX_PCMPGTDrr VR64:$src1, VR64:$src2)>;
+def : Pat<(v2i32 (X86pcmpgtd VR64:$src1, (bitconvert (load_mmx addr:$src2)))),
+          (MMX_PCMPGTDrm VR64:$src1, addr:$src2)>;
+
+// CMOV* - Used to implement the SELECT DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+// usesCustomInserter routes the pseudo to the target's C++ expansion hook;
+// it reads EFLAGS (set by the preceding compare) to choose $t or $f.
+let Uses = [EFLAGS], usesCustomInserter = 1 in {
+  def CMOV_V1I64 : I<0, Pseudo,
+                    (outs VR64:$dst), (ins VR64:$t, VR64:$f, i8imm:$cond),
+                    "#CMOV_V1I64 PSEUDO!",
+                    [(set VR64:$dst,
+                      (v1i64 (X86cmov VR64:$t, VR64:$f, imm:$cond,
+                                          EFLAGS)))]>;
+}
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
new file mode 100644
index 0000000..e26c979
--- /dev/null
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -0,0 +1,3934 @@
+//====- X86InstrSSE.td - Describe the X86 Instruction Set --*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 SSE instruction set, defining the instructions,
+// and properties of the instructions which are needed for code generation,
+// machine code emission, and analysis.
+//
+//===----------------------------------------------------------------------===//
+
+
+//===----------------------------------------------------------------------===//
+// SSE specific DAG Nodes.
+//===----------------------------------------------------------------------===//
+// Target-specific SelectionDAG nodes referenced by the SSE patterns in this
+// file, together with the type profiles that constrain their operands.
+
+def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
+                                            SDTCisFP<0>, SDTCisInt<2> ]>;
+// Vector FP compare: integer result vector, two same-typed FP operands, and
+// an i8 immediate encoding the condition.
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
+                                       SDTCisFP<1>, SDTCisVT<3, i8>]>;
+
+def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
+def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+// Bitwise FP logic ops are commutative and associative.
+def X86fand    : SDNode<"X86ISD::FAND",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86for     : SDNode<"X86ISD::FOR",       SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86fxor    : SDNode<"X86ISD::FXOR",      SDTFPBinOp,
+                        [SDNPCommutative, SDNPAssociative]>;
+def X86frsqrt  : SDNode<"X86ISD::FRSQRT",    SDTFPUnaryOp>;
+def X86frcp    : SDNode<"X86ISD::FRCP",      SDTFPUnaryOp>;
+def X86fsrl    : SDNode<"X86ISD::FSRL",      SDTX86FPShiftOp>;
+def X86comi    : SDNode<"X86ISD::COMI",      SDTX86CmpTest>;
+def X86ucomi   : SDNode<"X86ISD::UCOMI",     SDTX86CmpTest>;
+def X86pshufb  : SDNode<"X86ISD::PSHUFB",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                      SDTCisSameAs<0,2>]>>;
+// Element extract/insert nodes: the index operand is pointer-typed.
+def X86pextrb  : SDNode<"X86ISD::PEXTRB",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pextrw  : SDNode<"X86ISD::PEXTRW",
+                 SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>;
+def X86pinsrb  : SDNode<"X86ISD::PINSRB",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86pinsrw  : SDNode<"X86ISD::PINSRW",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
+def X86insrtps : SDNode<"X86ISD::INSERTPS",
+                 SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
+                                      SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
+def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
+                 SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
+// VZEXT_LOAD touches memory, so it carries chain/may-load properties.
+def X86vzload  : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
+                        [SDNPHasChain, SDNPMayLoad]>;
+def X86vshl    : SDNode<"X86ISD::VSHL",      SDTIntShiftOp>;
+def X86vshr    : SDNode<"X86ISD::VSRL",      SDTIntShiftOp>;
+def X86cmpps   : SDNode<"X86ISD::CMPPS",     SDTX86VFCMP>;
+def X86cmppd   : SDNode<"X86ISD::CMPPD",     SDTX86VFCMP>;
+// Integer vector compares; equality is commutative, greater-than is not.
+def X86pcmpeqb : SDNode<"X86ISD::PCMPEQB", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqw : SDNode<"X86ISD::PCMPEQW", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqd : SDNode<"X86ISD::PCMPEQD", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpeqq : SDNode<"X86ISD::PCMPEQQ", SDTIntBinOp, [SDNPCommutative]>;
+def X86pcmpgtb : SDNode<"X86ISD::PCMPGTB", SDTIntBinOp>;
+def X86pcmpgtw : SDNode<"X86ISD::PCMPGTW", SDTIntBinOp>;
+def X86pcmpgtd : SDNode<"X86ISD::PCMPGTD", SDTIntBinOp>;
+def X86pcmpgtq : SDNode<"X86ISD::PCMPGTQ", SDTIntBinOp>;
+
+def SDTX86CmpPTest : SDTypeProfile<0, 2, [SDTCisVT<0, v4f32>,
+                                          SDTCisVT<1, v4f32>]>;
+def X86ptest   : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
+
+//===----------------------------------------------------------------------===//
+// SSE Complex Patterns
+//===----------------------------------------------------------------------===//
+
+// These are 'extloads' from a scalar to the low element of a vector, zeroing
+// the top elements.  These are used for the SSE 'ss' and 'sd' instruction
+// forms.
+// Matched by SelectScalarSSELoad in the C++ instruction selector; the 5
+// results correspond to the components of an X86 memory operand.
+def sse_load_f32 : ComplexPattern<v4f32, 5, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad]>;
+def sse_load_f64 : ComplexPattern<v2f64, 5, "SelectScalarSSELoad", [],
+                                  [SDNPHasChain, SDNPMayLoad]>;
+
+// Scalar-in-vector memory operands: printed as f32/f64 memory references,
+// with the standard 5-part X86 address (base, scale, index, disp, segment).
+def ssmem : Operand<v4f32> {
+  let PrintMethod = "printf32mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+def sdmem : Operand<v2f64> {
+  let PrintMethod = "printf64mem";
+  let MIOperandInfo = (ops ptr_rc, i8imm, ptr_rc_nosp, i32imm, i8imm);
+  let ParserMatchClass = X86MemAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE pattern fragments
+//===----------------------------------------------------------------------===//
+
+// Plain vector loads with no alignment requirement.
+def loadv4f32    : PatFrag<(ops node:$ptr), (v4f32 (load node:$ptr))>;
+def loadv2f64    : PatFrag<(ops node:$ptr), (v2f64 (load node:$ptr))>;
+def loadv4i32    : PatFrag<(ops node:$ptr), (v4i32 (load node:$ptr))>;
+def loadv2i64    : PatFrag<(ops node:$ptr), (v2i64 (load node:$ptr))>;
+
+// Like 'store', but always requires vector alignment.
+def alignedstore : PatFrag<(ops node:$val, node:$ptr),
+                           (store node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Like 'load', but always requires vector alignment.
+def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Typed wrappers over alignedload for each scalar/vector type.
+def alignedloadfsf32 : PatFrag<(ops node:$ptr), 
+                               (f32 (alignedload node:$ptr))>;
+def alignedloadfsf64 : PatFrag<(ops node:$ptr), 
+                               (f64 (alignedload node:$ptr))>;
+def alignedloadv4f32 : PatFrag<(ops node:$ptr), 
+                               (v4f32 (alignedload node:$ptr))>;
+def alignedloadv2f64 : PatFrag<(ops node:$ptr), 
+                               (v2f64 (alignedload node:$ptr))>;
+def alignedloadv4i32 : PatFrag<(ops node:$ptr), 
+                               (v4i32 (alignedload node:$ptr))>;
+def alignedloadv2i64 : PatFrag<(ops node:$ptr), 
+                               (v2i64 (alignedload node:$ptr))>;
+
+// Like 'load', but uses special alignment checks suitable for use in
+// memory operands in most SSE instructions, which are required to
+// be naturally aligned on some targets but not on others.  If the subtarget
+// allows unaligned accesses, match any load, though this may require
+// setting a feature bit in the processor (on startup, for example).
+// Opteron 10h and later implement such a feature.
+def memop : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return    Subtarget->hasVectorUAMem()
+         || cast<LoadSDNode>(N)->getAlignment() >= 16;
+}]>;
+
+// Typed wrappers over memop.
+def memopfsf32 : PatFrag<(ops node:$ptr), (f32   (memop node:$ptr))>;
+def memopfsf64 : PatFrag<(ops node:$ptr), (f64   (memop node:$ptr))>;
+def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
+def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
+def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
+def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
+def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
+
+// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
+// 16-byte boundary.
+// FIXME: 8 byte alignment for mmx reads is not required
+def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+  return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+
+def memopv8i8  : PatFrag<(ops node:$ptr), (v8i8  (memop64 node:$ptr))>;
+def memopv4i16 : PatFrag<(ops node:$ptr), (v4i16 (memop64 node:$ptr))>;
+def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop64 node:$ptr))>;
+def memopv2i32 : PatFrag<(ops node:$ptr), (v2i32 (memop64 node:$ptr))>;
+
+// Bitconvert fragments to a given vector type, used to absorb bitcasts
+// inside larger patterns.
+def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
+def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
+def bc_v16i8 : PatFrag<(ops node:$in), (v16i8 (bitconvert node:$in))>;
+def bc_v8i16 : PatFrag<(ops node:$in), (v8i16 (bitconvert node:$in))>;
+def bc_v4i32 : PatFrag<(ops node:$in), (v4i32 (bitconvert node:$in))>;
+def bc_v2i64 : PatFrag<(ops node:$in), (v2i64 (bitconvert node:$in))>;
+
+// Scalar load placed in the low element with the rest zeroed, then bitcast.
+def vzmovl_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzmovl
+                             (v2i64 (scalar_to_vector (loadi64 node:$src))))))>;
+def vzmovl_v4i32 : PatFrag<(ops node:$src),
+                           (bitconvert (v4i32 (X86vzmovl
+                             (v4i32 (scalar_to_vector (loadi32 node:$src))))))>;
+
+def vzload_v2i64 : PatFrag<(ops node:$src),
+                           (bitconvert (v2i64 (X86vzload node:$src)))>;
+
+// Matches the f32 immediate +0.0 exactly.
+def fp32imm0 : PatLeaf<(f32 fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+// BYTE_imm - Transform bit immediates into byte immediates.
+// Presumably used where an instruction takes a byte count but the DAG
+// carries a bit count (e.g. byte-granular shifts) — confirm at use sites.
+def BYTE_imm  : SDNodeXForm<imm, [{
+  // Transformation function: imm >> 3
+  return getI32Imm(N->getZExtValue() >> 3);
+}]>;
+
+// SHUFFLE_get_shuf_imm xform function: convert vector_shuffle mask to PSHUF*,
+// SHUFP* etc. imm.
+def SHUFFLE_get_shuf_imm : SDNodeXForm<vector_shuffle, [{
+  return getI8Imm(X86::getShuffleSHUFImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshufhw_imm xform function: convert vector_shuffle mask to
+// PSHUFHW imm.
+def SHUFFLE_get_pshufhw_imm : SDNodeXForm<vector_shuffle, [{
+  return getI8Imm(X86::getShufflePSHUFHWImmediate(N));
+}]>;
+
+// SHUFFLE_get_pshuflw_imm xform function: convert vector_shuffle mask to
+// PSHUFLW imm.
+def SHUFFLE_get_pshuflw_imm : SDNodeXForm<vector_shuffle, [{
+  return getI8Imm(X86::getShufflePSHUFLWImmediate(N));
+}]>;
+
+// SHUFFLE_get_palign_imm xform function: convert vector_shuffle mask to
+// a PALIGNR imm.
+def SHUFFLE_get_palign_imm : SDNodeXForm<vector_shuffle, [{
+  return getI8Imm(X86::getShufflePALIGNRImmediate(N));
+}]>;
+
+// Shuffle-mask predicates: each fragment matches a vector_shuffle whose mask
+// satisfies the corresponding X86::is*Mask check from the C++ lowering code.
+// The last four also attach an SDNodeXForm that converts the matched mask
+// into the instruction's immediate operand.
+def splat_lo : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs), [{
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
+  return SVOp->isSplat() && SVOp->getSplatIndex() == 0;
+}]>;
+
+def movddup : PatFrag<(ops node:$lhs, node:$rhs),
+                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVDDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps : PatFrag<(ops node:$lhs, node:$rhs),
+                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVHLPSMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movhlps_undef : PatFrag<(ops node:$lhs, node:$rhs),
+                            (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVHLPS_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movlhps : PatFrag<(ops node:$lhs, node:$rhs),
+                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVLHPSMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movlp : PatFrag<(ops node:$lhs, node:$rhs),
+                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVLPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movl : PatFrag<(ops node:$lhs, node:$rhs),
+                   (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movshdup : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVSHDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def movsldup : PatFrag<(ops node:$lhs, node:$rhs),
+                       (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isMOVSLDUPMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl : PatFrag<(ops node:$lhs, node:$rhs),
+                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh : PatFrag<(ops node:$lhs, node:$rhs),
+                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckl_undef : PatFrag<(ops node:$lhs, node:$rhs),
+                           (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKL_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+def unpckh_undef : PatFrag<(ops node:$lhs, node:$rhs),
+                           (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isUNPCKH_v_undef_Mask(cast<ShuffleVectorSDNode>(N));
+}]>;
+
+// Fragments with an immediate xform attached.
+def pshufd : PatFrag<(ops node:$lhs, node:$rhs),
+                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isPSHUFDMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def shufp : PatFrag<(ops node:$lhs, node:$rhs),
+                    (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isSHUFPMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_shuf_imm>;
+
+def pshufhw : PatFrag<(ops node:$lhs, node:$rhs),
+                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isPSHUFHWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshufhw_imm>;
+
+def pshuflw : PatFrag<(ops node:$lhs, node:$rhs),
+                      (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isPSHUFLWMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_pshuflw_imm>;
+
+def palign : PatFrag<(ops node:$lhs, node:$rhs),
+                     (vector_shuffle node:$lhs, node:$rhs), [{
+  return X86::isPALIGNRMask(cast<ShuffleVectorSDNode>(N));
+}], SHUFFLE_get_palign_imm>;
+
+//===----------------------------------------------------------------------===//
+// SSE scalar FP Instructions
+//===----------------------------------------------------------------------===//
+
+// CMOV* - Used to implement the SSE SELECT DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+// usesCustomInserter routes each pseudo to the target's C++ expansion hook;
+// EFLAGS (set by the preceding compare) selects between $t and $f.
+let Uses = [EFLAGS], usesCustomInserter = 1 in {
+  def CMOV_FR32 : I<0, Pseudo,
+                    (outs FR32:$dst), (ins FR32:$t, FR32:$f, i8imm:$cond),
+                    "#CMOV_FR32 PSEUDO!",
+                    [(set FR32:$dst, (X86cmov FR32:$t, FR32:$f, imm:$cond,
+                                                  EFLAGS))]>;
+  def CMOV_FR64 : I<0, Pseudo,
+                    (outs FR64:$dst), (ins FR64:$t, FR64:$f, i8imm:$cond),
+                    "#CMOV_FR64 PSEUDO!",
+                    [(set FR64:$dst, (X86cmov FR64:$t, FR64:$f, imm:$cond,
+                                                  EFLAGS))]>;
+  def CMOV_V4F32 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V4F32 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v4f32 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V2F64 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2F64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2f64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+  def CMOV_V2I64 : I<0, Pseudo,
+                    (outs VR128:$dst), (ins VR128:$t, VR128:$f, i8imm:$cond),
+                    "#CMOV_V2I64 PSEUDO!",
+                    [(set VR128:$dst,
+                      (v2i64 (X86cmov VR128:$t, VR128:$f, imm:$cond,
+                                          EFLAGS)))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// SSE1 Instructions
+//===----------------------------------------------------------------------===//
+
+// Move Instructions
+// Register-to-register movss has no pattern; it is emitted by copy lowering.
+let neverHasSideEffects = 1 in
+def MOVSSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                  "movss\t{$src, $dst|$dst, $src}", []>;
+// The load form can be folded/rematerialized by the register allocator.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVSSrm : SSI<0x10, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                  "movss\t{$src, $dst|$dst, $src}",
+                  [(set FR32:$dst, (loadf32 addr:$src))]>;
+def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
+                  "movss\t{$src, $dst|$dst, $src}",
+                  [(store FR32:$src, addr:$dst)]>;
+
+// Conversion instructions
+// cvttss2si truncates toward zero (fp_to_sint); cvtsi2ss is sint_to_fp.
+// The rm forms fold the scalar load into the conversion.
+def CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint FR32:$src))]>;
+def CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+                      "cvttss2si\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint (loadf32 addr:$src)))]>;
+def CVTSI2SSrr  : SSI<0x2A, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                      "cvtsi2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SSrm  : SSI<0x2A, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                      "cvtsi2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+// Pattern-less assembler forms; the Int_* variants below carry the patterns.
+def CVTSS2SIrr: SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins FR32:$src),
+                    "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>;
+def CVTSS2SIrm: SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+                    "cvtss2si{l}\t{$src, $dst|$dst, $src}", []>;
+
+def Int_CVTSS2SIrr : SSI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+                         "cvtss2si\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse_cvtss2si VR128:$src))]>;
+def Int_CVTSS2SIrm : SSI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+                         "cvtss2si\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse_cvtss2si
+                                           (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+def Int_CVTPS2PIrr : PSI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                         "cvtps2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvtps2pi VR128:$src))]>;
+def Int_CVTPS2PIrm : PSI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+                         "cvtps2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvtps2pi
+                                           (load addr:$src)))]>;
+def Int_CVTTPS2PIrr: PSI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                         "cvttps2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvttps2pi VR128:$src))]>;
+def Int_CVTTPS2PIrm: PSI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f64mem:$src),
+                         "cvttps2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvttps2pi
+                                           (load addr:$src)))]>;
+// cvtpi2ps only writes the low half of the XMM register, so the destination
+// is tied to $src1.
+let Constraints = "$src1 = $dst" in {
+  def Int_CVTPI2PSrr : PSI<0x2A, MRMSrcReg,
+                           (outs VR128:$dst), (ins VR128:$src1, VR64:$src2),
+                        "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+                                           VR64:$src2))]>;
+  def Int_CVTPI2PSrm : PSI<0x2A, MRMSrcMem,
+                           (outs VR128:$dst), (ins VR128:$src1, i64mem:$src2),
+                        "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse_cvtpi2ps VR128:$src1,
+                                            (load addr:$src2)))]>;
+}
+
+// Aliases for intrinsics
+// Same underlying instructions as above, but matching the intrinsics that
+// take/return full XMM registers.
+def Int_CVTTSS2SIrr : SSI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+                          "cvttss2si\t{$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse_cvttss2si VR128:$src))]>;
+def Int_CVTTSS2SIrm : SSI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f32mem:$src),
+                          "cvttss2si\t{$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse_cvttss2si(load addr:$src)))]>;
+
+// cvtsi2ss preserves the upper XMM elements, hence the tied operand.
+let Constraints = "$src1 = $dst" in {
+  def Int_CVTSI2SSrr : SSI<0x2A, MRMSrcReg,
+                           (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+                           "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+                                              GR32:$src2))]>;
+  def Int_CVTSI2SSrm : SSI<0x2A, MRMSrcMem,
+                           (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+                           "cvtsi2ss\t{$src2, $dst|$dst, $src2}",
+                           [(set VR128:$dst, (int_x86_sse_cvtsi2ss VR128:$src1,
+                                              (loadi32 addr:$src2)))]>;
+}
+
+// Comparison instructions
+// cmpss takes the condition as a pseudo-operand ($cc) folded into the
+// mnemonic; no patterns here — selection happens via the Int_CMPSS forms.
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+  def CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+                    (outs FR32:$dst), (ins FR32:$src1, FR32:$src, SSECC:$cc),
+                    "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+  def CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+                    (outs FR32:$dst), (ins FR32:$src1, f32mem:$src, SSECC:$cc),
+                    "cmp${cc}ss\t{$src, $dst|$dst, $src}", []>;
+}
+
+// Ordered/unordered scalar compares that set EFLAGS.
+let Defs = [EFLAGS] in {
+def UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins FR32:$src1, FR32:$src2),
+                   "ucomiss\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR32:$src1, FR32:$src2), (implicit EFLAGS)]>;
+def UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs), (ins FR32:$src1, f32mem:$src2),
+                   "ucomiss\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR32:$src1, (loadf32 addr:$src2)),
+                    (implicit EFLAGS)]>;
+                    
+def COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                  "comiss\t{$src2, $src1|$src1, $src2}", []>;
+def COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+                  "comiss\t{$src2, $src1|$src1, $src2}", []>;
+                  
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+let Constraints = "$src1 = $dst" in {
+  def Int_CMPSSrr : SSIi8<0xC2, MRMSrcReg,
+                        (outs VR128:$dst), 
+                        (ins VR128:$src1, VR128:$src, SSECC:$cc),
+                        "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss 
+                                             VR128:$src1,
+                                             VR128:$src, imm:$cc))]>;
+  def Int_CMPSSrm : SSIi8<0xC2, MRMSrcMem,
+                        (outs VR128:$dst), 
+                        (ins VR128:$src1, f32mem:$src, SSECC:$cc),
+                        "cmp${cc}ss\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse_cmp_ss VR128:$src1,
+                                           (load addr:$src), imm:$cc))]>;
+}
+
+let Defs = [EFLAGS] in {
+def Int_UCOMISSrr: PSI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                       "ucomiss\t{$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v4f32 VR128:$src1), VR128:$src2),
+                        (implicit EFLAGS)]>;
+def Int_UCOMISSrm: PSI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+                       "ucomiss\t{$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v4f32 VR128:$src1), (load addr:$src2)),
+                        (implicit EFLAGS)]>;
+
+def Int_COMISSrr: PSI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                      "comiss\t{$src2, $src1|$src1, $src2}",
+                      [(X86comi (v4f32 VR128:$src1), VR128:$src2),
+                       (implicit EFLAGS)]>;
+def Int_COMISSrm: PSI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+                      "comiss\t{$src2, $src1|$src1, $src2}",
+                      [(X86comi (v4f32 VR128:$src1), (load addr:$src2)),
+                       (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE1 instructions for scalar use. These all have names
+// that start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+// Note the 0xEF opcode is pxor; MRMInitReg makes it a reg,reg self-xor.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+    canFoldAsLoad = 1 in
+  // FIXME: Set encoding to pseudo!
+def FsFLD0SS : I<0xEF, MRMInitReg, (outs FR32:$dst), (ins), "",
+                 [(set FR32:$dst, fp32imm0)]>,
+                 Requires<[HasSSE1]>, TB, OpSize;
+
+// Alias instruction to do FR32 reg-to-reg copy using movaps. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPSrr : PSI<0x28, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                     "movaps\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR32 from f128mem using movaps. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def FsMOVAPSrm : PSI<0x28, MRMSrcMem, (outs FR32:$dst), (ins f128mem:$src),
+                     "movaps\t{$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (alignedloadfsf32 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+// These match the scalar-FP X86fand/X86for/X86fxor nodes on FR32 operands.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+  def FsANDPSrr : PSI<0x54, MRMSrcReg, (outs FR32:$dst),
+                                       (ins FR32:$src1, FR32:$src2),
+                      "andps\t{$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86fand FR32:$src1, FR32:$src2))]>;
+  def FsORPSrr  : PSI<0x56, MRMSrcReg, (outs FR32:$dst),
+                                       (ins FR32:$src1, FR32:$src2),
+                      "orps\t{$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86for FR32:$src1, FR32:$src2))]>;
+  def FsXORPSrr : PSI<0x57, MRMSrcReg, (outs FR32:$dst),
+                                       (ins FR32:$src1, FR32:$src2),
+                      "xorps\t{$src2, $dst|$dst, $src2}",
+                      [(set FR32:$dst, (X86fxor FR32:$src1, FR32:$src2))]>;
+}
+
+// Memory forms: the 128-bit memory operand is matched through memopfsf32.
+def FsANDPSrm : PSI<0x54, MRMSrcMem, (outs FR32:$dst),
+                                     (ins FR32:$src1, f128mem:$src2),
+                    "andps\t{$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86fand FR32:$src1,
+                                      (memopfsf32 addr:$src2)))]>;
+def FsORPSrm  : PSI<0x56, MRMSrcMem, (outs FR32:$dst),
+                                     (ins FR32:$src1, f128mem:$src2),
+                    "orps\t{$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86for FR32:$src1,
+                                      (memopfsf32 addr:$src2)))]>;
+def FsXORPSrm : PSI<0x57, MRMSrcMem, (outs FR32:$dst),
+                                     (ins FR32:$src1, f128mem:$src2),
+                    "xorps\t{$src2, $dst|$dst, $src2}",
+                    [(set FR32:$dst, (X86fxor FR32:$src1,
+                                      (memopfsf32 addr:$src2)))]>;
+
+// andnps has no scalar-FP node to match, so these are pattern-less.
+let neverHasSideEffects = 1 in {
+def FsANDNPSrr : PSI<0x55, MRMSrcReg,
+                     (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+                     "andnps\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPSrm : PSI<0x55, MRMSrcMem,
+                     (outs FR32:$dst), (ins FR32:$src1, f128mem:$src2),
+                     "andnps\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse1_fp_binop_rm - SSE1 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F32Int,
+                                  bit Commutable = 0> {
+  // Scalar operation, reg+reg.
+  def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                                 (ins FR32:$src1, f32mem:$src2),
+                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                 (ins VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+             [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  // Not marked commutable: upper elements come from $src1 only.
+  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]>;
+
+  // Intrinsic operation, reg+mem.
+  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, ssmem:$src2),
+                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1,
+                                               sse_load_f32:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+// add/mul are commutable (final 1 bit); sub/div are not.
+defm ADD : basic_sse1_fp_binop_rm<0x58, "add", fadd, int_x86_sse_add_ss, 1>;
+defm MUL : basic_sse1_fp_binop_rm<0x59, "mul", fmul, int_x86_sse_mul_ss, 1>;
+defm SUB : basic_sse1_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse_sub_ss>;
+defm DIV : basic_sse1_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse_div_ss>;
+
+/// sse1_fp_binop_rm - Other SSE1 binops
+///
+/// This multiclass is like basic_sse1_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+let Constraints = "$src1 = $dst" in {
+multiclass sse1_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F32Int,
+                            Intrinsic V4F32Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SSrr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
+                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, FR32:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SSrm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+                                 (ins FR32:$src1, f32mem:$src2),
+                 !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR32:$dst, (OpNode FR32:$src1, (load addr:$src2)))]>;
+
+  // Vector operation, reg+reg.
+  def PSrr : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v4f32 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PSrm : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                 (ins VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+             [(set VR128:$dst, (OpNode VR128:$src1, (memopv4f32 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  // NOTE(review): unlike basic_sse1_fp_binop_rm, this _Int form honors the
+  // Commutable bit — confirm that is intended for max/min intrinsics.
+  def SSrr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SSrm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, ssmem:$src2),
+                     !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F32Int VR128:$src1,
+                                               sse_load_f32:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PSrr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V4F32Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem.
+  def PSrm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, f128mem:$src2),
+                     !strconcat(OpcodeStr, "ps\t{$src2, $dst|$dst, $src2}"),
+           [(set VR128:$dst, (V4F32Int VR128:$src1, (memopv4f32 addr:$src2)))]>;
+}
+}
+
+// max/min get the full eight-instruction expansion (vector intrinsic forms
+// included) since they do not map onto plain C operators.
+defm MAX : sse1_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse_max_ss, int_x86_sse_max_ps>;
+defm MIN : sse1_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse_min_ss, int_x86_sse_min_ps>;
+
+//===----------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+// MOVAPS: aligned 128-bit moves (alignedload/alignedstore fragments).
+let neverHasSideEffects = 1 in
+def MOVAPSrr : PSI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVAPSrm : PSI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (alignedloadv4f32 addr:$src))]>;
+
+def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movaps\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v4f32 VR128:$src), addr:$dst)]>;
+
+// MOVUPS: unaligned 128-bit moves (plain load/store fragments).
+let neverHasSideEffects = 1 in
+def MOVUPSrr : PSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVUPSrm : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv4f32 addr:$src))]>;
+def MOVUPSmr : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movups\t{$src, $dst|$dst, $src}",
+                   [(store (v4f32 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPS load and store
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVUPSrm_Int : PSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "movups\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse_loadu_ps addr:$src))]>;
+def MOVUPSmr_Int : PSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                       "movups\t{$src, $dst|$dst, $src}",
+                       [(int_x86_sse_storeu_ps addr:$dst, VR128:$src)]>;
+
+// MOVLPS/MOVHPS load a 64-bit half of the destination vector; the other
+// half passes through from $src1 (hence the tied-operand constraint).
+let Constraints = "$src1 = $dst" in {
+  let AddedComplexity = 20 in {
+    def MOVLPSrm : PSI<0x12, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+                       "movlps\t{$src2, $dst|$dst, $src2}",
+       [(set VR128:$dst,
+         (movlp VR128:$src1,
+                (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+    // NOTE(review): this pattern matches the movlhps fragment against a
+    // loaded f64 broadcast into the low half — verify the fragment indeed
+    // models "insert into high half" as MOVHPS requires.
+    def MOVHPSrm : PSI<0x16, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+                       "movhps\t{$src2, $dst|$dst, $src2}",
+       [(set VR128:$dst,
+         (movlhps VR128:$src1,
+                (bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))]>;
+  } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+// Store the low 64 bits of the vector (extract f64 element 0).
+def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (bc_v2f64 (v4f32 VR128:$src)),
+                                 (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhps\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (unpckh (bc_v2f64 (v4f32 VR128:$src)),
+                                         (undef)), (iPTR 0))), addr:$dst)]>;
+
+// Register-to-register half moves between the low/high 64-bit lanes.
+let Constraints = "$src1 = $dst" in {
+let AddedComplexity = 20 in {
+def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                    "movlhps\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (v4f32 (movlhps VR128:$src1, VR128:$src2)))]>;
+
+def MOVHLPSrr : PSI<0x12, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                    "movhlps\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (v4f32 (movhlps VR128:$src1, VR128:$src2)))]>;
+} // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+// movddup (duplicate the low 64 bits) can be emulated on SSE1 by
+// movlhps of a register with itself.
+let AddedComplexity = 20 in {
+def : Pat<(v4f32 (movddup VR128:$src, (undef))),
+          (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v2i64 (movddup VR128:$src, (undef))),
+          (MOVLHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+
+
+// Arithmetic
+
+/// sse1_fp_unop_rm - SSE1 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+multiclass sse1_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F32Int,
+                           Intrinsic V4F32Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.
+  def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode FR32:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, mem.
+  // Folding the load partially updates the destination register; only do
+  // it when optimizing for size (OptForSize predicate).
+  def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+                !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                [(set FR32:$dst, (OpNode (load addr:$src)))]>, XS,
+            Requires<[HasSSE1, OptForSize]>;
+
+  // Vector operation, reg.
+  def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+              !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, mem.
+  def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))]>;
+
+  // Intrinsic operation, reg.
+  def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, mem.
+  def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst), (ins ssmem:$src),
+                    !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F32Int sse_load_f32:$src))]>;
+
+  // Vector intrinsic operation, reg
+  def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, mem
+  def PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                    !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))]>;
+}
+
+// Square root.
+defm SQRT  : sse1_fp_unop_rm<0x51, "sqrt",  fsqrt,
+                             int_x86_sse_sqrt_ss, int_x86_sse_sqrt_ps>;
+
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rm<0x52, "rsqrt", X86frsqrt,
+                             int_x86_sse_rsqrt_ss, int_x86_sse_rsqrt_ps>;
+defm RCP   : sse1_fp_unop_rm<0x53, "rcp",   X86frcp,
+                             int_x86_sse_rcp_ss, int_x86_sse_rcp_ps>;
+
+// Logical
+// Packed logical ops are selected from integer (v2i64) and/or/xor nodes;
+// memory operands bitcast the FP register operand to v2i64 first.
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 1 in {
+    def ANDPSrr : PSI<0x54, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                      "andps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (and VR128:$src1, VR128:$src2)))]>;
+    def ORPSrr  : PSI<0x56, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                      "orps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (or VR128:$src1, VR128:$src2)))]>;
+    def XORPSrr : PSI<0x57, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                      "xorps\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst, (v2i64
+                                         (xor VR128:$src1, VR128:$src2)))]>;
+  }
+
+  def ANDPSrm : PSI<0x54, MRMSrcMem,
+                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                    "andps\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (and (bc_v2i64 (v4f32 VR128:$src1)),
+                                       (memopv2i64 addr:$src2)))]>;
+  def ORPSrm  : PSI<0x56, MRMSrcMem,
+                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                    "orps\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (or (bc_v2i64 (v4f32 VR128:$src1)),
+                                       (memopv2i64 addr:$src2)))]>;
+  def XORPSrm : PSI<0x57, MRMSrcMem,
+                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                    "xorps\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (xor (bc_v2i64 (v4f32 VR128:$src1)),
+                                       (memopv2i64 addr:$src2)))]>;
+  // andnps: matched as (~src1) & src2, i.e. xor against all-ones then and.
+  def ANDNPSrr : PSI<0x55, MRMSrcReg,
+                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                     "andnps\t{$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (v2i64 (and (xor VR128:$src1,
+                                    (bc_v2i64 (v4i32 immAllOnesV))),
+                               VR128:$src2)))]>;
+  def ANDNPSrm : PSI<0x55, MRMSrcMem,
+                     (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+                     "andnps\t{$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (v2i64 (and (xor (bc_v2i64 (v4f32 VR128:$src1)),
+                                    (bc_v2i64 (v4i32 immAllOnesV))),
+                               (memopv2i64 addr:$src2))))]>;
+}
+
+// CMPPS: packed compare selected by the $cc immediate; matched from the
+// SSE intrinsic, with extra Pat<>s below for the X86cmpps node.
+let Constraints = "$src1 = $dst" in {
+  def CMPPSrri : PSIi8<0xC2, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+                    "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                                        VR128:$src, imm:$cc))]>;
+  def CMPPSrmi : PSIi8<0xC2, MRMSrcMem,
+                  (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+                  "cmp${cc}ps\t{$src, $dst|$dst, $src}",
+                  [(set VR128:$dst, (int_x86_sse_cmp_ps VR128:$src1,
+                                            (memop addr:$src), imm:$cc))]>;
+}
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPSrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v4i32 (X86cmpps (v4f32 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+let Constraints = "$src1 = $dst" in {
+  let isConvertibleToThreeAddress = 1 in // Convert to pshufd
+    def SHUFPSrri : PSIi8<0xC6, MRMSrcReg,
+                          (outs VR128:$dst), (ins VR128:$src1,
+                           VR128:$src2, i8imm:$src3),
+                          "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                          [(set VR128:$dst,
+                            (v4f32 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+  def SHUFPSrmi : PSIi8<0xC6, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1,
+                         f128mem:$src2, i8imm:$src3),
+                        "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst,
+                          (v4f32 (shufp:$src3
+                                  VR128:$src1, (memopv4f32 addr:$src2))))]>;
+
+  let AddedComplexity = 10 in {
+    def UNPCKHPSrr : PSI<0x15, MRMSrcReg,
+                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                         "unpckhps\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (unpckh VR128:$src1, VR128:$src2)))]>;
+    def UNPCKHPSrm : PSI<0x15, MRMSrcMem,
+                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                         "unpckhps\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (unpckh VR128:$src1,
+                                          (memopv4f32 addr:$src2))))]>;
+
+    def UNPCKLPSrr : PSI<0x14, MRMSrcReg,
+                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                         "unpcklps\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v4f32 (unpckl VR128:$src1, VR128:$src2)))]>;
+    def UNPCKLPSrm : PSI<0x14, MRMSrcMem,
+                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                         "unpcklps\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (unpckl VR128:$src1, (memopv4f32 addr:$src2)))]>;
+  } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+// Mask creation
+// movmskps/movmskpd: extract sign bits of each element into a GPR.
+def MOVMSKPSrr : PSI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+                     "movmskps\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse_movmsk_ps VR128:$src))]>;
+def MOVMSKPDrr : PDI<0x50, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+                     "movmskpd\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse2_movmsk_pd VR128:$src))]>;
+
+// Prefetch intrinsic.
+// The third pattern operand is the locality hint: 3 -> t0, 2 -> t1,
+// 1 -> t2, 0 -> nta. All share opcode 0x18, distinguished by /r field.
+def PREFETCHT0   : PSI<0x18, MRM1m, (outs), (ins i8mem:$src),
+    "prefetcht0\t$src", [(prefetch addr:$src, imm, (i32 3))]>;
+def PREFETCHT1   : PSI<0x18, MRM2m, (outs), (ins i8mem:$src),
+    "prefetcht1\t$src", [(prefetch addr:$src, imm, (i32 2))]>;
+def PREFETCHT2   : PSI<0x18, MRM3m, (outs), (ins i8mem:$src),
+    "prefetcht2\t$src", [(prefetch addr:$src, imm, (i32 1))]>;
+def PREFETCHNTA  : PSI<0x18, MRM0m, (outs), (ins i8mem:$src),
+    "prefetchnta\t$src", [(prefetch addr:$src, imm, (i32 0))]>;
+
+// Non-temporal stores
+def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                    "movntps\t{$src, $dst|$dst, $src}",
+                    [(int_x86_sse_movnt_ps addr:$dst, VR128:$src)]>;
+
+// Load, store, and memory fence
+def SFENCE : PSI<0xAE, MRM7r, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>;
+
+// MXCSR register
+def LDMXCSR : PSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
+                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>;
+def STMXCSR : PSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
+                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>;
+
+// Alias instructions that map zero vector to pxor / xorp* for sse.
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-zeros value if folding it would be beneficial.
+// FIXME: Change encoding to pseudo!
+// 0x57 is xorps; MRMInitReg yields "xorps %reg, %reg".
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1 in
+def V_SET0 : PSI<0x57, MRMInitReg, (outs VR128:$dst), (ins), "",
+                 [(set VR128:$dst, (v4i32 immAllZerosV))]>;
+
+// Zero vectors of every 128-bit type map to the single V_SET0 def.
+let Predicates = [HasSSE1] in {
+  def : Pat<(v2i64 immAllZerosV), (V_SET0)>;
+  def : Pat<(v8i16 immAllZerosV), (V_SET0)>;
+  def : Pat<(v16i8 immAllZerosV), (V_SET0)>;
+  def : Pat<(v2f64 immAllZerosV), (V_SET0)>;
+  def : Pat<(v4f32 immAllZerosV), (V_SET0)>;
+}
+
+// FR32 to 128-bit vector conversion.
+let isAsCheapAsAMove = 1 in
+def MOVSS2PSrr : SSI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR32:$src),
+                      "movss\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4f32 (scalar_to_vector FR32:$src)))]>;
+def MOVSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
+                     "movss\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v4f32 (scalar_to_vector (loadf32 addr:$src))))]>;
+
+// FIXME: may not be able to eliminate this movss with coalescing the src and
+// dest register classes are different. We really want to write this pattern
+// like this:
+// def : Pat<(f32 (vector_extract (v4f32 VR128:$src), (iPTR 0))),
+//           (f32 FR32:$src)>;
+// Extract element 0 of a v4f32 into an FR32 register / to memory.
+let isAsCheapAsAMove = 1 in
+def MOVPS2SSrr : SSI<0x10, MRMSrcReg, (outs FR32:$dst), (ins VR128:$src),
+                     "movss\t{$src, $dst|$dst, $src}",
+                     [(set FR32:$dst, (vector_extract (v4f32 VR128:$src),
+                                       (iPTR 0)))]>;
+def MOVPS2SSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
+                     "movss\t{$src, $dst|$dst, $src}",
+                     [(store (f32 (vector_extract (v4f32 VR128:$src),
+                                   (iPTR 0))), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let Constraints = "$src1 = $dst" in {
+let neverHasSideEffects = 1 in
+  def MOVLSS2PSrr : SSI<0x10, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, FR32:$src2),
+                        "movss\t{$src2, $dst|$dst, $src2}", []>;
+
+  let AddedComplexity = 15 in
+    def MOVLPSrr : SSI<0x10, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "movss\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v4f32 (movl VR128:$src1, VR128:$src2)))]>;
+}
+
+// Move to lower bits of a VR128 and zeroing upper bits.
+// Loading from memory automatically zeroing upper bits.
+let AddedComplexity = 20 in
+def MOVZSS2PSrm : SSI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f32mem:$src),
+                      "movss\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (v4f32 (X86vzmovl (v4f32 (scalar_to_vector
+                                                    (loadf32 addr:$src))))))]>;
+
+def : Pat<(v4f32 (X86vzmovl (loadv4f32 addr:$src))),
+          (MOVZSS2PSrm addr:$src)>;
+
+//===---------------------------------------------------------------------===//
+// SSE2 Instructions
+//===---------------------------------------------------------------------===//
+
+// Move Instructions
+// Reg-to-reg movsd carries no pattern; marked side-effect-free so dead
+// copies can be removed.
+let neverHasSideEffects = 1 in
+def MOVSDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                  "movsd\t{$src, $dst|$dst, $src}", []>;
+// Load form may be folded or rematerialized; mayHaveSideEffects = 1 keeps
+// the remat logic conservative about hoisting it.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVSDrm : SDI<0x10, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+                  "movsd\t{$src, $dst|$dst, $src}",
+                  [(set FR64:$dst, (loadf64 addr:$src))]>;
+def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
+                  "movsd\t{$src, $dst|$dst, $src}",
+                  [(store FR64:$src, addr:$dst)]>;
+
+// Conversion instructions
+// Scalar f64 <-> i32 / f32 conversions, in reg and mem-source forms.
+def CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins FR64:$src),
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint FR64:$src))]>;
+def CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f64mem:$src),
+                      "cvttsd2si\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (fp_to_sint (loadf64 addr:$src)))]>;
+def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround FR64:$src))]>;
+// Memory-source cvtsd2ss is only selected when optimizing for size (the
+// load would otherwise be kept separate); note raw 'I' + explicit XD prefix
+// so the extra Requires<> predicate can be attached.
+def CVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
+                      "cvtsd2ss\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (fround (loadf64 addr:$src)))]>, XD,
+                  Requires<[HasSSE2, OptForSize]>;
+def CVTSI2SDrr  : SDI<0x2A, MRMSrcReg, (outs FR64:$dst), (ins GR32:$src),
+                      "cvtsi2sd\t{$src, $dst|$dst, $src}",
+                      [(set FR64:$dst, (sint_to_fp GR32:$src))]>;
+def CVTSI2SDrm  : SDI<0x2A, MRMSrcMem, (outs FR64:$dst), (ins i32mem:$src),
+                      "cvtsi2sd\t{$src, $dst|$dst, $src}",
+                      [(set FR64:$dst, (sint_to_fp (loadi32 addr:$src)))]>;
+
+// Packed int<->fp conversions.  These forms have empty patterns: they are
+// selected via the Int_* intrinsic aliases or custom lowering, not by
+// generic dag patterns.
+def CVTPD2DQrm  : S3DI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTPD2DQrr  : S3DI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtpd2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PDrm  : S3SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PDrr  : S3SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}", []>;
+def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PSrr : PSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
+def CVTDQ2PSrm : PSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtdq2ps\t{$src, $dst|$dst, $src}", []>;
+// comisd with no outputs or pattern; the EFLAGS-writing variants used by
+// codegen are Int_COMISDrr/rm below.
+def COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                  "comisd\t{$src2, $src1|$src1, $src2}", []>;
+def COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+                      "comisd\t{$src2, $src1|$src1, $src2}", []>;
+
+// SSE2 instructions with XS prefix
+// cvtss2sd uses the raw 'I' class plus an explicit XS prefix so a
+// Requires<> predicate can be attached.
+def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (fextend FR32:$src))]>, XS,
+                 Requires<[HasSSE2]>;
+// Folded-load form only when optimizing for size; see OptForSpeed pattern
+// below for the alternative.
+def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
+                   "cvtss2sd\t{$src, $dst|$dst, $src}",
+                   [(set FR64:$dst, (extloadf32 addr:$src))]>, XS,
+                 Requires<[HasSSE2, OptForSize]>;
+
+// When optimizing for speed, expand extloadf32 into a separate movss load
+// followed by a register cvtss2sd instead of the folded memory form.
+def : Pat<(extloadf32 addr:$src),
+          (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[HasSSE2, OptForSpeed]>;
+
+// Match intrinsics which expect XMM operand(s).
+def Int_CVTSD2SIrr : SDI<0x2D, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+                         "cvtsd2si\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si VR128:$src))]>;
+def Int_CVTSD2SIrm : SDI<0x2D, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+                         "cvtsd2si\t{$src, $dst|$dst, $src}",
+                         [(set GR32:$dst, (int_x86_sse2_cvtsd2si
+                                           (load addr:$src)))]>;
+
+// Match intrinsics which expect MM and XMM operand(s).
+// Conversions between packed f64 in XMM and packed i32 in MMX registers.
+def Int_CVTPD2PIrr : PDI<0x2D, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                         "cvtpd2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvtpd2pi VR128:$src))]>;
+// Memory forms use 'memop' (alignment-aware load) where the hardware
+// requires an aligned 128-bit operand, plain 'load' otherwise.
+def Int_CVTPD2PIrm : PDI<0x2D, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+                         "cvtpd2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvtpd2pi
+                                           (memop addr:$src)))]>;
+def Int_CVTTPD2PIrr: PDI<0x2C, MRMSrcReg, (outs VR64:$dst), (ins VR128:$src),
+                         "cvttpd2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvttpd2pi VR128:$src))]>;
+def Int_CVTTPD2PIrm: PDI<0x2C, MRMSrcMem, (outs VR64:$dst), (ins f128mem:$src),
+                         "cvttpd2pi\t{$src, $dst|$dst, $src}",
+                         [(set VR64:$dst, (int_x86_sse_cvttpd2pi
+                                           (memop addr:$src)))]>;
+def Int_CVTPI2PDrr : PDI<0x2A, MRMSrcReg, (outs VR128:$dst), (ins VR64:$src),
+                         "cvtpi2pd\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse_cvtpi2pd VR64:$src))]>;
+def Int_CVTPI2PDrm : PDI<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                         "cvtpi2pd\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse_cvtpi2pd
+                                            (load addr:$src)))]>;
+
+// Aliases for intrinsics
+def Int_CVTTSD2SIrr : SDI<0x2C, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+                          "cvttsd2si\t{$src, $dst|$dst, $src}",
+                          [(set GR32:$dst,
+                            (int_x86_sse2_cvttsd2si VR128:$src))]>;
+def Int_CVTTSD2SIrm : SDI<0x2C, MRMSrcMem, (outs GR32:$dst), (ins f128mem:$src),
+                          "cvttsd2si\t{$src, $dst|$dst, $src}",
+                          [(set GR32:$dst, (int_x86_sse2_cvttsd2si
+                                            (load addr:$src)))]>;
+
+// Comparison instructions
+// cmpsd takes an SSECC immediate selecting the predicate; the mnemonic is
+// printed with the condition folded in ("cmp${cc}sd").  No patterns here:
+// selection happens through the Int_CMPSD forms or custom lowering.
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+  def CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+                    (outs FR64:$dst), (ins FR64:$src1, FR64:$src, SSECC:$cc),
+                    "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+let mayLoad = 1 in
+  def CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+                    (outs FR64:$dst), (ins FR64:$src1, f64mem:$src, SSECC:$cc),
+                    "cmp${cc}sd\t{$src, $dst|$dst, $src}", []>;
+}
+
+// Scalar unordered compare; matched from the generic X86cmp node and
+// defines EFLAGS implicitly.
+let Defs = [EFLAGS] in {
+def UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins FR64:$src1, FR64:$src2),
+                   "ucomisd\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR64:$src1, FR64:$src2), (implicit EFLAGS)]>;
+def UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs), (ins FR64:$src1, f64mem:$src2),
+                   "ucomisd\t{$src2, $src1|$src1, $src2}",
+                   [(X86cmp FR64:$src1, (loadf64 addr:$src2)),
+                    (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases to match intrinsics which expect XMM operand(s).
+// The condition code reaches the encoder as imm:$cc.
+let Constraints = "$src1 = $dst" in {
+  def Int_CMPSDrr : SDIi8<0xC2, MRMSrcReg,
+                        (outs VR128:$dst), 
+                        (ins VR128:$src1, VR128:$src, SSECC:$cc),
+                        "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+                                           VR128:$src, imm:$cc))]>;
+  def Int_CMPSDrm : SDIi8<0xC2, MRMSrcMem,
+                        (outs VR128:$dst), 
+                        (ins VR128:$src1, f64mem:$src, SSECC:$cc),
+                        "cmp${cc}sd\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cmp_sd VR128:$src1,
+                                           (load addr:$src), imm:$cc))]>;
+}
+
+// ucomisd/comisd on full XMM operands, matched from the X86ucomi/X86comi
+// nodes; both define EFLAGS.
+let Defs = [EFLAGS] in {
+def Int_UCOMISDrr: PDI<0x2E, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                       "ucomisd\t{$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
+                        (implicit EFLAGS)]>;
+def Int_UCOMISDrm: PDI<0x2E, MRMSrcMem, (outs),(ins VR128:$src1, f128mem:$src2),
+                       "ucomisd\t{$src2, $src1|$src1, $src2}",
+                       [(X86ucomi (v2f64 VR128:$src1), (load addr:$src2)),
+                        (implicit EFLAGS)]>;
+
+def Int_COMISDrr: PDI<0x2F, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                      "comisd\t{$src2, $src1|$src1, $src2}",
+                      [(X86comi (v2f64 VR128:$src1), (v2f64 VR128:$src2)),
+                       (implicit EFLAGS)]>;
+def Int_COMISDrm: PDI<0x2F, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
+                      "comisd\t{$src2, $src1|$src1, $src2}",
+                      [(X86comi (v2f64 VR128:$src1), (load addr:$src2)),
+                       (implicit EFLAGS)]>;
+} // Defs = [EFLAGS]
+
+// Aliases of packed SSE2 instructions for scalar use. These all have names
+// that start with 'Fs'.
+
+// Alias instructions that map fld0 to pxor for sse.
+// Materializes +0.0 into an FR64 register via pxor (opcode 0xEF); cheap to
+// rematerialize and codegen-only (never parsed/printed as real asm).
+let isReMaterializable = 1, isAsCheapAsAMove = 1, isCodeGenOnly = 1,
+    canFoldAsLoad = 1 in
+def FsFLD0SD : I<0xEF, MRMInitReg, (outs FR64:$dst), (ins), "",
+                 [(set FR64:$dst, fpimm0)]>,
+               Requires<[HasSSE2]>, TB, OpSize;
+
+// Alias instruction to do FR64 reg-to-reg copy using movapd. Upper bits are
+// disregarded.
+let neverHasSideEffects = 1 in
+def FsMOVAPDrr : PDI<0x28, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                     "movapd\t{$src, $dst|$dst, $src}", []>;
+
+// Alias instruction to load FR64 from f128mem using movapd. Upper bits are
+// disregarded.
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
+                     "movapd\t{$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (alignedloadfsf64 addr:$src))]>;
+
+// Alias bitwise logical operations using SSE logical ops on packed FP values.
+// These operate on FR64 scalars via the X86fand/X86for/X86fxor nodes; the
+// packed instruction is reused, upper lanes are don't-care.
+let Constraints = "$src1 = $dst" in {
+let isCommutable = 1 in {
+  def FsANDPDrr : PDI<0x54, MRMSrcReg, (outs FR64:$dst),
+                                       (ins FR64:$src1, FR64:$src2),
+                      "andpd\t{$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86fand FR64:$src1, FR64:$src2))]>;
+  def FsORPDrr  : PDI<0x56, MRMSrcReg, (outs FR64:$dst),
+                                       (ins FR64:$src1, FR64:$src2),
+                      "orpd\t{$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86for FR64:$src1, FR64:$src2))]>;
+  def FsXORPDrr : PDI<0x57, MRMSrcReg, (outs FR64:$dst),
+                                       (ins FR64:$src1, FR64:$src2),
+                      "xorpd\t{$src2, $dst|$dst, $src2}",
+                      [(set FR64:$dst, (X86fxor FR64:$src1, FR64:$src2))]>;
+}
+
+// Memory forms: the 128-bit operand must satisfy memopfsf64 (alignment-aware
+// f64 load through an f128mem operand).
+def FsANDPDrm : PDI<0x54, MRMSrcMem, (outs FR64:$dst),
+                                     (ins FR64:$src1, f128mem:$src2),
+                    "andpd\t{$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86fand FR64:$src1,
+                                      (memopfsf64 addr:$src2)))]>;
+def FsORPDrm  : PDI<0x56, MRMSrcMem, (outs FR64:$dst),
+                                     (ins FR64:$src1, f128mem:$src2),
+                    "orpd\t{$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86for FR64:$src1,
+                                      (memopfsf64 addr:$src2)))]>;
+def FsXORPDrm : PDI<0x57, MRMSrcMem, (outs FR64:$dst),
+                                     (ins FR64:$src1, f128mem:$src2),
+                    "xorpd\t{$src2, $dst|$dst, $src2}",
+                    [(set FR64:$dst, (X86fxor FR64:$src1,
+                                      (memopfsf64 addr:$src2)))]>;
+
+// andnpd has no scalar dag node, so these carry no patterns.
+let neverHasSideEffects = 1 in {
+def FsANDNPDrr : PDI<0x55, MRMSrcReg,
+                     (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+                     "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+let mayLoad = 1 in
+def FsANDNPDrm : PDI<0x55, MRMSrcMem,
+                     (outs FR64:$dst), (ins FR64:$src1, f128mem:$src2),
+                     "andnpd\t{$src2, $dst|$dst, $src2}", []>;
+}
+}
+
+/// basic_sse2_fp_binop_rm - SSE2 binops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a scalar)
+/// and leaves the top elements unmodified (therefore these cannot be commuted).
+///
+/// These three forms can each be reg+reg or reg+mem, so there are a total of
+/// six "instructions".
+///
+/// Parameters: opc - shared opcode byte; OpcodeStr - mnemonic stem ("add",
+/// "mul", ...); OpNode - generic dag node (fadd, fmul, ...); F64Int - the
+/// scalar _sd intrinsic; Commutable - set isCommutable on the reg+reg forms.
+let Constraints = "$src1 = $dst" in {
+multiclass basic_sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                                  SDNode OpNode, Intrinsic F64Int,
+                                  bit Commutable = 0> {
+  // Scalar operation, reg+reg.
+  def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+                                 (ins FR64:$src1, f64mem:$src2),
+                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                 (ins VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+             [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]>;
+
+  // Intrinsic operation, reg+mem.
+  def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, sdmem:$src2),
+                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1,
+                                               sse_load_f64:$src2))]>;
+}
+}
+
+// Arithmetic instructions
+// add/mul are commutable (trailing 1); sub/div are not.
+defm ADD : basic_sse2_fp_binop_rm<0x58, "add", fadd, int_x86_sse2_add_sd, 1>;
+defm MUL : basic_sse2_fp_binop_rm<0x59, "mul", fmul, int_x86_sse2_mul_sd, 1>;
+defm SUB : basic_sse2_fp_binop_rm<0x5C, "sub", fsub, int_x86_sse2_sub_sd>;
+defm DIV : basic_sse2_fp_binop_rm<0x5E, "div", fdiv, int_x86_sse2_div_sd>;
+
+/// sse2_fp_binop_rm - Other SSE2 binops
+///
+/// This multiclass is like basic_sse2_fp_binop_rm, with the addition of
+/// instructions for a full-vector intrinsic form.  Operations that map
+/// onto C operators don't use this form since they just use the plain
+/// vector form instead of having a separate vector intrinsic form.
+///
+/// This provides a total of eight "instructions".
+///
+/// Extra parameter over the basic multiclass: V2F64Int - the packed _pd
+/// intrinsic (used by the PDrr_Int/PDrm_Int forms).
+let Constraints = "$src1 = $dst" in {
+multiclass sse2_fp_binop_rm<bits<8> opc, string OpcodeStr,
+                            SDNode OpNode,
+                            Intrinsic F64Int,
+                            Intrinsic V2F64Int,
+                            bit Commutable = 0> {
+
+  // Scalar operation, reg+reg.
+  def SDrr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
+                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, FR64:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, reg+mem.
+  def SDrm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+                                 (ins FR64:$src1, f64mem:$src2),
+                 !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                 [(set FR64:$dst, (OpNode FR64:$src1, (load addr:$src2)))]>;
+
+  // Vector operation, reg+reg.
+  def PDrr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2f64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, reg+mem.
+  def PDrm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                 (ins VR128:$src1, f128mem:$src2),
+                 !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+             [(set VR128:$dst, (OpNode VR128:$src1, (memopv2f64 addr:$src2)))]>;
+
+  // Intrinsic operation, reg+reg.
+  def SDrr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, reg+mem.
+  def SDrm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, sdmem:$src2),
+                     !strconcat(OpcodeStr, "sd\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (F64Int VR128:$src1,
+                                               sse_load_f64:$src2))]>;
+
+  // Vector intrinsic operation, reg+reg.
+  def PDrr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+                                     (ins VR128:$src1, VR128:$src2),
+                     !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, reg+mem.
+  def PDrm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+                                     (ins VR128:$src1, f128mem:$src2),
+                     !strconcat(OpcodeStr, "pd\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR128:$dst, (V2F64Int VR128:$src1,
+                                                 (memopv2f64 addr:$src2)))]>;
+}
+}
+
+// max/min have both scalar and packed intrinsic forms; neither is marked
+// commutable (NaN/ordering semantics).
+defm MAX : sse2_fp_binop_rm<0x5F, "max", X86fmax,
+                            int_x86_sse2_max_sd, int_x86_sse2_max_pd>;
+defm MIN : sse2_fp_binop_rm<0x5D, "min", X86fmin,
+                            int_x86_sse2_min_sd, int_x86_sse2_min_pd>;
+
+//===---------------------------------------------------------------------===//
+// SSE packed FP Instructions
+
+// Move Instructions
+// movapd: aligned packed-double moves.  The rr form has no pattern and no
+// side effects; the rm form matches only 16-byte-aligned loads
+// (alignedloadv2f64).
+let neverHasSideEffects = 1 in
+def MOVAPDrr : PDI<0x28, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, isReMaterializable = 1, mayHaveSideEffects = 1 in
+def MOVAPDrm : PDI<0x28, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (alignedloadv2f64 addr:$src))]>;
+
+def MOVAPDmr : PDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movapd\t{$src, $dst|$dst, $src}",
+                   [(alignedstore (v2f64 VR128:$src), addr:$dst)]>;
+
+// movupd: unaligned packed-double moves; matches any v2f64 load/store.
+let neverHasSideEffects = 1 in
+def MOVUPDrr : PDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1 in
+def MOVUPDrm : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (loadv2f64 addr:$src))]>;
+def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                   "movupd\t{$src, $dst|$dst, $src}",
+                   [(store (v2f64 VR128:$src), addr:$dst)]>;
+
+// Intrinsic forms of MOVUPD load and store
+def MOVUPDrm_Int : PDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "movupd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_loadu_pd addr:$src))]>;
+def MOVUPDmr_Int : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                       "movupd\t{$src, $dst|$dst, $src}",
+                       [(int_x86_sse2_storeu_pd addr:$dst, VR128:$src)]>;
+
+// movlpd/movhpd: merge a 64-bit memory operand into the low/high half of
+// an XMM register.  AddedComplexity = 20 prefers these folded forms over
+// separate load + shuffle.
+let Constraints = "$src1 = $dst" in {
+  let AddedComplexity = 20 in {
+    def MOVLPDrm : PDI<0x12, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+                       "movlpd\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v2f64 (movlp VR128:$src1,
+                                 (scalar_to_vector (loadf64 addr:$src2)))))]>;
+    // NOTE(review): the pattern uses the 'movlhps' shuffle fragment on
+    // v2f64 to express "loaded scalar goes to the high half".
+    def MOVHPDrm : PDI<0x16, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+                       "movhpd\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v2f64 (movlhps VR128:$src1,
+                                 (scalar_to_vector (loadf64 addr:$src2)))))]>;
+  } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+// Store the low f64 element of an XMM register.
+def MOVLPDmr : PDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movlpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                 (iPTR 0))), addr:$dst)]>;
+
+// v2f64 extract element 1 is always custom lowered to unpack high to low
+// and extract element 0 so the non-store version isn't too horrible.
+def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                   "movhpd\t{$src, $dst|$dst, $src}",
+                   [(store (f64 (vector_extract
+                                 (v2f64 (unpckh VR128:$src, (undef))),
+                                 (iPTR 0))), addr:$dst)]>;
+
+// SSE2 instructions without OpSize prefix
+// These use the bare 'I' class with an explicit prefix (TB/XS/XD) because
+// the usual PDI/SSI/SDI classes would attach the wrong mandatory prefix
+// for these encodings.
+def Int_CVTDQ2PSrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2ps VR128:$src))]>,
+                     TB, Requires<[HasSSE2]>;
+def Int_CVTDQ2PSrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                      "cvtdq2ps\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (int_x86_sse2_cvtdq2ps
+                                        (bitconvert (memopv2i64 addr:$src))))]>,
+                     TB, Requires<[HasSSE2]>;
+
+// SSE2 instructions with XS prefix
+def Int_CVTDQ2PDrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtdq2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtdq2pd VR128:$src))]>,
+                     XS, Requires<[HasSSE2]>;
+def Int_CVTDQ2PDrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "cvtdq2pd\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst, (int_x86_sse2_cvtdq2pd
+                                        (bitconvert (memopv2i64 addr:$src))))]>,
+                     XS, Requires<[HasSSE2]>;
+
+def Int_CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "cvtps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))]>;
+def Int_CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                         "cvtps2dq\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtps2dq
+                                            (memop addr:$src)))]>;
+// SSE2 packed instructions with XS prefix
+// Pattern-less forms; codegen uses the Int_ variants below.
+def CVTTPS2DQrr : SSI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "cvttps2dq\t{$src, $dst|$dst, $src}", []>;
+def CVTTPS2DQrm : SSI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      "cvttps2dq\t{$src, $dst|$dst, $src}", []>;
+
+def Int_CVTTPS2DQrr : I<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "cvttps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, 
+                              (int_x86_sse2_cvttps2dq VR128:$src))]>,
+                      XS, Requires<[HasSSE2]>;
+def Int_CVTTPS2DQrm : I<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                        "cvttps2dq\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvttps2dq
+                                           (memop addr:$src)))]>,
+                      XS, Requires<[HasSSE2]>;
+
+// SSE2 packed instructions with XD prefix
+def Int_CVTPD2DQrr : I<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq VR128:$src))]>,
+                     XD, Requires<[HasSSE2]>;
+def Int_CVTPD2DQrm : I<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                       "cvtpd2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtpd2dq
+                                          (memop addr:$src)))]>,
+                     XD, Requires<[HasSSE2]>;
+
+def Int_CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvttpd2dq VR128:$src))]>;
+def Int_CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
+                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
+                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
+                                             (memop addr:$src)))]>;
+
+// SSE2 instructions without OpSize prefix
+// cvtps2pd/cvtpd2ps: packed float<->double width conversions.  The plain
+// forms carry no patterns; selection goes through the Int_ variants.
+def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
+def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}", []>, TB;
+
+def Int_CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd VR128:$src))]>,
+                     TB, Requires<[HasSSE2]>;
+def Int_CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                       "cvtps2pd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_cvtps2pd
+                                          (load addr:$src)))]>,
+                     TB, Requires<[HasSSE2]>;
+
+def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
+def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                     "cvtpd2ps\t{$src, $dst|$dst, $src}", []>;
+
+
+def Int_CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtpd2ps VR128:$src))]>;
+def Int_CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
+                         [(set VR128:$dst, (int_x86_sse2_cvtpd2ps
+                                            (memop addr:$src)))]>;
+
+// Match intrinsics which expect XMM operand(s).
+// Aliases for intrinsics
+// Scalar conversions whose intrinsics take/return full XMM vectors; the
+// destination is tied to $src1 because the upper elements pass through.
+let Constraints = "$src1 = $dst" in {
+def Int_CVTSI2SDrr: SDI<0x2A, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, GR32:$src2),
+                        "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+                                           GR32:$src2))]>;
+def Int_CVTSI2SDrm: SDI<0x2A, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i32mem:$src2),
+                        "cvtsi2sd\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse2_cvtsi2sd VR128:$src1,
+                                           (loadi32 addr:$src2)))]>;
+def Int_CVTSD2SSrr: SDI<0x5A, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                   "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+                   [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+                                      VR128:$src2))]>;
+def Int_CVTSD2SSrm: SDI<0x5A, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
+                   "cvtsd2ss\t{$src2, $dst|$dst, $src2}",
+                   [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1,
+                                      (load addr:$src2)))]>;
+// cvtss2sd needs the raw 'I' class + explicit XS so Requires<> can be
+// attached (same reason as the scalar CVTSS2SD defs above).
+def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+                                       VR128:$src2))]>, XS,
+                    Requires<[HasSSE2]>;
+def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+                      (outs VR128:$dst), (ins VR128:$src1, f32mem:$src2),
+                    "cvtss2sd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1,
+                                       (load addr:$src2)))]>, XS,
+                    Requires<[HasSSE2]>;
+}
+
+// Arithmetic
+
+/// sse2_fp_unop_rm - SSE2 unops come in both scalar and vector forms.
+///
+/// In addition, we also have a special variant of the scalar form here to
+/// represent the associated intrinsic operation.  This form is unlike the
+/// plain scalar form, in that it takes an entire vector (instead of a
+/// scalar) and leaves the top elements undefined.
+///
+/// And, we have a special variant form for a full-vector intrinsic form.
+///
+/// These four forms can each have a reg or a mem operand, so there are a
+/// total of eight "instructions".
+///
+/// Naming: SD* = scalar double, PD* = packed double, *_Int = the intrinsic
+/// variants operating on a whole VR128.  The Commutable bit marks only the
+/// register-register variants as commutable; memory forms never are.
+multiclass sse2_fp_unop_rm<bits<8> opc, string OpcodeStr,
+                           SDNode OpNode,
+                           Intrinsic F64Int,
+                           Intrinsic V2F64Int,
+                           bit Commutable = 0> {
+  // Scalar operation, reg.
+  def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
+                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode FR64:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Scalar operation, mem.
+  def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins f64mem:$src),
+                !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                [(set FR64:$dst, (OpNode (load addr:$src)))]>;
+
+  // Vector operation, reg.
+  def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+              !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+              [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector operation, mem.
+  def PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))]>;
+
+  // Intrinsic operation, reg.
+  def SDr_Int : SDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Intrinsic operation, mem.  Uses the sdmem operand / sse_load_f64
+  // pattern fragment (defined elsewhere in this file) instead of a plain
+  // f64 load, since the intrinsic consumes a full vector.
+  def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst), (ins sdmem:$src),
+                    !strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (F64Int sse_load_f64:$src))]>;
+
+  // Vector intrinsic operation, reg
+  def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int VR128:$src))]> {
+    let isCommutable = Commutable;
+  }
+
+  // Vector intrinsic operation, mem
+  def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                    !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))]>;
+}
+
+// Square root.
+// Instantiates all eight sqrt forms from sse2_fp_unop_rm: the plain
+// scalar/packed variants select on the target-independent fsqrt node, the
+// *_Int variants on the SSE2 sqrt intrinsics.
+defm SQRT  : sse2_fp_unop_rm<0x51, "sqrt",  fsqrt,
+                             int_x86_sse2_sqrt_sd, int_x86_sse2_sqrt_pd>;
+
+// There is no f64 version of the reciprocal approximation instructions.
+
+// Logical
+// Bitwise logical ops on packed doubles.  The patterns model them as v2i64
+// operations via bc_v2i64 bitcasts because the DAG and/or/xor nodes are
+// integer-only.  All forms are two-address ($src1 = $dst); only the
+// symmetric reg-reg forms are commutable.  ANDNPD complements $src1 (vnot)
+// before the AND, so it is intentionally outside the commutable group.
+let Constraints = "$src1 = $dst" in {
+  let isCommutable = 1 in {
+    def ANDPDrr : PDI<0x54, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                      "andpd\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (and (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+    def ORPDrr  : PDI<0x56, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                      "orpd\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (or (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+    def XORPDrr : PDI<0x57, MRMSrcReg,
+                      (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                      "xorpd\t{$src2, $dst|$dst, $src2}",
+                      [(set VR128:$dst,
+                        (xor (bc_v2i64 (v2f64 VR128:$src1)),
+                         (bc_v2i64 (v2f64 VR128:$src2))))]>;
+  }
+
+  // Memory forms fold a 128-bit integer load (memopv2i64) as operand 2.
+  def ANDPDrm : PDI<0x54, MRMSrcMem,
+                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                    "andpd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (and (bc_v2i64 (v2f64 VR128:$src1)),
+                       (memopv2i64 addr:$src2)))]>;
+  def ORPDrm  : PDI<0x56, MRMSrcMem,
+                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                    "orpd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (or (bc_v2i64 (v2f64 VR128:$src1)),
+                       (memopv2i64 addr:$src2)))]>;
+  def XORPDrm : PDI<0x57, MRMSrcMem,
+                    (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                    "xorpd\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst,
+                      (xor (bc_v2i64 (v2f64 VR128:$src1)),
+                       (memopv2i64 addr:$src2)))]>;
+  def ANDNPDrr : PDI<0x55, MRMSrcReg,
+                     (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                     "andnpd\t{$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+                        (bc_v2i64 (v2f64 VR128:$src2))))]>;
+  def ANDNPDrm : PDI<0x55, MRMSrcMem,
+                     (outs VR128:$dst), (ins VR128:$src1,f128mem:$src2),
+                     "andnpd\t{$src2, $dst|$dst, $src2}",
+                     [(set VR128:$dst,
+                       (and (vnot (bc_v2i64 (v2f64 VR128:$src1))),
+                        (memopv2i64 addr:$src2)))]>;
+}
+
+// Packed-double compares.  $cc is an SSECC condition-code operand printed
+// into the mnemonic (cmp${cc}pd) and passed to the intrinsic as an
+// immediate.  Two-address: $src1 is both input and result register.
+let Constraints = "$src1 = $dst" in {
+  def CMPPDrri : PDIi8<0xC2, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, VR128:$src, SSECC:$cc),
+                    "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+                                                        VR128:$src, imm:$cc))]>;
+  def CMPPDrmi : PDIi8<0xC2, MRMSrcMem,
+                  (outs VR128:$dst), (ins VR128:$src1, f128mem:$src, SSECC:$cc),
+                  "cmp${cc}pd\t{$src, $dst|$dst, $src}",
+                  [(set VR128:$dst, (int_x86_sse2_cmp_pd VR128:$src1,
+                                                 (memop addr:$src), imm:$cc))]>;
+}
+// Also select the same instructions for the X86cmppd DAG node, whose result
+// is the v2i64 mask.
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
+          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
+def : Pat<(v2i64 (X86cmppd (v2f64 VR128:$src1), (memop addr:$src2), imm:$cc)),
+          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+// Shuffle and unpack instructions
+// shufp:$src3 / unpckh / unpckl are pattern fragments (defined elsewhere)
+// matching shuffle masks; for SHUFPD the matched mask is also bound to the
+// $src3 immediate.  AddedComplexity = 10 raises the unpck patterns'
+// priority during instruction selection over competing shuffle patterns.
+let Constraints = "$src1 = $dst" in {
+  def SHUFPDrri : PDIi8<0xC6, MRMSrcReg,
+                 (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                 "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                 [(set VR128:$dst,
+                   (v2f64 (shufp:$src3 VR128:$src1, VR128:$src2)))]>;
+  def SHUFPDrmi : PDIi8<0xC6, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1,
+                         f128mem:$src2, i8imm:$src3),
+                        "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                        [(set VR128:$dst,
+                          (v2f64 (shufp:$src3
+                                  VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+  let AddedComplexity = 10 in {
+    def UNPCKHPDrr : PDI<0x15, MRMSrcReg,
+                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                         "unpckhpd\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (unpckh VR128:$src1, VR128:$src2)))]>;
+    def UNPCKHPDrm : PDI<0x15, MRMSrcMem,
+                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                         "unpckhpd\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (unpckh VR128:$src1,
+                                          (memopv2f64 addr:$src2))))]>;
+
+    def UNPCKLPDrr : PDI<0x14, MRMSrcReg,
+                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                         "unpcklpd\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (v2f64 (unpckl VR128:$src1, VR128:$src2)))]>;
+    def UNPCKLPDrm : PDI<0x14, MRMSrcMem,
+                         (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                         "unpcklpd\t{$src2, $dst|$dst, $src2}",
+                         [(set VR128:$dst,
+                           (unpckl VR128:$src1, (memopv2f64 addr:$src2)))]>;
+  } // AddedComplexity
+} // Constraints = "$src1 = $dst"
+
+
+//===---------------------------------------------------------------------===//
+// SSE integer instructions
+
+// Move Instructions
+// The load/store patterns here are commented out, so mayLoad/mayStore and
+// canFoldAsLoad must be stated explicitly for the backend to reason about
+// these defs.  neverHasSideEffects marks the pattern-less reg-reg move as
+// freely removable/rematerializable by generic codegen.
+let neverHasSideEffects = 1 in
+def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}", []>;
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/]>;
+let mayStore = 1 in
+def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/]>;
+// Unaligned forms: same opcodes but with the XS prefix, which makes them
+// the movdqu encoding; gated on HasSSE2 since they use the plain I class.
+let canFoldAsLoad = 1, mayLoad = 1 in
+def MOVDQUrm :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(set VR128:$dst, (loadv2i64 addr:$src))*/]>,
+                 XS, Requires<[HasSSE2]>;
+let mayStore = 1 in
+def MOVDQUmr :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                   "movdqu\t{$src, $dst|$dst, $src}",
+                   [/*(store (v2i64 VR128:$src), addr:$dst)*/]>,
+                 XS, Requires<[HasSSE2]>;
+
+// Intrinsic forms of MOVDQU load and store
+def MOVDQUrm_Int :   I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "movdqu\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse2_loadu_dq addr:$src))]>,
+                 XS, Requires<[HasSSE2]>;
+def MOVDQUmr_Int :   I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                       "movdqu\t{$src, $dst|$dst, $src}",
+                       [(int_x86_sse2_storeu_dq addr:$dst, VR128:$src)]>,
+                     XS, Requires<[HasSSE2]>;
+
+// All of the two-operand integer multiclasses below are instantiated inside
+// the two-address constraint: $src1 is tied to $dst.
+let Constraints = "$src1 = $dst" in {
+
+/// PDI_binop_rm_int - SSE2 binary op selected via its intrinsic.  The rm
+/// form folds a 128-bit integer load, bitconverted to the intrinsic's
+/// element type.  Commutable applies to the rr form only.
+multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
+                            bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), 
+                               (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), 
+                               (ins VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                        (bitconvert (memopv2i64 
+                                                     addr:$src2))))]>;
+}
+
+/// PDI_binop_rmi_int - Shift-style op with three forms: rr/rm use opc with
+/// the shift-by-xmm-count intrinsic IntId; ri uses opc2 and the given ModRM
+/// form (ImmForm) with the shift-by-immediate intrinsic IntId2.
+multiclass PDI_binop_rmi_int<bits<8> opc, bits<8> opc2, Format ImmForm,
+                             string OpcodeStr,
+                             Intrinsic IntId, Intrinsic IntId2> {
+  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), 
+                               (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2))]>;
+  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+                               (ins VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId VR128:$src1,
+                                      (bitconvert (memopv2i64 addr:$src2))))]>;
+  def ri : PDIi8<opc2, ImmForm, (outs VR128:$dst), 
+                                (ins VR128:$src1, i32i8imm:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (IntId2 VR128:$src1, (i32 imm:$src2)))]>;
+}
+
+/// PDI_binop_rm - Simple SSE2 binary operator.
+multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                        ValueType OpVT, bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), 
+                               (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpVT (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), 
+                               (ins VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpVT (OpNode VR128:$src1,
+                                     (bitconvert (memopv2i64 addr:$src2)))))]>;
+}
+
+/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64.
+///
+/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew
+/// to collapse (bitconvert VT to VT) into its operand.
+///
+multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                              bit Commutable = 0> {
+  def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst),
+               (ins VR128:$src1, VR128:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]> {
+    let isCommutable = Commutable;
+  }
+  def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst),
+               (ins VR128:$src1, i128mem:$src2),
+               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+               [(set VR128:$dst, (OpNode VR128:$src1,
+               (memopv2i64 addr:$src2)))]>;
+}
+
+} // Constraints = "$src1 = $dst"
+
+// 128-bit Integer Arithmetic
+// A trailing 1 argument sets the Commutable bit on the rr form of the
+// corresponding multiclass (adds, multiplies, avg, min/max, psadbw);
+// subtracts omit it.
+
+defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, 1>;
+defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, 1>;
+defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, 1>;
+defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>;
+
+// Saturating adds (signed and unsigned) go through the intrinsics.
+defm PADDSB  : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, 1>;
+defm PADDSW  : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w, 1>;
+defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b, 1>;
+defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w, 1>;
+
+defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8>;
+defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16>;
+defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32>;
+defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>;
+
+// Saturating subtracts.
+defm PSUBSB  : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b>;
+defm PSUBSW  : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w>;
+defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b>;
+defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w>;
+
+defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, 1>;
+
+defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w, 1>;
+defm PMULHW  : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w , 1>;
+defm PMULUDQ : PDI_binop_rm_int<0xF4, "pmuludq", int_x86_sse2_pmulu_dq, 1>;
+
+defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd, 1>;
+
+defm PAVGB  : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b, 1>;
+defm PAVGW  : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w, 1>;
+
+
+defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b, 1>;
+defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w, 1>;
+defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b, 1>;
+defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w, 1>;
+defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw, 1>;
+
+
+// Element shifts.  Each defm produces rr/rm (shift count in an xmm reg or
+// memory) plus ri (shift by immediate, using the second opcode and the
+// given /r ModRM form) via PDI_binop_rmi_int.
+defm PSLLW : PDI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
+                               int_x86_sse2_psll_w, int_x86_sse2_pslli_w>;
+defm PSLLD : PDI_binop_rmi_int<0xF2, 0x72, MRM6r, "pslld",
+                               int_x86_sse2_psll_d, int_x86_sse2_pslli_d>;
+defm PSLLQ : PDI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
+                               int_x86_sse2_psll_q, int_x86_sse2_pslli_q>;
+
+defm PSRLW : PDI_binop_rmi_int<0xD1, 0x71, MRM2r, "psrlw",
+                               int_x86_sse2_psrl_w, int_x86_sse2_psrli_w>;
+defm PSRLD : PDI_binop_rmi_int<0xD2, 0x72, MRM2r, "psrld",
+                               int_x86_sse2_psrl_d, int_x86_sse2_psrli_d>;
+defm PSRLQ : PDI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
+                               int_x86_sse2_psrl_q, int_x86_sse2_psrli_q>;
+
+defm PSRAW : PDI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
+                               int_x86_sse2_psra_w, int_x86_sse2_psrai_w>;
+defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
+                               int_x86_sse2_psra_d, int_x86_sse2_psrai_d>;
+
+// 128-bit logical shifts.
+// Whole-register byte shifts; no patterns on the defs themselves (selected
+// only through the Pats below), hence neverHasSideEffects.
+let Constraints = "$src1 = $dst", neverHasSideEffects = 1 in {
+  def PSLLDQri : PDIi8<0x73, MRM7r,
+                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       "pslldq\t{$src2, $dst|$dst, $src2}", []>;
+  def PSRLDQri : PDIi8<0x73, MRM3r,
+                       (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                       "psrldq\t{$src2, $dst|$dst, $src2}", []>;
+  // PSRADQri doesn't exist in SSE[1-3].
+}
+
+let Predicates = [HasSSE2] in {
+  // NOTE(review): BYTE_imm is defined elsewhere in this file; from its use
+  // here it appears to convert the immediate to the byte count PSLLDQ/
+  // PSRLDQ encode, while the *_dq_bs ("byte shift") intrinsics pass the
+  // immediate through unchanged -- confirm against the BYTE_imm definition.
+  def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2),
+            (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
+  def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2),
+            (v2i64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
+  def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2),
+            (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>;
+  def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2),
+            (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>;
+  def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)),
+            (v2f64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>;
+
+  // Shift up / down and insert zero's.
+  def : Pat<(v2i64 (X86vshl  VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
+  def : Pat<(v2i64 (X86vshr  VR128:$src, (i8 imm:$amt))),
+            (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>;
+}
+
+// Logical
+defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>;
+defm POR  : PDI_binop_rm_v2i64<0xEB, "por" , or , 1>;
+defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>;
+
+// PANDN complements $src1 (vnot) before the AND, so it cannot reuse the
+// commutable multiclass above and is defined explicitly.
+let Constraints = "$src1 = $dst" in {
+  def PANDNrr : PDI<0xDF, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                    "pandn\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+                                              VR128:$src2)))]>;
+
+  def PANDNrm : PDI<0xDF, MRMSrcMem,
+                    (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                    "pandn\t{$src2, $dst|$dst, $src2}",
+                    [(set VR128:$dst, (v2i64 (and (vnot VR128:$src1),
+                                              (memopv2i64 addr:$src2))))]>;
+}
+
+// SSE2 Integer comparison
+// The defms select the compares through their intrinsics; the Pats below
+// additionally map the X86pcmpeq*/X86pcmpgt* DAG nodes onto the generated
+// rr/rm instructions.
+defm PCMPEQB  : PDI_binop_rm_int<0x74, "pcmpeqb", int_x86_sse2_pcmpeq_b>;
+defm PCMPEQW  : PDI_binop_rm_int<0x75, "pcmpeqw", int_x86_sse2_pcmpeq_w>;
+defm PCMPEQD  : PDI_binop_rm_int<0x76, "pcmpeqd", int_x86_sse2_pcmpeq_d>;
+defm PCMPGTB  : PDI_binop_rm_int<0x64, "pcmpgtb", int_x86_sse2_pcmpgt_b>;
+defm PCMPGTW  : PDI_binop_rm_int<0x65, "pcmpgtw", int_x86_sse2_pcmpgt_w>;
+defm PCMPGTD  : PDI_binop_rm_int<0x66, "pcmpgtd", int_x86_sse2_pcmpgt_d>;
+
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)),
+          (PCMPEQBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))),
+          (PCMPEQBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)),
+          (PCMPEQWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))),
+          (PCMPEQWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)),
+          (PCMPEQDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))),
+          (PCMPEQDrm VR128:$src1, addr:$src2)>;
+
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)),
+          (PCMPGTBrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))),
+          (PCMPGTBrm VR128:$src1, addr:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)),
+          (PCMPGTWrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))),
+          (PCMPGTWrm VR128:$src1, addr:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)),
+          (PCMPGTDrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))),
+          (PCMPGTDrm VR128:$src1, addr:$src2)>;
+
+
+// Pack instructions
+// Narrowing packs with signed/unsigned saturation; all via intrinsics.
+defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128>;
+defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128>;
+defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128>;
+
+// Shuffle and unpack instructions
+// PSHUF* read one source and a mask immediate; note there is no
+// $src1 = $dst constraint here (dst is written outright).  The second
+// shuffle operand is (undef) since only one vector is shuffled; the
+// pshuf*:$src2 binding captures the matched mask as the immediate.
+let AddedComplexity = 5 in {
+def PSHUFDri : PDIi8<0x70, MRMSrcReg,
+                     (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+                     "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set VR128:$dst, (v4i32 (pshufd:$src2
+                                               VR128:$src1, (undef))))]>;
+def PSHUFDmi : PDIi8<0x70, MRMSrcMem,
+                     (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+                     "pshufd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                     [(set VR128:$dst, (v4i32 (pshufd:$src2
+                                             (bc_v4i32 (memopv2i64 addr:$src1)),
+                                             (undef))))]>;
+}
+
+// SSE2 with ImmT == Imm8 and XS prefix.
+def PSHUFHWri : Ii8<0x70, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+                    "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (pshufhw:$src2 VR128:$src1,
+                                                            (undef))))]>,
+                XS, Requires<[HasSSE2]>;
+def PSHUFHWmi : Ii8<0x70, MRMSrcMem,
+                    (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+                    "pshufhw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (pshufhw:$src2
+                                            (bc_v8i16 (memopv2i64 addr:$src1)),
+                                            (undef))))]>,
+                XS, Requires<[HasSSE2]>;
+
+// SSE2 with ImmT == Imm8 and XD prefix.
+def PSHUFLWri : Ii8<0x70, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+                    "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (pshuflw:$src2 VR128:$src1,
+                                                            (undef))))]>,
+                XD, Requires<[HasSSE2]>;
+def PSHUFLWmi : Ii8<0x70, MRMSrcMem,
+                    (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+                    "pshuflw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set VR128:$dst, (v8i16 (pshuflw:$src2
+                                             (bc_v8i16 (memopv2i64 addr:$src1)),
+                                             (undef))))]>,
+                XD, Requires<[HasSSE2]>;
+
+
+// Integer interleaves (punpckl* = low halves, punpckh* = high halves) for
+// byte/word/dword/qword elements.  Two-address; the mem forms bitcast the
+// folded v2i64 load to the element type via bc_v16i8/bc_v8i16/bc_v4i32
+// (qword forms need no cast).
+let Constraints = "$src1 = $dst" in {
+  def PUNPCKLBWrr : PDI<0x60, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                        "punpcklbw\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (unpckl VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKLBWrm : PDI<0x60, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                        "punpcklbw\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (unpckl VR128:$src1,
+                                  (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+  def PUNPCKLWDrr : PDI<0x61, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                        "punpcklwd\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (unpckl VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKLWDrm : PDI<0x61, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                        "punpcklwd\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (unpckl VR128:$src1,
+                                  (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+  def PUNPCKLDQrr : PDI<0x62, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                        "punpckldq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (unpckl VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKLDQrm : PDI<0x62, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                        "punpckldq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (unpckl VR128:$src1,
+                                  (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+  def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg,
+                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                         "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (unpckl VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem,
+                         (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                         "punpcklqdq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (unpckl VR128:$src1,
+                                         (memopv2i64 addr:$src2))))]>;
+
+  def PUNPCKHBWrr : PDI<0x68, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                        "punpckhbw\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v16i8 (unpckh VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKHBWrm : PDI<0x68, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                        "punpckhbw\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (unpckh VR128:$src1,
+                                  (bc_v16i8 (memopv2i64 addr:$src2))))]>;
+  def PUNPCKHWDrr : PDI<0x69, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                        "punpckhwd\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v8i16 (unpckh VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKHWDrm : PDI<0x69, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                        "punpckhwd\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (unpckh VR128:$src1,
+                                  (bc_v8i16 (memopv2i64 addr:$src2))))]>;
+  def PUNPCKHDQrr : PDI<0x6A, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                        "punpckhdq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v4i32 (unpckh VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKHDQrm : PDI<0x6A, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                        "punpckhdq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (unpckh VR128:$src1,
+                                  (bc_v4i32 (memopv2i64 addr:$src2))))]>;
+  def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg,
+                         (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                         "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (unpckh VR128:$src1, VR128:$src2)))]>;
+  def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+                        "punpckhqdq\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst,
+                          (v2i64 (unpckh VR128:$src1,
+                                         (memopv2i64 addr:$src2))))]>;
+}
+
+// Extract / Insert
+// PEXTRW reads a word element into a GR32 (no tied operands); PINSRW
+// inserts a word from a GR32 or from a 16-bit memory location (extloadi16)
+// into the tied $src1/$dst vector at the immediate-selected position.
+def PEXTRWri : PDIi8<0xC5, MRMSrcReg,
+                    (outs GR32:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    "pextrw\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set GR32:$dst, (X86pextrw (v8i16 VR128:$src1),
+                                                imm:$src2))]>;
+let Constraints = "$src1 = $dst" in {
+  def PINSRWrri : PDIi8<0xC4, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1,
+                        GR32:$src2, i32i8imm:$src3),
+                       "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(set VR128:$dst,
+                         (X86pinsrw VR128:$src1, GR32:$src2, imm:$src3))]>;
+  def PINSRWrmi : PDIi8<0xC4, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1,
+                        i16mem:$src2, i32i8imm:$src3),
+                       "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                       [(set VR128:$dst,
+                         (X86pinsrw VR128:$src1, (extloadi16 addr:$src2),
+                                    imm:$src3))]>;
+}
+
+// Mask creation
+def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
+                     "pmovmskb\t{$src, $dst|$dst, $src}",
+                     [(set GR32:$dst, (int_x86_sse2_pmovmskb_128 VR128:$src))]>;
+
+// Conditional store
+// MASKMOVDQU's destination address is implicit in (E|R)DI -- declared via
+// Uses and passed to the intrinsic explicitly; two defs cover the 32- and
+// 64-bit pointer registers.
+let Uses = [EDI] in
+def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+                     "maskmovdqu\t{$mask, $src|$src, $mask}",
+                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)]>;
+
+let Uses = [RDI] in
+def MASKMOVDQU64 : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
+                     "maskmovdqu\t{$mask, $src|$src, $mask}",
+                     [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)]>;
+
+// Non-temporal stores
+// MOVNTPD stores packed doubles, so its memory operand is f128mem;
+// MOVNTDQ stores integer data, so its operand is i128mem.  (These two were
+// previously swapped -- the operand type only affects asm-operand
+// classification, not encoding, but it should match the data type, as it
+// does for every other def in this file.)
+def MOVNTPDmr : PDI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
+                    "movntpd\t{$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_pd addr:$dst, VR128:$src)]>;
+def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
+                    "movntdq\t{$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_dq addr:$dst, VR128:$src)]>;
+// MOVNTI is a TB-prefixed scalar store of a GR32.
+def MOVNTImr  :   I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+                    "movnti\t{$src, $dst|$dst, $src}",
+                    [(int_x86_sse2_movnt_i addr:$dst, GR32:$src)]>,
+                  TB, Requires<[HasSSE2]>;
+
+// Flush cache
+def CLFLUSH : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
+               "clflush\t$src", [(int_x86_sse2_clflush addr:$src)]>,
+              TB, Requires<[HasSSE2]>;
+
+// Load, store, and memory fence
+def LFENCE : I<0xAE, MRM5r, (outs), (ins),
+               "lfence", [(int_x86_sse2_lfence)]>, TB, Requires<[HasSSE2]>;
+def MFENCE : I<0xAE, MRM6r, (outs), (ins),
+               "mfence", [(int_x86_sse2_mfence)]>, TB, Requires<[HasSSE2]>;
+
+//TODO: custom lower this so as to never even generate the noop
+// Lower the generic membarrier node by its flag operands: the last flag
+// selects fence vs. noop; the specific (ls/sl/ss/ll) combinations below pick
+// SFENCE or LFENCE, anything else fenced gets the full MFENCE.
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+           (i8 0)), (NOOP)>;
+def : Pat<(membarrier (i8 0), (i8 0), (i8 0), (i8 1), (i8 1)), (SFENCE)>;
+def : Pat<(membarrier (i8 1), (i8 0), (i8 0), (i8 0), (i8 1)), (LFENCE)>;
+def : Pat<(membarrier (i8 imm:$ll), (i8 imm:$ls), (i8 imm:$sl), (i8 imm:$ss),
+           (i8 1)), (MFENCE)>;
+
+// Alias instruction that materializes an all-ones vector (immAllOnesV) as a
+// register compared-equal with itself (opcode 0x76 = pcmpeqd).
+// We set canFoldAsLoad because this can be converted to a constant-pool
+// load of an all-ones value if folding it would be beneficial.
+let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
+    isCodeGenOnly = 1 in
+  // FIXME: Change encoding to pseudo.
+  def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "",
+                         [(set VR128:$dst, (v4i32 immAllOnesV))]>;
+
+// FR64 to 128-bit vector conversion.
+let isAsCheapAsAMove = 1 in
+def MOVSD2PDrr : SDI<0x10, MRMSrcReg, (outs VR128:$dst), (ins FR64:$src),
+                      "movsd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2f64 (scalar_to_vector FR64:$src)))]>;
+def MOVSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                     "movsd\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2f64 (scalar_to_vector (loadf64 addr:$src))))]>;
+
+// GR32 / i32 load to 128-bit vector conversion (movd).
+def MOVDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector GR32:$src)))]>;
+def MOVDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v4i32 (scalar_to_vector (loadi32 addr:$src))))]>;
+
+// Bit-preserving moves of a 32-bit integer into an FR32 (movd).
+def MOVDI2SSrr  : PDI<0x6E, MRMSrcReg, (outs FR32:$dst), (ins GR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert GR32:$src))]>;
+
+def MOVDI2SSrm  : PDI<0x6E, MRMSrcMem, (outs FR32:$dst), (ins i32mem:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set FR32:$dst, (bitconvert (loadi32 addr:$src)))]>;
+
+// SSE2 instructions with XS prefix
+// movq loads/stores the low 64 bits of an XMM register.
+def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                    "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
+                  Requires<[HasSSE2]>;
+def MOVPQI2QImr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                      "movq\t{$src, $dst|$dst, $src}",
+                      [(store (i64 (vector_extract (v2i64 VR128:$src),
+                                    (iPTR 0))), addr:$dst)]>;
+
+// Extract the low element of a vector into a scalar register or memory.
+// FIXME: may not be able to eliminate this movsd via coalescing when the src
+// and dest register classes are different. We really want to write this
+// pattern like this (f64 analogue of the movss case above):
+// def : Pat<(f64 (vector_extract (v2f64 VR128:$src), (iPTR 0))),
+//           (f64 FR64:$src)>;
+let isAsCheapAsAMove = 1 in
+def MOVPD2SDrr : SDI<0x10, MRMSrcReg, (outs FR64:$dst), (ins VR128:$src),
+                     "movsd\t{$src, $dst|$dst, $src}",
+                     [(set FR64:$dst, (vector_extract (v2f64 VR128:$src),
+                                       (iPTR 0)))]>;
+def MOVPD2SDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
+                     "movsd\t{$src, $dst|$dst, $src}",
+                     [(store (f64 (vector_extract (v2f64 VR128:$src),
+                                   (iPTR 0))), addr:$dst)]>;
+def MOVPDI2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set GR32:$dst, (vector_extract (v4i32 VR128:$src),
+                                        (iPTR 0)))]>;
+def MOVPDI2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(store (i32 (vector_extract (v4i32 VR128:$src),
+                                     (iPTR 0))), addr:$dst)]>;
+
+// Bit-preserving moves of an FR32 out to a GR32 or to memory (movd).
+def MOVSS2DIrr  : PDI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(set GR32:$dst, (bitconvert FR32:$src))]>;
+def MOVSS2DImr  : PDI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32:$src),
+                      "movd\t{$src, $dst|$dst, $src}",
+                      [(store (i32 (bitconvert FR32:$src)), addr:$dst)]>;
+
+
+// Move to lower bits of a VR128, leaving upper bits alone.
+// Three operand (but two address) aliases.
+let Constraints = "$src1 = $dst" in {
+  let neverHasSideEffects = 1 in
+  def MOVLSD2PDrr : SDI<0x10, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, FR64:$src2),
+                        "movsd\t{$src2, $dst|$dst, $src2}", []>;
+
+  let AddedComplexity = 15 in
+    def MOVLPDrr : SDI<0x10, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "movsd\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst,
+                         (v2f64 (movl VR128:$src1, VR128:$src2)))]>;
+}
+
+// Store / copy lower 64-bits of a XMM register.
+def MOVLQ128mr : PDI<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(int_x86_sse2_storel_dq addr:$dst, VR128:$src)]>;
+
+// Move to lower bits of a VR128, zeroing the upper bits.
+// Loading from memory automatically zeroes the upper bits.
+let AddedComplexity = 20 in {
+def MOVZSD2PDrm : SDI<0x10, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                      "movsd\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,
+                        (v2f64 (X86vzmovl (v2f64 (scalar_to_vector
+                                                 (loadf64 addr:$src))))))]>;
+
+// Fold raw and bitcasted v2f64 loads into the zero-extending load as well.
+def : Pat<(v2f64 (X86vzmovl (loadv2f64 addr:$src))),
+            (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzmovl (bc_v2f64 (loadv4f32 addr:$src)))),
+            (MOVZSD2PDrm addr:$src)>;
+def : Pat<(v2f64 (X86vzload addr:$src)), (MOVZSD2PDrm addr:$src)>;
+}
+
+// movd / movq to XMM register zero-extends
+let AddedComplexity = 15 in {
+def MOVZDI2PDIrr : PDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR32:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (v4i32 (X86vzmovl
+                                      (v4i32 (scalar_to_vector GR32:$src)))))]>;
+// This is X86-64 only (REX.W-prefixed movd/movq from a GR64).
+def MOVZQI2PQIrr : RPDI<0x6E, MRMSrcReg, (outs VR128:$dst), (ins GR64:$src),
+                       "mov{d|q}\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (v2i64 (X86vzmovl
+                                      (v2i64 (scalar_to_vector GR64:$src)))))]>;
+}
+
+// Zero-extending loads, plus patterns folding bitcasted loads into them.
+let AddedComplexity = 20 in {
+def MOVZDI2PDIrm : PDI<0x6E, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                       "movd\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst,
+                         (v4i32 (X86vzmovl (v4i32 (scalar_to_vector
+                                                   (loadi32 addr:$src))))))]>;
+
+def : Pat<(v4i32 (X86vzmovl (loadv4i32 addr:$src))),
+            (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
+            (MOVZDI2PDIrm addr:$src)>;
+def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv2i64 addr:$src)))),
+            (MOVZDI2PDIrm addr:$src)>;
+
+def MOVZQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                     "movq\t{$src, $dst|$dst, $src}",
+                     [(set VR128:$dst,
+                       (v2i64 (X86vzmovl (v2i64 (scalar_to_vector
+                                                 (loadi64 addr:$src))))))]>, XS,
+                   Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (loadv2i64 addr:$src))),
+            (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4f32 addr:$src)))),
+            (MOVZQI2PQIrm addr:$src)>;
+def : Pat<(v2i64 (X86vzload addr:$src)), (MOVZQI2PQIrm addr:$src)>;
+}
+
+// Moving from XMM to XMM and clear upper 64 bits. Note: there is a bug in
+// the IA32 document; movq xmm1, xmm2 does clear the high bits.
+let AddedComplexity = 15 in
+def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))]>,
+                      XS, Requires<[HasSSE2]>;
+
+let AddedComplexity = 20 in {
+def MOVZPQILo2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                        "movq\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst, (v2i64 (X86vzmovl
+                                             (loadv2i64 addr:$src))))]>,
+                      XS, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (X86vzmovl (bc_v2i64 (loadv4i32 addr:$src)))),
+            (MOVZPQILo2PQIrm addr:$src)>;
+}
+
+// Instructions for the disassembler
+// xr = XMM register
+// xm = mem64
+
+// Register-to-register movq form with no pattern; disassembler-only.
+def MOVQxrxr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 "movq\t{$src, $dst|$dst, $src}", []>, XS;
+
+//===---------------------------------------------------------------------===//
+// SSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+// Move Instructions
+// movshdup replicates the odd-index float elements (shuffle <1,1,3,3>);
+// movsldup replicates the even-index ones (shuffle <0,0,2,2>).
+def MOVSHDUPrr : S3SI<0x16, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "movshdup\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (movshdup
+                                                VR128:$src, (undef))))]>;
+def MOVSHDUPrm : S3SI<0x16, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      "movshdup\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (movshdup
+                                         (memopv4f32 addr:$src), (undef)))]>;
+
+def MOVSLDUPrr : S3SI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "movsldup\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (v4f32 (movsldup
+                                                VR128:$src, (undef))))]>;
+def MOVSLDUPrm : S3SI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+                      "movsldup\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst, (movsldup
+                                         (memopv4f32 addr:$src), (undef)))]>;
+
+// movddup duplicates the low double into both lanes; the memory form only
+// needs a 64-bit load.
+def MOVDDUPrr  : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                      "movddup\t{$src, $dst|$dst, $src}",
+                      [(set VR128:$dst,(v2f64 (movddup VR128:$src, (undef))))]>;
+def MOVDDUPrm  : S3DI<0x12, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
+                      "movddup\t{$src, $dst|$dst, $src}",
+                    [(set VR128:$dst,
+                      (v2f64 (movddup (scalar_to_vector (loadf64 addr:$src)),
+                                      (undef))))]>;
+
+def : Pat<(movddup (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src)))),
+                   (undef)),
+          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// Fold the various full-vector load forms into the movddup load.
+let AddedComplexity = 5 in {
+def : Pat<(movddup (memopv2f64 addr:$src), (undef)),
+          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4f32 (memopv2f64 addr:$src)), (undef)),
+          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (memopv2i64 addr:$src), (undef)),
+          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+def : Pat<(movddup (bc_v4i32 (memopv2i64 addr:$src)), (undef)),
+          (MOVDDUPrm addr:$src)>, Requires<[HasSSE3]>;
+}
+
+// Arithmetic
+// addsubps/addsubpd: subtract in even-index lanes, add in odd-index lanes.
+let Constraints = "$src1 = $dst" in {
+  def ADDSUBPSrr : S3DI<0xD0, MRMSrcReg,
+                        (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                        "addsubps\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+                                           VR128:$src2))]>;
+  def ADDSUBPSrm : S3DI<0xD0, MRMSrcMem,
+                        (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                        "addsubps\t{$src2, $dst|$dst, $src2}",
+                        [(set VR128:$dst, (int_x86_sse3_addsub_ps VR128:$src1,
+                                           (memop addr:$src2)))]>;
+  def ADDSUBPDrr : S3I<0xD0, MRMSrcReg,
+                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+                       "addsubpd\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+                                          VR128:$src2))]>;
+  def ADDSUBPDrm : S3I<0xD0, MRMSrcMem,
+                       (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+                       "addsubpd\t{$src2, $dst|$dst, $src2}",
+                       [(set VR128:$dst, (int_x86_sse3_addsub_pd VR128:$src1,
+                                          (memop addr:$src2)))]>;
+}
+
+// lddqu: unaligned 128-bit integer load, optimized for cache-line splits.
+def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                   "lddqu\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>;
+
+// Horizontal ops
+// Helper classes: S3D_* are the single-precision (v4f32) forms, S3_* the
+// double-precision (v2f64) forms; *rr takes two registers, *rm folds a
+// 128-bit memory operand.
+class S3D_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3DI<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3D_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3DI<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+         !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+         [(set VR128:$dst, (v4f32 (IntId VR128:$src1, (memop addr:$src2))))]>;
+class S3_Intrr<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3I<o, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+        [(set VR128:$dst, (v2f64 (IntId VR128:$src1, VR128:$src2)))]>;
+class S3_Intrm<bits<8> o, string OpcodeStr, Intrinsic IntId>
+  : S3I<o, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2),
+        !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+      [(set VR128:$dst, (v2f64 (IntId VR128:$src1, (memopv2f64 addr:$src2))))]>;
+
+let Constraints = "$src1 = $dst" in {
+  def HADDPSrr : S3D_Intrr<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+  def HADDPSrm : S3D_Intrm<0x7C, "haddps", int_x86_sse3_hadd_ps>;
+  def HADDPDrr : S3_Intrr <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+  def HADDPDrm : S3_Intrm <0x7C, "haddpd", int_x86_sse3_hadd_pd>;
+  def HSUBPSrr : S3D_Intrr<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+  def HSUBPSrm : S3D_Intrm<0x7D, "hsubps", int_x86_sse3_hsub_ps>;
+  def HSUBPDrr : S3_Intrr <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+  def HSUBPDrm : S3_Intrm <0x7D, "hsubpd", int_x86_sse3_hsub_pd>;
+}
+
+// Thread synchronization
+// NOTE(review): MONITOR and MWAIT are both declared 0x01 with MRM1r, but the
+// architectural encodings differ in the low ModRM bits (0F 01 C8 vs 0F 01 C9)
+// — confirm the encoder/disassembler distinguishes them.
+def MONITOR : I<0x01, MRM1r, (outs), (ins), "monitor",
+                [(int_x86_sse3_monitor EAX, ECX, EDX)]>,TB, Requires<[HasSSE3]>;
+def MWAIT   : I<0x01, MRM1r, (outs), (ins), "mwait",
+                [(int_x86_sse3_mwait ECX, EAX)]>, TB, Requires<[HasSSE3]>;
+
+// Select the FP movshdup/movsldup for the equivalent integer shuffles.
+// vector_shuffle v1, <undef> <1, 1, 3, 3>
+let AddedComplexity = 15 in
+def : Pat<(v4i32 (movshdup VR128:$src, (undef))),
+          (MOVSHDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+def : Pat<(v4i32 (movshdup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+          (MOVSHDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+// vector_shuffle v1, <undef> <0, 0, 2, 2>
+let AddedComplexity = 15 in
+  def : Pat<(v4i32 (movsldup VR128:$src, (undef))),
+            (MOVSLDUPrr VR128:$src)>, Requires<[HasSSE3]>;
+let AddedComplexity = 20 in
+  def : Pat<(v4i32 (movsldup (bc_v4i32 (memopv2i64 addr:$src)), (undef))),
+            (MOVSLDUPrm addr:$src)>, Requires<[HasSSE3]>;
+
+//===---------------------------------------------------------------------===//
+// SSSE3 Instructions
+//===---------------------------------------------------------------------===//
+
+/// SS3I_unop_rm_int_8 - Simple SSSE3 unary operator whose type is v*i8.
+/// IntId64 is the MMX (VR64) intrinsic, IntId128 the SSE (VR128) one; the
+/// 128-bit forms carry the mandatory operand-size (0x66) prefix via OpSize.
+multiclass SS3I_unop_rm_int_8<bits<8> opc, string OpcodeStr,
+                              Intrinsic IntId64, Intrinsic IntId128> {
+  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst,
+                     (IntId64 (bitconvert (memopv8i8 addr:$src))))]>;
+
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
+                    OpSize;
+
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins i128mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (IntId128
+                       (bitconvert (memopv16i8 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_16 - Simple SSSE3 unary operator whose type is v*i16.
+multiclass SS3I_unop_rm_int_16<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId64, Intrinsic IntId128> {
+  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+                   (ins VR64:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+                   (ins i64mem:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst,
+                     (IntId64
+                      (bitconvert (memopv4i16 addr:$src))))]>;
+
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
+                    OpSize;
+
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins i128mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (IntId128
+                       (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+/// SS3I_unop_rm_int_32 - Simple SSSE3 unary operator whose type is v*i32.
+multiclass SS3I_unop_rm_int_32<bits<8> opc, string OpcodeStr,
+                               Intrinsic IntId64, Intrinsic IntId128> {
+  def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+                   (ins VR64:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst, (IntId64 VR64:$src))]>;
+
+  def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+                   (ins i64mem:$src),
+                   !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                   [(set VR64:$dst,
+                     (IntId64
+                      (bitconvert (memopv2i32 addr:$src))))]>;
+
+  def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))]>,
+                    OpSize;
+
+  def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins i128mem:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst,
+                      (IntId128
+                       (bitconvert (memopv4i32 addr:$src))))]>, OpSize;
+}
+
+// Packed absolute value, in 64-bit (MMX) and 128-bit (XMM) forms.
+defm PABSB       : SS3I_unop_rm_int_8 <0x1C, "pabsb",
+                                       int_x86_ssse3_pabs_b,
+                                       int_x86_ssse3_pabs_b_128>;
+defm PABSW       : SS3I_unop_rm_int_16<0x1D, "pabsw",
+                                       int_x86_ssse3_pabs_w,
+                                       int_x86_ssse3_pabs_w_128>;
+defm PABSD       : SS3I_unop_rm_int_32<0x1E, "pabsd",
+                                       int_x86_ssse3_pabs_d,
+                                       int_x86_ssse3_pabs_d_128>;
+
+/// SS3I_binop_rm_int_8 - Simple SSSE3 binary operator whose type is v*i8.
+/// Commutable = 1 marks the register-register forms as commutable so the
+/// two-address pass may swap operands.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS3I_binop_rm_int_8<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId64, Intrinsic IntId128,
+                                 bit Commutable = 0> {
+    def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+                     (ins VR64:$src1, VR64:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+                     (ins VR64:$src1, i64mem:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR64:$dst,
+                       (IntId64 VR64:$src1,
+                        (bitconvert (memopv8i8 addr:$src2))))]>;
+
+    def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+                      OpSize {
+      let isCommutable = Commutable;
+    }
+    def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, i128mem:$src2),
+                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                      [(set VR128:$dst,
+                        (IntId128 VR128:$src1,
+                         (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+  }
+}
+
+/// SS3I_binop_rm_int_16 - Simple SSSE3 binary operator whose type is v*i16.
+/// Commutable = 1 marks the register-register forms as commutable.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS3I_binop_rm_int_16<bits<8> opc, string OpcodeStr,
+                                  Intrinsic IntId64, Intrinsic IntId128,
+                                  bit Commutable = 0> {
+    def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+                     (ins VR64:$src1, VR64:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+                     (ins VR64:$src1, i64mem:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR64:$dst,
+                       (IntId64 VR64:$src1,
+                        (bitconvert (memopv4i16 addr:$src2))))]>;
+
+    def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+                      OpSize {
+      let isCommutable = Commutable;
+    }
+    def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, i128mem:$src2),
+                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                      [(set VR128:$dst,
+                        (IntId128 VR128:$src1,
+                         (bitconvert (memopv8i16 addr:$src2))))]>, OpSize;
+  }
+}
+
+/// SS3I_binop_rm_int_32 - Simple SSSE3 binary operator whose type is v*i32.
+/// Commutable = 1 marks the register-register forms as commutable.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS3I_binop_rm_int_32<bits<8> opc, string OpcodeStr,
+                                  Intrinsic IntId64, Intrinsic IntId128,
+                                  bit Commutable = 0> {
+    def rr64 : SS38I<opc, MRMSrcReg, (outs VR64:$dst),
+                     (ins VR64:$src1, VR64:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR64:$dst, (IntId64 VR64:$src1, VR64:$src2))]> {
+      let isCommutable = Commutable;
+    }
+    def rm64 : SS38I<opc, MRMSrcMem, (outs VR64:$dst),
+                     (ins VR64:$src1, i64mem:$src2),
+                     !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                     [(set VR64:$dst,
+                       (IntId64 VR64:$src1,
+                        (bitconvert (memopv2i32 addr:$src2))))]>;
+
+    def rr128 : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+                      OpSize {
+      let isCommutable = Commutable;
+    }
+    def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
+                      (ins VR128:$src1, i128mem:$src2),
+                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                      [(set VR128:$dst,
+                        (IntId128 VR128:$src1,
+                         (bitconvert (memopv4i32 addr:$src2))))]>, OpSize;
+  }
+}
+
+// SSSE3 horizontal add/sub, multiply-add, multiply-round, shuffle and sign
+// ops, each in 64-bit (MMX) and 128-bit (XMM) forms. Only pmulhrsw is
+// commutable (trailing 1).
+defm PHADDW      : SS3I_binop_rm_int_16<0x01, "phaddw",
+                                        int_x86_ssse3_phadd_w,
+                                        int_x86_ssse3_phadd_w_128>;
+defm PHADDD      : SS3I_binop_rm_int_32<0x02, "phaddd",
+                                        int_x86_ssse3_phadd_d,
+                                        int_x86_ssse3_phadd_d_128>;
+defm PHADDSW     : SS3I_binop_rm_int_16<0x03, "phaddsw",
+                                        int_x86_ssse3_phadd_sw,
+                                        int_x86_ssse3_phadd_sw_128>;
+defm PHSUBW      : SS3I_binop_rm_int_16<0x05, "phsubw",
+                                        int_x86_ssse3_phsub_w,
+                                        int_x86_ssse3_phsub_w_128>;
+defm PHSUBD      : SS3I_binop_rm_int_32<0x06, "phsubd",
+                                        int_x86_ssse3_phsub_d,
+                                        int_x86_ssse3_phsub_d_128>;
+defm PHSUBSW     : SS3I_binop_rm_int_16<0x07, "phsubsw",
+                                        int_x86_ssse3_phsub_sw,
+                                        int_x86_ssse3_phsub_sw_128>;
+defm PMADDUBSW   : SS3I_binop_rm_int_8 <0x04, "pmaddubsw",
+                                        int_x86_ssse3_pmadd_ub_sw,
+                                        int_x86_ssse3_pmadd_ub_sw_128>;
+defm PMULHRSW    : SS3I_binop_rm_int_16<0x0B, "pmulhrsw",
+                                        int_x86_ssse3_pmul_hr_sw,
+                                        int_x86_ssse3_pmul_hr_sw_128, 1>;
+defm PSHUFB      : SS3I_binop_rm_int_8 <0x00, "pshufb",
+                                        int_x86_ssse3_pshuf_b,
+                                        int_x86_ssse3_pshuf_b_128>;
+defm PSIGNB      : SS3I_binop_rm_int_8 <0x08, "psignb",
+                                        int_x86_ssse3_psign_b,
+                                        int_x86_ssse3_psign_b_128>;
+defm PSIGNW      : SS3I_binop_rm_int_16<0x09, "psignw",
+                                        int_x86_ssse3_psign_w,
+                                        int_x86_ssse3_psign_w_128>;
+defm PSIGND      : SS3I_binop_rm_int_32<0x0A, "psignd",
+                                        int_x86_ssse3_psign_d,
+                                        int_x86_ssse3_psign_d_128>;
+
+// palignr: concatenate the two sources and extract a byte-aligned window.
+// The defs carry no patterns; selection happens through the explicit
+// "palignr patterns" Pat<> records that follow in this file.
+let Constraints = "$src1 = $dst" in {
+  def PALIGNR64rr  : SS3AI<0x0F, MRMSrcReg, (outs VR64:$dst),
+                           (ins VR64:$src1, VR64:$src2, i8imm:$src3),
+                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                           []>;
+  def PALIGNR64rm  : SS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
+                           (ins VR64:$src1, i64mem:$src2, i8imm:$src3),
+                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                           []>;
+
+  def PALIGNR128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
+                           (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                           []>, OpSize;
+  def PALIGNR128rm : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
+                           (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+                           "palignr\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+                           []>, OpSize;
+}
+
+// palignr patterns.
+def : Pat<(int_x86_ssse3_palign_r VR64:$src1, VR64:$src2, (i8 imm:$src3)),
+          (PALIGNR64rr VR64:$src1, VR64:$src2, (BYTE_imm imm:$src3))>,
+          Requires<[HasSSSE3]>;
+def : Pat<(int_x86_ssse3_palign_r VR64:$src1,
+                                      (memop64 addr:$src2),
+                                      (i8 imm:$src3)),
+          (PALIGNR64rm VR64:$src1, addr:$src2, (BYTE_imm imm:$src3))>,
+          Requires<[HasSSSE3]>;
+
+def : Pat<(int_x86_ssse3_palign_r_128 VR128:$src1, VR128:$src2, (i8 imm:$src3)),
+          (PALIGNR128rr VR128:$src1, VR128:$src2, (BYTE_imm imm:$src3))>,
+          Requires<[HasSSSE3]>;
+def : Pat<(int_x86_ssse3_palign_r_128 VR128:$src1,
+                                      (memopv2i64 addr:$src2),
+                                      (i8 imm:$src3)),
+          (PALIGNR128rm VR128:$src1, addr:$src2, (BYTE_imm imm:$src3))>,
+          Requires<[HasSSSE3]>;
+
+// Select palignr for the recognized "palign" shuffles. The shuffle node's
+// operands are deliberately swapped ($src2 first) to match palignr's
+// concatenation order; SHUFFLE_get_palign_imm derives the byte shift from
+// the shuffle mask.
+let AddedComplexity = 5 in {
+def : Pat<(v4i32 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v4f32 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v8i16 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+def : Pat<(v16i8 (palign:$src3 VR128:$src1, VR128:$src2)),
+          (PALIGNR128rr VR128:$src2, VR128:$src1,
+                        (SHUFFLE_get_palign_imm VR128:$src3))>,
+      Requires<[HasSSSE3]>;
+}
+
+// Select pshufb for the X86pshufb byte-shuffle node.
+def : Pat<(X86pshufb VR128:$src, VR128:$mask),
+          (PSHUFBrr128 VR128:$src, VR128:$mask)>, Requires<[HasSSSE3]>;
+def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))),
+          (PSHUFBrm128 VR128:$src, addr:$mask)>, Requires<[HasSSSE3]>;
+
+//===---------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===---------------------------------------------------------------------===//
+
+// extload f32 -> f64.  This matches load+fextend because we have a hack in
+// the isel (PreprocessForFPConvert) that can introduce loads after dag
+// combine.
+// Since these loads aren't folded into the fextend, we have to match it
+// explicitly here.
+let Predicates = [HasSSE2] in
+ def : Pat<(fextend (loadf32 addr:$src)),
+           (CVTSS2SDrm addr:$src)>;
+
+// bit_convert
+// All bitcasts between 128-bit vector types are no-ops: the same XMM
+// register is simply reinterpreted, so no instruction is emitted.
+let Predicates = [HasSSE2] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+}
+
+// Move scalar to XMM zero-extended
+// movd to XMM register zero-extends
+// AddedComplexity = 15 makes these preferred over generic X86vzmovl
+// selections when they apply.
+let AddedComplexity = 15 in {
+// Zeroing a VR128 then do a MOVS{S|D} to the lower bits.
+def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
+          (MOVLSD2PDrr (V_SET0), FR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
+          (MOVLSS2PSrr (V_SET0), FR32:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
+          (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
+          (MOVLPSrr (V_SET0), VR128:$src)>, Requires<[HasSSE1]>;
+}
+
+// Splat v2f64 / v2i64
+// Splatting a 2-element vector is an unpack of the register with itself:
+// low-half splat uses UNPCKL*, high-half splat uses UNPCKH*.
+let AddedComplexity = 10 in {
+def : Pat<(splat_lo (v2f64 VR128:$src), (undef)),
+          (UNPCKLPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2f64 VR128:$src), (undef)),
+          (UNPCKHPDrr VR128:$src, VR128:$src)>,   Requires<[HasSSE2]>;
+def : Pat<(splat_lo (v2i64 VR128:$src), (undef)),
+          (PUNPCKLQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(unpckh (v2i64 VR128:$src), (undef)),
+          (PUNPCKHQDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Special unary shuffle selections.  A pshufd node with an undef second
+// operand can be matched either to SHUFP* (reusing $src1 for both inputs)
+// or, on SSE2, to the single-input PSHUFD; the :$src3 / :$src2 bindings let
+// SHUFFLE_get_shuf_imm recover the immediate from the shuffle mask.
+// Special unary SHUFPSrri case.
+def : Pat<(v4f32 (pshufd:$src3 VR128:$src1, (undef))),
+          (SHUFPSrri VR128:$src1, VR128:$src1,
+                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
+      Requires<[HasSSE1]>;
+let AddedComplexity = 5 in
+def : Pat<(v4f32 (pshufd:$src2 VR128:$src1, (undef))),
+          (PSHUFDri VR128:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+      Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case.
+def : Pat<(v2i64 (pshufd:$src3 VR128:$src1, (undef))),
+          (SHUFPDrri VR128:$src1, VR128:$src1,
+                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
+      Requires<[HasSSE2]>;
+// Special unary SHUFPDrri case.
+def : Pat<(v2f64 (pshufd:$src3 VR128:$src1, (undef))),
+          (SHUFPDrri VR128:$src1, VR128:$src1,
+                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
+      Requires<[HasSSE2]>;
+// Unary v4f32 shuffle with PSHUF* in order to fold a load.
+def : Pat<(pshufd:$src2 (bc_v4i32 (memopv4f32 addr:$src1)), (undef)),
+          (PSHUFDmi addr:$src1, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+      Requires<[HasSSE2]>;
+
+// Special binary v4i32 shuffle cases with SHUFPS.
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (v4i32 VR128:$src2))),
+          (SHUFPSrri VR128:$src1, VR128:$src2,
+                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
+           Requires<[HasSSE2]>;
+def : Pat<(v4i32 (shufp:$src3 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)))),
+          (SHUFPSrmi VR128:$src1, addr:$src2,
+                    (SHUFFLE_get_shuf_imm VR128:$src3))>,
+           Requires<[HasSSE2]>;
+// Special binary v2i64 shuffle cases using SHUFPDrri.
+def : Pat<(v2i64 (shufp:$src3 VR128:$src1, VR128:$src2)),
+          (SHUFPDrri VR128:$src1, VR128:$src2,
+                     (SHUFFLE_get_shuf_imm VR128:$src3))>,
+          Requires<[HasSSE2]>;
+
+// vector_shuffle v1, <undef>, <0, 0, 1, 1, ...>
+// When optimizing for speed (OptForSpeed), prefer PSHUFD (single input, no
+// self-unpack) for the 32-bit-element cases; these carry the higher
+// AddedComplexity so they win over the generic PUNPCK*/UNPCK* forms below.
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckl_undef:$src2 VR128:$src, (undef))),
+          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+          Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckl_undef:$src2 VR128:$src, (undef))),
+          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+          Requires<[OptForSpeed, HasSSE2]>;
+}
+// Fallback: unpack the register with itself.
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckl_undef VR128:$src, (undef))),
+          (UNPCKLPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v16i8 (unpckl_undef VR128:$src, (undef))),
+          (PUNPCKLBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (unpckl_undef VR128:$src, (undef))),
+          (PUNPCKLWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (unpckl_undef VR128:$src, (undef))),
+          (PUNPCKLDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// vector_shuffle v1, <undef>, <2, 2, 3, 3, ...>
+// Same structure as above for the high-half unpack.
+let AddedComplexity = 15 in {
+def : Pat<(v4i32 (unpckh_undef:$src2 VR128:$src, (undef))),
+          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+          Requires<[OptForSpeed, HasSSE2]>;
+def : Pat<(v4f32 (unpckh_undef:$src2 VR128:$src, (undef))),
+          (PSHUFDri VR128:$src, (SHUFFLE_get_shuf_imm VR128:$src2))>,
+          Requires<[OptForSpeed, HasSSE2]>;
+}
+let AddedComplexity = 10 in {
+def : Pat<(v4f32 (unpckh_undef VR128:$src, (undef))),
+          (UNPCKHPSrr VR128:$src, VR128:$src)>, Requires<[HasSSE1]>;
+def : Pat<(v16i8 (unpckh_undef VR128:$src, (undef))),
+          (PUNPCKHBWrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v8i16 (unpckh_undef VR128:$src, (undef))),
+          (PUNPCKHWDrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (unpckh_undef VR128:$src, (undef))),
+          (PUNPCKHDQrr VR128:$src, VR128:$src)>, Requires<[HasSSE2]>;
+}
+
+// Half-register move shuffles (MOVLHPS / MOVHLPS / MOVLPS / MOVLPD).
+let AddedComplexity = 20 in {
+// vector_shuffle v1, v2 <0, 1, 4, 5> using MOVLHPS
+def : Pat<(v4i32 (movlhps VR128:$src1, VR128:$src2)),
+          (MOVLHPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, v2 <6, 7, 2, 3> using MOVHLPS
+def : Pat<(v4i32 (movhlps VR128:$src1, VR128:$src2)),
+          (MOVHLPSrr VR128:$src1, VR128:$src2)>;
+
+// vector_shuffle v1, undef <2, ?, ?, ?> using MOVHLPS
+def : Pat<(v4f32 (movhlps_undef VR128:$src1, (undef))),
+          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+def : Pat<(v4i32 (movhlps_undef VR128:$src1, (undef))),
+          (MOVHLPSrr VR128:$src1, VR128:$src1)>;
+}
+
+let AddedComplexity = 20 in {
+// vector_shuffle v1, (load v2) <4, 5, 2, 3> using MOVLPS
+// Integer-element cases need SSE2; the f32/i32 forms reuse MOVLPS, the
+// f64/i64 forms reuse MOVLPD.
+def : Pat<(v4f32 (movlp VR128:$src1, (load addr:$src2))),
+          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(v2f64 (movlp VR128:$src1, (load addr:$src2))),
+          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (movlp VR128:$src1, (load addr:$src2))),
+          (MOVLPSrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (movlp VR128:$src1, (load addr:$src2))),
+          (MOVLPDrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+}
+
+// (store (vector_shuffle (load addr), v2, <4, 5, 2, 3>), addr) using MOVLPS
+def : Pat<(store (v4f32 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+          (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2f64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+          (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 (movlp (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
+                 addr:$src1),
+          (MOVLPSmr addr:$src1, VR128:$src2)>, Requires<[HasSSE1]>;
+def : Pat<(store (v2i64 (movlp (load addr:$src1), VR128:$src2)), addr:$src1),
+          (MOVLPDmr addr:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+let AddedComplexity = 15 in {
+// Setting the lowest element in the vector.
+def : Pat<(v4i32 (movl VR128:$src1, VR128:$src2)),
+          (MOVLPSrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (movl VR128:$src1, VR128:$src2)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using MOVLPDrr (movsd)
+def : Pat<(v4f32 (movlp VR128:$src1, VR128:$src2)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (movlp VR128:$src1, VR128:$src2)),
+          (MOVLPDrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+}
+
+// vector_shuffle v1, v2 <4, 5, 2, 3> using SHUFPSrri (we prefer movsd, but
+// fall back to this for SSE1)
+def : Pat<(v4f32 (movlp:$src3 VR128:$src1, (v4f32 VR128:$src2))),
+          (SHUFPSrri VR128:$src2, VR128:$src1,
+                     (SHUFFLE_get_shuf_imm VR128:$src3))>, Requires<[HasSSE1]>;
+
+// Set lowest element and zero upper elements.
+// NOTE: the `let AddedComplexity = 15 in` applies only to the first def.
+let AddedComplexity = 15 in
+def : Pat<(v2f64 (movl immAllZerosV_bc, VR128:$src)),
+          (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (X86vzmovl (v2f64 VR128:$src))),
+          (MOVZPQILo2PQIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Some special case pandn patterns.
+// (and (xor x, all-ones), y) is PANDN.  One pattern per bitcast flavor of
+// the all-ones immediate (v4i32/v8i16/v16i8), register and folded-load forms.
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  VR128:$src2)),
+          (PANDNrr VR128:$src1, VR128:$src2)>, Requires<[HasSSE2]>;
+
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v4i32 immAllOnesV))),
+                  (memop addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v8i16 immAllOnesV))),
+                  (memop addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+def : Pat<(v2i64 (and (xor VR128:$src1, (bc_v2i64 (v16i8 immAllOnesV))),
+                  (memop addr:$src2))),
+          (PANDNrm VR128:$src1, addr:$src2)>, Requires<[HasSSE2]>;
+
+// vector -> vector casts
+// NOTE(review): the v2i32/VR64 patterns involve MMX registers
+// (Int_CVTPI2PDrr / Int_CVTTPD2PIrr) — verify against the MMX .td defs.
+def : Pat<(v4f32 (sint_to_fp (v4i32 VR128:$src))),
+          (Int_CVTDQ2PSrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v4i32 (fp_to_sint (v4f32 VR128:$src))),
+          (Int_CVTTPS2DQrr VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2f64 (sint_to_fp (v2i32 VR64:$src))),
+          (Int_CVTPI2PDrr VR64:$src)>, Requires<[HasSSE2]>;
+def : Pat<(v2i32 (fp_to_sint (v2f64 VR128:$src))),
+          (Int_CVTTPD2PIrr VR128:$src)>, Requires<[HasSSE2]>;
+
+// Use movaps / movups for SSE integer load / store (one byte shorter).
+def : Pat<(alignedloadv4i32 addr:$src),
+          (MOVAPSrm addr:$src)>, Requires<[HasSSE1]>;
+def : Pat<(loadv4i32 addr:$src),
+          (MOVUPSrm addr:$src)>, Requires<[HasSSE1]>;
+def : Pat<(alignedloadv2i64 addr:$src),
+          (MOVAPSrm addr:$src)>, Requires<[HasSSE2]>;
+def : Pat<(loadv2i64 addr:$src),
+          (MOVUPSrm addr:$src)>, Requires<[HasSSE2]>;
+
+// Aligned stores use MOVAPS, unaligned stores use MOVUPS, for every 128-bit
+// integer element type.
+def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
+          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+          (MOVAPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v2i64 VR128:$src), addr:$dst),
+          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+          (MOVUPSmr addr:$dst, VR128:$src)>, Requires<[HasSSE2]>;
+
+//===----------------------------------------------------------------------===//
+// SSE4.1 Instructions
+//===----------------------------------------------------------------------===//
+
+// Packed FP rounding (intrinsic forms): defines <Name>PSr_Int/PSm_Int and
+// <Name>PDr_Int/PDm_Int for register and memory sources, taking the rounding
+// mode as an 8-bit immediate ($src2).
+multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd,
+                            string OpcodeStr,
+                            Intrinsic V4F32Int,
+                            Intrinsic V2F64Int> {
+  // Intrinsic operation, reg.
+  // Vector intrinsic operation, reg
+  def PSr_Int : SS4AIi8<opcps, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (V4F32Int VR128:$src1, imm:$src2))]>,
+                    OpSize;
+
+  // Vector intrinsic operation, mem
+  // NOTE(review): unlike its siblings this def uses the raw Ii8 class with
+  // explicit TA + Requires<[HasSSE41]> rather than SS4AIi8 — confirm the
+  // encodings end up identical.
+  def PSm_Int : Ii8<opcps, MRMSrcMem,
+                    (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst,
+                          (V4F32Int (memopv4f32 addr:$src1),imm:$src2))]>,
+                    TA, OpSize,
+                Requires<[HasSSE41]>;
+
+  // Vector intrinsic operation, reg
+  def PDr_Int : SS4AIi8<opcpd, MRMSrcReg,
+                    (outs VR128:$dst), (ins VR128:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst, (V2F64Int VR128:$src1, imm:$src2))]>,
+                    OpSize;
+
+  // Vector intrinsic operation, mem
+  def PDm_Int : SS4AIi8<opcpd, MRMSrcMem,
+                    (outs VR128:$dst), (ins f128mem:$src1, i32i8imm:$src2),
+                    !strconcat(OpcodeStr,
+                    "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                    [(set VR128:$dst,
+                          (V2F64Int (memopv2f64 addr:$src1),imm:$src2))]>,
+                    OpSize;
+}
+
+// Scalar FP rounding (intrinsic forms): defines <Name>SSr_Int/SSm_Int and
+// <Name>SDr_Int/SDm_Int.  $src1 is tied to $dst (upper elements pass
+// through); the rounding mode is the 8-bit immediate $src3.
+let Constraints = "$src1 = $dst" in {
+multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
+                            string OpcodeStr,
+                            Intrinsic F32Int,
+                            Intrinsic F64Int> {
+  // Intrinsic operation, reg.
+  def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
+                    (outs VR128:$dst),
+                                 (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                    "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    [(set VR128:$dst,
+                            (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+                    OpSize;
+
+  // Intrinsic operation, mem.
+  def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
+                    (outs VR128:$dst),
+                                (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                    "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    [(set VR128:$dst,
+                         (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+                    OpSize;
+
+  // Intrinsic operation, reg.
+  def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
+                    (outs VR128:$dst),
+                            (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                    "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    [(set VR128:$dst,
+                            (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+                    OpSize;
+
+  // Intrinsic operation, mem.
+  def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
+                    (outs VR128:$dst),
+                            (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                    "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    [(set VR128:$dst,
+                        (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+                    OpSize;
+}
+}
+
+// FP round - roundss, roundps, roundsd, roundpd
+// Both defms share the "ROUND" prefix; the multiclasses produce distinct
+// suffixes (PSr_Int/... vs SSr_Int/...), so the concrete defs do not clash.
+defm ROUND  : sse41_fp_unop_rm<0x08, 0x09, "round",
+                               int_x86_sse41_round_ps, int_x86_sse41_round_pd>;
+defm ROUND  : sse41_fp_binop_rm<0x0A, 0x0B, "round",
+                               int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+
+// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
+multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId128> {
+  def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src),
+                    !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                    [(set VR128:$dst, (IntId128 VR128:$src))]>, OpSize;
+  def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                     (ins i128mem:$src),
+                     !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                     [(set VR128:$dst,
+                       (IntId128
+                       (bitconvert (memopv8i16 addr:$src))))]>, OpSize;
+}
+
+defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
+                                         int_x86_sse41_phminposuw>;
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+/// Defines <Name>rr and <Name>rm taking the second operand from register or
+/// memory; $src1 is tied to $dst.  Commutable sets isCommutable so the
+/// allocator/selector may swap operands of the rr form.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS41I_binop_rm_int<bits<8> opc, string OpcodeStr,
+                                Intrinsic IntId128, bit Commutable = 0> {
+    def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2),
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+                   OpSize {
+      let isCommutable = Commutable;
+    }
+    def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, i128mem:$src2),
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst,
+                     (IntId128 VR128:$src1,
+                      (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+  }
+}
+
+defm PCMPEQQ      : SS41I_binop_rm_int<0x29, "pcmpeqq",
+                                       int_x86_sse41_pcmpeqq, 1>;
+defm PACKUSDW     : SS41I_binop_rm_int<0x2B, "packusdw",
+                                       int_x86_sse41_packusdw, 0>;
+defm PMINSB       : SS41I_binop_rm_int<0x38, "pminsb",
+                                       int_x86_sse41_pminsb, 1>;
+defm PMINSD       : SS41I_binop_rm_int<0x39, "pminsd",
+                                       int_x86_sse41_pminsd, 1>;
+defm PMINUD       : SS41I_binop_rm_int<0x3B, "pminud",
+                                       int_x86_sse41_pminud, 1>;
+defm PMINUW       : SS41I_binop_rm_int<0x3A, "pminuw",
+                                       int_x86_sse41_pminuw, 1>;
+defm PMAXSB       : SS41I_binop_rm_int<0x3C, "pmaxsb",
+                                       int_x86_sse41_pmaxsb, 1>;
+defm PMAXSD       : SS41I_binop_rm_int<0x3D, "pmaxsd",
+                                       int_x86_sse41_pmaxsd, 1>;
+defm PMAXUD       : SS41I_binop_rm_int<0x3F, "pmaxud",
+                                       int_x86_sse41_pmaxud, 1>;
+defm PMAXUW       : SS41I_binop_rm_int<0x3E, "pmaxuw",
+                                       int_x86_sse41_pmaxuw, 1>;
+
+defm PMULDQ       : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq, 1>;
+
+// Select the X86pcmpeqq DAG node (as well as the intrinsic handled above).
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)),
+          (PCMPEQQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))),
+          (PCMPEQQrm VR128:$src1, addr:$src2)>;
+
+/// SS41I_binop_rm_int - Simple SSE 4.1 binary operator
+/// Variant that matches both a generic SDNode (rr/rm) and the intrinsic
+/// (rr_int/rm_int) to the same opcode, so e.g. a plain `mul` and the
+/// pmulld intrinsic both select PMULLD.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS41I_binop_patint<bits<8> opc, string OpcodeStr, ValueType OpVT,
+                                SDNode OpNode, Intrinsic IntId128,
+                                bit Commutable = 0> {
+    def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2),
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst, (OpNode (OpVT VR128:$src1),
+                                                   VR128:$src2))]>, OpSize {
+      let isCommutable = Commutable;
+    }
+    def rr_int : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                      (ins VR128:$src1, VR128:$src2),
+                      !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                      [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+                      OpSize {
+      let isCommutable = Commutable;
+    }
+    def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, i128mem:$src2),
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst,
+                     (OpNode VR128:$src1, (memop addr:$src2)))]>, OpSize;
+    def rm_int : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                       (ins VR128:$src1, i128mem:$src2),
+                       !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                       [(set VR128:$dst,
+                        (IntId128 VR128:$src1, (memop addr:$src2)))]>,
+                       OpSize;
+  }
+}
+defm PMULLD       : SS41I_binop_patint<0x40, "pmulld", v4i32, mul,
+                                       int_x86_sse41_pmulld, 1>;
+
+/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
+/// Defines <Name>rri and <Name>rmi; the immediate ($src3) is a blend mask /
+/// selector byte, and $src1 is tied to $dst.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
+                                 Intrinsic IntId128, bit Commutable = 0> {
+    def rri : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    [(set VR128:$dst,
+                      (IntId128 VR128:$src1, VR128:$src2, imm:$src3))]>,
+                    OpSize {
+      let isCommutable = Commutable;
+    }
+    def rmi : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src2, i32i8imm:$src3),
+                    !strconcat(OpcodeStr,
+                     "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                    [(set VR128:$dst,
+                      (IntId128 VR128:$src1,
+                       (bitconvert (memopv16i8 addr:$src2)), imm:$src3))]>,
+                    OpSize;
+  }
+}
+
+defm BLENDPS      : SS41I_binop_rmi_int<0x0C, "blendps",
+                                        int_x86_sse41_blendps, 0>;
+defm BLENDPD      : SS41I_binop_rmi_int<0x0D, "blendpd",
+                                        int_x86_sse41_blendpd, 0>;
+defm PBLENDW      : SS41I_binop_rmi_int<0x0E, "pblendw",
+                                        int_x86_sse41_pblendw, 0>;
+defm DPPS         : SS41I_binop_rmi_int<0x40, "dpps",
+                                        int_x86_sse41_dpps, 1>;
+defm DPPD         : SS41I_binop_rmi_int<0x41, "dppd",
+                                        int_x86_sse41_dppd, 1>;
+defm MPSADBW      : SS41I_binop_rmi_int<0x42, "mpsadbw",
+                                        int_x86_sse41_mpsadbw, 1>;
+
+
+/// SS41I_ternary_int - SSE 4.1 ternary operator
+/// These variable-blend instructions take their mask implicitly in XMM0
+/// (modeled with Uses = [XMM0]); $src1 is tied to $dst.
+let Uses = [XMM0], Constraints = "$src1 = $dst" in {
+  multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+    def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
+                    (ins VR128:$src1, VR128:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                    [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))]>,
+                    OpSize;
+
+    def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
+                    (ins VR128:$src1, i128mem:$src2),
+                    !strconcat(OpcodeStr,
+                     "\t{%xmm0, $src2, $dst|$dst, $src2, %xmm0}"),
+                    [(set VR128:$dst,
+                      (IntId VR128:$src1,
+                       (bitconvert (memopv16i8 addr:$src2)), XMM0))]>, OpSize;
+  }
+}
+
+defm BLENDVPD     : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>;
+defm BLENDVPS     : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>;
+defm PBLENDVB     : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>;
+
+
+// SS41I_binop_rm_int8: pmovsx/pmovzx forms whose memory operand is a 64-bit
+// scalar load widened to v2i64.  NOTE(review): despite the "binop" name these
+// are single-source extension ops.
+multiclass SS41I_binop_rm_int8<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR128:$dst,
+         (IntId (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))]>,
+       OpSize;
+}
+
+defm PMOVSXBW   : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
+defm PMOVSXWD   : SS41I_binop_rm_int8<0x23, "pmovsxwd", int_x86_sse41_pmovsxwd>;
+defm PMOVSXDQ   : SS41I_binop_rm_int8<0x25, "pmovsxdq", int_x86_sse41_pmovsxdq>;
+defm PMOVZXBW   : SS41I_binop_rm_int8<0x30, "pmovzxbw", int_x86_sse41_pmovzxbw>;
+defm PMOVZXWD   : SS41I_binop_rm_int8<0x33, "pmovzxwd", int_x86_sse41_pmovzxwd>;
+defm PMOVZXDQ   : SS41I_binop_rm_int8<0x35, "pmovzxdq", int_x86_sse41_pmovzxdq>;
+
+// Common patterns involving scalar load.
+// Fold a zero-extending/zero-filled 64-bit load feeding the intrinsic into
+// the memory form of each instruction.
+def : Pat<(int_x86_sse41_pmovsxbw (vzmovl_v2i64 addr:$src)),
+          (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
+          (PMOVSXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
+          (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
+          (PMOVSXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
+          (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
+          (PMOVSXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
+          (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
+          (PMOVZXBWrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+          (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
+          (PMOVZXWDrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
+          (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
+          (PMOVZXDQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+// SS41I_binop_rm_int4: pmovsx/pmovzx forms whose memory operand is a 32-bit
+// scalar load widened to v4i32.
+multiclass SS41I_binop_rm_int4<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i32mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+       [(set VR128:$dst,
+         (IntId (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))]>,
+          OpSize;
+}
+
+defm PMOVSXBD   : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
+defm PMOVSXWQ   : SS41I_binop_rm_int4<0x24, "pmovsxwq", int_x86_sse41_pmovsxwq>;
+defm PMOVZXBD   : SS41I_binop_rm_int4<0x31, "pmovzxbd", int_x86_sse41_pmovzxbd>;
+defm PMOVZXWQ   : SS41I_binop_rm_int4<0x34, "pmovzxwq", int_x86_sse41_pmovzxwq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbd (vzmovl_v4i32 addr:$src)),
+          (PMOVSXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovsxwq (vzmovl_v4i32 addr:$src)),
+          (PMOVSXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbd (vzmovl_v4i32 addr:$src)),
+          (PMOVZXBDrm addr:$src)>, Requires<[HasSSE41]>;
+def : Pat<(int_x86_sse41_pmovzxwq (vzmovl_v4i32 addr:$src)),
+          (PMOVZXWQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+// SS41I_binop_rm_int2: pmovsxbq/pmovzxbq, whose memory operand is a 16-bit
+// load any-extended to i32 then widened to v4i32.
+multiclass SS41I_binop_rm_int2<bits<8> opc, string OpcodeStr, Intrinsic IntId> {
+  def rr : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId VR128:$src))]>, OpSize;
+
+  // Expecting a i16 load any extended to i32 value.
+  def rm : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i16mem:$src),
+                 !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+                 [(set VR128:$dst, (IntId (bitconvert
+                     (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))]>,
+                 OpSize;
+}
+
+defm PMOVSXBQ   : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
+defm PMOVZXBQ   : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+
+// Common patterns involving scalar load
+def : Pat<(int_x86_sse41_pmovsxbq
+            (bitconvert (v4i32 (X86vzmovl
+                             (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+          (PMOVSXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+def : Pat<(int_x86_sse41_pmovzxbq
+            (bitconvert (v4i32 (X86vzmovl
+                             (v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
+          (PMOVZXBQrm addr:$src)>, Requires<[HasSSE41]>;
+
+
+/// SS41I_binop_ext8 - SSE 4.1 extract 8 bits to 32 bit reg or 8 bit mem
+multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst, (X86pextrb (v16i8 VR128:$src1), imm:$src2))]>,
+                 OpSize;
+  // Memory form has no selection pattern (see FIXME below); it is only
+  // reachable via the assembler / explicit emission.
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i8mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRB      : SS41I_extract8<0x14, "pextrb">;
+
+
+/// SS41I_extract16 - SSE 4.1 extract 16 bits to memory destination
+/// Only the memory form is defined here; register-destination pextrw is the
+/// pre-SSE4.1 encoding defined elsewhere.
+multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i16mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 []>, OpSize;
+// FIXME:
+// There's an AssertZext in the way of writing the store pattern
+// (store (i16 (trunc (X86pextrw (v16i8 VR128:$src1), imm:$src2))), addr:$dst)
+}
+
+defm PEXTRW      : SS41I_extract16<0x15, "pextrw">;
+
+
+/// SS41I_extract32 - SSE 4.1 extract 32 bits to int reg or memory destination
+multiclass SS41I_extract32<bits<8> opc, string OpcodeStr> {
+  // Register form: extract a v4i32 element into a 32-bit GPR.
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst,
+                  (extractelt (v4i32 VR128:$src1), imm:$src2))]>, OpSize;
+  // Memory form: extract and store the element directly (extractelt + store).
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins i32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (v4i32 VR128:$src1), imm:$src2),
+                          addr:$dst)]>, OpSize;
+}
+
+defm PEXTRD      : SS41I_extract32<0x16, "pextrd">;
+
+
+/// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
+/// destination
+multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> {
+  // The f32 element is matched through a bitcast to v4i32 so the same
+  // extractelt pattern machinery as PEXTRD applies.
+  def rr : SS4AIi8<opc, MRMDestReg, (outs GR32:$dst),
+                 (ins VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(set GR32:$dst,
+                    (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2))]>,
+           OpSize;
+  def mr : SS4AIi8<opc, MRMDestMem, (outs),
+                 (ins f32mem:$dst, VR128:$src1, i32i8imm:$src2),
+                 !strconcat(OpcodeStr,
+                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+                 [(store (extractelt (bc_v4i32 (v4f32 VR128:$src1)), imm:$src2),
+                          addr:$dst)]>, OpSize;
+}
+
+defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps">;
+
+// Also match an EXTRACTPS store when the store is done as f32 instead of i32.
+def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)),
+                                              imm:$src2))),
+                 addr:$dst),
+          (EXTRACTPSmr addr:$dst, VR128:$src1, imm:$src2)>,
+         Requires<[HasSSE41]>;
+
+// pinsrb: insert a byte from a GPR or from memory into an xmm register.
+// $src1 is tied to $dst because the other 15 bytes pass through unchanged.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS41I_insert8<bits<8> opc, string OpcodeStr> {
+    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst,
+                     (X86pinsrb VR128:$src1, GR32:$src2, imm:$src3))]>, OpSize;
+    // Memory form: extloadi8 lets any i8 load (including anyext) fold in.
+    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, i8mem:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst,
+                     (X86pinsrb VR128:$src1, (extloadi8 addr:$src2),
+                                imm:$src3))]>, OpSize;
+  }
+}
+
+defm PINSRB      : SS41I_insert8<0x20, "pinsrb">;
+
+// pinsrd: insert a dword element; matched directly from generic insertelt,
+// with $src1 tied to $dst since the untouched elements are preserved.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS41I_insert32<bits<8> opc, string OpcodeStr> {
+    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, GR32:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst,
+                     (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
+                   OpSize;
+    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, i32mem:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst,
+                     (v4i32 (insertelt VR128:$src1, (loadi32 addr:$src2),
+                                       imm:$src3)))]>, OpSize;
+  }
+}
+
+defm PINSRD      : SS41I_insert32<0x22, "pinsrd">;
+
+// insertps has a few different modes, there's the first two here below which
+// are optimized inserts that won't zero arbitrary elements in the destination
+// vector. The next one matches the intrinsic and could zero arbitrary elements
+// in the target vector.
+let Constraints = "$src1 = $dst" in {
+  multiclass SS41I_insertf32<bits<8> opc, string OpcodeStr> {
+    def rr : SS4AIi8<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst,
+                     (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))]>,
+      OpSize;
+    // Memory form: a scalar f32 load is wrapped in scalar_to_vector so the
+    // X86insrtps node sees a v4f32 operand, matching the rr pattern shape.
+    def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, f32mem:$src2, i32i8imm:$src3),
+                   !strconcat(OpcodeStr,
+                    "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+                   [(set VR128:$dst,
+                     (X86insrtps VR128:$src1,
+                                (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
+                                 imm:$src3))]>, OpSize;
+  }
+}
+
+defm INSERTPS    : SS41I_insertf32<0x21, "insertps">;
+
+// Map the raw intrinsic (which may zero arbitrary elements) onto the same
+// register-form instruction.
+def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3),
+          (INSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>;
+
+// ptest: sets EFLAGS only (no register result).  X86ISelLowering lowers the
+// corresponding Intel intrinsic to the X86ptest node matched here.
+let Defs = [EFLAGS] in {
+def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
+                    "ptest \t{$src2, $src1|$src1, $src2}",
+                    [(X86ptest VR128:$src1, VR128:$src2),
+                      (implicit EFLAGS)]>, OpSize;
+def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, i128mem:$src2),
+                    "ptest \t{$src2, $src1|$src1, $src2}",
+                    [(X86ptest VR128:$src1, (load addr:$src2)),
+                        (implicit EFLAGS)]>, OpSize;
+}
+
+// movntdqa: non-temporal aligned load, matched only via its intrinsic.
+def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                       "movntdqa\t{$src, $dst|$dst, $src}",
+                       [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
+                       OpSize;
+
+
+//===----------------------------------------------------------------------===//
+// SSE4.2 Instructions
+//===----------------------------------------------------------------------===//
+
+/// SS42I_binop_rm_int - Simple SSE 4.2 binary operator
+let Constraints = "$src1 = $dst" in {
+  multiclass SS42I_binop_rm_int<bits<8> opc, string OpcodeStr,
+                                Intrinsic IntId128, bit Commutable = 0> {
+    def rr : SS428I<opc, MRMSrcReg, (outs VR128:$dst),
+                   (ins VR128:$src1, VR128:$src2),
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+                   OpSize {
+      let isCommutable = Commutable;
+    }
+    // Memory form: the 128-bit operand is matched as a v16i8 memop and
+    // bitcast to whatever element type the intrinsic expects.
+    def rm : SS428I<opc, MRMSrcMem, (outs VR128:$dst),
+                   (ins VR128:$src1, i128mem:$src2),
+                   !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+                   [(set VR128:$dst,
+                     (IntId128 VR128:$src1,
+                      (bitconvert (memopv16i8 addr:$src2))))]>, OpSize;
+  }
+}
+
+defm PCMPGTQ      : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>;
+
+// Also match the generic X86pcmpgtq DAG node, not just the intrinsic.
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)),
+          (PCMPGTQrr VR128:$src1, VR128:$src2)>;
+def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))),
+          (PCMPGTQrm VR128:$src1, addr:$src2)>;
+
+// crc intrinsic instruction
+// This set of instructions are only rm, the only difference is the size
+// of r and m.  The accumulator operand ($src1) is tied to the destination.
+let Constraints = "$src1 = $dst" in {
+  def CRC32m8  : SS42FI<0xF0, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$src1, i8mem:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_8 GR32:$src1,
+                         (load addr:$src2)))]>, OpSize;
+  def CRC32r8  : SS42FI<0xF0, MRMSrcReg, (outs GR32:$dst),
+                      (ins GR32:$src1, GR8:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_8 GR32:$src1, GR8:$src2))]>,
+                         OpSize;
+  def CRC32m16  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$src1, i16mem:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_16 GR32:$src1,
+                         (load addr:$src2)))]>,
+                         OpSize;
+  def CRC32r16  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+                      (ins GR32:$src1, GR16:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_16 GR32:$src1, GR16:$src2))]>,
+                         OpSize;
+  def CRC32m32  : SS42FI<0xF1, MRMSrcMem, (outs GR32:$dst),
+                      (ins GR32:$src1, i32mem:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32 GR32:$src1,
+                         (load addr:$src2)))]>, OpSize;
+  def CRC32r32  : SS42FI<0xF1, MRMSrcReg, (outs GR32:$dst),
+                      (ins GR32:$src1, GR32:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR32:$dst,
+                         (int_x86_sse42_crc32_32 GR32:$src1, GR32:$src2))]>,
+                         OpSize;
+  // 64-bit forms: same mnemonic, REX.W selects the 64-bit accumulator.
+  def CRC64m64  : SS42FI<0xF0, MRMSrcMem, (outs GR64:$dst),
+                      (ins GR64:$src1, i64mem:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR64:$dst,
+                         (int_x86_sse42_crc32_64 GR64:$src1,
+                         (load addr:$src2)))]>,
+                         OpSize, REX_W;
+  def CRC64r64  : SS42FI<0xF0, MRMSrcReg, (outs GR64:$dst),
+                      (ins GR64:$src1, GR64:$src2),
+                      "crc32 \t{$src2, $src1|$src1, $src2}",
+                       [(set GR64:$dst,
+                         (int_x86_sse42_crc32_64 GR64:$src1, GR64:$src2))]>,
+                         OpSize, REX_W;
+}
+
+// String/text processing instructions.
+// Pseudos carry the intrinsic pattern and are expanded by a custom inserter
+// (the real instruction writes XMM0, which the pseudo copies to $dst).
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+def PCMPISTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
+  (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+  "#PCMPISTRM128rr PSEUDO!",
+  [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, VR128:$src2,
+                                                imm:$src3))]>, OpSize;
+def PCMPISTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
+  (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+  "#PCMPISTRM128rm PSEUDO!",
+  [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1, (load addr:$src2),
+                                                imm:$src3))]>, OpSize;
+}
+
+// The real instructions: result lands implicitly in XMM0; no pattern here.
+let Defs = [XMM0, EFLAGS] in {
+def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
+  (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+   "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
+  (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+  "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+}
+
+// Explicit-length variants: string lengths come in implicitly via EAX/EDX.
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+def PCMPESTRM128REG : SS42AI<0, Pseudo, (outs VR128:$dst),
+  (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+  "#PCMPESTRM128rr PSEUDO!",
+  [(set VR128:$dst, 
+        (int_x86_sse42_pcmpestrm128 
+         VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>, OpSize;
+
+def PCMPESTRM128MEM : SS42AI<0, Pseudo, (outs VR128:$dst),
+  (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+  "#PCMPESTRM128rm PSEUDO!",
+  [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 
+                     VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>, 
+  OpSize;
+}
+
+// Real instructions: result implicitly in XMM0, lengths in EAX/EDX.
+let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX] in {
+def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
+  (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+  "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
+  (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+  "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+}
+
+// pcmpistri family: index result is written implicitly to ECX; one defm per
+// intrinsic flavor (plain index plus the A/C/O/S/Z flag-reading variants).
+let Defs = [ECX, EFLAGS] in {
+  multiclass SS42AI_pcmpistri<Intrinsic IntId128> {
+    def rr : SS42AI<0x63, MRMSrcReg, (outs), 
+      (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+      "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+      [(set ECX, (IntId128 VR128:$src1, VR128:$src2, imm:$src3)),
+       (implicit EFLAGS)]>, OpSize;
+    def rm : SS42AI<0x63, MRMSrcMem, (outs),
+      (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+      "pcmpistri\t{$src3, $src2, $src1|$src1, $src2, $src3}",
+      [(set ECX, (IntId128 VR128:$src1, (load addr:$src2), imm:$src3)),
+       (implicit EFLAGS)]>, OpSize;
+  }
+}
+
+defm PCMPISTRI  : SS42AI_pcmpistri<int_x86_sse42_pcmpistri128>;
+defm PCMPISTRIA : SS42AI_pcmpistri<int_x86_sse42_pcmpistria128>;
+defm PCMPISTRIC : SS42AI_pcmpistri<int_x86_sse42_pcmpistric128>;
+defm PCMPISTRIO : SS42AI_pcmpistri<int_x86_sse42_pcmpistrio128>;
+defm PCMPISTRIS : SS42AI_pcmpistri<int_x86_sse42_pcmpistris128>;
+defm PCMPISTRIZ : SS42AI_pcmpistri<int_x86_sse42_pcmpistriz128>;
+
+// pcmpestri family: like pcmpistri but with explicit string lengths passed
+// implicitly in EAX/EDX; index result is written implicitly to ECX.
+let Defs = [ECX, EFLAGS] in {
+let Uses = [EAX, EDX] in {
+  multiclass SS42AI_pcmpestri<Intrinsic IntId128> {
+    def rr : SS42AI<0x61, MRMSrcReg, (outs),
+      (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+      "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+      [(set ECX, (IntId128 VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5)),
+       (implicit EFLAGS)]>, OpSize;
+    def rm : SS42AI<0x61, MRMSrcMem, (outs),
+      (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+       "pcmpestri\t{$src5, $src3, $src1|$src1, $src3, $src5}",
+       [(set ECX, 
+             (IntId128 VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5)),
+        (implicit EFLAGS)]>, OpSize;
+  }
+}
+}
+
+defm PCMPESTRI  : SS42AI_pcmpestri<int_x86_sse42_pcmpestri128>;
+defm PCMPESTRIA : SS42AI_pcmpestri<int_x86_sse42_pcmpestria128>;
+defm PCMPESTRIC : SS42AI_pcmpestri<int_x86_sse42_pcmpestric128>;
+defm PCMPESTRIO : SS42AI_pcmpestri<int_x86_sse42_pcmpestrio128>;
+defm PCMPESTRIS : SS42AI_pcmpestri<int_x86_sse42_pcmpestris128>;
+defm PCMPESTRIZ : SS42AI_pcmpestri<int_x86_sse42_pcmpestriz128>;
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
new file mode 100644
index 0000000..d297d24
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -0,0 +1,559 @@
+//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the JIT interfaces for the X86 target.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "jit"
+#include "X86JITInfo.h"
+#include "X86Relocations.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdlib>
+#include <cstring>
+using namespace llvm;
+
+// Determine the platform we're running on
+#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
+# define X86_64_JIT
+#elif defined(__i386__) || defined(i386) || defined(_M_IX86)
+# define X86_32_JIT
+#endif
+
+/// replaceMachineCodeForFunction - Overwrite the start of Old with a 5-byte
+/// relative jump to New: E9 <rel32>, where rel32 = New - (Old + 5).
+/// NOTE(review): the displacement is truncated to 32 bits, so on 64-bit hosts
+/// this assumes Old and New are within +/-2GB of each other -- TODO confirm.
+void X86JITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
+  unsigned char *Code = (unsigned char *)Old;
+  Code[0] = 0xE9;   // JMP rel32 opcode.
+  // rel32 is relative to the end of the 5-byte jump instruction.
+  unsigned Disp = (unsigned)((intptr_t)New - ((intptr_t)Old + 5));
+  *(unsigned *)(Code + 1) = Disp;
+}
+
+
+/// JITCompilerFunction - This contains the address of the JIT function used to
+/// compile a function lazily.
+static TargetJITInfo::JITCompilerFn JITCompilerFunction;
+
+// Get the ASMPREFIX for the current host.  This is often '_'.
+#ifndef __USER_LABEL_PREFIX__
+#define __USER_LABEL_PREFIX__
+#endif
+#define GETASMPREFIX2(X) #X
+#define GETASMPREFIX(X) GETASMPREFIX2(X)
+#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__)
+
+// For ELF targets, use a .size and .type directive, to let tools
+// know the extent of functions defined in assembler.
+#if defined(__ELF__)
+# define SIZE(sym) ".size " #sym ", . - " #sym "\n"
+# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n"
+#else
+# define SIZE(sym)
+# define TYPE_FUNCTION(sym)
+#endif
+
+// Provide a convenient way for disabling usage of CFI directives.
+// This is needed for old/broken assemblers (for example, gas on
+// Darwin is pretty old and doesn't support these directives)
+#if defined(__APPLE__)
+# define CFI(x)
+#else
+// FIXME: Disable this until we really want to use it. Also, we will
+//        need to add some workarounds for compilers, which support
+//        only subset of these directives.
+# define CFI(x)
+#endif
+
+// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// callee saved registers, for the fastcc calling convention.
+// Each wrapper spills the argument registers its ABI uses, realigns the
+// stack, calls X86CompilationCallback2 with (saved frame pointer, return
+// address), restores everything, and returns -- by then the return address
+// on the stack has been rewound so the (now-patched) call site re-executes.
+extern "C" {
+#if defined(X86_64_JIT)
+# ifndef _MSC_VER
+  // No need to save EAX/EDX for X86-64.
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+    TYPE_FUNCTION(X86CompilationCallback)
+  ASMPREFIX "X86CompilationCallback:\n"
+    CFI(".cfi_startproc\n")
+    // Save RBP
+    "pushq   %rbp\n"
+    CFI(".cfi_def_cfa_offset 16\n")
+    CFI(".cfi_offset %rbp, -16\n")
+    // Save RSP
+    "movq    %rsp, %rbp\n"
+    CFI(".cfi_def_cfa_register %rbp\n")
+    // Save all int arg registers
+    "pushq   %rdi\n"
+    CFI(".cfi_rel_offset %rdi, 0\n")
+    "pushq   %rsi\n"
+    CFI(".cfi_rel_offset %rsi, 8\n")
+    "pushq   %rdx\n"
+    CFI(".cfi_rel_offset %rdx, 16\n")
+    "pushq   %rcx\n"
+    CFI(".cfi_rel_offset %rcx, 24\n")
+    "pushq   %r8\n"
+    CFI(".cfi_rel_offset %r8, 32\n")
+    "pushq   %r9\n"
+    CFI(".cfi_rel_offset %r9, 40\n")
+    // Align stack on 16-byte boundary. ESP might not be properly aligned
+    // (8 byte) if this is called from an indirect stub.
+    "andq    $-16, %rsp\n"
+    // Save all XMM arg registers
+    "subq    $128, %rsp\n"
+    "movaps  %xmm0, (%rsp)\n"
+    "movaps  %xmm1, 16(%rsp)\n"
+    "movaps  %xmm2, 32(%rsp)\n"
+    "movaps  %xmm3, 48(%rsp)\n"
+    "movaps  %xmm4, 64(%rsp)\n"
+    "movaps  %xmm5, 80(%rsp)\n"
+    "movaps  %xmm6, 96(%rsp)\n"
+    "movaps  %xmm7, 112(%rsp)\n"
+    // JIT callee
+    "movq    %rbp, %rdi\n"    // Pass prev frame and return address
+    "movq    8(%rbp), %rsi\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    // Restore all XMM arg registers
+    "movaps  112(%rsp), %xmm7\n"
+    "movaps  96(%rsp), %xmm6\n"
+    "movaps  80(%rsp), %xmm5\n"
+    "movaps  64(%rsp), %xmm4\n"
+    "movaps  48(%rsp), %xmm3\n"
+    "movaps  32(%rsp), %xmm2\n"
+    "movaps  16(%rsp), %xmm1\n"
+    "movaps  (%rsp), %xmm0\n"
+    // Restore RSP
+    "movq    %rbp, %rsp\n"
+    CFI(".cfi_def_cfa_register %rsp\n")
+    // Restore all int arg registers (they sit at rbp-48..rbp-8)
+    "subq    $48, %rsp\n"
+    CFI(".cfi_adjust_cfa_offset 48\n")
+    "popq    %r9\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %r9\n")
+    "popq    %r8\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %r8\n")
+    "popq    %rcx\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rcx\n")
+    "popq    %rdx\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rdx\n")
+    "popq    %rsi\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rsi\n")
+    "popq    %rdi\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rdi\n")
+    // Restore RBP
+    "popq    %rbp\n"
+    CFI(".cfi_adjust_cfa_offset -8\n")
+    CFI(".cfi_restore %rbp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback)
+  );
+# else
+  // No inline assembler support on this platform. The routine is in external
+  // file.
+  void X86CompilationCallback();
+
+# endif
+#elif defined (X86_32_JIT)
+# ifndef _MSC_VER
+  void X86CompilationCallback(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback\n"
+    TYPE_FUNCTION(X86CompilationCallback)
+  ASMPREFIX "X86CompilationCallback:\n"
+    CFI(".cfi_startproc\n")
+    "pushl   %ebp\n"
+    CFI(".cfi_def_cfa_offset 8\n")
+    CFI(".cfi_offset %ebp, -8\n")
+    "movl    %esp, %ebp\n"    // Standard prologue
+    CFI(".cfi_def_cfa_register %ebp\n")
+    "pushl   %eax\n"
+    CFI(".cfi_rel_offset %eax, 0\n")
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    CFI(".cfi_rel_offset %edx, 4\n")
+    "pushl   %ecx\n"
+    CFI(".cfi_rel_offset %ecx, 8\n")
+#  if defined(__APPLE__)
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+#  endif
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "movl    %ebp, %esp\n"    // Restore ESP
+    CFI(".cfi_def_cfa_register %esp\n")
+    "subl    $12, %esp\n"
+    CFI(".cfi_adjust_cfa_offset 12\n")
+    "popl    %ecx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ecx\n")
+    "popl    %edx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %edx\n")
+    "popl    %eax\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %eax\n")
+    "popl    %ebp\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ebp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback)
+  );
+
+  // Same as X86CompilationCallback but also saves XMM argument registers.
+  // Selected at runtime by getLazyResolverFunction when SSE is available.
+  void X86CompilationCallback_SSE(void);
+  asm(
+    ".text\n"
+    ".align 8\n"
+    ".globl " ASMPREFIX "X86CompilationCallback_SSE\n"
+    TYPE_FUNCTION(X86CompilationCallback_SSE)
+  ASMPREFIX "X86CompilationCallback_SSE:\n"
+    CFI(".cfi_startproc\n")
+    "pushl   %ebp\n"
+    CFI(".cfi_def_cfa_offset 8\n")
+    CFI(".cfi_offset %ebp, -8\n")
+    "movl    %esp, %ebp\n"    // Standard prologue
+    CFI(".cfi_def_cfa_register %ebp\n")
+    "pushl   %eax\n"
+    CFI(".cfi_rel_offset %eax, 0\n")
+    "pushl   %edx\n"          // Save EAX/EDX/ECX
+    CFI(".cfi_rel_offset %edx, 4\n")
+    "pushl   %ecx\n"
+    CFI(".cfi_rel_offset %ecx, 8\n")
+    "andl    $-16, %esp\n"    // Align ESP on 16-byte boundary
+    // Save all XMM arg registers
+    "subl    $64, %esp\n"
+    // FIXME: provide frame move information for xmm registers.
+    // This can be tricky, because CFA register is ebp (unaligned)
+    // and we need to produce offsets relative to it.
+    "movaps  %xmm0, (%esp)\n"
+    "movaps  %xmm1, 16(%esp)\n"
+    "movaps  %xmm2, 32(%esp)\n"
+    "movaps  %xmm3, 48(%esp)\n"
+    "subl    $16, %esp\n"
+    "movl    4(%ebp), %eax\n" // Pass prev frame and return address
+    "movl    %eax, 4(%esp)\n"
+    "movl    %ebp, (%esp)\n"
+    "call    " ASMPREFIX "X86CompilationCallback2\n"
+    "addl    $16, %esp\n"
+    "movaps  48(%esp), %xmm3\n"
+    CFI(".cfi_restore %xmm3\n")
+    "movaps  32(%esp), %xmm2\n"
+    CFI(".cfi_restore %xmm2\n")
+    "movaps  16(%esp), %xmm1\n"
+    CFI(".cfi_restore %xmm1\n")
+    "movaps  (%esp), %xmm0\n"
+    CFI(".cfi_restore %xmm0\n")
+    "movl    %ebp, %esp\n"    // Restore ESP
+    CFI(".cfi_def_cfa_register esp\n")
+    "subl    $12, %esp\n"
+    CFI(".cfi_adjust_cfa_offset 12\n")
+    "popl    %ecx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ecx\n")
+    "popl    %edx\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %edx\n")
+    "popl    %eax\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %eax\n")
+    "popl    %ebp\n"
+    CFI(".cfi_adjust_cfa_offset -4\n")
+    CFI(".cfi_restore %ebp\n")
+    "ret\n"
+    CFI(".cfi_endproc\n")
+    SIZE(X86CompilationCallback_SSE)
+  );
+# else
+  void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+
+  // MSVC x86: same wrapper as the AT&T-syntax version above, expressed with
+  // the MSVC inline assembler and a naked function.
+  _declspec(naked) void X86CompilationCallback(void) {
+    __asm {
+      push  ebp
+      mov   ebp, esp
+      push  eax
+      push  edx
+      push  ecx
+      and   esp, -16
+      sub   esp, 16
+      mov   eax, dword ptr [ebp+4]
+      mov   dword ptr [esp+4], eax
+      mov   dword ptr [esp], ebp
+      call  X86CompilationCallback2
+      mov   esp, ebp
+      sub   esp, 12
+      pop   ecx
+      pop   edx
+      pop   eax
+      pop   ebp
+      ret
+    }
+  }
+
+# endif // _MSC_VER
+
+#else // Not an i386 host
+  void X86CompilationCallback() {
+    llvm_unreachable("Cannot call X86CompilationCallback() on a non-x86 arch!");
+  }
+#endif
+}
+
+/// X86CompilationCallback2 - This is the target-specific function invoked by the
+/// function stub when we did not know the real target of a call.  This function
+/// must locate the start of the stub or call site and pass it into the JIT
+/// compiler function.
+///
+/// StackPtr is the wrapper's saved frame pointer, so StackPtr[1] is the stack
+/// slot holding the return address (the assembly wrappers above pass exactly
+/// this pair).  RetAddr is the return address itself, pointing just past the
+/// call that entered the wrapper.
+extern "C" {
+#if !(defined (X86_64_JIT) && defined(_MSC_VER))
+ // the following function is called only from this translation unit,
+ // unless we are under 64bit Windows with MSC, where there is 
+ // no support for inline assembly
+static
+#endif
+void ATTRIBUTE_USED
+X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+  intptr_t *RetAddrLoc = &StackPtr[1];
+  assert(*RetAddrLoc == RetAddr &&
+         "Could not find return address on the stack!");
+
+  // It's a stub if there is an interrupt marker after the call.
+  // (emitFunctionStub appends a 0xCE byte after the call in compilation
+  // stubs, precisely so we can distinguish them here.)
+  bool isStub = ((unsigned char*)RetAddr)[0] == 0xCE;
+
+  // The call instruction should have pushed the return value onto the stack...
+#if defined (X86_64_JIT)
+  RetAddr--;     // Backtrack to the reference itself...
+#else
+  RetAddr -= 4;  // Backtrack to the reference itself...
+#endif
+
+#if 0
+  DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr
+               << " ESP=" << (void*)StackPtr
+               << ": Resolving call to function: "
+               << TheVM->getFunctionReferencedName((void*)RetAddr) << "\n");
+#endif
+
+  // Sanity check to make sure this really is a call instruction.
+#if defined (X86_64_JIT)
+  // 64-bit stubs call through a register: 41 FF /2 (callq *r10).
+  assert(((unsigned char*)RetAddr)[-2] == 0x41 &&"Not a call instr!");
+  assert(((unsigned char*)RetAddr)[-1] == 0xFF &&"Not a call instr!");
+#else
+  // 32-bit call sites are E8 <rel32>.
+  assert(((unsigned char*)RetAddr)[-1] == 0xE8 &&"Not a call instr!");
+#endif
+
+  // Compile the target function now.
+  intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr);
+
+  // Rewrite the call target... so that we don't end up here every time we
+  // execute the call.
+#if defined (X86_64_JIT)
+  assert(isStub &&
+         "X86-64 doesn't support rewriting non-stub lazy compilation calls:"
+         " the call instruction varies too much.");
+#else
+  *(intptr_t *)RetAddr = (intptr_t)(NewVal-RetAddr-4);
+#endif
+
+  if (isStub) {
+    // If this is a stub, rewrite the call into an unconditional branch
+    // instruction so that two return addresses are not pushed onto the stack
+    // when the requested function finally gets called.  This also makes the
+    // 0xCE byte (interrupt) dead, so the marker doesn't affect anything.
+#if defined (X86_64_JIT)
+    // If the target address is within 32-bit range of the stub, use a
+    // PC-relative branch instead of loading the actual address.  (This is
+    // considerably shorter than the 64-bit immediate load already there.)
+    // We assume here intptr_t is 64 bits.
+    intptr_t diff = NewVal-RetAddr+7;
+    if (diff >= -2147483648LL && diff <= 2147483647LL) {
+      // Overwrite the movabs (stub start = RetAddr-0xd) with E9 <rel32>.
+      *(unsigned char*)(RetAddr-0xc) = 0xE9;
+      *(intptr_t *)(RetAddr-0xb) = diff & 0xffffffff;
+    } else {
+      // Keep the movabs, but flip the call *r10 into jmp *r10 (FF /4).
+      *(intptr_t *)(RetAddr - 0xa) = NewVal;
+      ((unsigned char*)RetAddr)[0] = (2 | (4 << 3) | (3 << 6));
+    }
+#else
+    ((unsigned char*)RetAddr)[-1] = 0xE9;
+#endif
+  }
+
+  // Change the return address to reexecute the call instruction...
+  // (0xd / 5 = distance back to the start of the rewritten stub / call.)
+#if defined (X86_64_JIT)
+  *RetAddrLoc -= 0xd;
+#else
+  *RetAddrLoc -= 5;
+#endif
+}
+}
+
+/// getLazyResolverFunction - Record the JIT's compiler entry point and return
+/// the assembly wrapper that lazy-compilation stubs should call.  On 32-bit
+/// non-MSVC hosts with SSE we use the variant that also saves XMM argument
+/// registers, since fastcc may pass FP arguments in them.
+TargetJITInfo::LazyResolverFn
+X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
+  JITCompilerFunction = F;
+
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+  if (Subtarget->hasSSE1())
+    return X86CompilationCallback_SSE;
+#endif
+
+  return X86CompilationCallback;
+}
+
+/// X86JITInfo constructor - Cache the subtarget and start with no GOT and an
+/// empty thread-local storage area.
+X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
+  Subtarget = &TM.getSubtarget<X86Subtarget>();
+  useGOT = 0;
+  TLSOffset = 0;
+}
+
+/// emitGlobalValueIndirectSym - Emit a pointer-sized, pointer-aligned slot
+/// containing the address 'ptr' and register it as the indirect symbol for
+/// GV.  Returns the address of the emitted slot.
+void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                             JITCodeEmitter &JCE) {
+#if defined (X86_64_JIT)
+  const unsigned Alignment = 8;
+  uint8_t Buffer[8];
+  uint8_t *Cur = Buffer;
+  // emitWordLEInto writes 4 little-endian bytes and advances Cur, so the two
+  // calls lay down the low then high halves of the 64-bit pointer.
+  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(intptr_t)ptr);
+  MachineCodeEmitter::emitWordLEInto(Cur, (unsigned)(((intptr_t)ptr) >> 32));
+#else
+  const unsigned Alignment = 4;
+  uint8_t Buffer[4];
+  uint8_t *Cur = Buffer;
+  MachineCodeEmitter::emitWordLEInto(Cur, (intptr_t)ptr);
+#endif
+  return JCE.allocIndirectGV(GV, Buffer, sizeof(Buffer), Alignment);
+}
+
+TargetJITInfo::StubLayout X86JITInfo::getStubLayout() {
+  // The 64-bit stub contains:
+  //   movabs r10 <- 8-byte-target-address  # 10 bytes
+  //   call|jmp *r10  # 3 bytes
+  // The 32-bit stub contains a 5-byte call|jmp.
+  // If the stub is a call to the compilation callback, an extra byte is added
+  // to mark it as a stub.
+  // Size 14 = 10 + 3 + 1-byte marker (the 64-bit worst case); it also covers
+  // the smaller 32-bit stubs.  Alignment 4 matches emitFunctionStub.
+  StubLayout Result = {14, 4};
+  return Result;
+}
+
+/// emitFunctionStub - Emit a small native stub that transfers control to
+/// Target.  For an ordinary target the stub is an unconditional jump; for the
+/// compilation callback it is a call followed by a 0xCE marker byte, which
+/// X86CompilationCallback2 uses to recognize (and later rewrite) the stub.
+void *X86JITInfo::emitFunctionStub(const Function* F, void *Target,
+                                   JITCodeEmitter &JCE) {
+  // Note, we cast to intptr_t here to silence a -pedantic warning that 
+  // complains about casting a function pointer to a normal pointer.
+#if defined (X86_32_JIT) && !defined (_MSC_VER)
+  bool NotCC = (Target != (void*)(intptr_t)X86CompilationCallback &&
+                Target != (void*)(intptr_t)X86CompilationCallback_SSE);
+#else
+  bool NotCC = Target != (void*)(intptr_t)X86CompilationCallback;
+#endif
+  JCE.emitAlignment(4);
+  void *Result = (void*)JCE.getCurrentPCValue();
+  if (NotCC) {
+    // Direct stub to an already-known target: jump, no marker byte.
+#if defined (X86_64_JIT)
+    JCE.emitByte(0x49);          // REX prefix
+    JCE.emitByte(0xB8+2);        // movabsq r10
+    JCE.emitWordLE((unsigned)(intptr_t)Target);
+    JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+    JCE.emitByte(0x41);          // REX prefix
+    JCE.emitByte(0xFF);          // jmpq *r10
+    JCE.emitByte(2 | (4 << 3) | (3 << 6));   // ModRM: reg opcode /4, r10
+#else
+    JCE.emitByte(0xE9);
+    JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+    return Result;
+  }
+
+  // Lazy-compilation stub: *call* the callback so the return address
+  // identifies this stub, then append the marker byte.
+#if defined (X86_64_JIT)
+  JCE.emitByte(0x49);          // REX prefix
+  JCE.emitByte(0xB8+2);        // movabsq r10
+  JCE.emitWordLE((unsigned)(intptr_t)Target);
+  JCE.emitWordLE((unsigned)(((intptr_t)Target) >> 32));
+  JCE.emitByte(0x41);          // REX prefix
+  JCE.emitByte(0xFF);          // callq *r10
+  JCE.emitByte(2 | (2 << 3) | (3 << 6));   // ModRM: reg opcode /2, r10
+#else
+  JCE.emitByte(0xE8);   // Call with 32 bit pc-rel destination...
+
+  JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4);
+#endif
+
+  // This used to use 0xCD, but that value is used by JITMemoryManager to
+  // initialize the buffer with garbage, which means it may follow a
+  // noreturn function call, confusing X86CompilationCallback2.  PR 4929.
+  JCE.emitByte(0xCE);   // Interrupt - Just a marker identifying the stub!
+  return Result;
+}
+
+/// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+/// specific basic block.  On x86-64 entries are relative to the jump table
+/// itself; on 32-bit they are relative to the function's PIC base.
+uintptr_t X86JITInfo::getPICJumpTableEntry(uintptr_t BB, uintptr_t Entry) {
+#if defined(X86_64_JIT)
+  return BB - Entry;
+#else
+  return BB - PICBase;
+#endif
+}
+
+/// relocate - Before the JIT can run a block of code that has been emitted,
+/// it must rewrite the code to contain the actual addresses of any
+/// referenced global symbols.
+///
+/// All relocation kinds here are additive: the bytes already at RelocPos hold
+/// an addend, and the adjusted symbol address is added into them.
+void X86JITInfo::relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase) {
+  for (unsigned i = 0; i != NumRelocs; ++i, ++MR) {
+    void *RelocPos = (char*)Function + MR->getMachineCodeOffset();
+    intptr_t ResultPtr = (intptr_t)MR->getResultPointer();
+    switch ((X86::RelocationType)MR->getRelocationType()) {
+    case X86::reloc_pcrel_word: {
+      // PC relative relocation, add the relocated value to the value already in
+      // memory, after we adjust it for where the PC is.
+      // (-4 accounts for the PC pointing past the 4-byte field; ConstantVal
+      // carries any extra instruction-specific adjustment.)
+      ResultPtr = ResultPtr -(intptr_t)RelocPos - 4 - MR->getConstantVal();
+      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      break;
+    }
+    case X86::reloc_picrel_word: {
+      // PIC base relative relocation, add the relocated value to the value
+      // already in memory, after we adjust it for where the PIC base is.
+      ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal());
+      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      break;
+    }
+    case X86::reloc_absolute_word:
+    case X86::reloc_absolute_word_sext:
+      // Absolute relocation, just add the relocated value to the value already
+      // in memory.
+      *((unsigned*)RelocPos) += (unsigned)ResultPtr;
+      break;
+    case X86::reloc_absolute_dword:
+      // 64-bit absolute relocation.
+      *((intptr_t*)RelocPos) += ResultPtr;
+      break;
+    }
+  }
+}
+
+/// allocateThreadLocalMemory - Reserve 'size' bytes of thread-local storage
+/// and return a pointer/offset to it.  TLSOffset grows downward; the returned
+/// value appears to be an offset below the thread's TLS base rather than a
+/// normal heap pointer -- NOTE(review): confirm against the TLS codegen that
+/// consumes it.  Only supported for 32-bit ELF-style hosts.
+char* X86JITInfo::allocateThreadLocalMemory(size_t size) {
+#if defined(X86_32_JIT) && !defined(__APPLE__) && !defined(_MSC_VER)
+  TLSOffset -= size;
+  return TLSOffset;
+#else
+  llvm_unreachable("Cannot allocate thread local storage on this arch!");
+  return 0;
+#endif
+}
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
new file mode 100644
index 0000000..238420c
--- /dev/null
+++ b/lib/Target/X86/X86JITInfo.h
@@ -0,0 +1,81 @@
+//===- X86JITInfo.h - X86 implementation of the JIT interface  --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetJITInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86JITINFO_H
+#define X86JITINFO_H
+
+#include "llvm/Function.h"
+#include "llvm/CodeGen/JITCodeEmitter.h"
+#include "llvm/Target/TargetJITInfo.h"
+
+namespace llvm {
+  class X86TargetMachine;
+  class X86Subtarget;
+
+  /// X86JITInfo - x86/x86-64 implementation of the TargetJITInfo interface:
+  /// stub emission, code patching, relocation resolution and thread-local
+  /// storage allocation for the JIT.
+  class X86JITInfo : public TargetJITInfo {
+    X86TargetMachine &TM;
+    const X86Subtarget *Subtarget;
+    uintptr_t PICBase;   // Current PIC base, used by getPICJumpTableEntry.
+    char* TLSOffset;     // Next free thread-local offset (allocated downward).
+  public:
+    explicit X86JITInfo(X86TargetMachine &tm);
+
+    /// replaceMachineCodeForFunction - Make it so that calling the function
+    /// whose machine code is at OLD turns into a call to NEW, perhaps by
+    /// overwriting OLD with a branch to NEW.  This is used for self-modifying
+    /// code.
+    ///
+    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+
+    /// emitGlobalValueIndirectSym - Use the specified JITCodeEmitter object
+    /// to emit an indirect symbol which contains the address of the specified
+    /// ptr.
+    virtual void *emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
+                                             JITCodeEmitter &JCE);
+
+    // getStubLayout - Returns the size and alignment of the largest call stub
+    // on X86.
+    virtual StubLayout getStubLayout();
+
+    /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
+    /// small native function that simply calls the function at the specified
+    /// address.
+    virtual void *emitFunctionStub(const Function* F, void *Target,
+                                   JITCodeEmitter &JCE);
+
+    /// getPICJumpTableEntry - Returns the value of the jumptable entry for the
+    /// specific basic block.
+    virtual uintptr_t getPICJumpTableEntry(uintptr_t BB, uintptr_t JTBase);
+
+    /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
+    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+
+    /// relocate - Before the JIT can run a block of code that has been emitted,
+    /// it must rewrite the code to contain the actual addresses of any
+    /// referenced global symbols.
+    virtual void relocate(void *Function, MachineRelocation *MR,
+                          unsigned NumRelocs, unsigned char* GOTBase);
+    
+    /// allocateThreadLocalMemory - Each target has its own way of
+    /// handling thread local variables. This method returns a value only
+    /// meaningful to the target.
+    virtual char* allocateThreadLocalMemory(size_t size);
+
+    /// setPICBase / getPICBase - Getter / setter of PICBase, used to compute
+    /// PIC jumptable entry.
+    void setPICBase(uintptr_t Base) { PICBase = Base; }
+    uintptr_t getPICBase() const { return PICBase; }
+  };
+}
+
+#endif
diff --git a/lib/Target/X86/X86MCAsmInfo.cpp b/lib/Target/X86/X86MCAsmInfo.cpp
new file mode 100644
index 0000000..d3b0052
--- /dev/null
+++ b/lib/Target/X86/X86MCAsmInfo.cpp
@@ -0,0 +1,120 @@
+//===-- X86MCAsmInfo.cpp - X86 asm properties -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declarations of the X86MCAsmInfo properties.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/CommandLine.h"
+using namespace llvm;
+
+enum AsmWriterFlavorTy {
+  // Note: This numbering has to match the GCC assembler dialects for inline
+  // asm alternatives to work right.
+  ATT = 0, Intel = 1
+};
+
+// Command-line selection of the assembly dialect emitted by the backend;
+// defaults to AT&T syntax.
+static cl::opt<AsmWriterFlavorTy>
+AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
+  cl::desc("Choose style of code to emit from X86 backend:"),
+  cl::values(clEnumValN(ATT,   "att",   "Emit AT&T-style assembly"),
+             clEnumValN(Intel, "intel", "Emit Intel-style assembly"),
+             clEnumValEnd));
+
+
+// Translation table of GCC inline-asm constraint names to LLVM constraint
+// codes, as {gcc-name, llvm-code} pairs terminated by a 0,0 entry.  Installed
+// as AsmTransCBE by every X86 MCAsmInfo variant below.
+static const char *const x86_asm_table[] = {
+  "{si}", "S",
+  "{di}", "D",
+  "{ax}", "a",
+  "{cx}", "c",
+  "{memory}", "memory",
+  "{flags}", "",
+  "{dirflag}", "",
+  "{fpsr}", "",
+  "{cc}", "cc",
+  0,0};
+
+/// X86MCAsmInfoDarwin - Configure assembly properties for Darwin (Mach-O)
+/// x86 targets: "##" comments, 0x90 (nop) text padding, DWARF debug info and
+/// DWARF-style exception handling.
+X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &Triple) {
+  AsmTransCBE = x86_asm_table;
+  AssemblerDialect = AsmWriterFlavor;
+    
+  bool is64Bit = Triple.getArch() == Triple::x86_64;
+
+  // Pad text sections with nop (0x90) rather than zeros.
+  TextAlignFillValue = 0x90;
+
+  if (!is64Bit)
+    Data64bitsDirective = 0;       // we can't emit a 64-bit unit
+
+  CommentString = "##";
+  PCSymbol = ".";
+
+  SupportsDebugInformation = true;
+  DwarfUsesInlineInfoSection = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::Dwarf;
+  AbsoluteEHSectionOffsets = false;
+}
+
+/// X86ELFMCAsmInfo - Configure assembly properties for ELF x86 targets:
+/// ".L" local-label prefix, .weak support, leb128 directives, and
+/// DWARF-style debug info and exception handling.
+X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &Triple) {
+  AsmTransCBE = x86_asm_table;
+  AssemblerDialect = AsmWriterFlavor;
+
+  PrivateGlobalPrefix = ".L";
+  WeakRefDirective = "\t.weak\t";
+  PCSymbol = ".";
+
+  // Set up DWARF directives
+  HasLEB128 = true;  // Target asm supports leb128 directives (little-endian)
+
+  // Debug Information
+  AbsoluteDebugSectionOffsets = true;
+  SupportsDebugInformation = true;
+
+  // Exceptions handling
+  ExceptionsType = ExceptionHandling::Dwarf;
+  AbsoluteEHSectionOffsets = false;
+}
+
+/// getNonexecutableStackSection - Emit an empty .note.GNU-stack section so
+/// the linker marks the process stack non-executable.
+MCSection *X86ELFMCAsmInfo::getNonexecutableStackSection(MCContext &Ctx) const {
+  return MCSectionELF::Create(".note.GNU-stack", MCSectionELF::SHT_PROGBITS,
+                              0, SectionKind::getMetadata(), false, Ctx);
+}
+
+/// X86MCAsmInfoCOFF - Configure assembly properties for COFF (GAS-syntax)
+/// x86 targets; everything beyond the shared table/dialect setup comes from
+/// the MCAsmInfoCOFF base class.  (Triple is currently unused.)
+X86MCAsmInfoCOFF::X86MCAsmInfoCOFF(const Triple &Triple) {
+  AsmTransCBE = x86_asm_table;
+  AssemblerDialect = AsmWriterFlavor;
+}
+
+
+/// X86WinMCAsmInfo - Configure assembly properties for Windows targets using
+/// MASM-style syntax: "_" global prefix, ";" comments, and db/dw/dd/dq data
+/// directives.  (Triple is currently unused.)
+X86WinMCAsmInfo::X86WinMCAsmInfo(const Triple &Triple) {
+  AsmTransCBE = x86_asm_table;
+  AssemblerDialect = AsmWriterFlavor;
+
+  GlobalPrefix = "_";
+  CommentString = ";";
+
+  PrivateGlobalPrefix = "$";
+  AlignDirective = "\tALIGN\t";
+  ZeroDirective = "\tdb\t";
+  AsciiDirective = "\tdb\t";
+  AscizDirective = 0;                // No .asciz equivalent in MASM syntax.
+  Data8bitsDirective = "\tdb\t";
+  Data16bitsDirective = "\tdw\t";
+  Data32bitsDirective = "\tdd\t";
+  Data64bitsDirective = "\tdq\t";
+  HasDotTypeDotSizeDirective = false;
+  HasSingleParameterDotFile = false;
+
+  AlignmentIsInBytes = true;
+}
diff --git a/lib/Target/X86/X86MCAsmInfo.h b/lib/Target/X86/X86MCAsmInfo.h
new file mode 100644
index 0000000..ca227b7
--- /dev/null
+++ b/lib/Target/X86/X86MCAsmInfo.h
@@ -0,0 +1,43 @@
+//=====-- X86MCAsmInfo.h - X86 asm properties -----------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the X86MCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETASMINFO_H
+#define X86TARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
+
+namespace llvm {
+  class Triple;
+
+  /// X86MCAsmInfoDarwin - Asm properties for Darwin (Mach-O) x86 targets.
+  struct X86MCAsmInfoDarwin : public MCAsmInfoDarwin {
+    explicit X86MCAsmInfoDarwin(const Triple &Triple);
+  };
+
+  /// X86ELFMCAsmInfo - Asm properties for ELF x86 targets; also provides the
+  /// .note.GNU-stack section used to request a non-executable stack.
+  struct X86ELFMCAsmInfo : public MCAsmInfo {
+    explicit X86ELFMCAsmInfo(const Triple &Triple);
+    virtual MCSection *getNonexecutableStackSection(MCContext &Ctx) const;
+  };
+
+  /// X86MCAsmInfoCOFF - Asm properties for COFF targets using GAS syntax.
+  struct X86MCAsmInfoCOFF : public MCAsmInfoCOFF {
+    explicit X86MCAsmInfoCOFF(const Triple &Triple);
+  };
+
+  /// X86WinMCAsmInfo - Asm properties for Windows targets using MASM syntax.
+  struct X86WinMCAsmInfo : public MCAsmInfo {
+    explicit X86WinMCAsmInfo(const Triple &Triple);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86MCCodeEmitter.cpp b/lib/Target/X86/X86MCCodeEmitter.cpp
new file mode 100644
index 0000000..764c87a
--- /dev/null
+++ b/lib/Target/X86/X86MCCodeEmitter.cpp
@@ -0,0 +1,675 @@
+//===-- X86/X86MCCodeEmitter.cpp - Convert X86 code to machine code -------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86MCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86-emitter"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// FIXME: This should move to a header.
+namespace llvm {
+namespace X86 {
+// Target-specific fixup kinds produced by the code emitter.  The order here
+// must match the Infos table in X86MCCodeEmitter::getFixupKindInfo.
+// NOTE(review): these appear to mirror the X86::reloc_* JIT relocation types
+// used by X86JITInfo::relocate -- confirm the two stay in sync.
+enum Fixups {
+  reloc_pcrel_word = FirstTargetFixupKind,
+  reloc_picrel_word,
+  reloc_absolute_word,
+  reloc_absolute_word_sext,
+  reloc_absolute_dword
+};
+}
+}
+
+namespace {
+/// X86MCCodeEmitter - MCCodeEmitter implementation that lowers MCInsts to raw
+/// x86 machine-code bytes, recording MCFixups for operands that cannot be
+/// resolved at encode time.  A single instance handles either 32-bit or
+/// 64-bit mode, selected at construction.
+class X86MCCodeEmitter : public MCCodeEmitter {
+  X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  bool Is64BitMode;
+public:
+  X86MCCodeEmitter(TargetMachine &tm, bool is64Bit) 
+    : TM(tm), TII(*TM.getInstrInfo()) {
+    Is64BitMode = is64Bit;
+  }
+
+  ~X86MCCodeEmitter() {}
+
+  /// getNumFixupKinds - Number of entries in the X86::Fixups enumeration.
+  unsigned getNumFixupKinds() const {
+    return 5;
+  }
+
+  /// getFixupKindInfo - Return the descriptor for a target-specific fixup
+  /// kind.  Entries must stay in the same order as the X86::Fixups enum.
+  MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    static MCFixupKindInfo Infos[] = {
+      { "reloc_pcrel_word", 0, 4 * 8 },
+      { "reloc_picrel_word", 0, 4 * 8 },
+      { "reloc_absolute_word", 0, 4 * 8 },
+      { "reloc_absolute_word_sext", 0, 4 * 8 },
+      { "reloc_absolute_dword", 0, 8 * 8 }
+    };
+
+    // Bound the index by the number of kinds we actually define, not by
+    // MaxTargetFixupKind: the latter is a loose limit and would let an
+    // out-of-range kind read past the end of Infos.
+    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+           "Invalid kind!");
+    return Infos[Kind - FirstTargetFixupKind];
+  }
+  
+  static unsigned GetX86RegNum(const MCOperand &MO) {
+    return X86RegisterInfo::getX86RegNum(MO.getReg());
+  }
+  
+  /// EmitByte - Emit one byte to OS, tracking the running byte offset.
+  void EmitByte(unsigned char C, unsigned &CurByte, raw_ostream &OS) const {
+    OS << (char)C;
+    ++CurByte;
+  }
+  
+  /// EmitConstant - Emit Size bytes of Val in little-endian order.
+  void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+                    raw_ostream &OS) const {
+    // Output the constant in little endian byte order.
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, CurByte, OS);
+      Val >>= 8;
+    }
+  }
+
+  void EmitDisplacementField(const MCOperand &Disp, int64_t Adj, bool IsPCRel,
+                             unsigned &CurByte, raw_ostream &OS,
+                             SmallVectorImpl<MCFixup> &Fixups) const;
+  
+  /// ModRMByte - Pack the mod/reg-opcode/r-m fields into a ModR/M byte.
+  inline static unsigned char ModRMByte(unsigned Mod, unsigned RegOpcode,
+                                        unsigned RM) {
+    assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+    return RM | (RegOpcode << 3) | (Mod << 6);
+  }
+  
+  /// EmitRegModRMByte - Emit a register-direct (mod=3) ModR/M byte.
+  void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+                        unsigned &CurByte, raw_ostream &OS) const {
+    EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+  }
+  
+  void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+                   unsigned &CurByte, raw_ostream &OS) const {
+    // SIB byte is in the same format as the ModRMByte.
+    EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+  }
+  
+  
+  void EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                        unsigned RegOpcodeField, intptr_t PCAdj,
+                        unsigned &CurByte, raw_ostream &OS,
+                        SmallVectorImpl<MCFixup> &Fixups) const;
+  
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                         SmallVectorImpl<MCFixup> &Fixups) const;
+  
+};
+
+} // end anonymous namespace
+
+
+/// createX86_32MCCodeEmitter - Factory used by the target registry to build
+/// a code emitter operating in 32-bit mode.
+MCCodeEmitter *llvm::createX86_32MCCodeEmitter(const Target &,
+                                               TargetMachine &TM) {
+  return new X86MCCodeEmitter(TM, false);
+}
+
+/// createX86_64MCCodeEmitter - Factory used by the target registry to build
+/// a code emitter operating in 64-bit mode.
+MCCodeEmitter *llvm::createX86_64MCCodeEmitter(const Target &,
+                                               TargetMachine &TM) {
+  return new X86MCCodeEmitter(TM, true);
+}
+
+
+/// isDisp8 - Return true if this signed displacement fits in a 8-bit 
+/// sign-extended field, i.e. is in [-128, 127], so a disp8 encoding can be
+/// used instead of disp32.
+static bool isDisp8(int Value) {
+  return Value == (signed char)Value;
+}
+
+/// EmitDisplacementField - Emit the 4-byte displacement of a memory operand:
+/// either the immediate itself, or 4 zero bytes plus a fixup for a symbolic
+/// displacement.  Adj and IsPCRel are currently ignored -- every symbolic
+/// displacement is emitted as reloc_absolute_word (see the disabled code
+/// below for the intended kind selection).
+void X86MCCodeEmitter::
+EmitDisplacementField(const MCOperand &DispOp, int64_t Adj, bool IsPCRel,
+                      unsigned &CurByte, raw_ostream &OS,
+                      SmallVectorImpl<MCFixup> &Fixups) const {
+  // If this is a simple integer displacement that doesn't require a relocation,
+  // emit it now.
+  if (DispOp.isImm()) {
+    EmitConstant(DispOp.getImm(), 4, CurByte, OS);
+    return;
+  }
+
+#if 0
+  // Otherwise, this is something that requires a relocation.  Emit it as such
+  // now.
+  unsigned RelocType = Is64BitMode ?
+  (IsPCRel ? X86::reloc_pcrel_word : X86::reloc_absolute_word_sext)
+  : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+#endif
+  
+  // Emit a symbolic constant as a fixup and 4 zeros.
+  Fixups.push_back(MCFixup::Create(CurByte, DispOp.getExpr(),
+                                   MCFixupKind(X86::reloc_absolute_word)));
+  EmitConstant(0, 4, CurByte, OS);
+}
+
+
+/// EmitMemModRMByte - Emit the ModR/M byte -- and, when required, the SIB
+/// byte and displacement -- for a memory operand.  Op is the index of the
+/// memory operand's first sub-operand; the layout is Op+0 = base register,
+/// Op+1 = scale, Op+2 = index register, Op+3 = displacement.  RegOpcodeField
+/// fills the reg field of the ModR/M byte.  PCAdj is forwarded to
+/// EmitDisplacementField (which currently ignores it).
+void X86MCCodeEmitter::EmitMemModRMByte(const MCInst &MI, unsigned Op,
+                                        unsigned RegOpcodeField,
+                                        intptr_t PCAdj,
+                                        unsigned &CurByte,
+                                        raw_ostream &OS,
+                                        SmallVectorImpl<MCFixup> &Fixups) const{
+  const MCOperand &Disp     = MI.getOperand(Op+3);
+  const MCOperand &Base     = MI.getOperand(Op);
+  const MCOperand &Scale    = MI.getOperand(Op+1);
+  const MCOperand &IndexReg = MI.getOperand(Op+2);
+  unsigned BaseReg = Base.getReg();
+
+  // FIXME: Eliminate!
+  bool IsPCRel = false;
+    
+  // Determine whether a SIB byte is needed.
+  // If no BaseReg, issue a RIP relative instruction only if the MCE can 
+  // resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
+  // 2-7) and absolute references.
+  if (// The SIB byte must be used if there is an index register.
+      IndexReg.getReg() == 0 && 
+      // The SIB byte must be used if the base is ESP/RSP.
+      BaseReg != X86::ESP && BaseReg != X86::RSP &&
+      // If there is no base register and we're in 64-bit mode, we need a SIB
+      // byte to emit an addr that is just 'disp32' (the non-RIP relative form).
+      (!Is64BitMode || BaseReg != 0)) {
+
+    if (BaseReg == 0 ||          // [disp32]     in X86-32 mode
+        BaseReg == X86::RIP) {   // [disp32+RIP] in X86-64 mode
+      EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+      EmitDisplacementField(Disp, PCAdj, true, CurByte, OS, Fixups);
+      return;
+    }
+    
+    unsigned BaseRegNo = GetX86RegNum(Base);
+
+    // If the base is not EBP/ESP and there is no displacement, use simple
+    // indirect register encoding, this handles addresses like [EAX].  The
+    // encoding for [EBP] with no displacement means [disp32] so we handle it
+    // by emitting a displacement of 0 below.
+    if (Disp.isImm() && Disp.getImm() == 0 && BaseRegNo != N86::EBP) {
+      EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+      return;
+    }
+    
+    // Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
+    if (Disp.isImm() && isDisp8(Disp.getImm())) {
+      EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+      EmitConstant(Disp.getImm(), 1, CurByte, OS);
+      return;
+    }
+    
+    // Otherwise, emit the most general non-SIB encoding: [REG+disp32]
+    EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+    EmitDisplacementField(Disp, PCAdj, IsPCRel, CurByte, OS, Fixups);
+    return;
+  }
+    
+  // We need a SIB byte, so start by outputting the ModR/M byte first
+  assert(IndexReg.getReg() != X86::ESP &&
+         IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+  
+  bool ForceDisp32 = false;
+  bool ForceDisp8  = false;
+  if (BaseReg == 0) {
+    // If there is no base register, we emit the special case SIB byte with
+    // MOD=0, BASE=5, to JUST get the index, scale, and displacement.
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (!Disp.isImm()) {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp32 = true;
+  } else if (Disp.getImm() == 0 && BaseReg != X86::EBP) {
+    // Emit no displacement ModR/M byte
+    EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+  } else if (isDisp8(Disp.getImm())) {
+    // Emit the disp8 encoding.
+    EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
+    ForceDisp8 = true;           // Make sure to force 8 bit disp if Base=EBP
+  } else {
+    // Emit the normal disp32 encoding.
+    EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+  }
+  
+  // Calculate what the SS field value should be...
+  // (Only scales 1/2/4/8 are legal; the ~0 entries trip the ModRMByte assert
+  // if an invalid scale ever reaches here.)
+  static const unsigned SSTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 };
+  unsigned SS = SSTable[Scale.getImm()];
+  
+  if (BaseReg == 0) {
+    // Handle the SIB byte for the case where there is no base, see Intel 
+    // Manual 2A, table 2-7. The displacement has already been output.
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
+      IndexRegNo = 4;
+    EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+  } else {
+    unsigned IndexRegNo;
+    if (IndexReg.getReg())
+      IndexRegNo = GetX86RegNum(IndexReg);
+    else
+      IndexRegNo = 4;   // For example [ESP+1*<noreg>+4]
+    EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+  }
+  
+  // Do we need to output a displacement?
+  if (ForceDisp8)
+    EmitConstant(Disp.getImm(), 1, CurByte, OS);
+  else if (ForceDisp32 || Disp.getImm() != 0)
+    EmitDisplacementField(Disp, PCAdj, IsPCRel, CurByte, OS, Fixups);
+}
+
+/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
+/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
+/// size, and 3) use of X86-64 extended registers.
+///
+/// Returns the low nibble of the REX byte (W/R/X/B bits, plus 0x40 when only
+/// the bare prefix is needed for SPL/BPL/SIL/DIL); the caller ORs in 0x40 and
+/// emits the byte if the result is non-zero.
+static unsigned DetermineREXPrefix(const MCInst &MI, unsigned TSFlags,
+                                   const TargetInstrDesc &Desc) {
+  unsigned REX = 0;
+  
+  // Pseudo instructions do not need REX prefix byte.
+  if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+    return 0;
+  if (TSFlags & X86II::REX_W)
+    REX |= 1 << 3;                      // REX.W: 64-bit operand size.
+  
+  if (MI.getNumOperands() == 0) return REX;
+  
+  unsigned NumOps = MI.getNumOperands();
+  // FIXME: MCInst should explicitize the two-addrness.
+  bool isTwoAddr = NumOps > 1 &&
+                      Desc.getOperandConstraint(1, TOI::TIED_TO) != -1;
+  
+  // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+  unsigned i = isTwoAddr ? 1 : 0;
+  for (; i != NumOps; ++i) {
+    const MCOperand &MO = MI.getOperand(i);
+    if (!MO.isReg()) continue;
+    unsigned Reg = MO.getReg();
+    if (!X86InstrInfo::isX86_64NonExtLowByteReg(Reg)) continue;
+    // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+    // that returns non-zero.
+    REX |= 0x40;
+    break;
+  }
+  
+  // Set REX.R/REX.X/REX.B according to which operand slots hold extended
+  // (r8-r15 etc.) registers; which bit an operand maps to depends on the
+  // instruction form.
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg: assert(0 && "FIXME: Remove this!");
+  case X86II::MRMSrcReg:
+    if (MI.getOperand(0).isReg() &&
+        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2;
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 0;
+    }
+    break;
+  case X86II::MRMSrcMem: {
+    if (MI.getOperand(0).isReg() &&
+        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 2;
+    unsigned Bit = 0;
+    i = isTwoAddr ? 2 : 1;
+    for (; i != NumOps; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit;
+        Bit++;
+      }
+    }
+    break;
+  }
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m:
+  case X86II::MRMDestMem: {
+    unsigned e = (isTwoAddr ? X86AddrNumOperands+1 : X86AddrNumOperands);
+    i = isTwoAddr ? 1 : 0;
+    if (NumOps > e && MI.getOperand(e).isReg() &&
+        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(e).getReg()))
+      REX |= 1 << 2;
+    unsigned Bit = 0;
+    for (; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg()) {
+        if (X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+          REX |= 1 << Bit;
+        Bit++;
+      }
+    }
+    break;
+  }
+  default:
+    if (MI.getOperand(0).isReg() &&
+        X86InstrInfo::isX86_64ExtendedReg(MI.getOperand(0).getReg()))
+      REX |= 1 << 0;
+    i = isTwoAddr ? 2 : 1;
+    for (unsigned e = NumOps; i != e; ++i) {
+      const MCOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && X86InstrInfo::isX86_64ExtendedReg(MO.getReg()))
+        REX |= 1 << 2;
+    }
+    break;
+  }
+  return REX;
+}
+
+/// EncodeInstruction - Emit the machine-code bytes of MI to OS, appending
+/// fixups for unresolved operands to Fixups.  Emission order: legacy
+/// prefixes (lock, segment override, rep, operand-size, address-size), any
+/// required opcode prefixes, the REX prefix (64-bit mode only), the 0x0F
+/// escape and secondary escapes, then opcode + ModR/M/SIB/displacement and
+/// trailing immediate, dispatched on the instruction's TSFlags form.
+void X86MCCodeEmitter::
+EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                  SmallVectorImpl<MCFixup> &Fixups) const {
+  unsigned Opcode = MI.getOpcode();
+  const TargetInstrDesc &Desc = TII.get(Opcode);
+  unsigned TSFlags = Desc.TSFlags;
+
+  // Keep track of the current byte being emitted.
+  unsigned CurByte = 0;
+  
+  // FIXME: We should emit the prefixes in exactly the same order as GAS does,
+  // in order to provide diffability.
+
+  // Emit the lock opcode prefix as needed.
+  if (TSFlags & X86II::LOCK)
+    EmitByte(0xF0, CurByte, OS);
+  
+  // Emit segment override opcode prefix as needed.
+  switch (TSFlags & X86II::SegOvrMask) {
+  default: assert(0 && "Invalid segment!");
+  case 0: break;  // No segment override!
+  case X86II::FS:
+    EmitByte(0x64, CurByte, OS);
+    break;
+  case X86II::GS:
+    EmitByte(0x65, CurByte, OS);
+    break;
+  }
+  
+  // Emit the repeat opcode prefix as needed.
+  if ((TSFlags & X86II::Op0Mask) == X86II::REP)
+    EmitByte(0xF3, CurByte, OS);
+  
+  // Emit the operand size opcode prefix as needed.
+  if (TSFlags & X86II::OpSize)
+    EmitByte(0x66, CurByte, OS);
+  
+  // Emit the address size opcode prefix as needed.
+  if (TSFlags & X86II::AdSize)
+    EmitByte(0x67, CurByte, OS);
+  
+  // Emit any mandatory opcode prefix bytes and figure out whether the 0x0F
+  // escape will be needed (it must come after REX, so it is deferred).
+  bool Need0FPrefix = false;
+  switch (TSFlags & X86II::Op0Mask) {
+  default: assert(0 && "Invalid prefix!");
+  case 0: break;  // No prefix!
+  case X86II::REP: break; // already handled.
+  case X86II::TB:  // Two-byte opcode prefix
+  case X86II::T8:  // 0F 38
+  case X86II::TA:  // 0F 3A
+    Need0FPrefix = true;
+    break;
+  case X86II::TF: // F2 0F 38
+    EmitByte(0xF2, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::XS:   // F3 0F
+    EmitByte(0xF3, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::XD:   // F2 0F
+    EmitByte(0xF2, CurByte, OS);
+    Need0FPrefix = true;
+    break;
+  case X86II::D8: EmitByte(0xD8, CurByte, OS); break;
+  case X86II::D9: EmitByte(0xD9, CurByte, OS); break;
+  case X86II::DA: EmitByte(0xDA, CurByte, OS); break;
+  case X86II::DB: EmitByte(0xDB, CurByte, OS); break;
+  case X86II::DC: EmitByte(0xDC, CurByte, OS); break;
+  case X86II::DD: EmitByte(0xDD, CurByte, OS); break;
+  case X86II::DE: EmitByte(0xDE, CurByte, OS); break;
+  case X86II::DF: EmitByte(0xDF, CurByte, OS); break;
+  }
+  
+  // Handle REX prefix.
+  // FIXME: Can this come before F2 etc to simplify emission?
+  if (Is64BitMode) {
+    if (unsigned REX = DetermineREXPrefix(MI, TSFlags, Desc))
+      EmitByte(0x40 | REX, CurByte, OS);
+  }
+  
+  // 0x0F escape code must be emitted just before the opcode.
+  if (Need0FPrefix)
+    EmitByte(0x0F, CurByte, OS);
+  
+  // FIXME: Pull this up into previous switch if REX can be moved earlier.
+  switch (TSFlags & X86II::Op0Mask) {
+  case X86II::TF:    // F2 0F 38
+  case X86II::T8:    // 0F 38
+    EmitByte(0x38, CurByte, OS);
+    break;
+  case X86II::TA:    // 0F 3A
+    EmitByte(0x3A, CurByte, OS);
+    break;
+  }
+  
+  // If this is a two-address instruction, skip one of the register operands.
+  unsigned NumOps = Desc.getNumOperands();
+  unsigned CurOp = 0;
+  if (NumOps > 1 && Desc.getOperandConstraint(1, TOI::TIED_TO) != -1)
+    ++CurOp;
+  else if (NumOps > 2 && Desc.getOperandConstraint(NumOps-1, TOI::TIED_TO)== 0)
+    // Skip the last source operand that is tied_to the dest reg. e.g. LXADD32
+    --NumOps;
+  
+  unsigned char BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
+  switch (TSFlags & X86II::FormMask) {
+  case X86II::MRMInitReg:
+    assert(0 && "FIXME: Remove this form when the JIT moves to MCCodeEmitter!");
+  default: errs() << "FORM: " << (TSFlags & X86II::FormMask) << "\n";
+      assert(0 && "Unknown FormMask value in X86MCCodeEmitter!");
+  case X86II::RawFrm: {
+    EmitByte(BaseOpcode, CurByte, OS);
+    
+    if (CurOp == NumOps)
+      break;
+    
+    // Symbolic operands on raw-form instructions are not handled yet.
+    assert(0 && "Unimpl RawFrm expr");
+    break;
+  }
+      
+  case X86II::AddRegFrm: {
+    // Register is encoded directly into the low bits of the opcode byte.
+    EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+    if (CurOp == NumOps)
+      break;
+
+    const MCOperand &MO1 = MI.getOperand(CurOp++);
+    if (MO1.isImm()) {
+      unsigned Size = X86II::getSizeOfImm(TSFlags);
+      EmitConstant(MO1.getImm(), Size, CurByte, OS);
+      break;
+    }
+
+    assert(0 && "Unimpl AddRegFrm expr");
+    break;
+  }
+      
+  case X86II::MRMDestReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitRegModRMByte(MI.getOperand(CurOp),
+                     GetX86RegNum(MI.getOperand(CurOp+1)), CurByte, OS);
+    CurOp += 2;
+    if (CurOp != NumOps)
+      EmitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(TSFlags), CurByte, OS);
+    break;
+  
+  case X86II::MRMDestMem:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitMemModRMByte(MI, CurOp,
+                     GetX86RegNum(MI.getOperand(CurOp + X86AddrNumOperands)),
+                     0, CurByte, OS, Fixups);
+    CurOp += X86AddrNumOperands + 1;
+    if (CurOp != NumOps)
+      EmitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(TSFlags), CurByte, OS);
+    break;
+      
+  case X86II::MRMSrcReg:
+    EmitByte(BaseOpcode, CurByte, OS);
+    EmitRegModRMByte(MI.getOperand(CurOp+1), GetX86RegNum(MI.getOperand(CurOp)),
+                     CurByte, OS);
+    CurOp += 2;
+    if (CurOp != NumOps)
+      EmitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(TSFlags), CurByte, OS);
+    break;
+    
+  case X86II::MRMSrcMem: {
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    // FIXME: Maybe lea should have its own form?  This is a horrible hack.
+    int AddrOperands;
+    if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r ||
+        Opcode == X86::LEA16r || Opcode == X86::LEA32r)
+      AddrOperands = X86AddrNumOperands - 1; // No segment register
+    else
+      AddrOperands = X86AddrNumOperands;
+    
+    // FIXME: What is this actually doing?
+    intptr_t PCAdj = (CurOp + AddrOperands + 1 != NumOps) ?
+       X86II::getSizeOfImm(TSFlags) : 0;
+    
+    EmitMemModRMByte(MI, CurOp+1, GetX86RegNum(MI.getOperand(CurOp)),
+                     PCAdj, CurByte, OS, Fixups);
+    CurOp += AddrOperands + 1;
+    if (CurOp != NumOps)
+      EmitConstant(MI.getOperand(CurOp++).getImm(),
+                   X86II::getSizeOfImm(TSFlags), CurByte, OS);
+    break;
+  }
+
+  case X86II::MRM0r: case X86II::MRM1r:
+  case X86II::MRM2r: case X86II::MRM3r:
+  case X86II::MRM4r: case X86II::MRM5r:
+  case X86II::MRM6r: case X86II::MRM7r: {
+    EmitByte(BaseOpcode, CurByte, OS);
+
+    // Special handling of lfence, mfence, monitor, and mwait.
+    // FIXME: This is terrible, they should get proper encoding bits in TSFlags.
+    if (Opcode == X86::LFENCE || Opcode == X86::MFENCE ||
+        Opcode == X86::MONITOR || Opcode == X86::MWAIT) {
+      EmitByte(ModRMByte(3, (TSFlags & X86II::FormMask)-X86II::MRM0r, 0),
+               CurByte, OS);
+
+      switch (Opcode) {
+      default: break;
+      case X86::MONITOR: EmitByte(0xC8, CurByte, OS); break;
+      case X86::MWAIT:   EmitByte(0xC9, CurByte, OS); break;
+      }
+    } else {
+      // The form number (MRM0r..MRM7r) supplies the reg field of ModR/M.
+      EmitRegModRMByte(MI.getOperand(CurOp++),
+                       (TSFlags & X86II::FormMask)-X86II::MRM0r,
+                       CurByte, OS);
+    }
+
+    if (CurOp == NumOps)
+      break;
+    
+    const MCOperand &MO1 = MI.getOperand(CurOp++);
+    if (MO1.isImm()) {
+      EmitConstant(MO1.getImm(), X86II::getSizeOfImm(TSFlags), CurByte, OS);
+      break;
+    }
+
+    assert(0 && "relo unimpl");
+#if 0
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+      : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64ri32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO1.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO1, TM);
+      emitGlobalAddress(MO1.getGlobal(), rt, MO1.getOffset(), 0,
+                        Indirect);
+    } else if (MO1.isSymbol())
+      emitExternalSymbolAddress(MO1.getSymbolName(), rt);
+    else if (MO1.isCPI())
+      emitConstPoolAddress(MO1.getIndex(), rt);
+    else if (MO1.isJTI())
+      emitJumpTableAddress(MO1.getIndex(), rt);
+    break;
+#endif
+  }
+  case X86II::MRM0m: case X86II::MRM1m:
+  case X86II::MRM2m: case X86II::MRM3m:
+  case X86II::MRM4m: case X86II::MRM5m:
+  case X86II::MRM6m: case X86II::MRM7m: {
+    intptr_t PCAdj = 0;
+    if (CurOp + X86AddrNumOperands != NumOps) {
+      if (MI.getOperand(CurOp+X86AddrNumOperands).isImm())
+        PCAdj = X86II::getSizeOfImm(TSFlags);
+      else
+        PCAdj = 4;
+    }
+
+    EmitByte(BaseOpcode, CurByte, OS);
+    // The form number (MRM0m..MRM7m) supplies the reg field of ModR/M.
+    EmitMemModRMByte(MI, CurOp, (TSFlags & X86II::FormMask)-X86II::MRM0m,
+                     PCAdj, CurByte, OS, Fixups);
+    CurOp += X86AddrNumOperands;
+    
+    if (CurOp == NumOps)
+      break;
+    
+    const MCOperand &MO = MI.getOperand(CurOp++);
+    if (MO.isImm()) {
+      EmitConstant(MO.getImm(), X86II::getSizeOfImm(TSFlags), CurByte, OS);
+      break;
+    }
+    
+    assert(0 && "relo not handled");
+#if 0
+    unsigned rt = Is64BitMode ? X86::reloc_pcrel_word
+    : (IsPIC ? X86::reloc_picrel_word : X86::reloc_absolute_word);
+    if (Opcode == X86::MOV64mi32)
+      rt = X86::reloc_absolute_word_sext;  // FIXME: add X86II flag?
+    if (MO.isGlobal()) {
+      bool Indirect = gvNeedsNonLazyPtr(MO, TM);
+      emitGlobalAddress(MO.getGlobal(), rt, MO.getOffset(), 0,
+                        Indirect);
+    } else if (MO.isSymbol())
+      emitExternalSymbolAddress(MO.getSymbolName(), rt);
+    else if (MO.isCPI())
+      emitConstPoolAddress(MO.getIndex(), rt);
+    else if (MO.isJTI())
+      emitJumpTableAddress(MO.getIndex(), rt);
+#endif
+    break;
+  }
+  }
+  
+#ifndef NDEBUG
+  // FIXME: Verify.
+  if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
+    errs() << "Cannot encode all operands of: ";
+    MI.dump();
+    errs() << '\n';
+    abort();
+  }
+#endif
+}
diff --git a/lib/Target/X86/X86MCTargetExpr.cpp b/lib/Target/X86/X86MCTargetExpr.cpp
new file mode 100644
index 0000000..17b4fe8
--- /dev/null
+++ b/lib/Target/X86/X86MCTargetExpr.cpp
@@ -0,0 +1,48 @@
+//===- X86MCTargetExpr.cpp - X86 Target Specific MCExpr Implementation ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+/// Create - Construct a new X86MCTargetExpr referencing symbol Sym with
+/// variant modifier K.  The node is placement-allocated inside the MCContext,
+/// which owns its memory; callers never delete the returned node.
+X86MCTargetExpr *X86MCTargetExpr::Create(const MCSymbol *Sym, VariantKind K,
+                                         MCContext &Ctx) {
+  return new (Ctx) X86MCTargetExpr(Sym, K);
+}
+
+/// PrintImpl - Print the expression as "symbol@MODIFIER" (e.g. "foo@GOT").
+/// The Invalid kind prints a deliberately unparsable "@<invalid>" marker so
+/// bad nodes are visible in assembly dumps rather than silently wrong.
+void X86MCTargetExpr::PrintImpl(raw_ostream &OS) const {
+  OS << *Sym;
+  
+  switch (Kind) {
+  case Invalid:   OS << "@<invalid>"; break;
+  case GOT:       OS << "@GOT"; break;
+  case GOTOFF:    OS << "@GOTOFF"; break;
+  case GOTPCREL:  OS << "@GOTPCREL"; break;
+  case GOTTPOFF:  OS << "@GOTTPOFF"; break;
+  case INDNTPOFF: OS << "@INDNTPOFF"; break;
+  case NTPOFF:    OS << "@NTPOFF"; break;
+  case PLT:       OS << "@PLT"; break;
+  case TLSGD:     OS << "@TLSGD"; break;
+  case TPOFF:     OS << "@TPOFF"; break;
+  }
+}
+
+/// EvaluateAsRelocatableImpl - Resolve this expression to a relocatable
+/// MCValue.  Symbol variables are chased through their definitions; otherwise
+/// the result is just the bare symbol with zero offset.  Note that the
+/// variant Kind is NOT encoded into the resulting MCValue here.
+bool X86MCTargetExpr::EvaluateAsRelocatableImpl(MCValue &Res) const {
+  // FIXME: I don't know if this is right, it followed MCSymbolRefExpr.
+  
+  // Evaluate recursively if this is a variable.
+  if (Sym->isVariable())
+    return Sym->getValue()->EvaluateAsRelocatable(Res);
+  
+  Res = MCValue::get(Sym, 0, 0);
+  return true;
+}
diff --git a/lib/Target/X86/X86MCTargetExpr.h b/lib/Target/X86/X86MCTargetExpr.h
new file mode 100644
index 0000000..7de8a5c
--- /dev/null
+++ b/lib/Target/X86/X86MCTargetExpr.h
@@ -0,0 +1,49 @@
+//===- X86MCTargetExpr.h - X86 Target Specific MCExpr -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86_MCTARGETEXPR_H
+#define X86_MCTARGETEXPR_H
+
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+/// X86MCTargetExpr - This class represents symbol variants, like foo@GOT.
+///
+/// It pairs an MCSymbol with a VariantKind modifier selecting which
+/// relocation flavor of the symbol is referenced.  Nodes are created via the
+/// Create factory and are owned by the MCContext they are allocated in.
+class X86MCTargetExpr : public MCTargetExpr {
+public:
+  /// VariantKind - Relocation modifiers for X86, printed as an '@SUFFIX'
+  /// after the symbol name (GOT/PLT/TLS variants).
+  enum VariantKind {
+    Invalid,
+    GOT,
+    GOTOFF,
+    GOTPCREL,
+    GOTTPOFF,
+    INDNTPOFF,
+    NTPOFF,
+    PLT,
+    TLSGD,
+    TPOFF
+  };
+private:
+  /// Sym - The symbol being referenced.
+  const MCSymbol * const Sym;
+  /// Kind - The modifier.
+  const VariantKind Kind;
+  
+  // Private: construct through Create so the node lives in an MCContext.
+  X86MCTargetExpr(const MCSymbol *S, VariantKind K) : Sym(S), Kind(K) {}
+public:
+  /// Create - Allocate a new node in Ctx referencing Sym with modifier K.
+  static X86MCTargetExpr *Create(const MCSymbol *Sym, VariantKind K,
+                                 MCContext &Ctx);
+  
+  // MCTargetExpr interface.
+  void PrintImpl(raw_ostream &OS) const;
+  bool EvaluateAsRelocatableImpl(MCValue &Res) const;
+};
+  
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/X86/X86MachineFunctionInfo.h b/lib/Target/X86/X86MachineFunctionInfo.h
new file mode 100644
index 0000000..bb53bf1
--- /dev/null
+++ b/lib/Target/X86/X86MachineFunctionInfo.h
@@ -0,0 +1,112 @@
+//====- X86MachineFunctionInfo.h - X86 machine function info ----*- C++ -*-===//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file declares X86-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86MACHINEFUNCTIONINFO_H
+#define X86MACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+/// NameDecorationStyle - Which (if any) Win32 symbol-name decoration scheme
+/// applies to a function (used for stdcall/fastcall name mangling).
+enum NameDecorationStyle {
+  None,
+  StdCall,
+  FastCall
+};
+  
+/// X86MachineFunctionInfo - This class is derived from MachineFunction and
+/// contains private X86 target-specific information for each MachineFunction.
+class X86MachineFunctionInfo : public MachineFunctionInfo {
+  /// ForceFramePointer - True if the function is required to use of frame
+  /// pointer for reasons other than it containing dynamic allocation or 
+  /// that FP eliminatation is turned off. For example, Cygwin main function
+  /// contains stack pointer re-alignment code which requires FP.
+  bool ForceFramePointer;
+
+  /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
+  /// stack frame in bytes.
+  unsigned CalleeSavedFrameSize;
+
+  /// BytesToPopOnReturn - Number of bytes function pops on return.
+  /// Used on windows platform for stdcall & fastcall name decoration
+  unsigned BytesToPopOnReturn;
+
+  /// DecorationStyle - If the function requires additional name decoration,
+  /// DecorationStyle holds the right way to do so.
+  NameDecorationStyle DecorationStyle;
+
+  /// ReturnAddrIndex - FrameIndex for return slot.
+  int ReturnAddrIndex;
+
+  /// TailCallReturnAddrDelta - The number of bytes by which return address
+  /// stack slot is moved as the result of tail call optimization.
+  int TailCallReturnAddrDelta;
+
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
+  /// GlobalBaseReg - keeps track of the virtual register initialized for
+  /// use as the global base register. This is used for PIC in some PIC
+  /// relocation models.
+  unsigned GlobalBaseReg;
+
+public:
+  // Default constructor: all fields zeroed/cleared.
+  X86MachineFunctionInfo() : ForceFramePointer(false),
+                             CalleeSavedFrameSize(0),
+                             BytesToPopOnReturn(0),
+                             DecorationStyle(None),
+                             ReturnAddrIndex(0),
+                             TailCallReturnAddrDelta(0),
+                             SRetReturnReg(0),
+                             GlobalBaseReg(0) {}
+  
+  // Per-function constructor required by MachineFunction::getInfo; the MF
+  // argument is currently unused and the defaults match the default ctor.
+  explicit X86MachineFunctionInfo(MachineFunction &MF)
+    : ForceFramePointer(false),
+      CalleeSavedFrameSize(0),
+      BytesToPopOnReturn(0),
+      DecorationStyle(None),
+      ReturnAddrIndex(0),
+      TailCallReturnAddrDelta(0),
+      SRetReturnReg(0),
+      GlobalBaseReg(0) {}
+  
+  // Trivial accessors for the fields documented above.
+  bool getForceFramePointer() const { return ForceFramePointer;} 
+  void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
+
+  unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
+  void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
+
+  unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
+  void setBytesToPopOnReturn (unsigned bytes) { BytesToPopOnReturn = bytes;}
+
+  NameDecorationStyle getDecorationStyle() const { return DecorationStyle; }
+  void setDecorationStyle(NameDecorationStyle style) { DecorationStyle = style;}
+
+  int getRAIndex() const { return ReturnAddrIndex; }
+  void setRAIndex(int Index) { ReturnAddrIndex = Index; }
+
+  int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
+  void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
+
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
+  unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
+  void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
new file mode 100644
index 0000000..081c6d9
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -0,0 +1,1476 @@
+//===- X86RegisterInfo.cpp - X86 Register Information -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+// This file is responsible for the frame pointer elimination optimization
+// on X86.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86RegisterInfo.h"
+#include "X86InstrBuilder.h"
+#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+
+/// X86RegisterInfo constructor - Select the 32- vs. 64-bit call-frame pseudo
+/// opcodes for the base class, then cache frequently used subtarget facts
+/// (pointer slot size, stack/frame pointer registers, stack alignment).
+X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
+                                 const TargetInstrInfo &tii)
+  : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() ?
+                         X86::ADJCALLSTACKDOWN64 :
+                         X86::ADJCALLSTACKDOWN32,
+                       tm.getSubtarget<X86Subtarget>().is64Bit() ?
+                         X86::ADJCALLSTACKUP64 :
+                         X86::ADJCALLSTACKUP32),
+    TM(tm), TII(tii) {
+  // Cache some information.
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  Is64Bit = Subtarget->is64Bit();
+  IsWin64 = Subtarget->isTargetWin64();
+  StackAlign = TM.getFrameInfo()->getStackAlignment();
+
+  // Pointer-sized stack slot and the SP/FP registers for this mode.
+  if (Is64Bit) {
+    SlotSize = 8;
+    StackPtr = X86::RSP;
+    FramePtr = X86::RBP;
+  } else {
+    SlotSize = 4;
+    StackPtr = X86::ESP;
+    FramePtr = X86::EBP;
+  }
+}
+
+/// getDwarfRegNum - This function maps LLVM register identifiers to the DWARF
+/// specific numbering, used in debug info and exception tables.
+int X86RegisterInfo::getDwarfRegNum(unsigned RegNo, bool isEH) const {
+  const X86Subtarget *Subtarget = &TM.getSubtarget<X86Subtarget>();
+  // 64-bit targets all share one DWARF numbering; 32-bit targets differ.
+  unsigned Flavour = DWARFFlavour::X86_64;
+
+  if (!Subtarget->is64Bit()) {
+    if (Subtarget->isTargetDarwin()) {
+      // Darwin/x86-32 uses a distinct numbering for EH tables only.
+      if (isEH)
+        Flavour = DWARFFlavour::X86_32_DarwinEH;
+      else
+        Flavour = DWARFFlavour::X86_32_Generic;
+    } else if (Subtarget->isTargetCygMing()) {
+      // Unsupported by now, just quick fallback
+      Flavour = DWARFFlavour::X86_32_Generic;
+    } else {
+      Flavour = DWARFFlavour::X86_32_Generic;
+    }
+  }
+
+  // Delegate to the TableGen'd flavour-aware mapping.
+  return X86GenRegisterInfo::getDwarfRegNumFull(RegNo, Flavour);
+}
+
+/// getX86RegNum - This function maps LLVM register identifiers to their X86
+/// specific numbering, which is used in various places encoding instructions.
+/// The returned 3-bit value is what goes into ModRM/SIB fields; for the
+/// extended (R8-R15 / XMM8-XMM15) registers the REX prefix bit is handled by
+/// the caller, so those map onto the same 0-7 range.
+unsigned X86RegisterInfo::getX86RegNum(unsigned RegNo) {
+  switch(RegNo) {
+  case X86::RAX: case X86::EAX: case X86::AX: case X86::AL: return N86::EAX;
+  case X86::RCX: case X86::ECX: case X86::CX: case X86::CL: return N86::ECX;
+  case X86::RDX: case X86::EDX: case X86::DX: case X86::DL: return N86::EDX;
+  case X86::RBX: case X86::EBX: case X86::BX: case X86::BL: return N86::EBX;
+  // The high-byte registers AH/CH/DH/BH share encodings 4-7 with
+  // SPL/BPL/SIL/DIL (the two sets are distinguished by REX prefix absence).
+  case X86::RSP: case X86::ESP: case X86::SP: case X86::SPL: case X86::AH:
+    return N86::ESP;
+  case X86::RBP: case X86::EBP: case X86::BP: case X86::BPL: case X86::CH:
+    return N86::EBP;
+  case X86::RSI: case X86::ESI: case X86::SI: case X86::SIL: case X86::DH:
+    return N86::ESI;
+  case X86::RDI: case X86::EDI: case X86::DI: case X86::DIL: case X86::BH:
+    return N86::EDI;
+
+  case X86::R8:  case X86::R8D:  case X86::R8W:  case X86::R8B:
+    return N86::EAX;
+  case X86::R9:  case X86::R9D:  case X86::R9W:  case X86::R9B:
+    return N86::ECX;
+  case X86::R10: case X86::R10D: case X86::R10W: case X86::R10B:
+    return N86::EDX;
+  case X86::R11: case X86::R11D: case X86::R11W: case X86::R11B:
+    return N86::EBX;
+  case X86::R12: case X86::R12D: case X86::R12W: case X86::R12B:
+    return N86::ESP;
+  case X86::R13: case X86::R13D: case X86::R13W: case X86::R13B:
+    return N86::EBP;
+  case X86::R14: case X86::R14D: case X86::R14W: case X86::R14B:
+    return N86::ESI;
+  case X86::R15: case X86::R15D: case X86::R15W: case X86::R15B:
+    return N86::EDI;
+
+  // x87 stack registers encode as their stack position.
+  case X86::ST0: case X86::ST1: case X86::ST2: case X86::ST3:
+  case X86::ST4: case X86::ST5: case X86::ST6: case X86::ST7:
+    return RegNo-X86::ST0;
+
+  // XMM0-7 / XMM8-15 / MM0-7 map onto 0-7 (high XMMs via REX).
+  case X86::XMM0: case X86::XMM8: case X86::MM0:
+    return 0;
+  case X86::XMM1: case X86::XMM9: case X86::MM1:
+    return 1;
+  case X86::XMM2: case X86::XMM10: case X86::MM2:
+    return 2;
+  case X86::XMM3: case X86::XMM11: case X86::MM3:
+    return 3;
+  case X86::XMM4: case X86::XMM12: case X86::MM4:
+    return 4;
+  case X86::XMM5: case X86::XMM13: case X86::MM5:
+    return 5;
+  case X86::XMM6: case X86::XMM14: case X86::MM6:
+    return 6;
+  case X86::XMM7: case X86::XMM15: case X86::MM7:
+    return 7;
+
+  default:
+    // Only physical registers have an X86 encoding.
+    assert(isVirtualRegister(RegNo) && "Unknown physical register!");
+    llvm_unreachable("Register allocator hasn't allocated reg correctly yet!");
+    return 0;
+  }
+}
+
+/// getMatchingSuperRegClass - Return the largest subclass of A such that
+/// every register in it has a SubIdx sub-register belonging to class B, or
+/// null if no such constrained class exists.  SubIdx meanings on X86:
+/// 1 = low 8 bits, 2 = high 8 bits (AH/BH/CH/DH), 3 = low 16 bits,
+/// 4 = low 32 bits.  The result is always a subclass of A (possibly A
+/// itself) so values of the result class can be used wherever A is expected.
+const TargetRegisterClass *
+X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+                                          const TargetRegisterClass *B,
+                                          unsigned SubIdx) const {
+  switch (SubIdx) {
+  default: return 0;
+  case 1:
+    // 8-bit
+    if (B == &X86::GR8RegClass) {
+      // Any GR16/GR32/GR64 register has a GR8 low sub-register.
+      if (A->getSize() == 2 || A->getSize() == 4 || A->getSize() == 8)
+        return A;
+    } else if (B == &X86::GR8_ABCD_LRegClass || B == &X86::GR8_ABCD_HRegClass) {
+      // Only A/B/C/D have the AL..DL / AH..DH sub-registers.
+      if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+          A == &X86::GR64_NOREXRegClass ||
+          A == &X86::GR64_NOSPRegClass ||
+          A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_ABCDRegClass;
+      else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+               A == &X86::GR32_NOREXRegClass ||
+               A == &X86::GR32_NOSPRegClass)
+        return &X86::GR32_ABCDRegClass;
+      else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
+               A == &X86::GR16_NOREXRegClass)
+        return &X86::GR16_ABCDRegClass;
+    } else if (B == &X86::GR8_NOREXRegClass) {
+      // Constrain A to registers whose 8-bit part is encodable without REX.
+      if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+          A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_NOREXRegClass;
+      else if (A == &X86::GR64_ABCDRegClass)
+        return &X86::GR64_ABCDRegClass;
+      else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
+               A == &X86::GR32_NOSPRegClass)
+        return &X86::GR32_NOREXRegClass;
+      else if (A == &X86::GR32_ABCDRegClass)
+        return &X86::GR32_ABCDRegClass;
+      else if (A == &X86::GR16RegClass || A == &X86::GR16_NOREXRegClass)
+        return &X86::GR16_NOREXRegClass;
+      else if (A == &X86::GR16_ABCDRegClass)
+        return &X86::GR16_ABCDRegClass;
+    }
+    break;
+  case 2:
+    // 8-bit hi
+    if (B == &X86::GR8_ABCD_HRegClass) {
+      // Only A/B/C/D registers have a high-byte sub-register.
+      if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+          A == &X86::GR64_NOREXRegClass ||
+          A == &X86::GR64_NOSPRegClass ||
+          A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_ABCDRegClass;
+      else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+               A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
+        return &X86::GR32_ABCDRegClass;
+      else if (A == &X86::GR16RegClass || A == &X86::GR16_ABCDRegClass ||
+               A == &X86::GR16_NOREXRegClass)
+        return &X86::GR16_ABCDRegClass;
+    }
+    break;
+  case 3:
+    // 16-bit
+    if (B == &X86::GR16RegClass) {
+      // Any GR32/GR64 register has a GR16 low sub-register.
+      if (A->getSize() == 4 || A->getSize() == 8)
+        return A;
+    } else if (B == &X86::GR16_ABCDRegClass) {
+      if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+          A == &X86::GR64_NOREXRegClass ||
+          A == &X86::GR64_NOSPRegClass ||
+          A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_ABCDRegClass;
+      else if (A == &X86::GR32RegClass || A == &X86::GR32_ABCDRegClass ||
+               A == &X86::GR32_NOREXRegClass || A == &X86::GR32_NOSPRegClass)
+        return &X86::GR32_ABCDRegClass;
+    } else if (B == &X86::GR16_NOREXRegClass) {
+      if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+          A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_NOREXRegClass;
+      else if (A == &X86::GR64_ABCDRegClass)
+        return &X86::GR64_ABCDRegClass;
+      else if (A == &X86::GR32RegClass || A == &X86::GR32_NOREXRegClass ||
+               A == &X86::GR32_NOSPRegClass)
+        return &X86::GR32_NOREXRegClass;
+      else if (A == &X86::GR32_ABCDRegClass)
+        // Was GR64_ABCD, which is not a subclass of the 32-bit A; the result
+        // must be a subclass of A, matching the parallel branches above.
+        return &X86::GR32_ABCDRegClass;
+    }
+    break;
+  case 4:
+    // 32-bit
+    if (B == &X86::GR32RegClass || B == &X86::GR32_NOSPRegClass) {
+      // Any GR64 register has a GR32 low sub-register.
+      if (A->getSize() == 8)
+        return A;
+    } else if (B == &X86::GR32_ABCDRegClass) {
+      if (A == &X86::GR64RegClass || A == &X86::GR64_ABCDRegClass ||
+          A == &X86::GR64_NOREXRegClass ||
+          A == &X86::GR64_NOSPRegClass ||
+          A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_ABCDRegClass;
+    } else if (B == &X86::GR32_NOREXRegClass) {
+      if (A == &X86::GR64RegClass || A == &X86::GR64_NOREXRegClass ||
+          A == &X86::GR64_NOSPRegClass || A == &X86::GR64_NOREX_NOSPRegClass)
+        return &X86::GR64_NOREXRegClass;
+      else if (A == &X86::GR64_ABCDRegClass)
+        return &X86::GR64_ABCDRegClass;
+    }
+    break;
+  }
+  // No matching constrained class.
+  return 0;
+}
+
+/// getPointerRegClass - Return the register class to use for pointer values.
+/// Kind 0 is the normal GPR class; Kind 1 excludes the stack pointer, for
+/// instructions that cannot encode ESP/RSP.
+const TargetRegisterClass *
+X86RegisterInfo::getPointerRegClass(unsigned Kind) const {
+  switch (Kind) {
+  default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
+  case 0: // Normal GPRs.
+    if (TM.getSubtarget<X86Subtarget>().is64Bit())
+      return &X86::GR64RegClass;
+    return &X86::GR32RegClass;
+  case 1: // Normal GPRs except the stack pointer (for encoding reasons).
+    if (TM.getSubtarget<X86Subtarget>().is64Bit())
+      return &X86::GR64_NOSPRegClass;
+    return &X86::GR32_NOSPRegClass;
+  }
+}
+
+/// getCrossCopyRegClass - Condition-code values cannot be copied directly;
+/// return the GPR class to copy them through (via pushf/popf-style sequences
+/// selected elsewhere).  Returns null for classes that can be copied normally.
+const TargetRegisterClass *
+X86RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &X86::CCRRegClass) {
+    if (Is64Bit)
+      return &X86::GR64RegClass;
+    else
+      return &X86::GR32RegClass;
+  }
+  return NULL;
+}
+
+/// getCalleeSavedRegs - Return a null-terminated list of callee-saved
+/// registers for the current function.  Functions that call eh.unwind.init
+/// (EH return) must additionally preserve the EAX/EDX (or RAX/RDX) scratch
+/// registers; Win64 has its own, larger list including XMM6-15.
+const unsigned *
+X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  bool callsEHReturn = false;
+
+  if (MF) {
+    const MachineFrameInfo *MFI = MF->getFrameInfo();
+    const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+    callsEHReturn = (MMI ? MMI->callsEHReturn() : false);
+  }
+
+  static const unsigned CalleeSavedRegs32Bit[] = {
+    X86::ESI, X86::EDI, X86::EBX, X86::EBP,  0
+  };
+
+  static const unsigned CalleeSavedRegs32EHRet[] = {
+    X86::EAX, X86::EDX, X86::ESI, X86::EDI, X86::EBX, X86::EBP,  0
+  };
+
+  static const unsigned CalleeSavedRegs64Bit[] = {
+    X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+
+  static const unsigned CalleeSavedRegs64EHRet[] = {
+    X86::RAX, X86::RDX, X86::RBX, X86::R12,
+    X86::R13, X86::R14, X86::R15, X86::RBP, 0
+  };
+
+  static const unsigned CalleeSavedRegsWin64[] = {
+    X86::RBX,   X86::RBP,   X86::RDI,   X86::RSI,
+    X86::R12,   X86::R13,   X86::R14,   X86::R15,
+    X86::XMM6,  X86::XMM7,  X86::XMM8,  X86::XMM9,
+    X86::XMM10, X86::XMM11, X86::XMM12, X86::XMM13,
+    X86::XMM14, X86::XMM15, 0
+  };
+
+  if (Is64Bit) {
+    if (IsWin64)
+      return CalleeSavedRegsWin64;
+    else
+      return (callsEHReturn ? CalleeSavedRegs64EHRet : CalleeSavedRegs64Bit);
+  } else {
+    return (callsEHReturn ? CalleeSavedRegs32EHRet : CalleeSavedRegs32Bit);
+  }
+}
+
+/// getCalleeSavedRegClasses - Return the register classes of the callee-saved
+/// registers, in the same order and selected by the same criteria as
+/// getCalleeSavedRegs above; each array here must stay parallel to the
+/// corresponding register list there.
+const TargetRegisterClass* const*
+X86RegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  bool callsEHReturn = false;
+
+  if (MF) {
+    const MachineFrameInfo *MFI = MF->getFrameInfo();
+    const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+    callsEHReturn = (MMI ? MMI->callsEHReturn() : false);
+  }
+
+  static const TargetRegisterClass * const CalleeSavedRegClasses32Bit[] = {
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,  0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClasses32EHRet[] = {
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,
+    &X86::GR32RegClass, &X86::GR32RegClass,  0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClasses64Bit[] = {
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass, 0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClasses64EHRet[] = {
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass,
+    &X86::GR64RegClass, &X86::GR64RegClass, 0
+  };
+  static const TargetRegisterClass * const CalleeSavedRegClassesWin64[] = {
+    &X86::GR64RegClass,  &X86::GR64RegClass,
+    &X86::GR64RegClass,  &X86::GR64RegClass,
+    &X86::GR64RegClass,  &X86::GR64RegClass,
+    &X86::GR64RegClass,  &X86::GR64RegClass,
+    &X86::VR128RegClass, &X86::VR128RegClass,
+    &X86::VR128RegClass, &X86::VR128RegClass,
+    &X86::VR128RegClass, &X86::VR128RegClass,
+    &X86::VR128RegClass, &X86::VR128RegClass,
+    &X86::VR128RegClass, &X86::VR128RegClass, 0
+  };
+
+  if (Is64Bit) {
+    if (IsWin64)
+      return CalleeSavedRegClassesWin64;
+    else
+      return (callsEHReturn ?
+              CalleeSavedRegClasses64EHRet : CalleeSavedRegClasses64Bit);
+  } else {
+    return (callsEHReturn ?
+            CalleeSavedRegClasses32EHRet : CalleeSavedRegClasses32Bit);
+  }
+}
+
+/// getReservedRegs - Return a bit vector of registers the allocator must not
+/// touch: the stack pointer, the instruction pointer, the frame pointer when
+/// one is required, and the x87 stack registers.
+BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  // Set the stack-pointer register and its aliases as reserved.
+  Reserved.set(X86::RSP);
+  Reserved.set(X86::ESP);
+  Reserved.set(X86::SP);
+  Reserved.set(X86::SPL);
+
+  // Set the instruction pointer register and its aliases as reserved.
+  Reserved.set(X86::RIP);
+  Reserved.set(X86::EIP);
+  Reserved.set(X86::IP);
+
+  // Set the frame-pointer register and its aliases as reserved if needed.
+  if (hasFP(MF)) {
+    Reserved.set(X86::RBP);
+    Reserved.set(X86::EBP);
+    Reserved.set(X86::BP);
+    Reserved.set(X86::BPL);
+  }
+
+  // Mark the x87 stack registers as reserved, since they don't behave normally
+  // with respect to liveness. We don't fully model the effects of x87 stack
+  // pushes and pops after stackification.
+  Reserved.set(X86::ST0);
+  Reserved.set(X86::ST1);
+  Reserved.set(X86::ST2);
+  Reserved.set(X86::ST3);
+  Reserved.set(X86::ST4);
+  Reserved.set(X86::ST5);
+  Reserved.set(X86::ST6);
+  Reserved.set(X86::ST7);
+  return Reserved;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Frame Processing methods
+//===----------------------------------------------------------------------===//
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.  This is true if the function has variable sized allocas
+/// or if frame pointer elimination is disabled.  Stack realignment, taking the
+/// frame address, a per-function force flag, and eh.unwind.init also require
+/// a frame pointer.
+bool X86RegisterInfo::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+
+  return (NoFramePointerElim ||
+          needsStackRealignment(MF) ||
+          MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken() ||
+          MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+          (MMI && MMI->callsUnwindInit()));
+}
+
+/// canRealignStack - Return true if stack realignment is permitted for this
+/// function: the -realign-stack option is on and there are no variable-sized
+/// stack objects (which would make the realigned offsets unknowable).
+bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return (RealignStack &&
+          !MFI->hasVarSizedObjects());
+}
+
+/// needsStackRealignment - Return true if this function requires dynamic
+/// stack realignment: realignment is enabled and some stack object demands
+/// alignment greater than the incoming stack alignment.  Realignment is
+/// currently suppressed (not diagnosed) when variable-sized allocas are
+/// present.
+bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool requiresRealignment =
+    RealignStack && (MFI->getMaxAlignment() > StackAlign);
+
+  // FIXME: Currently we don't support stack realignment for functions with
+  //        variable-sized allocas.
+  // FIXME: Temporary disable the error - it seems to be too conservative.
+  if (0 && requiresRealignment && MFI->hasVarSizedObjects())
+    llvm_report_error(
+      "Stack realignment in presence of dynamic allocas is not supported");
+
+  return (requiresRealignment && !MFI->hasVarSizedObjects());
+}
+
+/// hasReservedCallFrame - Return true if call-frame setup/destroy pseudos can
+/// be folded into the prologue/epilogue; only possible when the frame size is
+/// fixed, i.e. there are no variable-sized stack objects.
+bool X86RegisterInfo::hasReservedCallFrame(MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+/// hasReservedSpillSlot - The frame pointer, when required, is saved in a
+/// pre-reserved fixed slot (the first frame object, created in
+/// processFunctionBeforeCalleeSavedScan) rather than a generic spill slot.
+bool X86RegisterInfo::hasReservedSpillSlot(MachineFunction &MF, unsigned Reg,
+                                           int &FrameIdx) const {
+  if (Reg == FramePtr && hasFP(MF)) {
+    FrameIdx = MF.getFrameInfo()->getObjectIndexBegin();
+    return true;
+  }
+  return false;
+}
+
+/// getFrameIndexOffset - Compute the byte offset of frame index FI from the
+/// base register chosen by eliminateFrameIndex (SP-relative when realigning
+/// or when there is no FP, otherwise FP-relative).
+int
+X86RegisterInfo::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
+  const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  int Offset = MFI->getObjectOffset(FI) - TFI.getOffsetOfLocalArea();
+  uint64_t StackSize = MFI->getStackSize();
+
+  if (needsStackRealignment(MF)) {
+    if (FI < 0) {
+      // Fixed objects (negative FI) stay FP-relative when realigning.
+      // Skip the saved EBP.
+      Offset += SlotSize;
+    } else {
+      // Non-fixed objects are addressed off the (realigned) stack pointer.
+      unsigned Align = MFI->getObjectAlignment(FI);
+      assert( (-(Offset + StackSize)) % Align == 0);
+      // Align is only used by the assert; clear it to silence the unused
+      // variable warning in NDEBUG builds.
+      Align = 0;
+      return Offset + StackSize;
+    }
+    // FIXME: Support tail calls
+  } else {
+    if (!hasFP(MF))
+      return Offset + StackSize;
+
+    // Skip the saved EBP.
+    Offset += SlotSize;
+
+    // Skip the RETADDR move area
+    const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+    int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+    if (TailCallReturnAddrDelta < 0)
+      Offset -= TailCallReturnAddrDelta;
+  }
+
+  return Offset;
+}
+
+/// eliminateCallFramePseudoInstr - Replace the ADJCALLSTACKDOWN/UP pseudo
+/// instruction at I with real stack-pointer arithmetic (or nothing, when the
+/// call frame is reserved in the prologue) and erase the pseudo.
+void X86RegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // If the stack pointer can be changed after prologue, turn the
+    // adjcallstackup instruction into a 'sub ESP, <amt>' and the
+    // adjcallstackdown instruction into 'add ESP, <amt>'
+    // TODO: consider using push / pop instead of sub + store / add
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+
+      MachineInstr *New = 0;
+      if (Old->getOpcode() == getCallFrameSetupOpcode()) {
+        New = BuildMI(MF, Old->getDebugLoc(),
+                      TII.get(Is64Bit ? X86::SUB64ri32 : X86::SUB32ri),
+                      StackPtr)
+          .addReg(StackPtr)
+          .addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == getCallFrameDestroyOpcode());
+
+        // Factor out the amount the callee already popped.
+        uint64_t CalleeAmt = Old->getOperand(1).getImm();
+        Amount -= CalleeAmt;
+
+        if (Amount) {
+          // Use the sign-extended 8-bit immediate form for small amounts.
+          unsigned Opc = (Amount < 128) ?
+            (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+            (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri);
+          New = BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), StackPtr)
+            .addReg(StackPtr)
+            .addImm(Amount);
+        }
+      }
+
+      if (New) {
+        // The EFLAGS implicit def is dead.
+        New->getOperand(3).setIsDead();
+
+        // Replace the pseudo instruction with a new instruction.
+        MBB.insert(I, New);
+      }
+    }
+  } else if (I->getOpcode() == getCallFrameDestroyOpcode()) {
+    // If we are performing frame pointer elimination and if the callee pops
+    // something off the stack pointer, add it back.  We do this until we have
+    // more advanced stack pointer tracking ability.
+    if (uint64_t CalleeAmt = I->getOperand(1).getImm()) {
+      unsigned Opc = (CalleeAmt < 128) ?
+        (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+        (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri);
+      MachineInstr *Old = I;
+      MachineInstr *New =
+        BuildMI(MF, Old->getDebugLoc(), TII.get(Opc), 
+                StackPtr)
+          .addReg(StackPtr)
+          .addImm(CalleeAmt);
+
+      // The EFLAGS implicit def is dead.
+      New->getOperand(3).setIsDead();
+      MBB.insert(I, New);
+    }
+  }
+
+  // In all cases the pseudo itself is removed.
+  MBB.erase(I);
+}
+
+/// eliminateFrameIndex - Rewrite the frame-index operand of the instruction
+/// at II into a base register (FP or SP) plus a concrete byte offset.
+/// Value and RS are unused on X86; SPAdj must be zero since call frames are
+/// handled by eliminateCallFramePseudoInstr.
+unsigned
+X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                     int SPAdj, int *Value,
+                                     RegScavenger *RS) const{
+  assert(SPAdj == 0 && "Unexpected");
+
+  unsigned i = 0;
+  MachineInstr &MI = *II;
+  MachineFunction &MF = *MI.getParent()->getParent();
+
+  // Locate the frame-index operand.
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  int FrameIndex = MI.getOperand(i).getIndex();
+  unsigned BasePtr;
+
+  // When realigning, fixed objects (negative FI) are FP-relative while
+  // realigned locals are SP-relative; otherwise use FP if we have one.
+  if (needsStackRealignment(MF))
+    BasePtr = (FrameIndex < 0 ? FramePtr : StackPtr);
+  else
+    BasePtr = (hasFP(MF) ? FramePtr : StackPtr);
+
+  // This must be part of a four operand memory reference.  Replace the
+  // FrameIndex with the chosen base register.  Add an offset to the offset.
+  MI.getOperand(i).ChangeToRegister(BasePtr, false);
+
+  // Now add the frame object offset to the offset from the base register.
+  if (MI.getOperand(i+3).isImm()) {
+    // Offset is a 32-bit integer.
+    int Offset = getFrameIndexOffset(MF, FrameIndex) +
+      (int)(MI.getOperand(i + 3).getImm());
+
+    MI.getOperand(i + 3).ChangeToImmediate(Offset);
+  } else {
+    // Offset is symbolic. This is extremely rare.
+    uint64_t Offset = getFrameIndexOffset(MF, FrameIndex) +
+                      (uint64_t)MI.getOperand(i+3).getOffset();
+    MI.getOperand(i+3).setOffset(Offset);
+  }
+  return 0;
+}
+
+/// processFunctionBeforeCalleeSavedScan - Fix up the frame layout before the
+/// callee-saved register scan: compute the max stack alignment (needed to
+/// decide on realignment/FP), reserve the tail-call return-address move area,
+/// and reserve the fixed slot in which the frame pointer will be saved.
+void
+X86RegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                      RegScavenger *RS) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Calculate and set max stack object alignment early, so we can decide
+  // whether we will need stack realignment (and thus FP).
+  MFI->calculateMaxStackAlignment();
+
+  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  int32_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
+
+  if (TailCallReturnAddrDelta < 0) {
+    // create RETURNADDR area
+    //   arg
+    //   arg
+    //   RETADDR
+    //   { ...
+    //     RETADDR area
+    //     ...
+    //   }
+    //   [EBP]
+    // The object sits just below the return-address slot, offset by the
+    // (negative) delta.
+    MFI->CreateFixedObject(-TailCallReturnAddrDelta,
+                           (-1U*SlotSize)+TailCallReturnAddrDelta,
+                           true, false);
+  }
+
+  if (hasFP(MF)) {
+    assert((TailCallReturnAddrDelta <= 0) &&
+           "The Delta should always be zero or negative");
+    const TargetFrameInfo &TFI = *MF.getTarget().getFrameInfo();
+
+    // Create a frame entry for the EBP register that must be saved.
+    int FrameIdx = MFI->CreateFixedObject(SlotSize,
+                                          -(int)SlotSize +
+                                          TFI.getOffsetOfLocalArea() +
+                                          TailCallReturnAddrDelta,
+                                          true, false);
+    // hasReservedSpillSlot relies on this slot being the first frame object.
+    assert(FrameIdx == MFI->getObjectIndexBegin() &&
+           "Slot for EBP register must be last in order to be found!");
+    FrameIdx = 0;
+  }
+}
+
+/// emitSPUpdate - Emit a series of instructions to increment / decrement the
+/// stack pointer by a constant value.
+static
+void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                  unsigned StackPtr, int64_t NumBytes, bool Is64Bit,
+                  const TargetInstrInfo &TII) {
+  bool isSub = NumBytes < 0;
+  uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  unsigned Opc = isSub
+    ? ((Offset < 128) ?
+       (Is64Bit ? X86::SUB64ri8 : X86::SUB32ri8) :
+       (Is64Bit ? X86::SUB64ri32 : X86::SUB32ri))
+    : ((Offset < 128) ?
+       (Is64Bit ? X86::ADD64ri8 : X86::ADD32ri8) :
+       (Is64Bit ? X86::ADD64ri32 : X86::ADD32ri));
+  uint64_t Chunk = (1LL << 31) - 1;
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  while (Offset) {
+    uint64_t ThisVal = (Offset > Chunk) ? Chunk : Offset;
+    MachineInstr *MI =
+      BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+        .addReg(StackPtr)
+        .addImm(ThisVal);
+    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+    Offset -= ThisVal;
+  }
+}
+
+/// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
+static
+void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
+                      unsigned StackPtr, uint64_t *NumBytes = NULL) {
+  if (MBBI == MBB.begin()) return;
+
+  MachineBasicBlock::iterator PI = prior(MBBI);
+  unsigned Opc = PI->getOpcode();
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+      PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    if (NumBytes)
+      *NumBytes -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+  }
+}
+
/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower
/// iterator: if the instruction immediately after MBBI is an ADD/SUB of
/// StackPtr by an immediate, fold its amount into *NumBytes and delete it.
/// NOTE: This function is currently disabled by the early return below (see
/// FIXME); everything after that return is dead code kept for possible
/// re-enablement.
static
void mergeSPUpdatesDown(MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator &MBBI,
                        unsigned StackPtr, uint64_t *NumBytes = NULL) {
  // FIXME: THIS ISN'T RUN!!!
  return;

  // --- Unreachable while the early return above stands. ---
  if (MBBI == MBB.end()) return;

  MachineBasicBlock::iterator NI = llvm::next(MBBI);
  if (NI == MBB.end()) return;

  unsigned Opc = NI->getOpcode();
  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
      NI->getOperand(0).getReg() == StackPtr) {
    // A following ADD releases stack space, so it reduces the pending
    // allocation amount.
    if (NumBytes)
      *NumBytes -= NI->getOperand(2).getImm();
    // NOTE(review): NI is erased and then assigned to MBBI — a dangling
    // iterator if this code is ever re-enabled; verify before enabling.
    MBB.erase(NI);
    MBBI = NI;
  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
             NI->getOperand(0).getReg() == StackPtr) {
    // A following SUB allocates more, so it increases the pending amount.
    if (NumBytes)
      *NumBytes += NI->getOperand(2).getImm();
    MBB.erase(NI);
    MBBI = NI;
  }
}
+
+/// mergeSPUpdates - Checks the instruction before/after the passed
+/// instruction. If it is an ADD/SUB instruction it is deleted argument and the
+/// stack adjustment is returned as a positive value for ADD and a negative for
+/// SUB.
+static int mergeSPUpdates(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator &MBBI,
+                           unsigned StackPtr,
+                           bool doMergeWithPrevious) {
+  if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
+      (!doMergeWithPrevious && MBBI == MBB.end()))
+    return 0;
+
+  MachineBasicBlock::iterator PI = doMergeWithPrevious ? prior(MBBI) : MBBI;
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : llvm::next(MBBI);
+  unsigned Opc = PI->getOpcode();
+  int Offset = 0;
+
+  if ((Opc == X86::ADD64ri32 || Opc == X86::ADD64ri8 ||
+       Opc == X86::ADD32ri || Opc == X86::ADD32ri8) &&
+      PI->getOperand(0).getReg() == StackPtr){
+    Offset += PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  } else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB64ri8 ||
+              Opc == X86::SUB32ri || Opc == X86::SUB32ri8) &&
+             PI->getOperand(0).getReg() == StackPtr) {
+    Offset -= PI->getOperand(2).getImm();
+    MBB.erase(PI);
+    if (!doMergeWithPrevious) MBBI = NI;
+  }
+
+  return Offset;
+}
+
+void X86RegisterInfo::emitCalleeSavedFrameMoves(MachineFunction &MF,
+                                                unsigned LabelId,
+                                                unsigned FramePtr) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  if (!MMI) return;
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  if (CSI.empty()) return;
+
+  std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+  const TargetData *TD = MF.getTarget().getTargetData();
+  bool HasFP = hasFP(MF);
+
+  // Calculate amount of bytes used for return address storing.
+  int stackGrowth =
+    (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
+     TargetFrameInfo::StackGrowsUp ?
+     TD->getPointerSize() : -TD->getPointerSize());
+
+  // FIXME: This is dirty hack. The code itself is pretty mess right now.
+  // It should be rewritten from scratch and generalized sometimes.
+
+  // Determine maximum offset (minumum due to stack growth).
+  int64_t MaxOffset = 0;
+  for (std::vector<CalleeSavedInfo>::const_iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I)
+    MaxOffset = std::min(MaxOffset,
+                         MFI->getObjectOffset(I->getFrameIdx()));
+
+  // Calculate offsets.
+  int64_t saveAreaOffset = (HasFP ? 3 : 2) * stackGrowth;
+  for (std::vector<CalleeSavedInfo>::const_iterator
+         I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    int64_t Offset = MFI->getObjectOffset(I->getFrameIdx());
+    unsigned Reg = I->getReg();
+    Offset = MaxOffset - Offset + saveAreaOffset;
+
+    // Don't output a new machine move if we're re-saving the frame
+    // pointer. This happens when the PrologEpilogInserter has inserted an extra
+    // "PUSH" of the frame pointer -- the "emitPrologue" method automatically
+    // generates one when frame pointers are used. If we generate a "machine
+    // move" for this extra "PUSH", the linker will lose track of the fact that
+    // the frame pointer should have the value of the first "PUSH" when it's
+    // trying to unwind.
+    // 
+    // FIXME: This looks inelegant. It's possibly correct, but it's covering up
+    //        another bug. I.e., one where we generate a prolog like this:
+    //
+    //          pushl  %ebp
+    //          movl   %esp, %ebp
+    //          pushl  %ebp
+    //          pushl  %esi
+    //           ...
+    //
+    //        The immediate re-push of EBP is unnecessary. At the least, it's an
+    //        optimization bug. EBP can be used as a scratch register in certain
+    //        cases, but probably not when we have a frame pointer.
+    if (HasFP && FramePtr == Reg)
+      continue;
+
+    MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+    MachineLocation CSSrc(Reg);
+    Moves.push_back(MachineMove(LabelId, CSDst, CSSrc));
+  }
+}
+
/// emitPrologue - Push callee-saved registers onto the stack, which
/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
/// generate the exception handling frames.
void X86RegisterInfo::emitPrologue(MachineFunction &MF) const {
  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
  MachineBasicBlock::iterator MBBI = MBB.begin();
  MachineFrameInfo *MFI = MF.getFrameInfo();
  const Function *Fn = MF.getFunction();
  const X86Subtarget *Subtarget = &MF.getTarget().getSubtarget<X86Subtarget>();
  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  // Frame moves (DWARF CFI) are needed for debug info and for any function
  // that might unwind.
  bool needsFrameMoves = (MMI && MMI->hasDebugInfo()) ||
                          !Fn->doesNotThrow() || UnwindTablesMandatory;
  uint64_t MaxAlign  = MFI->getMaxAlignment(); // Desired stack alignment.
  uint64_t StackSize = MFI->getStackSize();    // Number of bytes to allocate.
  bool HasFP = hasFP(MF);
  DebugLoc DL;

  // Add RETADDR move area to callee saved frame size.
  int TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
  if (TailCallReturnAddrDelta < 0)
    X86FI->setCalleeSavedFrameSize(
      X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);

  // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
  // function, and use up to 128 bytes of stack space, don't have a frame
  // pointer, calls, or dynamic alloca then we do not need to adjust the
  // stack pointer (we fit in the Red Zone).
  if (Is64Bit && !Fn->hasFnAttr(Attribute::NoRedZone) &&
      !needsStackRealignment(MF) &&
      !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
      !MFI->hasCalls() &&                          // No calls.
      !Subtarget->isTargetWin64()) {               // Win64 has no Red Zone
    uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
    if (HasFP) MinSize += SlotSize;
    // Shrink the allocation by up to 128 bytes (the Red Zone), but never
    // below what the callee-saved spills require.
    StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
    MFI->setStackSize(StackSize);
  } else if (Subtarget->isTargetWin64()) {
    // We need to always allocate 32 bytes as register spill area.
    // FIXME: We might reuse these 32 bytes for leaf functions.
    StackSize += 32;
    MFI->setStackSize(StackSize);
  }

  // Insert stack pointer adjustment for later moving of return addr.  Only
  // applies to tail call optimized functions where the callee argument stack
  // size is bigger than the callers.
  if (TailCallReturnAddrDelta < 0) {
    MachineInstr *MI =
      BuildMI(MBB, MBBI, DL, TII.get(Is64Bit? X86::SUB64ri32 : X86::SUB32ri),
              StackPtr)
        .addReg(StackPtr)
        .addImm(-TailCallReturnAddrDelta);
    MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
  }

  // Mapping for machine moves:
  //
  //   DST: VirtualFP AND
  //        SRC: VirtualFP              => DW_CFA_def_cfa_offset
  //        ELSE                        => DW_CFA_def_cfa
  //
  //   SRC: VirtualFP AND
  //        DST: Register               => DW_CFA_def_cfa_register
  //
  //   ELSE
  //        OFFSET < 0                  => DW_CFA_offset_extended_sf
  //        REG < 64                    => DW_CFA_offset + Reg
  //        ELSE                        => DW_CFA_offset_extended

  // NOTE(review): MMI is dereferenced unconditionally here, but
  // needsFrameMoves can be true via !doesNotThrow / UnwindTablesMandatory
  // even when MMI is null — presumably MMI is always non-null by this
  // point; verify against the pass setup.
  std::vector<MachineMove> &Moves = MMI->getFrameMoves();
  const TargetData *TD = MF.getTarget().getTargetData();
  uint64_t NumBytes = 0;
  // Bytes used for storing the return address; sign follows stack growth.
  int stackGrowth =
    (MF.getTarget().getFrameInfo()->getStackGrowthDirection() ==
     TargetFrameInfo::StackGrowsUp ?
       TD->getPointerSize() : -TD->getPointerSize());

  if (HasFP) {
    // Calculate required stack adjustment.
    uint64_t FrameSize = StackSize - SlotSize;
    if (needsStackRealignment(MF))
      // Round the frame size up to the realignment boundary.
      FrameSize = (FrameSize + MaxAlign - 1) / MaxAlign * MaxAlign;

    NumBytes = FrameSize - X86FI->getCalleeSavedFrameSize();

    // Get the offset of the stack slot for the EBP register, which is
    // guaranteed to be the last slot by processFunctionBeforeFrameFinalized.
    // Update the frame offset adjustment.
    MFI->setOffsetAdjustment(-NumBytes);

    // Save EBP/RBP into the appropriate stack slot.
    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
      .addReg(FramePtr, RegState::Kill);

    if (needsFrameMoves) {
      // Mark the place where EBP/RBP was saved.
      unsigned FrameLabelId = MMI->NextLabelID();
      BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);

      // Define the current CFA rule to use the provided offset.
      if (StackSize) {
        MachineLocation SPDst(MachineLocation::VirtualFP);
        MachineLocation SPSrc(MachineLocation::VirtualFP, 2 * stackGrowth);
        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
      } else {
        // FIXME: Verify & implement for FP
        MachineLocation SPDst(StackPtr);
        MachineLocation SPSrc(StackPtr, stackGrowth);
        Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
      }

      // Change the rule for the FramePtr to be an "offset" rule.
      MachineLocation FPDst(MachineLocation::VirtualFP,
                            2 * stackGrowth);
      MachineLocation FPSrc(FramePtr);
      Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc));
    }

    // Update EBP with the new base value...
    BuildMI(MBB, MBBI, DL,
            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), FramePtr)
        .addReg(StackPtr);

    if (needsFrameMoves) {
      // Mark effective beginning of when frame pointer becomes valid.
      unsigned FrameLabelId = MMI->NextLabelID();
      BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(FrameLabelId);

      // Define the current CFA to use the EBP/RBP register.
      MachineLocation FPDst(FramePtr);
      MachineLocation FPSrc(MachineLocation::VirtualFP);
      Moves.push_back(MachineMove(FrameLabelId, FPDst, FPSrc));
    }

    // Mark the FramePtr as live-in in every block except the entry.
    for (MachineFunction::iterator I = llvm::next(MF.begin()), E = MF.end();
         I != E; ++I)
      I->addLiveIn(FramePtr);

    // Realign stack
    if (needsStackRealignment(MF)) {
      // AND the (already FP-based) stack pointer down to the alignment mask.
      MachineInstr *MI =
        BuildMI(MBB, MBBI, DL,
                TII.get(Is64Bit ? X86::AND64ri32 : X86::AND32ri),
                StackPtr).addReg(StackPtr).addImm(-MaxAlign);

      // The EFLAGS implicit def is dead.
      MI->getOperand(3).setIsDead();
    }
  } else {
    NumBytes = StackSize - X86FI->getCalleeSavedFrameSize();
  }

  // Skip the callee-saved push instructions.
  bool PushedRegs = false;
  int StackOffset = 2 * stackGrowth;

  while (MBBI != MBB.end() &&
         (MBBI->getOpcode() == X86::PUSH32r ||
          MBBI->getOpcode() == X86::PUSH64r)) {
    PushedRegs = true;
    ++MBBI;

    if (!HasFP && needsFrameMoves) {
      // Mark callee-saved push instruction.
      unsigned LabelId = MMI->NextLabelID();
      BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(LabelId);

      // Define the current CFA rule to use the provided offset.
      unsigned Ptr = StackSize ?
        MachineLocation::VirtualFP : StackPtr;
      MachineLocation SPDst(Ptr);
      MachineLocation SPSrc(Ptr, StackOffset);
      Moves.push_back(MachineMove(LabelId, SPDst, SPSrc));
      StackOffset += stackGrowth;
    }
  }

  DL = MBB.findDebugLoc(MBBI);

  // Adjust stack pointer: ESP -= numbytes.
  if (NumBytes >= 4096 && Subtarget->isTargetCygMing()) {
    // Check, whether EAX is livein for this function.
    bool isEAXAlive = false;
    for (MachineRegisterInfo::livein_iterator
           II = MF.getRegInfo().livein_begin(),
           EE = MF.getRegInfo().livein_end(); (II != EE) && !isEAXAlive; ++II) {
      unsigned Reg = II->first;
      isEAXAlive = (Reg == X86::EAX || Reg == X86::AX ||
                    Reg == X86::AH || Reg == X86::AL);
    }

    // Function prologue calls _alloca to probe the stack when allocating more
    // than 4k bytes in one go. Touching the stack at 4K increments is necessary
    // to ensure that the guard pages used by the OS virtual memory manager are
    // allocated in correct sequence.
    if (!isEAXAlive) {
      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
        .addImm(NumBytes);
      BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
        .addExternalSymbol("_alloca");
    } else {
      // Save EAX
      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH32r))
        .addReg(X86::EAX, RegState::Kill);

      // Allocate NumBytes-4 bytes on stack. We'll also use 4 already
      // allocated bytes for EAX.
      BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
        .addImm(NumBytes - 4);
      BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
        .addExternalSymbol("_alloca");

      // Restore EAX
      MachineInstr *MI = addRegOffset(BuildMI(MF, DL, TII.get(X86::MOV32rm),
                                              X86::EAX),
                                      StackPtr, false, NumBytes - 4);
      MBB.insert(MBBI, MI);
    }
  } else if (NumBytes) {
    // If there is an SUB32ri of ESP immediately before this instruction, merge
    // the two. This can be the case when tail call elimination is enabled and
    // the callee has more arguments then the caller.
    NumBytes -= mergeSPUpdates(MBB, MBBI, StackPtr, true);

    // If there is an ADD32ri or SUB32ri of ESP immediately after this
    // instruction, merge the two instructions.
    // NOTE(review): mergeSPUpdatesDown is currently a no-op (disabled by an
    // early return in its body).
    mergeSPUpdatesDown(MBB, MBBI, StackPtr, &NumBytes);

    if (NumBytes)
      emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, TII);
  }

  if ((NumBytes || PushedRegs) && needsFrameMoves) {
    // Mark end of stack pointer adjustment.
    unsigned LabelId = MMI->NextLabelID();
    BuildMI(MBB, MBBI, DL, TII.get(X86::DBG_LABEL)).addImm(LabelId);

    if (!HasFP && NumBytes) {
      // Define the current CFA rule to use the provided offset.
      if (StackSize) {
        MachineLocation SPDst(MachineLocation::VirtualFP);
        MachineLocation SPSrc(MachineLocation::VirtualFP,
                              -StackSize + stackGrowth);
        Moves.push_back(MachineMove(LabelId, SPDst, SPSrc));
      } else {
        // FIXME: Verify & implement for FP
        MachineLocation SPDst(StackPtr);
        MachineLocation SPSrc(StackPtr, stackGrowth);
        Moves.push_back(MachineMove(LabelId, SPDst, SPSrc));
      }
    }

    // Emit DWARF info specifying the offsets of the callee-saved registers.
    if (PushedRegs)
      emitCalleeSavedFrameMoves(MF, LabelId, HasFP ? FramePtr : StackPtr);
  }
}
+
/// emitEpilogue - Restore the stack pointer (and frame pointer, if one was
/// set up), then rewrite tail-call / eh_return pseudo instructions into their
/// final form. Mirrors the work done by emitPrologue.
void X86RegisterInfo::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const MachineFrameInfo *MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
  MachineBasicBlock::iterator MBBI = prior(MBB.end());
  unsigned RetOpcode = MBBI->getOpcode();
  DebugLoc DL = MBBI->getDebugLoc();

  // Only blocks ending in a recognized return-like instruction may receive
  // an epilogue.
  switch (RetOpcode) {
  default:
    llvm_unreachable("Can only insert epilog into returning blocks");
  case X86::RET:
  case X86::RETI:
  case X86::TCRETURNdi:
  case X86::TCRETURNri:
  case X86::TCRETURNri64:
  case X86::TCRETURNdi64:
  case X86::EH_RETURN:
  case X86::EH_RETURN64:
  case X86::TAILJMPd:
  case X86::TAILJMPr:
  case X86::TAILJMPm:
    break;  // These are ok
  }

  // Get the number of bytes to allocate from the FrameInfo.
  uint64_t StackSize = MFI->getStackSize();
  uint64_t MaxAlign  = MFI->getMaxAlignment();
  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
  uint64_t NumBytes = 0;

  if (hasFP(MF)) {
    // Calculate required stack adjustment (matching emitPrologue's math,
    // including the realignment round-up).
    uint64_t FrameSize = StackSize - SlotSize;
    if (needsStackRealignment(MF))
      FrameSize = (FrameSize + MaxAlign - 1)/MaxAlign*MaxAlign;

    NumBytes = FrameSize - CSSize;

    // Pop EBP.
    BuildMI(MBB, MBBI, DL,
            TII.get(Is64Bit ? X86::POP64r : X86::POP32r), FramePtr);
  } else {
    NumBytes = StackSize - CSSize;
  }

  // Skip the callee-saved pop instructions: walk MBBI backwards over the
  // POPs (and any terminators) so the SP adjustment is inserted before them.
  MachineBasicBlock::iterator LastCSPop = MBBI;
  while (MBBI != MBB.begin()) {
    MachineBasicBlock::iterator PI = prior(MBBI);
    unsigned Opc = PI->getOpcode();

    if (Opc != X86::POP32r && Opc != X86::POP64r &&
        !PI->getDesc().isTerminator())
      break;

    --MBBI;
  }

  DL = MBBI->getDebugLoc();

  // If there is an ADD32ri or SUB32ri of ESP immediately before this
  // instruction, merge the two instructions.
  if (NumBytes || MFI->hasVarSizedObjects())
    mergeSPUpdatesUp(MBB, MBBI, StackPtr, &NumBytes);

  // If dynamic alloca is used, then reset esp to point to the last callee-saved
  // slot before popping them off! Same applies for the case, when stack was
  // realigned.
  if (needsStackRealignment(MF)) {
    // We cannot use LEA here, because stack pointer was realigned. We need to
    // deallocate local frame back.
    if (CSSize) {
      emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
      // Re-point the insertion position so the ESP := EBP copy below lands
      // before the callee-saved POP sequence.
      MBBI = prior(LastCSPop);
    }

    BuildMI(MBB, MBBI, DL,
            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
            StackPtr).addReg(FramePtr);
  } else if (MFI->hasVarSizedObjects()) {
    if (CSSize) {
      // Recover ESP from EBP past the callee-saved area in one LEA.
      unsigned Opc = Is64Bit ? X86::LEA64r : X86::LEA32r;
      MachineInstr *MI =
        addLeaRegOffset(BuildMI(MF, DL, TII.get(Opc), StackPtr),
                        FramePtr, false, -CSSize);
      MBB.insert(MBBI, MI);
    } else {
      BuildMI(MBB, MBBI, DL,
              TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr), StackPtr)
        .addReg(FramePtr);
    }
  } else if (NumBytes) {
    // Adjust stack pointer back: ESP += numbytes.
    emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, TII);
  }

  // We're returning from function via eh_return.
  if (RetOpcode == X86::EH_RETURN || RetOpcode == X86::EH_RETURN64) {
    MBBI = prior(MBB.end());
    MachineOperand &DestAddr  = MBBI->getOperand(0);
    assert(DestAddr.isReg() && "Offset should be in register!");
    // Point ESP/RSP at the handler's desired stack address.
    BuildMI(MBB, MBBI, DL,
            TII.get(Is64Bit ? X86::MOV64rr : X86::MOV32rr),
            StackPtr).addReg(DestAddr.getReg());
  } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi ||
             RetOpcode== X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64) {
    // Tail call return: adjust the stack pointer and jump to callee.
    MBBI = prior(MBB.end());
    MachineOperand &JumpTarget = MBBI->getOperand(0);
    MachineOperand &StackAdjust = MBBI->getOperand(1);
    assert(StackAdjust.isImm() && "Expecting immediate value.");

    // Adjust stack pointer.
    int StackAdj = StackAdjust.getImm();
    int MaxTCDelta = X86FI->getTCReturnAddrDelta();
    int Offset = 0;
    assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");

    // Incoporate the retaddr area.
    Offset = StackAdj-MaxTCDelta;
    assert(Offset >= 0 && "Offset should never be negative");

    if (Offset) {
      // Check for possible merge with preceeding ADD instruction.
      Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
      emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, TII);
    }

    // Jump to label or value in register.
    if (RetOpcode == X86::TCRETURNdi|| RetOpcode == X86::TCRETURNdi64) {
      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPd)).
        addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
                         JumpTarget.getTargetFlags());
    } else if (RetOpcode == X86::TCRETURNri64) {
      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64), JumpTarget.getReg());
    } else {
      BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr), JumpTarget.getReg());
    }

    // Copy any remaining operands (e.g. register uses) from the pseudo onto
    // the real tail jump.
    MachineInstr *NewMI = prior(MBBI);
    for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
      NewMI->addOperand(MBBI->getOperand(i));

    // Delete the pseudo instruction TCRETURN.
    MBB.erase(MBBI);
  } else if ((RetOpcode == X86::RET || RetOpcode == X86::RETI) &&
             (X86FI->getTCReturnAddrDelta() < 0)) {
    // Add the return addr area delta back since we are not tail calling.
    int delta = -1*X86FI->getTCReturnAddrDelta();
    MBBI = prior(MBB.end());

    // Check for possible merge with preceeding ADD instruction.
    delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
    emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, TII);
  }
}
+
+unsigned X86RegisterInfo::getRARegister() const {
+  return Is64Bit ? X86::RIP     // Should have dwarf #16.
+                 : X86::EIP;    // Should have dwarf #8.
+}
+
+unsigned X86RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  return hasFP(MF) ? FramePtr : StackPtr;
+}
+
+void
+X86RegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves) const {
+  // Calculate amount of bytes used for return address storing
+  int stackGrowth = (Is64Bit ? -8 : -4);
+
+  // Initial state of the frame pointer is esp+4.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(StackPtr, stackGrowth);
+  Moves.push_back(MachineMove(0, Dst, Src));
+
+  // Add return address to move list
+  MachineLocation CSDst(StackPtr, stackGrowth);
+  MachineLocation CSSrc(getRARegister());
+  Moves.push_back(MachineMove(0, CSDst, CSSrc));
+}
+
/// getEHExceptionRegister - No exception-value register is defined for X86
/// here; reaching this function is a bug in the caller.
unsigned X86RegisterInfo::getEHExceptionRegister() const {
  llvm_unreachable("What is the exception register");
  return 0;  // Unreachable; keeps compilers expecting a return value happy.
}
+
/// getEHHandlerRegister - No exception-handler register is defined for X86
/// here; reaching this function is a bug in the caller.
unsigned X86RegisterInfo::getEHHandlerRegister() const {
  llvm_unreachable("What is the exception handler register");
  return 0;  // Unreachable; keeps compilers expecting a return value happy.
}
+
namespace llvm {
/// getX86SubSuperRegister - Return the sub- or super-register of Reg that has
/// the size given by VT (e.g. AX -> EAX for MVT::i32). For MVT::i8, High
/// selects the high-byte form (AH/BH/CH/DH) vs the low byte; high-byte forms
/// exist only for the A/B/C/D registers, and an i8 request with no matching
/// byte register returns 0. For all other unhandled types or registers, Reg
/// is returned unchanged.
unsigned getX86SubSuperRegister(unsigned Reg, EVT VT, bool High) {
  switch (VT.getSimpleVT().SimpleTy) {
  default: return Reg;
  case MVT::i8:
    // 8-bit request: pick the high or low byte alias.
    if (High) {
      switch (Reg) {
      default: return 0;
      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
        return X86::AH;
      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
        return X86::DH;
      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
        return X86::CH;
      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
        return X86::BH;
      }
    } else {
      switch (Reg) {
      default: return 0;
      case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
        return X86::AL;
      case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
        return X86::DL;
      case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
        return X86::CL;
      case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
        return X86::BL;
      case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
        return X86::SIL;
      case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
        return X86::DIL;
      case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
        return X86::BPL;
      case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
        return X86::SPL;
      case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
        return X86::R8B;
      case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
        return X86::R9B;
      case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
        return X86::R10B;
      case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
        return X86::R11B;
      case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
        return X86::R12B;
      case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
        return X86::R13B;
      case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
        return X86::R14B;
      case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
        return X86::R15B;
      }
    }
    // Note: every path of the i8 case above returns, so no fall-through.
  case MVT::i16:
    // 16-bit request: map to the word-sized alias.
    switch (Reg) {
    default: return Reg;
    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
      return X86::AX;
    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
      return X86::DX;
    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
      return X86::CX;
    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
      return X86::BX;
    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
      return X86::SI;
    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
      return X86::DI;
    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
      return X86::BP;
    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
      return X86::SP;
    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
      return X86::R8W;
    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
      return X86::R9W;
    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
      return X86::R10W;
    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
      return X86::R11W;
    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
      return X86::R12W;
    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
      return X86::R13W;
    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
      return X86::R14W;
    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
      return X86::R15W;
    }
  case MVT::i32:
    // 32-bit request: map to the doubleword-sized alias.
    switch (Reg) {
    default: return Reg;
    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
      return X86::EAX;
    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
      return X86::EDX;
    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
      return X86::ECX;
    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
      return X86::EBX;
    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
      return X86::ESI;
    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
      return X86::EDI;
    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
      return X86::EBP;
    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
      return X86::ESP;
    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
      return X86::R8D;
    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
      return X86::R9D;
    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
      return X86::R10D;
    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
      return X86::R11D;
    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
      return X86::R12D;
    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
      return X86::R13D;
    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
      return X86::R14D;
    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
      return X86::R15D;
    }
  case MVT::i64:
    // 64-bit request: map to the quadword-sized alias.
    switch (Reg) {
    default: return Reg;
    case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
      return X86::RAX;
    case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
      return X86::RDX;
    case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX:
      return X86::RCX;
    case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX:
      return X86::RBX;
    case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
      return X86::RSI;
    case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
      return X86::RDI;
    case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
      return X86::RBP;
    case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
      return X86::RSP;
    case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8:
      return X86::R8;
    case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9:
      return X86::R9;
    case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10:
      return X86::R10;
    case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11:
      return X86::R11;
    case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12:
      return X86::R12;
    case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13:
      return X86::R13;
    case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14:
      return X86::R14;
    case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15:
      return X86::R15;
    }
  }

  // Unreachable: every case above returns; kept to satisfy the compiler.
  return Reg;
}
}
+
+#include "X86GenRegisterInfo.inc"
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
new file mode 100644
index 0000000..8fb5e92
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -0,0 +1,174 @@
+//===- X86RegisterInfo.h - X86 Register Information Impl --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the X86 implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86REGISTERINFO_H
+#define X86REGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "X86GenRegisterInfo.h.inc"
+
+namespace llvm {
+  class Type;
+  class TargetInstrInfo;
+  class X86TargetMachine;
+
+/// N86 namespace - Native X86 register numbers
+///
+namespace N86 {
+  enum {
+    EAX = 0, ECX = 1, EDX = 2, EBX = 3, ESP = 4, EBP = 5, ESI = 6, EDI = 7
+  };
+}
+
+namespace X86 {
+  /// SubregIndex - The index of various sized subregister classes. Note that 
+  /// these indices must be kept in sync with the class indices in the 
+  /// X86RegisterInfo.td file.
+  enum SubregIndex {
+    SUBREG_8BIT = 1, SUBREG_8BIT_HI = 2, SUBREG_16BIT = 3, SUBREG_32BIT = 4
+  };
+}
+
+/// DWARFFlavour - Flavour of dwarf regnumbers
+///
+namespace DWARFFlavour {
+  enum {
+    X86_64 = 0, X86_32_DarwinEH = 1, X86_32_Generic = 2
+  };
+} 
+  
+class X86RegisterInfo : public X86GenRegisterInfo {
+public:
+  X86TargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+private:
+  /// Is64Bit - Is the target 64-bits.
+  ///
+  bool Is64Bit;
+
+  /// IsWin64 - Is the target one of the win64 flavours?
+  ///
+  bool IsWin64;
+
+  /// SlotSize - Stack slot size in bytes.
+  ///
+  unsigned SlotSize;
+
+  /// StackAlign - Default stack alignment.
+  ///
+  unsigned StackAlign;
+
+  /// StackPtr - X86 physical register used as stack ptr.
+  ///
+  unsigned StackPtr;
+
+  /// FramePtr - X86 physical register used as frame ptr.
+  ///
+  unsigned FramePtr;
+
+public:
+  X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii);
+
+  /// getX86RegNum - Returns the native X86 register number for the given LLVM
+  /// register identifier.
+  static unsigned getX86RegNum(unsigned RegNo);
+
+  unsigned getStackAlignment() const { return StackAlign; }
+
+  /// getDwarfRegNum - allows modification of X86GenRegisterInfo::getDwarfRegNum
+  /// (created by TableGen) for target dependencies.
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+
+  /// Code Generation virtual methods...
+  /// 
+
+  /// getMatchingSuperRegClass - Return a subclass of the specified register
+  /// class A so that each register in it has a sub-register of the
+  /// specified sub-register index which is in the specified register class B.
+  virtual const TargetRegisterClass *
+  getMatchingSuperRegClass(const TargetRegisterClass *A,
+                           const TargetRegisterClass *B, unsigned Idx) const;
+
+  /// getPointerRegClass - Returns a TargetRegisterClass used for pointer
+  /// values.
+  const TargetRegisterClass *getPointerRegClass(unsigned Kind = 0) const;
+
+  /// getCrossCopyRegClass - Returns a legal register class to copy a register
+  /// in the specified class to or from. Returns NULL if it is possible to copy
+  /// between two registers of the specified class.
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
+
+  /// getCalleeSavedRegs - Return a null-terminated list of all of the
+  /// callee-save registers on this target.
+  const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
+
+  /// getCalleeSavedRegClasses - Return a null-terminated list of the preferred
+  /// register classes to spill each callee-saved register with.  The order and
+  /// length of this list match the getCalleeSavedRegs() list.
+  const TargetRegisterClass* const*
+  getCalleeSavedRegClasses(const MachineFunction *MF = 0) const;
+
+  /// getReservedRegs - Returns a bitset indexed by physical register number
+  /// indicating if a register is a special register that has particular uses and
+  /// should be considered unavailable at all times, e.g. SP, RA. This is used by
+  /// the register scavenger to determine which registers are free.
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const;
+
+  bool canRealignStack(const MachineFunction &MF) const;
+
+  bool needsStackRealignment(const MachineFunction &MF) const;
+
+  bool hasReservedCallFrame(MachineFunction &MF) const;
+
+  bool hasReservedSpillSlot(MachineFunction &MF, unsigned Reg,
+                            int &FrameIdx) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator MI) const;
+
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator MI,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS = NULL) const;
+
+  void emitCalleeSavedFrameMoves(MachineFunction &MF, unsigned LabelId,
+                                 unsigned FramePtr) const;
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+  // Exception handling queries.
+  unsigned getEHExceptionRegister() const;
+  unsigned getEHHandlerRegister() const;
+};
+
+// getX86SubSuperRegister - X86 utility function. It returns the sub or super
+// register of a specific X86 register.
+// e.g. getX86SubSuperRegister(X86::EAX, EVT::i16) returns X86::AX.
+unsigned getX86SubSuperRegister(unsigned, EVT, bool High=false);
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
new file mode 100644
index 0000000..1559bf7
--- /dev/null
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -0,0 +1,819 @@
+//===- X86RegisterInfo.td - Describe the X86 Register File --*- tablegen -*-==//
+// 
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// 
+//===----------------------------------------------------------------------===//
+//
+// This file describes the X86 Register file, defining the registers themselves,
+// aliases between the registers, and the register classes built out of the
+// registers.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Register definitions...
+//
+let Namespace = "X86" in {
+
+  // In the register alias definitions below, we define which registers alias
+  // which others.  We only specify which registers the small registers alias,
+  // because the register file generator is smart enough to figure out that
+  // AL aliases AX if we tell it that AX aliased AL (for example).
+
+  // Dwarf numbering is different for 32-bit and 64-bit, and there are 
+  // variations by target as well. Currently the first entry is for X86-64, 
+  // the second is for EH on X86-32/Darwin, and the third is a 'generic' one
+  // (X86-32/Linux and debug information on X86-32/Darwin).
+
+  // 8-bit registers
+  // Low registers
+  def AL : Register<"al">, DwarfRegNum<[0, 0, 0]>;
+  def DL : Register<"dl">, DwarfRegNum<[1, 2, 2]>;
+  def CL : Register<"cl">, DwarfRegNum<[2, 1, 1]>;
+  def BL : Register<"bl">, DwarfRegNum<[3, 3, 3]>;
+
+  // X86-64 only
+  def SIL : Register<"sil">, DwarfRegNum<[4, 6, 6]>;
+  def DIL : Register<"dil">, DwarfRegNum<[5, 7, 7]>;
+  def BPL : Register<"bpl">, DwarfRegNum<[6, 4, 5]>;
+  def SPL : Register<"spl">, DwarfRegNum<[7, 5, 4]>;
+  def R8B  : Register<"r8b">,  DwarfRegNum<[8, -2, -2]>;
+  def R9B  : Register<"r9b">,  DwarfRegNum<[9, -2, -2]>;
+  def R10B : Register<"r10b">, DwarfRegNum<[10, -2, -2]>;
+  def R11B : Register<"r11b">, DwarfRegNum<[11, -2, -2]>;
+  def R12B : Register<"r12b">, DwarfRegNum<[12, -2, -2]>;
+  def R13B : Register<"r13b">, DwarfRegNum<[13, -2, -2]>;
+  def R14B : Register<"r14b">, DwarfRegNum<[14, -2, -2]>;
+  def R15B : Register<"r15b">, DwarfRegNum<[15, -2, -2]>;
+
+  // High registers. On x86-64, these cannot be used in any instruction
+  // with a REX prefix.
+  def AH : Register<"ah">, DwarfRegNum<[0, 0, 0]>;
+  def DH : Register<"dh">, DwarfRegNum<[1, 2, 2]>;
+  def CH : Register<"ch">, DwarfRegNum<[2, 1, 1]>;
+  def BH : Register<"bh">, DwarfRegNum<[3, 3, 3]>;
+
+  // 16-bit registers
+  def AX : RegisterWithSubRegs<"ax", [AL,AH]>, DwarfRegNum<[0, 0, 0]>;
+  def DX : RegisterWithSubRegs<"dx", [DL,DH]>, DwarfRegNum<[1, 2, 2]>;
+  def CX : RegisterWithSubRegs<"cx", [CL,CH]>, DwarfRegNum<[2, 1, 1]>;
+  def BX : RegisterWithSubRegs<"bx", [BL,BH]>, DwarfRegNum<[3, 3, 3]>;
+  def SI : RegisterWithSubRegs<"si", [SIL]>, DwarfRegNum<[4, 6, 6]>;
+  def DI : RegisterWithSubRegs<"di", [DIL]>, DwarfRegNum<[5, 7, 7]>;
+  def BP : RegisterWithSubRegs<"bp", [BPL]>, DwarfRegNum<[6, 4, 5]>;
+  def SP : RegisterWithSubRegs<"sp", [SPL]>, DwarfRegNum<[7, 5, 4]>;
+  def IP : Register<"ip">, DwarfRegNum<[16]>;
+  
+  // X86-64 only
+  def R8W  : RegisterWithSubRegs<"r8w", [R8B]>, DwarfRegNum<[8, -2, -2]>;
+  def R9W  : RegisterWithSubRegs<"r9w", [R9B]>, DwarfRegNum<[9, -2, -2]>;
+  def R10W : RegisterWithSubRegs<"r10w", [R10B]>, DwarfRegNum<[10, -2, -2]>;
+  def R11W : RegisterWithSubRegs<"r11w", [R11B]>, DwarfRegNum<[11, -2, -2]>;
+  def R12W : RegisterWithSubRegs<"r12w", [R12B]>, DwarfRegNum<[12, -2, -2]>;
+  def R13W : RegisterWithSubRegs<"r13w", [R13B]>, DwarfRegNum<[13, -2, -2]>;
+  def R14W : RegisterWithSubRegs<"r14w", [R14B]>, DwarfRegNum<[14, -2, -2]>;
+  def R15W : RegisterWithSubRegs<"r15w", [R15B]>, DwarfRegNum<[15, -2, -2]>;
+
+  // 32-bit registers
+  def EAX : RegisterWithSubRegs<"eax", [AX]>, DwarfRegNum<[0, 0, 0]>;
+  def EDX : RegisterWithSubRegs<"edx", [DX]>, DwarfRegNum<[1, 2, 2]>;
+  def ECX : RegisterWithSubRegs<"ecx", [CX]>, DwarfRegNum<[2, 1, 1]>;
+  def EBX : RegisterWithSubRegs<"ebx", [BX]>, DwarfRegNum<[3, 3, 3]>;
+  def ESI : RegisterWithSubRegs<"esi", [SI]>, DwarfRegNum<[4, 6, 6]>;
+  def EDI : RegisterWithSubRegs<"edi", [DI]>, DwarfRegNum<[5, 7, 7]>;
+  def EBP : RegisterWithSubRegs<"ebp", [BP]>, DwarfRegNum<[6, 4, 5]>;
+  def ESP : RegisterWithSubRegs<"esp", [SP]>, DwarfRegNum<[7, 5, 4]>;
+  def EIP : RegisterWithSubRegs<"eip", [IP]>, DwarfRegNum<[16, 8, 8]>;  
+  
+  // X86-64 only
+  def R8D  : RegisterWithSubRegs<"r8d", [R8W]>, DwarfRegNum<[8, -2, -2]>;
+  def R9D  : RegisterWithSubRegs<"r9d", [R9W]>, DwarfRegNum<[9, -2, -2]>;
+  def R10D : RegisterWithSubRegs<"r10d", [R10W]>, DwarfRegNum<[10, -2, -2]>;
+  def R11D : RegisterWithSubRegs<"r11d", [R11W]>, DwarfRegNum<[11, -2, -2]>;
+  def R12D : RegisterWithSubRegs<"r12d", [R12W]>, DwarfRegNum<[12, -2, -2]>;
+  def R13D : RegisterWithSubRegs<"r13d", [R13W]>, DwarfRegNum<[13, -2, -2]>;
+  def R14D : RegisterWithSubRegs<"r14d", [R14W]>, DwarfRegNum<[14, -2, -2]>;
+  def R15D : RegisterWithSubRegs<"r15d", [R15W]>, DwarfRegNum<[15, -2, -2]>;
+
+  // 64-bit registers, X86-64 only
+  def RAX : RegisterWithSubRegs<"rax", [EAX]>, DwarfRegNum<[0, -2, -2]>;
+  def RDX : RegisterWithSubRegs<"rdx", [EDX]>, DwarfRegNum<[1, -2, -2]>;
+  def RCX : RegisterWithSubRegs<"rcx", [ECX]>, DwarfRegNum<[2, -2, -2]>;
+  def RBX : RegisterWithSubRegs<"rbx", [EBX]>, DwarfRegNum<[3, -2, -2]>;
+  def RSI : RegisterWithSubRegs<"rsi", [ESI]>, DwarfRegNum<[4, -2, -2]>;
+  def RDI : RegisterWithSubRegs<"rdi", [EDI]>, DwarfRegNum<[5, -2, -2]>;
+  def RBP : RegisterWithSubRegs<"rbp", [EBP]>, DwarfRegNum<[6, -2, -2]>;
+  def RSP : RegisterWithSubRegs<"rsp", [ESP]>, DwarfRegNum<[7, -2, -2]>;
+
+  def R8  : RegisterWithSubRegs<"r8", [R8D]>, DwarfRegNum<[8, -2, -2]>;
+  def R9  : RegisterWithSubRegs<"r9", [R9D]>, DwarfRegNum<[9, -2, -2]>;
+  def R10 : RegisterWithSubRegs<"r10", [R10D]>, DwarfRegNum<[10, -2, -2]>;
+  def R11 : RegisterWithSubRegs<"r11", [R11D]>, DwarfRegNum<[11, -2, -2]>;
+  def R12 : RegisterWithSubRegs<"r12", [R12D]>, DwarfRegNum<[12, -2, -2]>;
+  def R13 : RegisterWithSubRegs<"r13", [R13D]>, DwarfRegNum<[13, -2, -2]>;
+  def R14 : RegisterWithSubRegs<"r14", [R14D]>, DwarfRegNum<[14, -2, -2]>;
+  def R15 : RegisterWithSubRegs<"r15", [R15D]>, DwarfRegNum<[15, -2, -2]>;
+  def RIP : RegisterWithSubRegs<"rip", [EIP]>,  DwarfRegNum<[16, -2, -2]>;
+
+  // MMX Registers. These are actually aliased to ST0 .. ST7
+  def MM0 : Register<"mm0">, DwarfRegNum<[41, 29, 29]>;
+  def MM1 : Register<"mm1">, DwarfRegNum<[42, 30, 30]>;
+  def MM2 : Register<"mm2">, DwarfRegNum<[43, 31, 31]>;
+  def MM3 : Register<"mm3">, DwarfRegNum<[44, 32, 32]>;
+  def MM4 : Register<"mm4">, DwarfRegNum<[45, 33, 33]>;
+  def MM5 : Register<"mm5">, DwarfRegNum<[46, 34, 34]>;
+  def MM6 : Register<"mm6">, DwarfRegNum<[47, 35, 35]>;
+  def MM7 : Register<"mm7">, DwarfRegNum<[48, 36, 36]>;
+  
+  // Pseudo Floating Point registers
+  def FP0 : Register<"fp0">;
+  def FP1 : Register<"fp1">;
+  def FP2 : Register<"fp2">;
+  def FP3 : Register<"fp3">;
+  def FP4 : Register<"fp4">;
+  def FP5 : Register<"fp5">;
+  def FP6 : Register<"fp6">; 
+
+  // XMM Registers, used by the various SSE instruction set extensions
+  def XMM0: Register<"xmm0">, DwarfRegNum<[17, 21, 21]>;
+  def XMM1: Register<"xmm1">, DwarfRegNum<[18, 22, 22]>;
+  def XMM2: Register<"xmm2">, DwarfRegNum<[19, 23, 23]>;
+  def XMM3: Register<"xmm3">, DwarfRegNum<[20, 24, 24]>;
+  def XMM4: Register<"xmm4">, DwarfRegNum<[21, 25, 25]>;
+  def XMM5: Register<"xmm5">, DwarfRegNum<[22, 26, 26]>;
+  def XMM6: Register<"xmm6">, DwarfRegNum<[23, 27, 27]>;
+  def XMM7: Register<"xmm7">, DwarfRegNum<[24, 28, 28]>;
+
+  // X86-64 only
+  def XMM8:  Register<"xmm8">,  DwarfRegNum<[25, -2, -2]>;
+  def XMM9:  Register<"xmm9">,  DwarfRegNum<[26, -2, -2]>;
+  def XMM10: Register<"xmm10">, DwarfRegNum<[27, -2, -2]>;
+  def XMM11: Register<"xmm11">, DwarfRegNum<[28, -2, -2]>;
+  def XMM12: Register<"xmm12">, DwarfRegNum<[29, -2, -2]>;
+  def XMM13: Register<"xmm13">, DwarfRegNum<[30, -2, -2]>;
+  def XMM14: Register<"xmm14">, DwarfRegNum<[31, -2, -2]>;
+  def XMM15: Register<"xmm15">, DwarfRegNum<[32, -2, -2]>;
+
+  // YMM Registers, used by AVX instructions
+  def YMM0: Register<"ymm0">, DwarfRegNum<[17, 21, 21]>;
+  def YMM1: Register<"ymm1">, DwarfRegNum<[18, 22, 22]>;
+  def YMM2: Register<"ymm2">, DwarfRegNum<[19, 23, 23]>;
+  def YMM3: Register<"ymm3">, DwarfRegNum<[20, 24, 24]>;
+  def YMM4: Register<"ymm4">, DwarfRegNum<[21, 25, 25]>;
+  def YMM5: Register<"ymm5">, DwarfRegNum<[22, 26, 26]>;
+  def YMM6: Register<"ymm6">, DwarfRegNum<[23, 27, 27]>;
+  def YMM7: Register<"ymm7">, DwarfRegNum<[24, 28, 28]>;
+  def YMM8:  Register<"ymm8">,  DwarfRegNum<[25, -2, -2]>;
+  def YMM9:  Register<"ymm9">,  DwarfRegNum<[26, -2, -2]>;
+  def YMM10: Register<"ymm10">, DwarfRegNum<[27, -2, -2]>;
+  def YMM11: Register<"ymm11">, DwarfRegNum<[28, -2, -2]>;
+  def YMM12: Register<"ymm12">, DwarfRegNum<[29, -2, -2]>;
+  def YMM13: Register<"ymm13">, DwarfRegNum<[30, -2, -2]>;
+  def YMM14: Register<"ymm14">, DwarfRegNum<[31, -2, -2]>;
+  def YMM15: Register<"ymm15">, DwarfRegNum<[32, -2, -2]>;
+
+  // Floating point stack registers
+  def ST0 : Register<"st(0)">, DwarfRegNum<[33, 12, 11]>;
+  def ST1 : Register<"st(1)">, DwarfRegNum<[34, 13, 12]>;
+  def ST2 : Register<"st(2)">, DwarfRegNum<[35, 14, 13]>;
+  def ST3 : Register<"st(3)">, DwarfRegNum<[36, 15, 14]>;
+  def ST4 : Register<"st(4)">, DwarfRegNum<[37, 16, 15]>;
+  def ST5 : Register<"st(5)">, DwarfRegNum<[38, 17, 16]>;
+  def ST6 : Register<"st(6)">, DwarfRegNum<[39, 18, 17]>;
+  def ST7 : Register<"st(7)">, DwarfRegNum<[40, 19, 18]>; 
+
+  // Status flags register
+  def EFLAGS : Register<"flags">;
+
+  // Segment registers
+  def CS : Register<"cs">;
+  def DS : Register<"ds">;
+  def SS : Register<"ss">;
+  def ES : Register<"es">;
+  def FS : Register<"fs">;
+  def GS : Register<"gs">;
+  
+  // Debug registers
+  def DR0 : Register<"dr0">;
+  def DR1 : Register<"dr1">;
+  def DR2 : Register<"dr2">;
+  def DR3 : Register<"dr3">;
+  def DR4 : Register<"dr4">;
+  def DR5 : Register<"dr5">;
+  def DR6 : Register<"dr6">;
+  def DR7 : Register<"dr7">;
+  
+  // Condition registers
+  def ECR0 : Register<"ecr0">;
+  def ECR1 : Register<"ecr1">;
+  def ECR2 : Register<"ecr2">;
+  def ECR3 : Register<"ecr3">;
+  def ECR4 : Register<"ecr4">;
+  def ECR5 : Register<"ecr5">;
+  def ECR6 : Register<"ecr6">;
+  def ECR7 : Register<"ecr7">;
+
+  def RCR0 : Register<"rcr0">;
+  def RCR1 : Register<"rcr1">;
+  def RCR2 : Register<"rcr2">;
+  def RCR3 : Register<"rcr3">;
+  def RCR4 : Register<"rcr4">;
+  def RCR5 : Register<"rcr5">;
+  def RCR6 : Register<"rcr6">;
+  def RCR7 : Register<"rcr7">;
+  def RCR8 : Register<"rcr8">; 
+}
+
+
+//===----------------------------------------------------------------------===//
+// Subregister Set Definitions... now that we have all of the pieces, define the
+// sub registers for each register.
+//
+
+def x86_subreg_8bit    : PatLeaf<(i32 1)>;
+def x86_subreg_8bit_hi : PatLeaf<(i32 2)>;
+def x86_subreg_16bit   : PatLeaf<(i32 3)>;
+def x86_subreg_32bit   : PatLeaf<(i32 4)>;
+
+def : SubRegSet<1, [AX, CX, DX, BX, SP,  BP,  SI,  DI,  
+                    R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W],
+                   [AL, CL, DL, BL, SPL, BPL, SIL, DIL, 
+                    R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [AX, CX, DX, BX],
+                   [AH, CH, DH, BH]>;
+
+def : SubRegSet<1, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,  
+                    R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+                   [AL, CL, DL, BL, SPL, BPL, SIL, DIL, 
+                    R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [EAX, ECX, EDX, EBX],
+                   [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI,
+                    R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D],
+                   [AX,  CX,  DX,  BX,  SP,  BP,  SI,  DI, 
+                    R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+def : SubRegSet<1, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,  
+                    R8,  R9,  R10, R11, R12, R13, R14, R15],
+                   [AL, CL, DL, BL, SPL, BPL, SIL, DIL, 
+                    R8B, R9B, R10B, R11B, R12B, R13B, R14B, R15B]>;
+
+def : SubRegSet<2, [RAX, RCX, RDX, RBX],
+                   [AH, CH, DH, BH]>;
+
+def : SubRegSet<3, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+                    R8,  R9,  R10, R11, R12, R13, R14, R15],
+                   [AX,  CX,  DX,  BX,  SP,  BP,  SI,  DI, 
+                    R8W, R9W, R10W, R11W, R12W, R13W, R14W, R15W]>;
+
+def : SubRegSet<4, [RAX, RCX, RDX, RBX, RSP, RBP, RSI, RDI,
+                    R8,  R9,  R10, R11, R12, R13, R14, R15],
+                   [EAX, ECX, EDX, EBX, ESP, EBP, ESI, EDI, 
+                    R8D, R9D, R10D, R11D, R12D, R13D, R14D, R15D]>;
+
+def : SubRegSet<1, [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,  
+                    YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15],
+                   [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, 
+                    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15]>;
+
+//===----------------------------------------------------------------------===//
+// Register Class Definitions... now that we have all of the pieces, define the
+// top-level register classes.  The order specified in the register list is
+// implicitly defined to be the register allocation order.
+//
+
+// List call-clobbered registers before callee-save registers. RBX, RBP, (and 
+// R12, R13, R14, and R15 for X86-64) are callee-save registers.
+// In 64-bit mode, there are 12 additional i8 registers: SIL, DIL, BPL, SPL, and
+// R8B, ... R15B. 
+// Allocate R12 and R13 last, as these require an extra byte when
+// encoded in x86_64 instructions.
+// FIXME: Allow AH, CH, DH, BH to be used as general-purpose registers in
+// 64-bit mode. The main complication is that they cannot be encoded in an
+// instruction requiring a REX prefix, while SIL, DIL, BPL, R8D, etc.
+// require a REX prefix. For example, "addb %ah, %dil" and "movzbl %ah, %r8d"
+// cannot be encoded.
+def GR8 : RegisterClass<"X86", [i8],  8,
+                        [AL, CL, DL, AH, CH, DH, BL, BH, SIL, DIL, BPL, SPL,
+                         R8B, R9B, R10B, R11B, R14B, R15B, R12B, R13B]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned X86_GR8_AO_64[] = {
+      X86::AL,   X86::CL,   X86::DL,   X86::SIL, X86::DIL,
+      X86::R8B,  X86::R9B,  X86::R10B, X86::R11B,
+      X86::BL,   X86::R14B, X86::R15B, X86::R12B, X86::R13B, X86::BPL
+    };
+
+    GR8Class::iterator
+    GR8Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit())
+        return X86_GR8_AO_64;
+      else
+        return begin();
+    }
+
+    GR8Class::iterator
+    GR8Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      if (!Subtarget.is64Bit())
+        // In 32-bit mode, none of the 8-bit registers aliases EBP or ESP.
+        return begin() + 8;
+      else if (RI->hasFP(MF))
+        // If so, don't allocate SPL or BPL.
+        return array_endof(X86_GR8_AO_64) - 1;
+      else
+        // If not, just don't allocate SPL.
+        return array_endof(X86_GR8_AO_64);
+    }
+  }];
+}
+
+def GR16 : RegisterClass<"X86", [i16], 16,
+                         [AX, CX, DX, SI, DI, BX, BP, SP,
+                          R8W, R9W, R10W, R11W, R14W, R15W, R12W, R13W]> {
+  let SubRegClassList = [GR8, GR8];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned X86_GR16_AO_64[] = {
+      X86::AX,  X86::CX,   X86::DX,   X86::SI,   X86::DI,
+      X86::R8W, X86::R9W,  X86::R10W, X86::R11W,
+      X86::BX, X86::R14W, X86::R15W,  X86::R12W, X86::R13W, X86::BP
+    };
+
+    GR16Class::iterator
+    GR16Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit())
+        return X86_GR16_AO_64;
+      else
+        return begin();
+    }
+
+    GR16Class::iterator
+    GR16Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        // Does the function dedicate RBP to being a frame ptr?
+        if (RI->hasFP(MF))
+          // If so, don't allocate SP or BP.
+          return array_endof(X86_GR16_AO_64) - 1;
+        else
+          // If not, just don't allocate SP.
+          return array_endof(X86_GR16_AO_64);
+      } else {
+        // Does the function dedicate EBP to being a frame ptr?
+        if (RI->hasFP(MF))
+          // If so, don't allocate SP or BP.
+          return begin() + 6;
+        else
+          // If not, just don't allocate SP.
+          return begin() + 7;
+      }
+    }
+  }];
+}
+
+def GR32 : RegisterClass<"X86", [i32], 32, 
+                         [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP,
+                          R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+  let SubRegClassList = [GR8, GR8, GR16];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    static const unsigned X86_GR32_AO_64[] = {
+      X86::EAX, X86::ECX,  X86::EDX,  X86::ESI,  X86::EDI,
+      X86::R8D, X86::R9D,  X86::R10D, X86::R11D,
+      X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
+    };
+
+    GR32Class::iterator
+    GR32Class::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit())
+        return X86_GR32_AO_64;
+      else
+        return begin();
+    }
+
+    GR32Class::iterator
+    GR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        // Does the function dedicate RBP to being a frame ptr?
+        if (RI->hasFP(MF))
+          // If so, don't allocate ESP or EBP.
+          return array_endof(X86_GR32_AO_64) - 1;
+        else
+          // If not, just don't allocate ESP.
+          return array_endof(X86_GR32_AO_64);
+      } else {
+        // Does the function dedicate EBP to being a frame ptr?
+        if (RI->hasFP(MF))
+          // If so, don't allocate ESP or EBP.
+          return begin() + 6;
+        else
+          // If not, just don't allocate ESP.
+          return begin() + 7;
+      }
+    }
+  }];
+}
+
+// GR64 - 64-bit GPRs. This oddly includes RIP, which isn't accurate, since
+// RIP isn't really a register and it can't be used anywhere except in an
+// address, but it doesn't cause trouble.
+def GR64 : RegisterClass<"X86", [i64], 64, 
+                         [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                          RBX, R14, R15, R12, R13, RBP, RSP, RIP]> {
+  let SubRegClassList = [GR8, GR8, GR16, GR32];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GR64Class::iterator
+    GR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return begin();  // None of these are allocatable in 32-bit.
+      if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+        return end()-3;  // If so, don't allocate RIP, RSP or RBP
+      else
+        return end()-2;  // If not, just don't allocate RIP or RSP
+    }
+  }];
+}
+
+// Segment registers for use by MOV instructions (and others) that have a
+//   segment register as one operand.  Always contain a 16-bit segment
+//   descriptor.
+def SEGMENT_REG : RegisterClass<"X86", [i16], 16, [CS, DS, SS, ES, FS, GS]> {
+}
+
+// Debug registers.
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, 
+                              [DR0, DR1, DR2, DR3, DR4, DR5, DR6, DR7]> {
+}
+
+// Control registers.
+def CONTROL_REG_32 : RegisterClass<"X86", [i32], 32,
+                                   [ECR0, ECR1, ECR2, ECR3, ECR4, ECR5, ECR6,
+                                    ECR7]> {
+}
+
+def CONTROL_REG_64 : RegisterClass<"X86", [i64], 64,
+                                   [RCR0, RCR1, RCR2, RCR3, RCR4, RCR5, RCR6,
+                                    RCR7, RCR8]> {
+}
+
+// GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD, GR64_ABCD - Subclasses of
+// GR8, GR16, GR32, and GR64 which contain just the "a" "b", "c", and "d"
+// registers. On x86-32, GR16_ABCD and GR32_ABCD are classes for registers
+// that support 8-bit subreg operations. On x86-64, GR16_ABCD, GR32_ABCD,
+// and GR64_ABCD are classes for registers that support 8-bit h-register
+// operations.
+def GR8_ABCD_L : RegisterClass<"X86", [i8], 8, [AL, CL, DL, BL]> {
+}
+def GR8_ABCD_H : RegisterClass<"X86", [i8], 8, [AH, CH, DH, BH]> {
+}
+def GR16_ABCD : RegisterClass<"X86", [i16], 16, [AX, CX, DX, BX]> {
+  let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H];
+}
+def GR32_ABCD : RegisterClass<"X86", [i32], 32, [EAX, ECX, EDX, EBX]> {
+  let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD];
+}
+def GR64_ABCD : RegisterClass<"X86", [i64], 64, [RAX, RCX, RDX, RBX]> {
+  let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD, GR32_ABCD];
+}
+
+// GR8_NOREX - GR8 registers which do not require a REX prefix.
+def GR8_NOREX : RegisterClass<"X86", [i8], 8,
+                              [AL, CL, DL, AH, CH, DH, BL, BH]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // In 64-bit mode, it's not safe to blindly allocate H registers.
+    static const unsigned X86_GR8_NOREX_AO_64[] = {
+      X86::AL, X86::CL, X86::DL, X86::BL
+    };
+
+    GR8_NOREXClass::iterator
+    GR8_NOREXClass::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit())
+        return X86_GR8_NOREX_AO_64;
+      else
+        return begin();
+    }
+
+    GR8_NOREXClass::iterator
+    GR8_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit())
+        return array_endof(X86_GR8_NOREX_AO_64);
+      else
+        return end();
+    }
+  }];
+}
+// GR16_NOREX - GR16 registers which do not require a REX prefix.
+def GR16_NOREX : RegisterClass<"X86", [i16], 16,
+                               [AX, CX, DX, SI, DI, BX, BP, SP]> {
+  let SubRegClassList = [GR8_NOREX, GR8_NOREX];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GR16_NOREXClass::iterator
+    GR16_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      if (RI->hasFP(MF))
+        // If so, don't allocate SP or BP.
+        return end() - 2;
+      else
+        // If not, just don't allocate SP.
+        return end() - 1;
+    }
+  }];
+}
+// GR32_NOREX - GR32 registers which do not require a REX prefix.
+def GR32_NOREX : RegisterClass<"X86", [i32], 32,
+                               [EAX, ECX, EDX, ESI, EDI, EBX, EBP, ESP]> {
+  let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // EBP and ESP are deliberately placed last in the register list above so
+  // that trimming the end of the allocation order excludes exactly them.
+  let MethodBodies = [{
+    GR32_NOREXClass::iterator
+    GR32_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Does the function dedicate RBP / EBP to being a frame ptr?
+      if (RI->hasFP(MF))
+        // If so, don't allocate ESP or EBP.
+        return end() - 2;
+      else
+        // If not, just don't allocate ESP.
+        return end() - 1;
+    }
+  }];
+}
+// GR64_NOREX - GR64 registers which do not require a REX prefix.
+def GR64_NOREX : RegisterClass<"X86", [i64], 64,
+                               [RAX, RCX, RDX, RSI, RDI, RBX, RBP, RSP, RIP]> {
+  let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // RBP, RSP and RIP are deliberately placed last in the register list above
+  // so that trimming the end of the allocation order excludes exactly them.
+  let MethodBodies = [{
+    GR64_NOREXClass::iterator
+    GR64_NOREXClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Does the function dedicate RBP to being a frame ptr?
+      if (RI->hasFP(MF))
+        // If so, don't allocate RIP, RSP or RBP.
+        return end() - 3;
+      else
+        // If not, just don't allocate RIP or RSP.
+        return end() - 2;
+    }
+  }];
+}
+
+// GR32_NOSP - GR32 registers except ESP.
+def GR32_NOSP : RegisterClass<"X86", [i32], 32,
+                              [EAX, ECX, EDX, ESI, EDI, EBX, EBP,
+                               R8D, R9D, R10D, R11D, R14D, R15D, R12D, R13D]> {
+  let SubRegClassList = [GR8, GR8, GR16];
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    // 64-bit allocation order. EBP is listed last so the end of the order can
+    // be trimmed when EBP is dedicated as the frame pointer.
+    static const unsigned X86_GR32_NOSP_AO_64[] = {
+      X86::EAX, X86::ECX,  X86::EDX,  X86::ESI,  X86::EDI,
+      X86::R8D, X86::R9D,  X86::R10D, X86::R11D,
+      X86::EBX, X86::R14D, X86::R15D, X86::R12D, X86::R13D, X86::EBP
+    };
+
+    GR32_NOSPClass::iterator
+    GR32_NOSPClass::allocation_order_begin(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit())
+        return X86_GR32_NOSP_AO_64;
+      else
+        return begin();
+    }
+
+    GR32_NOSPClass::iterator
+    GR32_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (Subtarget.is64Bit()) {
+        // Does the function dedicate RBP to being a frame ptr?
+        if (RI->hasFP(MF))
+          // If so, don't allocate EBP.
+          return array_endof(X86_GR32_NOSP_AO_64) - 1;
+        else
+          // If not, any reg in this class is ok.
+          return array_endof(X86_GR32_NOSP_AO_64);
+      } else {
+        // In 32-bit mode only the first seven registers of the declared list
+        // (EAX..EBP) exist; begin() + 6 additionally drops EBP.
+        // Does the function dedicate EBP to being a frame ptr?
+        if (RI->hasFP(MF))
+          // If so, don't allocate EBP.
+          return begin() + 6;
+        else
+          // If not, any reg in this class is ok.
+          return begin() + 7;
+      }
+    }
+  }];
+}
+
+// GR64_NOSP - GR64 registers except RSP (and RIP).
+def GR64_NOSP : RegisterClass<"X86", [i64], 64,
+                              [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
+                               RBX, R14, R15, R12, R13, RBP]> {
+  let SubRegClassList = [GR8, GR8, GR16, GR32_NOSP];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // RBP is listed last so that end() - 1 excludes exactly it when it is
+  // dedicated as the frame pointer.
+  let MethodBodies = [{
+    GR64_NOSPClass::iterator
+    GR64_NOSPClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return begin();  // None of these are allocatable in 32-bit.
+      if (RI->hasFP(MF)) // Does the function dedicate RBP to being a frame ptr?
+        return end()-1;  // If so, don't allocate RBP
+      else
+        return end();  // If not, any reg in this class is ok.
+    }
+  }];
+}
+
+// GR64_NOREX_NOSP - GR64_NOREX registers except RSP.
+def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
+                                    [RAX, RCX, RDX, RSI, RDI, RBX, RBP]> {
+  let SubRegClassList = [GR8_NOREX, GR8_NOREX, GR16_NOREX, GR32_NOREX];
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // RBP is listed last so that end() - 1 excludes exactly it when it is
+  // dedicated as the frame pointer.
+  let MethodBodies = [{
+    GR64_NOREX_NOSPClass::iterator
+    GR64_NOREX_NOSPClass::allocation_order_end(const MachineFunction &MF) const
+  {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      // Does the function dedicate RBP to being a frame ptr?
+      if (RI->hasFP(MF))
+        // If so, don't allocate RBP.
+        return end() - 1;
+      else
+        // If not, any reg in this class is ok.
+        return end();
+    }
+  }];
+}
+
+// A class to support the 'A' assembler constraint: EAX then EDX.
+// The order matters: 'A' denotes the EDX:EAX register pair.
+def GR32_AD : RegisterClass<"X86", [i32], 32, [EAX, EDX]> {
+  let SubRegClassList = [GR8_ABCD_L, GR8_ABCD_H, GR16_ABCD];
+}
+
+// Scalar SSE2 floating point registers.
+def FR32 : RegisterClass<"X86", [f32], 32,
+                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                          XMM8, XMM9, XMM10, XMM11,
+                          XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // XMM8-XMM15 are listed last so that end() - 8 trims them off in 32-bit
+  // mode, where they do not exist.
+  let MethodBodies = [{
+    FR32Class::iterator
+    FR32Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
+
+// Scalar SSE2 double precision floating point registers; mirrors FR32.
+def FR64 : RegisterClass<"X86", [f64], 64,
+                         [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                          XMM8, XMM9, XMM10, XMM11,
+                          XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // XMM8-XMM15 are listed last so that end() - 8 trims them off in 32-bit
+  // mode, where they do not exist.
+  let MethodBodies = [{
+    FR64Class::iterator
+    FR64Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
+
+
+// FIXME: This sets up the floating point register files as though they are f64
+// values, though they really are f80 values.  This will cause us to spill
+// values as 64-bit quantities instead of 80-bit quantities, which is much much
+// faster on common hardware.  In reality, this should be controlled by a
+// command line option or something.
+
+// RFP{32,64,80} - x87 floating point values of each width, all mapped onto
+// the FP0-FP6 pseudo registers (see the FIXME above regarding spill size).
+def RFP32 : RegisterClass<"X86",[f32], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP64 : RegisterClass<"X86",[f64], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+def RFP80 : RegisterClass<"X86",[f80], 32, [FP0, FP1, FP2, FP3, FP4, FP5, FP6]>;
+
+// Floating point stack registers (these are not allocatable by the
+// register allocator - the floating point stackifier is responsible
+// for transforming FPn allocations to STn registers)
+def RST : RegisterClass<"X86", [f80, f64, f32], 32,
+                        [ST0, ST1, ST2, ST3, ST4, ST5, ST6, ST7]> {
+    let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // Returning begin() makes the allocation order empty, so the register
+  // allocator can never hand out an ST register directly.
+  let MethodBodies = [{
+    RSTClass::iterator
+    RSTClass::allocation_order_end(const MachineFunction &MF) const {
+      return begin();
+    }
+  }];
+}
+
+// Generic vector registers: VR64 and VR128.
+def VR64  : RegisterClass<"X86", [v8i8, v4i16, v2i32, v1i64, v2f32], 64,
+                          [MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7]>;
+def VR128 : RegisterClass<"X86", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],128,
+                          [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
+                           XMM8, XMM9, XMM10, XMM11,
+                           XMM12, XMM13, XMM14, XMM15]> {
+  let MethodProtos = [{
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  // XMM8-XMM15 are listed last so that end() - 8 trims them off in 32-bit
+  // mode, where they do not exist.
+  let MethodBodies = [{
+    VR128Class::iterator
+    VR128Class::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
+      if (!Subtarget.is64Bit())
+        return end()-8; // Only XMM0 to XMM7 are available in 32-bit mode.
+      else
+        return end();
+    }
+  }];
+}
+// VR256 - 256-bit YMM vector registers.
+def VR256 : RegisterClass<"X86", [ v8i32, v4i64, v8f32, v4f64],256,
+                          [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+                           YMM8, YMM9, YMM10, YMM11,
+                           YMM12, YMM13, YMM14, YMM15]>;
+
+// Status flags registers.
+// Holds only EFLAGS; the negative CopyCost marks it as uncopyable.
+def CCR : RegisterClass<"X86", [i32], 32, [EFLAGS]> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+}
diff --git a/lib/Target/X86/X86Relocations.h b/lib/Target/X86/X86Relocations.h
new file mode 100644
index 0000000..990962d
--- /dev/null
+++ b/lib/Target/X86/X86Relocations.h
@@ -0,0 +1,52 @@
+//===- X86Relocations.h - X86 Code Relocations ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 target-specific relocation types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86RELOCATIONS_H
+#define X86RELOCATIONS_H
+
+#include "llvm/CodeGen/MachineRelocation.h"
+
+namespace llvm {
+  namespace X86 {
+    /// RelocationType - An enum for the x86 relocation codes. Note that
+    /// the terminology here doesn't follow x86 convention - word means
+    /// 32-bit and dword means 64-bit. These relocations are resolved by the
+    /// JIT or object-code emitters; they are transparent to the x86 code
+    /// emitter itself, but the JIT and object-code paths handle them
+    /// differently.
+    enum RelocationType {
+      /// reloc_pcrel_word - PC relative relocation, add the relocated value to
+      /// the value already in memory, after we adjust it for where the PC is.
+      reloc_pcrel_word = 0,
+
+      /// reloc_picrel_word - PIC base relative relocation, add the relocated
+      /// value to the value already in memory, after we adjust it for where the
+      /// PIC base is.
+      reloc_picrel_word = 1,
+
+      /// reloc_absolute_word - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_word = 2,
+
+      /// reloc_absolute_word_sext - absolute relocation, just add the relocated
+      /// value to the value already in memory. In object files, it represents a
+      /// value which must be sign-extended when resolving the relocation.
+      reloc_absolute_word_sext = 3,
+
+      /// reloc_absolute_dword - absolute relocation, just add the relocated
+      /// value to the value already in memory.
+      reloc_absolute_dword = 4
+    };
+  }
+}
+
+#endif
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
new file mode 100644
index 0000000..adef5bc
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -0,0 +1,372 @@
+//===-- X86Subtarget.cpp - X86 Subtarget Information ------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "subtarget"
+#include "X86Subtarget.h"
+#include "X86InstrInfo.h"
+#include "X86GenSubtarget.inc"
+#include "llvm/GlobalValue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/System/Host.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/ADT/SmallVector.h"
+using namespace llvm;
+
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+/// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context. Returns one of the X86II::MO_* target operand flags.
+unsigned char X86Subtarget::
+ClassifyBlockAddressReference() const {
+  if (isPICStyleGOT())    // 32-bit ELF targets.
+    return X86II::MO_GOTOFF;
+  
+  if (isPICStyleStubPIC())   // Darwin/32 in PIC mode.
+    return X86II::MO_PIC_BASE_OFFSET;
+  
+  // Direct static reference to label.
+  return X86II::MO_NO_FLAG;
+}
+
+/// ClassifyGlobalReference - Classify a global variable reference for the
+/// current subtarget according to how we should reference it in a non-pcrel
+/// context. Returns one of the X86II::MO_* target operand flags.
+unsigned char X86Subtarget::
+ClassifyGlobalReference(const GlobalValue *GV, const TargetMachine &TM) const {
+  // DLLImport only exists on windows, it is implemented as a load from a
+  // DLLIMPORT stub.
+  if (GV->hasDLLImportLinkage())
+    return X86II::MO_DLLIMPORT;
+
+  // Materializable GVs (in JIT lazy compilation mode) do not require an
+  // extra load from stub.
+  bool isDecl = GV->isDeclaration() && !GV->isMaterializable();
+
+  // X86-64 in PIC mode.
+  if (isPICStyleRIPRel()) {
+    // Large model never uses stubs.
+    if (TM.getCodeModel() == CodeModel::Large)
+      return X86II::MO_NO_FLAG;
+      
+    if (isTargetDarwin()) {
+      // If symbol visibility is hidden, the extra load is not needed if
+      // target is x86-64 or the symbol is definitely defined in the current
+      // translation unit.
+      // NOTE(review): this condition selects the symbols that *do* need the
+      // GOT load - default visibility and not definitely defined here.
+      if (GV->hasDefaultVisibility() &&
+          (isDecl || GV->isWeakForLinker()))
+        return X86II::MO_GOTPCREL;
+    } else {
+      assert(isTargetELF() && "Unknown rip-relative target");
+
+      // Extra load is needed for all externally visible.
+      if (!GV->hasLocalLinkage() && GV->hasDefaultVisibility())
+        return X86II::MO_GOTPCREL;
+    }
+
+    return X86II::MO_NO_FLAG;
+  }
+  
+  if (isPICStyleGOT()) {   // 32-bit ELF targets.
+    // Extra load is needed for all externally visible.
+    if (GV->hasLocalLinkage() || GV->hasHiddenVisibility())
+      return X86II::MO_GOTOFF;
+    return X86II::MO_GOT;
+  }
+  
+  if (isPICStyleStubPIC()) {  // Darwin/32 in PIC mode.
+    // Determine whether we have a stub reference and/or whether the reference
+    // is relative to the PIC base or not.
+    
+    // If this is a strong reference to a definition, it is definitely not
+    // through a stub.
+    if (!isDecl && !GV->isWeakForLinker())
+      return X86II::MO_PIC_BASE_OFFSET;
+
+    // Unless we have a symbol with hidden visibility, we have to go through a
+    // normal $non_lazy_ptr stub because this symbol might be resolved late.
+    if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_NONLAZY_PIC_BASE;
+    
+    // If symbol visibility is hidden, we have a stub for common symbol
+    // references and external declarations.
+    if (isDecl || GV->hasCommonLinkage()) {
+      // Hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE;
+    }
+    
+    // Otherwise, no stub.
+    return X86II::MO_PIC_BASE_OFFSET;
+  }
+  
+  if (isPICStyleStubNoDynamic()) {  // Darwin/32 in -mdynamic-no-pic mode.
+    // Determine whether we have a stub reference.
+    
+    // If this is a strong reference to a definition, it is definitely not
+    // through a stub.
+    if (!isDecl && !GV->isWeakForLinker())
+      return X86II::MO_NO_FLAG;
+    
+    // Unless we have a symbol with hidden visibility, we have to go through a
+    // normal $non_lazy_ptr stub because this symbol might be resolved late.
+    if (!GV->hasHiddenVisibility())  // Non-hidden $non_lazy_ptr reference.
+      return X86II::MO_DARWIN_NONLAZY;
+
+    // Otherwise, no stub.
+    return X86II::MO_NO_FLAG;
+  }
+  
+  // Direct static reference to global.
+  return X86II::MO_NO_FLAG;
+}
+
+
+/// getBZeroEntry - This function returns the name of a function which has an
+/// interface like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered preferable over memset with zero
+/// passed as the second argument. Otherwise it returns null.
+const char *X86Subtarget::getBZeroEntry() const {
+  // Darwin 10 has a __bzero entry point for this purpose.
+  if (getDarwinVers() >= 10)
+    return "__bzero";
+
+  // No suitable entry point on this subtarget.
+  return 0;
+}
+
+/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+/// to immediate address. Never legal in 64-bit mode; in 32-bit mode it is
+/// allowed on ELF targets or when relocations are static.
+bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
+  if (Is64Bit)
+    return false;
+  return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
+}
+
+/// getSpecialAddressLatency - For targets where it is beneficial to
+/// backschedule instructions that compute addresses, return a value
+/// indicating the number of scheduling cycles of backscheduling that
+/// should be attempted.
+unsigned X86Subtarget::getSpecialAddressLatency() const {
+  // For x86 out-of-order targets, back-schedule address computations so
+  // that loads and stores aren't blocked.
+  // This value was chosen arbitrarily.
+  return 200;
+}
+
+/// GetCpuIDAndInfo - Execute the specified cpuid and return the 4 values in the
+/// specified arguments.  If we can't run cpuid on the host, return true.
+static bool GetCpuIDAndInfo(unsigned value, unsigned *rEAX,
+                            unsigned *rEBX, unsigned *rECX, unsigned *rEDX) {
+#if defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
+  #if defined(__GNUC__)
+    // gcc doesn't know cpuid would clobber ebx/rbx. Preserve it manually.
+    asm ("movq\t%%rbx, %%rsi\n\t"
+         "cpuid\n\t"
+         "xchgq\t%%rbx, %%rsi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    // MSVC x86-64 does not support inline asm; use the __cpuid intrinsic.
+    int registers[4];
+    __cpuid(registers, value);
+    *rEAX = registers[0];
+    *rEBX = registers[1];
+    *rECX = registers[2];
+    *rEDX = registers[3];
+    return false;
+  #endif
+#elif defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)
+  #if defined(__GNUC__)
+    asm ("movl\t%%ebx, %%esi\n\t"
+         "cpuid\n\t"
+         "xchgl\t%%ebx, %%esi\n\t"
+         : "=a" (*rEAX),
+           "=S" (*rEBX),
+           "=c" (*rECX),
+           "=d" (*rEDX)
+         :  "a" (value));
+    return false;
+  #elif defined(_MSC_VER)
+    __asm {
+      mov   eax,value
+      cpuid
+      mov   esi,rEAX
+      mov   dword ptr [esi],eax
+      mov   esi,rEBX
+      mov   dword ptr [esi],ebx
+      mov   esi,rECX
+      mov   dword ptr [esi],ecx
+      mov   esi,rEDX
+      mov   dword ptr [esi],edx
+    }
+    return false;
+  #endif
+#endif
+  // Host is not x86 (or an unsupported compiler): cpuid is unavailable.
+  return true;
+}
+
+/// DetectFamilyModel - Decode the family and model fields from the EAX value
+/// returned by CPUID leaf 1, folding in the extended family/model fields when
+/// the base family is 6 or 0xf.
+static void DetectFamilyModel(unsigned EAX, unsigned &Family, unsigned &Model) {
+  Family = (EAX >> 8) & 0xf; // Bits 8 - 11
+  Model  = (EAX >> 4) & 0xf; // Bits 4 - 7
+  if (Family == 6 || Family == 0xf) {
+    if (Family == 0xf)
+      // Examine extended family ID if family ID is F.
+      Family += (EAX >> 20) & 0xff;    // Bits 20 - 27
+    // Examine extended model ID if family ID is 6 or F.
+    Model += ((EAX >> 16) & 0xf) << 4; // Bits 16 - 19
+  }
+}
+
+/// AutoDetectSubtargetFeatures - Probe the host CPU with CPUID and set the
+/// subtarget feature flags (SSE level, CMov, 64-bit support, FMA, etc.)
+/// accordingly. Does nothing if cpuid cannot be executed on the host.
+void X86Subtarget::AutoDetectSubtargetFeatures() {
+  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
+  union {
+    unsigned u[3];
+    char     c[12];
+  } text;
+  
+  // CPUID leaf 0 returns the vendor string in EBX, EDX, ECX order; the
+  // argument order below (u+0, u+2, u+1 for EBX, ECX, EDX) lays it out
+  // contiguously in text.c.
+  if (GetCpuIDAndInfo(0, &EAX, text.u+0, text.u+2, text.u+1))
+    return;
+
+  // Leaf 1: feature bits in EDX and ECX.
+  GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
+  
+  if ((EDX >> 15) & 1) HasCMov = true;
+  if ((EDX >> 23) & 1) X86SSELevel = MMX;
+  if ((EDX >> 25) & 1) X86SSELevel = SSE1;
+  if ((EDX >> 26) & 1) X86SSELevel = SSE2;
+  if (ECX & 0x1)       X86SSELevel = SSE3;
+  if ((ECX >> 9)  & 1) X86SSELevel = SSSE3;
+  if ((ECX >> 19) & 1) X86SSELevel = SSE41;
+  if ((ECX >> 20) & 1) X86SSELevel = SSE42;
+
+  bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
+  bool IsAMD   = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
+
+  HasFMA3 = IsIntel && ((ECX >> 12) & 0x1);
+  HasAVX = ((ECX >> 28) & 0x1);
+
+  if (IsIntel || IsAMD) {
+    // Determine if bit test memory instructions are slow.
+    unsigned Family = 0;
+    unsigned Model  = 0;
+    DetectFamilyModel(EAX, Family, Model);
+    IsBTMemSlow = IsAMD || (Family == 6 && Model >= 13);
+
+    // Extended leaf 0x80000001: 64-bit capability and AMD-specific features.
+    GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
+    HasX86_64 = (EDX >> 29) & 0x1;
+    HasSSE4A = IsAMD && ((ECX >> 6) & 0x1);
+    HasFMA4 = IsAMD && ((ECX >> 16) & 0x1);
+  }
+}
+
+/// X86Subtarget constructor - Initialize feature flags from the feature
+/// string FS (or by host auto-detection when FS is empty), then derive the
+/// target OS flavor and stack alignment from the triple TT.
+X86Subtarget::X86Subtarget(const std::string &TT, const std::string &FS, 
+                           bool is64Bit)
+  : PICStyle(PICStyles::None)
+  , X86SSELevel(NoMMXSSE)
+  , X863DNowLevel(NoThreeDNow)
+  , HasCMov(false)
+  , HasX86_64(false)
+  , HasSSE4A(false)
+  , HasAVX(false)
+  , HasFMA3(false)
+  , HasFMA4(false)
+  , IsBTMemSlow(false)
+  , HasVectorUAMem(false)
+  , DarwinVers(0)
+  , stackAlignment(8)
+  // FIXME: this is a known good value for Yonah. How about others?
+  , MaxInlineSizeThreshold(128)
+  , Is64Bit(is64Bit)
+  , TargetType(isELF) { // Default to ELF unless otherwise specified.
+
+  // default to hard float ABI
+  if (FloatABIType == FloatABI::Default)
+    FloatABIType = FloatABI::Hard;
+    
+  // Determine default and user specified characteristics
+  if (!FS.empty()) {
+    // If feature string is not empty, parse features string.
+    std::string CPU = sys::getHostCPUName();
+    ParseSubtargetFeatures(FS, CPU);
+    // All X86-64 CPUs also have SSE2, however user might request no SSE via 
+    // -mattr, so don't force SSELevel here.
+  } else {
+    // Otherwise, use CPUID to auto-detect feature set.
+    AutoDetectSubtargetFeatures();
+    // Make sure SSE2 is enabled; it is available on all X86-64 CPUs.
+    if (Is64Bit && X86SSELevel < SSE2)
+      X86SSELevel = SSE2;
+  }
+
+  // If requesting codegen for X86-64, make sure that 64-bit features
+  // are enabled.
+  if (Is64Bit)
+    HasX86_64 = true;
+
+  DEBUG(dbgs() << "Subtarget features: SSELevel " << X86SSELevel
+               << ", 3DNowLevel " << X863DNowLevel
+               << ", 64bit " << HasX86_64 << "\n");
+  assert((!Is64Bit || HasX86_64) &&
+         "64-bit code requested on a subtarget that doesn't support it!");
+
+  // Set the boolean corresponding to the current target triple, or the default
+  // if one cannot be determined, to true.
+  if (TT.length() > 5) {
+    size_t Pos;
+    if ((Pos = TT.find("-darwin")) != std::string::npos) {
+      TargetType = isDarwin;
+      
+      // Compute the darwin version number.
+      if (isdigit(TT[Pos+7]))
+        DarwinVers = atoi(&TT[Pos+7]);
+      else
+        DarwinVers = 8;  // Minimum supported darwin is Tiger.
+    } else if (TT.find("linux") != std::string::npos) {
+      // Linux doesn't imply ELF, but we don't currently support anything else.
+      TargetType = isELF;
+    } else if (TT.find("cygwin") != std::string::npos) {
+      TargetType = isCygwin;
+    } else if (TT.find("mingw") != std::string::npos) {
+      TargetType = isMingw;
+    } else if (TT.find("win32") != std::string::npos) {
+      TargetType = isWindows;
+    } else if (TT.find("windows") != std::string::npos) {
+      TargetType = isWindows;
+    } else if (TT.find("-cl") != std::string::npos) {
+      // NOTE(review): "-cl" triples are mapped to Darwin 9 here - confirm
+      // which toolchain this spelling corresponds to.
+      TargetType = isDarwin;
+      DarwinVers = 9;
+    }
+  }
+
+  // Stack alignment is 16 bytes on Darwin (both 32 and 64 bit) and for all 64
+  // bit targets.
+  if (TargetType == isDarwin || Is64Bit)
+    stackAlignment = 16;
+
+  // A command-line override of the stack alignment takes precedence.
+  if (StackAlignment)
+    stackAlignment = StackAlignment;
+}
+
+/// enablePostRAScheduler - Enable post-register-allocation scheduling with
+/// critical-path anti-dependency breaking, but only at aggressive
+/// optimization levels.
+bool X86Subtarget::enablePostRAScheduler(
+            CodeGenOpt::Level OptLevel,
+            TargetSubtarget::AntiDepBreakMode& Mode,
+            RegClassVector& CriticalPathRCs) const {
+  Mode = TargetSubtarget::ANTIDEP_CRITICAL;
+  CriticalPathRCs.clear();
+  return OptLevel >= CodeGenOpt::Aggressive;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
new file mode 100644
index 0000000..618dd10
--- /dev/null
+++ b/lib/Target/X86/X86Subtarget.h
@@ -0,0 +1,241 @@
+//=====---- X86Subtarget.h - Define Subtarget for the X86 -----*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86SUBTARGET_H
+#define X86SUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include <string>
+
+namespace llvm {
+class GlobalValue;
+class TargetMachine;
+  
+/// PICStyles - The X86 backend supports a number of different styles of PIC.
+/// The style in effect is recorded per-subtarget (see X86Subtarget::PICStyle).
+namespace PICStyles {
+enum Style {
+  StubPIC,          // Used on i386-darwin in -fPIC mode.
+  StubDynamicNoPIC, // Used on i386-darwin in -mdynamic-no-pic mode.
+  GOT,              // Used on many 32-bit unices in -fPIC mode.
+  RIPRel,           // Used on X86-64 when not in -static mode.
+  None              // Set when in -static mode (not PIC or DynamicNoPIC mode).
+};
+}
+
+class X86Subtarget : public TargetSubtarget {
+protected:
+  enum X86SSEEnum {
+    NoMMXSSE, MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42
+  };
+
+  enum X863DNowEnum {
+    NoThreeDNow, ThreeDNow, ThreeDNowA
+  };
+
+  /// PICStyle - Which PIC style to use
+  ///
+  PICStyles::Style PICStyle;
+  
+  /// X86SSELevel - MMX, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, or
+  /// none supported.
+  X86SSEEnum X86SSELevel;
+
+  /// X863DNowLevel - 3DNow or 3DNow Athlon, or none supported.
+  ///
+  X863DNowEnum X863DNowLevel;
+
+  /// HasCMov - True if this processor has conditional move instructions
+  /// (generally pentium pro+).
+  bool HasCMov;
+  
+  /// HasX86_64 - True if the processor supports X86-64 instructions.
+  ///
+  bool HasX86_64;
+
+  /// HasSSE4A - True if the processor supports SSE4A instructions.
+  bool HasSSE4A;
+
+  /// HasAVX - Target has AVX instructions
+  bool HasAVX;
+
+  /// HasFMA3 - Target has 3-operand fused multiply-add
+  bool HasFMA3;
+
+  /// HasFMA4 - Target has 4-operand fused multiply-add
+  bool HasFMA4;
+
+  /// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
+  bool IsBTMemSlow;
+
+  /// HasVectorUAMem - True if SIMD operations can have unaligned memory
+  /// operands. This may require setting a feature bit in the processor.
+  bool HasVectorUAMem;
+
+  /// DarwinVers - Nonzero if this is a darwin platform: the numeric
+  /// version of the platform, e.g. 8 = 10.4 (Tiger), 9 = 10.5 (Leopard), etc.
+  unsigned char DarwinVers; // Is any darwin-x86 platform.
+
+  /// stackAlignment - The minimum alignment known to hold of the stack frame on
+  /// entry to the function and which must be maintained by every function.
+  unsigned stackAlignment;
+
+  /// Max. memset / memcpy size that is turned into rep/movs, rep/stos ops.
+  ///
+  unsigned MaxInlineSizeThreshold;
+
+private:
+  /// Is64Bit - True if the processor supports 64-bit instructions and
+  /// pointer size is 64 bit.
+  bool Is64Bit;
+
+public:
+  /// TargetType - The OS flavor of the target triple; set by the constructor.
+  enum {
+    isELF, isCygwin, isDarwin, isWindows, isMingw
+  } TargetType;
+
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  X86Subtarget(const std::string &TT, const std::string &FS, bool is64Bit);
+
+  /// getStackAlignment - Returns the minimum alignment known to hold of the
+  /// stack frame on entry to the function and which must be maintained by every
+  /// function for this subtarget.
+  unsigned getStackAlignment() const { return stackAlignment; }
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const { return MaxInlineSizeThreshold; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+
+  /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
+  /// instruction.
+  void AutoDetectSubtargetFeatures();
+
+  bool is64Bit() const { return Is64Bit; }
+
+  PICStyles::Style getPICStyle() const { return PICStyle; }
+  void setPICStyle(PICStyles::Style Style)  { PICStyle = Style; }
+
+  // SSE/3DNow level queries: each level implies all the levels below it.
+  bool hasMMX() const { return X86SSELevel >= MMX; }
+  bool hasSSE1() const { return X86SSELevel >= SSE1; }
+  bool hasSSE2() const { return X86SSELevel >= SSE2; }
+  bool hasSSE3() const { return X86SSELevel >= SSE3; }
+  bool hasSSSE3() const { return X86SSELevel >= SSSE3; }
+  bool hasSSE41() const { return X86SSELevel >= SSE41; }
+  bool hasSSE42() const { return X86SSELevel >= SSE42; }
+  bool hasSSE4A() const { return HasSSE4A; }
+  bool has3DNow() const { return X863DNowLevel >= ThreeDNow; }
+  bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
+  bool hasAVX() const { return HasAVX; }
+  bool hasFMA3() const { return HasFMA3; }
+  bool hasFMA4() const { return HasFMA4; }
+  bool isBTMemSlow() const { return IsBTMemSlow; }
+  bool hasVectorUAMem() const { return HasVectorUAMem; }
+
+  bool isTargetDarwin() const { return TargetType == isDarwin; }
+  bool isTargetELF() const { return TargetType == isELF; }
+  
+  bool isTargetWindows() const { return TargetType == isWindows; }
+  bool isTargetMingw() const { return TargetType == isMingw; }
+  bool isTargetCygwin() const { return TargetType == isCygwin; }
+  bool isTargetCygMing() const {
+    return TargetType == isMingw || TargetType == isCygwin;
+  }
+  
+  /// isTargetCOFF - Return true if this is any COFF/Windows target variant.
+  bool isTargetCOFF() const {
+    return TargetType == isMingw || TargetType == isCygwin ||
+           TargetType == isWindows;
+  }
+  
+  bool isTargetWin64() const {
+    return Is64Bit && (TargetType == isMingw || TargetType == isWindows);
+  }
+
+  /// getDataLayout - Return the target data layout string, which varies with
+  /// pointer width and the OS-specific f64/i64/f80 alignment rules.
+  std::string getDataLayout() const {
+    const char *p;
+    if (is64Bit())
+      p = "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-n8:16:32:64";
+    else if (isTargetDarwin())
+      p = "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-n8:16:32";
+    else if (isTargetMingw() || isTargetWindows())
+      p = "e-p:32:32-f64:64:64-i64:64:64-f80:128:128-n8:16:32";
+    else
+      p = "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-n8:16:32";
+
+    return std::string(p);
+  }
+
+  bool isPICStyleSet() const { return PICStyle != PICStyles::None; }
+  bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
+  bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+
+  bool isPICStyleStubPIC() const {
+    return PICStyle == PICStyles::StubPIC;
+  }
+
+  bool isPICStyleStubNoDynamic() const {
+    return PICStyle == PICStyles::StubDynamicNoPIC;
+  }
+  bool isPICStyleStubAny() const {
+    return PICStyle == PICStyles::StubDynamicNoPIC ||
+           PICStyle == PICStyles::StubPIC; }
+  
+  /// getDarwinVers - Return the darwin version number, 8 = Tiger, 9 = Leopard,
+  /// 10 = Snow Leopard, etc.
+  unsigned getDarwinVers() const { return DarwinVers; }
+    
+  /// ClassifyGlobalReference - Classify a global variable reference for the
+  /// current subtarget according to how we should reference it in a non-pcrel
+  /// context.
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM)const;
+
+  /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
+  /// current subtarget according to how we should reference it in a non-pcrel
+  /// context.
+  unsigned char ClassifyBlockAddressReference() const;
+
+  /// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
+  /// to immediate address.
+  bool IsLegalToCallImmediateAddr(const TargetMachine &TM) const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered preferable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
+
+  /// getSpecialAddressLatency - For targets where it is beneficial to
+  /// backschedule instructions that compute addresses, return a value
+  /// indicating the number of scheduling cycles of backscheduling that
+  /// should be attempted.
+  unsigned getSpecialAddressLatency() const;
+
+  /// enablePostRAScheduler - X86 target is enabling post-alloc scheduling
+  /// at 'More' optimization level.
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             TargetSubtarget::AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const;
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
new file mode 100644
index 0000000..f835e29
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -0,0 +1,232 @@
+//===-- X86TargetMachine.cpp - Define TargetMachine for the X86 -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCAsmInfo.h"
+#include "X86TargetMachine.h"
+#include "X86.h"
+#include "llvm/PassManager.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+static const MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) {
+  Triple TheTriple(TT);
+  switch (TheTriple.getOS()) {
+  case Triple::Darwin:
+    return new X86MCAsmInfoDarwin(TheTriple);
+  case Triple::MinGW32:
+  case Triple::MinGW64:
+  case Triple::Cygwin:
+    return new X86MCAsmInfoCOFF(TheTriple);
+  case Triple::Win32:
+    return new X86WinMCAsmInfo(TheTriple);
+  default:
+    return new X86ELFMCAsmInfo(TheTriple);
+  }
+}
+
+extern "C" void LLVMInitializeX86Target() { 
+  // Register the target.
+  RegisterTargetMachine<X86_32TargetMachine> X(TheX86_32Target);
+  RegisterTargetMachine<X86_64TargetMachine> Y(TheX86_64Target);
+
+  // Register the target asm info.
+  RegisterAsmInfoFn A(TheX86_32Target, createMCAsmInfo);
+  RegisterAsmInfoFn B(TheX86_64Target, createMCAsmInfo);
+
+  // Register the code emitter.
+  // FIXME: Remove the heinous one when the new one works.
+  TargetRegistry::RegisterCodeEmitter(TheX86_32Target,
+                                      createHeinousX86MCCodeEmitter);
+  TargetRegistry::RegisterCodeEmitter(TheX86_64Target,
+                                      createHeinousX86MCCodeEmitter);
+}
+
+
+X86_32TargetMachine::X86_32TargetMachine(const Target &T, const std::string &TT,
+                                         const std::string &FS)
+  : X86TargetMachine(T, TT, FS, false) {
+}
+
+
+X86_64TargetMachine::X86_64TargetMachine(const Target &T, const std::string &TT,
+                                         const std::string &FS)
+  : X86TargetMachine(T, TT, FS, true) {
+}
+
+/// X86TargetMachine ctor - Create an X86 target.
+///
+X86TargetMachine::X86TargetMachine(const Target &T, const std::string &TT, 
+                                   const std::string &FS, bool is64Bit)
+  : LLVMTargetMachine(T, TT), 
+    Subtarget(TT, FS, is64Bit),
+    DataLayout(Subtarget.getDataLayout()),
+    FrameInfo(TargetFrameInfo::StackGrowsDown,
+              Subtarget.getStackAlignment(),
+              (Subtarget.isTargetWin64() ? -40 :
+               (Subtarget.is64Bit() ? -8 : -4))),
+    InstrInfo(*this), JITInfo(*this), TLInfo(*this), ELFWriterInfo(*this) {
+  DefRelocModel = getRelocationModel();
+      
+  // If no relocation model was picked, default as appropriate for the target.
+  if (getRelocationModel() == Reloc::Default) {
+    if (!Subtarget.isTargetDarwin())
+      setRelocationModel(Reloc::Static);
+    else if (Subtarget.is64Bit())
+      setRelocationModel(Reloc::PIC_);
+    else
+      setRelocationModel(Reloc::DynamicNoPIC);
+  }
+
+  assert(getRelocationModel() != Reloc::Default &&
+         "Relocation mode not picked");
+
+  // ELF and X86-64 don't have a distinct DynamicNoPIC model.  DynamicNoPIC
+  // is defined as a model for code which may be used in static or dynamic
+  // executables but not necessarily a shared library. On X86-32 we just
+  // compile in -static mode, in x86-64 we use PIC.
+  if (getRelocationModel() == Reloc::DynamicNoPIC) {
+    if (is64Bit)
+      setRelocationModel(Reloc::PIC_);
+    else if (!Subtarget.isTargetDarwin())
+      setRelocationModel(Reloc::Static);
+  }
+
+  // If we are on Darwin, disallow static relocation model in X86-64 mode, since
+  // the Mach-O file format doesn't support it.
+  if (getRelocationModel() == Reloc::Static &&
+      Subtarget.isTargetDarwin() &&
+      is64Bit)
+    setRelocationModel(Reloc::PIC_);
+      
+  // Determine the PICStyle based on the target selected.
+  if (getRelocationModel() == Reloc::Static) {
+    // Unless we're in PIC or DynamicNoPIC mode, set the PIC style to None.
+    Subtarget.setPICStyle(PICStyles::None);
+  } else if (Subtarget.isTargetCygMing()) {
+    Subtarget.setPICStyle(PICStyles::None);
+  } else if (Subtarget.isTargetDarwin()) {
+    if (Subtarget.is64Bit())
+      Subtarget.setPICStyle(PICStyles::RIPRel);
+    else if (getRelocationModel() == Reloc::PIC_)
+      Subtarget.setPICStyle(PICStyles::StubPIC);
+    else {
+      assert(getRelocationModel() == Reloc::DynamicNoPIC);
+      Subtarget.setPICStyle(PICStyles::StubDynamicNoPIC);
+    }
+  } else if (Subtarget.isTargetELF()) {
+    if (Subtarget.is64Bit())
+      Subtarget.setPICStyle(PICStyles::RIPRel);
+    else
+      Subtarget.setPICStyle(PICStyles::GOT);
+  }
+      
+  // Finally, if we have "none" as our PIC style, force to static mode.
+  if (Subtarget.getPICStyle() == PICStyles::None)
+    setRelocationModel(Reloc::Static);
+}
+
+//===----------------------------------------------------------------------===//
+// Pass Pipeline Configuration
+//===----------------------------------------------------------------------===//
+
+bool X86TargetMachine::addInstSelector(PassManagerBase &PM,
+                                       CodeGenOpt::Level OptLevel) {
+  // Install an instruction selector.
+  PM.add(createX86ISelDag(*this, OptLevel));
+
+  // Install a pass to insert x87 FP_REG_KILL instructions, as needed.
+  PM.add(createX87FPRegKillInserterPass());
+
+  return false;
+}
+
+bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM,
+                                      CodeGenOpt::Level OptLevel) {
+  return false;  // -print-machineinstr shouldn't print after this.
+}
+
+bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM,
+                                       CodeGenOpt::Level OptLevel) {
+  PM.add(createX86FloatingPointStackifierPass());
+  return true;  // -print-machineinstr should print after this.
+}
+
+bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM,
+                                      CodeGenOpt::Level OptLevel,
+                                      JITCodeEmitter &JCE) {
+  // FIXME: Move this to TargetJITInfo!
+  // On Darwin, do not override 64-bit setting made in X86TargetMachine().
+  if (DefRelocModel == Reloc::Default && 
+      (!Subtarget.isTargetDarwin() || !Subtarget.is64Bit())) {
+    setRelocationModel(Reloc::Static);
+    Subtarget.setPICStyle(PICStyles::None);
+  }
+  
+
+  PM.add(createX86JITCodeEmitterPass(*this, JCE));
+
+  return false;
+}
+
+void X86TargetMachine::setCodeModelForStatic() {
+
+    if (getCodeModel() != CodeModel::Default) return;
+
+    // For static codegen, if we're not already set, use Small codegen.
+    setCodeModel(CodeModel::Small);
+}
+
+
+void X86TargetMachine::setCodeModelForJIT() {
+
+  if (getCodeModel() != CodeModel::Default) return;
+
+  // 64-bit JIT places everything in the same buffer except external functions.
+  if (Subtarget.is64Bit())
+    setCodeModel(CodeModel::Large);
+  else
+    setCodeModel(CodeModel::Small);
+}
+
+/// getLSDAEncoding - Returns the LSDA pointer encoding. The choices are 4-byte,
+/// 8-byte, and target default. The CIE is hard-coded to indicate that the LSDA
+/// pointer in the FDE section is an "sdata4", and should be encoded as a 4-byte
+/// pointer by default. However, some systems may require a different size due
+/// to bugs or other conditions. We will default to a 4-byte encoding unless the
+/// system tells us otherwise.
+///
+/// The issue is when the CIE says there is an LSDA. That mandates that every
+/// FDE have an LSDA slot. But if the function does not need an LSDA, there
+/// needs to be some way to signify there is none. The LSDA is encoded as
+/// pc-rel. But you don't look for some magic value after adding the pc. You
+/// have to look for a zero before adding the pc. The problem is that the size
+/// of the zero to look for depends on the encoding. The unwinder bug in SL is
+/// that it always checks for a pointer-size zero. So on x86_64 it looks for 8
+/// bytes of zero. If you have an LSDA, it works fine since the 8-bytes are
+/// non-zero so it goes ahead and then reads the value based on the encoding.
+/// But if you use sdata4 and there is no LSDA, then the test for zero gives a
+/// false negative and the unwinder thinks there is an LSDA.
+///
+/// FIXME: This call-back isn't good! We should be using the correct encoding
+/// regardless of the system. However, there are some systems which have bugs
+/// that prevent this from occurring.
+DwarfLSDAEncoding::Encoding X86TargetMachine::getLSDAEncoding() const {
+  if (Subtarget.isTargetDarwin() && Subtarget.getDarwinVers() != 10)
+    return DwarfLSDAEncoding::Default;
+
+  return DwarfLSDAEncoding::EightByte;
+}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
new file mode 100644
index 0000000..eee29be
--- /dev/null
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -0,0 +1,103 @@
+//===-- X86TargetMachine.h - Define TargetMachine for the X86 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the X86 specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef X86TARGETMACHINE_H
+#define X86TARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "X86.h"
+#include "X86ELFWriterInfo.h"
+#include "X86InstrInfo.h"
+#include "X86JITInfo.h"
+#include "X86Subtarget.h"
+#include "X86ISelLowering.h"
+
+namespace llvm {
+  
+class formatted_raw_ostream;
+
+class X86TargetMachine : public LLVMTargetMachine {
+  X86Subtarget      Subtarget;
+  const TargetData  DataLayout; // Calculates type size & alignment
+  TargetFrameInfo   FrameInfo;
+  X86InstrInfo      InstrInfo;
+  X86JITInfo        JITInfo;
+  X86TargetLowering TLInfo;
+  X86ELFWriterInfo  ELFWriterInfo;
+  Reloc::Model      DefRelocModel; // Reloc model before it's overridden.
+
+private:
+  // We have specific defaults for X86.
+  virtual void setCodeModelForJIT();
+  virtual void setCodeModelForStatic();
+  
+public:
+  X86TargetMachine(const Target &T, const std::string &TT, 
+                   const std::string &FS, bool is64Bit);
+
+  virtual const X86InstrInfo     *getInstrInfo() const { return &InstrInfo; }
+  virtual const TargetFrameInfo  *getFrameInfo() const { return &FrameInfo; }
+  virtual       X86JITInfo       *getJITInfo()         { return &JITInfo; }
+  virtual const X86Subtarget     *getSubtargetImpl() const{ return &Subtarget; }
+  virtual       X86TargetLowering *getTargetLowering() const { 
+    return const_cast<X86TargetLowering*>(&TLInfo); 
+  }
+  virtual const X86RegisterInfo  *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+  virtual const X86ELFWriterInfo *getELFWriterInfo() const {
+    return Subtarget.isTargetELF() ? &ELFWriterInfo : 0;
+  }
+
+  /// getLSDAEncoding - Returns the LSDA pointer encoding. The choices are
+  /// 4-byte, 8-byte, and target default. The CIE is hard-coded to indicate that
+  /// the LSDA pointer in the FDE section is an "sdata4", and should be encoded
+  /// as a 4-byte pointer by default. However, some systems may require a
+  /// different size due to bugs or other conditions. We will default to a
+  /// 4-byte encoding unless the system tells us otherwise.
+  ///
+  /// FIXME: This call-back isn't good! We should be using the correct encoding
+  /// regardless of the system. However, there are some systems which have bugs
+  /// that prevent this from occurring.
+  virtual DwarfLSDAEncoding::Encoding getLSDAEncoding() const;
+
+  // Set up the pass pipeline.
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+  virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel,
+                              JITCodeEmitter &JCE);
+};
+
+/// X86_32TargetMachine - X86 32-bit target machine.
+///
+class X86_32TargetMachine : public X86TargetMachine {
+public:
+  X86_32TargetMachine(const Target &T, const std::string &M,
+                      const std::string &FS);
+};
+
+/// X86_64TargetMachine - X86 64-bit target machine.
+///
+class X86_64TargetMachine : public X86TargetMachine {
+public:
+  X86_64TargetMachine(const Target &T, const std::string &TT,
+                      const std::string &FS);
+};
+
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
new file mode 100644
index 0000000..b8cef7d
--- /dev/null
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -0,0 +1,67 @@
+//===-- llvm/Target/X86/X86TargetObjectFile.cpp - X86 Object Info ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86TargetObjectFile.h"
+#include "X86MCTargetExpr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Target/Mangler.h"
+#include "llvm/ADT/SmallString.h"
+using namespace llvm;
+
+const MCExpr *X8632_MachoTargetObjectFile::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                                 MachineModuleInfo *MMI,
+                                 bool &IsIndirect, bool &IsPCRel) const {
+  // The mach-o version of this method defaults to returning a stub reference.
+  IsIndirect = true;
+  IsPCRel    = false;
+  
+  
+  MachineModuleInfoMachO &MachOMMI =
+  MMI->getObjFileInfo<MachineModuleInfoMachO>();
+  
+  // FIXME: Use GetSymbolWithGlobalValueBase.
+  SmallString<128> Name;
+  Mang->getNameWithPrefix(Name, GV, true);
+  Name += "$non_lazy_ptr";
+  
+  // Add information about the stub reference to MachOMMI so that the stub gets
+  // emitted by the asmprinter.
+  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name.str());
+  MCSymbol *&StubSym = MachOMMI.getGVStubEntry(Sym);
+  if (StubSym == 0) {
+    Name.clear();
+    Mang->getNameWithPrefix(Name, GV, false);
+    StubSym = getContext().GetOrCreateSymbol(Name.str());
+  }
+  
+  return MCSymbolRefExpr::Create(Sym, getContext());
+}
+
+const MCExpr *X8664_MachoTargetObjectFile::
+getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                                 MachineModuleInfo *MMI,
+                                 bool &IsIndirect, bool &IsPCRel) const {
+  
+  // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
+  // is an indirect pc-relative reference.
+  IsIndirect = true;
+  IsPCRel    = true;
+  
+  // FIXME: Use GetSymbolWithGlobalValueBase.
+  SmallString<128> Name;
+  Mang->getNameWithPrefix(Name, GV, false);
+  const MCSymbol *Sym = getContext().CreateSymbol(Name);
+  const MCExpr *Res =
+    X86MCTargetExpr::Create(Sym, X86MCTargetExpr::GOTPCREL, getContext());
+  const MCExpr *Four = MCConstantExpr::Create(4, getContext());
+  return MCBinaryExpr::CreateAdd(Res, Four, getContext());
+}
+
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
new file mode 100644
index 0000000..377a93b
--- /dev/null
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -0,0 +1,40 @@
+//===-- llvm/Target/X86/X86TargetObjectFile.h - X86 Object Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_X86_TARGETOBJECTFILE_H
+#define LLVM_TARGET_X86_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+  
+  /// X8632_MachoTargetObjectFile - This TLOF implementation is used for
+  /// Darwin/x86-32.
+  class X8632_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+  public:
+    
+    virtual const MCExpr *
+    getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                                     MachineModuleInfo *MMI,
+                                     bool &IsIndirect, bool &IsPCRel) const;
+  };
+  
+  /// X8664_MachoTargetObjectFile - This TLOF implementation is used for
+  /// Darwin/x86-64.
+  class X8664_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+  public:
+
+    virtual const MCExpr *
+    getSymbolForDwarfGlobalReference(const GlobalValue *GV, Mangler *Mang,
+                                     MachineModuleInfo *MMI,
+                                     bool &IsIndirect, bool &IsPCRel) const;
+  };
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/AsmPrinter/CMakeLists.txt b/lib/Target/XCore/AsmPrinter/CMakeLists.txt
new file mode 100644
index 0000000..7c7c2f4
--- /dev/null
+++ b/lib/Target/XCore/AsmPrinter/CMakeLists.txt
@@ -0,0 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMXCoreAsmPrinter
+  XCoreAsmPrinter.cpp
+  )
+add_dependencies(LLVMXCoreAsmPrinter XCoreCodeGenTable_gen)
diff --git a/lib/Target/XCore/AsmPrinter/Makefile b/lib/Target/XCore/AsmPrinter/Makefile
new file mode 100644
index 0000000..82dc1df
--- /dev/null
+++ b/lib/Target/XCore/AsmPrinter/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/XCore/AsmPrinter/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMXCoreAsmPrinter
+
+# Hack: we need to include 'main' XCore target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp
new file mode 100644
index 0000000..d18f55d
--- /dev/null
+++ b/lib/Target/XCore/AsmPrinter/XCoreAsmPrinter.cpp
@@ -0,0 +1,319 @@
+//===-- XCoreAsmPrinter.cpp - XCore LLVM assembly writer ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a printer that converts from our internal representation
+// of machine-dependent LLVM code to the XAS-format XCore assembly language.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "asm-printer"
+#include "XCore.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreMCAsmInfo.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DwarfWriter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegistry.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cctype>
+using namespace llvm;
+
+static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional,
+  cl::desc("Maximum number of threads (for emulation thread-local storage)"),
+  cl::Hidden,
+  cl::value_desc("number"),
+  cl::init(8));
+
+namespace {
+  class XCoreAsmPrinter : public AsmPrinter {
+    const XCoreSubtarget &Subtarget;
+  public:
+    explicit XCoreAsmPrinter(formatted_raw_ostream &O, TargetMachine &TM,
+                             MCContext &Ctx, MCStreamer &Streamer,
+                             const MCAsmInfo *T)
+      : AsmPrinter(O, TM, Ctx, Streamer, T),
+      Subtarget(TM.getSubtarget<XCoreSubtarget>()) {}
+
+    virtual const char *getPassName() const {
+      return "XCore Assembly Printer";
+    }
+
+    void printMemOperand(const MachineInstr *MI, int opNum);
+    void printOperand(const MachineInstr *MI, int opNum);
+    bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                        unsigned AsmVariant, const char *ExtraCode);
+
+    void emitGlobalDirective(const MCSymbol *Sym);
+    
+    void emitArrayBound(const MCSymbol *Sym, const GlobalVariable *GV);
+    virtual void EmitGlobalVariable(const GlobalVariable *GV);
+
+    void emitFunctionStart(MachineFunction &MF);
+
+    void printInstruction(const MachineInstr *MI);  // autogenerated.
+    static const char *getRegisterName(unsigned RegNo);
+
+    bool runOnMachineFunction(MachineFunction &MF);
+    void EmitInstruction(const MachineInstr *MI);
+    void EmitFunctionBodyEnd();
+    
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AsmPrinter::getAnalysisUsage(AU);
+      AU.setPreservesAll();
+      AU.addRequired<MachineModuleInfo>();
+      AU.addRequired<DwarfWriter>();
+    }
+  };
+} // end of anonymous namespace
+
+#include "XCoreGenAsmWriter.inc"
+
+void XCoreAsmPrinter::emitGlobalDirective(const MCSymbol *Sym) {
+  O << MAI->getGlobalDirective() << *Sym << "\n";
+}
+
+void XCoreAsmPrinter::emitArrayBound(const MCSymbol *Sym,
+                                     const GlobalVariable *GV) {
+  assert(((GV->hasExternalLinkage() ||
+    GV->hasWeakLinkage()) ||
+    GV->hasLinkOnceLinkage()) && "Unexpected linkage");
+  if (const ArrayType *ATy = dyn_cast<ArrayType>(
+    cast<PointerType>(GV->getType())->getElementType())) {
+    O << MAI->getGlobalDirective() << *Sym;
+    O << ".globound" << "\n";
+    O << "\t.set\t" << *Sym;
+    O << ".globound" << "," << ATy->getNumElements() << "\n";
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage()) {
+      // TODO Use COMDAT groups for LinkOnceLinkage
+      O << MAI->getWeakDefDirective() << *Sym << ".globound" << "\n";
+    }
+  }
+}
+
+void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+  // Check to see if this is a special global used by LLVM, if so, emit it.
+  if (!GV->hasInitializer() ||
+      EmitSpecialLLVMGlobal(GV))
+    return;
+
+  const TargetData *TD = TM.getTargetData();
+  OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(GV, Mang,TM));
+
+  
+  MCSymbol *GVSym = GetGlobalValueSymbol(GV);
+  Constant *C = GV->getInitializer();
+  unsigned Align = (unsigned)TD->getPreferredTypeAlignmentShift(C->getType());
+  
+  // Mark the start of the global
+  O << "\t.cc_top " << *GVSym << ".data," << *GVSym << "\n";
+
+  switch (GV->getLinkage()) {
+  case GlobalValue::AppendingLinkage:
+    llvm_report_error("AppendingLinkage is not supported by this target!");
+  case GlobalValue::LinkOnceAnyLinkage:
+  case GlobalValue::LinkOnceODRLinkage:
+  case GlobalValue::WeakAnyLinkage:
+  case GlobalValue::WeakODRLinkage:
+  case GlobalValue::ExternalLinkage:
+    emitArrayBound(GVSym, GV);
+    emitGlobalDirective(GVSym);
+    // TODO Use COMDAT groups for LinkOnceLinkage
+    if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage())
+      O << MAI->getWeakDefDirective() << *GVSym << "\n";
+    // FALL THROUGH
+  case GlobalValue::InternalLinkage:
+  case GlobalValue::PrivateLinkage:
+  case GlobalValue::LinkerPrivateLinkage:
+    break;
+  case GlobalValue::DLLImportLinkage:
+    llvm_unreachable("DLLImport linkage is not supported by this target!");
+  case GlobalValue::DLLExportLinkage:
+    llvm_unreachable("DLLExport linkage is not supported by this target!");
+  default:
+    llvm_unreachable("Unknown linkage type!");
+  }
+
+  EmitAlignment(Align, GV, 2);
+  
+  unsigned Size = TD->getTypeAllocSize(C->getType());
+  if (GV->isThreadLocal()) {
+    Size *= MaxThreads;
+  }
+  if (MAI->hasDotTypeDotSizeDirective()) {
+    O << "\t.type " << *GVSym << ",@object\n";
+    O << "\t.size " << *GVSym << "," << Size << "\n";
+  }
+  O << *GVSym << ":\n";
+  
+  EmitGlobalConstant(C);
+  if (GV->isThreadLocal()) {
+    for (unsigned i = 1; i < MaxThreads; ++i)
+      EmitGlobalConstant(C);
+  }
+  // The ABI requires that unsigned scalar types smaller than 32 bits
+  // are padded to 32 bits.
+  if (Size < 4)
+    OutStreamer.EmitZeros(4 - Size, 0);
+  
+  // Mark the end of the global
+  O << "\t.cc_bottom " << *GVSym << ".data\n";
+}
+
+/// Emit the directives on the start of functions
+void XCoreAsmPrinter::emitFunctionStart(MachineFunction &MF) {
+  // Print out the label for the function.
+  const Function *F = MF.getFunction();
+
+  OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F, Mang, TM));
+  
+  // Mark the start of the function
+  O << "\t.cc_top " << *CurrentFnSym << ".function," << *CurrentFnSym << "\n";
+
+  switch (F->getLinkage()) {
+  default: llvm_unreachable("Unknown linkage type!");
+  case Function::InternalLinkage:  // Symbols default to internal.
+  case Function::PrivateLinkage:
+  case Function::LinkerPrivateLinkage:
+    break;
+  case Function::ExternalLinkage:
+    emitGlobalDirective(CurrentFnSym);
+    break;
+  case Function::LinkOnceAnyLinkage:
+  case Function::LinkOnceODRLinkage:
+  case Function::WeakAnyLinkage:
+  case Function::WeakODRLinkage:
+    // TODO Use COMDAT groups for LinkOnceLinkage
+    O << MAI->getGlobalDirective() << *CurrentFnSym << "\n";
+    O << MAI->getWeakDefDirective() << *CurrentFnSym << "\n";
+    break;
+  }
+  // (1 << 1) byte aligned
+  EmitAlignment(MF.getAlignment(), F, 1);
+  if (MAI->hasDotTypeDotSizeDirective())
+    O << "\t.type " << *CurrentFnSym << ",@function\n";
+
+  O << *CurrentFnSym << ":\n";
+}
+
+
+/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
+/// the last basic block in the function.
+void XCoreAsmPrinter::EmitFunctionBodyEnd() {
+  // Emit function end directives
+  O << "\t.cc_bottom " << *CurrentFnSym << ".function\n";
+}
+
+/// runOnMachineFunction - This uses the printMachineInstruction()
+/// method to print assembly for each instruction.
+///
+bool XCoreAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+  SetupMachineFunction(MF);
+
+  // Print out constants referenced by the function
+  EmitConstantPool();
+
+  // Emit the function start directives
+  emitFunctionStart(MF);
+  
+  // Emit pre-function debug information.
+  DW->BeginFunction(&MF);
+
+  EmitFunctionBody();
+  return false;
+}
+
+void XCoreAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum)
+{
+  printOperand(MI, opNum);
+  
+  if (MI->getOperand(opNum+1).isImm()
+    && MI->getOperand(opNum+1).getImm() == 0)
+    return;
+  
+  O << "+";
+  printOperand(MI, opNum+1);
+}
+
+void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum) {
+  const MachineOperand &MO = MI->getOperand(opNum);
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+    O << getRegisterName(MO.getReg());
+    break;
+  case MachineOperand::MO_Immediate:
+    O << MO.getImm();
+    break;
+  case MachineOperand::MO_MachineBasicBlock:
+    O << *MO.getMBB()->getSymbol(OutContext);
+    break;
+  case MachineOperand::MO_GlobalAddress:
+    O << *GetGlobalValueSymbol(MO.getGlobal());
+    break;
+  case MachineOperand::MO_ExternalSymbol:
+    O << MO.getSymbolName();
+    break;
+  case MachineOperand::MO_ConstantPoolIndex:
+    O << MAI->getPrivateGlobalPrefix() << "CPI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    break;
+  case MachineOperand::MO_JumpTableIndex:
+    O << MAI->getPrivateGlobalPrefix() << "JTI" << getFunctionNumber()
+      << '_' << MO.getIndex();
+    break;
+  case MachineOperand::MO_BlockAddress:
+    O << *GetBlockAddressSymbol(MO.getBlockAddress());
+    break;
+  default:
+    llvm_unreachable("not implemented");
+  }
+}
+
+/// PrintAsmOperand - Print out an operand for an inline asm expression.
+///
+bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                                      unsigned AsmVariant, 
+                                      const char *ExtraCode) {
+  printOperand(MI, OpNo);
+  return false;
+}
+
+void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  // Check for mov mnemonic
+  unsigned src, dst, srcSR, dstSR;
+  if (TM.getInstrInfo()->isMoveInstr(*MI, src, dst, srcSR, dstSR)) {
+    O << "\tmov " << getRegisterName(dst) << ", ";
+    O << getRegisterName(src);
+    OutStreamer.AddBlankLine();
+    return;
+  }
+  printInstruction(MI);
+  OutStreamer.AddBlankLine();
+}
+
+// Force static initialization.
+extern "C" void LLVMInitializeXCoreAsmPrinter() { 
+  RegisterAsmPrinter<XCoreAsmPrinter> X(TheXCoreTarget);
+}
diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt
new file mode 100644
index 0000000..0965323
--- /dev/null
+++ b/lib/Target/XCore/CMakeLists.txt
@@ -0,0 +1,24 @@
+set(LLVM_TARGET_DEFINITIONS XCore.td)
+
+tablegen(XCoreGenRegisterInfo.h.inc -gen-register-desc-header)
+tablegen(XCoreGenRegisterNames.inc -gen-register-enums)
+tablegen(XCoreGenRegisterInfo.inc -gen-register-desc)
+tablegen(XCoreGenInstrNames.inc -gen-instr-enums)
+tablegen(XCoreGenInstrInfo.inc -gen-instr-desc)
+tablegen(XCoreGenAsmWriter.inc -gen-asm-writer)
+tablegen(XCoreGenDAGISel.inc -gen-dag-isel)
+tablegen(XCoreGenCallingConv.inc -gen-callingconv)
+tablegen(XCoreGenSubtarget.inc -gen-subtarget)
+
+add_llvm_target(XCore
+  MCSectionXCore.cpp
+  XCoreFrameInfo.cpp
+  XCoreInstrInfo.cpp
+  XCoreISelDAGToDAG.cpp
+  XCoreISelLowering.cpp
+  XCoreMCAsmInfo.cpp
+  XCoreRegisterInfo.cpp
+  XCoreSubtarget.cpp
+  XCoreTargetMachine.cpp
+  XCoreTargetObjectFile.cpp
+  )
diff --git a/lib/Target/XCore/MCSectionXCore.cpp b/lib/Target/XCore/MCSectionXCore.cpp
new file mode 100644
index 0000000..5acceaf
--- /dev/null
+++ b/lib/Target/XCore/MCSectionXCore.cpp
@@ -0,0 +1,35 @@
+//===- MCSectionXCore.cpp - XCore-specific section representation ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MCSectionXCore class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCSectionXCore.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+MCSectionXCore *
+MCSectionXCore::Create(const StringRef &Section, unsigned Type,
+                       unsigned Flags, SectionKind K,
+                       bool isExplicit, MCContext &Ctx) {
+  return new (Ctx) MCSectionXCore(Section, Type, Flags, K, isExplicit);
+}
+
+
+/// PrintTargetSpecificSectionFlags - This handles the XCore-specific cp/dp
+/// section flags.
+void MCSectionXCore::PrintTargetSpecificSectionFlags(const MCAsmInfo &MAI,
+                                                     raw_ostream &OS) const {
+  if (getFlags() & MCSectionXCore::SHF_CP_SECTION)
+    OS << 'c';
+  if (getFlags() & MCSectionXCore::SHF_DP_SECTION)
+    OS << 'd';
+}
diff --git a/lib/Target/XCore/MCSectionXCore.h b/lib/Target/XCore/MCSectionXCore.h
new file mode 100644
index 0000000..02f8f95
--- /dev/null
+++ b/lib/Target/XCore/MCSectionXCore.h
@@ -0,0 +1,54 @@
+//===- MCSectionXCore.h - XCore-specific section representation -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MCSectionXCore class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MCSECTION_XCORE_H
+#define LLVM_MCSECTION_XCORE_H
+
+#include "llvm/MC/MCSectionELF.h"
+
+namespace llvm {
+  
+class MCSectionXCore : public MCSectionELF {
+  MCSectionXCore(const StringRef &Section, unsigned Type, unsigned Flags,
+                 SectionKind K, bool isExplicit)
+    : MCSectionELF(Section, Type, Flags, K, isExplicit) {}
+  
+public:
+  
+  enum {
+    /// SHF_CP_SECTION - All sections with the "c" flag are grouped together
+    /// by the linker to form the constant pool and the cp register is set to
+    /// the start of the constant pool by the boot code.
+    SHF_CP_SECTION = FIRST_TARGET_DEP_FLAG,
+    
+    /// SHF_DP_SECTION - All sections with the "d" flag are grouped together
+    /// by the linker to form the data section and the dp register is set to
+    /// the start of the section by the boot code.
+    SHF_DP_SECTION = FIRST_TARGET_DEP_FLAG << 1
+  };
+  
+  static MCSectionXCore *Create(const StringRef &Section, unsigned Type,
+                                unsigned Flags, SectionKind K,
+                                bool isExplicit, MCContext &Ctx);
+  
+  
+  /// PrintTargetSpecificSectionFlags - This handles the XCore-specific cp/dp
+  /// section flags.
+  virtual void PrintTargetSpecificSectionFlags(const MCAsmInfo &MAI,
+                                               raw_ostream &OS) const;
+
+};
+  
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/Makefile b/lib/Target/XCore/Makefile
new file mode 100644
index 0000000..1b70974
--- /dev/null
+++ b/lib/Target/XCore/Makefile
@@ -0,0 +1,24 @@
+##===- lib/Target/XCore/Makefile ---------------------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMXCoreCodeGen
+TARGET = XCore
+
+# Make sure that tblgen is run, first thing.
+BUILT_SOURCES = XCoreGenRegisterInfo.h.inc XCoreGenRegisterNames.inc \
+                XCoreGenRegisterInfo.inc XCoreGenInstrNames.inc \
+                XCoreGenInstrInfo.inc XCoreGenAsmWriter.inc \
+                XCoreGenDAGISel.inc XCoreGenCallingConv.inc \
+		XCoreGenSubtarget.inc
+
+DIRS = AsmPrinter TargetInfo
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Target/XCore/README.txt b/lib/Target/XCore/README.txt
new file mode 100644
index 0000000..deaeb0f
--- /dev/null
+++ b/lib/Target/XCore/README.txt
@@ -0,0 +1,8 @@
+To-do
+-----
+
+* Instruction encodings
+* Tailcalls
+* Investigate loop alignment
+* Add builtins
+* Make better use of lmul / macc
diff --git a/lib/Target/XCore/TargetInfo/CMakeLists.txt b/lib/Target/XCore/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000..0a568de
--- /dev/null
+++ b/lib/Target/XCore/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMXCoreInfo
+  XCoreTargetInfo.cpp
+  )
+
+add_dependencies(LLVMXCoreInfo XCoreTable_gen)
diff --git a/lib/Target/XCore/TargetInfo/Makefile b/lib/Target/XCore/TargetInfo/Makefile
new file mode 100644
index 0000000..f8a4095
--- /dev/null
+++ b/lib/Target/XCore/TargetInfo/Makefile
@@ -0,0 +1,16 @@
+##===- lib/Target/XCore/TargetInfo/Makefile ----------------*- Makefile -*-===##
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../../..
+LIBRARYNAME = LLVMXCoreInfo
+
+# Hack: we need to include 'main' target directory to grab private headers
+CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
new file mode 100644
index 0000000..7aa8965
--- /dev/null
+++ b/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp
@@ -0,0 +1,19 @@
+//===-- XCoreTargetInfo.cpp - XCore Target Implementation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "llvm/Module.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+Target llvm::TheXCoreTarget;
+
+extern "C" void LLVMInitializeXCoreTargetInfo() { 
+  RegisterTarget<Triple::xcore> X(TheXCoreTarget, "xcore", "XCore");
+}
diff --git a/lib/Target/XCore/XCore.h b/lib/Target/XCore/XCore.h
new file mode 100644
index 0000000..8937fbe
--- /dev/null
+++ b/lib/Target/XCore/XCore.h
@@ -0,0 +1,41 @@
+//===-- XCore.h - Top-level interface for XCore representation --*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the entry points for global functions defined in the LLVM
+// XCore back-end.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TARGET_XCORE_H
+#define TARGET_XCORE_H
+
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class FunctionPass;
+  class TargetMachine;
+  class XCoreTargetMachine;
+  class formatted_raw_ostream;
+
+  FunctionPass *createXCoreISelDag(XCoreTargetMachine &TM);
+
+  extern Target TheXCoreTarget;
+
+} // end namespace llvm
+
+// Defines symbolic names for XCore registers.  This defines a mapping from
+// register name to register number.
+//
+#include "XCoreGenRegisterNames.inc"
+
+// Defines symbolic names for the XCore instructions.
+//
+#include "XCoreGenInstrNames.inc"
+
+#endif
diff --git a/lib/Target/XCore/XCore.td b/lib/Target/XCore/XCore.td
new file mode 100644
index 0000000..b07445d
--- /dev/null
+++ b/lib/Target/XCore/XCore.td
@@ -0,0 +1,49 @@
+//===- XCore.td - Describe the XCore Target Machine --------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Target-independent interfaces which we are implementing
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/Target.td"
+
+//===----------------------------------------------------------------------===//
+// Descriptions
+//===----------------------------------------------------------------------===//
+
+include "XCoreRegisterInfo.td"
+include "XCoreInstrInfo.td"
+include "XCoreCallingConv.td"
+
+def XCoreInstrInfo : InstrInfo {
+  let TSFlagsFields = [];
+  let TSFlagsShifts = [];
+}
+
+//===----------------------------------------------------------------------===//
+// XCore processors supported.
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, list<SubtargetFeature> Features>
+ : Processor<Name, NoItineraries, Features>;
+
+def : Proc<"generic",      []>;
+def : Proc<"xs1b-generic", []>;
+
+//===----------------------------------------------------------------------===//
+// Declare the target which we are implementing
+//===----------------------------------------------------------------------===//
+
+def XCore : Target {
+  // Pull in Instruction Info:
+  let InstructionSet = XCoreInstrInfo;
+}
diff --git a/lib/Target/XCore/XCoreCallingConv.td b/lib/Target/XCore/XCoreCallingConv.td
new file mode 100644
index 0000000..8107e32
--- /dev/null
+++ b/lib/Target/XCore/XCoreCallingConv.td
@@ -0,0 +1,33 @@
+//===- XCoreCallingConv.td - Calling Conventions for XCore -*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This describes the calling conventions for XCore architecture.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// XCore Return Value Calling Convention
+//===----------------------------------------------------------------------===//
+def RetCC_XCore : CallingConv<[
+  // i32 are returned in registers R0, R1, R2, R3
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>
+]>;
+
+//===----------------------------------------------------------------------===//
+// XCore Argument Calling Conventions
+//===----------------------------------------------------------------------===//
+def CC_XCore : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[R0, R1, R2, R3]>>,
+
+  // Integer values get stored in stack slots that are 4 bytes in
+  // size and 4-byte aligned.
+  CCIfType<[i32], CCAssignToStack<4, 4>>
+]>;
diff --git a/lib/Target/XCore/XCoreFrameInfo.cpp b/lib/Target/XCore/XCoreFrameInfo.cpp
new file mode 100644
index 0000000..f50dc96
--- /dev/null
+++ b/lib/Target/XCore/XCoreFrameInfo.cpp
@@ -0,0 +1,27 @@
+//===-- XCoreFrameInfo.cpp - Frame info for XCore Target ---------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreFrameInfo.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// XCoreFrameInfo:
+//===----------------------------------------------------------------------===//
+
+XCoreFrameInfo::XCoreFrameInfo(const TargetMachine &tm):
+  TargetFrameInfo(TargetFrameInfo::StackGrowsDown, 4, 0)
+{
+  // Do nothing
+}
diff --git a/lib/Target/XCore/XCoreFrameInfo.h b/lib/Target/XCore/XCoreFrameInfo.h
new file mode 100644
index 0000000..2c67577
--- /dev/null
+++ b/lib/Target/XCore/XCoreFrameInfo.h
@@ -0,0 +1,34 @@
+//===-- XCoreFrameInfo.h - Frame info for XCore Target -----------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains XCore frame information that doesn't fit anywhere else
+// cleanly...
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREFRAMEINFO_H
+#define XCOREFRAMEINFO_H
+
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+  class XCoreFrameInfo: public TargetFrameInfo {
+
+  public:
+    XCoreFrameInfo(const TargetMachine &tm);
+
+    //! Stack slot size (4 bytes)
+    static int stackSlotSize() {
+      return 4;
+    }
+  };
+}
+
+#endif // XCOREFRAMEINFO_H
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
new file mode 100644
index 0000000..383fd91
--- /dev/null
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -0,0 +1,222 @@
+//===-- XCoreISelDAGToDAG.cpp - A dag to dag inst selector for XCore ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the XCore target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCore.h"
+#include "XCoreISelLowering.h"
+#include "XCoreTargetMachine.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/Constants.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+/// XCoreDAGToDAGISel - XCore specific code to select XCore machine
+/// instructions for SelectionDAG operations.
+///
+namespace {
+  class XCoreDAGToDAGISel : public SelectionDAGISel {
+    XCoreTargetLowering &Lowering;
+    const XCoreSubtarget &Subtarget;
+
+  public:
+    XCoreDAGToDAGISel(XCoreTargetMachine &TM)
+      : SelectionDAGISel(TM),
+        Lowering(*TM.getTargetLowering()), 
+        Subtarget(*TM.getSubtargetImpl()) { }
+
+    SDNode *Select(SDNode *N);
+    
+    /// getI32Imm - Return a target constant with the specified value, of type
+    /// i32.
+    inline SDValue getI32Imm(unsigned Imm) {
+      return CurDAG->getTargetConstant(Imm, MVT::i32);
+    }
+
+    // Complex Pattern Selectors.
+    bool SelectADDRspii(SDNode *Op, SDValue Addr, SDValue &Base,
+                        SDValue &Offset);
+    bool SelectADDRdpii(SDNode *Op, SDValue Addr, SDValue &Base,
+                        SDValue &Offset);
+    bool SelectADDRcpii(SDNode *Op, SDValue Addr, SDValue &Base,
+                        SDValue &Offset);
+    
+    virtual void InstructionSelect();
+
+    virtual const char *getPassName() const {
+      return "XCore DAG->DAG Pattern Instruction Selection";
+    } 
+    
+    // Include the pieces autogenerated from the target description.
+  #include "XCoreGenDAGISel.inc"
+  };
+}  // end anonymous namespace
+
+/// createXCoreISelDag - This pass converts a legalized DAG into a 
+/// XCore-specific DAG, ready for instruction scheduling.
+///
+FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM) {
+  return new XCoreDAGToDAGISel(TM);
+}
+
+bool XCoreDAGToDAGISel::SelectADDRspii(SDNode *Op, SDValue Addr,
+                                  SDValue &Base, SDValue &Offset) {
+  FrameIndexSDNode *FIN = 0;
+  if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
+    Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = 0;
+    if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
+      // Constant positive word offset from frame index
+      Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool XCoreDAGToDAGISel::SelectADDRdpii(SDNode *Op, SDValue Addr,
+                                  SDValue &Base, SDValue &Offset) {
+  if (Addr.getOpcode() == XCoreISD::DPRelativeWrapper) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = 0;
+    if ((Addr.getOperand(0).getOpcode() == XCoreISD::DPRelativeWrapper)
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0)) {
+      // Constant word offset from an object in the data region
+      Base = Addr.getOperand(0).getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+bool XCoreDAGToDAGISel::SelectADDRcpii(SDNode *Op, SDValue Addr,
+                                  SDValue &Base, SDValue &Offset) {
+  if (Addr.getOpcode() == XCoreISD::CPRelativeWrapper) {
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(0, MVT::i32);
+    return true;
+  }
+  if (Addr.getOpcode() == ISD::ADD) {
+    ConstantSDNode *CN = 0;
+    if ((Addr.getOperand(0).getOpcode() == XCoreISD::CPRelativeWrapper)
+      && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && (CN->getSExtValue() % 4 == 0)) {
+      // Constant word offset from an object in the constant pool
+      Base = Addr.getOperand(0).getOperand(0);
+      Offset = CurDAG->getTargetConstant(CN->getSExtValue(), MVT::i32);
+      return true;
+    }
+  }
+  return false;
+}
+
+/// InstructionSelect - This callback is invoked by
+/// SelectionDAGISel when it has created a SelectionDAG for us to codegen.
+void XCoreDAGToDAGISel::InstructionSelect() {
+  // Select target instructions for the DAG.
+  SelectRoot(*CurDAG);
+  
+  CurDAG->RemoveDeadNodes();
+}
+
+SDNode *XCoreDAGToDAGISel::Select(SDNode *N) {
+  DebugLoc dl = N->getDebugLoc();
+  EVT NVT = N->getValueType(0);
+  if (NVT == MVT::i32) {
+    switch (N->getOpcode()) {
+      default: break;
+      case ISD::Constant: {
+        if (Predicate_immMskBitp(N)) {
+          SDValue MskSize = Transform_msksize_xform(N);
+          return CurDAG->getMachineNode(XCore::MKMSK_rus, dl,
+                                        MVT::i32, MskSize);
+        }
+        else if (! Predicate_immU16(N)) {
+          unsigned Val = cast<ConstantSDNode>(N)->getZExtValue();
+          SDValue CPIdx =
+            CurDAG->getTargetConstantPool(ConstantInt::get(
+                                  Type::getInt32Ty(*CurDAG->getContext()), Val),
+                                          TLI.getPointerTy());
+          return CurDAG->getMachineNode(XCore::LDWCP_lru6, dl, MVT::i32, 
+                                        MVT::Other, CPIdx, 
+                                        CurDAG->getEntryNode());
+        }
+        break;
+      }
+      case ISD::SMUL_LOHI: {
+        // FIXME fold addition into the macc instruction
+        SDValue Zero(CurDAG->getMachineNode(XCore::LDC_ru6, dl, MVT::i32,
+                                CurDAG->getTargetConstant(0, MVT::i32)), 0);
+        SDValue Ops[] = { Zero, Zero, N->getOperand(0), N->getOperand(1) };
+        SDNode *ResNode = CurDAG->getMachineNode(XCore::MACCS_l4r, dl,
+                                                 MVT::i32, MVT::i32, Ops, 4);
+        ReplaceUses(SDValue(N, 0), SDValue(ResNode, 1));
+        ReplaceUses(SDValue(N, 1), SDValue(ResNode, 0));
+        return NULL;
+      }
+      case ISD::UMUL_LOHI: {
+        // FIXME fold addition into the macc / lmul instruction
+        SDValue Zero(CurDAG->getMachineNode(XCore::LDC_ru6, dl, MVT::i32,
+                                  CurDAG->getTargetConstant(0, MVT::i32)), 0);
+        SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                            Zero, Zero };
+        SDNode *ResNode = CurDAG->getMachineNode(XCore::LMUL_l6r, dl, MVT::i32,
+                                                 MVT::i32, Ops, 4);
+        ReplaceUses(SDValue(N, 0), SDValue(ResNode, 1));
+        ReplaceUses(SDValue(N, 1), SDValue(ResNode, 0));
+        return NULL;
+      }
+      case XCoreISD::LADD: {
+        SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                            N->getOperand(2) };
+        return CurDAG->getMachineNode(XCore::LADD_l5r, dl, MVT::i32, MVT::i32,
+                                      Ops, 3);
+      }
+      case XCoreISD::LSUB: {
+        SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
+                            N->getOperand(2) };
+        return CurDAG->getMachineNode(XCore::LSUB_l5r, dl, MVT::i32, MVT::i32,
+                                      Ops, 3);
+      }
+      // Other cases are autogenerated.
+    }
+  }
+  return SelectCode(N);
+}
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
new file mode 100644
index 0000000..bf8c38f
--- /dev/null
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -0,0 +1,1198 @@
+//===-- XCoreISelLowering.cpp - XCore DAG Lowering Implementation   ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCoreTargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "xcore-lower"
+
+#include "XCoreISelLowering.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCore.h"
+#include "XCoreTargetObjectFile.h"
+#include "XCoreTargetMachine.h"
+#include "XCoreSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/CallingConv.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/GlobalAlias.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/VectorExtras.h"
+#include <queue>
+#include <set>
+using namespace llvm;
+
+const char *XCoreTargetLowering::
+getTargetNodeName(unsigned Opcode) const 
+{
+  switch (Opcode) 
+  {
+    case XCoreISD::BL                : return "XCoreISD::BL";
+    case XCoreISD::PCRelativeWrapper : return "XCoreISD::PCRelativeWrapper";
+    case XCoreISD::DPRelativeWrapper : return "XCoreISD::DPRelativeWrapper";
+    case XCoreISD::CPRelativeWrapper : return "XCoreISD::CPRelativeWrapper";
+    case XCoreISD::STWSP             : return "XCoreISD::STWSP";
+    case XCoreISD::RETSP             : return "XCoreISD::RETSP";
+    case XCoreISD::LADD              : return "XCoreISD::LADD";
+    case XCoreISD::LSUB              : return "XCoreISD::LSUB";
+    default                           : return NULL;
+  }
+}
+
+XCoreTargetLowering::XCoreTargetLowering(XCoreTargetMachine &XTM)
+  : TargetLowering(XTM, new XCoreTargetObjectFile()),
+    TM(XTM),
+    Subtarget(*XTM.getSubtargetImpl()) {
+
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, XCore::GRRegsRegisterClass);
+
+  // Compute derived properties from the register classes
+  computeRegisterProperties();
+
+  // Division is expensive
+  setIntDivIsCheap(false);
+
+  setShiftAmountType(MVT::i32);
+  setStackPointerRegisterToSaveRestore(XCore::SP);
+
+  setSchedulingPreference(SchedulingForRegPressure);
+
+  // Use i32 for setcc operations results (slt, sgt, ...).
+  setBooleanContents(ZeroOrOneBooleanContent);
+
+  // XCore does not have the NodeTypes below.
+  setOperationAction(ISD::BR_CC,     MVT::Other, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::i32,   Custom);
+  setOperationAction(ISD::ADDC, MVT::i32, Expand);
+  setOperationAction(ISD::ADDE, MVT::i32, Expand);
+  setOperationAction(ISD::SUBC, MVT::i32, Expand);
+  setOperationAction(ISD::SUBE, MVT::i32, Expand);
+
+  // Stop the combiner recombining select and set_cc
+  setOperationAction(ISD::SELECT_CC, MVT::Other, Expand);
+  
+  // 64bit
+  setOperationAction(ISD::ADD, MVT::i64, Custom);
+  setOperationAction(ISD::SUB, MVT::i64, Custom);
+  setOperationAction(ISD::MULHS, MVT::i32, Expand);
+  setOperationAction(ISD::MULHU, MVT::i32, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRA_PARTS, MVT::i32, Expand);
+  setOperationAction(ISD::SRL_PARTS, MVT::i32, Expand);
+  
+  // Bit Manipulation
+  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL , MVT::i32, Expand);
+  setOperationAction(ISD::ROTR , MVT::i32, Expand);
+  
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  
+  // Expand jump tables for now
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
+
+  setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
+  setOperationAction(ISD::BlockAddress, MVT::i32 , Custom);
+
+  // Thread Local Storage
+  setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom);
+  
+  // Conversion of i64 -> double produces constantpool nodes
+  setOperationAction(ISD::ConstantPool, MVT::i32,   Custom);
+
+  // Loads
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
+
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Expand);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Expand);
+
+  // Custom expand misaligned loads / stores.
+  setOperationAction(ISD::LOAD, MVT::i32, Custom);
+  setOperationAction(ISD::STORE, MVT::i32, Custom);
+
+  // Varargs
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
+  setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  
+  // Dynamic stack
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand);
+  
+  maxStoresPerMemset = 4;
+  maxStoresPerMemmove = maxStoresPerMemcpy = 2;
+
+  // We have target-specific dag combine patterns for the following nodes:
+  setTargetDAGCombine(ISD::STORE);
+}
+
+SDValue XCoreTargetLowering::
+LowerOperation(SDValue Op, SelectionDAG &DAG) {
+  switch (Op.getOpcode()) 
+  {
+  case ISD::GlobalAddress:    return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::BlockAddress:     return LowerBlockAddress(Op, DAG);
+  case ISD::ConstantPool:     return LowerConstantPool(Op, DAG);
+  case ISD::JumpTable:        return LowerJumpTable(Op, DAG);
+  case ISD::LOAD:             return LowerLOAD(Op, DAG);
+  case ISD::STORE:            return LowerSTORE(Op, DAG);
+  case ISD::SELECT_CC:        return LowerSELECT_CC(Op, DAG);
+  case ISD::VAARG:            return LowerVAARG(Op, DAG);
+  case ISD::VASTART:          return LowerVASTART(Op, DAG);
+  // FIXME: Remove these when LegalizeDAGTypes lands.
+  case ISD::ADD:
+  case ISD::SUB:              return ExpandADDSUB(Op.getNode(), DAG);
+  case ISD::FRAMEADDR:        return LowerFRAMEADDR(Op, DAG);
+  default:
+    llvm_unreachable("unimplemented operand");
+    return SDValue();
+  }
+}
+
+/// ReplaceNodeResults - Replace the results of node with an illegal result
+/// type with new values built out of custom code.
+void XCoreTargetLowering::ReplaceNodeResults(SDNode *N,
+                                             SmallVectorImpl<SDValue>&Results,
+                                             SelectionDAG &DAG) {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this!");
+    return;
+  case ISD::ADD:
+  case ISD::SUB:
+    Results.push_back(ExpandADDSUB(N, DAG));
+    return;
+  }
+}
+
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned XCoreTargetLowering::
+getFunctionAlignment(const Function *) const {
+  return 1;
+}
+
+//===----------------------------------------------------------------------===//
+//  Misc Lower Operation implementation
+//===----------------------------------------------------------------------===//
+
+SDValue XCoreTargetLowering::
+LowerSELECT_CC(SDValue Op, SelectionDAG &DAG)
+{
+  DebugLoc dl = Op.getDebugLoc();
+  SDValue Cond = DAG.getNode(ISD::SETCC, dl, MVT::i32, Op.getOperand(2),
+                             Op.getOperand(3), Op.getOperand(4));
+  return DAG.getNode(ISD::SELECT, dl, MVT::i32, Cond, Op.getOperand(0),
+                     Op.getOperand(1));
+}
+
+SDValue XCoreTargetLowering::
+getGlobalAddressWrapper(SDValue GA, GlobalValue *GV, SelectionDAG &DAG)
+{
+  // FIXME there is no actual debug info here
+  DebugLoc dl = GA.getDebugLoc();
+  if (isa<Function>(GV)) {
+    return DAG.getNode(XCoreISD::PCRelativeWrapper, dl, MVT::i32, GA);
+  }
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar) {
+    // If GV is an alias then use the aliasee to determine constness
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
+  }
+  bool isConst = GVar && GVar->isConstant();
+  if (isConst) {
+    return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
+  }
+  return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalAddress(SDValue Op, SelectionDAG &DAG)
+{
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+  // If it's a debug information descriptor, don't mess with it.
+  if (DAG.isVerifiedDebugInfoDesc(Op))
+    return GA;
+  return getGlobalAddressWrapper(GA, GV, DAG);
+}
+
+static inline SDValue BuildGetId(SelectionDAG &DAG, DebugLoc dl) {
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
+                     DAG.getConstant(Intrinsic::xcore_getid, MVT::i32));
+}
+
+static inline bool isZeroLengthArray(const Type *Ty) {
+  const ArrayType *AT = dyn_cast_or_null<ArrayType>(Ty);
+  return AT && (AT->getNumElements() == 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG)
+{
+  // Lower a thread-local global's address as
+  //   wrapper(label) + getid() * sizeof(object)
+  // i.e. each thread addresses its own copy, offset by its thread id.
+  // The object's size must therefore be known at compile time.
+  // FIXME there isn't really debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  // transform to label + getid() * size
+  GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  SDValue GA = DAG.getTargetGlobalAddress(GV, MVT::i32);
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  if (!GVar) {
+    // If GV is an alias then use the aliasee to determine size
+    // (note: the inner GA below shadows the SDValue GA above).
+    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->resolveAliasedGlobal());
+  }
+  if (! GVar) {
+    llvm_unreachable("Thread local object not a GlobalVariable?");
+    return SDValue();
+  }
+  const Type *Ty = cast<PointerType>(GV->getType())->getElementType();
+  if (!Ty->isSized() || isZeroLengthArray(Ty)) {
+#ifndef NDEBUG
+    errs() << "Size of thread local object " << GVar->getName()
+           << " is unknown\n";
+#endif
+    llvm_unreachable(0);
+  }
+  // base = section-relative wrapped address of the (per-thread-replicated)
+  // object; offset = thread-id * per-thread size.
+  SDValue base = getGlobalAddressWrapper(GA, GV, DAG);
+  const TargetData *TD = TM.getTargetData();
+  unsigned Size = TD->getTypeAllocSize(Ty);
+  SDValue offset = DAG.getNode(ISD::MUL, dl, MVT::i32, BuildGetId(DAG, dl),
+                       DAG.getConstant(Size, MVT::i32));
+  return DAG.getNode(ISD::ADD, dl, MVT::i32, base, offset);
+}
+
+SDValue XCoreTargetLowering::
+LowerBlockAddress(SDValue Op, SelectionDAG &DAG)
+{
+  // Block addresses are addressed pc-relative on XCore.
+  DebugLoc DL = Op.getDebugLoc();
+  BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  EVT PtrVT = getPointerTy();
+  SDValue TargetBA = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true);
+  return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, PtrVT, TargetBA);
+}
+
+SDValue XCoreTargetLowering::
+LowerConstantPool(SDValue Op, SelectionDAG &DAG)
+{
+  // Constant-pool entries are addressed cp-relative on XCore; pick the
+  // getTargetConstantPool overload matching the entry kind and wrap it.
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  // FIXME there isn't really debug info here
+  DebugLoc dl = CP->getDebugLoc();
+  EVT PtrVT = Op.getValueType();
+  SDValue CPIdx;
+  if (CP->isMachineConstantPoolEntry())
+    CPIdx = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
+                                      CP->getAlignment());
+  else
+    CPIdx = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
+                                      CP->getAlignment());
+  return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, CPIdx);
+}
+
+SDValue XCoreTargetLowering::
+LowerJumpTable(SDValue Op, SelectionDAG &DAG)
+{
+  // Jump tables are emitted into the data region and addressed dp-relative.
+  // FIXME there isn't really debug info here
+  DebugLoc dl = Op.getDebugLoc();
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  SDValue TargetJT = DAG.getTargetJumpTable(JT->getIndex(),
+                                            Op.getValueType());
+  return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, TargetJT);
+}
+
+// Returns true if Addr has the form (add base, constant) where base can be
+// proven word (4-byte) aligned, i.e. it is a frame index or a dp/cp-relative
+// wrapper, possibly with an additional index scaled by shl >= 2 (so the
+// scaled index contributes a multiple of 4 and cannot break the alignment).
+// On success the base expression and the constant offset are returned via
+// AlignedBase / Offset.
+static bool
+IsWordAlignedBasePlusConstantOffset(SDValue Addr, SDValue &AlignedBase,
+                                    int64_t &Offset)
+{
+  if (Addr.getOpcode() != ISD::ADD) {
+    return false;
+  }
+  ConstantSDNode *CN = 0;
+  if (!(CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))) {
+    return false;
+  }
+  int64_t off = CN->getSExtValue();
+  const SDValue &Base = Addr.getOperand(0);
+  const SDValue *Root = &Base;
+  if (Base.getOpcode() == ISD::ADD &&
+      Base.getOperand(1).getOpcode() == ISD::SHL) {
+    // Look through (add root, (shl x, k)) with k >= 2: the shifted term is
+    // a multiple of 4, so alignment is decided by root alone.
+    // (Note: this CN deliberately shadows the outer CN.)
+    ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Base.getOperand(1)
+                                                      .getOperand(1));
+    if (CN && (CN->getSExtValue() >= 2)) {
+      Root = &Base.getOperand(0);
+    }
+  }
+  if (isa<FrameIndexSDNode>(*Root)) {
+    // All frame indices are word aligned
+    AlignedBase = Base;
+    Offset = off;
+    return true;
+  }
+  if (Root->getOpcode() == XCoreISD::DPRelativeWrapper ||
+      Root->getOpcode() == XCoreISD::CPRelativeWrapper) {
+    // All dp / cp relative addresses are word aligned
+    AlignedBase = Base;
+    Offset = off;
+    return true;
+  }
+  return false;
+}
+
+SDValue XCoreTargetLowering::
+LowerLOAD(SDValue Op, SelectionDAG &DAG)
+{
+  // Custom-lower a misaligned i32 load. Strategy, in decreasing order of
+  // preference:
+  //  1. If the address is provably base + constant offset with a word
+  //     aligned base, synthesise the value from aligned word loads.
+  //  2. If the load is 2-byte aligned, combine two zext/ext i16 loads.
+  //  3. Otherwise call the __misaligned_load runtime helper.
+  // Returns SDValue() to request default handling for already-legal loads.
+  LoadSDNode *LD = cast<LoadSDNode>(Op);
+  assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
+         "Unexpected extension type");
+  assert(LD->getMemoryVT() == MVT::i32 && "Unexpected load EVT");
+  if (allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+    return SDValue();
+  }
+  unsigned ABIAlignment = getTargetData()->
+    getABITypeAlignment(LD->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned load alone.
+  if (LD->getAlignment() >= ABIAlignment) {
+    return SDValue();
+  }
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  DebugLoc dl = Op.getDebugLoc();
+  
+  SDValue Base;
+  int64_t Offset;
+  if (!LD->isVolatile() &&
+      IsWordAlignedBasePlusConstantOffset(BasePtr, Base, Offset)) {
+    if (Offset % 4 == 0) {
+      // We've managed to infer better alignment information than the load
+      // already has. Use an aligned load.
+      // NOTE(review): the literal 4 here (and in the Low/High loads below)
+      // occupies the SVOffset parameter slot of getLoad, with alignment left
+      // at its default (ABI alignment, which is 4 for i32); presumably an
+      // explicit alignment of 4 was intended — confirm against getLoad's
+      // signature.
+      return DAG.getLoad(getPointerTy(), dl, Chain, BasePtr, NULL, 4);
+    }
+    // Lower to
+    // ldw low, base[offset >> 2]
+    // ldw high, base[(offset >> 2) + 1]
+    // shr low_shifted, low, (offset & 0x3) * 8
+    // shl high_shifted, high, 32 - (offset & 0x3) * 8
+    // or result, low_shifted, high_shifted
+    SDValue LowOffset = DAG.getConstant(Offset & ~0x3, MVT::i32);
+    SDValue HighOffset = DAG.getConstant((Offset & ~0x3) + 4, MVT::i32);
+    SDValue LowShift = DAG.getConstant((Offset & 0x3) * 8, MVT::i32);
+    SDValue HighShift = DAG.getConstant(32 - (Offset & 0x3) * 8, MVT::i32);
+    
+    SDValue LowAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Base, LowOffset);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, Base, HighOffset);
+    
+    SDValue Low = DAG.getLoad(getPointerTy(), dl, Chain,
+                               LowAddr, NULL, 4);
+    SDValue High = DAG.getLoad(getPointerTy(), dl, Chain,
+                               HighAddr, NULL, 4);
+    SDValue LowShifted = DAG.getNode(ISD::SRL, dl, MVT::i32, Low, LowShift);
+    SDValue HighShifted = DAG.getNode(ISD::SHL, dl, MVT::i32, High, HighShift);
+    SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, LowShifted, HighShifted);
+    // Both loads hang off the original chain; join their output chains.
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Low.getValue(1),
+                             High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, 2, dl);
+  }
+  
+  if (LD->getAlignment() == 2) {
+    // result = zext(load16 p) | (ext(load16 p+2) << 16)
+    int SVOffset = LD->getSrcValueOffset();
+    SDValue Low = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i32, Chain,
+                                 BasePtr, LD->getSrcValue(), SVOffset, MVT::i16,
+                                 LD->isVolatile(), 2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue High = DAG.getExtLoad(ISD::EXTLOAD, dl, MVT::i32, Chain,
+                                  HighAddr, LD->getSrcValue(), SVOffset + 2,
+                                  MVT::i16, LD->isVolatile(), 2);
+    SDValue HighShifted = DAG.getNode(ISD::SHL, dl, MVT::i32, High,
+                                      DAG.getConstant(16, MVT::i32));
+    SDValue Result = DAG.getNode(ISD::OR, dl, MVT::i32, Low, HighShifted);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Low.getValue(1),
+                             High.getValue(1));
+    SDValue Ops[] = { Result, Chain };
+    return DAG.getMergeValues(Ops, 2, dl);
+  }
+  
+  // Lower to a call to __misaligned_load(BasePtr).
+  const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+  
+  std::pair<SDValue, SDValue> CallResult =
+        LowerCallTo(Chain, IntPtrTy, false, false,
+                    false, false, 0, CallingConv::C, false,
+                    /*isReturnValueUsed=*/true,
+                    DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
+                    Args, DAG, dl, DAG.GetOrdering(Chain.getNode()));
+
+  // Return the call's result value and its chain, as the load's replacement.
+  SDValue Ops[] =
+    { CallResult.first, CallResult.second };
+
+  return DAG.getMergeValues(Ops, 2, dl);
+}
+
+SDValue XCoreTargetLowering::
+LowerSTORE(SDValue Op, SelectionDAG &DAG)
+{
+  // Custom-lower a misaligned i32 store: 2-byte aligned stores become two
+  // truncating i16 stores; anything worse calls __misaligned_store.
+  // Returns SDValue() to request default handling for already-legal stores.
+  StoreSDNode *ST = cast<StoreSDNode>(Op);
+  assert(!ST->isTruncatingStore() && "Unexpected store type");
+  assert(ST->getMemoryVT() == MVT::i32 && "Unexpected store EVT");
+  if (allowsUnalignedMemoryAccesses(ST->getMemoryVT())) {
+    return SDValue();
+  }
+  unsigned ABIAlignment = getTargetData()->
+    getABITypeAlignment(ST->getMemoryVT().getTypeForEVT(*DAG.getContext()));
+  // Leave aligned store alone.
+  if (ST->getAlignment() >= ABIAlignment) {
+    return SDValue();
+  }
+  SDValue Chain = ST->getChain();
+  SDValue BasePtr = ST->getBasePtr();
+  SDValue Value = ST->getValue();
+  DebugLoc dl = Op.getDebugLoc();
+  
+  if (ST->getAlignment() == 2) {
+    // store16 p, lo16(value); store16 p+2, hi16(value)
+    int SVOffset = ST->getSrcValueOffset();
+    SDValue Low = Value;
+    SDValue High = DAG.getNode(ISD::SRL, dl, MVT::i32, Value,
+                                      DAG.getConstant(16, MVT::i32));
+    SDValue StoreLow = DAG.getTruncStore(Chain, dl, Low, BasePtr,
+                                         ST->getSrcValue(), SVOffset, MVT::i16,
+                                         ST->isVolatile(), 2);
+    SDValue HighAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, BasePtr,
+                                   DAG.getConstant(2, MVT::i32));
+    SDValue StoreHigh = DAG.getTruncStore(Chain, dl, High, HighAddr,
+                                          ST->getSrcValue(), SVOffset + 2,
+                                          MVT::i16, ST->isVolatile(), 2);
+    // Join the two independent store chains.
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StoreLow, StoreHigh);
+  }
+  
+  // Lower to a call to __misaligned_store(BasePtr, Value).
+  const Type *IntPtrTy = getTargetData()->getIntPtrType(*DAG.getContext());
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  
+  Entry.Ty = IntPtrTy;
+  Entry.Node = BasePtr;
+  Args.push_back(Entry);
+  
+  Entry.Node = Value;
+  Args.push_back(Entry);
+  
+  std::pair<SDValue, SDValue> CallResult =
+        LowerCallTo(Chain, Type::getVoidTy(*DAG.getContext()), false, false,
+                    false, false, 0, CallingConv::C, false,
+                    /*isReturnValueUsed=*/true,
+                    DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
+                    Args, DAG, dl, DAG.GetOrdering(Chain.getNode()));
+
+  // The helper returns void; only the chain matters.
+  return CallResult.second;
+}
+
+SDValue XCoreTargetLowering::
+ExpandADDSUB(SDNode *N, SelectionDAG &DAG)
+{
+  // Expand a 64-bit add/sub into a pair of 32-bit LADD/LSUB nodes chained
+  // through the carry, then reassemble the i64 with BUILD_PAIR.
+  assert(N->getValueType(0) == MVT::i64 &&
+         (N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+        "Unknown operand to lower!");
+  DebugLoc dl = N->getDebugLoc();
+  
+  // Extract components
+  SDValue LHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            N->getOperand(0),  DAG.getConstant(0, MVT::i32));
+  SDValue LHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                            N->getOperand(0),  DAG.getConstant(1, MVT::i32));
+  SDValue RHSL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1), DAG.getConstant(0, MVT::i32));
+  SDValue RHSH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
+                             N->getOperand(1), DAG.getConstant(1, MVT::i32));
+  
+  // Expand
+  unsigned Opcode = (N->getOpcode() == ISD::ADD) ? XCoreISD::LADD :
+                                                   XCoreISD::LSUB;
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  // LADD/LSUB produce two results as used here: result 0 is the carry-out
+  // (named Carry / Ignored below) and result 1 is the 32-bit sum (Lo / Hi).
+  // The low part's carry-out feeds the high part's carry-in; the high
+  // part's carry-out is discarded.
+  SDValue Carry = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                                  LHSL, RHSL, Zero);
+  SDValue Lo(Carry.getNode(), 1);
+  
+  SDValue Ignored = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                                  LHSH, RHSH, Carry);
+  SDValue Hi(Ignored.getNode(), 1);
+  // Merge the pieces
+  return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
+}
+
+SDValue XCoreTargetLowering::
+LowerVAARG(SDValue Op, SelectionDAG &DAG)
+{
+  // Not implemented: lowering aborts immediately, so everything below the
+  // llvm_unreachable is dead code kept as a sketch of the eventual
+  // implementation (load va_list, bump it, store it back, load the arg).
+  llvm_unreachable("unimplemented");
+  // FIX Arguments passed by reference need a extra dereference.
+  SDNode *Node = Op.getNode();
+  DebugLoc dl = Node->getDebugLoc();
+  const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+  EVT VT = Node->getValueType(0);
+  SDValue VAList = DAG.getLoad(getPointerTy(), dl, Node->getOperand(0),
+                               Node->getOperand(1), V, 0);
+  // Increment the pointer, VAList, to the next vararg
+  // NOTE(review): the increment uses VT.getSizeInBits(), i.e. bits rather
+  // than bytes — would need fixing if this path is ever enabled.
+  SDValue Tmp3 = DAG.getNode(ISD::ADD, dl, getPointerTy(), VAList, 
+                     DAG.getConstant(VT.getSizeInBits(), 
+                                     getPointerTy()));
+  // Store the incremented VAList to the legalized pointer
+  Tmp3 = DAG.getStore(VAList.getValue(1), dl, Tmp3, Node->getOperand(1), V, 0);
+  // Load the actual argument out of the pointer VAList
+  return DAG.getLoad(VT, dl, Tmp3, VAList, NULL, 0);
+}
+
+SDValue XCoreTargetLowering::
+LowerVASTART(SDValue Op, SelectionDAG &DAG)
+{
+  // vastart stores the address of the VarArgsFrameIndex slot into the
+  // memory location given by operand 1 (operand 0 is the chain and
+  // operand 2 the source-value annotation).
+  DebugLoc dl = Op.getDebugLoc();
+  MachineFunction &MF = DAG.getMachineFunction();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  SDValue VAListAddr = DAG.getFrameIndex(XFI->getVarArgsFrameIndex(),
+                                         MVT::i32);
+  const Value *SrcVal = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), dl, VAListAddr, Op.getOperand(1),
+                      SrcVal, 0);
+}
+
+SDValue XCoreTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) {
+  DebugLoc dl = Op.getDebugLoc();
+  // Only depth 0 (the current frame) is supported; return a null SDValue
+  // for deeper requests so the caller can report the failure.
+  uint64_t Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth > 0)
+    return SDValue();
+  
+  // The frame address is simply a copy of the frame register.
+  MachineFunction &MF = DAG.getMachineFunction();
+  const TargetRegisterInfo *RegInfo = getTargetMachine().getRegisterInfo();
+  unsigned FrameReg = RegInfo->getFrameRegister(MF);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, MVT::i32);
+}
+
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+#include "XCoreGenCallingConv.inc"
+
+//===----------------------------------------------------------------------===//
+//                  Call Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore call implementation
+SDValue
+XCoreTargetLowering::LowerCall(SDValue Chain, SDValue Callee,
+                               CallingConv::ID CallConv, bool isVarArg,
+                               bool &isTailCall,
+                               const SmallVectorImpl<ISD::OutputArg> &Outs,
+                               const SmallVectorImpl<ISD::InputArg> &Ins,
+                               DebugLoc dl, SelectionDAG &DAG,
+                               SmallVectorImpl<SDValue> &InVals) {
+  // XCore target does not yet support tail call optimization.
+  isTailCall = false;
+
+  // Only the C calling convention (and Fast, treated identically) is
+  // implemented for now.
+  if (CallConv != CallingConv::C && CallConv != CallingConv::Fast)
+    llvm_unreachable("Unsupported calling convention");
+  return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
+                        Outs, Ins, dl, DAG, InVals);
+}
+
+/// LowerCCCCallTo - functions arguments are copied from virtual
+/// regs to (physical regs)/(stack frame), CALLSEQ_START and
+/// CALLSEQ_END are emitted.
+/// TODO: isTailCall, sret.
+SDValue
+XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                                    CallingConv::ID CallConv, bool isVarArg,
+                                    bool isTailCall,
+                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                    const SmallVectorImpl<ISD::InputArg> &Ins,
+                                    DebugLoc dl, SelectionDAG &DAG,
+                                    SmallVectorImpl<SDValue> &InVals) {
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  // The ABI dictates there should be one stack slot available to the callee
+  // on function entry (for saving lr).
+  CCInfo.AllocateStack(4, 4);
+
+  CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
+
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
+
+  Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, 
+                                 getPointerTy(), true));
+
+  SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
+  SmallVector<SDValue, 12> MemOpChains;
+
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = Outs[i].Val;
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+      default: llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full: break;
+      case CCValAssign::SExt:
+        Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::ZExt:
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+      case CCValAssign::AExt:
+        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+        break;
+    }
+    
+    // Arguments that can be passed on register must be kept at 
+    // RegsToPass vector
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+
+      int Offset = VA.getLocMemOffset();
+
+      // Stack arguments are stored relative to sp via STWSP, whose operand
+      // is a word (not byte) offset — hence the division by 4.
+      MemOpChains.push_back(DAG.getNode(XCoreISD::STWSP, dl, MVT::Other, 
+                                        Chain, Arg,
+                                        DAG.getConstant(Offset/4, MVT::i32)));
+    }
+  }
+
+  // Transform all store nodes into one single node because
+  // all store nodes are independent of each other.
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, 
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token 
+  // chain and flag operands which copy the outgoing args into registers.
+  // The InFlag is necessary since all emitted instructions must be
+  // stuck together.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, 
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
+  }
+
+  // If the callee is a GlobalAddress node (quite common, every direct call is)
+  // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+  // Likewise ExternalSymbol -> TargetExternalSymbol.
+  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    Callee = DAG.getTargetGlobalAddress(G->getGlobal(), MVT::i32);
+  else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee))
+    Callee = DAG.getTargetExternalSymbol(E->getSymbol(), MVT::i32);
+
+  // XCoreBranchLink = #chain, #target_address, #opt_in_flags...
+  //             = Chain, Callee, Reg#1, Reg#2, ...  
+  //
+  // Returns a chain & a flag for retval copy to use.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+  SmallVector<SDValue, 8> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
+
+  // Add argument registers to the end of the list so that they are 
+  // known live into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
+
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
+
+  Chain  = DAG.getNode(XCoreISD::BL, dl, NodeTys, &Ops[0], Ops.size());
+  InFlag = Chain.getValue(1);
+
+  // Create the CALLSEQ_END node.
+  Chain = DAG.getCALLSEQ_END(Chain,
+                             DAG.getConstant(NumBytes, getPointerTy(), true),
+                             DAG.getConstant(0, getPointerTy(), true),
+                             InFlag);
+  InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
+                         Ins, dl, DAG, InVals);
+}
+
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue
+XCoreTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
+                                     CallingConv::ID CallConv, bool isVarArg,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins,
+                                     DebugLoc dl, SelectionDAG &DAG,
+                                     SmallVectorImpl<SDValue> &InVals) {
+
+  // Assign locations to each value returned by this call.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
+
+  // Copy all of the result registers out of their specified physreg.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    // CopyFromReg yields (value, chain, flag): result 1 becomes the new
+    // chain, result 2 the glue consumed by the next copy, and result 0 the
+    // returned value handed back to the caller via InVals.
+    Chain = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
+                                 RVLocs[i].getValVT(), InFlag).getValue(1);
+    InFlag = Chain.getValue(2);
+    InVals.push_back(Chain.getValue(0));
+  }
+
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//             Formal Arguments Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+/// XCore formal arguments implementation
+SDValue
+XCoreTargetLowering::LowerFormalArguments(SDValue Chain,
+                                          CallingConv::ID CallConv,
+                                          bool isVarArg,
+                                      const SmallVectorImpl<ISD::InputArg> &Ins,
+                                          DebugLoc dl,
+                                          SelectionDAG &DAG,
+                                          SmallVectorImpl<SDValue> &InVals) {
+  // Only the C calling convention (and Fast, treated identically) is
+  // supported; everything else is a hard error.
+  if (CallConv != CallingConv::C && CallConv != CallingConv::Fast)
+    llvm_unreachable("Unsupported calling convention");
+  return LowerCCCArguments(Chain, CallConv, isVarArg, Ins, dl, DAG, InVals);
+}
+
+/// LowerCCCArguments - transform physical registers into
+/// virtual registers and generate load operations for
+/// arguments places on the stack.
+/// TODO: sret
+SDValue
+XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
+                                       CallingConv::ID CallConv,
+                                       bool isVarArg,
+                                       const SmallVectorImpl<ISD::InputArg>
+                                         &Ins,
+                                       DebugLoc dl,
+                                       SelectionDAG &DAG,
+                                       SmallVectorImpl<SDValue> &InVals) {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 ArgLocs, *DAG.getContext());
+
+  CCInfo.AnalyzeFormalArguments(Ins, CC_XCore);
+
+  unsigned StackSlotSize = XCoreFrameInfo::stackSlotSize();
+
+  // Incoming stack arguments sit above the slot reserved for saving lr.
+  unsigned LRSaveSize = StackSlotSize;
+  
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+
+    CCValAssign &VA = ArgLocs[i];
+    
+    if (VA.isRegLoc()) {
+      // Arguments passed in registers
+      EVT RegVT = VA.getLocVT();
+      switch (RegVT.getSimpleVT().SimpleTy) {
+      default:
+        {
+#ifndef NDEBUG
+          errs() << "LowerFormalArguments Unhandled argument type: "
+                 << RegVT.getSimpleVT().SimpleTy << "\n";
+#endif
+          llvm_unreachable(0);
+        }
+      case MVT::i32:
+        unsigned VReg = RegInfo.createVirtualRegister(
+                          XCore::GRRegsRegisterClass);
+        RegInfo.addLiveIn(VA.getLocReg(), VReg);
+        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+      }
+    } else {
+      // sanity check
+      assert(VA.isMemLoc());
+      // Load the argument to a virtual register
+      unsigned ObjSize = VA.getLocVT().getSizeInBits()/8;
+      if (ObjSize > StackSlotSize) {
+        errs() << "LowerFormalArguments Unhandled argument type: "
+               << (unsigned)VA.getLocVT().getSimpleVT().SimpleTy
+               << "\n";
+      }
+      // Create the frame index object for this incoming parameter...
+      int FI = MFI->CreateFixedObject(ObjSize,
+                                      LRSaveSize + VA.getLocMemOffset(),
+                                      true, false);
+
+      // Create the SelectionDAG nodes corresponding to a load
+      //from this parameter
+      SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+      InVals.push_back(DAG.getLoad(VA.getLocVT(), dl, Chain, FIN, NULL, 0));
+    }
+  }
+  
+  if (isVarArg) {
+    /* Argument registers */
+    static const unsigned ArgRegs[] = {
+      XCore::R0, XCore::R1, XCore::R2, XCore::R3
+    };
+    XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+    unsigned FirstVAReg = CCInfo.getFirstUnallocated(ArgRegs,
+                                                     array_lengthof(ArgRegs));
+    if (FirstVAReg < array_lengthof(ArgRegs)) {
+      SmallVector<SDValue, 4> MemOps;
+      int offset = 0;
+      // Save remaining registers, storing higher register numbers at a higher
+      // address.
+      // Use a signed induction variable: with an unsigned i, the condition
+      // i >= FirstVAReg is vacuously true when FirstVAReg == 0, so --i would
+      // wrap past zero and index ArgRegs out of bounds.
+      for (int i = (int)array_lengthof(ArgRegs) - 1;
+           i >= (int)FirstVAReg; --i) {
+        // Create a stack slot
+        int FI = MFI->CreateFixedObject(4, offset, true, false);
+        if (i == (int)FirstVAReg) {
+          // The lowest spilled register slot is where va_start begins.
+          XFI->setVarArgsFrameIndex(FI);
+        }
+        offset -= StackSlotSize;
+        SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
+        // Move argument from phys reg -> virt reg
+        unsigned VReg = RegInfo.createVirtualRegister(
+                          XCore::GRRegsRegisterClass);
+        RegInfo.addLiveIn(ArgRegs[i], VReg);
+        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
+        // Move argument from virt reg -> stack
+        SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, NULL, 0);
+        MemOps.push_back(Store);
+      }
+      if (!MemOps.empty())
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                            &MemOps[0], MemOps.size());
+    } else {
+      // This will point to the next argument passed via stack.
+      XFI->setVarArgsFrameIndex(
+        MFI->CreateFixedObject(4, LRSaveSize + CCInfo.getNextStackOffset(),
+                               true, false));
+    }
+  }
+  
+  return Chain;
+}
+
+//===----------------------------------------------------------------------===//
+//               Return Value Calling Convention Implementation
+//===----------------------------------------------------------------------===//
+
+bool XCoreTargetLowering::
+CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+               const SmallVectorImpl<EVT> &OutTys,
+               const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
+               SelectionDAG &DAG) {
+  // Tentatively assign locations to the proposed return values; report
+  // whether they can all be lowered with RetCC_XCore.
+  SmallVector<CCValAssign, 16> ReturnLocs;
+  CCState Info(CallConv, isVarArg, getTargetMachine(), ReturnLocs,
+               *DAG.getContext());
+  return Info.CheckReturn(OutTys, ArgsFlags, RetCC_XCore);
+}
+
+// Lower a function return: copy the return values into their assigned
+// physical registers (glued together) and emit the XCore RETSP node.
+SDValue
+XCoreTargetLowering::LowerReturn(SDValue Chain,
+                                 CallingConv::ID CallConv, bool isVarArg,
+                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                 DebugLoc dl, SelectionDAG &DAG) {
+
+  // CCValAssign - represent the assignment of
+  // the return value to a location
+  SmallVector<CCValAssign, 16> RVLocs;
+
+  // CCState - Info about the registers and stack slot.
+  CCState CCInfo(CallConv, isVarArg, getTargetMachine(),
+                 RVLocs, *DAG.getContext());
+
+  // Analyze return values.
+  CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
+
+  // If this is the first return lowered for this function, add 
+  // the regs to the liveout set for the function.
+  if (DAG.getMachineFunction().getRegInfo().liveout_empty()) {
+    for (unsigned i = 0; i != RVLocs.size(); ++i)
+      if (RVLocs[i].isRegLoc())
+        DAG.getMachineFunction().getRegInfo().addLiveOut(RVLocs[i].getLocReg());
+  }
+
+  SDValue Flag;
+
+  // Copy the result values into the output registers.
+  for (unsigned i = 0; i != RVLocs.size(); ++i) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+
+    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), 
+                             Outs[i].Val, Flag);
+
+    // guarantee that all emitted copies are
+    // stuck together, avoiding something bad
+    Flag = Chain.getValue(1);
+  }
+
+  // Return on XCore is always a "retsp 0"
+  // (the glue operand is only attached when result copies were emitted).
+  if (Flag.getNode())
+    return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other,
+                       Chain, DAG.getConstant(0, MVT::i32), Flag);
+  else // Return Void
+    return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other,
+                       Chain, DAG.getConstant(0, MVT::i32));
+}
+
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
+
+// Expand the SELECT_CC pseudo-instruction into an explicit branch diamond:
+// thisMBB conditionally branches to sinkMBB (true value), otherwise falls
+// through copy0MBB (false value); sinkMBB merges the two with a PHI.
+// EM records the old-successor -> sinkMBB remapping for sdisel.
+MachineBasicBlock *
+XCoreTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                 MachineBasicBlock *BB,
+                   DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const {
+  const TargetInstrInfo &TII = *getTargetMachine().getInstrInfo();
+  DebugLoc dl = MI->getDebugLoc();
+  assert((MI->getOpcode() == XCore::SELECT_CC) &&
+         "Unexpected instr type to insert");
+  
+  // To "insert" a SELECT_CC instruction, we actually have to insert the diamond
+  // control-flow pattern.  The incoming instruction knows the destination vreg
+  // to set, the condition code register to branch on, the true/false values to
+  // select between, and a branch opcode to use.
+  const BasicBlock *LLVM_BB = BB->getBasicBlock();
+  MachineFunction::iterator It = BB;
+  ++It;
+  
+  //  thisMBB:
+  //  ...
+  //   TrueVal = ...
+  //   cmpTY ccX, r1, r2
+  //   bCC copy1MBB
+  //   fallthrough --> copy0MBB
+  MachineBasicBlock *thisMBB = BB;
+  MachineFunction *F = BB->getParent();
+  MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
+  MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+  // Branch to sinkMBB when the condition register (operand 1) is true.
+  BuildMI(BB, dl, TII.get(XCore::BRFT_lru6))
+    .addReg(MI->getOperand(1).getReg()).addMBB(sinkMBB);
+  F->insert(It, copy0MBB);
+  F->insert(It, sinkMBB);
+  // Update machine-CFG edges by first adding all successors of the current
+  // block to the new block which will contain the Phi node for the select.
+  // Also inform sdisel of the edge changes.
+  for (MachineBasicBlock::succ_iterator I = BB->succ_begin(), 
+         E = BB->succ_end(); I != E; ++I) {
+    EM->insert(std::make_pair(*I, sinkMBB));
+    sinkMBB->addSuccessor(*I);
+  }
+  // Next, remove all successors of the current block, and add the true
+  // and fallthrough blocks as its successors.
+  while (!BB->succ_empty())
+    BB->removeSuccessor(BB->succ_begin());
+  // Next, add the true and fallthrough blocks as its successors.
+  BB->addSuccessor(copy0MBB);
+  BB->addSuccessor(sinkMBB);
+  
+  //  copy0MBB:
+  //   %FalseValue = ...
+  //   # fallthrough to sinkMBB
+  BB = copy0MBB;
+  
+  // Update machine-CFG edges
+  BB->addSuccessor(sinkMBB);
+  
+  //  sinkMBB:
+  //   %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+  //  ...
+  BB = sinkMBB;
+  // Operand 0 is the dest vreg; operands 2/3 are the true/false values.
+  BuildMI(BB, dl, TII.get(XCore::PHI), MI->getOperand(0).getReg())
+    .addReg(MI->getOperand(3).getReg()).addMBB(copy0MBB)
+    .addReg(MI->getOperand(2).getReg()).addMBB(thisMBB)
+  
+  F->DeleteMachineInstr(MI);   // The pseudo instruction is gone now.
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Target Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+// Target-specific DAG combine: currently only rewrites an unaligned store
+// whose value is a matching single-use unaligned load into a memmove call,
+// avoiding the expensive misaligned load+store expansion.
+SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
+                                             DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  DebugLoc dl = N->getDebugLoc();
+  switch (N->getOpcode()) {
+  default: break;
+  case ISD::STORE: {
+    // Replace unaligned store of unaligned load with memmove.
+    StoreSDNode *ST  = cast<StoreSDNode>(N);
+    // Only applicable before legalization, and only to plain (non-volatile,
+    // non-indexed) stores on targets that trap on unaligned access.
+    if (!DCI.isBeforeLegalize() ||
+        allowsUnalignedMemoryAccesses(ST->getMemoryVT()) ||
+        ST->isVolatile() || ST->isIndexed()) {
+      break;
+    }
+    SDValue Chain = ST->getChain();
+
+    // The stored type must be an exact number of bytes for memmove.
+    unsigned StoreBits = ST->getMemoryVT().getStoreSizeInBits();
+    if (StoreBits % 8) {
+      break;
+    }
+    unsigned ABIAlignment = getTargetData()->getABITypeAlignment(
+        ST->getMemoryVT().getTypeForEVT(*DCI.DAG.getContext()));
+    unsigned Alignment = ST->getAlignment();
+    // Aligned stores are fine as-is.
+    if (Alignment >= ABIAlignment) {
+      break;
+    }
+
+    // The load must feed only this store, match its type/alignment, and be
+    // reachable from the store's chain with no intervening side effects
+    // (otherwise the memmove could observe or clobber different memory).
+    if (LoadSDNode *LD = dyn_cast<LoadSDNode>(ST->getValue())) {
+      if (LD->hasNUsesOfValue(1, 0) && ST->getMemoryVT() == LD->getMemoryVT() &&
+        LD->getAlignment() == Alignment &&
+        !LD->isVolatile() && !LD->isIndexed() &&
+        Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) {
+        return DAG.getMemmove(Chain, dl, ST->getBasePtr(),
+                              LD->getBasePtr(),
+                              DAG.getConstant(StoreBits/8, MVT::i32),
+                              Alignment, ST->getSrcValue(),
+                              ST->getSrcValueOffset(), LD->getSrcValue(),
+                              LD->getSrcValueOffset());
+      }
+    }
+    break;
+  }
+  }
+  return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+//  Addressing mode description hooks
+//===----------------------------------------------------------------------===//
+
+/// isImmUs - Return true if val fits in an unsigned-short immediate
+/// field, i.e. the range [0, 11].
+static inline bool isImmUs(int64_t val) {
+  return 0 <= val && val <= 11;
+}
+
+/// isImmUs2 - Return true if val is even and val/2 fits the US range
+/// (a halfword-scaled unsigned-short immediate).
+static inline bool isImmUs2(int64_t val) {
+  if (val % 2 != 0)
+    return false;
+  return isImmUs(val / 2);
+}
+
+/// isImmUs4 - Return true if val is a multiple of 4 and val/4 fits the US
+/// range (a word-scaled unsigned-short immediate).
+static inline bool isImmUs4(int64_t val) {
+  if (val % 4 != 0)
+    return false;
+  return isImmUs(val / 4);
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool
+XCoreTargetLowering::isLegalAddressingMode(const AddrMode &AM, 
+                                              const Type *Ty) const {
+  // Be conservative with void
+  // FIXME: Can we be more aggressive?
+  if (Ty->getTypeID() == Type::VoidTyID)
+    return false;
+
+  const TargetData *TD = TM.getTargetData();
+  unsigned Size = TD->getTypeAllocSize(Ty);
+  // Global-relative addressing: only a word-aligned constant offset from
+  // the global, with no extra base register or scaled index register.
+  if (AM.BaseGV) {
+    return Size >= 4 && !AM.HasBaseReg && AM.Scale == 0 &&
+                 AM.BaseOffs%4 == 0;
+  }
+  
+  // Register-based addressing: the legal immediate and index-register
+  // scales depend on the access size (immediates and the index register
+  // are scaled by the access width).
+  switch (Size) {
+  case 1:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs(AM.BaseOffs);
+    }
+    // reg + reg
+    return AM.Scale == 1 && AM.BaseOffs == 0;
+  case 2:
+  case 3:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs2(AM.BaseOffs);
+    }
+    // reg + reg<<1
+    return AM.Scale == 2 && AM.BaseOffs == 0;
+  default:
+    // reg + imm
+    if (AM.Scale == 0) {
+      return isImmUs4(AM.BaseOffs);
+    }
+    // reg + reg<<2
+    return AM.Scale == 4 && AM.BaseOffs == 0;
+  }
+  
+  // Unreachable: every case of the switch above returns.
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                           XCore Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+/// getRegClassForInlineAsmConstraint - Return the registers usable for the
+/// given single-letter inline-asm constraint.  Only 'r' (general-purpose
+/// register) is supported, mapping to R0-R11.
+std::vector<unsigned> XCoreTargetLowering::
+getRegClassForInlineAsmConstraint(const std::string &Constraint,
+                                  EVT VT) const 
+{
+  if (Constraint.size() != 1)
+    return std::vector<unsigned>();
+
+  switch (Constraint[0]) {
+    default : break;
+    case 'r':
+      // make_vector's argument list is terminated by a trailing 0.
+      return make_vector<unsigned>(XCore::R0, XCore::R1,  XCore::R2, 
+                                   XCore::R3, XCore::R4,  XCore::R5, 
+                                   XCore::R6, XCore::R7,  XCore::R8, 
+                                   XCore::R9, XCore::R10, XCore::R11, 0);
+      break;
+  }
+  // Unknown constraint letter: no registers to offer.
+  return std::vector<unsigned>();
+}
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
new file mode 100644
index 0000000..f7b620e
--- /dev/null
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -0,0 +1,172 @@
+//===-- XCoreISelLowering.h - XCore DAG Lowering Interface ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interfaces that XCore uses to lower LLVM code into a
+// selection DAG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREISELLOWERING_H
+#define XCOREISELLOWERING_H
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+#include "XCore.h"
+
+namespace llvm {
+  
+  // Forward declarations
+  class XCoreSubtarget;
+  class XCoreTargetMachine;
+  
+  // XCoreISD - XCore-specific SelectionDAG node types, produced by the
+  // custom lowering code in XCoreISelLowering.cpp.
+  namespace XCoreISD {
+    enum NodeType {
+      // Start the numbering where the builtin ops and target ops leave off.
+      FIRST_NUMBER = ISD::BUILTIN_OP_END+XCore::INSTRUCTION_LIST_END,
+
+      // Branch and link (call)
+      BL,
+
+      // pc relative address
+      PCRelativeWrapper,
+
+      // dp relative address
+      DPRelativeWrapper,
+      
+      // cp relative address
+      CPRelativeWrapper,
+      
+      // Store word to stack
+      STWSP,
+
+      // Corresponds to retsp instruction
+      RETSP,
+      
+      // Corresponds to LADD instruction
+      LADD,
+
+      // Corresponds to LSUB instruction
+      LSUB
+    };
+  }
+
+  //===--------------------------------------------------------------------===//
+  // TargetLowering Implementation
+  //===--------------------------------------------------------------------===//
+  /// XCoreTargetLowering - XCore implementation of the TargetLowering
+  /// interface: custom SelectionDAG lowering, calling-convention handling
+  /// and addressing-mode queries for the XCore target.
+  class XCoreTargetLowering : public TargetLowering 
+  {
+  public:
+
+    explicit XCoreTargetLowering(XCoreTargetMachine &TM);
+
+    /// LowerOperation - Provide custom lowering hooks for some operations.
+    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG);
+
+    /// ReplaceNodeResults - Replace the results of node with an illegal result
+    /// type with new values built out of custom code.
+    ///
+    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                                    SelectionDAG &DAG);
+
+    /// getTargetNodeName - This method returns the name of a target specific 
+    ///  DAG node.
+    virtual const char *getTargetNodeName(unsigned Opcode) const;
+  
+    /// EmitInstrWithCustomInserter - Expand pseudo instructions that need
+    /// custom control flow (e.g. select) after instruction selection.  EM
+    /// maps blocks that get split so the selector can update its records.
+    virtual MachineBasicBlock *EmitInstrWithCustomInserter(MachineInstr *MI,
+                                                         MachineBasicBlock *MBB,
+                    DenseMap<MachineBasicBlock*, MachineBasicBlock*> *EM) const;
+
+    /// isLegalAddressingMode - Return true if AM is a legal addressing mode
+    /// for a load/store of the given type on XCore.
+    virtual bool isLegalAddressingMode(const AddrMode &AM,
+                                       const Type *Ty) const;
+
+    /// getFunctionAlignment - Return the Log2 alignment of this function.
+    virtual unsigned getFunctionAlignment(const Function *F) const;
+
+  private:
+    // Target machine and subtarget, cached for use by the helpers below.
+    const XCoreTargetMachine &TM;
+    const XCoreSubtarget &Subtarget;
+  
+    // Lower Operand helpers
+    SDValue LowerCCCArguments(SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              DebugLoc dl, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals);
+    SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee,
+                           CallingConv::ID CallConv, bool isVarArg,
+                           bool isTailCall,
+                           const SmallVectorImpl<ISD::OutputArg> &Outs,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+    SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                            CallingConv::ID CallConv, bool isVarArg,
+                            const SmallVectorImpl<ISD::InputArg> &Ins,
+                            DebugLoc dl, SelectionDAG &DAG,
+                            SmallVectorImpl<SDValue> &InVals);
+    SDValue getReturnAddressFrameIndex(SelectionDAG &DAG);
+    SDValue getGlobalAddressWrapper(SDValue GA, GlobalValue *GV,
+                                    SelectionDAG &DAG);
+
+    // Lower Operand specifics
+    SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG);
+    SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG);
+  
+    // Inline asm support
+    std::vector<unsigned>
+    getRegClassForInlineAsmConstraint(const std::string &Constraint,
+              EVT VT) const;
+  
+    // Expand specifics
+    SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG);
+
+    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+    virtual SDValue
+      LowerFormalArguments(SDValue Chain,
+                           CallingConv::ID CallConv,
+                           bool isVarArg,
+                           const SmallVectorImpl<ISD::InputArg> &Ins,
+                           DebugLoc dl, SelectionDAG &DAG,
+                           SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerCall(SDValue Chain, SDValue Callee,
+                CallingConv::ID CallConv, bool isVarArg,
+                bool &isTailCall,
+                const SmallVectorImpl<ISD::OutputArg> &Outs,
+                const SmallVectorImpl<ISD::InputArg> &Ins,
+                DebugLoc dl, SelectionDAG &DAG,
+                SmallVectorImpl<SDValue> &InVals);
+
+    virtual SDValue
+      LowerReturn(SDValue Chain,
+                  CallingConv::ID CallConv, bool isVarArg,
+                  const SmallVectorImpl<ISD::OutputArg> &Outs,
+                  DebugLoc dl, SelectionDAG &DAG);
+
+    virtual bool
+      CanLowerReturn(CallingConv::ID CallConv, bool isVarArg,
+                     const SmallVectorImpl<EVT> &OutTys,
+                     const SmallVectorImpl<ISD::ArgFlagsTy> &ArgsFlags,
+                     SelectionDAG &DAG);
+  };
+}
+
+#endif // XCOREISELLOWERING_H
diff --git a/lib/Target/XCore/XCoreInstrFormats.td b/lib/Target/XCore/XCoreInstrFormats.td
new file mode 100644
index 0000000..8002c99
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrFormats.td
@@ -0,0 +1,120 @@
+//===- XCoreInstrFormats.td - XCore Instruction Formats ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass
+//===----------------------------------------------------------------------===//
+// InstXCore - Common base for all XCore machine instructions.  Carries the
+// operand lists, assembly string and selection patterns.  Inst is a 32-bit
+// encoding placeholder; it is left unused while the backend emits assembly
+// only.
+class InstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : Instruction {
+  field bits<32> Inst;
+
+  let Namespace = "XCore";
+  dag OutOperandList = outs;
+  dag InOperandList = ins;
+  let AsmString   = asmstr;
+  let Pattern = pattern;
+}
+
+// PseudoInstXCore - XCore pseudo instructions format: no real machine
+// encoding; these are expanded or eliminated before emission.
+class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern>
+   : InstXCore<outs, ins, asmstr, pattern>;
+
+//===----------------------------------------------------------------------===//
+// Instruction formats
+//===----------------------------------------------------------------------===//
+
+// The format classes below follow the XS1 instruction-format naming scheme
+// (3R = three register operands, 2RUS = two registers + short unsigned
+// immediate, RU6 = register + 6-bit immediate, U6/U10 = immediate only,
+// 0R..6R = register-operand counts; a leading L denotes the long 32-bit
+// encoding).  All encodings are currently zero placeholders because the
+// backend only emits assembly — TODO fill in real encodings if a binary
+// emitter is added.
+class _F3R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FL3R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+// Two registers plus a short unsigned immediate (short/long forms).
+class _F2RUS<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FL2RUS<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+// Register plus 6-bit immediate (short/long forms).
+class _FRU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FLRU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+// Immediate-only formats: 6-bit and 10-bit (short/long forms).
+class _FU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FLU6<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FU10<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FLU10<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+// Two-, one- and zero-register formats.
+class _F2R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FRUS<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _FL2R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _F1R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _F0R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+// Long formats with four, five and six register operands.
+class _L4R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _L5R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
+
+class _L6R<dag outs, dag ins, string asmstr, list<dag> pattern>
+    : InstXCore<outs, ins, asmstr, pattern> {
+  let Inst{31-0} = 0;
+}
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
new file mode 100644
index 0000000..5a54844
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -0,0 +1,465 @@
+//===- XCoreInstrInfo.cpp - XCore Instruction Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMachineFunctionInfo.h"
+#include "XCoreInstrInfo.h"
+#include "XCore.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "XCoreGenInstrInfo.inc"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace llvm {
+namespace XCore {
+
+  // XCore Condition Codes, used to communicate branch conditions between
+  // AnalyzeBranch / InsertBranch / ReverseBranchCondition.
+  enum CondCode {
+    COND_TRUE,     // branch taken when the predicate register is true (brt)
+    COND_FALSE,    // branch taken when the predicate register is false (brf)
+    COND_INVALID   // not a conditional branch
+  };
+}
+}
+
+using namespace llvm;
+
+// XCoreInsts is the TableGen-generated instruction descriptor table pulled
+// in from XCoreGenInstrInfo.inc above.
+XCoreInstrInfo::XCoreInstrInfo()
+  : TargetInstrInfoImpl(XCoreInsts, array_lengthof(XCoreInsts)),
+    RI(*this) {
+}
+
+/// isZeroImm - Return true if op is an immediate operand whose value is 0.
+static bool isZeroImm(const MachineOperand &op) {
+  if (!op.isImm())
+    return false;
+  return op.getImm() == 0;
+}
+
+/// Return true if the instruction is a register to register move and
+/// leave the source and dest operands in the passed parameters.
+/// SrcSR/DstSR are always set to 0 since XCore defines no sub-registers.
+///
+bool XCoreInstrInfo::isMoveInstr(const MachineInstr &MI,
+                                 unsigned &SrcReg, unsigned &DstReg,
+                                 unsigned &SrcSR, unsigned &DstSR) const {
+  SrcSR = DstSR = 0; // No sub-registers.
+
+  // We look for 4 kinds of patterns here:
+  // add dst, src, 0
+  // sub dst, src, 0
+  // or dst, src, src
+  // and dst, src, src
+  if ((MI.getOpcode() == XCore::ADD_2rus || MI.getOpcode() == XCore::SUB_2rus)
+      && isZeroImm(MI.getOperand(2))) {
+    // add/sub with a zero immediate: pure copy.
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  } else if ((MI.getOpcode() == XCore::OR_3r || MI.getOpcode() == XCore::AND_3r)
+      && MI.getOperand(1).getReg() == MI.getOperand(2).getReg()) {
+    // or/and of a register with itself: pure copy.
+    DstReg = MI.getOperand(0).getReg();
+    SrcReg = MI.getOperand(1).getReg();
+    return true;
+  }
+  return false;
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned
+XCoreInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const{
+  // LDWFI is the frame-index load pseudo.  Match only the simple form
+  // "ldw reg, fi, 0".  isZeroImm already verifies the operand is an
+  // immediate, so no separate isImm() check is needed.
+  if (MI->getOpcode() == XCore::LDWFI &&
+      MI->getOperand(1).isFI() &&     // address is a stack slot
+      isZeroImm(MI->getOperand(2))) { // offset is zero
+    FrameIndex = MI->getOperand(1).getIndex();
+    return MI->getOperand(0).getReg();
+  }
+  return 0;
+}
+  
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stack slot stored to.  If
+/// not, return 0.  This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned
+XCoreInstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                   int &FrameIndex) const {
+  // STWFI is the frame-index store pseudo.  Match only the simple form
+  // "stw reg, fi, 0".  isZeroImm already verifies the operand is an
+  // immediate, so no separate isImm() check is needed.
+  if (MI->getOpcode() == XCore::STWFI &&
+      MI->getOperand(1).isFI() &&     // address is a stack slot
+      isZeroImm(MI->getOperand(2))) { // offset is zero
+    FrameIndex = MI->getOperand(1).getIndex();
+    return MI->getOperand(0).getReg();
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Analysis
+//===----------------------------------------------------------------------===//
+
+/// IsBRU - Return true if the opcode is an unconditional branch
+/// (forward/backward, short/long encodings).
+static inline bool IsBRU(unsigned BrOpc) {
+  switch (BrOpc) {
+  case XCore::BRFU_u6:
+  case XCore::BRFU_lu6:
+  case XCore::BRBU_u6:
+  case XCore::BRBU_lu6:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// IsBRT - Return true if the opcode is a branch-if-true
+/// (forward/backward, short/long encodings).
+static inline bool IsBRT(unsigned BrOpc) {
+  switch (BrOpc) {
+  case XCore::BRFT_ru6:
+  case XCore::BRFT_lru6:
+  case XCore::BRBT_ru6:
+  case XCore::BRBT_lru6:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// IsBRF - Return true if the opcode is a branch-if-false
+/// (forward/backward, short/long encodings).
+static inline bool IsBRF(unsigned BrOpc) {
+  switch (BrOpc) {
+  case XCore::BRFF_ru6:
+  case XCore::BRFF_lru6:
+  case XCore::BRBF_ru6:
+  case XCore::BRBF_lru6:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// IsCondBranch - Return true if the opcode is any conditional branch
+/// (branch-if-true or branch-if-false).
+static inline bool IsCondBranch(unsigned BrOpc) {
+  return IsBRT(BrOpc) || IsBRF(BrOpc);
+}
+
+/// GetCondFromBranchOpc - Return the XCore CC that matches the
+/// corresponding branch instruction opcode, or COND_INVALID if the opcode
+/// is not a conditional branch.
+static XCore::CondCode GetCondFromBranchOpc(unsigned BrOpc)
+{
+  if (IsBRT(BrOpc))
+    return XCore::COND_TRUE;
+  if (IsBRF(BrOpc))
+    return XCore::COND_FALSE;
+  return XCore::COND_INVALID;
+}
+
+/// GetCondBranchFromCond - Return the (long-form, forward) conditional
+/// branch opcode that implements the given condition code.
+static inline unsigned GetCondBranchFromCond(XCore::CondCode CC) 
+{
+  if (CC == XCore::COND_TRUE)
+    return XCore::BRFT_lru6;
+  if (CC == XCore::COND_FALSE)
+    return XCore::BRFF_lru6;
+  llvm_unreachable("Illegal condition code!");
+}
+
+/// GetOppositeBranchCondition - Return the inverse of the specified
+/// condition, e.g. turning COND_TRUE into COND_FALSE.
+static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
+{
+  if (CC == XCore::COND_TRUE)
+    return XCore::COND_FALSE;
+  if (CC == XCore::COND_FALSE)
+    return XCore::COND_TRUE;
+  llvm_unreachable("Illegal condition code!");
+}
+
+/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
+/// implemented for a target).  Upon success, this returns false and returns
+/// with the following information in various cases:
+///
+/// 1. If this block ends with no branches (it just falls through to its succ)
+///    just return false, leaving TBB/FBB null.
+/// 2. If this block ends with only an unconditional branch, it sets TBB to be
+///    the destination block.
+/// 3. If this block ends with a conditional branch and it falls through to
+///    a successor block, it sets TBB to be the branch destination block and a
+///    list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+/// 4. If this block ends with a conditional branch followed by an
+///    unconditional branch, it returns the 'true' destination in TBB, the 'false' destination
+///    in FBB, and a list of operands that evaluate the condition. These
+///    operands can be passed to other TargetInstrInfo methods to create new
+///    branches.
+///
+/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// cases where this method returns success.
+///
+bool
+XCoreInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                              MachineBasicBlock *&FBB,
+                              SmallVectorImpl<MachineOperand> &Cond,
+                              bool AllowModify) const {
+  // NOTE: the '--I' inside the conditions below is deliberate: the test
+  // both guards against walking past the start of the block and, via its
+  // side effect, steps the iterator back to the instruction just examined.
+  // If the block has no terminators, it just falls into the block after it.
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I))
+    return false;
+
+  // Get the last instruction in the block.
+  MachineInstr *LastInst = I;
+  
+  // If there is only one terminator instruction, process it.
+  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
+    if (IsBRU(LastInst->getOpcode())) {
+      // Lone unconditional branch: destination is operand 0.
+      TBB = LastInst->getOperand(0).getMBB();
+      return false;
+    }
+    
+    XCore::CondCode BranchCode = GetCondFromBranchOpc(LastInst->getOpcode());
+    if (BranchCode == XCore::COND_INVALID)
+      return true;  // Can't handle indirect branch.
+    
+    // Conditional branch
+    // Block ends with fall-through condbranch.
+
+    // Cond encoding: [immediate condition code, predicate register].
+    TBB = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(LastInst->getOperand(0));
+    return false;
+  }
+  
+  // Get the instruction before it if it's a terminator.
+  MachineInstr *SecondLastInst = I;
+
+  // If there are three terminators, we don't know what sort of block this is.
+  if (SecondLastInst && I != MBB.begin() &&
+      isUnpredicatedTerminator(--I))
+    return true;
+  
+  unsigned SecondLastOpc    = SecondLastInst->getOpcode();
+  XCore::CondCode BranchCode = GetCondFromBranchOpc(SecondLastOpc);
+  
+  // If the block ends with conditional branch followed by unconditional,
+  // handle it.
+  if (BranchCode != XCore::COND_INVALID
+    && IsBRU(LastInst->getOpcode())) {
+
+    TBB = SecondLastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(BranchCode));
+    Cond.push_back(SecondLastInst->getOperand(0));
+
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
+  }
+  
+  // If the block ends with two unconditional branches, handle it.  The second
+  // one is not executed, so remove it.
+  if (IsBRU(SecondLastInst->getOpcode()) && 
+      IsBRU(LastInst->getOpcode())) {
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+/// InsertBranch - Insert branch code at the end of MBB and return the
+/// number of instructions inserted (1 or 2).  Cond is the
+/// [condition-code imm, predicate register] pair produced by AnalyzeBranch
+/// (empty for an unconditional branch).
+unsigned
+XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond)const{
+  // FIXME there should probably be a DebugLoc argument here
+  DebugLoc dl = DebugLoc::getUnknownLoc();
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+  assert((Cond.size() == 2 || Cond.size() == 0) &&
+         "Unexpected number of components!");
+  
+  if (FBB == 0) { // One way branch.
+    if (Cond.empty()) {
+      // Unconditional branch
+      BuildMI(&MBB, dl, get(XCore::BRFU_lu6)).addMBB(TBB);
+    } else {
+      // Conditional branch.
+      unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+      BuildMI(&MBB, dl, get(Opc)).addReg(Cond[1].getReg())
+                             .addMBB(TBB);
+    }
+    return 1;
+  }
+  
+  // Two-way Conditional branch: conditional branch to TBB followed by an
+  // unconditional branch to FBB.
+  assert(Cond.size() == 2 && "Unexpected number of components!");
+  unsigned Opc = GetCondBranchFromCond((XCore::CondCode)Cond[0].getImm());
+  BuildMI(&MBB, dl, get(Opc)).addReg(Cond[1].getReg())
+                         .addMBB(TBB);
+  BuildMI(&MBB, dl, get(XCore::BRFU_lu6)).addMBB(FBB);
+  return 2;
+}
+
+/// RemoveBranch - Delete the branch instructions at the end of MBB (at most
+/// one unconditional and one preceding conditional branch), returning the
+/// number of instructions removed.
+unsigned
+XCoreInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.end();
+  if (I == MBB.begin()) return 0;
+  --I;
+  if (!IsBRU(I->getOpcode()) && !IsCondBranch(I->getOpcode()))
+    return 0;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  
+  I = MBB.end();
+
+  // Check for a conditional branch that preceded the removed one.
+  if (I == MBB.begin()) return 1;
+  --I;
+  if (!IsCondBranch(I->getOpcode()))
+    return 1;
+  
+  // Remove the branch.
+  I->eraseFromParent();
+  return 2;
+}
+
+/// copyRegToReg - Emit a register-to-register copy before I, returning
+/// false if the copy is impossible for the given register classes.
+/// GR-to-GR copies are emitted as "add dst, src, 0"; the stack pointer is
+/// read via LDAWSP and written via SETSP.
+bool XCoreInstrInfo::copyRegToReg(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I,
+                                  unsigned DestReg, unsigned SrcReg,
+                                  const TargetRegisterClass *DestRC,
+                                  const TargetRegisterClass *SrcRC) const {
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+
+  if (DestRC == SrcRC) {
+    if (DestRC == XCore::GRRegsRegisterClass) {
+      // Copy between general registers: add dst, src, 0.
+      BuildMI(MBB, I, DL, get(XCore::ADD_2rus), DestReg)
+        .addReg(SrcReg)
+        .addImm(0);
+      return true;
+    } else {
+      return false;
+    }
+  }
+  
+  // Read the stack pointer: ldaw dst, sp[0].
+  if (SrcRC == XCore::RRegsRegisterClass && SrcReg == XCore::SP &&
+    DestRC == XCore::GRRegsRegisterClass) {
+    BuildMI(MBB, I, DL, get(XCore::LDAWSP_ru6), DestReg)
+      .addImm(0);
+    return true;
+  }
+  // Write the stack pointer: set sp, src.
+  if (DestRC == XCore::RRegsRegisterClass && DestReg == XCore::SP &&
+    SrcRC == XCore::GRRegsRegisterClass) {
+    BuildMI(MBB, I, DL, get(XCore::SETSP_1r))
+      .addReg(SrcReg);
+    return true;
+  }
+  return false;
+}
+
+/// storeRegToStackSlot - Emit an STWFI (store word to frame index) pseudo
+/// before I, spilling SrcReg to FrameIndex at offset zero.
+void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned SrcReg, bool isKill,
+                                         int FrameIndex,
+                                         const TargetRegisterClass *RC) const
+{
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  BuildMI(MBB, I, DL, get(XCore::STWFI))
+    .addReg(SrcReg, getKillRegState(isKill))
+    .addFrameIndex(FrameIndex)
+    .addImm(0);
+}
+
+/// loadRegFromStackSlot - Emit an LDWFI (load word from frame index) pseudo
+/// before I, reloading DestReg from FrameIndex at offset zero.
+void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                          MachineBasicBlock::iterator I,
+                                          unsigned DestReg, int FrameIndex,
+                                          const TargetRegisterClass *RC) const
+{
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (I != MBB.end()) DL = I->getDebugLoc();
+  BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
+    .addFrameIndex(FrameIndex)
+    .addImm(0);
+}
+
+/// spillCalleeSavedRegisters - Spill each callee-saved register to its
+/// assigned stack slot before MI.  When frame moves are required (debug
+/// info / EH), also emit a DBG_LABEL after each spill and record the label
+/// so frame-move information can be emitted later.  Returns true to
+/// indicate the spills were handled here (no default spilling needed).
+bool XCoreInstrInfo::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                               MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI) const
+{
+  if (CSI.empty()) {
+    return true;
+  }
+  MachineFunction *MF = MBB.getParent();
+  const MachineFrameInfo *MFI = MF->getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  XCoreFunctionInfo *XFI = MF->getInfo<XCoreFunctionInfo>();
+  
+  bool emitFrameMoves = XCoreRegisterInfo::needsFrameMoves(*MF);
+
+  DebugLoc DL = DebugLoc::getUnknownLoc();
+  if (MI != MBB.end()) DL = MI->getDebugLoc();
+  
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    // Add the callee-saved register as live-in. It's killed at the spill.
+    MBB.addLiveIn(it->getReg());
+
+    storeRegToStackSlot(MBB, MI, it->getReg(), true,
+                        it->getFrameIdx(), it->getRegClass());
+    if (emitFrameMoves) {
+      // Label the spill point so the frame-move emitter can reference it.
+      unsigned SaveLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MI, DL, get(XCore::DBG_LABEL)).addImm(SaveLabelId);
+      XFI->getSpillLabels().push_back(
+          std::pair<unsigned, CalleeSavedInfo>(SaveLabelId, *it));
+    }
+  }
+  return true;
+}
+
+/// restoreCalleeSavedRegisters - Reload each callee-saved register from its
+/// stack slot before MI.  After each insertion MI is reset to just after
+/// BeforeI (or to the block start), so successive reloads are inserted in
+/// reverse order ahead of the previous ones.  Returns true to indicate the
+/// restores were handled here.
+bool XCoreInstrInfo::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                               const std::vector<CalleeSavedInfo> &CSI) const
+{
+  bool AtStart = MI == MBB.begin();
+  MachineBasicBlock::iterator BeforeI = MI;
+  if (!AtStart)
+    --BeforeI;
+  for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
+                                                    it != CSI.end(); ++it) {
+    
+    loadRegFromStackSlot(MBB, MI, it->getReg(),
+                                  it->getFrameIdx(),
+                                  it->getRegClass());
+    assert(MI != MBB.begin() &&
+           "loadRegFromStackSlot didn't insert any code!");
+    // Insert in reverse order.  loadRegFromStackSlot can insert multiple
+    // instructions.
+    if (AtStart)
+      MI = MBB.begin();
+    else {
+      MI = BeforeI;
+      ++MI;
+    }
+  }
+  return true;
+}
+
+/// ReverseBranchCondition - Invert the branch condition produced by
+/// AnalyzeBranch in place (COND_TRUE <-> COND_FALSE).  Returns false to
+/// indicate the condition was successfully reversed.
+bool XCoreInstrInfo::
+ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const 
+{
+  assert((Cond.size() == 2) && 
+          "Invalid XCore branch condition!");
+  Cond[0].setImm(GetOppositeBranchCondition((XCore::CondCode)Cond[0].getImm()));
+  return false;
+}
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
new file mode 100644
index 0000000..3e0a765
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -0,0 +1,96 @@
+//===- XCoreInstrInfo.h - XCore Instruction Information ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREINSTRUCTIONINFO_H
+#define XCOREINSTRUCTIONINFO_H
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "XCoreRegisterInfo.h"
+
+namespace llvm {
+
+/// XCoreInstrInfo - XCore implementation of the TargetInstrInfo interface.
+/// Owns the target's register-info object, exposed via getRegisterInfo().
+class XCoreInstrInfo : public TargetInstrInfoImpl {
+  const XCoreRegisterInfo RI;
+public:
+  XCoreInstrInfo();
+
+  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
+  /// such, whenever a client has an instance of instruction info, it should
+  /// always be able to get register info as well (through this method).
+  ///
+  virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+
+  /// Return true if the instruction is a register to register move and return
+  /// the source and dest operands and their sub-register indices by reference.
+  virtual bool isMoveInstr(const MachineInstr &MI,
+                           unsigned &SrcReg, unsigned &DstReg,
+                           unsigned &SrcSubIdx, unsigned &DstSubIdx) const;
+  
+  /// isLoadFromStackSlot - If the specified machine instruction is a direct
+  /// load from a stack slot, return the virtual or physical register number of
+  /// the destination along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than loading from the stack slot.
+  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                                       int &FrameIndex) const;
+  
+  /// isStoreToStackSlot - If the specified machine instruction is a direct
+  /// store to a stack slot, return the virtual or physical register number of
+  /// the source reg along with the FrameIndex of the loaded stack slot.  If
+  /// not, return 0.  This predicate must return 0 if the instruction has
+  /// any side effects other than storing to the stack slot.
+  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
+                                      int &FrameIndex) const;
+  
+  /// AnalyzeBranch - Decompose MBB's terminators into a true/false target
+  /// pair plus condition operands (standard TargetInstrInfo contract:
+  /// returns false on success).
+  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                             MachineBasicBlock *&FBB,
+                             SmallVectorImpl<MachineOperand> &Cond,
+                             bool AllowModify) const;
+  
+  /// InsertBranch - Append branch code for the given targets/condition to
+  /// MBB; returns the number of instructions inserted.
+  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                             MachineBasicBlock *FBB,
+                             const SmallVectorImpl<MachineOperand> &Cond) const;
+  
+  /// RemoveBranch - Strip the branching code off the end of MBB; returns
+  /// the number of instructions removed.
+  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  /// copyRegToReg - Emit a register-to-register copy before I; returns false
+  /// if no copy is possible for the given register classes.
+  virtual bool copyRegToReg(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator I,
+                            unsigned DestReg, unsigned SrcReg,
+                            const TargetRegisterClass *DestRC,
+                            const TargetRegisterClass *SrcRC) const;
+
+  /// storeRegToStackSlot - Spill SrcReg to stack slot FrameIndex before MI.
+  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
+                                   unsigned SrcReg, bool isKill, int FrameIndex,
+                                   const TargetRegisterClass *RC) const;
+
+  /// loadRegFromStackSlot - Reload DestReg from stack slot FrameIndex
+  /// before MI.
+  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator MI,
+                                    unsigned DestReg, int FrameIndex,
+                                    const TargetRegisterClass *RC) const;
+
+  /// spillCalleeSavedRegisters - Save every register in CSI to its assigned
+  /// slot before MI; returns true when the target emits the spills itself.
+  virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                        MachineBasicBlock::iterator MI,
+                                const std::vector<CalleeSavedInfo> &CSI) const;
+  
+  /// restoreCalleeSavedRegisters - Reload every register in CSI before MI;
+  /// returns true when the target emits the reloads itself.
+  virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator MI,
+                               const std::vector<CalleeSavedInfo> &CSI) const;
+
+  /// ReverseBranchCondition - Invert the condition operands produced by
+  /// AnalyzeBranch in place; returns false on success.
+  virtual bool ReverseBranchCondition(
+                            SmallVectorImpl<MachineOperand> &Cond) const;
+};
+
+}
+
+#endif
diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td
new file mode 100644
index 0000000..10dc18c
--- /dev/null
+++ b/lib/Target/XCore/XCoreInstrInfo.td
@@ -0,0 +1,1001 @@
+//===- XCoreInstrInfo.td - Target Description for XCore ----*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the XCore instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+
+// Uses of CP, DP are not currently reflected in the patterns, since
+// having a physical register as an operand prevents loop hoisting and
+// since the value of these registers never changes during the life of the
+// function.
+
+//===----------------------------------------------------------------------===//
+// Instruction format superclass.
+//===----------------------------------------------------------------------===//
+
+include "XCoreInstrFormats.td"
+
+//===----------------------------------------------------------------------===//
+// XCore specific DAG Nodes.
+//
+
+// Call
+def SDT_XCoreBranchLink : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
+def XCoreBranchLink     : SDNode<"XCoreISD::BL",SDT_XCoreBranchLink,
+                            [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+// Return pseudo node; matched by the RETSP instructions defined below.
+def XCoreRetsp       : SDNode<"XCoreISD::RETSP", SDTNone,
+                         [SDNPHasChain, SDNPOptInFlag]>;
+
+def SDT_XCoreAddress    : SDTypeProfile<1, 1,
+                            [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
+
+// Wrapper nodes marking pc-, dp- and cp-relative address computations.
+def pcrelwrapper : SDNode<"XCoreISD::PCRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def dprelwrapper : SDNode<"XCoreISD::DPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+def cprelwrapper : SDNode<"XCoreISD::CPRelativeWrapper", SDT_XCoreAddress,
+                           []>;
+
+// Store to an sp-relative slot; matched by STWSP_ru6/STWSP_lru6 below.
+def SDT_XCoreStwsp    : SDTypeProfile<0, 2, [SDTCisInt<1>]>;
+def XCoreStwsp        : SDNode<"XCoreISD::STWSP", SDT_XCoreStwsp,
+                               [SDNPHasChain]>;
+
+// These are target-independent nodes, but have target-specific formats.
+def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>;
+def SDT_XCoreCallSeqEnd   : SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                        SDTCisVT<1, i32> ]>;
+
+def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart,
+                           [SDNPHasChain, SDNPOutFlag]>;
+def callseq_end   : SDNode<"ISD::CALLSEQ_END",   SDT_XCoreCallSeqEnd,
+                           [SDNPHasChain, SDNPOptInFlag, SDNPOutFlag]>;
+
+//===----------------------------------------------------------------------===//
+// Instruction Pattern Stuff
+//===----------------------------------------------------------------------===//
+
+def div4_xform : SDNodeXForm<imm, [{
+  // Transformation function: imm/4
+  assert(N->getZExtValue() % 4 == 0);
+  return getI32Imm(N->getZExtValue()/4);
+}]>;
+
+def msksize_xform : SDNodeXForm<imm, [{
+  // Transformation function: get the size of a mask
+  assert(isMask_32(N->getZExtValue()));
+  // look for the first non-zero bit
+  return getI32Imm(32 - CountLeadingZeros_32(N->getZExtValue()));
+}]>;
+
+def neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(-value);
+}]>;
+
+def bpwsub_xform : SDNodeXForm<imm, [{
+  // Transformation function: 32-imm
+  uint32_t value = N->getZExtValue();
+  return getI32Imm(32-value);
+}]>;
+
+def div4neg_xform : SDNodeXForm<imm, [{
+  // Transformation function: -imm/4
+  uint32_t value = N->getZExtValue();
+  assert(-value % 4 == 0);
+  return getI32Imm(-value/4);
+}]>;
+
+// Immediate predicates.  "Us" immediates accept 0..11 (optionally negated
+// and/or scaled by 4); "U<n>" immediates accept any n-bit unsigned value.
+def immUs4Neg : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (-value)%4 == 0 && (-value)/4 <= 11;
+}]>;
+
+def immUs4 : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return value%4 == 0 && value/4 <= 11;
+}]>;
+
+def immUsNeg : PatLeaf<(imm), [{
+  return -((uint32_t)N->getZExtValue()) <= 11;
+}]>;
+
+def immUs : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() <= 11;
+}]>;
+
+def immU6 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 6);
+}]>;
+
+def immU10 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 10);
+}]>;
+
+def immU16 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 16);
+}]>;
+
+def immU20 : PatLeaf<(imm), [{
+  return (uint32_t)N->getZExtValue() < (1 << 20);
+}]>;
+
+// A mask whose size is a valid bit-position value: 1..8, 16, 24 or 32.
+def immMskBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  if (!isMask_32(value)) {
+    return false;
+  }
+  int msksize = 32 - CountLeadingZeros_32(value);
+  return (msksize >= 1 && msksize <= 8)
+          || msksize == 16
+          || msksize == 24
+          || msksize == 32;
+}]>;
+
+def immBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 1 && value <= 8)
+          || value == 16
+          || value == 24
+          || value == 32;
+}]>;
+
+// 32 minus a valid bit-position value (see bpwsub_xform above).
+def immBpwSubBitp : PatLeaf<(imm), [{
+  uint32_t value = (uint32_t)N->getZExtValue();
+  return (value >= 24 && value <= 31)
+          || value == 16
+          || value == 8
+          || value == 0;
+}]>;
+
+// Address fragments: base plus/minus a half-word (<<1) or word (<<2)
+// scaled offset, as computed by the lda16/ldaw instructions.
+def lda16f : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 1))>;
+def lda16b : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 1))>;
+def ldawf : PatFrag<(ops node:$addr, node:$offset),
+                     (add node:$addr, (shl node:$offset, 2))>;
+def ldawb : PatFrag<(ops node:$addr, node:$offset),
+                     (sub node:$addr, (shl node:$offset, 2))>;
+
+// Instruction operand types
+def calltarget  : Operand<i32>;
+def brtarget : Operand<OtherVT>;
+def pclabel : Operand<i32>;
+
+// Addressing modes: sp-, dp- and cp-relative base+offset forms, matched in
+// C++ by the SelectADDR* hooks named below.
+def ADDRspii : ComplexPattern<i32, 2, "SelectADDRspii", [add, frameindex], []>;
+def ADDRdpii : ComplexPattern<i32, 2, "SelectADDRdpii", [add, dprelwrapper],
+                 []>;
+def ADDRcpii : ComplexPattern<i32, 2, "SelectADDRcpii", [add, cprelwrapper],
+                 []>;
+
+// Address operands
+def MEMii : Operand<i32> {
+  let PrintMethod = "printMemOperand";
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction Class Templates
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+
+// F3R_2RUS - define the 3-register form and the 2-register + short unsigned
+// immediate (immUs) form of an opcode in one shot.  The _np variants define
+// no selection pattern.
+multiclass F3R_2RUS<string OpcStr, SDNode OpNode> {
+  def _3r: _F3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUS<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+multiclass F3R_2RUS_np<string OpcStr> {
+  def _3r: _F3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 []>;
+  def _2rus : _F2RUS<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 []>;
+}
+
+// F3R_2RBITP - as F3R_2RUS, but the immediate form takes a bit-position
+// immediate (immBitp) instead of immUs.
+multiclass F3R_2RBITP<string OpcStr, SDNode OpNode> {
+  def _3r: _F3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _2rus : _F2RUS<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class F3R<string OpcStr, SDNode OpNode> : _F3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+class F3R_np<string OpcStr> : _F3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 []>;
+// Three operand long
+
+/// FL3R_L2RUS multiclass - Define a normal FL3R/FL2RUS pattern in one shot.
+multiclass FL3R_L2RUS<string OpcStr, SDNode OpNode> {
+  def _l3r: _FL3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUS<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, immUs:$c))]>;
+}
+
+/// FL3R_L2RBITP multiclass - Define a normal FL3R/FL2RUS pattern in one shot,
+/// with the immediate form restricted to bit-position (immBitp) values.
+multiclass FL3R_L2RBITP<string OpcStr, SDNode OpNode> {
+  def _l3r: _FL3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+  def _l2rus : _FL2RUS<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, i32imm:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, immBitp:$c))]>;
+}
+
+class FL3R<string OpcStr, SDNode OpNode> : _FL3R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b, GRRegs:$c),
+                 !strconcat(OpcStr, " $dst, $b, $c"),
+                 [(set GRRegs:$dst, (OpNode GRRegs:$b, GRRegs:$c))]>;
+
+// Register - U6
+// Operand register - U6
+multiclass FRU6_LRU6_branch<string OpcStr> {
+  def _ru6: _FRU6<
+                 (outs), (ins GRRegs:$cond, brtarget:$dest),
+                 !strconcat(OpcStr, " $cond, $dest"),
+                 []>;
+  def _lru6: _FLRU6<
+                 (outs), (ins GRRegs:$cond, brtarget:$dest),
+                 !strconcat(OpcStr, " $cond, $dest"),
+                 []>;
+}
+
+multiclass FRU6_LRU6_cp<string OpcStr> {
+  def _ru6: _FRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$a),
+                 !strconcat(OpcStr, " $dst, cp[$a]"),
+                 []>;
+  def _lru6: _FLRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$a),
+                 !strconcat(OpcStr, " $dst, cp[$a]"),
+                 []>;
+}
+
+// U6
+// The long (_lu6) form accepts the wider 16-bit immediate range.
+multiclass FU6_LU6<string OpcStr, SDNode OpNode> {
+  def _u6: _FU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 [(OpNode immU6:$b)]>;
+  def _lu6: _FLU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 [(OpNode immU16:$b)]>;
+}
+
+multiclass FU6_LU6_np<string OpcStr> {
+  def _u6: _FU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 []>;
+  def _lu6: _FLU6<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 []>;
+}
+
+// U10
+multiclass FU10_LU10_np<string OpcStr> {
+  def _u10: _FU10<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 []>;
+  def _lu10: _FLU10<
+                 (outs), (ins i32imm:$b),
+                 !strconcat(OpcStr, " $b"),
+                 []>;
+}
+
+// Two operand short
+
+class F2R_np<string OpcStr> : _F2R<
+                 (outs GRRegs:$dst), (ins GRRegs:$b),
+                 !strconcat(OpcStr, " $dst, $b"),
+                 []>;
+
+// Two operand long
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SP], Uses = [SP] in {
+def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt),
+                               "${:comment} ADJCALLSTACKDOWN $amt",
+                               [(callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            "${:comment} ADJCALLSTACKUP $amt1",
+                            [(callseq_end timm:$amt1, timm:$amt2)]>;
+}
+
+// Frame-index load/store/address pseudos matched via ADDRspii; presumably
+// rewritten to real sp-relative instructions when frame indices are
+// eliminated — confirm against XCoreRegisterInfo.
+def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "${:comment} LDWFI $dst, $addr",
+                             [(set GRRegs:$dst, (load ADDRspii:$addr))]>;
+
+def LDAWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr),
+                             "${:comment} LDAWFI $dst, $addr",
+                             [(set GRRegs:$dst, ADDRspii:$addr)]>;
+
+def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr),
+                            "${:comment} STWFI $src, $addr",
+                            [(store GRRegs:$src, ADDRspii:$addr)]>;
+
+// SELECT_CC_* - Used to implement the SELECT_CC DAG operation.  Expanded after
+// instruction selection into a branch sequence.
+let usesCustomInserter = 1 in {
+  def SELECT_CC : PseudoInstXCore<(outs GRRegs:$dst),
+                              (ins GRRegs:$cond, GRRegs:$T, GRRegs:$F),
+                              "${:comment} SELECT_CC PSEUDO!",
+                              [(set GRRegs:$dst,
+                                 (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+// Three operand short
+defm ADD : F3R_2RUS<"add", add>;
+defm SUB : F3R_2RUS<"sub", sub>;
+let neverHasSideEffects = 1 in {
+defm EQ : F3R_2RUS_np<"eq">;
+def LSS_3r : F3R_np<"lss">;
+def LSU_3r : F3R_np<"lsu">;
+}
+def AND_3r : F3R<"and", and>;
+def OR_3r : F3R<"or", or>;
+
+// Load/store instructions carry no patterns here; they are selected via the
+// Pat<>s in the "Non-Instruction Patterns" section at the end of the file.
+let mayLoad=1 in {
+def LDW_3r : _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+                  "ldw $dst, $addr[$offset]",
+                  []>;
+
+def LDW_2rus : _F2RUS<(outs GRRegs:$dst), (ins GRRegs:$addr, i32imm:$offset),
+                  "ldw $dst, $addr[$offset]",
+                  []>;
+
+def LD16S_3r :  _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+                  "ld16s $dst, $addr[$offset]",
+                  []>;
+
+def LD8U_3r :  _F3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+                  "ld8u $dst, $addr[$offset]",
+                  []>;
+}
+
+let mayStore=1 in {
+def STW_3r : _F3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                  "stw $val, $addr[$offset]",
+                  []>;
+
+def STW_2rus : _F2RUS<(outs), (ins GRRegs:$val, GRRegs:$addr, i32imm:$offset),
+                  "stw $val, $addr[$offset]",
+                  []>;
+}
+
+defm SHL : F3R_2RBITP<"shl", shl>;
+defm SHR : F3R_2RBITP<"shr", srl>;
+// TODO tsetr
+
+// Three operand long
+def LDAWF_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+                  "ldaw $dst, $addr[$offset]",
+                  [(set GRRegs:$dst, (ldawf GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWF_l2rus : _FL2RUS<(outs GRRegs:$dst),
+                    (ins GRRegs:$addr, i32imm:$offset),
+                    "ldaw $dst, $addr[$offset]",
+                    []>;
+
+def LDAWB_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+                  "ldaw $dst, $addr[-$offset]",
+                  [(set GRRegs:$dst, (ldawb GRRegs:$addr, GRRegs:$offset))]>;
+
+let neverHasSideEffects = 1 in
+def LDAWB_l2rus : _FL2RUS<(outs GRRegs:$dst),
+                    (ins GRRegs:$addr, i32imm:$offset),
+                    "ldaw $dst, $addr[-$offset]",
+                    []>;
+
+def LDA16F_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+                  "lda16 $dst, $addr[$offset]",
+                  [(set GRRegs:$dst, (lda16f GRRegs:$addr, GRRegs:$offset))]>;
+
+def LDA16B_l3r : _FL3R<(outs GRRegs:$dst), (ins GRRegs:$addr, GRRegs:$offset),
+                  "lda16 $dst, $addr[-$offset]",
+                  [(set GRRegs:$dst, (lda16b GRRegs:$addr, GRRegs:$offset))]>;
+
+def MUL_l3r : FL3R<"mul", mul>;
+// Instructions which may trap are marked as side effecting.
+let hasSideEffects = 1 in {
+def DIVS_l3r : FL3R<"divs", sdiv>;
+def DIVU_l3r : FL3R<"divu", udiv>;
+def REMS_l3r : FL3R<"rems", srem>;
+def REMU_l3r : FL3R<"remu", urem>;
+}
+def XOR_l3r : FL3R<"xor", xor>;
+defm ASHR : FL3R_L2RBITP<"ashr", sra>;
+// TODO crc32, crc8, inpw, outpw
+let mayStore=1 in {
+def ST16_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                "st16 $val, $addr[$offset]",
+                []>;
+
+def ST8_l3r : _FL3R<(outs), (ins GRRegs:$val, GRRegs:$addr, GRRegs:$offset),
+                "st8 $val, $addr[$offset]",
+                []>;
+}
+
+// Four operand long
+// The accumulator halves are read-modify-write, so tie $src1/$src2 to the
+// two outputs.
+let Constraints = "$src1 = $dst1,$src2 = $dst2" in {
+def MACCU_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
+                      GRRegs:$src4),
+                    "maccu $dst1, $dst2, $src3, $src4",
+                    []>;
+
+def MACCS_l4r : _L4R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
+                      GRRegs:$src4),
+                    "maccs $dst1, $dst2, $src3, $src4",
+                    []>;
+}
+
+// Five operand long
+
+def LADD_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                    "ladd $dst1, $dst2, $src1, $src2, $src3",
+                    []>;
+
+def LSUB_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                    "lsub $dst1, $dst2, $src1, $src2, $src3",
+                    []>;
+
+def LDIV_l5r : _L5R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3),
+                    "ldiv $dst1, $dst2, $src1, $src2, $src3",
+                    []>;
+
+// Six operand long
+
+def LMUL_l6r : _L6R<(outs GRRegs:$dst1, GRRegs:$dst2),
+                    (ins GRRegs:$src1, GRRegs:$src2, GRRegs:$src3,
+                      GRRegs:$src4),
+                    "lmul $dst1, $dst2, $src1, $src2, $src3, $src4",
+                    []>;
+
+// Register - U6
+// The _lru6 variants are the long encodings with a wider immediate range
+// (compare the immU6 vs immU16 patterns on STWSP/LDC below).
+
+//let Uses = [DP] in ...
+let neverHasSideEffects = 1, isReMaterializable = 1 in
+def LDAWDP_ru6: _FRU6<(outs GRRegs:$dst), (ins MEMii:$a),
+                    "ldaw $dst, dp[$a]",
+                    []>;
+
+let isReMaterializable = 1 in                    
+def LDAWDP_lru6: _FLRU6<
+                    (outs GRRegs:$dst), (ins MEMii:$a),
+                    "ldaw $dst, dp[$a]",
+                    [(set GRRegs:$dst, ADDRdpii:$a)]>;
+
+let mayLoad=1 in
+def LDWDP_ru6: _FRU6<(outs GRRegs:$dst), (ins MEMii:$a),
+                    "ldw $dst, dp[$a]",
+                    []>;
+                    
+def LDWDP_lru6: _FLRU6<
+                    (outs GRRegs:$dst), (ins MEMii:$a),
+                    "ldw $dst, dp[$a]",
+                    [(set GRRegs:$dst, (load ADDRdpii:$a))]>;
+
+let mayStore=1 in
+def STWDP_ru6 : _FRU6<(outs), (ins GRRegs:$val, MEMii:$addr),
+                  "stw $val, dp[$addr]",
+                  []>;
+
+def STWDP_lru6 : _FLRU6<(outs), (ins GRRegs:$val, MEMii:$addr),
+                  "stw $val, dp[$addr]",
+                  [(store GRRegs:$val, ADDRdpii:$addr)]>;
+
+//let Uses = [CP] in ..
+let mayLoad = 1, isReMaterializable = 1 in
+defm LDWCP : FRU6_LRU6_cp<"ldw">;
+
+let Uses = [SP] in {
+let mayStore=1 in {
+def STWSP_ru6 : _FRU6<
+                 (outs), (ins GRRegs:$val, i32imm:$index),
+                 "stw $val, sp[$index]",
+                 [(XCoreStwsp GRRegs:$val, immU6:$index)]>;
+
+def STWSP_lru6 : _FLRU6<
+                 (outs), (ins GRRegs:$val, i32imm:$index),
+                 "stw $val, sp[$index]",
+                 [(XCoreStwsp GRRegs:$val, immU16:$index)]>;
+}
+
+let mayLoad=1 in {
+def LDWSP_ru6 : _FRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$b),
+                 "ldw $dst, sp[$b]",
+                 []>;
+
+def LDWSP_lru6 : _FLRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$b),
+                 "ldw $dst, sp[$b]",
+                 []>;
+}
+
+let neverHasSideEffects = 1 in {
+def LDAWSP_ru6 : _FRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$b),
+                 "ldaw $dst, sp[$b]",
+                 []>;
+
+def LDAWSP_lru6 : _FLRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$b),
+                 "ldaw $dst, sp[$b]",
+                 []>;
+
+// Variants writing the full RRegs class rather than GRRegs.
+def LDAWSP_ru6_RRegs : _FRU6<
+                 (outs RRegs:$dst), (ins i32imm:$b),
+                 "ldaw $dst, sp[$b]",
+                 []>;
+
+def LDAWSP_lru6_RRegs : _FLRU6<
+                 (outs RRegs:$dst), (ins i32imm:$b),
+                 "ldaw $dst, sp[$b]",
+                 []>;
+}
+}
+
+let isReMaterializable = 1 in {
+def LDC_ru6 : _FRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$b),
+                 "ldc $dst, $b",
+                 [(set GRRegs:$dst, immU6:$b)]>;
+
+def LDC_lru6 : _FLRU6<
+                 (outs GRRegs:$dst), (ins i32imm:$b),
+                 "ldc $dst, $b",
+                 [(set GRRegs:$dst, immU16:$b)]>;
+}
+
+// Operand register - U6
+// TODO setc
+let isBranch = 1, isTerminator = 1 in {
+// Conditional branches; T/F selects branch-on-true vs branch-on-false.
+// The F/B pairs share a mnemonic (presumably forward vs backward target
+// encodings — confirm against the XCore ISA manual).
+defm BRFT: FRU6_LRU6_branch<"bt">;
+defm BRBT: FRU6_LRU6_branch<"bt">;
+defm BRFF: FRU6_LRU6_branch<"bf">;
+defm BRBF: FRU6_LRU6_branch<"bf">;
+}
+
+// U6
+let Defs = [SP], Uses = [SP] in {
+let neverHasSideEffects = 1 in
+defm EXTSP : FU6_LU6_np<"extsp">;
+let mayStore = 1 in
+defm ENTSP : FU6_LU6_np<"entsp">;
+
+let isReturn = 1, isTerminator = 1, mayLoad = 1, isBarrier = 1 in {
+defm RETSP : FU6_LU6<"retsp", XCoreRetsp>;
+}
+}
+
+// TODO extdp, kentsp, krestsp, blat, setsr
+// clrsr, getsr, kalli
+let isBranch = 1, isTerminator = 1 in {
+def BRBU_u6 : _FU6<
+                 (outs),
+                 (ins brtarget:$target),
+                 "bu $target",
+                 []>;
+
+def BRBU_lu6 : _FLU6<
+                 (outs),
+                 (ins brtarget:$target),
+                 "bu $target",
+                 []>;
+
+def BRFU_u6 : _FU6<
+                 (outs),
+                 (ins brtarget:$target),
+                 "bu $target",
+                 []>;
+
+def BRFU_lu6 : _FLU6<
+                 (outs),
+                 (ins brtarget:$target),
+                 "bu $target",
+                 []>;
+}
+
+//let Uses = [CP] in ...
+// ldaw r11, cp[...] always targets R11, so R11 is a fixed def rather than
+// an operand.
+let Defs = [R11], neverHasSideEffects = 1, isReMaterializable = 1 in
+def LDAWCP_u6: _FRU6<(outs), (ins MEMii:$a),
+                    "ldaw r11, cp[$a]",
+                    []>;
+
+let Defs = [R11], isReMaterializable = 1 in
+def LDAWCP_lu6: _FLRU6<
+                    (outs), (ins MEMii:$a),
+                    "ldaw r11, cp[$a]",
+                    [(set R11, ADDRcpii:$a)]>;
+
+// U10
+// TODO ldwcpl, blacp
+
+let Defs = [R11], isReMaterializable = 1, neverHasSideEffects = 1 in
+def LDAP_u10 : _FU10<
+                  (outs),
+                  (ins i32imm:$addr),
+                  "ldap r11, $addr",
+                  []>;
+
+let Defs = [R11], isReMaterializable = 1 in
+def LDAP_lu10 : _FLU10<
+                  (outs),
+                  (ins i32imm:$addr),
+                  "ldap r11, $addr",
+                  [(set R11, (pcrelwrapper tglobaladdr:$addr))]>;
+
+// Same encoding as LDAP_lu10, but matches block addresses.
+let Defs = [R11], isReMaterializable = 1 in
+def LDAP_lu10_ba : _FLU10<(outs),
+                          (ins i32imm:$addr),
+                          "ldap r11, $addr",
+                          [(set R11, (pcrelwrapper tblockaddress:$addr))]>;
+
+let isCall=1,
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR] in {
+def BL_u10 : _FU10<
+                  (outs),
+                  (ins calltarget:$target, variable_ops),
+                  "bl $target",
+                  [(XCoreBranchLink immU10:$target)]>;
+
+def BL_lu10 : _FLU10<
+                  (outs),
+                  (ins calltarget:$target, variable_ops),
+                  "bl $target",
+                  [(XCoreBranchLink immU20:$target)]>;
+}
+
+// Two operand short
+// TODO getr, getst
+def NOT : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
+                 "not $dst, $b",
+                 [(set GRRegs:$dst, (not GRRegs:$b))]>;
+
+def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b),
+                 "neg $dst, $b",
+                 [(set GRRegs:$dst, (ineg GRRegs:$b))]>;
+
+// TODO setd, eet, eef, getts, setpt, outct, inct, chkct, outt, intt, out,
+// in, outshr, inshr, testct, testwct, tinitpc, tinitdp, tinitsp, tinitcp,
+// tsetmr, sext (reg), zext (reg)
+// sext/zext/andnot operate in place, hence the tied-operand constraint.
+let isTwoAddress = 1 in {
+let neverHasSideEffects = 1 in
+def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                 "sext $dst, $src2",
+                 []>;
+
+let neverHasSideEffects = 1 in
+def ZEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2),
+                 "zext $dst, $src2",
+                 []>;
+
+def ANDNOT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2),
+                 "andnot $dst, $src2",
+                 [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>;
+}
+
+let isReMaterializable = 1, neverHasSideEffects = 1 in
+def MKMSK_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$size),
+                 "mkmsk $dst, $size",
+                 []>;
+
+// Register form: result is (1 << size) - 1, written below as
+// (1 << size) + 0xffffffff.
+def MKMSK_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$size),
+                 "mkmsk $dst, $size",
+                 [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), 0xffffffff))]>;
+
+// Two operand long
+// TODO settw, setclk, setrdy, setpsc, endin, peek,
+// getd, testlcl, tinitlr, getps, setps
+def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+                 "bitrev $dst, $src",
+                 [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>;
+
+def BYTEREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+                 "byterev $dst, $src",
+                 [(set GRRegs:$dst, (bswap GRRegs:$src))]>;
+
+def CLZ_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src),
+                 "clz $dst, $src",
+                 [(set GRRegs:$dst, (ctlz GRRegs:$src))]>;
+
+// One operand short
+// TODO edu, eeu, waitet, waitef, freer, tstart, msync, mjoin, syncr, clrtp
+// bru, setdp, setcp, setv, setev, kcall
+// dgetreg
+let isBranch=1, isIndirectBranch=1, isTerminator=1 in
+def BAU_1r : _F1R<(outs), (ins GRRegs:$addr),
+                 "bau $addr",
+                 [(brind GRRegs:$addr)]>;
+
+let Defs=[SP], neverHasSideEffects=1 in
+def SETSP_1r : _F1R<(outs), (ins GRRegs:$src),
+                 "set sp, $src",
+                 []>;
+
+let isBarrier = 1, hasCtrlDep = 1 in 
+def ECALLT_1r : _F1R<(outs), (ins GRRegs:$src),
+                 "ecallt $src",
+                 []>;
+
+let isBarrier = 1, hasCtrlDep = 1 in 
+def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src),
+                 "ecallf $src",
+                 []>;
+
+let isCall=1, 
+// All calls clobber the link register and the non-callee-saved registers:
+Defs = [R0, R1, R2, R3, R11, LR] in {
+def BLA_1r : _F1R<(outs), (ins GRRegs:$addr, variable_ops),
+                 "bla $addr",
+                 [(XCoreBranchLink GRRegs:$addr)]>;
+}
+
+// Zero operand short
+// TODO waiteu, clre, ssync, freet, ldspc, stspc, ldssr, stssr, ldsed, stsed,
+// stet, geted, getet, getkep, getksp, setkep, getid, kret, dcall, dret,
+// dentsp, drestsp
+
+let Defs = [R11] in
+def GETID_0R : _F0R<(outs), (ins),
+                 "get r11, id",
+                 [(set R11, (int_xcore_getid))]>;
+
+//===----------------------------------------------------------------------===//
+// Non-Instruction Patterns
+//===----------------------------------------------------------------------===//
+
+// Direct calls to global or external symbols use the pc-relative BL form.
+def : Pat<(XCoreBranchLink tglobaladdr:$addr), (BL_lu10 tglobaladdr:$addr)>;
+def : Pat<(XCoreBranchLink texternalsym:$addr), (BL_lu10 texternalsym:$addr)>;
+
+/// sext_inreg
+// In-register sign extension from 1, 8 or 16 bits via sext.
+def : Pat<(sext_inreg GRRegs:$b, i1), (SEXT_rus GRRegs:$b, 1)>;
+def : Pat<(sext_inreg GRRegs:$b, i8), (SEXT_rus GRRegs:$b, 8)>;
+def : Pat<(sext_inreg GRRegs:$b, i16), (SEXT_rus GRRegs:$b, 16)>;
+
+/// loads
+// Reg+reg addresses map onto the 3-register forms; a bare register base
+// reuses the same instruction with a zero offset materialized by LDC.
+def : Pat<(zextloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(zextloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(sextloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(sextloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+// Word loads: scaled reg+reg (ldawf-style address), reg+imm with the
+// byte offset converted to words by div4_xform, or a plain base.
+def : Pat<(load (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (LDW_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(load (add GRRegs:$addr, immUs4:$offset)),
+          (LDW_2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(load GRRegs:$addr), (LDW_2rus GRRegs:$addr, 0)>;
+
+/// anyext
+// Any-extending loads may use either extension: i8 uses the
+// zero-extending load, i16 the sign-extending one.
+def : Pat<(extloadi8 (add GRRegs:$addr, GRRegs:$offset)),
+          (LD8U_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi8 GRRegs:$addr), (LD8U_3r GRRegs:$addr, (LDC_ru6 0))>;
+def : Pat<(extloadi16 (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (LD16S_3r GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(extloadi16 GRRegs:$addr), (LD16S_3r GRRegs:$addr, (LDC_ru6 0))>;
+
+/// stores
+// Mirror of the load patterns above, for truncating and word stores.
+def : Pat<(truncstorei8 GRRegs:$val, (add GRRegs:$addr, GRRegs:$offset)),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei8 GRRegs:$val, GRRegs:$addr),
+          (ST8_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+          
+def : Pat<(truncstorei16 GRRegs:$val, (lda16f GRRegs:$addr, GRRegs:$offset)),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr),
+          (ST16_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
+
+def : Pat<(store GRRegs:$val, (ldawf GRRegs:$addr, GRRegs:$offset)),
+          (STW_3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
+def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, (div4_xform immUs4:$offset))>;
+def : Pat<(store GRRegs:$val, GRRegs:$addr),
+          (STW_2rus GRRegs:$val, GRRegs:$addr, 0)>;
+
+/// cttz
+// No native count-trailing-zeros: bit-reverse, then count leading zeros.
+def : Pat<(cttz GRRegs:$src), (CLZ_l2r (BITREV_l2r GRRegs:$src))>;
+
+/// trap
+// ecallf with a constant-zero operand yields an unconditional trap.
+def : Pat<(trap), (ECALLF_1r (LDC_ru6 0))>;
+
+///
+/// branch patterns
+///
+
+// unconditional branch
+def : Pat<(br bb:$addr), (BRFU_lu6 bb:$addr)>;
+
+// direct match equal/notequal zero brcond
+def : Pat<(brcond (setne GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 GRRegs:$lhs, bb:$dst)>;
+def : Pat<(brcond (seteq GRRegs:$lhs, 0), bb:$dst),
+          (BRFF_lru6 GRRegs:$lhs, bb:$dst)>;
+
+// Non-strict and not-equal conditions are expressed as the inverted
+// comparison plus a branch-on-false, e.g. a <= b  =>  brff (b < a).
+def : Pat<(brcond (setle GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setule GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$rhs, GRRegs:$lhs), bb:$dst)>;
+def : Pat<(brcond (setge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSS_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setuge GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (LSU_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, GRRegs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_3r GRRegs:$lhs, GRRegs:$rhs), bb:$dst)>;
+def : Pat<(brcond (setne GRRegs:$lhs, immUs:$rhs), bb:$dst),
+          (BRFF_lru6 (EQ_2rus GRRegs:$lhs, immUs:$rhs), bb:$dst)>;
+
+// generic brcond pattern
+def : Pat<(brcond GRRegs:$cond, bb:$addr), (BRFT_lru6 GRRegs:$cond, bb:$addr)>;
+
+
+///
+/// Select patterns
+///
+
+// direct match equal/notequal zero select
+def : Pat<(select (setne GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (seteq GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+        (SELECT_CC GRRegs:$lhs, GRRegs:$F, GRRegs:$T)>;
+
+// As with the branches above, inverted conditions compute the opposite
+// comparison and swap SELECT_CC's true/false operands.
+def : Pat<(select (setle GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setule GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$rhs, GRRegs:$lhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSS_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setuge GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (LSU_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, GRRegs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_3r GRRegs:$lhs, GRRegs:$rhs), GRRegs:$F, GRRegs:$T)>;
+def : Pat<(select (setne GRRegs:$lhs, immUs:$rhs), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (EQ_2rus GRRegs:$lhs, immUs:$rhs), GRRegs:$F, GRRegs:$T)>;
+
+///
+/// setcc patterns, only matched when none of the above brcond
+/// patterns match
+///
+
+// setcc 2 register operands
+// lss/lsu produce the strict comparisons directly; non-strict and
+// not-equal results compare the inverted result against 0 with eq.
+def : Pat<(setle GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+def : Pat<(setule GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$rhs, GRRegs:$lhs), 0)>;
+
+def : Pat<(setgt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$rhs, GRRegs:$lhs)>;
+def : Pat<(setugt GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$rhs, GRRegs:$lhs)>;
+
+def : Pat<(setge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSS_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+def : Pat<(setuge GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (LSU_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(setlt GRRegs:$lhs, GRRegs:$rhs),
+          (LSS_3r GRRegs:$lhs, GRRegs:$rhs)>;
+def : Pat<(setult GRRegs:$lhs, GRRegs:$rhs),
+          (LSU_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+def : Pat<(setne GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_2rus (EQ_3r GRRegs:$lhs, GRRegs:$rhs), 0)>;
+
+def : Pat<(seteq GRRegs:$lhs, GRRegs:$rhs),
+          (EQ_3r GRRegs:$lhs, GRRegs:$rhs)>;
+
+// setcc reg/imm operands
+def : Pat<(seteq GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus GRRegs:$lhs, immUs:$rhs)>;
+def : Pat<(setne GRRegs:$lhs, immUs:$rhs),
+          (EQ_2rus (EQ_2rus GRRegs:$lhs, immUs:$rhs), 0)>;
+
+// misc
+// Add/sub of a small word-aligned immediate folds into the forward /
+// backward address-generation instructions (offset encoded in words).
+def : Pat<(add GRRegs:$addr, immUs4:$offset),
+          (LDAWF_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+def : Pat<(sub GRRegs:$addr, immUs4:$offset),
+          (LDAWB_l2rus GRRegs:$addr, (div4_xform immUs4:$offset))>;
+
+// AND with a contiguous low-bit mask is a zero-extension of the
+// corresponding width (msksize_xform converts the mask to a bit count).
+def : Pat<(and GRRegs:$val, immMskBitp:$mask),
+          (ZEXT_rus GRRegs:$val, (msksize_xform immMskBitp:$mask))>;
+
+// (sub X, imm) gets canonicalized to (add X, -imm).  Match this form.
+def : Pat<(add GRRegs:$src1, immUsNeg:$src2),
+          (SUB_2rus GRRegs:$src1, (neg_xform immUsNeg:$src2))>;
+
+def : Pat<(add GRRegs:$src1, immUs4Neg:$src2),
+          (LDAWB_l2rus GRRegs:$src1, (div4neg_xform immUs4Neg:$src2))>;
+
+///
+/// Some peepholes
+///
+
+// The address-generation forms compute base +/- scale*index; with
+// base == index they yield 3*x (lda16f), 5*x (ldawf) and -3*x (ldawb).
+def : Pat<(mul GRRegs:$src, 3),
+          (LDA16F_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, 5),
+          (LDAWF_l3r GRRegs:$src, GRRegs:$src)>;
+
+def : Pat<(mul GRRegs:$src, -3),
+          (LDAWB_l3r GRRegs:$src, GRRegs:$src)>;
+
+// ashr X, 32 is equivalent to ashr X, 31 on the XCore.
+// The following patterns use this to smear the sign bit for sign tests.
+def : Pat<(sra GRRegs:$src, 31),
+          (ASHR_l2rus GRRegs:$src, 32)>;
+
+def : Pat<(brcond (setlt GRRegs:$lhs, 0), bb:$dst),
+          (BRFT_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+// setge X, 0 is canonicalized to setgt X, -1
+def : Pat<(brcond (setgt GRRegs:$lhs, -1), bb:$dst),
+          (BRFF_lru6 (ASHR_l2rus GRRegs:$lhs, 32), bb:$dst)>;
+
+def : Pat<(select (setlt GRRegs:$lhs, 0), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$T, GRRegs:$F)>;
+
+def : Pat<(select (setgt GRRegs:$lhs, -1), GRRegs:$T, GRRegs:$F),
+          (SELECT_CC (ASHR_l2rus GRRegs:$lhs, 32), GRRegs:$F, GRRegs:$T)>;
+
+def : Pat<(setgt GRRegs:$lhs, -1),
+          (EQ_2rus (ASHR_l2rus GRRegs:$lhs, 32), 0)>;
+
+// A shl/sra pair by the same amount is an in-register sign extension;
+// bpwsub_xform converts the shift amount to the extension width.
+def : Pat<(sra (shl GRRegs:$src, immBpwSubBitp:$imm), immBpwSubBitp:$imm),
+          (SEXT_rus GRRegs:$src, (bpwsub_xform immBpwSubBitp:$imm))>;
diff --git a/lib/Target/XCore/XCoreMCAsmInfo.cpp b/lib/Target/XCore/XCoreMCAsmInfo.cpp
new file mode 100644
index 0000000..bf78575
--- /dev/null
+++ b/lib/Target/XCore/XCoreMCAsmInfo.cpp
@@ -0,0 +1,30 @@
+//===-- XCoreMCAsmInfo.cpp - XCore asm properties -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCAsmInfo.h"
+using namespace llvm;
+
+// Configure XCore assembler syntax properties.  The target-triple
+// argument is currently unused.
+XCoreMCAsmInfo::XCoreMCAsmInfo(const Target &T, const StringRef &TT) {
+  SupportsDebugInformation = true;
+  Data16bitsDirective = "\t.short\t";
+  Data32bitsDirective = "\t.long\t";
+  // Null means there is no native 64-bit data directive.
+  Data64bitsDirective = 0;
+  ZeroDirective = "\t.space\t";
+  CommentString = "#";
+    
+  PrivateGlobalPrefix = ".L";
+  AscizDirective = ".asciiz";
+  WeakDefDirective = "\t.weak\t";
+  WeakRefDirective = "\t.weak\t";
+
+  // Debug
+  HasLEB128 = true;
+  AbsoluteDebugSectionOffsets = true;
+}
+
diff --git a/lib/Target/XCore/XCoreMCAsmInfo.h b/lib/Target/XCore/XCoreMCAsmInfo.h
new file mode 100644
index 0000000..01f8e48
--- /dev/null
+++ b/lib/Target/XCore/XCoreMCAsmInfo.h
@@ -0,0 +1,29 @@
+//=====-- XCoreMCAsmInfo.h - XCore asm properties -------------*- C++ -*--====//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the XCoreMCAsmInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETASMINFO_H
+#define XCORETARGETASMINFO_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+  class Target;
+  class StringRef;
+  /// XCoreMCAsmInfo - Assembly dialect properties (data directives,
+  /// comment string, weak directives) for the XCore target.  All
+  /// configuration happens in the constructor; see the .cpp file.
+  class XCoreMCAsmInfo : public MCAsmInfo {
+  public:
+    explicit XCoreMCAsmInfo(const Target &T, const StringRef &TT);
+  };
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/XCoreMachineFunctionInfo.h b/lib/Target/XCore/XCoreMachineFunctionInfo.h
new file mode 100644
index 0000000..124a011
--- /dev/null
+++ b/lib/Target/XCore/XCoreMachineFunctionInfo.h
@@ -0,0 +1,69 @@
+//====- XCoreMachineFunctionInfo.h - XCore machine function info -*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares XCore-specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREMACHINEFUNCTIONINFO_H
+#define XCOREMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include <vector>
+
+namespace llvm {
+
+// Forward declarations
+class Function;
+
+/// XCoreFunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private XCore target-specific information for each
+/// MachineFunction.
+class XCoreFunctionInfo : public MachineFunctionInfo {
+private:
+  bool UsesLR;            // True if LR must be saved/restored in this function.
+  int LRSpillSlot;        // Frame index of the LR spill slot.
+  int FPSpillSlot;        // Frame index of the FP (R10) spill slot.
+  int VarArgsFrameIndex;  // Frame index set via setVarArgsFrameIndex.
+  // (label id, callee-saved info) pairs recorded at spill points, used
+  // later to emit frame-move (debug/unwind) information in the prologue.
+  std::vector<std::pair<unsigned, CalleeSavedInfo> > SpillLabels;
+
+public:
+  XCoreFunctionInfo() :
+    UsesLR(false),
+    LRSpillSlot(0),
+    FPSpillSlot(0),
+    VarArgsFrameIndex(0) {}
+  
+  explicit XCoreFunctionInfo(MachineFunction &MF) :
+    UsesLR(false),
+    LRSpillSlot(0),
+    FPSpillSlot(0),
+    VarArgsFrameIndex(0) {}
+  
+  ~XCoreFunctionInfo() {}
+  
+  void setVarArgsFrameIndex(int off) { VarArgsFrameIndex = off; }
+  int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
+  
+  void setUsesLR(bool val) { UsesLR = val; }
+  bool getUsesLR() const { return UsesLR; }
+  
+  void setLRSpillSlot(int off) { LRSpillSlot = off; }
+  int getLRSpillSlot() const { return LRSpillSlot; }
+  
+  void setFPSpillSlot(int off) { FPSpillSlot = off; }
+  int getFPSpillSlot() const { return FPSpillSlot; }
+  
+  std::vector<std::pair<unsigned, CalleeSavedInfo> >&getSpillLabels() {
+    return SpillLabels;
+  }
+};
+} // End llvm namespace
+
+#endif // XCOREMACHINEFUNCTIONINFO_H
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
new file mode 100644
index 0000000..c7c8c7b
--- /dev/null
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -0,0 +1,618 @@
+//===- XCoreRegisterInfo.cpp - XCore Register Information -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the TargetRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreRegisterInfo.h"
+#include "XCoreMachineFunctionInfo.h"
+#include "XCore.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLocation.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/Target/TargetFrameInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Type.h"
+#include "llvm/Function.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+XCoreRegisterInfo::XCoreRegisterInfo(const TargetInstrInfo &tii)
+  : XCoreGenRegisterInfo(XCore::ADJCALLSTACKDOWN, XCore::ADJCALLSTACKUP),
+    TII(tii) {
+}
+
+// helper functions
+// Immediate-range predicates matching the XCore encodings: "us" is the
+// short unsigned immediate (0..11); u6 and u16 are 6- and 16-bit
+// unsigned immediates.
+static inline bool isImmUs(unsigned val) {
+  return val <= 11;
+}
+
+static inline bool isImmU6(unsigned val) {
+  return val < (1 << 6);
+}
+
+static inline bool isImmU16(unsigned val) {
+  return val < (1 << 16);
+}
+
+// R0-R3 carry incoming arguments.
+static const unsigned XCore_ArgRegs[] = {
+  XCore::R0, XCore::R1, XCore::R2, XCore::R3
+};
+
+// getArgRegs - Return the argument-passing registers (MF is unused).
+const unsigned * XCoreRegisterInfo::getArgRegs(const MachineFunction *MF)
+{
+  return XCore_ArgRegs;
+}
+
+// getNumArgRegs - Number of entries in the argument-register array.
+unsigned XCoreRegisterInfo::getNumArgRegs(const MachineFunction *MF)
+{
+  return array_lengthof(XCore_ArgRegs);
+}
+
+// needsFrameMoves - True when frame-move (debug/unwind) information must
+// be emitted: debug info is present, the function may throw, or unwind
+// tables are mandatory.
+bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF)
+{
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  return (MMI && MMI->hasDebugInfo()) ||
+          !MF.getFunction()->doesNotThrow() ||
+          UnwindTablesMandatory;
+}
+
+// Callee-saved registers, terminated by 0.  The order must match the
+// register-class list in getCalleeSavedRegClasses below (LR is the sole
+// RRegs entry).
+const unsigned* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+                                                                         const {
+  static const unsigned CalleeSavedRegs[] = {
+    XCore::R4, XCore::R5, XCore::R6, XCore::R7,
+    XCore::R8, XCore::R9, XCore::R10, XCore::LR,
+    0
+  };
+  return CalleeSavedRegs;
+}
+
+// Register classes for the callee-saved list above, in the same order.
+const TargetRegisterClass* const*
+XCoreRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
+  static const TargetRegisterClass * const CalleeSavedRegClasses[] = {
+    XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass,
+    XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass,
+    XCore::GRRegsRegisterClass, XCore::GRRegsRegisterClass,
+    XCore::GRRegsRegisterClass, XCore::RRegsRegisterClass,
+    0
+  };
+  return CalleeSavedRegClasses;
+}
+
+// Registers the allocator may never use: the constant pool, data and
+// stack pointers, the link register, and R10 when it holds the FP.
+BitVector XCoreRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
+  BitVector Reserved(getNumRegs());
+  Reserved.set(XCore::CP);
+  Reserved.set(XCore::DP);
+  Reserved.set(XCore::SP);
+  Reserved.set(XCore::LR);
+  if (hasFP(MF)) {
+    Reserved.set(XCore::R10);
+  }
+  return Reserved;
+}
+
+// Scavenging is required whenever a frame pointer is used, since
+// FP-relative offsets may exceed the short-immediate encoding.
+bool
+XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
+  // TODO can we estimate stack size?
+  return hasFP(MF);
+}
+
+// hasFP - A frame pointer is used when FP elimination is disabled or
+// the frame contains variable-sized objects.
+bool XCoreRegisterInfo::hasFP(const MachineFunction &MF) const {
+  return NoFramePointerElim || MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+// This function eliminates ADJCALLSTACKDOWN,
+// ADJCALLSTACKUP pseudo instructions
+void XCoreRegisterInfo::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator I) const {
+  if (!hasReservedCallFrame(MF)) {
+    // Turn the adjcallstackdown instruction into 'extsp <amt>' and the
+    // adjcallstackup instruction into 'ldaw sp, sp[<amt>]'
+    MachineInstr *Old = I;
+    uint64_t Amount = Old->getOperand(0).getImm();
+    if (Amount != 0) {
+      // We need to keep the stack aligned properly.  To do this, we round the
+      // amount of space needed for the outgoing arguments up to the next
+      // alignment boundary.
+      unsigned Align = MF.getTarget().getFrameInfo()->getStackAlignment();
+      Amount = (Amount+Align-1)/Align*Align;
+
+      // extsp / ldaw sp operands are in words, not bytes.
+      assert(Amount%4 == 0);
+      Amount /= 4;
+      
+      bool isU6 = isImmU6(Amount);
+      
+      if (!isU6 && !isImmU16(Amount)) {
+        // FIX could emit multiple instructions in this case.
+#ifndef NDEBUG
+        errs() << "eliminateCallFramePseudoInstr size too big: "
+               << Amount << "\n";
+#endif
+        llvm_unreachable(0);
+      }
+
+      // Pick the short (u6) or long (lu6) encoding of the replacement.
+      MachineInstr *New;
+      if (Old->getOpcode() == XCore::ADJCALLSTACKDOWN) {
+        int Opcode = isU6 ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+        New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode))
+          .addImm(Amount);
+      } else {
+        assert(Old->getOpcode() == XCore::ADJCALLSTACKUP);
+        int Opcode = isU6 ? XCore::LDAWSP_ru6_RRegs : XCore::LDAWSP_lru6_RRegs;
+        New=BuildMI(MF, Old->getDebugLoc(), TII.get(Opcode), XCore::SP)
+          .addImm(Amount);
+      }
+
+      // Replace the pseudo instruction with a new instruction...
+      MBB.insert(I, New);
+    }
+  }
+  
+  MBB.erase(I);
+}
+
+// eliminateFrameIndex - Replace the abstract frame index in a
+// LDWFI / STWFI / LDAWFI pseudo with a concrete FP- or SP-relative
+// instruction, scavenging a scratch register when an FP-relative offset
+// does not fit the short-immediate encoding.
+unsigned
+XCoreRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                       int SPAdj, int *Value,
+                                       RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+  MachineInstr &MI = *II;
+  DebugLoc dl = MI.getDebugLoc();
+  unsigned i = 0;
+
+  // Locate the frame-index operand within the instruction.
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+
+  MachineOperand &FrameOp = MI.getOperand(i);
+  int FrameIndex = FrameOp.getIndex();
+
+  MachineFunction &MF = *MI.getParent()->getParent();
+  int Offset = MF.getFrameInfo()->getObjectOffset(FrameIndex);
+  int StackSize = MF.getFrameInfo()->getStackSize();
+
+  #ifndef NDEBUG
+  DEBUG(errs() << "\nFunction         : " 
+        << MF.getFunction()->getName() << "\n");
+  DEBUG(errs() << "<--------->\n");
+  DEBUG(MI.print(errs()));
+  DEBUG(errs() << "FrameIndex         : " << FrameIndex << "\n");
+  DEBUG(errs() << "FrameOffset        : " << Offset << "\n");
+  DEBUG(errs() << "StackSize          : " << StackSize << "\n");
+  #endif
+
+  // Object offsets are relative to the incoming SP; add the frame size
+  // to make them relative to the adjusted SP / FP.
+  Offset += StackSize;
+  
+  // fold constant into offset.
+  Offset += MI.getOperand(i + 1).getImm();
+  MI.getOperand(i + 1).ChangeToImmediate(0);
+  
+  // All the word-scaled encodings require a 4-byte-aligned offset.
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+
+  DEBUG(errs() << "Offset             : " << Offset << "\n" << "<--------->\n");
+  
+  Offset/=4;
+  
+  bool FP = hasFP(MF);
+  
+  unsigned Reg = MI.getOperand(0).getReg();
+  bool isKill = MI.getOpcode() == XCore::STWFI && MI.getOperand(0).isKill();
+
+  assert(XCore::GRRegsRegisterClass->contains(Reg) &&
+         "Unexpected register operand");
+  
+  MachineBasicBlock &MBB = *MI.getParent();
+  
+  if (FP) {
+    // FP-relative (base R10).  The 2rus forms only encode a short
+    // (0..11) word offset; larger offsets materialize the offset into a
+    // scavenged scratch register and use the 3-register forms.
+    bool isUs = isImmUs(Offset);
+    unsigned FramePtr = XCore::R10;
+    
+    if (!isUs) {
+      if (!RS) {
+        std::string msg;
+        raw_string_ostream Msg(msg);
+        Msg << "eliminateFrameIndex Frame size too big: " << Offset;
+        llvm_report_error(Msg.str());
+      }
+      unsigned ScratchReg = RS->scavengeRegister(XCore::GRRegsRegisterClass, II,
+                                                 SPAdj);
+      loadConstant(MBB, II, ScratchReg, Offset, dl);
+      switch (MI.getOpcode()) {
+      case XCore::LDWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDW_3r), Reg)
+              .addReg(FramePtr)
+              .addReg(ScratchReg, RegState::Kill);
+        break;
+      case XCore::STWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::STW_3r))
+              .addReg(Reg, getKillRegState(isKill))
+              .addReg(FramePtr)
+              .addReg(ScratchReg, RegState::Kill);
+        break;
+      case XCore::LDAWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l3r), Reg)
+              .addReg(FramePtr)
+              .addReg(ScratchReg, RegState::Kill);
+        break;
+      default:
+        llvm_unreachable("Unexpected Opcode");
+      }
+    } else {
+      switch (MI.getOpcode()) {
+      case XCore::LDWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDW_2rus), Reg)
+              .addReg(FramePtr)
+              .addImm(Offset);
+        break;
+      case XCore::STWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::STW_2rus))
+              .addReg(Reg, getKillRegState(isKill))
+              .addReg(FramePtr)
+              .addImm(Offset);
+        break;
+      case XCore::LDAWFI:
+        BuildMI(MBB, II, dl, TII.get(XCore::LDAWF_l2rus), Reg)
+              .addReg(FramePtr)
+              .addImm(Offset);
+        break;
+      default:
+        llvm_unreachable("Unexpected Opcode");
+      }
+    }
+  } else {
+    // SP-relative forms; pick the short (u6) or long (16-bit) encoding.
+    bool isU6 = isImmU6(Offset);
+    if (!isU6 && !isImmU16(Offset)) {
+      std::string msg;
+      raw_string_ostream Msg(msg);
+      Msg << "eliminateFrameIndex Frame size too big: " << Offset;
+      llvm_report_error(Msg.str());
+    }
+
+    switch (MI.getOpcode()) {
+    int NewOpcode;
+    case XCore::LDWFI:
+      NewOpcode = (isU6) ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+      BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+            .addImm(Offset);
+      break;
+    case XCore::STWFI:
+      NewOpcode = (isU6) ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+      BuildMI(MBB, II, dl, TII.get(NewOpcode))
+            .addReg(Reg, getKillRegState(isKill))
+            .addImm(Offset);
+      break;
+    case XCore::LDAWFI:
+      NewOpcode = (isU6) ? XCore::LDAWSP_ru6 : XCore::LDAWSP_lru6;
+      BuildMI(MBB, II, dl, TII.get(NewOpcode), Reg)
+            .addImm(Offset);
+      break;
+    default:
+      llvm_unreachable("Unexpected Opcode");
+    }
+  }
+  // Erase old instruction.
+  MBB.erase(II);
+  return 0;
+}
+
+/// processFunctionBeforeCalleeSavedScan - Reserve frame slots before the
+/// generic callee-saved scan runs: an LR spill slot (a fixed slot at
+/// offset 0 for non-vararg functions, so entsp/retsp can fold the
+/// save/restore), an emergency scavenging slot when register scavenging
+/// is required, and an FP (R10) spill slot when a frame pointer is used.
+void
+XCoreRegisterInfo::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                      RegScavenger *RS) const {
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  bool LRUsed = MF.getRegInfo().isPhysRegUsed(XCore::LR);
+  const TargetRegisterClass *RC = XCore::GRRegsRegisterClass;
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  if (LRUsed) {
+    // LR is spilled explicitly by the prologue rather than by the
+    // generic callee-saved spill code, so mark it unused here.
+    MF.getRegInfo().setPhysRegUnused(XCore::LR);
+    
+    bool isVarArg = MF.getFunction()->isVarArg();
+    int FrameIdx;
+    if (! isVarArg) {
+      // A fixed offset of 0 allows us to save / restore LR using entsp / retsp.
+      FrameIdx = MFI->CreateFixedObject(RC->getSize(), 0, true, false);
+    } else {
+      FrameIdx = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(),
+                                        false);
+    }
+    // setUsesLR takes a bool; passing the frame index (as the original
+    // code did) would silently record "unused" if the index were ever 0.
+    XFI->setUsesLR(true);
+    XFI->setLRSpillSlot(FrameIdx);
+  }
+  if (requiresRegisterScavenging(MF)) {
+    // Reserve a slot close to SP or frame pointer.
+    RS->setScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
+                                                       RC->getAlignment(),
+                                                       false));
+  }
+  if (hasFP(MF)) {
+    // A callee save register is used to hold the FP.
+    // This needs saving / restoring in the epilogue / prologue.
+    XFI->setFPSpillSlot(MFI->CreateStackObject(RC->getSize(),
+                                               RC->getAlignment(),
+                                               false));
+  }
+}
+
+// No extra work is needed between frame layout and prologue emission
+// for XCore; this hook is intentionally empty.
+void XCoreRegisterInfo::
+processFunctionBeforeFrameFinalized(MachineFunction &MF) const {
+  
+}
+
+/// loadConstant - Materialize an unsigned 16-bit constant into DstReg
+/// with ldc (short u6 or long lu6 form).  Larger values are not yet
+/// supported and abort via llvm_report_error.
+void XCoreRegisterInfo::
+loadConstant(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+            unsigned DstReg, int64_t Value, DebugLoc dl) const {
+  // TODO use mkmsk if possible.
+  if (!isImmU16(Value)) {
+    // TODO use constant pool.
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "loadConstant value too big " << Value;
+    llvm_report_error(Msg.str());
+  }
+  int Opcode = isImmU6(Value) ? XCore::LDC_ru6 : XCore::LDC_lru6;
+  BuildMI(MBB, I, dl, TII.get(Opcode), DstReg).addImm(Value);
+}
+
+/// storeToStack - Store SrcReg to sp[Offset/4] with stw.  Offset is in
+/// bytes and must be word-aligned and within the 16-bit word range.
+void XCoreRegisterInfo::
+storeToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  unsigned SrcReg, int Offset, DebugLoc dl) const {
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+  Offset/=4;
+  bool isU6 = isImmU6(Offset);
+  if (!isU6 && !isImmU16(Offset)) {
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "storeToStack offset too big " << Offset;
+    llvm_report_error(Msg.str());
+  }
+  int Opcode = isU6 ? XCore::STWSP_ru6 : XCore::STWSP_lru6;
+  BuildMI(MBB, I, dl, TII.get(Opcode))
+    .addReg(SrcReg)
+    .addImm(Offset);
+}
+
+/// loadFromStack - Load DstReg from sp[Offset/4] with ldw; same offset
+/// constraints as storeToStack.
+void XCoreRegisterInfo::
+loadFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                  unsigned DstReg, int Offset, DebugLoc dl) const {
+  assert(Offset%4 == 0 && "Misaligned stack offset");
+  Offset/=4;
+  bool isU6 = isImmU6(Offset);
+  if (!isU6 && !isImmU16(Offset)) {
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "loadFromStack offset too big " << Offset;
+    llvm_report_error(Msg.str());
+  }
+  int Opcode = isU6 ? XCore::LDWSP_ru6 : XCore::LDWSP_lru6;
+  BuildMI(MBB, I, dl, TII.get(Opcode), DstReg)
+    .addImm(Offset);
+}
+
+/// emitPrologue - Insert prologue code into the entry block: allocate
+/// the frame (entsp when the LR spill slot is at offset 0, which folds
+/// the LR save; otherwise extsp plus an explicit LR store), save and
+/// set up R10 as the frame pointer when needed, and record frame-move
+/// labels for debug / unwind information.
+void XCoreRegisterInfo::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front();   // Prolog goes in entry BB
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MachineModuleInfo *MMI = MFI->getMachineModuleInfo();
+  XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+  DebugLoc dl = (MBBI != MBB.end() ?
+                 MBBI->getDebugLoc() : DebugLoc::getUnknownLoc());
+
+  bool FP = hasFP(MF);
+
+  // Work out frame sizes.
+  int FrameSize = MFI->getStackSize();
+
+  assert(FrameSize%4 == 0 && "Misaligned frame size");
+  
+  // entsp / extsp operands are in words.
+  FrameSize/=4;
+  
+  bool isU6 = isImmU6(FrameSize);
+
+  if (!isU6 && !isImmU16(FrameSize)) {
+    // FIXME could emit multiple instructions.
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "emitPrologue Frame size too big: " << FrameSize;
+    llvm_report_error(Msg.str());
+  }
+  bool emitFrameMoves = needsFrameMoves(MF);
+
+  // Do we need to allocate space on the stack?
+  if (FrameSize) {
+    bool saveLR = XFI->getUsesLR();
+    bool LRSavedOnEntry = false;
+    int Opcode;
+    if (saveLR && (MFI->getObjectOffset(XFI->getLRSpillSlot()) == 0)) {
+      // entsp extends the stack and stores LR at sp[0] in one instruction.
+      Opcode = (isU6) ? XCore::ENTSP_u6 : XCore::ENTSP_lu6;
+      MBB.addLiveIn(XCore::LR);
+      saveLR = false;
+      LRSavedOnEntry = true;
+    } else {
+      Opcode = (isU6) ? XCore::EXTSP_u6 : XCore::EXTSP_lu6;
+    }
+    BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+    
+    if (emitFrameMoves) {
+      std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+      
+      // Show update of SP.
+      unsigned FrameLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(FrameLabelId);
+      
+      MachineLocation SPDst(MachineLocation::VirtualFP);
+      MachineLocation SPSrc(MachineLocation::VirtualFP, -FrameSize * 4);
+      Moves.push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+      
+      if (LRSavedOnEntry) {
+        MachineLocation CSDst(MachineLocation::VirtualFP, 0);
+        MachineLocation CSSrc(XCore::LR);
+        Moves.push_back(MachineMove(FrameLabelId, CSDst, CSSrc));
+      }
+    }
+    if (saveLR) {
+      // LR could not be folded into entsp; store it to its slot explicitly.
+      int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+      storeToStack(MBB, MBBI, XCore::LR, LRSpillOffset + FrameSize*4, dl);
+      MBB.addLiveIn(XCore::LR);
+      
+      if (emitFrameMoves) {
+        unsigned SaveLRLabelId = MMI->NextLabelID();
+        BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(SaveLRLabelId);
+        MachineLocation CSDst(MachineLocation::VirtualFP, LRSpillOffset);
+        MachineLocation CSSrc(XCore::LR);
+        MMI->getFrameMoves().push_back(MachineMove(SaveLRLabelId,
+                                                   CSDst, CSSrc));
+      }
+    }
+  }
+  
+  if (FP) {
+    // Save R10 to the stack.
+    int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+    storeToStack(MBB, MBBI, XCore::R10, FPSpillOffset + FrameSize*4, dl);
+    // R10 is live-in. It is killed at the spill.
+    MBB.addLiveIn(XCore::R10);
+    if (emitFrameMoves) {
+      unsigned SaveR10LabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(SaveR10LabelId);
+      MachineLocation CSDst(MachineLocation::VirtualFP, FPSpillOffset);
+      MachineLocation CSSrc(XCore::R10);
+      MMI->getFrameMoves().push_back(MachineMove(SaveR10LabelId,
+                                                 CSDst, CSSrc));
+    }
+    // Set the FP from the SP.
+    unsigned FramePtr = XCore::R10;
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::LDAWSP_ru6), FramePtr)
+      .addImm(0);
+    if (emitFrameMoves) {
+      // Show FP is now valid.
+      unsigned FrameLabelId = MMI->NextLabelID();
+      BuildMI(MBB, MBBI, dl, TII.get(XCore::DBG_LABEL)).addImm(FrameLabelId);
+      MachineLocation SPDst(FramePtr);
+      MachineLocation SPSrc(MachineLocation::VirtualFP);
+      MMI->getFrameMoves().push_back(MachineMove(FrameLabelId, SPDst, SPSrc));
+    }
+  }
+  
+  if (emitFrameMoves) {
+    // Frame moves for callee saved.
+    std::vector<MachineMove> &Moves = MMI->getFrameMoves();
+    std::vector<std::pair<unsigned, CalleeSavedInfo> >&SpillLabels =
+        XFI->getSpillLabels();
+    for (unsigned I = 0, E = SpillLabels.size(); I != E; ++I) {
+      unsigned SpillLabel = SpillLabels[I].first;
+      CalleeSavedInfo &CSI = SpillLabels[I].second;
+      int Offset = MFI->getObjectOffset(CSI.getFrameIdx());
+      unsigned Reg = CSI.getReg();
+      MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
+      MachineLocation CSSrc(Reg);
+      Moves.push_back(MachineMove(SpillLabel, CSDst, CSSrc));
+    }
+  }
+}
+
+/// emitEpilogue - Insert epilogue code before the return: restore SP
+/// from the frame pointer when one is used, reload R10 and LR from
+/// their spill slots, and deallocate the frame — folding the LR restore
+/// and stack adjustment into retsp when the LR slot is at offset 0.
+void XCoreRegisterInfo::emitEpilogue(MachineFunction &MF,
+                                     MachineBasicBlock &MBB) const {
+  MachineFrameInfo *MFI            = MF.getFrameInfo();
+  MachineBasicBlock::iterator MBBI = prior(MBB.end());
+  DebugLoc dl = MBBI->getDebugLoc();
+  
+  bool FP = hasFP(MF);
+  
+  if (FP) {
+    // Restore the stack pointer.
+    unsigned FramePtr = XCore::R10;
+    BuildMI(MBB, MBBI, dl, TII.get(XCore::SETSP_1r))
+      .addReg(FramePtr);
+  }
+
+  // Work out frame sizes.
+  int FrameSize = MFI->getStackSize();
+
+  assert(FrameSize%4 == 0 && "Misaligned frame size");
+
+  // ldaw sp / retsp operands are in words.
+  FrameSize/=4;
+  
+  bool isU6 = isImmU6(FrameSize);
+
+  if (!isU6 && !isImmU16(FrameSize)) {
+    // FIXME could emit multiple instructions.
+    std::string msg;
+    raw_string_ostream Msg(msg);
+    Msg << "emitEpilogue Frame size too big: " << FrameSize;
+    llvm_report_error(Msg.str());
+  }
+
+  if (FrameSize) {
+    XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
+    
+    if (FP) {
+      // Restore R10
+      int FPSpillOffset = MFI->getObjectOffset(XFI->getFPSpillSlot());
+      FPSpillOffset += FrameSize*4;
+      loadFromStack(MBB, MBBI, XCore::R10, FPSpillOffset, dl);
+    }
+    bool restoreLR = XFI->getUsesLR();
+    if (restoreLR && MFI->getObjectOffset(XFI->getLRSpillSlot()) != 0) {
+      // LR was spilled at a non-zero offset; reload it explicitly since
+      // retsp can only restore from sp[0].
+      int LRSpillOffset = MFI->getObjectOffset(XFI->getLRSpillSlot());
+      LRSpillOffset += FrameSize*4;
+      loadFromStack(MBB, MBBI, XCore::LR, LRSpillOffset, dl);
+      restoreLR = false;
+    }
+    if (restoreLR) {
+      // Fold the LR restore and frame deallocation into the return (retsp).
+      assert(MBBI->getOpcode() == XCore::RETSP_u6
+        || MBBI->getOpcode() == XCore::RETSP_lu6);
+      int Opcode = (isU6) ? XCore::RETSP_u6 : XCore::RETSP_lu6;
+      BuildMI(MBB, MBBI, dl, TII.get(Opcode)).addImm(FrameSize);
+      MBB.erase(MBBI);
+    } else {
+      int Opcode = (isU6) ? XCore::LDAWSP_ru6_RRegs : XCore::LDAWSP_lru6_RRegs;
+      BuildMI(MBB, MBBI, dl, TII.get(Opcode), XCore::SP).addImm(FrameSize);
+    }
+  }
+}
+
+// Map a register to its DWARF debug-info number (flavour 0 only; the
+// isEH flag is ignored).
+int XCoreRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
+  return XCoreGenRegisterInfo::getDwarfRegNumFull(RegNum, 0);
+}
+
+// The frame base register: R10 when a frame pointer is used, else SP.
+unsigned XCoreRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+  bool FP = hasFP(MF);
+  
+  return FP ? XCore::R10 : XCore::SP;
+}
+
+// The return address lives in the link register.
+unsigned XCoreRegisterInfo::getRARegister() const {
+  return XCore::LR;
+}
+
+// On function entry the virtual frame pointer coincides with SP.
+void XCoreRegisterInfo::getInitialFrameState(std::vector<MachineMove> &Moves)
+                                                                         const {
+  // Initial state of the frame pointer is SP.
+  MachineLocation Dst(MachineLocation::VirtualFP);
+  MachineLocation Src(XCore::SP, 0);
+  Moves.push_back(MachineMove(0, Dst, Src));
+}
+
+#include "XCoreGenRegisterInfo.inc"
+
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
new file mode 100644
index 0000000..8ab1750
--- /dev/null
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -0,0 +1,95 @@
+//===- XCoreRegisterInfo.h - XCore Register Information Impl ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the XCore implementation of the MRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCOREREGISTERINFO_H
+#define XCOREREGISTERINFO_H
+
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "XCoreGenRegisterInfo.h.inc"
+
+namespace llvm {
+
+class TargetInstrInfo;
+
+struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
+private:
+  const TargetInstrInfo &TII;
+
+  // Emit instructions to materialize the constant Value into DstReg.
+  void loadConstant(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  unsigned DstReg, int64_t Value, DebugLoc dl) const;
+
+  // Emit a store of SrcReg to the stack at the given Offset.
+  // NOTE(review): the unit of Offset (bytes vs. words) is defined by the
+  // .cpp implementation -- confirm there before relying on it.
+  void storeToStack(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  unsigned SrcReg, int Offset, DebugLoc dl) const;
+
+  // Emit a load of DstReg from the stack at the given Offset (same unit
+  // caveat as storeToStack).
+  void loadFromStack(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator I,
+                  unsigned DstReg, int Offset, DebugLoc dl) const;
+
+public:
+  XCoreRegisterInfo(const TargetInstrInfo &tii);
+
+  /// Code Generation virtual methods...
+
+  const unsigned *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+
+  const TargetRegisterClass* const* getCalleeSavedRegClasses(
+                                     const MachineFunction *MF = 0) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const;
+  
+  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+
+  // Whether this function needs a dedicated frame pointer (R10).
+  bool hasFP(const MachineFunction &MF) const;
+
+  void eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                     MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I) const;
+
+  unsigned eliminateFrameIndex(MachineBasicBlock::iterator II,
+                               int SPAdj, int *Value = NULL,
+                               RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                                RegScavenger *RS = NULL) const;
+
+  void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+
+  void emitPrologue(MachineFunction &MF) const;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  
+  // Debug information queries.
+  unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const;
+  void getInitialFrameState(std::vector<MachineMove> &Moves) const;
+
+  //! Return the array of argument passing registers
+  /*!
+    \note The size of this array is returned by getNumArgRegs().
+    */
+  static const unsigned *getArgRegs(const MachineFunction *MF = 0);
+
+  //! Return the size of the argument passing register array
+  static unsigned getNumArgRegs(const MachineFunction *MF = 0);
+  
+  //! Return whether to emit frame moves
+  static bool needsFrameMoves(const MachineFunction &MF);
+
+  //! Get DWARF debugging register number
+  int getDwarfRegNum(unsigned RegNum, bool isEH) const;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td
new file mode 100644
index 0000000..62daf5d
--- /dev/null
+++ b/lib/Target/XCore/XCoreRegisterInfo.td
@@ -0,0 +1,91 @@
+//===- XCoreRegisterInfo.td - XCore Register defs ----------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//  Declarations that describe the XCore register file 
+//===----------------------------------------------------------------------===//
+
+// Base class for all XCore registers: carries the 4-bit hardware encoding
+// and places the generated enum values in the XCore namespace.
+class XCoreReg<string n> : Register<n> {
+  field bits<4> Num;
+  let Namespace = "XCore";
+}
+
+// Registers are identified with 4-bit ID numbers.
+// Ri - 32-bit integer registers
+class Ri<bits<4> num, string n> : XCoreReg<n> {
+  let Num = num;
+}
+
+// CPU registers
+// Sixteen 32-bit registers numbered 0-15.  r0-r3 carry arguments and
+// results and r4-r10 are callee-saved (see GRRegs); cp/dp/sp/lr are the
+// constant-pool, data, stack and link pointers and are never allocated
+// (see RRegs).  DWARF numbers mirror the hardware encoding.
+def R0  : Ri< 0, "r0">, DwarfRegNum<[0]>;
+def R1  : Ri< 1, "r1">, DwarfRegNum<[1]>;
+def R2  : Ri< 2, "r2">, DwarfRegNum<[2]>; 
+def R3  : Ri< 3, "r3">, DwarfRegNum<[3]>;
+def R4  : Ri< 4, "r4">, DwarfRegNum<[4]>;
+def R5  : Ri< 5, "r5">, DwarfRegNum<[5]>; 
+def R6  : Ri< 6, "r6">, DwarfRegNum<[6]>;
+def R7  : Ri< 7, "r7">, DwarfRegNum<[7]>;
+def R8  : Ri< 8, "r8">, DwarfRegNum<[8]>;
+def R9  : Ri< 9, "r9">, DwarfRegNum<[9]>; 
+def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
+def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
+def CP : Ri<12, "cp">, DwarfRegNum<[12]>; 
+def DP : Ri<13, "dp">, DwarfRegNum<[13]>;
+def SP : Ri<14, "sp">, DwarfRegNum<[14]>;
+def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
+
+// Register classes.
+//
+// General-purpose registers.  The allocation order lists caller-clobbered
+// registers first and callee-saved last; when the function needs a frame
+// pointer, R10 is trimmed from the allocatable set (see
+// allocation_order_end below).
+def GRRegs : RegisterClass<"XCore", [i32], 32,
+  // Return values and arguments
+  [R0, R1, R2, R3,
+  // Not preserved across procedure calls
+  R11,
+  // Callee save
+  R4, R5, R6, R7, R8, R9, R10]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    GRRegsClass::iterator
+    GRRegsClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    GRRegsClass::iterator
+    GRRegsClass::allocation_order_end(const MachineFunction &MF) const {
+      const TargetMachine &TM = MF.getTarget();
+      const TargetRegisterInfo *RI = TM.getRegisterInfo();
+      if (RI->hasFP(MF))
+        return end()-1;  // don't allocate R10
+      else
+        return end();
+    }
+  }];
+}
+
+// Reserved registers: cp, dp, sp and lr exist as a class so instructions
+// can reference them, but the empty allocation order
+// (allocation_order_end == begin) keeps them out of register allocation.
+def RRegs : RegisterClass<"XCore", [i32], 32,
+  // Reserved
+  [CP, DP, SP, LR]> {
+  let MethodProtos = [{
+    iterator allocation_order_begin(const MachineFunction &MF) const;
+    iterator allocation_order_end(const MachineFunction &MF) const;
+  }];
+  let MethodBodies = [{
+    RRegsClass::iterator
+    RRegsClass::allocation_order_begin(const MachineFunction &MF) const {
+      return begin();
+    }
+    RRegsClass::iterator
+    RRegsClass::allocation_order_end(const MachineFunction &MF) const {
+      // No allocatable registers
+      return begin();
+    }
+  }];
+}
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
new file mode 100644
index 0000000..78a6fa5
--- /dev/null
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -0,0 +1,20 @@
+//===- XCoreSubtarget.cpp - XCore Subtarget Information -----------*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the XCore specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreSubtarget.h"
+#include "XCore.h"
+using namespace llvm;
+
+// Subtarget construction.  XCore currently defines no subtarget features,
+// so the target triple and feature string are accepted but unused here.
+XCoreSubtarget::XCoreSubtarget(const std::string &TT, const std::string &FS)
+{
+}
diff --git a/lib/Target/XCore/XCoreSubtarget.h b/lib/Target/XCore/XCoreSubtarget.h
new file mode 100644
index 0000000..f8be3ec
--- /dev/null
+++ b/lib/Target/XCore/XCoreSubtarget.h
@@ -0,0 +1,39 @@
+//=====-- XCoreSubtarget.h - Define Subtarget for the XCore -----*- C++ -*--==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetSubtarget.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORESUBTARGET_H
+#define XCORESUBTARGET_H
+
+#include "llvm/Target/TargetSubtarget.h"
+#include "llvm/Target/TargetMachine.h"
+
+#include <string>
+
+namespace llvm {
+
+// XCore subtarget description.  Currently stateless: no per-CPU feature
+// bits are modelled, so this only satisfies the TargetSubtarget interface.
+class XCoreSubtarget : public TargetSubtarget {
+
+public:
+  /// This constructor initializes the data members to match that
+  /// of the specified triple.
+  ///
+  XCoreSubtarget(const std::string &TT, const std::string &FS);
+  
+  /// ParseSubtargetFeatures - Parses features string setting specified 
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  std::string ParseSubtargetFeatures(const std::string &FS,
+                                     const std::string &CPU);
+};
+} // End llvm namespace
+
+#endif
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
new file mode 100644
index 0000000..267f46a
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -0,0 +1,44 @@
+//===-- XCoreTargetMachine.cpp - Define TargetMachine for XCore -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreMCAsmInfo.h"
+#include "XCoreTargetMachine.h"
+#include "XCore.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/Target/TargetRegistry.h"
+using namespace llvm;
+
+/// XCoreTargetMachine ctor - Create an ILP32 architecture model
+///
+/// The data layout string describes a little-endian target with 32-bit
+/// pointers and a 32-bit native integer width; note that i64/f64 are only
+/// 32-bit aligned.
+XCoreTargetMachine::XCoreTargetMachine(const Target &T, const std::string &TT,
+                                       const std::string &FS)
+  : LLVMTargetMachine(T, TT),
+    Subtarget(TT, FS),
+    DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-"
+               "i16:16:32-i32:32:32-i64:32:32-n32"),
+    InstrInfo(),
+    FrameInfo(*this),
+    TLInfo(*this) {
+}
+
+// Install the XCore DAG-to-DAG instruction selector into the codegen pass
+// pipeline.  Returns false to indicate success (LLVM pass-config protocol).
+bool XCoreTargetMachine::addInstSelector(PassManagerBase &PM,
+                                         CodeGenOpt::Level OptLevel) {
+  PM.add(createXCoreISelDag(*this));
+  return false;
+}
+
+// Force static initialization.
+// Entry point called by the target registry machinery: registers the
+// XCore target machine and asm-info factories under TheXCoreTarget.
+extern "C" void LLVMInitializeXCoreTarget() {
+  RegisterTargetMachine<XCoreTargetMachine> X(TheXCoreTarget);
+  RegisterAsmInfo<XCoreMCAsmInfo> Y(TheXCoreTarget);
+}
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
new file mode 100644
index 0000000..b0b1464
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -0,0 +1,54 @@
+//===-- XCoreTargetMachine.h - Define TargetMachine for XCore ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the XCore specific subclass of TargetMachine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XCORETARGETMACHINE_H
+#define XCORETARGETMACHINE_H
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetData.h"
+#include "XCoreFrameInfo.h"
+#include "XCoreSubtarget.h"
+#include "XCoreInstrInfo.h"
+#include "XCoreISelLowering.h"
+
+namespace llvm {
+
+// Top-level description of the XCore code generator: owns the subtarget,
+// data layout, instruction/frame info and lowering objects, and exposes
+// them through the TargetMachine accessors.
+class XCoreTargetMachine : public LLVMTargetMachine {
+  XCoreSubtarget Subtarget;
+  const TargetData DataLayout;       // Calculates type size & alignment
+  XCoreInstrInfo InstrInfo;
+  XCoreFrameInfo FrameInfo;
+  XCoreTargetLowering TLInfo;
+public:
+  XCoreTargetMachine(const Target &T, const std::string &TT,
+                     const std::string &FS);
+
+  virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
+  virtual const XCoreFrameInfo *getFrameInfo() const { return &FrameInfo; }
+  virtual const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  // The TargetMachine interface wants a mutable lowering object, hence the
+  // const_cast on the const member.
+  virtual       XCoreTargetLowering *getTargetLowering() const {
+    return const_cast<XCoreTargetLowering*>(&TLInfo);
+  }
+
+  // Register info is owned by the instruction info object.
+  virtual const TargetRegisterInfo *getRegisterInfo() const {
+    return &InstrInfo.getRegisterInfo();
+  }
+  virtual const TargetData       *getTargetData() const { return &DataLayout; }
+
+  // Pass Pipeline Configuration
+  virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.cpp b/lib/Target/XCore/XCoreTargetObjectFile.cpp
new file mode 100644
index 0000000..7de3b55
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -0,0 +1,67 @@
+//===-- XCoreTargetObjectFile.cpp - XCore object files --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "XCoreTargetObjectFile.h"
+#include "XCoreSubtarget.h"
+#include "MCSectionXCore.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+
+// Set up the XCore-specific ELF sections.  Writable data goes in ".dp.*"
+// sections (flagged SHF_DP_SECTION) and read-only constants in ".cp.*"
+// sections (SHF_CP_SECTION) -- presumably addressed via the dp and cp
+// registers respectively; confirm against the XCore ABI.
+void XCoreTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
+  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
+
+  DataSection =
+    MCSectionXCore::Create(".dp.data", MCSectionELF::SHT_PROGBITS, 
+                           MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE |
+                           MCSectionXCore::SHF_DP_SECTION,
+                           SectionKind::getDataRel(), false, getContext());
+  BSSSection =
+    MCSectionXCore::Create(".dp.bss", MCSectionELF::SHT_NOBITS,
+                           MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_WRITE |
+                           MCSectionXCore::SHF_DP_SECTION,
+                           SectionKind::getBSS(), false, getContext());
+  
+  // Mergeable constant pools of fixed element size, all cp-relative.
+  MergeableConst4Section = 
+    MCSectionXCore::Create(".cp.rodata.cst4", MCSectionELF::SHT_PROGBITS,
+                           MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE |
+                           MCSectionXCore::SHF_CP_SECTION,
+                           SectionKind::getMergeableConst4(), false,
+                           getContext());
+  MergeableConst8Section = 
+    MCSectionXCore::Create(".cp.rodata.cst8", MCSectionELF::SHT_PROGBITS,
+                           MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE |
+                           MCSectionXCore::SHF_CP_SECTION,
+                           SectionKind::getMergeableConst8(), false,
+                           getContext());
+  MergeableConst16Section = 
+    MCSectionXCore::Create(".cp.rodata.cst16", MCSectionELF::SHT_PROGBITS,
+                           MCSectionELF::SHF_ALLOC | MCSectionELF::SHF_MERGE |
+                           MCSectionXCore::SHF_CP_SECTION,
+                           SectionKind::getMergeableConst16(), false,
+                           getContext());
+  
+  // TLS globals are lowered in the backend to arrays indexed by the current
+  // thread id. After lowering they require no special handling by the linker
+  // and can be placed in the standard data / bss sections.
+  TLSDataSection = DataSection;
+  TLSBSSSection = BSSSection;
+
+  ReadOnlySection = 
+    MCSectionXCore::Create(".cp.rodata", MCSectionELF::SHT_PROGBITS,
+                           MCSectionELF::SHF_ALLOC |
+                           MCSectionXCore::SHF_CP_SECTION,
+                           SectionKind::getReadOnlyWithRel(), false,
+                           getContext());
+
+  // Dynamic linking is not supported. Data with relocations is placed in the
+  // same section as data without relocations.
+  DataRelSection = DataRelLocalSection = DataSection;
+  DataRelROSection = DataRelROLocalSection = ReadOnlySection;
+}
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
new file mode 100644
index 0000000..7efb990
--- /dev/null
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -0,0 +1,26 @@
+//===-- llvm/Target/XCoreTargetObjectFile.h - XCore Object Info -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+#define LLVM_TARGET_XCORE_TARGETOBJECTFILE_H
+
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+  // Describes how XCore globals map onto ELF sections; Initialize installs
+  // the target's dp-/cp-relative data and constant sections.
+  class XCoreTargetObjectFile : public TargetLoweringObjectFileELF {
+  public:
+    
+    void Initialize(MCContext &Ctx, const TargetMachine &TM);
+
+    // TODO: Classify globals as xcore wishes.
+  };
+} // end namespace llvm
+
+#endif